From 693e0a1d8932ba51b58cac1192d201bcbeb04c60 Mon Sep 17 00:00:00 2001
From: handsomezhuzhu <2658601135@qq.com>
Date: Tue, 10 Feb 2026 19:09:14 +0800
Subject: [PATCH] feat: add CUDA-to-NPU monkey patches for GPUModelRunner compatibility

---
 vllm_npu/__init__.py    |  6 +++
 vllm_npu/cuda_compat.py | 93 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 vllm_npu/cuda_compat.py

diff --git a/vllm_npu/__init__.py b/vllm_npu/__init__.py
index 89af8be..9ea89ec 100644
--- a/vllm_npu/__init__.py
+++ b/vllm_npu/__init__.py
@@ -9,4 +9,10 @@ class name of the platform implementation.
 
 def register():
     """Return the fully-qualified name of the NPU platform class."""
+    # Apply CUDA→NPU compatibility patches early so that any code
+    # referencing torch.cuda.Stream / Event / etc. is transparently
+    # redirected to the torch.npu equivalents.
+    from vllm_npu.cuda_compat import _patch_cuda_to_npu
+    _patch_cuda_to_npu()
+
     return "vllm_npu.platform.NPUPlatform"
diff --git a/vllm_npu/cuda_compat.py b/vllm_npu/cuda_compat.py
new file mode 100644
index 0000000..8aea3a2
--- /dev/null
+++ b/vllm_npu/cuda_compat.py
@@ -0,0 +1,93 @@
+"""
+CUDA-to-NPU Compatibility Layer.
+
+Monkey-patches ``torch.cuda`` APIs so that code written for CUDA
+(e.g. ``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently
+delegates to the corresponding ``torch.npu`` equivalents. This
+allows vLLM's ``GPUModelRunner`` to run on Ascend NPU without
+source modifications.
+"""
+
+import contextlib
+import types
+from unittest.mock import MagicMock
+
+import torch
+
+
+def _patch_cuda_to_npu() -> None:
+    """Apply monkey-patches: redirect torch.cuda → torch.npu."""
+    import torch_npu  # noqa: F401
+
+    # ------------------------------------------------------------------
+    # Stream / Event
+    # ------------------------------------------------------------------
+    torch.cuda.Stream = torch.npu.Stream  # type: ignore[attr-defined]
+    torch.cuda.Event = torch.npu.Event  # type: ignore[attr-defined]
+    torch.cuda.current_stream = torch.npu.current_stream  # type: ignore
+    torch.cuda.default_stream = torch.npu.default_stream  # type: ignore
+
+    # torch.cuda.stream() context manager
+    torch.cuda.stream = torch.npu.stream  # type: ignore[attr-defined]
+
+    # ------------------------------------------------------------------
+    # Device management
+    # ------------------------------------------------------------------
+    torch.cuda.set_device = torch.npu.set_device  # type: ignore
+    torch.cuda.synchronize = torch.npu.synchronize  # type: ignore
+    torch.cuda.device_count = torch.npu.device_count  # type: ignore
+    torch.cuda.current_device = torch.npu.current_device  # type: ignore
+    # Report real NPU availability instead of hard-coding True.
+    torch.cuda.is_available = torch.npu.is_available  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Memory management
+    # ------------------------------------------------------------------
+    torch.cuda.empty_cache = torch.npu.empty_cache  # type: ignore
+    torch.cuda.mem_get_info = torch.npu.mem_get_info  # type: ignore
+    torch.cuda.memory_allocated = torch.npu.memory_allocated  # type: ignore
+    torch.cuda.max_memory_allocated = torch.npu.max_memory_allocated  # type: ignore
+    torch.cuda.memory_reserved = torch.npu.memory_reserved  # type: ignore
+    torch.cuda.max_memory_reserved = torch.npu.max_memory_reserved  # type: ignore
+    torch.cuda.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats  # type: ignore
+    torch.cuda.memory_stats = torch.npu.memory_stats  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Device properties
+    # ------------------------------------------------------------------
+    _real_npu_props = torch.npu.get_device_properties
+
+    def _get_device_properties(device=None):
+        """Return NPU device properties with CUDA-compatible attributes."""
+        props = _real_npu_props(device)
+        # GPUModelRunner accesses attributes such as .multi_processor_count
+        # that may not exist on NPU, and the underlying struct may reject
+        # attribute assignment. Copy its fields into a mutable namespace
+        # and add sensible fallbacks there.
+        shim = types.SimpleNamespace(
+            **{k: getattr(props, k) for k in dir(props) if not k.startswith("_")}
+        )
+        if not hasattr(shim, "multi_processor_count"):
+            shim.multi_processor_count = 1
+        if not hasattr(shim, "major"):
+            shim.major = 9
+            shim.minor = 0
+        return shim
+
+    torch.cuda.get_device_properties = _get_device_properties  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Misc
+    # ------------------------------------------------------------------
+    if not hasattr(torch.cuda, "_get_device_index"):
+        torch.cuda._get_device_index = torch.npu._get_device_index  # type: ignore
+
+    # CUDA graph stubs. NPU does not support CUDA graphs, so always
+    # override: the real torch.cuda.CUDAGraph cannot work here.
+    torch.cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]
+
+    def _noop_graph(*args, **kwargs):
+        """No-op context manager replacing CUDA graph capture on NPU."""
+        return contextlib.nullcontext()
+
+    torch.cuda.graph = _noop_graph  # type: ignore[attr-defined]
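
As a quick sanity check on an Ascend host (a minimal sketch, assuming
torch_npu is installed and the patched vllm_npu package is importable;
the assertions only cover aliases this patch installs):

    import torch
    import vllm_npu

    vllm_npu.register()  # applies _patch_cuda_to_npu() as a side effect

    # Stream/Event constructors now resolve to the torch.npu classes.
    assert torch.cuda.Stream is torch.npu.Stream
    assert torch.cuda.Event is torch.npu.Event

    # Device properties expose CUDA-compatible fallback fields.
    props = torch.cuda.get_device_properties(0)
    print(props.multi_processor_count, props.major, props.minor)

    # CUDA-graph capture degrades to a no-op context manager.
    with torch.cuda.graph(torch.cuda.CUDAGraph()):
        pass

Patching inside register() keeps the aliases in place before vLLM's
GPUModelRunner touches any torch.cuda entry point.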