Mirror of https://github.com/handsomezhuzhu/vllm-npu-plugin.git (synced 2026-02-20 19:50:15 +00:00)
feat: add CUDA-to-NPU monkey patches for GPUModelRunner compatibility
vllm_npu/cuda_compat.py (new file, 89 lines)
@@ -0,0 +1,89 @@
"""
CUDA-to-NPU Compatibility Layer.

Monkey-patches ``torch.cuda`` APIs so that code written for CUDA
(e.g. ``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently
delegates to the corresponding ``torch.npu`` equivalents. This
allows vLLM's ``GPUModelRunner`` to run on Ascend NPU without
source modifications.
"""

import contextlib
from unittest.mock import MagicMock

import torch


def _patch_cuda_to_npu() -> None:
    """Apply monkey-patches: redirect torch.cuda → torch.npu."""
    import torch_npu  # noqa: F401

    # ------------------------------------------------------------------
    # Stream / Event
    # ------------------------------------------------------------------
    torch.cuda.Stream = torch.npu.Stream  # type: ignore[attr-defined]
    torch.cuda.Event = torch.npu.Event  # type: ignore[attr-defined]
    torch.cuda.current_stream = torch.npu.current_stream  # type: ignore
    torch.cuda.default_stream = torch.npu.default_stream  # type: ignore

    # torch.cuda.stream() context manager
    torch.cuda.stream = torch.npu.stream  # type: ignore[attr-defined]

    # ------------------------------------------------------------------
    # Device management
    # ------------------------------------------------------------------
    torch.cuda.set_device = torch.npu.set_device  # type: ignore
    torch.cuda.synchronize = torch.npu.synchronize  # type: ignore
    torch.cuda.device_count = torch.npu.device_count  # type: ignore
    torch.cuda.current_device = torch.npu.current_device  # type: ignore
    torch.cuda.is_available = lambda: True  # type: ignore

    # ------------------------------------------------------------------
    # Memory management
    # ------------------------------------------------------------------
    torch.cuda.empty_cache = torch.npu.empty_cache  # type: ignore
    torch.cuda.mem_get_info = torch.npu.mem_get_info  # type: ignore
    torch.cuda.memory_allocated = torch.npu.memory_allocated  # type: ignore
    torch.cuda.max_memory_allocated = torch.npu.max_memory_allocated  # type: ignore
    torch.cuda.memory_reserved = torch.npu.memory_reserved  # type: ignore
    torch.cuda.max_memory_reserved = torch.npu.max_memory_reserved  # type: ignore
    torch.cuda.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats  # type: ignore
    torch.cuda.memory_stats = torch.npu.memory_stats  # type: ignore

    # ------------------------------------------------------------------
    # Device properties
    # ------------------------------------------------------------------
    _real_npu_props = torch.npu.get_device_properties

    def _get_device_properties(device=None):
        """Return NPU device properties with CUDA-compatible attributes."""
        props = _real_npu_props(device)
        # GPUModelRunner accesses .multi_processor_count, which may not
        # exist on NPU. Provide a sensible fallback.
        if not hasattr(props, "multi_processor_count"):
            props.multi_processor_count = 1  # type: ignore[attr-defined]
        if not hasattr(props, "major"):
            props.major = 9  # type: ignore[attr-defined]
            props.minor = 0  # type: ignore[attr-defined]
        return props

    torch.cuda.get_device_properties = _get_device_properties  # type: ignore

    # ------------------------------------------------------------------
    # Misc
    # ------------------------------------------------------------------
    if not hasattr(torch.cuda, "_get_device_index"):
        torch.cuda._get_device_index = torch.npu._get_device_index  # type: ignore

    # Graph / CUDAGraph stubs (NPU does not support CUDA graphs).
    # Always override: even when torch.cuda.CUDAGraph exists, it cannot
    # run on NPU, so replace it with an inert stub.
    torch.cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]

    if not hasattr(torch.cuda, "graph"):

        def _noop_graph(*args, **kwargs):
            """No-op context manager standing in for CUDA graph capture."""
            return contextlib.nullcontext()

        torch.cuda.graph = _noop_graph  # type: ignore[attr-defined]
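
For context, a minimal usage sketch follows. The call site shown here is an assumption (the plugin may well invoke this from its own entry point); the one real constraint is that the patch must run before other modules bind `torch.cuda` functions to local names, since it rebinds attributes on the `torch.cuda` module rather than rewriting call sites.

# Minimal usage sketch (hypothetical call site): patch first, so that
# code importing vLLM afterwards sees the rebound torch.cuda attributes.
from vllm_npu.cuda_compat import _patch_cuda_to_npu

_patch_cuda_to_npu()

import torch

stream = torch.cuda.Stream()        # actually constructs a torch.npu.Stream
with torch.cuda.stream(stream):     # delegates to torch.npu.stream
    x = torch.ones(4, device="npu")
torch.cuda.synchronize()            # synchronizes the NPU device
free, total = torch.cuda.mem_get_info()  # NPU free/total memory, in bytes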
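
One caveat worth noting: `_get_device_properties` assigns attributes on the object returned by `torch.npu.get_device_properties`, which on some torch_npu builds may be a read-only C struct. A defensive variant (a sketch, not part of this commit) could fall back to copying the fields into a mutable namespace:

# Hypothetical fallback: if the NPU properties struct rejects attribute
# assignment, mirror its public fields onto a mutable SimpleNamespace.
import types

import torch
import torch_npu  # noqa: F401  (the patch above imports this already)

def _get_device_properties_safe(device=None):
    props = torch.npu.get_device_properties(device)
    try:
        # Probe writability with the attribute GPUModelRunner needs anyway.
        props.multi_processor_count = getattr(
            props, "multi_processor_count", 1)
        return props
    except AttributeError:
        # Read-only struct: copy non-callable public fields.
        fields = {
            k: getattr(props, k)
            for k in dir(props)
            if not k.startswith("_") and not callable(getattr(props, k))
        }
        ns = types.SimpleNamespace(**fields)
        ns.multi_processor_count = getattr(props, "multi_processor_count", 1)
        ns.major = getattr(props, "major", 9)
        ns.minor = getattr(props, "minor", 0)
        return ns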