From 693e0a1d8932ba51b58cac1192d201bcbeb04c60 Mon Sep 17 00:00:00 2001
From: handsomezhuzhu <2658601135@qq.com>
Date: Tue, 10 Feb 2026 19:09:14 +0800
Subject: [PATCH] feat: add CUDA-to-NPU monkey patches for GPUModelRunner compatibility

---
 vllm_npu/__init__.py    |  6 +++
 vllm_npu/cuda_compat.py | 93 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 vllm_npu/cuda_compat.py

diff --git a/vllm_npu/__init__.py b/vllm_npu/__init__.py
index 89af8be..9ea89ec 100644
--- a/vllm_npu/__init__.py
+++ b/vllm_npu/__init__.py
@@ -9,4 +9,10 @@ class name of the platform implementation.
 
 def register():
     """Return the fully-qualified name of the NPU platform class."""
+    # Apply CUDA→NPU compatibility patches early so that any code
+    # referencing torch.cuda.Stream / Event / etc. is transparently
+    # redirected to the torch.npu equivalents.
+    from vllm_npu.cuda_compat import _patch_cuda_to_npu
+    _patch_cuda_to_npu()
+
     return "vllm_npu.platform.NPUPlatform"
diff --git a/vllm_npu/cuda_compat.py b/vllm_npu/cuda_compat.py
new file mode 100644
index 0000000..8aea3a2
--- /dev/null
+++ b/vllm_npu/cuda_compat.py
@@ -0,0 +1,93 @@
+"""
+CUDA-to-NPU Compatibility Layer.
+
+Monkey-patches ``torch.cuda`` APIs so that code written for CUDA
+(e.g. ``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently
+delegates to the corresponding ``torch.npu`` equivalents. This
+allows vLLM's ``GPUModelRunner`` to run on Ascend NPU without
+source modifications.
+"""
+
+import contextlib
+import types
+from unittest.mock import MagicMock
+
+import torch
+
+
+def _patch_cuda_to_npu() -> None:
+    """Apply monkey-patches: redirect torch.cuda → torch.npu."""
+    import torch_npu  # noqa: F401
+
+    # ------------------------------------------------------------------
+    # Stream / Event
+    # ------------------------------------------------------------------
+    torch.cuda.Stream = torch.npu.Stream  # type: ignore[attr-defined]
+    torch.cuda.Event = torch.npu.Event  # type: ignore[attr-defined]
+    torch.cuda.current_stream = torch.npu.current_stream  # type: ignore
+    torch.cuda.default_stream = torch.npu.default_stream  # type: ignore
+
+    # torch.cuda.stream() context manager
+    torch.cuda.stream = torch.npu.stream  # type: ignore[attr-defined]
+
+    # ------------------------------------------------------------------
+    # Device management
+    # ------------------------------------------------------------------
+    torch.cuda.set_device = torch.npu.set_device  # type: ignore
+    torch.cuda.synchronize = torch.npu.synchronize  # type: ignore
+    torch.cuda.device_count = torch.npu.device_count  # type: ignore
+    torch.cuda.current_device = torch.npu.current_device  # type: ignore
+    # Report real NPU availability instead of hard-coding True.
+    torch.cuda.is_available = torch.npu.is_available  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Memory management
+    # ------------------------------------------------------------------
+    torch.cuda.empty_cache = torch.npu.empty_cache  # type: ignore
+    torch.cuda.mem_get_info = torch.npu.mem_get_info  # type: ignore
+    torch.cuda.memory_allocated = torch.npu.memory_allocated  # type: ignore
+    torch.cuda.max_memory_allocated = torch.npu.max_memory_allocated  # type: ignore
+    torch.cuda.memory_reserved = torch.npu.memory_reserved  # type: ignore
+    torch.cuda.max_memory_reserved = torch.npu.max_memory_reserved  # type: ignore
+    torch.cuda.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats  # type: ignore
+    torch.cuda.memory_stats = torch.npu.memory_stats  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Device properties
+    # ------------------------------------------------------------------
+    _real_npu_props = torch.npu.get_device_properties
+
+    def _get_device_properties(device=None):
+        """Return NPU device properties with CUDA-compatible attributes."""
+        props = _real_npu_props(device)
+        # GPUModelRunner accesses attributes such as .multi_processor_count
+        # that may not exist on NPU, and the underlying struct may reject
+        # attribute assignment. Copy its fields into a mutable namespace
+        # and add sensible fallbacks there.
+        shim = types.SimpleNamespace(
+            **{k: getattr(props, k) for k in dir(props) if not k.startswith("_")}
+        )
+        if not hasattr(shim, "multi_processor_count"):
+            shim.multi_processor_count = 1
+        if not hasattr(shim, "major"):
+            shim.major = 9
+            shim.minor = 0
+        return shim
+
+    torch.cuda.get_device_properties = _get_device_properties  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Misc
+    # ------------------------------------------------------------------
+    if not hasattr(torch.cuda, "_get_device_index"):
+        torch.cuda._get_device_index = torch.npu._get_device_index  # type: ignore
+
+    # CUDA graph stubs. NPU does not support CUDA graphs, so always
+    # override: the real torch.cuda.CUDAGraph cannot work here.
+    torch.cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]
+
+    def _noop_graph(*args, **kwargs):
+        """No-op context manager replacing CUDA graph capture on NPU."""
+        return contextlib.nullcontext()
+
+    torch.cuda.graph = _noop_graph  # type: ignore[attr-defined]
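
As a quick sanity check on an Ascend host (a minimal sketch, assuming
torch_npu is installed and the patched vllm_npu package is importable;
the assertions only cover aliases this patch installs):

    import torch
    import vllm_npu

    vllm_npu.register()  # applies _patch_cuda_to_npu() as a side effect

    # Stream/Event constructors now resolve to the torch.npu classes.
    assert torch.cuda.Stream is torch.npu.Stream
    assert torch.cuda.Event is torch.npu.Event

    # Device properties expose CUDA-compatible fallback fields.
    props = torch.cuda.get_device_properties(0)
    print(props.multi_processor_count, props.major, props.minor)

    # CUDA-graph capture degrades to a no-op context manager.
    with torch.cuda.graph(torch.cuda.CUDAGraph()):
        pass

Patching inside register() keeps the aliases in place before vLLM's
GPUModelRunner touches any torch.cuda entry point.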