mirror of https://github.com/handsomezhuzhu/vllm-npu-plugin.git (synced 2026-02-20 11:42:30 +00:00)
feat: add CUDA-to-NPU monkey patches for GPUModelRunner compatibility
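For orientation before the diff: vLLM discovers platform plugins through the `vllm.platform_plugins` entry-point group, whose hook returns the fully-qualified platform class name — which is why `register()` below both applies the patches and returns the class path. A hypothetical packaging stanza (not part of this commit; the project name and hook label are assumptions):

# setup.py -- illustrative sketch only
from setuptools import setup

setup(
    name="vllm-npu",
    entry_points={
        "vllm.platform_plugins": [
            "npu = vllm_npu:register",
        ],
    },
)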
@@ -9,4 +9,10 @@ class name of the platform implementation.
 
 def register():
     """Return the fully-qualified name of the NPU platform class."""
+    # Apply CUDA→NPU compatibility patches early so that any code
+    # referencing torch.cuda.Stream / Event / etc. will transparently
+    # be redirected to the torch.npu equivalents.
+    from vllm_npu.cuda_compat import _patch_cuda_to_npu
+    _patch_cuda_to_npu()
+
     return "vllm_npu.platform.NPUPlatform"
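As context for the hunk above, a minimal sketch of what the early patching buys — assuming `register()` is importable from the top-level `vllm_npu` package (as vLLM plugin hooks typically are) and that an Ascend device with `torch_npu` is present:

import torch
import vllm_npu

vllm_npu.register()  # applies _patch_cuda_to_npu() as a side effect

# CUDA-flavored calls now resolve to their NPU counterparts.
assert torch.cuda.Stream is torch.npu.Stream
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):      # delegates to torch.npu.stream()
    x = torch.ones(4, device="npu")
torch.cuda.synchronize()             # waits on the NPU, not CUDA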
vllm_npu/cuda_compat.py (new file, 89 lines)
@@ -0,0 +1,89 @@
+"""
+CUDA-to-NPU Compatibility Layer.
+
+Monkey-patches ``torch.cuda`` APIs so that code written for CUDA
+(e.g. ``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently
+delegates to the corresponding ``torch.npu`` equivalents. This
+allows vLLM's ``GPUModelRunner`` to run on Ascend NPU without
+source modifications.
+"""
+
+import contextlib
+from unittest.mock import MagicMock
+
+import torch
+
+
+def _patch_cuda_to_npu() -> None:
+    """Apply monkey-patches: redirect torch.cuda → torch.npu."""
+    import torch_npu  # noqa: F401
+
+    # ------------------------------------------------------------------
+    # Stream / Event
+    # ------------------------------------------------------------------
+    torch.cuda.Stream = torch.npu.Stream  # type: ignore[attr-defined]
+    torch.cuda.Event = torch.npu.Event  # type: ignore[attr-defined]
+    torch.cuda.current_stream = torch.npu.current_stream  # type: ignore
+    torch.cuda.default_stream = torch.npu.default_stream  # type: ignore
+
+    # torch.cuda.stream() context manager
+    torch.cuda.stream = torch.npu.stream  # type: ignore[attr-defined]
+
+    # ------------------------------------------------------------------
+    # Device management
+    # ------------------------------------------------------------------
+    torch.cuda.set_device = torch.npu.set_device  # type: ignore
+    torch.cuda.synchronize = torch.npu.synchronize  # type: ignore
+    torch.cuda.device_count = torch.npu.device_count  # type: ignore
+    torch.cuda.current_device = torch.npu.current_device  # type: ignore
+    torch.cuda.is_available = lambda: True  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Memory management
+    # ------------------------------------------------------------------
+    torch.cuda.empty_cache = torch.npu.empty_cache  # type: ignore
+    torch.cuda.mem_get_info = torch.npu.mem_get_info  # type: ignore
+    torch.cuda.memory_allocated = torch.npu.memory_allocated  # type: ignore
+    torch.cuda.max_memory_allocated = torch.npu.max_memory_allocated  # type: ignore
+    torch.cuda.memory_reserved = torch.npu.memory_reserved  # type: ignore
+    torch.cuda.max_memory_reserved = torch.npu.max_memory_reserved  # type: ignore
+    torch.cuda.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats  # type: ignore
+    torch.cuda.memory_stats = torch.npu.memory_stats  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Device properties
+    # ------------------------------------------------------------------
+    _real_npu_props = torch.npu.get_device_properties
+
+    def _get_device_properties(device=None):
+        """Return NPU device properties with CUDA-compatible attributes."""
+        props = _real_npu_props(device)
+        # GPUModelRunner accesses .multi_processor_count which may not
+        # exist on NPU. Provide a sensible fallback.
+        if not hasattr(props, "multi_processor_count"):
+            props.multi_processor_count = 1  # type: ignore[attr-defined]
+        if not hasattr(props, "major"):
+            props.major = 9  # type: ignore[attr-defined]
+            props.minor = 0  # type: ignore[attr-defined]
+        return props
+
+    torch.cuda.get_device_properties = _get_device_properties  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Misc
+    # ------------------------------------------------------------------
+    if not hasattr(torch.cuda, "_get_device_index"):
+        torch.cuda._get_device_index = torch.npu._get_device_index  # type: ignore
+
+    # CUDA graph capture is unsupported on NPU, so always replace
+    # torch.cuda.CUDAGraph with a MagicMock stand-in.
+    torch.cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]
+
+    if not hasattr(torch.cuda, "graph"):
+
+        def _noop_graph(*args, **kwargs):
+            """No-op context manager for CUDA graphs on NPU."""
+            return contextlib.nullcontext()
+
+        torch.cuda.graph = _noop_graph  # type: ignore[attr-defined]
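A hypothetical smoke test for the device- and memory-management shims above (none of these lines are part of the commit; they just exercise the patched surface on a machine with `torch_npu` installed):

import torch
from vllm_npu.cuda_compat import _patch_cuda_to_npu

_patch_cuda_to_npu()

torch.cuda.set_device(0)                  # -> torch.npu.set_device
print(torch.cuda.device_count(), torch.cuda.current_device())

free, total = torch.cuda.mem_get_info()  # -> torch.npu.mem_get_info
print(f"free={free} total={total} bytes")

x = torch.empty(1 << 20, device="npu")
print(torch.cuda.memory_allocated())     # reflects NPU allocations
del x
torch.cuda.empty_cache()                 # -> torch.npu.empty_cache
torch.cuda.reset_peak_memory_stats()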
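The `_get_device_properties` wrapper exists because CUDA-oriented callers probe attributes such as `multi_processor_count` and the `major`/`minor` compute capability. A short illustration of the kind of check the fallback values (capability 9.0, one multiprocessor) are meant to satisfy — the threshold here is illustrative, not a specific vLLM code path:

props = torch.cuda.get_device_properties(0)  # patched shim from above

# Capability gates written for CUDA keep working, whether the values
# come from the NPU driver or from the patch's fallbacks.
if (props.major, props.minor) >= (8, 0):
    print("capability check passed:", props.major, props.minor)
print("multiprocessors:", props.multi_processor_count)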
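Finally, the graph stubs let CUDA-graph capture code degrade to eager execution instead of crashing. A sketch, assuming the no-op context manager was installed (note the patch only installs it when `torch.cuda.graph` is absent; on builds where it already exists, the real attribute is left untouched):

g = torch.cuda.CUDAGraph()   # a MagicMock instance; any method call is absorbed
with torch.cuda.graph(g):    # _noop_graph -> contextlib.nullcontext(); runs eagerly
    y = torch.zeros(2, device="npu")
g.replay()                   # no-op on the mock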