mirror of https://github.com/handsomezhuzhu/vllm-npu-plugin.git (synced 2026-02-20 11:42:30 +00:00)
feat: add CUDA-to-NPU monkey patches for GPUModelRunner compatibility
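For orientation before the diff: vLLM discovers platform plugins through the `vllm.platform_plugins` entry-point group, whose hook returns the fully-qualified platform class name — which is why `register()` below both applies the patches and returns the class path. A hypothetical packaging stanza (not part of this commit; the project name and hook label are assumptions):

# setup.py -- illustrative sketch only
from setuptools import setup

setup(
    name="vllm-npu",
    entry_points={
        "vllm.platform_plugins": [
            "npu = vllm_npu:register",
        ],
    },
)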
@@ -9,4 +9,10 @@ class name of the platform implementation.
 
 def register():
     """Return the fully-qualified name of the NPU platform class."""
+    # Apply CUDA→NPU compatibility patches early so that any code
+    # referencing torch.cuda.Stream / Event / etc. will transparently
+    # be redirected to the torch.npu equivalents.
+    from vllm_npu.cuda_compat import _patch_cuda_to_npu
+    _patch_cuda_to_npu()
+
     return "vllm_npu.platform.NPUPlatform"
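As context for the hunk above, a minimal sketch of what the early patching buys — assuming `register()` is importable from the top-level `vllm_npu` package (as vLLM plugin hooks typically are) and that an Ascend device with `torch_npu` is present:

import torch
import vllm_npu

vllm_npu.register()  # applies _patch_cuda_to_npu() as a side effect

# CUDA-flavored calls now resolve to their NPU counterparts.
assert torch.cuda.Stream is torch.npu.Stream
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):      # delegates to torch.npu.stream()
    x = torch.ones(4, device="npu")
torch.cuda.synchronize()             # waits on the NPU, not CUDA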
vllm_npu/cuda_compat.py (new file, 89 lines)
@@ -0,0 +1,89 @@
+"""
+CUDA-to-NPU Compatibility Layer.
+
+Monkey-patches ``torch.cuda`` APIs so that code written for CUDA
+(e.g. ``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently
+delegates to the corresponding ``torch.npu`` equivalents. This
+allows vLLM's ``GPUModelRunner`` to run on Ascend NPU without
+source modifications.
+"""
+
+import contextlib
+from unittest.mock import MagicMock
+
+import torch
+
+
+def _patch_cuda_to_npu() -> None:
+    """Apply monkey-patches: redirect torch.cuda → torch.npu."""
+    import torch_npu  # noqa: F401
+
+    # ------------------------------------------------------------------
+    # Stream / Event
+    # ------------------------------------------------------------------
+    torch.cuda.Stream = torch.npu.Stream  # type: ignore[attr-defined]
+    torch.cuda.Event = torch.npu.Event  # type: ignore[attr-defined]
+    torch.cuda.current_stream = torch.npu.current_stream  # type: ignore
+    torch.cuda.default_stream = torch.npu.default_stream  # type: ignore
+
+    # torch.cuda.stream() context manager
+    torch.cuda.stream = torch.npu.stream  # type: ignore[attr-defined]
+
+    # ------------------------------------------------------------------
+    # Device management
+    # ------------------------------------------------------------------
+    torch.cuda.set_device = torch.npu.set_device  # type: ignore
+    torch.cuda.synchronize = torch.npu.synchronize  # type: ignore
+    torch.cuda.device_count = torch.npu.device_count  # type: ignore
+    torch.cuda.current_device = torch.npu.current_device  # type: ignore
+    torch.cuda.is_available = lambda: True  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Memory management
+    # ------------------------------------------------------------------
+    torch.cuda.empty_cache = torch.npu.empty_cache  # type: ignore
+    torch.cuda.mem_get_info = torch.npu.mem_get_info  # type: ignore
+    torch.cuda.memory_allocated = torch.npu.memory_allocated  # type: ignore
+    torch.cuda.max_memory_allocated = torch.npu.max_memory_allocated  # type: ignore
+    torch.cuda.memory_reserved = torch.npu.memory_reserved  # type: ignore
+    torch.cuda.max_memory_reserved = torch.npu.max_memory_reserved  # type: ignore
+    torch.cuda.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats  # type: ignore
+    torch.cuda.memory_stats = torch.npu.memory_stats  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Device properties
+    # ------------------------------------------------------------------
+    _real_npu_props = torch.npu.get_device_properties
+
+    def _get_device_properties(device=None):
+        """Return NPU device properties with CUDA-compatible attributes."""
+        props = _real_npu_props(device)
+        # GPUModelRunner accesses .multi_processor_count which may not
+        # exist on NPU. Provide a sensible fallback.
+        if not hasattr(props, "multi_processor_count"):
+            props.multi_processor_count = 1  # type: ignore[attr-defined]
+        if not hasattr(props, "major"):
+            props.major = 9  # type: ignore[attr-defined]
+            props.minor = 0  # type: ignore[attr-defined]
+        return props
+
+    torch.cuda.get_device_properties = _get_device_properties  # type: ignore
+
+    # ------------------------------------------------------------------
+    # Misc
+    # ------------------------------------------------------------------
+    if not hasattr(torch.cuda, "_get_device_index"):
+        torch.cuda._get_device_index = torch.npu._get_device_index  # type: ignore
+
+    # CUDA graph capture is unsupported on NPU, so always replace
+    # torch.cuda.CUDAGraph with a MagicMock stand-in.
+    torch.cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]
+
+    if not hasattr(torch.cuda, "graph"):
+
+        def _noop_graph(*args, **kwargs):
+            """No-op context manager for CUDA graphs on NPU."""
+            return contextlib.nullcontext()
+
+        torch.cuda.graph = _noop_graph  # type: ignore[attr-defined]
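A hypothetical smoke test for the device- and memory-management shims above (none of these lines are part of the commit; they just exercise the patched surface on a machine with `torch_npu` installed):

import torch
from vllm_npu.cuda_compat import _patch_cuda_to_npu

_patch_cuda_to_npu()

torch.cuda.set_device(0)                  # -> torch.npu.set_device
print(torch.cuda.device_count(), torch.cuda.current_device())

free, total = torch.cuda.mem_get_info()  # -> torch.npu.mem_get_info
print(f"free={free} total={total} bytes")

x = torch.empty(1 << 20, device="npu")
print(torch.cuda.memory_allocated())     # reflects NPU allocations
del x
torch.cuda.empty_cache()                 # -> torch.npu.empty_cache
torch.cuda.reset_peak_memory_stats()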
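The `_get_device_properties` wrapper exists because CUDA-oriented callers probe attributes such as `multi_processor_count` and the `major`/`minor` compute capability. A short illustration of the kind of check the fallback values (capability 9.0, one multiprocessor) are meant to satisfy — the threshold here is illustrative, not a specific vLLM code path:

props = torch.cuda.get_device_properties(0)  # patched shim from above

# Capability gates written for CUDA keep working, whether the values
# come from the NPU driver or from the patch's fallbacks.
if (props.major, props.minor) >= (8, 0):
    print("capability check passed:", props.major, props.minor)
print("multiprocessors:", props.multi_processor_count)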
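Finally, the graph stubs let CUDA-graph capture code degrade to eager execution instead of crashing. A sketch, assuming the no-op context manager was installed (note the patch only installs it when `torch.cuda.graph` is absent; on builds where it already exists, the real attribute is left untouched):

g = torch.cuda.CUDAGraph()   # a MagicMock instance; any method call is absorbed
with torch.cuda.graph(g):    # _noop_graph -> contextlib.nullcontext(); runs eagerly
    y = torch.zeros(2, device="npu")
g.replay()                   # no-op on the mock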