Mirror of https://github.com/handsomezhuzhu/vllm-npu-plugin.git (synced 2026-02-20 19:50:15 +00:00)
feat: add CUDA-to-NPU monkey patches for GPUModelRunner compatibility
vllm_npu/cuda_compat.py (new file, 89 lines)
@@ -0,0 +1,89 @@
"""
CUDA-to-NPU Compatibility Layer.

Monkey-patches ``torch.cuda`` APIs so that code written for CUDA
(e.g. ``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently
delegates to the corresponding ``torch.npu`` equivalents. This
allows vLLM's ``GPUModelRunner`` to run on Ascend NPU without
source modifications.
"""

import contextlib
from unittest.mock import MagicMock

import torch


def _patch_cuda_to_npu() -> None:
    """Apply monkey-patches: redirect torch.cuda → torch.npu."""
    import torch_npu  # noqa: F401

    # ------------------------------------------------------------------
    # Stream / Event
    # ------------------------------------------------------------------
    torch.cuda.Stream = torch.npu.Stream  # type: ignore[attr-defined]
    torch.cuda.Event = torch.npu.Event  # type: ignore[attr-defined]
    torch.cuda.current_stream = torch.npu.current_stream  # type: ignore
    torch.cuda.default_stream = torch.npu.default_stream  # type: ignore

    # torch.cuda.stream() context manager
    torch.cuda.stream = torch.npu.stream  # type: ignore[attr-defined]

    # ------------------------------------------------------------------
    # Device management
    # ------------------------------------------------------------------
    torch.cuda.set_device = torch.npu.set_device  # type: ignore
    torch.cuda.synchronize = torch.npu.synchronize  # type: ignore
    torch.cuda.device_count = torch.npu.device_count  # type: ignore
    torch.cuda.current_device = torch.npu.current_device  # type: ignore
    torch.cuda.is_available = lambda: True  # type: ignore

    # ------------------------------------------------------------------
    # Memory management
    # ------------------------------------------------------------------
    torch.cuda.empty_cache = torch.npu.empty_cache  # type: ignore
    torch.cuda.mem_get_info = torch.npu.mem_get_info  # type: ignore
    torch.cuda.memory_allocated = torch.npu.memory_allocated  # type: ignore
    torch.cuda.max_memory_allocated = torch.npu.max_memory_allocated  # type: ignore
    torch.cuda.memory_reserved = torch.npu.memory_reserved  # type: ignore
    torch.cuda.max_memory_reserved = torch.npu.max_memory_reserved  # type: ignore
    torch.cuda.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats  # type: ignore
    torch.cuda.memory_stats = torch.npu.memory_stats  # type: ignore

    # ------------------------------------------------------------------
    # Device properties
    # ------------------------------------------------------------------
    _real_npu_props = torch.npu.get_device_properties

    def _get_device_properties(device=None):
        """Return NPU device properties with CUDA-compatible attributes."""
        props = _real_npu_props(device)
        # GPUModelRunner accesses .multi_processor_count, which may not
        # exist on NPU. Provide a sensible fallback.
        if not hasattr(props, "multi_processor_count"):
            props.multi_processor_count = 1  # type: ignore[attr-defined]
        if not hasattr(props, "major"):
            props.major = 9  # type: ignore[attr-defined]
            props.minor = 0  # type: ignore[attr-defined]
        return props

    torch.cuda.get_device_properties = _get_device_properties  # type: ignore

    # ------------------------------------------------------------------
    # Misc
    # ------------------------------------------------------------------
    if not hasattr(torch.cuda, "_get_device_index"):
        torch.cuda._get_device_index = torch.npu._get_device_index  # type: ignore

    # Graph / CUDAGraph stubs (NPU does not support CUDA graphs).
    # Always override: even when torch.cuda.CUDAGraph exists, it cannot
    # run on NPU, so replace it with an inert stub.
    torch.cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]

    if not hasattr(torch.cuda, "graph"):

        def _noop_graph(*args, **kwargs):
            """No-op context manager standing in for CUDA graph capture."""
            return contextlib.nullcontext()

        torch.cuda.graph = _noop_graph  # type: ignore[attr-defined]
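
For context, a minimal usage sketch follows. The call site shown here is an assumption (the plugin may well invoke this from its own entry point); the one real constraint is that the patch must run before other modules bind `torch.cuda` functions to local names, since it rebinds attributes on the `torch.cuda` module rather than rewriting call sites.

# Minimal usage sketch (hypothetical call site): patch first, so that
# code importing vLLM afterwards sees the rebound torch.cuda attributes.
from vllm_npu.cuda_compat import _patch_cuda_to_npu

_patch_cuda_to_npu()

import torch

stream = torch.cuda.Stream()        # actually constructs a torch.npu.Stream
with torch.cuda.stream(stream):     # delegates to torch.npu.stream
    x = torch.ones(4, device="npu")
torch.cuda.synchronize()            # synchronizes the NPU device
free, total = torch.cuda.mem_get_info()  # NPU free/total memory, in bytes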
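
One caveat worth noting: `_get_device_properties` assigns attributes on the object returned by `torch.npu.get_device_properties`, which on some torch_npu builds may be a read-only C struct. A defensive variant (a sketch, not part of this commit) could fall back to copying the fields into a mutable namespace:

# Hypothetical fallback: if the NPU properties struct rejects attribute
# assignment, mirror its public fields onto a mutable SimpleNamespace.
import types

import torch
import torch_npu  # noqa: F401  (the patch above imports this already)

def _get_device_properties_safe(device=None):
    props = torch.npu.get_device_properties(device)
    try:
        # Probe writability with the attribute GPUModelRunner needs anyway.
        props.multi_processor_count = getattr(
            props, "multi_processor_count", 1)
        return props
    except AttributeError:
        # Read-only struct: copy non-callable public fields.
        fields = {
            k: getattr(props, k)
            for k in dir(props)
            if not k.startswith("_") and not callable(getattr(props, k))
        }
        ns = types.SimpleNamespace(**fields)
        ns.multi_processor_count = getattr(props, "multi_processor_count", 1)
        ns.major = getattr(props, "major", 9)
        ns.minor = getattr(props, "minor", 0)
        return ns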