# vllm-npu-plugin/vllm_npu/cuda_compat.py
"""
CUDA-to-NPU Compatibility Layer.
Monkey-patches ``torch.cuda`` APIs so that code written for CUDA
(e.g. ``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently
delegates to the corresponding ``torch.npu`` equivalents. This
allows vLLM's ``GPUModelRunner`` to run on Ascend NPU without
source modifications.
"""
import contextlib
import types
from unittest.mock import MagicMock

import torch


def _patch_cuda_to_npu() -> None:
    """Apply monkey-patches: redirect ``torch.cuda`` -> ``torch.npu``."""
    import torch_npu  # noqa: F401  (registers the ``torch.npu`` namespace)
    # ------------------------------------------------------------------
    # Stream / Event
    # ------------------------------------------------------------------
    torch.cuda.Stream = torch.npu.Stream  # type: ignore[attr-defined]
    torch.cuda.Event = torch.npu.Event  # type: ignore[attr-defined]
    torch.cuda.current_stream = torch.npu.current_stream  # type: ignore
    torch.cuda.default_stream = torch.npu.default_stream  # type: ignore
    # torch.cuda.stream() context manager
    torch.cuda.stream = torch.npu.stream  # type: ignore[attr-defined]
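
    # After patching, CUDA-idiom code such as the following runs on NPU
    # unchanged (an illustrative sketch, not code from GPUModelRunner):
    #
    #     s = torch.cuda.Stream()          # really torch.npu.Stream()
    #     with torch.cuda.stream(s):       # really torch.npu.stream(s)
    #         out = model(x)
    #     torch.cuda.current_stream().synchronize()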
    # ------------------------------------------------------------------
    # Device management
    # ------------------------------------------------------------------
    torch.cuda.set_device = torch.npu.set_device  # type: ignore
    torch.cuda.synchronize = torch.npu.synchronize  # type: ignore
    torch.cuda.device_count = torch.npu.device_count  # type: ignore
    torch.cuda.current_device = torch.npu.current_device  # type: ignore
    # Delegate rather than hard-coding True so the answer reflects the
    # actual NPU runtime state.
    torch.cuda.is_available = torch.npu.is_available  # type: ignore
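
    # Sanity pattern that generic device-probing code can now use
    # (illustrative):
    #
    #     assert torch.cuda.is_available()
    #     n = torch.cuda.device_count()
    #     torch.cuda.set_device(n - 1)  # integer indices pass through;
    #                                   # "cuda:0"-style strings are NOT
    #                                   # translated to "npu:0" here.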
    # ------------------------------------------------------------------
    # Memory management
    # ------------------------------------------------------------------
    torch.cuda.empty_cache = torch.npu.empty_cache  # type: ignore
    torch.cuda.mem_get_info = torch.npu.mem_get_info  # type: ignore
    torch.cuda.memory_allocated = torch.npu.memory_allocated  # type: ignore
    torch.cuda.max_memory_allocated = torch.npu.max_memory_allocated  # type: ignore
    torch.cuda.memory_reserved = torch.npu.memory_reserved  # type: ignore
    torch.cuda.max_memory_reserved = torch.npu.max_memory_reserved  # type: ignore
    torch.cuda.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats  # type: ignore
    torch.cuda.memory_stats = torch.npu.memory_stats  # type: ignore
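
    # vLLM sizes the KV cache from these hooks during memory profiling;
    # the call pattern looks roughly like this (illustrative sketch, not
    # vLLM's exact code):
    #
    #     free_bytes, total_bytes = torch.cuda.mem_get_info()
    #     peak = torch.cuda.max_memory_allocated()
    #     torch.cuda.reset_peak_memory_stats()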
    # ------------------------------------------------------------------
    # Device properties
    # ------------------------------------------------------------------
    _real_npu_props = torch.npu.get_device_properties

    def _get_device_properties(device=None):
        """Return NPU device properties with CUDA-compatible attributes."""
        props = _real_npu_props(device)
        # The native properties struct may be read-only, so copy its public
        # attributes into a mutable namespace before filling in gaps.
        ns = types.SimpleNamespace(**{
            name: getattr(props, name)
            for name in dir(props) if not name.startswith("_")
        })
        # GPUModelRunner accesses .multi_processor_count, which may not
        # exist on NPU. Provide a sensible fallback.
        if not hasattr(ns, "multi_processor_count"):
            ns.multi_processor_count = 1
        # Fake a CUDA compute capability for code that branches on it.
        if not hasattr(ns, "major"):
            ns.major = 9
            ns.minor = 0
        return ns

    torch.cuda.get_device_properties = _get_device_properties  # type: ignore
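
    # What a caller observes after the patch (illustrative):
    #
    #     p = torch.cuda.get_device_properties(0)
    #     p.multi_processor_count  # real value, or the fallback 1
    #     (p.major, p.minor)       # real values, or the faked (9, 0)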
    # ------------------------------------------------------------------
    # Misc
    # ------------------------------------------------------------------
    if not hasattr(torch.cuda, "_get_device_index"):
        torch.cuda._get_device_index = torch.npu._get_device_index  # type: ignore
    # Graph / CUDAGraph stubs (NPU does not support CUDA graphs).
    # Always override: even where these attributes exist, they would try
    # to drive the absent CUDA runtime.
    torch.cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]

    def _noop_graph(*args, **kwargs):
        """No-op context manager standing in for ``torch.cuda.graph``."""
        return contextlib.nullcontext()

    torch.cuda.graph = _noop_graph  # type: ignore[attr-defined]
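

if __name__ == "__main__":
    # Minimal smoke test: a sketch that assumes torch_npu is installed and
    # at least one Ascend NPU is visible.
    _patch_cuda_to_npu()
    print("is_available:", torch.cuda.is_available())
    print("device_count:", torch.cuda.device_count())
    props = torch.cuda.get_device_properties(0)
    print("multi_processor_count:", props.multi_processor_count)
    with torch.cuda.graph():  # the no-op stub above; must not raise
        pass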