"""CUDA-to-NPU Compatibility Layer.

Monkey-patches ``torch.cuda`` APIs so that code written for CUDA (e.g.
``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently delegates to
the corresponding ``torch.npu`` equivalents. This allows vLLM's
``GPUModelRunner`` to run on Ascend NPU without source modifications.
"""

import types
from unittest.mock import MagicMock

import torch


def _patch_cuda_to_npu() -> None:
    """Apply monkey-patches: redirect torch.cuda → torch.npu.

    Importing ``torch_npu`` is required before ``torch.npu`` is touched, so
    this function raises ``ImportError`` when that package is not installed.
    All patches mutate the global ``torch.cuda`` module in place; nothing is
    returned.

    Patched groups, applied in this order:

    * stream / event primitives,
    * device management (``is_available`` is forced to return ``True``),
    * memory management and statistics,
    * ``get_device_properties`` (wrapped to add CUDA-only attributes),
    * ``_get_device_index``, ``CUDAGraph`` and ``graph`` stubs.
    """
    import torch_npu  # noqa: F401

    def _alias(names):
        """Point ``torch.cuda.<name>`` at ``torch.npu.<name>`` for each name."""
        for name in names:
            setattr(torch.cuda, name, getattr(torch.npu, name))

    # ------------------------------------------------------------------
    # Stream / Event
    # ------------------------------------------------------------------
    # ``stream`` (lower-case) is the torch.cuda.stream() context manager.
    _alias(("Stream", "Event", "current_stream", "default_stream", "stream"))

    # ------------------------------------------------------------------
    # Device management
    # ------------------------------------------------------------------
    _alias(("set_device", "synchronize", "device_count", "current_device"))
    # Callers that gate on CUDA availability must take the "GPU" code path.
    torch.cuda.is_available = lambda: True  # type: ignore

    # ------------------------------------------------------------------
    # Memory management
    # ------------------------------------------------------------------
    _alias(
        (
            "empty_cache",
            "mem_get_info",
            "memory_allocated",
            "max_memory_allocated",
            "memory_reserved",
            "max_memory_reserved",
            "reset_peak_memory_stats",
            "memory_stats",
        )
    )

    # ------------------------------------------------------------------
    # Device properties
    # ------------------------------------------------------------------
    # Capture the NPU implementation itself (not torch.cuda's) so the wrapper
    # never ends up calling itself, even if this function runs twice.
    _real_npu_props = torch.npu.get_device_properties

    def _get_device_properties(device=None):
        """Return NPU device properties with CUDA-compatible attributes."""
        props = _real_npu_props(device)
        # GPUModelRunner accesses .multi_processor_count which may not
        # exist on NPU. Provide a sensible fallback.
        if not hasattr(props, "multi_processor_count"):
            props.multi_processor_count = 1  # type: ignore[attr-defined]
        # NOTE(review): 9.0 looks like a placeholder CUDA compute capability;
        # confirm which value downstream capability checks expect.
        if not hasattr(props, "major"):
            props.major = 9  # type: ignore[attr-defined]
            props.minor = 0  # type: ignore[attr-defined]
        return props

    torch.cuda.get_device_properties = _get_device_properties  # type: ignore

    # ------------------------------------------------------------------
    # Misc
    # ------------------------------------------------------------------
    if not hasattr(torch.cuda, "_get_device_index"):
        torch.cuda._get_device_index = torch.npu._get_device_index  # type: ignore

    # graph / CUDAGraph stubs (NPU does not support CUDA graphs)
    # CUDAGraph is replaced unconditionally: any existing torch.cuda.CUDAGraph
    # is overridden so that instantiating it yields an inert MagicMock.
    torch.cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]

    # Unlike CUDAGraph, an existing torch.cuda.graph is left in place; the
    # no-op below is only installed when the attribute is missing.
    if not hasattr(torch.cuda, "graph"):

        def _noop_graph(*args, **kwargs):
            """No-op context manager for CUDA graphs on NPU."""
            import contextlib

            return contextlib.nullcontext()

        torch.cuda.graph = _noop_graph  # type: ignore[attr-defined]