"""
CUDA-to-NPU Compatibility Layer.

Monkey-patches ``torch.cuda`` APIs so that code written for CUDA
(e.g. ``torch.cuda.Stream()``, ``torch.cuda.Event()``) transparently
delegates to the corresponding ``torch.npu`` equivalents. This
allows vLLM's ``GPUModelRunner`` to run on Ascend NPU without
source modifications.
"""

import types
from unittest.mock import MagicMock

import torch


def _patch_cuda_to_npu() -> None:
    """Apply monkey-patches: redirect ``torch.cuda`` → ``torch.npu``.

    The following attributes of ``torch.cuda`` are overwritten in place:

    * Stream/event, device-management and memory-management entry points
      are replaced by the same-named attributes of ``torch.npu``.
    * ``is_available`` is replaced by a function that always returns
      ``True``.
    * ``get_device_properties`` is replaced by a wrapper around the NPU
      implementation that fills in ``multi_processor_count``, ``major``
      and ``minor`` when the NPU properties object lacks them.
    * ``_get_device_index`` is copied from ``torch.npu`` only if
      ``torch.cuda`` does not already have it.
    * ``CUDAGraph`` is always replaced by ``MagicMock``; ``graph`` is
      replaced by a factory returning ``contextlib.nullcontext()`` only
      if ``torch.cuda`` has no ``graph`` attribute.

    Raises:
        ImportError: if ``torch_npu`` cannot be imported.
        AttributeError: if ``torch.npu`` lacks one of the aliased names.
    """
    # Imported for its side effects only (presumably this is what makes
    # ``torch.npu`` available -- confirm against torch_npu docs).
    import torch_npu  # noqa: F401

    cuda = torch.cuda
    npu = torch.npu  # type: ignore[attr-defined]

    def _alias(names):
        """Point ``torch.cuda.<name>`` at ``torch.npu.<name>`` for each name."""
        for name in names:
            setattr(cuda, name, getattr(npu, name))

    # ------------------------------------------------------------------
    # Stream / Event (``stream`` is the context-manager form)
    # ------------------------------------------------------------------
    _alias((
        "Stream",
        "Event",
        "current_stream",
        "default_stream",
        "stream",
    ))

    # ------------------------------------------------------------------
    # Device management
    # ------------------------------------------------------------------
    _alias((
        "set_device",
        "synchronize",
        "device_count",
        "current_device",
    ))
    # Callers that gate on CUDA availability must take the "GPU" path.
    cuda.is_available = lambda: True  # type: ignore

    # ------------------------------------------------------------------
    # Memory management
    # ------------------------------------------------------------------
    _alias((
        "empty_cache",
        "mem_get_info",
        "memory_allocated",
        "max_memory_allocated",
        "memory_reserved",
        "max_memory_reserved",
        "reset_peak_memory_stats",
        "memory_stats",
    ))

    # ------------------------------------------------------------------
    # Device properties
    # ------------------------------------------------------------------
    # Captured once so the wrapper keeps calling the NPU implementation
    # that was installed at patch time.
    _real_npu_props = npu.get_device_properties

    # CUDA-only attributes and the value reported when the NPU properties
    # object does not provide them.
    # NOTE(review): major/minor presumably stand in for a CUDA compute
    # capability of 9.0 -- confirm which capability checks this must pass.
    _cuda_only_defaults = (
        ("multi_processor_count", 1),
        ("major", 9),
        ("minor", 0),
    )

    class _PropsWithFallbacks:
        """Read-through view of a properties object plus fallback values.

        Used only when the underlying object refuses new attributes.
        """

        def __init__(self, wrapped, fallbacks):
            self._wrapped = wrapped
            self._fallbacks = fallbacks

        def __getattr__(self, name):
            # Reached only when normal lookup fails.  Refuse the two
            # storage slots so a half-built instance cannot recurse.
            if name in ("_wrapped", "_fallbacks"):
                raise AttributeError(name)
            try:
                return getattr(self._wrapped, name)
            except AttributeError:
                if name in self._fallbacks:
                    return self._fallbacks[name]
                raise

        def __repr__(self):
            return repr(self._wrapped)

    def _get_device_properties(device=None):
        """Return NPU device properties with CUDA-compatible attributes.

        Each fallback is applied independently, and only when the
        attribute is absent, so values reported by the NPU backend are
        never overwritten.
        """
        props = _real_npu_props(device)
        missing = {
            name: value
            for name, value in _cuda_only_defaults
            if not hasattr(props, name)
        }
        try:
            for name, value in missing.items():
                setattr(props, name, value)
        except (AttributeError, TypeError):
            # The properties object does not accept new attributes;
            # expose the fallbacks through a proxy instead of failing.
            return _PropsWithFallbacks(props, missing)
        return props

    cuda.get_device_properties = _get_device_properties  # type: ignore

    # ------------------------------------------------------------------
    # Misc
    # ------------------------------------------------------------------
    if not hasattr(cuda, "_get_device_index"):
        cuda._get_device_index = npu._get_device_index  # type: ignore

    # graph / CUDAGraph stubs (NPU does not support CUDA graphs).
    # The stub is installed unconditionally: the previous guard ended in
    # ``or True`` and therefore always ran.
    cuda.CUDAGraph = MagicMock  # type: ignore[attr-defined]

    # NOTE(review): stock PyTorch already exposes ``torch.cuda.graph``, so
    # this fallback presumably never triggers -- confirm whether the no-op
    # should be forced like ``CUDAGraph`` above.
    if not hasattr(cuda, "graph"):
        import contextlib

        def _noop_graph(*args, **kwargs):
            """No-op context manager for CUDA graphs on NPU."""
            return contextlib.nullcontext()

        cuda.graph = _noop_graph  # type: ignore[attr-defined]