feat: Add Ascend NPU attention backend with NPU-specific FlashAttention, LayerNorm, and Rotary Embedding implementations.

commit 4ca9d52cf2
parent 3aebca03d9
Date: 2026-02-10 21:56:45 +08:00
4 changed files with 119 additions and 55 deletions

@@ -15,4 +15,25 @@ def register():
    from vllm_npu.cuda_compat import _patch_cuda_to_npu
    _patch_cuda_to_npu()
    # Register NPU custom ops with vLLM's CustomOp dispatch so that
    # ops like SiluAndMul, RMSNorm, RotaryEmbedding use NPU kernels
    # instead of falling back to CUDA (which would produce garbage).
    _register_npu_ops()
    return "vllm_npu.platform.NPUPlatform"


def _register_npu_ops():
    """Register Ascend NPU op overrides with vLLM's CustomOp system."""
    from vllm.model_executor.custom_op import CustomOp

    from vllm_npu.ops.activation import AscendSiluAndMul
    from vllm_npu.ops.layernorm import AscendRMSNorm
    from vllm_npu.ops.rotary_embedding import AscendRotaryEmbedding

    for name, op_cls in {
        "SiluAndMul": AscendSiluAndMul,
        "RMSNorm": AscendRMSNorm,
        "RotaryEmbedding": AscendRotaryEmbedding,
    }.items():
        CustomOp.register_oot(_decorated_op_cls=op_cls, name=name)
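
The Ascend classes registered above are expected to subclass the matching vLLM layers and provide a forward_oot method, which is the hook vLLM's CustomOp dispatch calls on out-of-tree platforms once register_oot has run. Below is a minimal sketch of what vllm_npu/ops/layernorm.py's AscendRMSNorm could look like; the vLLM RMSNorm base class, its weight/variance_epsilon attributes, and the torch_npu.npu_rms_norm signature are assumptions for illustration, not details taken from this commit.

from typing import Optional

import torch
from vllm.model_executor.layers.layernorm import RMSNorm  # assumed base class


class AscendRMSNorm(RMSNorm):
    """RMSNorm override routed to an Ascend kernel via CustomOp's forward_oot hook."""

    def forward_oot(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        import torch_npu  # assumed Ascend extension providing a fused RMSNorm kernel

        if residual is not None:
            # Follow the base layer's fused add+norm contract: fold the residual
            # in first and return the updated residual alongside the output.
            x = x + residual
            residual = x
        # npu_rms_norm is assumed to return (normalized_output, inverse_rms).
        out, _ = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon)
        return out if residual is None else (out, residual)

Under this scheme, model code keeps instantiating RMSNorm as usual; after the registration loop above, the CustomOp dispatch substitutes the Ascend override when running on the NPU platform.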