From f49538ea8dcf81f54aea45e0e995619af380fa7c Mon Sep 17 00:00:00 2001
From: handsomezhuzhu <2658601135@qq.com>
Date: Tue, 10 Feb 2026 23:17:30 +0800
Subject: [PATCH] fix: Update MLAAttention import logic for vllm version compatibility

Replace the exact vllm_version_is("0.11.0") check with a try/except
import probe, so the correct MLA attention classes are selected across
vllm versions without depending on vllm_npu.utils.vllm_version_is.

---
 vllm_npu/models/layers/mla.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm_npu/models/layers/mla.py b/vllm_npu/models/layers/mla.py
index b50422e..5cb7d91 100644
--- a/vllm_npu/models/layers/mla.py
+++ b/vllm_npu/models/layers/mla.py
@@ -32,15 +32,17 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.utils import direct_register_custom_op
 from vllm_npu.ascend_config import get_ascend_config
-from vllm_npu.utils import vllm_version_is
 
-if vllm_version_is("0.11.0"):
+try:
+    # vllm 0.11.0: MLA uses the unified Attention layer (no separate MLAAttention)
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
-else:
+    _VLLM_HAS_UNIFIED_ATTENTION = True
+except ImportError:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    _VLLM_HAS_UNIFIED_ATTENTION = False
 
 
 # TODO(whx): adapt v0.11.0 and DSA
@@ -78,7 +80,7 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.layers = hf_config.num_hidden_layers
 
-        if vllm_version_is("0.11.0"):
+        if _VLLM_HAS_UNIFIED_ATTENTION:
             self.mla_attn = Attention(
                 num_heads=num_heads,
                 head_size=self.kv_lora_rank + self.qk_rope_head_dim,