fix: Update MLAAttention import logic for vllm version compatibility

commit f49538ea8d
parent 5df056dd17
Date: 2026-02-10 23:17:30 +08:00


@@ -32,15 +32,17 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.utils import direct_register_custom_op
 from vllm_npu.ascend_config import get_ascend_config
-from vllm_npu.utils import vllm_version_is
-if vllm_version_is("0.11.0"):
+try:
+    # vllm >= 0.11.0: MLAAttention merged into Attention
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
-else:
+    _VLLM_HAS_UNIFIED_ATTENTION = True
+except ImportError:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    _VLLM_HAS_UNIFIED_ATTENTION = False
 # TODO(whx): adapt v0.11.0 and DSA
@@ -78,7 +80,7 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.layers = hf_config.num_hidden_layers
-        if vllm_version_is("0.11.0"):
+        if _VLLM_HAS_UNIFIED_ATTENTION:
             self.mla_attn = Attention(
                 num_heads=num_heads,
                 head_size=self.kv_lora_rank + self.qk_rope_head_dim,
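
For context, the change replaces a version-string check (vllm_version_is("0.11.0")) with feature detection: attempt the newer import path, fall back to the older one on ImportError, and record which branch succeeded in a module-level flag (_VLLM_HAS_UNIFIED_ATTENTION) that downstream code branches on instead of re-checking the version. Below is a minimal, self-contained sketch of that pattern, using functools.cache (Python >= 3.9) as a stand-in for the vLLM classes so it runs without vLLM installed; the names _memoize and _HAS_NEW_API are illustrative only, not part of this repository.

    # Sketch of import-based feature detection, mirroring the commit's approach.
    # functools.cache (added in Python 3.9) stands in for the "new" API location
    # and functools.lru_cache for the "old" one; names here are illustrative only.
    try:
        from functools import cache as _memoize  # newer API, preferred when present
        _HAS_NEW_API = True
    except ImportError:
        from functools import lru_cache           # older fallback with the same effect
        _memoize = lru_cache(maxsize=None)
        _HAS_NEW_API = False


    @_memoize
    def square(x: int) -> int:
        """Trivial cached function used only to exercise whichever import won."""
        return x * x


    if __name__ == "__main__":
        # Downstream code branches on the flag, much as AscendMultiHeadLatentAttention
        # branches on _VLLM_HAS_UNIFIED_ATTENTION instead of re-checking the version.
        print(square(7), "new API" if _HAS_NEW_API else "fallback API")

Detecting by import rather than comparing version strings keeps the module working on builds where the version number alone does not reliably indicate which module layout is present.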