Mirror of https://github.com/handsomezhuzhu/vllm-npu-plugin.git (synced 2026-02-20 19:50:15 +00:00)
fix: Update MLAAttention import logic for vllm version compatibility
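The commit replaces a hard-coded version check (vllm_version_is("0.11.0")) with an import probe: the new-style symbols are imported inside a try block and a module-level flag, _VLLM_HAS_UNIFIED_ATTENTION, records which import path succeeded. Below is a minimal, self-contained sketch of the same probe-and-flag pattern, written against a standard-library module so it runs without vLLM installed; the names pkg_metadata and _HAS_STDLIB_METADATA are purely illustrative and are not part of this repository.

# Probe-and-flag pattern: try the newer import location first, fall back to the
# older one, and record the outcome in a flag that later code can branch on.
try:
    # Python >= 3.8 ships importlib.metadata in the standard library.
    from importlib import metadata as pkg_metadata
    _HAS_STDLIB_METADATA = True
except ImportError:
    # Older interpreters rely on the importlib_metadata backport package.
    import importlib_metadata as pkg_metadata
    _HAS_STDLIB_METADATA = False

if __name__ == "__main__":
    # Later code branches on the flag instead of parsing version strings.
    print("stdlib importlib.metadata available:", _HAS_STDLIB_METADATA)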
@@ -32,15 +32,17 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.utils import direct_register_custom_op
 
 from vllm_npu.ascend_config import get_ascend_config
-from vllm_npu.utils import vllm_version_is
 
-if vllm_version_is("0.11.0"):
+try:
+    # vllm >= 0.11.0: MLAAttention merged into Attention
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
-else:
+    _VLLM_HAS_UNIFIED_ATTENTION = True
+except ImportError:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    _VLLM_HAS_UNIFIED_ATTENTION = False
 
 
 # TODO(whx): adapt v0.11.0 and DSA
@@ -78,7 +80,7 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.layers = hf_config.num_hidden_layers
 
-        if vllm_version_is("0.11.0"):
+        if _VLLM_HAS_UNIFIED_ATTENTION:
             self.mla_attn = Attention(
                 num_heads=num_heads,
                 head_size=self.kv_lora_rank + self.qk_rope_head_dim,
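After this change, construction-time code branches on _VLLM_HAS_UNIFIED_ATTENTION, which is set once when the imports are resolved, instead of re-evaluating vllm_version_is("0.11.0"). The practical effect is that the unified Attention path is taken on any vLLM build that actually exposes the merged layer, not only on builds whose version string is exactly 0.11.0, while older builds fall back to the dedicated MLAAttention layer.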