diff --git a/vllm_npu/attention/attention_v1.py b/vllm_npu/attention/attention_v1.py
index 859df09..3c0d687 100644
--- a/vllm_npu/attention/attention_v1.py
+++ b/vllm_npu/attention/attention_v1.py
@@ -83,6 +83,7 @@ class AscendAttentionBackend(AttentionBackend):
         block_size: int,
         num_kv_heads: int,
         head_size: int,
+        **kwargs,
     ) -> Tuple[int, int, int, int]:
         """KV cache shape: (num_blocks, block_size, num_kv_heads, head_size).
@@ -91,6 +92,7 @@ class AscendAttentionBackend(AttentionBackend):
         """
         return (num_blocks, block_size, num_kv_heads, head_size)
 
+    @staticmethod
     def swap_blocks(
         src_kv_cache: List[torch.Tensor],
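
For context, a minimal sketch of why the two changes matter. The stub class, shape values, and the extra `cache_dtype` keyword below are hypothetical illustrations, not part of the patch: without `@staticmethod`, Python would bind the instance as `swap_blocks`'s first positional argument, and without `**kwargs`, any extra keyword from a caller would raise a `TypeError`.

```python
from typing import List, Tuple

import torch


class AscendAttentionBackendSketch:
    """Hypothetical stand-in mirroring only the patched signatures."""

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
        **kwargs,  # the patch adds this: extra keywords no longer raise
    ) -> Tuple[int, int, int, int]:
        return (num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod  # the patch adds this decorator
    def swap_blocks(
        src_kv_cache: List[torch.Tensor],
        dst_kv_cache: List[torch.Tensor],
        src_to_dst: torch.Tensor,
    ) -> None:
        # The real implementation copies KV-cache blocks between devices;
        # elided here, since only the binding behavior is being shown.
        pass


# get_kv_cache_shape now tolerates caller-supplied extras:
shape = AscendAttentionBackendSketch.get_kv_cache_shape(
    num_blocks=16, block_size=128, num_kv_heads=8, head_size=64,
    cache_dtype="auto",  # hypothetical extra kwarg, silently ignored
)
assert shape == (16, 128, 8, 64)

# With @staticmethod, an instance call no longer injects `self` into
# src_kv_cache, so positional arguments line up as declared:
backend = AscendAttentionBackendSketch()
backend.swap_blocks([torch.zeros(2, 4)], [torch.zeros(2, 4)],
                    torch.empty(0, 2, dtype=torch.long))
```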