From 30cf7ccd1ff15fadc2f8505e6529951772fb61df Mon Sep 17 00:00:00 2001
From: handsomezhuzhu <2658601135@qq.com>
Date: Tue, 10 Feb 2026 20:29:18 +0800
Subject: [PATCH] fix: revert to _npu_reshape_and_cache (contiguous) and
 _npu_flash_attention

---
 vllm_npu/attention/attention_v1.py | 64 +++++++++++++++++-------------
 1 file changed, 36 insertions(+), 28 deletions(-)

diff --git a/vllm_npu/attention/attention_v1.py b/vllm_npu/attention/attention_v1.py
index 7b3cfbd..f3a9894 100644
--- a/vllm_npu/attention/attention_v1.py
+++ b/vllm_npu/attention/attention_v1.py
@@ -322,21 +322,24 @@ class AscendAttentionBackendImpl(AttentionImpl):
         Matches Huawei vllm-ascend: splits kv_cache[0]/[1] and writes via
         slot_mapping indices.
         """
+        import torch_npu  # noqa: F401
+
         if kv_cache.numel() > 0:
             if self._key_cache is None:
                 self._key_cache, self._value_cache = kv_cache[0], kv_cache[1]
 
-            slots = attn_metadata.slot_mapping
-            key_to_cache = key[:attn_metadata.num_actual_tokens]
-            val_to_cache = value[:attn_metadata.num_actual_tokens]
+            # Ensure contiguous tensors for the NPU op
+            key = key.contiguous()
+            value = value.contiguous()
+            slots = attn_metadata.slot_mapping.long()  # indices must be long
 
-            # Use pure-PyTorch indexing (ATB reshape_and_cache may fail
-            # depending on environment; this is functionally identical)
-            block_size = self._key_cache.shape[1]
-            block_idx = slots // block_size
-            block_offset = slots % block_size
-            self._key_cache[block_idx, block_offset] = key_to_cache
-            self._value_cache[block_idx, block_offset] = val_to_cache
+            torch_npu._npu_reshape_and_cache(
+                key,
+                value,
+                self._key_cache,
+                self._value_cache,
+                slots,
+            )
 
         return key, value
 
@@ -450,28 +453,33 @@ class AscendAttentionBackendImpl(AttentionImpl):
         output: torch.Tensor,
         num_tokens: int,
     ) -> torch.Tensor:
-        """Prefill attention without KV cache (self-attention)."""
+        """Prefill attention without KV cache (self-attention) using _npu_flash_attention."""
         import torch_npu  # noqa: F401
 
-        cum_seq_len = attn_metadata.query_start_loc[1:].tolist()
-
-        attn_out = torch_npu.npu_fusion_attention(
-            query[:num_tokens],
-            key[:num_tokens],
-            value[:num_tokens],
-            head_num=self.num_heads,
-            input_layout="TND",
-            scale=self.scale,
-            sparse_mode=0,
-            atten_mask=attn_metadata.attn_mask,
-            pre_tockens=2147483647,
-            next_tockens=0,
-            actual_seq_qlen=cum_seq_len,
-            actual_seq_kvlen=cum_seq_len,
+        # Huawei uses _npu_flash_attention for prefill.
+        # Ensure contiguous inputs for the NPU op.
+        query = query.contiguous()
+        key = key.contiguous()
+        value = value.contiguous()
+
+        # The attention mask comes from attn_metadata.attn_mask, built by
+        # AscendMetadataBuilder; it is assumed to already be in the layout
+        # _npu_flash_attention expects, so it is passed through here
+        # without any further reshaping or casting.
+
+        torch_npu._npu_flash_attention(
+            query=query,
+            key=key,
+            value=value,
+            mask=attn_metadata.attn_mask,
+            seq_len=attn_metadata.seq_lens,
+            scale_value=self.scale,
+            num_heads=self.num_heads,
+            num_kv_heads=self.num_kv_heads,
+            out=output,
         )
 
-        output[:num_tokens] = attn_out[0]
-        return output
+        return output[:num_tokens]
 
     # -----------------------------------------------------------------
     # Chunked prefill — mixed prefill+decode via npu_fusion_attention
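
For reference, a minimal pure-PyTorch sketch of the KV-cache write that the
first hunk delegates to torch_npu._npu_reshape_and_cache; it mirrors the
indexing fallback removed by this patch. The helper name
reference_reshape_and_cache and the toy shapes are illustrative only, assuming
the paged layout implied by the deleted code (key_cache/value_cache shaped
[num_blocks, block_size, ...] and slot_mapping holding one flat slot index per
token):

    import torch

    def reference_reshape_and_cache(key, value, key_cache, value_cache, slot_mapping):
        """Scatter per-token K/V rows into the paged cache via flat slot indices."""
        block_size = key_cache.shape[1]
        slots = slot_mapping.long()
        block_idx = slots // block_size      # which block each token lands in
        block_offset = slots % block_size    # position of the token inside that block
        key_cache[block_idx, block_offset] = key
        value_cache[block_idx, block_offset] = value

    if __name__ == "__main__":
        # Toy sizes; real caches carry extra head/head-size axes, which are
        # flattened into a single trailing dimension here for brevity.
        num_blocks, block_size, kv_dim = 4, 16, 8
        key_cache = torch.zeros(num_blocks, block_size, kv_dim)
        value_cache = torch.zeros(num_blocks, block_size, kv_dim)
        key = torch.randn(3, kv_dim)
        value = torch.randn(3, kv_dim)
        slot_mapping = torch.tensor([0, 1, 17])  # slots 0,1 -> block 0; slot 17 -> block 1, offset 1
        reference_reshape_and_cache(key, value, key_cache, value_cache, slot_mapping)
        assert torch.allclose(key_cache[1, 1], key[2])

The scatter semantics are the same whether the trailing cache dimensions are
flattened or kept as separate head and head-size axes.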