fix: replace ATB reshape_and_cache with pure PyTorch indexing
@@ -351,13 +351,15 @@ class AscendAttentionBackendImpl(AttentionImpl):
         self._key_cache, self._value_cache = kv_cache.unbind(0)

         slots = attn_metadata.slot_mapping
-        torch_npu._npu_reshape_and_cache(
-            key=key[:num_actual_tokens],
-            value=value[:num_actual_tokens],
-            key_cache=self._key_cache,
-            value_cache=self._value_cache,
-            slot_indices=slots,
-        )
+        # Pure PyTorch reshape_and_cache (avoids ATB dependency)
+        key_to_cache = key[:num_actual_tokens]
+        val_to_cache = value[:num_actual_tokens]
+        block_size = self._key_cache.shape[1]
+        block_idx = slots // block_size
+        block_offset = slots % block_size
+        self._key_cache[block_idx, block_offset] = key_to_cache
+        self._value_cache[block_idx, block_offset] = val_to_cache
+

         # ----------------------------------------------------------
         # Step 2: Compute attention
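For reference, below is a minimal, self-contained sketch of the indexing trick the new code relies on. It assumes the common paged-KV layout [num_blocks, block_size, num_heads, head_dim] (the diff itself only guarantees that dimension 1 of the cache is the block size); the concrete shapes, slot values, and constant names are illustrative and not taken from the plugin.

```python
# Sketch only: assumed cache geometry, not the plugin's real configuration.
import torch

NUM_BLOCKS, BLOCK_SIZE = 4, 16          # assumed paged-KV geometry
NUM_HEADS, HEAD_DIM = 2, 8
NUM_TOKENS = 5

# Assumed layout: [num_blocks, block_size, num_heads, head_dim]
key_cache = torch.zeros(NUM_BLOCKS, BLOCK_SIZE, NUM_HEADS, HEAD_DIM)
value_cache = torch.zeros_like(key_cache)

key = torch.randn(NUM_TOKENS, NUM_HEADS, HEAD_DIM)
value = torch.randn(NUM_TOKENS, NUM_HEADS, HEAD_DIM)

# slot_mapping assigns each token a flat slot index into the paged cache
slots = torch.tensor([3, 17, 18, 40, 63])

# Same arithmetic as the diff: split a flat slot into (block, offset)
block_idx = slots // BLOCK_SIZE
block_offset = slots % BLOCK_SIZE

# Advanced indexing scatters every token into its slot in one assignment
key_cache[block_idx, block_offset] = key
value_cache[block_idx, block_offset] = value

# Spot check: slot 17 -> block 1, offset 1 holds token 1's key
assert torch.equal(key_cache[1, 1], key[1])
```

The advanced-indexing assignment writes each token row into its (block, offset) slot in a single call, which is the effect the removed torch_npu._npu_reshape_and_cache op provided on the ATB path; the only layout assumption the diff carries over is that self._key_cache.shape[1] really is the block size.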