fix: replace ATB reshape_and_cache with pure PyTorch indexing
@@ -351,13 +351,15 @@ class AscendAttentionBackendImpl(AttentionImpl):
         self._key_cache, self._value_cache = kv_cache.unbind(0)

         slots = attn_metadata.slot_mapping
-        torch_npu._npu_reshape_and_cache(
-            key=key[:num_actual_tokens],
-            value=value[:num_actual_tokens],
-            key_cache=self._key_cache,
-            value_cache=self._value_cache,
-            slot_indices=slots,
-        )
+        # Pure PyTorch reshape_and_cache (avoids ATB dependency)
+        key_to_cache = key[:num_actual_tokens]
+        val_to_cache = value[:num_actual_tokens]
+        block_size = self._key_cache.shape[1]
+        block_idx = slots // block_size
+        block_offset = slots % block_size
+        self._key_cache[block_idx, block_offset] = key_to_cache
+        self._value_cache[block_idx, block_offset] = val_to_cache
+

         # ----------------------------------------------------------
         # Step 2: Compute attention
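For reference, below is a minimal, self-contained sketch of the indexing trick the new code relies on. It assumes the common paged-KV layout [num_blocks, block_size, num_heads, head_dim] (the diff itself only guarantees that dimension 1 of the cache is the block size); the concrete shapes, slot values, and constant names are illustrative and not taken from the plugin.

```python
# Sketch only: assumed cache geometry, not the plugin's real configuration.
import torch

NUM_BLOCKS, BLOCK_SIZE = 4, 16          # assumed paged-KV geometry
NUM_HEADS, HEAD_DIM = 2, 8
NUM_TOKENS = 5

# Assumed layout: [num_blocks, block_size, num_heads, head_dim]
key_cache = torch.zeros(NUM_BLOCKS, BLOCK_SIZE, NUM_HEADS, HEAD_DIM)
value_cache = torch.zeros_like(key_cache)

key = torch.randn(NUM_TOKENS, NUM_HEADS, HEAD_DIM)
value = torch.randn(NUM_TOKENS, NUM_HEADS, HEAD_DIM)

# slot_mapping assigns each token a flat slot index into the paged cache
slots = torch.tensor([3, 17, 18, 40, 63])

# Same arithmetic as the diff: split a flat slot into (block, offset)
block_idx = slots // BLOCK_SIZE
block_offset = slots % BLOCK_SIZE

# Advanced indexing scatters every token into its slot in one assignment
key_cache[block_idx, block_offset] = key
value_cache[block_idx, block_offset] = value

# Spot check: slot 17 -> block 1, offset 1 holds token 1's key
assert torch.equal(key_cache[1, 1], key[1])
```

The advanced-indexing assignment writes each token row into its (block, offset) slot in a single call, which is the effect the removed torch_npu._npu_reshape_and_cache op provided on the ATB path; the only layout assumption the diff carries over is that self._key_cache.shape[1] really is the block size.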