refactor: align attention backend with Huawei vllm-ascend

- reshape_and_cache now splits kv_cache[0]/[1] and writes via slot_mapping
- add _get_fia_params helper for fused-infer-attention parameters
- use npu_fused_infer_attention_score for chunked prefill / cache-hit prefill
- add actual_seq_lengths_q (cumulative query positions) to AscendMetadata

2026-02-10 20:06:52 +08:00
parent b8b4516b98
commit 810a2ef757


@@ -141,6 +141,7 @@ class AscendMetadata:
     query_start_loc: Optional[torch.Tensor] = None  # (batch+1,)
     query_lens: Optional[torch.Tensor] = None
     max_query_len: Optional[int] = None
+    actual_seq_lengths_q: Optional[List[int]] = None  # cumulative q positions
 
     # KV cache mapping
     block_tables: Optional[torch.Tensor] = None  # (batch, max_blocks)
@@ -207,12 +208,15 @@ class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
             attn_state = AscendAttentionState.ChunkedPrefill
 
         # Build cumulative sequence lengths for query (for prefill)
+        num_reqs = common_attn_metadata.num_reqs
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu[:num_reqs + 1]
         query_start_loc = common_attn_metadata.query_start_loc.to(
             dtype=torch.int64
         )
+        actual_seq_lengths_q = query_start_loc_cpu[1:].tolist()
 
         seq_lens = common_attn_metadata.seq_lens
-        seq_lens_list = common_attn_metadata.seq_lens_cpu.tolist()
+        seq_lens_list = common_attn_metadata.seq_lens_cpu[:num_reqs].tolist()
 
         # Build attention mask for prefill (causal mask)
         attn_mask = None
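Note on the metadata built above: actual_seq_lengths_q stores cumulative query end positions in the flattened token dimension, not per-request lengths; this is the form later fed to the fused op's actual_seq_lengths argument. A minimal CPU sketch, assuming a toy batch of three requests with 3, 2 and 4 new query tokens (the numbers are illustrative only):

    import torch

    # query_start_loc_cpu as built above: prefix sums of query lengths, shape (num_reqs + 1,)
    query_start_loc_cpu = torch.tensor([0, 3, 5, 9])

    # Dropping the leading 0 leaves each request's cumulative end position,
    # which is what actual_seq_lengths_q holds.
    actual_seq_lengths_q = query_start_loc_cpu[1:].tolist()
    assert actual_seq_lengths_q == [3, 5, 9]

    # Per-request query lengths, if needed, are the first differences.
    query_lens = torch.diff(query_start_loc_cpu).tolist()
    assert query_lens == [3, 2, 4]

The [:num_reqs] / [:num_reqs + 1] slices in the builder presumably drop any padded tail entries in the CPU tensors; the sketch assumes the tensors are already exactly sized.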
@@ -232,6 +236,7 @@ class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
             seq_lens_list=seq_lens_list,
             query_start_loc=query_start_loc,
             max_query_len=max_query_len,
+            actual_seq_lengths_q=actual_seq_lengths_q,
             block_tables=common_attn_metadata.block_table_tensor,
             slot_mapping=common_attn_metadata.slot_mapping,
             attn_mask=attn_mask,
@@ -297,15 +302,72 @@ class AscendAttentionBackendImpl(AttentionImpl):
         self._key_cache: Optional[torch.Tensor] = None
         self._value_cache: Optional[torch.Tensor] = None
 
+    def reshape_and_cache(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: "AscendMetadata",
+    ):
+        """Update KV cache with new key/value tensors.
+
+        Matches Huawei vllm-ascend: splits kv_cache[0]/[1] and writes via
+        slot_mapping indices.
+        """
+        if kv_cache.numel() > 0:
+            if self._key_cache is None:
+                self._key_cache, self._value_cache = kv_cache[0], kv_cache[1]
+            slots = attn_metadata.slot_mapping
+            key_to_cache = key[:attn_metadata.num_actual_tokens]
+            val_to_cache = value[:attn_metadata.num_actual_tokens]
+            # Use pure-PyTorch indexing (ATB reshape_and_cache may fail
+            # depending on environment; this is functionally identical)
+            block_size = self._key_cache.shape[1]
+            block_idx = slots // block_size
+            block_offset = slots % block_size
+            self._key_cache[block_idx, block_offset] = key_to_cache
+            self._value_cache[block_idx, block_offset] = val_to_cache
+        return key, value
+
+    # -----------------------------------------------------------------
+    # Forward dispatch (matches Huawei vllm-ascend structure)
+    # -----------------------------------------------------------------
+    def _get_fia_params(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_metadata: "AscendMetadata",
+    ):
+        """Prepare key, value, block_size, block_table and kv_seq_lens
+        for npu_fused_infer_attention_score, following Huawei's approach."""
+        if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
+            block_size = 128
+            block_table = None
+            actual_seq_lengths_kv = attn_metadata.query_start_loc[1:].tolist()
+        else:
+            # DecodeOnly / PrefillCacheHit / ChunkedPrefill — read from cache
+            num_block, block_size, _, _ = self._key_cache.shape
+            key = self._key_cache.view(num_block, block_size, -1)
+            value = self._value_cache.view(num_block, block_size, -1)
+            block_table = attn_metadata.block_tables
+            actual_seq_lengths_kv = attn_metadata.seq_lens_list
+        return key, value, block_size, block_table, actual_seq_lengths_kv
+
     def forward(
         self,
         layer: nn.Module,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
-        kv_cache: Tuple[torch.Tensor, ...],
+        kv_cache: torch.Tensor,
         attn_metadata: AscendMetadata,
         output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
+        output_block_scale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with Ascend attention.
@@ -313,8 +375,8 @@ class AscendAttentionBackendImpl(AttentionImpl):
             query: (num_tokens, num_heads * head_size)
             key: (num_tokens, num_kv_heads * head_size)
             value: (num_tokens, num_kv_heads * head_size)
-            kv_cache: (key_cache, value_cache) each
-                (num_blocks, block_size, num_kv_heads, head_size)
+            kv_cache: tensor of shape
+                (2, num_blocks, block_size, num_kv_heads, head_size)
             attn_metadata: AscendMetadata for this forward call.
 
         Returns:
@@ -322,48 +384,24 @@ class AscendAttentionBackendImpl(AttentionImpl):
         """
         import torch_npu  # noqa: F401
 
+        assert output is not None, "Output tensor must be provided."
         num_tokens = query.shape[0]
-        if output is None:
-            output = torch.empty(
-                num_tokens,
-                self.num_heads,
-                self.head_size,
-                dtype=query.dtype,
-                device=query.device,
-            )
 
         if attn_metadata is None:
-            return output.view(num_tokens, self.hidden_size).fill_(0)
-
-        num_actual_tokens = attn_metadata.num_actual_tokens
+            return output.fill_(0)
 
-        # Reshape Q/K/V to BSH (tokens, heads, head_dim)
+        # Reshape Q/K/V to TND (tokens, heads, head_dim)
         query = query.view(-1, self.num_heads, self.head_size)
         key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size).contiguous()
+        value = value.view(-1, self.num_kv_heads, self.head_size)
 
-        # ----------------------------------------------------------
         # Step 1: Update KV cache
-        # ----------------------------------------------------------
-        if kv_cache is not None and len(kv_cache.shape) > 1:
-            if self._key_cache is None:
-                self._key_cache, self._value_cache = kv_cache.unbind(0)
-            slots = attn_metadata.slot_mapping
-            # Pure PyTorch reshape_and_cache (avoids ATB dependency)
-            key_to_cache = key[:num_actual_tokens]
-            val_to_cache = value[:num_actual_tokens]
-            block_size = self._key_cache.shape[1]
-            block_idx = slots // block_size
-            block_offset = slots % block_size
-            self._key_cache[block_idx, block_offset] = key_to_cache
-            self._value_cache[block_idx, block_offset] = val_to_cache
+        if key is not None and value is not None:
+            key, value = self.reshape_and_cache(
+                key, value, kv_cache, attn_metadata
+            )
 
-        # ----------------------------------------------------------
         # Step 2: Compute attention
-        # ----------------------------------------------------------
         if attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
             output = self._forward_decode(
                 query, attn_metadata, output, num_tokens
@@ -373,15 +411,15 @@ class AscendAttentionBackendImpl(AttentionImpl):
                 query, key, value, attn_metadata, output, num_tokens
             )
         else:
-            # ChunkedPrefill or PrefillCacheHit
-            output = self._forward_chunked_prefill(
-                query, key, value, attn_metadata, output, num_tokens
+            # ChunkedPrefill or PrefillCacheHit — use FIA with block tables
+            output = self._forward_fused_infer_attention(
+                query, key, value, attn_metadata, output
             )
 
-        return output.view(num_tokens, self.hidden_size)
+        return output
 
     # -----------------------------------------------------------------
-    # Decode path — paged attention via npu_incre_flash_attention
+    # Decode path — paged attention (matches Huawei _npu_paged_attention)
     # -----------------------------------------------------------------
     def _forward_decode(
@@ -391,13 +429,9 @@ class AscendAttentionBackendImpl(AttentionImpl):
         output: torch.Tensor,
         num_tokens: int,
     ) -> torch.Tensor:
-        """Decode-only attention using incremental flash attention."""
+        """Decode-only via npu_incre_flash_attention."""
         import torch_npu  # noqa: F401
 
-        # npu_incre_flash_attention expects:
-        # query: (batch, 1, num_heads, head_size)
-        # key_cache: (num_blocks, block_size, num_kv_heads, head_size)
-        # value_cache: (num_blocks, block_size, num_kv_heads, head_size)
         q = query[:num_tokens].unsqueeze(1)  # (B, 1, H, D)
         attn_out = torch_npu.npu_incre_flash_attention(
@@ -417,7 +451,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
         return output
 
     # -----------------------------------------------------------------
-    # Prefill without KV cache (first token, no paging)
+    # Prefill without KV cache
    # -----------------------------------------------------------------
     def _forward_prefill_no_cache(
@@ -453,127 +487,45 @@ class AscendAttentionBackendImpl(AttentionImpl):
         return output
 
     # -----------------------------------------------------------------
-    # Chunked prefill — mixed prefill+decode
+    # Fused Infer Attention (prefill with cache / chunked prefill)
+    # Matches Huawei's forward_fused_infer_attention approach
     # -----------------------------------------------------------------
-    def _forward_chunked_prefill(
+    def _forward_fused_infer_attention(
         self,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         attn_metadata: AscendMetadata,
         output: torch.Tensor,
-        num_tokens: int,
     ) -> torch.Tensor:
-        """Chunked prefill using npu_fusion_attention with paged KV cache."""
+        """Use npu_fused_infer_attention_score with TND layout and block
+        tables — the same approach Huawei uses for chunked prefill and
+        cache-hit prefill."""
         import torch_npu  # noqa: F401
 
-        # Split batch into decodes and prefills based on query length
-        query_start_loc = attn_metadata.query_start_loc
-        seq_lens = attn_metadata.seq_lens
-
-        # Compute per-request query lengths
-        query_lens = query_start_loc[1:] - query_start_loc[:-1]
-        num_requests = len(query_lens)
-
-        # Separate decode (query_len == 1) and prefill requests
-        decode_mask = query_lens == 1
-        prefill_mask = ~decode_mask
-        num_decodes = decode_mask.sum().item()
-
-        # Process decode tokens
-        if num_decodes > 0 and self._key_cache is not None:
-            decode_indices = torch.where(decode_mask)[0]
-            decode_query = query[query_start_loc[decode_indices]]
-            decode_block_tables = attn_metadata.block_tables[decode_indices]
-            decode_seq_lens = seq_lens[decode_indices].tolist()
-
-            decode_q = decode_query.unsqueeze(1)  # (B_decode, 1, H, D)
-            decode_out = torch_npu.npu_incre_flash_attention(
-                decode_q,
-                self._key_cache,
-                self._value_cache,
-                num_heads=self.num_heads,
-                num_key_value_heads=self.num_kv_heads,
-                scale_value=self.scale,
-                block_table=decode_block_tables,
-                actual_seq_lengths=decode_seq_lens,
-                block_size=self._key_cache.shape[1],
-                input_layout="BNSD",
-            )
-            for i, idx in enumerate(decode_indices):
-                token_pos = query_start_loc[idx].item()
-                output[token_pos] = decode_out[i].squeeze(0)
-
-        # Process prefill tokens
-        if prefill_mask.any():
-            prefill_indices = torch.where(prefill_mask)[0]
-            for idx in prefill_indices:
-                start = query_start_loc[idx].item()
-                end = query_start_loc[idx + 1].item()
-                q_len = end - start
-                kv_len = seq_lens[idx].item()
-                q = query[start:end]  # (q_len, H, D)
-
-                # Use npu_fusion_attention for this single prefill request
-                # Build a causal mask for this sequence
-                causal_mask = torch.ones(
-                    kv_len, kv_len, dtype=torch.bool, device=query.device
-                ).triu_(diagonal=1)
-
-                # For chunked prefill, key/value come from the cache
-                if self._key_cache is not None and kv_len > q_len:
-                    # Gather KV from paged cache for this request
-                    block_table = attn_metadata.block_tables[idx]
-                    num_blocks_needed = (kv_len + self._key_cache.shape[1] - 1) \
-                        // self._key_cache.shape[1]
-                    block_ids = block_table[:num_blocks_needed]
-                    # Gather KV from block cache
-                    gathered_k = self._key_cache[block_ids].reshape(
-                        -1, self.num_kv_heads, self.head_size
-                    )[:kv_len]
-                    gathered_v = self._value_cache[block_ids].reshape(
-                        -1, self.num_kv_heads, self.head_size
-                    )[:kv_len]
-                    # Only last q_len rows of the mask
-                    causal_mask = causal_mask[kv_len - q_len : kv_len, :kv_len]
-                    attn_out = torch_npu.npu_fusion_attention(
-                        q.unsqueeze(0),  # (1, q_len, H, D) — BSH layout
-                        gathered_k.unsqueeze(0),
-                        gathered_v.unsqueeze(0),
-                        head_num=self.num_heads,
-                        input_layout="BSND",
-                        scale=self.scale,
-                        sparse_mode=0,
-                        atten_mask=causal_mask.unsqueeze(0),
-                        pre_tockens=kv_len,
-                        next_tockens=0,
-                    )
-                    output[start:end] = attn_out[0].squeeze(0)
-                else:
-                    # Full self-attention (no prior cache)
-                    k = key[start:end]
-                    v = value[start:end]
-                    causal_mask = causal_mask[:q_len, :q_len]
-                    attn_out = torch_npu.npu_fusion_attention(
-                        q.unsqueeze(0),
-                        k.unsqueeze(0),
-                        v.unsqueeze(0),
-                        head_num=self.num_heads,
-                        input_layout="BSND",
-                        scale=self.scale,
-                        sparse_mode=0,
-                        atten_mask=causal_mask.unsqueeze(0),
-                        pre_tockens=q_len,
-                        next_tockens=0,
-                    )
-                    output[start:end] = attn_out[0].squeeze(0)
+        key, value, block_size, block_table, actual_seq_lengths_kv = (
+            self._get_fia_params(key, value, attn_metadata)
+        )
+        num_tokens = attn_metadata.actual_seq_lengths_q[-1]
+        query = query[:num_tokens]
 
+        attn_output, _ = torch_npu.npu_fused_infer_attention_score(
+            query=query,
+            key=key,
+            value=value,
+            atten_mask=attn_metadata.attn_mask,
+            block_table=block_table,
+            input_layout="TND",
+            block_size=block_size,
+            actual_seq_lengths=attn_metadata.actual_seq_lengths_q,
+            actual_seq_lengths_kv=actual_seq_lengths_kv,
+            num_key_value_heads=self.num_kv_heads,
+            num_heads=self.num_heads,
+            scale=self.scale,
+            sparse_mode=3,
+        )
 
+        attn_output = attn_output.view(num_tokens, self.num_heads, self.head_size)
+        output[:num_tokens] = attn_output[:num_tokens]
         return output
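For reference, the fused call above replaces the per-request loop that the old _forward_chunked_prefill implemented by hand: gather each request's KV from the paged cache via its block table, then run causally masked attention with the new query chunk aligned to the end of the cached sequence. Below is a CPU-only sketch of those semantics, useful as a numerical cross-check; the helper name is hypothetical, it assumes num_heads == num_kv_heads (no GQA expansion) and no quantization, and it is not how the NPU kernel is invoked.

    import math
    import torch

    def reference_chunked_prefill(query, key_cache, value_cache, block_tables,
                                  actual_seq_lengths_q, seq_lens, scale):
        """CPU reference: per-request causal attention over a paged KV cache.

        query:                 (num_tokens, num_heads, head_size), requests concatenated
        key_cache/value_cache: (num_blocks, block_size, num_kv_heads, head_size)
        block_tables:          (num_reqs, max_blocks_per_req)
        actual_seq_lengths_q:  cumulative query end positions per request
        seq_lens:              total KV length per request (cached + new)
        """
        block_size = key_cache.shape[1]
        out = torch.empty_like(query)
        start = 0
        for i, end in enumerate(actual_seq_lengths_q):
            q = query[start:end]                              # (q_len, H, D)
            q_len, kv_len = end - start, int(seq_lens[i])
            n_blocks = math.ceil(kv_len / block_size)
            blocks = block_tables[i, :n_blocks]
            k = key_cache[blocks].flatten(0, 1)[:kv_len]      # (kv_len, H, D)
            v = value_cache[blocks].flatten(0, 1)[:kv_len]
            scores = torch.einsum("qhd,khd->hqk", q, k) * scale
            # Causal mask with the query chunk right-aligned to the cached
            # sequence: query row j may attend to KV positions <= kv_len - q_len + j.
            pos = torch.arange(kv_len)
            allowed = pos[None, :] <= (kv_len - q_len + torch.arange(q_len))[:, None]
            scores = scores.masked_fill(~allowed[None], float("-inf"))
            out[start:end] = torch.einsum("hqk,khd->qhd", scores.softmax(-1), v)
            start = end
        return out

The right-aligned causal pattern in this sketch is the same alignment the removed code expressed via causal_mask[kv_len - q_len : kv_len, :kv_len], and it appears to be what sparse_mode=3 requests from the fused op.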