Mirror of https://github.com/handsomezhuzhu/vllm-npu-plugin.git

Commit: Major overhaul

vllm_npu/patch/worker/__init__.py (new file, 37 lines)
@@ -0,0 +1,37 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
    import vllm_npu.patch.worker.patch_triton

# isort: off
import vllm_npu.patch.platform.patch_sched_yield  # noqa
import vllm_npu.patch.worker.patch_distributed  # noqa
import vllm_npu.patch.worker.patch_logits  # noqa
import vllm_npu.patch.worker.patch_roberta  # noqa
import vllm_npu.patch.worker.patch_weight_loader  # noqa
import vllm_npu.patch.worker.patch_multimodal_merge  # noqa
import vllm_npu.patch.worker.patch_minicpm  # noqa
import vllm_npu.patch.worker.patch_deepseek_mtp  # noqa
import vllm_npu.patch.worker.patch_attention_layer  # noqa

if os.getenv("SHM_BARRIER", "true") == "true":
    import vllm_npu.patch.platform.patch_message_queue  # noqa
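
The patches in this package apply purely as import side effects, and the SHM_BARRIER environment variable gates the message-queue patch. A minimal usage sketch follows; the opt-out value and the import point are assumptions for illustration, not part of this commit:

import os

# Assumed opt-out: any value other than "true" skips patch_message_queue.
os.environ["SHM_BARRIER"] = "false"

# Importing the package applies the monkey patches; it has to happen before
# vLLM constructs the worker objects that the patched classes back.
import vllm_npu.patch.worker  # noqa: F401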

vllm_npu/patch/worker/patch_attention_layer.py (new file, 92 lines)
@@ -0,0 +1,92 @@
from typing import Optional

import torch
import vllm
from vllm.forward_context import ForwardContext, get_forward_context


def forward(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    # For some alternate attention backends like MLA the attention output
    # shape does not match the query shape, so we optionally let the model
    # definition specify the output tensor shape.
    output_shape: Optional[torch.Size] = None,
) -> torch.Tensor:
    """
    The KV cache is stored inside this class and is accessed via
    `self.kv_cache`.
    Attention metadata (`attn_metadata`) is set using a context manager in
    the model runner's `execute_model` method. It is accessed via forward
    context using
    `vllm.forward_context.get_forward_context().attn_metadata`.
    """
    if self.calculate_kv_scales:
        attn_metadata = get_forward_context().attn_metadata
        if attn_metadata.enable_kv_scales_calculation:
            self.calc_kv_scales(query, key, value)

    output_dtype = query.dtype
    if self.query_quant is not None:
        # quantizing with a simple torch operation enables
        # torch.compile to fuse this into previous ops
        # which reduces overheads during decoding.
        # Otherwise queries are quantized using custom ops
        # which causes decoding overheads
        assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"}
        query, _ = self.query_quant(query, self._q_scale)

    if self.use_output:
        output_shape = (output_shape
                        if output_shape is not None else query.shape)
        output = torch.empty(output_shape,
                             dtype=output_dtype,
                             device=query.device)
        hidden_size = output_shape[-1]
        # We skip reshaping query, key and value tensors for the MLA
        # backend since these tensors have different semantics and are
        # processed differently.
        if not self.use_mla:
            # Reshape the query, key, and value tensors.
            # NOTE(woosuk): We do this outside the custom op to minimize the
            # CPU overheads from the non-CUDA-graph regions.
            query = query.view(-1, self.num_heads, self.head_size)
            output = output.view(-1, self.num_heads, self.head_size)
            if key is not None:
                key = key.view(-1, self.num_kv_heads, self.head_size)
            if value is not None:
                value = value.view(-1, self.num_kv_heads, self.head_size)
        if self.use_direct_call:
            forward_context: ForwardContext = get_forward_context()
            attn_metadata = forward_context.attn_metadata
            if isinstance(attn_metadata, dict):
                attn_metadata = attn_metadata[self.layer_name]
            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
            self.impl.forward(self,
                              query,
                              key,
                              value,
                              self_kv_cache,
                              attn_metadata,
                              output=output)
        else:
            torch.ops.vllm.unified_attention_with_output(
                query, key, value, output, self.layer_name)
        return output.view(-1, hidden_size)
    else:
        if self.use_direct_call:
            forward_context = get_forward_context()
            attn_metadata = forward_context.attn_metadata
            if isinstance(attn_metadata, dict):
                attn_metadata = attn_metadata[self.layer_name]
            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
            return self.impl.forward(self, query, key, value, self_kv_cache,
                                     attn_metadata)
        else:
            return torch.ops.vllm.unified_attention(query, key, value,
                                                    self.layer_name)


vllm.attention.layer.Attention.forward = forward
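
For clarity, a standalone sketch of the reshape round trip done above for non-MLA backends; the shapes are illustrative only and not taken from the patch:

import torch

num_tokens, num_heads, head_size = 8, 4, 16
hidden_size = num_heads * head_size

query = torch.randn(num_tokens, hidden_size)
# Viewed per-head for the attention backend ...
query_3d = query.view(-1, num_heads, head_size)   # [8, 4, 16]
# ... and flattened back to [num_tokens, hidden_size] before returning.
output = query_3d.reshape(-1, hidden_size)        # [8, 64]
assert output.shape == (num_tokens, hidden_size)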

vllm_npu/patch/worker/patch_deepseek_mtp.py (new file, 94 lines)
@@ -0,0 +1,94 @@
from typing import Optional

import torch
import torch.nn as nn
import vllm
from transformers import PretrainedConfig
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.models.deepseek_mtp import (
    DeepSeekMTP, DeepSeekMultiTokenPredictorLayer)
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
from vllm.model_executor.models.utils import maybe_prefix


def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    previous_hidden_states: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor] = None,
    spec_step_index: int = 0,
) -> torch.Tensor:
    assert inputs_embeds is not None
    # Mask inputs at position 0, as they are not needed by MTP.
    # Patched for aclgraph support: the original operation introduced a
    # d2h sync, which breaks aclgraph.
    inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)
    inputs_embeds = self.enorm(inputs_embeds)
    previous_hidden_states = self.hnorm(previous_hidden_states)

    hidden_states = self.eh_proj(
        torch.cat([inputs_embeds, previous_hidden_states], dim=-1))

    hidden_states, residual = self.mtp_block(positions=positions,
                                             hidden_states=hidden_states,
                                             residual=None)
    hidden_states = residual + hidden_states
    return hidden_states


# Patch this only for aclgraph support, as it is not supported in vLLM 0.11.0.
@support_torch_compile
class AscendDeepSeekMTP(DeepSeekMTP):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)


class SharedHead(nn.Module):

    def __init__(
        self,
        config: PretrainedConfig,
        prefix: str,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "head"),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.norm(hidden_states)


def predictor_init(self, vllm_config: VllmConfig, prefix: str) -> None:
    nn.Module.__init__(self)
    config = vllm_config.model_config.hf_config
    quant_config = vllm_config.quant_config

    self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    self.eh_proj = nn.Linear(config.hidden_size * 2,
                             config.hidden_size,
                             bias=False)

    # We don't need topk_indices_buffer on Ascend.
    topk_indices_buffer = None
    self.shared_head = SharedHead(config=config,
                                  prefix=prefix,
                                  quant_config=quant_config)
    self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix,
                                            topk_indices_buffer)


DeepSeekMultiTokenPredictorLayer.__init__ = predictor_init
vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.forward = forward
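
A small self-contained sketch of the masking change above: torch.where keeps the zeroing of position-0 embeddings entirely on-device, whereas a boolean-indexed assignment such as inputs_embeds[positions == 0] = 0 is the kind of operation the patch comment describes as introducing a device-to-host sync (stated here as in the comment, not independently verified):

import torch

positions = torch.tensor([0, 1, 2, 0, 3])
inputs_embeds = torch.randn(5, 8)

# Graph-friendly form used by the patch: no data-dependent indexing.
masked = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)

assert torch.all(masked[positions == 0] == 0)
assert torch.equal(masked[positions != 0], inputs_embeds[positions != 0])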

vllm_npu/patch/worker/patch_distributed.py (new file, 115 lines)
@@ -0,0 +1,115 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import List, Optional, Union

import torch
import vllm
from torch.distributed import Backend
from vllm.distributed.parallel_state import (GroupCoordinator,
                                             _get_unique_name,
                                             _register_group)

from vllm_npu.distributed.communicator import NPUCommunicator
from vllm_npu.utils import create_hccl_pg_options


class GroupCoordinatorPatch(GroupCoordinator):

    def __init__(
        self,
        group_ranks: list[list[int]],
        local_rank: int,
        torch_distributed_backend: Union[str, Backend],
        use_device_communicator: bool,  # whether to use device communicator
        use_message_queue_broadcaster: bool = False,
        group_name: Optional[str] = None,
    ):
        group_name = group_name or "anonymous"
        self.unique_name = _get_unique_name(group_name)
        _register_group(self)

        self.rank = torch.distributed.get_rank()
        self.local_rank = local_rank

        self_device_group = None
        self_cpu_group = None
        hccl_pg_options = create_hccl_pg_options(group_name)

        for ranks in group_ranks:
            device_group = torch.distributed.new_group(
                ranks,
                backend=torch_distributed_backend,
                pg_options=hccl_pg_options)

            # a group with `gloo` backend, to allow direct coordination between
            # processes through the CPU.
            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
            if self.rank in ranks:
                self.ranks = ranks
                self.world_size = len(ranks)
                self.rank_in_group = ranks.index(self.rank)
                self_device_group = device_group
                self_cpu_group = cpu_group

        assert self_cpu_group is not None
        assert self_device_group is not None

        self.cpu_group = self_cpu_group
        self.device_group = self_device_group
        self.device = torch.npu.current_device()

        self.use_device_communicator = use_device_communicator
        self.device_communicator = None
        if use_device_communicator and self.world_size > 1:
            self.device_communicator = NPUCommunicator(
                cpu_group=self.cpu_group,
                device=self.device,
                device_group=self.device_group,
                unique_name=self.unique_name,
            )

        from vllm.distributed.device_communicators.shm_broadcast import \
            MessageQueue
        self.mq_broadcaster: Optional[MessageQueue] = None
        if use_message_queue_broadcaster and self.world_size > 1:
            self.mq_broadcaster = MessageQueue.create_from_process_group(
                self.cpu_group, 1 << 22, 6)

        self.use_custom_op_call = False
        self.use_cpu_custom_send_recv = False

    def all_to_all(self,
                   input_: torch.Tensor,
                   scatter_dim: int = 0,
                   gather_dim: int = -1,
                   scatter_sizes: Optional[List[int]] = None,
                   gather_sizes: Optional[List[int]] = None) -> torch.Tensor:
        if self.world_size == 1:
            return input_
        assert -input_.dim() <= scatter_dim < input_.dim(), (
            f"Invalid scatter dim ({scatter_dim}) for input tensor with shape {input_.size()}"
        )
        assert -input_.dim() <= gather_dim < input_.dim(), (
            f"Invalid gather dim ({gather_dim}) for input tensor with shape {input_.size()}"
        )
        assert self.device_communicator is not None, "device_communicator should be initialized when world_size > 1"
        return self.device_communicator.all_to_all(input_, scatter_dim,
                                                   gather_dim, scatter_sizes,
                                                   gather_sizes)


vllm.distributed.parallel_state.GroupCoordinator = GroupCoordinatorPatch
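
As a reference for the scatter_dim/gather_dim semantics checked above, a single-process sketch of what an all-to-all with those arguments computes; this is an illustrative model of the collective, not the NPUCommunicator implementation:

import torch


def all_to_all_reference(inputs, scatter_dim, gather_dim):
    # inputs[i] is the tensor held by rank i; chunks[src][dst] is what src sends to dst.
    world_size = len(inputs)
    chunks = [torch.chunk(t, world_size, dim=scatter_dim) for t in inputs]
    return [
        torch.cat([chunks[src][dst] for src in range(world_size)], dim=gather_dim)
        for dst in range(world_size)
    ]


outputs = all_to_all_reference([torch.randn(4, 6) for _ in range(2)],
                               scatter_dim=0, gather_dim=-1)
assert outputs[0].shape == (2, 12)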

vllm_npu/patch/worker/patch_logits.py (new file, 26 lines)
@@ -0,0 +1,26 @@
import torch
import vllm
from vllm._custom_ops import apply_repetition_penalties_torch


def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
                               output_mask: torch.Tensor,
                               repetition_penalties: torch.Tensor) -> None:
    """Apply repetition penalties to logits in-place.

    Args:
        logits: The logits tensor of shape [num_seqs, vocab_size].
        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
        output_mask: A boolean tensor indicating which tokens appear in the output.
        repetition_penalties: The repetition penalties of shape (num_seqs, ).
    """
    apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
                                     repetition_penalties)


# NPU tensors report both is_cuda=True and is_npu=True, per the implementation in
# https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74
# As a result, vLLM's apply_repetition_penalties takes the "if logits.is_cuda" branch and
# calls the custom op implemented in CUDA, which is not compatible with NPU.
# Reference: https://github.com/vllm-project/vllm/blob/f66673a39d9f364194c249f28098cad8a5584ccb/vllm/_custom_ops.py#L314
vllm._custom_ops.apply_repetition_penalties = apply_repetition_penalties
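
For context, a minimal pure-torch sketch of the conventional repetition-penalty rule that the torch fallback is expected to implement; this is an assumption about apply_repetition_penalties_torch, included only to make the in-place semantics of the wrapper concrete:

import torch


def repetition_penalties_reference(logits, prompt_mask, output_mask, penalties):
    # Tokens seen in the prompt or output are divided (if positive) or
    # multiplied (if negative) by their sequence's penalty, in place.
    seen = prompt_mask | output_mask
    p = penalties.unsqueeze(1).expand_as(logits)
    logits[:] = torch.where(seen, torch.where(logits > 0, logits / p, logits * p), logits)


logits = torch.randn(2, 5)
prompt_mask = torch.zeros(2, 5, dtype=torch.bool)
prompt_mask[0, 1] = True
repetition_penalties_reference(logits, prompt_mask,
                               torch.zeros_like(prompt_mask),
                               torch.tensor([1.2, 1.0]))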

vllm_npu/patch/worker/patch_minicpm.py (new file, 36 lines)
@@ -0,0 +1,36 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import torch
from vllm.model_executor.models.minicpm import MiniCPMAttention


def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
) -> torch.Tensor:
    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
    q, k = self.rotary_emb(positions, q, k)
    attn_output = self.attn(q, k, v)
    output, _ = self.o_proj(attn_output)
    return output


# The type conversion in the forward function is deleted to support the rope operator.
MiniCPMAttention.forward = forward

vllm_npu/patch/worker/patch_multimodal_merge.py (new file, 58 lines)
@@ -0,0 +1,58 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.

import torch
import vllm
from vllm.model_executor.models.utils import (_embedding_count_expression,
                                              _flatten_embeddings)
from vllm.multimodal import NestedTensors


def _merge_multimodal_embeddings(
    inputs_embeds: torch.Tensor,
    is_multimodal: torch.Tensor,
    multimodal_embeddings: NestedTensors,
) -> torch.Tensor:
    """
    Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
    positions in ``inputs_embeds`` corresponding to placeholder tokens in
    ``input_ids``.

    Note:
        This updates ``inputs_embeds`` in place.
    """
    flattened = _flatten_embeddings(multimodal_embeddings)
    try:
        inputs_embeds[is_multimodal] = flattened
    except RuntimeError as e:
        num_expected_tokens = is_multimodal.sum().item()
        assert isinstance(num_expected_tokens, int)

        if flattened.shape[0] != num_expected_tokens:
            expr = _embedding_count_expression(multimodal_embeddings)
            raise ValueError(
                f"Attempted to assign {expr} = {flattened.shape[0]} "
                f"multimodal tokens to {num_expected_tokens} placeholders"
            ) from e
        else:
            raise ValueError("Error during masked scatter operation") from e

    return inputs_embeds


vllm.model_executor.models.utils._merge_multimodal_embeddings = _merge_multimodal_embeddings
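
A standalone sketch of the merge semantics above, with toy shapes: rows of inputs_embeds flagged by is_multimodal are overwritten, in order, by the flattened multimodal embeddings.

import torch

inputs_embeds = torch.zeros(5, 4)
is_multimodal = torch.tensor([False, True, True, False, True])
multimodal_embeddings = torch.ones(3, 4)

# In-place masked assignment, as in the patched helper.
inputs_embeds[is_multimodal] = multimodal_embeddings
assert inputs_embeds[1].sum() == 4 and inputs_embeds[0].sum() == 0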

vllm_npu/patch/worker/patch_roberta.py (new file, 88 lines)
@@ -0,0 +1,88 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional

import torch
from vllm.model_executor.models.roberta import (
    RobertaEmbedding, RobertaForSequenceClassification,
    replace_roberta_positions)
from vllm.sequence import IntermediateTensors

# aclgraph does not support the shift operator for now.
# TODO: revert me when aclgraph supports the shift operator.
TOKEN_TYPE_SHIFT = 30
TOKEN_TYPE_MULTIPLIER = 1 << 30
TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1


def _encode_token_type_ids(input_ids: torch.Tensor,
                           token_type_ids: torch.Tensor) -> None:
    # input_ids can be padded to the right
    input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids *
                                                    TOKEN_TYPE_MULTIPLIER)


def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
    token_type_ids = input_ids // TOKEN_TYPE_MULTIPLIER
    input_ids.bitwise_and_(TOKEN_MASK)
    return token_type_ids


def roberta_for_sequence_classification_forward(
    self,
    input_ids: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    replace_roberta_positions(input_ids=input_ids,
                              position_ids=positions,
                              padding_idx=self.padding_idx)
    if token_type_ids is not None:
        assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
        assert input_ids is not None
        _encode_token_type_ids(input_ids, token_type_ids)
    return self.roberta(input_ids=input_ids,
                        positions=positions,
                        inputs_embeds=inputs_embeds,
                        intermediate_tensors=intermediate_tensors)


def roberta_embedding_forward(
    self,
    input_ids: torch.Tensor,
    position_ids: torch.Tensor,
) -> torch.Tensor:
    token_type_ids = _decode_token_type_ids(input_ids)

    inputs_embeds = self.word_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)

    token_type_embeddings = self.token_type_embeddings(token_type_ids)
    embeddings = inputs_embeds + token_type_embeddings + position_embeddings
    embeddings = self.LayerNorm(embeddings)
    return embeddings


RobertaEmbedding.forward = roberta_embedding_forward
RobertaForSequenceClassification.forward = roberta_for_sequence_classification_forward
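
A round-trip sketch of the packing scheme above: token_type_ids are folded into the high bits of input_ids with a multiply and bitwise OR (avoiding the shift operator that aclgraph cannot handle), then recovered with an integer divide while the mask restores the original ids. Toy values only; validity relies on the vocab_size < 1 << TOKEN_TYPE_SHIFT assertion in the patch.

import torch

TOKEN_TYPE_MULTIPLIER = 1 << 30
TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1

input_ids = torch.tensor([101, 2023, 102, 1], dtype=torch.int64)
token_type_ids = torch.tensor([0, 0, 1, 1], dtype=torch.int64)

packed = input_ids | (token_type_ids * TOKEN_TYPE_MULTIPLIER)
recovered_types = packed // TOKEN_TYPE_MULTIPLIER
recovered_ids = packed & TOKEN_MASK

assert torch.equal(recovered_types, token_type_ids)
assert torch.equal(recovered_ids, input_ids)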

vllm_npu/patch/worker/patch_triton.py (new file, 16 lines)
@@ -0,0 +1,16 @@
import vllm.model_executor.layers.fla.ops.chunk
import vllm.model_executor.layers.fla.ops.fused_recurrent
import vllm.model_executor.layers.fla.ops.layernorm_guard
import vllm.model_executor.layers.mamba.ops.causal_conv1d

from vllm_npu.ops.casual_conv1d import (causal_conv1d_fn,
                                        causal_conv1d_update_npu)
from vllm_npu.ops.fla import LayerNormFn, torch_chunk_gated_delta_rule
from vllm_npu.ops.sigmoid_gating import \
    fused_recurrent_gated_delta_rule_fwd_kernel

vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel
vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule

vllm_npu/patch/worker/patch_weight_loader.py (new file, 41 lines)
@@ -0,0 +1,41 @@
import torch
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils import GiB_bytes

logger = init_logger(__name__)


def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                   output_partition_sizes: list[int], input_size: int,
                   output_size: int, params_dtype: torch.dtype,
                   **extra_weight_attrs):
    # This method creates unquantized linear weights.
    # The weights are not quantized, and they are not sharded.
    # The amount of memory allocated for the weights is
    # sum(output_partition_sizes) * input_size_per_partition.
    try:
        weight = Parameter(torch.empty(sum(output_partition_sizes),
                                       input_size_per_partition,
                                       dtype=params_dtype),
                           requires_grad=False)
    except torch.cuda.OutOfMemoryError as e:
        logger.error("Failed to create unquantized linear weights: %s", e)
        if torch.cuda.is_available():
            logger.debug("CUDA device: %s", torch.cuda.current_device())
            logger.debug("Allocated: %.2f GiB",
                         torch.cuda.memory_allocated() / GiB_bytes)
            logger.debug("Reserved: %.2f GiB",
                         torch.cuda.memory_reserved() / GiB_bytes)
        raise RuntimeError(
            "Failed to create unquantized linear weights. "
            "This may be caused by insufficient memory to allocate "
            "the weight.") from e
    set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
    layer.register_parameter("weight", weight)
    set_weight_attrs(weight, extra_weight_attrs)


UnquantizedLinearMethod.create_weights = create_weights