2026-02-10 23:08:39 +08:00
parent 1baa36026c
commit 6680585975
172 changed files with 52867 additions and 892 deletions

View File

@@ -0,0 +1,37 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
from vllm.triton_utils import HAS_TRITON
if HAS_TRITON:
import vllm_npu.patch.worker.patch_triton
# isort: off
import vllm_npu.patch.platform.patch_sched_yield # noqa
import vllm_npu.patch.worker.patch_distributed # noqa
import vllm_npu.patch.worker.patch_logits # noqa
import vllm_npu.patch.worker.patch_roberta # noqa
import vllm_npu.patch.worker.patch_weight_loader # noqa
import vllm_npu.patch.worker.patch_multimodal_merge # noqa
import vllm_npu.patch.worker.patch_minicpm # noqa
import vllm_npu.patch.worker.patch_deepseek_mtp # noqa
import vllm_npu.patch.worker.patch_attention_layer # noqa
if os.getenv("SHM_BARRIER", "true") == "true":
import vllm_npu.patch.platform.patch_message_queue # noqa
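The patch list above is applied entirely at import time; the only runtime knob is the SHM_BARRIER gate on the last two lines. A minimal usage sketch, assuming the patch package is pulled in by the top-level vllm_npu import:
import os
# Assumption: any value other than the literal "true" disables the shared-memory
# message-queue patch, since the gate above compares against "true".
os.environ["SHM_BARRIER"] = "false"
import vllm_npu  # noqa: F401  # patch_message_queue is now skipped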

View File

@@ -0,0 +1,92 @@
from typing import Optional
import torch
import vllm
from vllm.forward_context import ForwardContext, get_forward_context
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
# For some alternate attention backends like MLA the attention output
# shape does not match the query shape, so we optionally let the model
# definition specify the output tensor shape.
output_shape: Optional[torch.Size] = None,
) -> torch.Tensor:
"""
The KV cache is stored inside this class and is accessed via
`self.kv_cache`.
Attention metadata (`attn_metadata`) is set using a context manager in
the model runner's `execute_model` method. It is accessed via forward
context using
`vllm.forward_context.get_forward_context().attn_metadata`.
"""
if self.calculate_kv_scales:
attn_metadata = get_forward_context().attn_metadata
if attn_metadata.enable_kv_scales_calculation:
self.calc_kv_scales(query, key, value)
output_dtype = query.dtype
if self.query_quant is not None:
# Quantizing the query with a plain torch op lets torch.compile fuse it
# into the preceding ops; otherwise queries go through custom quantization
# ops, which adds overhead during decoding.
assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"}
query, _ = self.query_quant(query, self._q_scale)
if self.use_output:
output_shape = (output_shape
if output_shape is not None else query.shape)
output = torch.empty(output_shape,
dtype=output_dtype,
device=query.device)
hidden_size = output_shape[-1]
# We skip reshaping query, key and value tensors for the MLA
# backend since these tensors have different semantics and are
# processed differently.
if not self.use_mla:
# Reshape the query, key, and value tensors.
# NOTE(woosuk): We do this outside the custom op to minimize the
# CPU overheads from the non-CUDA-graph regions.
query = query.view(-1, self.num_heads, self.head_size)
output = output.view(-1, self.num_heads, self.head_size)
if key is not None:
key = key.view(-1, self.num_kv_heads, self.head_size)
if value is not None:
value = value.view(-1, self.num_kv_heads, self.head_size)
if self.use_direct_call:
forward_context: ForwardContext = get_forward_context()
attn_metadata = forward_context.attn_metadata
if isinstance(attn_metadata, dict):
attn_metadata = attn_metadata[self.layer_name]
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
self.impl.forward(self,
query,
key,
value,
self_kv_cache,
attn_metadata,
output=output)
else:
torch.ops.vllm.unified_attention_with_output(
query, key, value, output, self.layer_name)
return output.view(-1, hidden_size)
else:
if self.use_direct_call:
forward_context = get_forward_context()
attn_metadata = forward_context.attn_metadata
if isinstance(attn_metadata, dict):
attn_metadata = attn_metadata[self.layer_name]
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
return self.impl.forward(self, query, key, value, self_kv_cache,
attn_metadata)
else:
return torch.ops.vllm.unified_attention(query, key, value,
self.layer_name)
vllm.attention.layer.Attention.forward = forward
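For the non-MLA path above, the output buffer is allocated with output_shape (defaulting to the query shape), handed to the backend as (tokens, num_heads, head_size), and flattened back before returning. A small shape walk-through, with made-up sizes not taken from any real model config:
import torch
# Hedged sketch of the reshape flow in the patched forward.
num_tokens, num_heads, head_size = 8, 4, 16
query = torch.randn(num_tokens, num_heads * head_size)
output_shape = query.shape                       # default when not overridden
hidden_size = output_shape[-1]
output = torch.empty(output_shape, dtype=query.dtype)
output = output.view(-1, num_heads, head_size)   # what the backend writes into
assert output.view(-1, hidden_size).shape == (num_tokens, hidden_size)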

View File

@@ -0,0 +1,94 @@
from typing import Optional
import torch
import torch.nn as nn
import vllm
from transformers import PretrainedConfig
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.models.deepseek_mtp import (
DeepSeekMTP, DeepSeekMultiTokenPredictorLayer)
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
from vllm.model_executor.models.utils import maybe_prefix
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
previous_hidden_states: torch.Tensor,
inputs_embeds: Optional[torch.Tensor] = None,
spec_step_index: int = 0,
) -> torch.Tensor:
assert inputs_embeds is not None
# Mask inputs at position 0, which MTP does not need.
# Patched for aclgraph support: the original masked assignment introduced a
# d2h sync, which breaks aclgraph capture.
inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)
inputs_embeds = self.enorm(inputs_embeds)
previous_hidden_states = self.hnorm(previous_hidden_states)
hidden_states = self.eh_proj(
torch.cat([inputs_embeds, previous_hidden_states], dim=-1))
hidden_states, residual = self.mtp_block(positions=positions,
hidden_states=hidden_states,
residual=None)
hidden_states = residual + hidden_states
return hidden_states
# Patch this only for aclgraph support, as it is not supported in vLLM 0.11.0
@support_torch_compile
class AscendDeepSeekMTP(DeepSeekMTP):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
class SharedHead(nn.Module):
def __init__(
self,
config: PretrainedConfig,
prefix: str,
quant_config: QuantizationConfig = None,
) -> None:
super().__init__()
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.head = ParallelLMHead(
config.vocab_size,
config.hidden_size,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "head"),
)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return self.norm(hidden_states)
def predictor_init(self, vllm_config: VllmConfig, prefix: str) -> None:
nn.Module.__init__(self)
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.eh_proj = nn.Linear(config.hidden_size * 2,
config.hidden_size,
bias=False)
# We don't need topk_indices_buffer in Ascend
topk_indices_buffer = None
self.shared_head = SharedHead(config=config,
prefix=prefix,
quant_config=quant_config)
self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix,
topk_indices_buffer)
DeepSeekMultiTokenPredictorLayer.__init__ = predictor_init
vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.forward = forward
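The aclgraph-relevant change in the predictor forward is the torch.where masking. A minimal host-side sketch, assuming the original used a boolean-mask assignment of the form shown in the comment:
import torch
# Graph-breaking form the patch replaces (assumed): inputs_embeds[positions == 0] = 0,
# which per the comment above introduced a d2h sync.
# Graph-friendly form used in forward(): purely elementwise torch.where.
positions = torch.tensor([0, 1, 2, 0])
inputs_embeds = torch.randn(4, 8)
masked = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)
assert bool(masked[positions == 0].eq(0).all())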

View File

@@ -0,0 +1,115 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Optional, Union
import torch
import vllm
from torch.distributed import Backend
from vllm.distributed.parallel_state import (GroupCoordinator,
_get_unique_name, _register_group)
from vllm_npu.distributed.communicator import NPUCommunicator
from vllm_npu.utils import create_hccl_pg_options
class GroupCoordinatorPatch(GroupCoordinator):
def __init__(
self,
group_ranks: list[list[int]],
local_rank: int,
torch_distributed_backend: Union[str, Backend],
use_device_communicator: bool, # whether to use device communicator
use_message_queue_broadcaster: bool = False,
group_name: Optional[str] = None,
):
group_name = group_name or "anonymous"
self.unique_name = _get_unique_name(group_name)
_register_group(self)
self.rank = torch.distributed.get_rank()
self.local_rank = local_rank
self_device_group = None
self_cpu_group = None
hccl_pg_options = create_hccl_pg_options(group_name)
for ranks in group_ranks:
device_group = torch.distributed.new_group(
ranks,
backend=torch_distributed_backend,
pg_options=hccl_pg_options)
# a group with `gloo` backend, to allow direct coordination between
# processes through the CPU.
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
if self.rank in ranks:
self.ranks = ranks
self.world_size = len(ranks)
self.rank_in_group = ranks.index(self.rank)
self_device_group = device_group
self_cpu_group = cpu_group
assert self_cpu_group is not None
assert self_device_group is not None
self.cpu_group = self_cpu_group
self.device_group = self_device_group
self.device = torch.npu.current_device()
self.use_device_communicator = use_device_communicator
self.device_communicator = None
if use_device_communicator and self.world_size > 1:
self.device_communicator = NPUCommunicator(
cpu_group=self.cpu_group,
device=self.device,
device_group=self.device_group,
unique_name=self.unique_name,
)
from vllm.distributed.device_communicators.shm_broadcast import \
MessageQueue
self.mq_broadcaster: Optional[MessageQueue] = None
if use_message_queue_broadcaster and self.world_size > 1:
self.mq_broadcaster = MessageQueue.create_from_process_group(
self.cpu_group, 1 << 22, 6)
self.use_custom_op_call = False
self.use_cpu_custom_send_recv = False
def all_to_all(self,
input_: torch.Tensor,
scatter_dim: int = 0,
gather_dim: int = -1,
scatter_sizes: Optional[List[int]] = None,
gather_sizes: Optional[List[int]] = None) -> torch.Tensor:
if self.world_size == 1:
return input_
assert -input_.dim() <= scatter_dim < input_.dim(), (
f"Invalid scatter dim ({scatter_dim}) for input tensor with shape {input_.size()}"
)
assert -input_.dim() <= gather_dim < input_.dim(), (
f"Invalid gather dim ({gather_dim}) for input tensor with shape {input_.size()}"
)
assert self.device_communicator is not None, "device_communicator should be initialized when world_size > 1"
return self.device_communicator.all_to_all(input_, scatter_dim,
gather_dim, scatter_sizes,
gather_sizes)
vllm.distributed.parallel_state.GroupCoordinator = GroupCoordinatorPatch
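The all_to_all above delegates to NPUCommunicator; its scatter_dim/gather_dim semantics can be pictured with a single-process sketch (two simulated ranks, purely illustrative, not how the communicator is implemented):
import torch
# Each rank splits its tensor along scatter_dim, exchanges chunks, and
# concatenates what it receives along gather_dim.
world_size = 2
inputs = [torch.arange(8.0).reshape(2, 4) + 10 * rank for rank in range(world_size)]
chunks = [list(t.chunk(world_size, dim=0)) for t in inputs]           # scatter_dim=0
outputs = [torch.cat([chunks[src][dst] for src in range(world_size)], dim=-1)
           for dst in range(world_size)]                              # gather_dim=-1
assert outputs[0].shape == (1, 8)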

View File

@@ -0,0 +1,26 @@
import torch
import vllm
from vllm._custom_ops import apply_repetition_penalties_torch
def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
output_mask: torch.Tensor,
repetition_penalties: torch.Tensor) -> None:
"""Apply repetition penalties to logits in-place.
Args:
logits: The logits tensor of shape [num_seqs, vocab_size].
prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
output_mask: A boolean tensor indicating which tokens appear in the output.
repetition_penalties: The repetition penalties of shape (num_seqs, ).
"""
apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
repetition_penalties)
# NPU tensors report both is_cuda=True and is_npu=True, per the torch_npu implementation in
# https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74
# As a result, vLLM's apply_repetition_penalties takes the "if logits.is_cuda" branch and calls the
# custom op implemented in CUDA, which is not compatible with NPU, so we force the torch fallback here.
# Reference: https://github.com/vllm-project/vllm/blob/f66673a39d9f364194c249f28098cad8a5584ccb/vllm/_custom_ops.py#L314
vllm._custom_ops.apply_repetition_penalties = apply_repetition_penalties
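The fallback keeps the standard repetition-penalty semantics; a hedged sketch of that math (simplified broadcasting, not the vLLM implementation verbatim):
import torch
# CTRL-style penalty: for tokens seen in the prompt or output, positive
# logits are divided by the penalty and negative logits multiplied by it.
logits = torch.tensor([[2.0, -1.0, 0.5]])
seen = torch.tensor([[True, True, False]])        # prompt_mask | output_mask
penalty = torch.tensor([1.5]).unsqueeze(1)
penalized = torch.where(logits > 0, logits / penalty, logits * penalty)
logits = torch.where(seen, penalized, logits)
assert torch.allclose(logits, torch.tensor([[2.0 / 1.5, -1.5, 0.5]]))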

View File

@@ -0,0 +1,36 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from vllm.model_executor.models.minicpm import MiniCPMAttention
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
return output
# The dtype conversion in the upstream forward is removed so the NPU rope operator can be used.
MiniCPMAttention.forward = forward

View File

@@ -0,0 +1,58 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
import torch
import vllm
from vllm.model_executor.models.utils import (_embedding_count_expression,
_flatten_embeddings)
from vllm.multimodal import NestedTensors
def _merge_multimodal_embeddings(
inputs_embeds: torch.Tensor,
is_multimodal: torch.Tensor,
multimodal_embeddings: NestedTensors,
) -> torch.Tensor:
"""
Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
positions in ``inputs_embeds`` selected by the ``is_multimodal`` mask
(the placeholder-token positions).
Note:
This updates ``inputs_embeds`` in place.
"""
flattened = _flatten_embeddings(multimodal_embeddings)
try:
inputs_embeds[is_multimodal] = flattened
except RuntimeError as e:
num_expected_tokens = is_multimodal.sum().item()
assert isinstance(num_expected_tokens, int)
if flattened.shape[0] != num_expected_tokens:
expr = _embedding_count_expression(multimodal_embeddings)
raise ValueError(
f"Attempted to assign {expr} = {flattened.shape[0]} "
f"multimodal tokens to {num_expected_tokens} placeholders"
) from e
else:
raise ValueError("Error during masked scatter operation") from e
return inputs_embeds
vllm.model_executor.models.utils._merge_multimodal_embeddings = _merge_multimodal_embeddings
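The merge is an in-place masked assignment; a minimal sketch of the success path (shapes made up):
import torch
# Rows of inputs_embeds selected by the mask are overwritten by the flattened
# multimodal embeddings; the row count must equal the number of True entries,
# otherwise the patched function raises the ValueError above.
inputs_embeds = torch.zeros(4, 3)
is_multimodal = torch.tensor([False, True, True, False])
flattened = torch.ones(2, 3)
inputs_embeds[is_multimodal] = flattened
assert bool(inputs_embeds[is_multimodal].eq(1).all())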

View File

@@ -0,0 +1,88 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional
import torch
from vllm.model_executor.models.roberta import (
RobertaEmbedding, RobertaForSequenceClassification,
replace_roberta_positions)
from vllm.sequence import IntermediateTensors
# aclgraph does not support the shift operator for now, so emulate it with multiplication.
# TODO: revert this when aclgraph supports the shift operator.
TOKEN_TYPE_SHIFT = 30
TOKEN_TYPE_MULTIPLIER = 1 << 30
TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1
def _encode_token_type_ids(input_ids: torch.Tensor,
token_type_ids: torch.Tensor) -> None:
# input_ids can be padded to the right
input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids *
TOKEN_TYPE_MULTIPLIER)
def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
token_type_ids = input_ids // TOKEN_TYPE_MULTIPLIER
input_ids.bitwise_and_(TOKEN_MASK)
return token_type_ids
def roberta_for_sequence_classification_forward(
self,
input_ids: Optional[torch.Tensor],
positions: torch.Tensor,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
) -> torch.Tensor:
replace_roberta_positions(input_ids=input_ids,
position_ids=positions,
padding_idx=self.padding_idx)
if token_type_ids is not None:
assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
assert input_ids is not None
_encode_token_type_ids(input_ids, token_type_ids)
return self.roberta(input_ids=input_ids,
positions=positions,
inputs_embeds=inputs_embeds,
intermediate_tensors=intermediate_tensors)
def roberta_embedding_forward(
self,
input_ids: torch.Tensor,
position_ids: torch.Tensor,
) -> torch.Tensor:
token_type_ids = _decode_token_type_ids(input_ids)
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings + position_embeddings
embeddings = self.LayerNorm(embeddings)
return embeddings
RobertaEmbedding.forward = roberta_embedding_forward
RobertaForSequenceClassification.forward = roberta_for_sequence_classification_forward
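A quick round-trip check of the multiplication-based packing (constants mirror the module-level TOKEN_TYPE_MULTIPLIER and TOKEN_MASK; the id values are illustrative):
import torch
# Token type ids are packed into the high bits of input_ids via multiplication
# (an aclgraph-safe stand-in for a left shift) and recovered by integer
# division before the embedding lookup.
TOKEN_TYPE_MULTIPLIER = 1 << 30
TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1
input_ids = torch.tensor([101, 2054, 102], dtype=torch.int64)
token_type_ids = torch.tensor([0, 1, 1], dtype=torch.int64)
packed = input_ids.clone()
packed.bitwise_or_(token_type_ids * TOKEN_TYPE_MULTIPLIER)
recovered = packed // TOKEN_TYPE_MULTIPLIER
packed.bitwise_and_(TOKEN_MASK)
assert torch.equal(packed, input_ids) and torch.equal(recovered, token_type_ids)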

View File

@@ -0,0 +1,16 @@
import vllm.model_executor.layers.fla.ops.chunk
import vllm.model_executor.layers.fla.ops.fused_recurrent
import vllm.model_executor.layers.fla.ops.layernorm_guard
import vllm.model_executor.layers.mamba.ops.causal_conv1d
from vllm_npu.ops.casual_conv1d import (causal_conv1d_fn,
causal_conv1d_update_npu)
from vllm_npu.ops.fla import LayerNormFn, torch_chunk_gated_delta_rule
from vllm_npu.ops.sigmoid_gating import \
fused_recurrent_gated_delta_rule_fwd_kernel
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel
vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule

View File

@@ -0,0 +1,41 @@
import torch
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils import GiB_bytes
logger = init_logger(__name__)
def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
output_partition_sizes: list[int], input_size: int,
output_size: int, params_dtype: torch.dtype,
**extra_weight_attrs):
# This method creates unquantized, unsharded linear weights.
# The tensor allocated here holds
# sum(output_partition_sizes) * input_size_per_partition elements of params_dtype.
try:
weight = Parameter(torch.empty(sum(output_partition_sizes),
input_size_per_partition,
dtype=params_dtype),
requires_grad=False)
except torch.cuda.OutOfMemoryError as e:
logger.error("Failed to create unquantized linear weights: %s", e)
if torch.cuda.is_available():
logger.debug("CUDA device: %s", torch.cuda.current_device())
logger.debug("Allocated: %.2f GiB",
torch.cuda.memory_allocated() / GiB_bytes)
logger.debug("Reserved: %.2f GiB",
torch.cuda.memory_reserved() / GiB_bytes)
raise RuntimeError(
"Failed to create unquantized linear weights. "
"This may be caused by insufficient memory to allocate "
"the weight.") from e
set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
layer.register_parameter("weight", weight)
set_weight_attrs(weight, extra_weight_attrs)
UnquantizedLinearMethod.create_weights = create_weights
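As the comment notes, the allocation is sum(output_partition_sizes) x input_size_per_partition elements of params_dtype; a small sanity check with illustrative sizes:
import torch
# Hypothetical fused-QKV partition sizes; only the arithmetic matters here.
output_partition_sizes = [1024, 1024, 4096]
input_size_per_partition = 2048
weight = torch.empty(sum(output_partition_sizes), input_size_per_partition,
                     dtype=torch.float16)
expected_bytes = sum(output_partition_sizes) * input_size_per_partition * weight.element_size()
assert weight.numel() * weight.element_size() == expected_bytes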