# vllm-npu-plugin/vllm_npu/patch/worker/patch_weight_loader.py

import torch
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils import GiB_bytes

# Module-level logger; create_weights uses it to report allocation failures.
logger = init_logger(__name__)


def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                   output_partition_sizes: list[int], input_size: int,
                   output_size: int, params_dtype: torch.dtype,
                   **extra_weight_attrs):
    """Allocate the unquantized linear ``weight`` and register it on ``layer``.

    The tensor is uninitialised (``torch.empty``) with shape
    ``(sum(output_partition_sizes), input_size_per_partition)`` and dtype
    ``params_dtype``; it is wrapped in a ``Parameter`` with
    ``requires_grad=False``.

    Args:
        layer: Module that receives the parameter under the name ``"weight"``.
        input_size_per_partition: Size of the weight's second dimension.
        output_partition_sizes: Sizes whose sum is the weight's first
            dimension.
        input_size: Not used by this implementation.
        output_size: Not used by this implementation.
        params_dtype: dtype of the allocated tensor.
        **extra_weight_attrs: Additional attributes attached to the weight
            after it has been registered on ``layer``.

    Raises:
        RuntimeError: If the allocation raises
            ``torch.cuda.OutOfMemoryError``; the original error is chained
            as the cause.
    """
    total_rows = sum(output_partition_sizes)
    try:
        storage = torch.empty(total_rows,
                              input_size_per_partition,
                              dtype=params_dtype)
        weight = Parameter(storage, requires_grad=False)
    except torch.cuda.OutOfMemoryError as oom:
        logger.error("Failed to create unquantized linear weights: %s", oom)
        # Memory statistics are only queried when a CUDA device is reported
        # as available.
        if torch.cuda.is_available():
            logger.debug("CUDA device: %s", torch.cuda.current_device())
            memory_probes = (
                ("Allocated: %.2f GiB", torch.cuda.memory_allocated),
                ("Reserved: %.2f GiB", torch.cuda.memory_reserved),
            )
            for template, probe in memory_probes:
                logger.debug(template, probe() / GiB_bytes)
        message = ("Failed to create unquantized linear weights. "
                   "This may be caused by insufficient memory to allocate "
                   "the weight.")
        raise RuntimeError(message) from oom

    # Tag the dimension roles first, then register, then apply the
    # caller-supplied attributes -- same order as before.
    layout_attrs = {"input_dim": 1, "output_dim": 0}
    set_weight_attrs(weight, layout_attrs)
    layer.register_parameter("weight", weight)
    set_weight_attrs(weight, extra_weight_attrs)


# Monkey-patch: importing this module replaces
# UnquantizedLinearMethod.create_weights with the function defined above.
UnquantizedLinearMethod.create_weights = create_weights