大改 (major overhaul)

2026-02-20 19:50:15 +00:00 · 2026-02-10 23:08:39 +08:00
parent 1baa36026c
commit 6680585975
172 changed files with 52867 additions and 892 deletions
--- a/vllm_npu/distributed/communicator.py
+++ b/vllm_npu/distributed/communicator.py
@@ -1,42 +1,46 @@
-"""
-NPUCommunicator — HCCL-based device communicator for Ascend NPU.
-
-Extends ``DeviceCommunicatorBase`` with NPU-specific collective
-operations using the HCCL backend.
-"""
-
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
 from typing import List, Optional

 import torch
 import torch.distributed as dist
-from vllm.distributed.device_communicators.base_device_communicator import (
-    DeviceCommunicatorBase,
-)
+from vllm.distributed.device_communicators.base_device_communicator import \
+    DeviceCommunicatorBase


 class NPUCommunicator(DeviceCommunicatorBase):
-    """Device communicator for Ascend NPU using HCCL."""

-    def __init__(
-        self,
-        cpu_group: dist.ProcessGroup,
-        device: Optional[torch.device] = None,
-        device_group: Optional[dist.ProcessGroup] = None,
-        unique_name: str = "",
-    ):
+    def __init__(self,
+                 cpu_group: dist.ProcessGroup,
+                 device: Optional[torch.device] = None,
+                 device_group: Optional[dist.ProcessGroup] = None,
+                 unique_name: str = ""):
        super().__init__(cpu_group, device, device_group, unique_name)
-        import torch_npu  # noqa: F401
+        # TODO(hz): Refer to CudaCommunicator's implementation to integrate PyHcclCommunicator
+        # init device according to rank
        self.device = torch.npu.current_device()

-    def all_to_all(
-        self,
-        input_: torch.Tensor,
-        scatter_dim: int = 0,
-        gather_dim: int = -1,
-        scatter_sizes: Optional[List[int]] = None,
-        gather_sizes: Optional[List[int]] = None,
-    ) -> torch.Tensor:
-        """All-to-all communication for NPU tensors."""
+    def all_to_all(self,
+                   input_: torch.Tensor,
+                   scatter_dim: int = 0,
+                   gather_dim: int = -1,
+                   scatter_sizes: Optional[List[int]] = None,
+                   gather_sizes: Optional[List[int]] = None) -> torch.Tensor:
+
        if scatter_dim < 0:
            scatter_dim += input_.dim()
        if gather_dim < 0:
@@ -53,22 +57,17 @@ class NPUCommunicator(DeviceCommunicatorBase):
                tensor_shape = list(tensor_shape_base)
                tensor_shape[gather_dim] = gather_sizes[i]
                output_list.append(
-                    torch.empty(
-                        tensor_shape,
-                        dtype=input_.dtype,
-                        device=input_.device,
-                    )
-                )
+                    torch.empty(tensor_shape,
+                                dtype=input_.dtype,
+                                device=input_.device))
+
        else:
            input_list = [
-                t.contiguous()
-                for t in torch.tensor_split(
-                    input_, self.world_size, scatter_dim
-                )
+                t.contiguous() for t in torch.tensor_split(
+                    input_, self.world_size, scatter_dim)
            ]
            output_list = [
-                torch.empty_like(input_list[i])
-                for i in range(self.world_size)
+                torch.empty_like(input_list[i]) for i in range(self.world_size)
            ]

        dist.all_to_all(output_list, input_list, group=self.device_group)