[TPU] Support single and multi-host TPUs on GKE (#7613)

Richard Liu
2024-08-30 00:27:40 -07:00
committed by GitHub
parent dc13e99348
commit 2148441fd3
5 changed files with 74 additions and 4 deletions

vllm/distributed/device_communicators/tpu_communicator.py

@@ -1,3 +1,5 @@
+import os
+
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
@@ -5,11 +7,12 @@ from torch.distributed import ProcessGroup
 from vllm.platforms import current_platform

 if current_platform.is_tpu():
-    import ray
     import torch_xla.core.xla_model as xm
     import torch_xla.runtime as xr
     from torch_xla._internal import pjrt

+    from vllm.executor import ray_utils
+

 class TpuCommunicator:
@@ -24,9 +27,29 @@ class TpuCommunicator:
         # be simply calculated as follows.
         global_rank = dist.get_rank(group)
         global_world_size = dist.get_world_size(group)

-        num_nodes = len(ray.nodes())
+        # Calculate how many TPU nodes are in the current deployment.
+        # This is the number of nodes in the Ray placement group when the
+        # engine is deployed with Ray; otherwise it defaults to the number
+        # of TPU nodes in the Ray cluster. The node count is the total
+        # number of TPUs divided by the number of TPU accelerators per
+        # node, which accounts for clusters that mix CPU and TPU nodes.
+        num_nodes = ray_utils.get_num_tpu_nodes()
+        num_nodes_in_pg = ray_utils.get_num_nodes_in_placement_group()
+        if num_nodes_in_pg > 0:
+            num_nodes = num_nodes_in_pg
+
         local_world_size = global_world_size // num_nodes
         local_rank = global_rank % local_world_size
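+        # For example, a deployment with 2 TPU hosts of 4 chips each has
+        # global_world_size 8 and num_nodes 2, so local_world_size is 4
+        # and global ranks 0-7 map to local ranks 0-3 on each host.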
+
+        # Ensure the environment variables are set for multihost
+        # deployments. On GKE, libtpu and the TPU driver need them to
+        # know which TPU chip is visible to each worker; otherwise the
+        # driver fails to initialize because the number of devices would
+        # differ from the number of visible worker addresses.
+        os.environ["CLOUD_TPU_TASK_ID"] = str(global_rank)
+        os.environ["TPU_VISIBLE_CHIPS"] = str(local_rank)
+
         pjrt.initialize_multiprocess(local_rank, local_world_size)
         xr._init_world_size_ordinal()
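
The num_nodes logic above relies on two helpers that this commit adds to
vllm/executor/ray_utils.py (imported above as ray_utils). Below is a
minimal sketch of how they can be implemented on top of Ray's public
cluster and placement group APIs; the commit's actual code may differ,
and the sketch assumes every TPU host advertises the same number of chips.

import ray


def get_num_tpu_nodes() -> int:
    # Total TPU chips in the cluster divided by the chips per TPU host
    # gives the number of TPU nodes, even when CPU-only nodes share the
    # same Ray cluster.
    total_tpus = int(ray.cluster_resources().get("TPU", 0))
    # Read chips-per-host from any live node that advertises TPUs;
    # assumes homogeneous TPU hosts, as in a GKE TPU node pool.
    tpus_per_node = max(
        int(node["Resources"].get("TPU", 0))
        for node in ray.nodes() if node["Alive"])
    assert tpus_per_node > 0 and total_tpus % tpus_per_node == 0
    return total_tpus // tpus_per_node


def get_num_nodes_in_placement_group() -> int:
    # Returns 0 when no placement group is active, which makes the
    # caller above fall back to get_num_tpu_nodes().
    current_pg = ray.util.get_current_placement_group()
    if current_pg is None:
        return 0
    pg_info = ray.util.placement_group_table()[current_pg.id.hex()]
    # Each scheduled bundle records the node it was placed on; the set
    # of distinct node IDs gives the node count for the group.
    return len(set(pg_info["bundles_to_node_id"].values()))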