[core] LLM.collective_rpc interface and RLHF example (#12084)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-16 20:19:52 +08:00
parent bf53e0c70b
commit 92e793d91a
6 changed files with 270 additions and 35 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -126,11 +126,15 @@ steps:
  - tests/distributed
  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile
+  - examples/offline_inference/rlhf.py
  commands:
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - python3 ../examples/offline_inference/rlhf.py

 - label: Metrics, Tracing Test # 10min
  num_gpus: 2