[core] LLM.collective_rpc interface and RLHF example (#12084)

Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
youkaichao
2025-01-16 20:19:52 +08:00
committed by GitHub
parent bf53e0c70b
commit 92e793d91a
6 changed files with 270 additions and 35 deletions

View File

@@ -126,11 +126,15 @@ steps:
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
- examples/offline_inference/rlhf.py
commands:
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: move multi-GPU example scripts into their own dedicated test section
# once there is more than one distributed example to run
- python3 ../examples/offline_inference/rlhf.py
- label: Metrics, Tracing Test # 10min
num_gpus: 2