[Hardware][AMD] Improve OAM device ID + llama4 Maverick MOE tuning (#16263)

Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Lu Fang <lufang@fb.com>
This commit is contained in:
Xiaodong Wang
2025-05-02 12:44:19 -07:00
committed by GitHub
parent 182f40ea8b
commit 9352cdb56d
3 changed files with 231 additions and 3 deletions

View File

@@ -442,8 +442,14 @@ class BenchmarkWorker:
hidden_size, search_space,
is_fp16, topk)
with torch.cuda.device(self.device_id) if current_platform.is_rocm(
) else nullcontext():
need_device_guard = False
if current_platform.is_rocm():
visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
if visible_device != f"{self.device_id}":
need_device_guard = True
with torch.cuda.device(
self.device_id) if need_device_guard else nullcontext():
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(
@@ -578,6 +584,15 @@ def main(args: argparse.Namespace):
use_deep_gemm = bool(args.use_deep_gemm)
if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ:
# Ray will set ROCR_VISIBLE_DEVICES for device visibility
logger.warning(
"Ray uses ROCR_VISIBLE_DEVICES to control device accessibility."
"Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.")
val = os.environ["HIP_VISIBLE_DEVICES"]
os.environ["ROCR_VISIBLE_DEVICES"] = val
del os.environ["HIP_VISIBLE_DEVICES"]
ray.init()
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]