[Hardware][AMD] Improve OAM device ID + llama4 Maverick MOE tuning (#16263)
Signed-off-by: Lu Fang <lufang@fb.com> Co-authored-by: Lu Fang <lufang@fb.com>
This commit is contained in:
@@ -442,8 +442,14 @@ class BenchmarkWorker:
|
||||
hidden_size, search_space,
|
||||
is_fp16, topk)
|
||||
|
||||
with torch.cuda.device(self.device_id) if current_platform.is_rocm(
|
||||
) else nullcontext():
|
||||
need_device_guard = False
|
||||
if current_platform.is_rocm():
|
||||
visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
|
||||
if visible_device != f"{self.device_id}":
|
||||
need_device_guard = True
|
||||
|
||||
with torch.cuda.device(
|
||||
self.device_id) if need_device_guard else nullcontext():
|
||||
for config in tqdm(search_space):
|
||||
try:
|
||||
kernel_time = benchmark_config(
|
||||
@@ -578,6 +584,15 @@ def main(args: argparse.Namespace):
|
||||
|
||||
use_deep_gemm = bool(args.use_deep_gemm)
|
||||
|
||||
if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ:
|
||||
# Ray will set ROCR_VISIBLE_DEVICES for device visibility
|
||||
logger.warning(
|
||||
"Ray uses ROCR_VISIBLE_DEVICES to control device accessibility."
|
||||
"Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.")
|
||||
val = os.environ["HIP_VISIBLE_DEVICES"]
|
||||
os.environ["ROCR_VISIBLE_DEVICES"] = val
|
||||
del os.environ["HIP_VISIBLE_DEVICES"]
|
||||
|
||||
ray.init()
|
||||
num_gpus = int(ray.available_resources()["GPU"])
|
||||
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
|
||||
|
||||
Reference in New Issue
Block a user