Commit 44a3783d (unverified), authored by Zilin Zhu, committed by GitHub
Browse files

[fix][RL] Remove the incorrect barrier in init_weights_update_group (#5914)

parent f3bf6110
...@@ -631,7 +631,6 @@ class ModelRunner: ...@@ -631,7 +631,6 @@ class ModelRunner:
rank=rank, rank=rank,
group_name=group_name, group_name=group_name,
) )
dist.barrier(group=self._model_update_group, device_ids=[rank])
return True, "Succeeded to initialize custom process group." return True, "Succeeded to initialize custom process group."
except Exception as e: except Exception as e:
message = f"Failed to initialize custom process group: {e}." message = f"Failed to initialize custom process group: {e}."
......
...@@ -162,7 +162,6 @@ def init_process_hf( ...@@ -162,7 +162,6 @@ def init_process_hf(
rank=rank, rank=rank,
group_name="test_parameter_update_group", group_name="test_parameter_update_group",
) )
dist.barrier(group=group, device_ids=[rank])
torch.cuda.synchronize() torch.cuda.synchronize()
time_begin_broadcast = time.perf_counter() time_begin_broadcast = time.perf_counter()
...@@ -223,8 +222,8 @@ def init_process_sgl( ...@@ -223,8 +222,8 @@ def init_process_sgl(
if rank == 1: if rank == 1:
url = DEFAULT_URL_FOR_TEST url = DEFAULT_URL_FOR_TEST
else: else:
host, port = DEFAULT_URL_FOR_TEST.split(":") host, _, port = DEFAULT_URL_FOR_TEST.rpartition(":")
url = ":".join(host, str(int(port) + 10000)) url = ":".join([host, str(int(port) + 10000)])
print(f"[sgl] rank {rank} init server on url: {url}") print(f"[sgl] rank {rank} init server on url: {url}")
process = popen_launch_server( process = popen_launch_server(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment