# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for distributed training."""

import os

import torch.distributed

from verl.utils.device import get_nccl_backend, get_torch_device


def initialize_global_process_group(timeout_second=36000):
    """Create the default process group and bind this process to its device.

    The backend is whatever ``get_nccl_backend()`` returns. The rendezvous
    method is read from the ``DIST_INIT_METHOD`` environment variable; when
    that variable is unset, ``None`` is handed to ``init_process_group``.

    Args:
        timeout_second: Timeout, in seconds, passed to the process group as a
            ``timedelta``. Defaults to 36000 (10 hours).

    Returns:
        A ``(local_rank, rank, world_size)`` tuple of ints parsed from the
        ``LOCAL_RANK``, ``RANK`` and ``WORLD_SIZE`` environment variables.

    Raises:
        KeyError: If ``LOCAL_RANK``, ``RANK`` or ``WORLD_SIZE`` is missing
            from the environment.
        ValueError: If any of those variables is not a valid integer.
    """
    from datetime import timedelta

    backend = get_nccl_backend()
    group_timeout = timedelta(seconds=timeout_second)
    rendezvous = os.environ.get("DIST_INIT_METHOD")
    torch.distributed.init_process_group(
        backend,
        timeout=group_timeout,
        init_method=rendezvous,
    )

    # The environment is consulted only after the group has been created, and
    # the variables are read in this fixed order.
    local_rank, rank, world_size = (
        int(os.environ[key]) for key in ("LOCAL_RANK", "RANK", "WORLD_SIZE")
    )

    # Select the device for this process only once the group reports itself
    # as initialized.
    if torch.distributed.is_initialized():
        get_torch_device().set_device(local_rank)

    return local_rank, rank, world_size


def destroy_global_process_group():
    """Destroy the default process group; do nothing if none is initialized."""
    if not torch.distributed.is_initialized():
        return
    torch.distributed.destroy_process_group()