Unverified Commit 1e6825da authored by Jeevan's avatar Jeevan Committed by GitHub
Browse files

Fix watchdog timeout (#1404)

* Fix watchdog timeout

* Pre-commit fix

* Timedelta
parent 921eab86
import copy import copy
import os import os
from datetime import timedelta
from pathlib import Path from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
import transformers import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size from accelerate import (
Accelerator,
DistributedType,
InitProcessGroupKwargs,
find_executable_batch_size,
)
from packaging import version from packaging import version
from peft import PeftModel from peft import PeftModel
from peft import __version__ as PEFT_VERSION from peft import __version__ as PEFT_VERSION
...@@ -132,7 +138,8 @@ class HFLM(LM): ...@@ -132,7 +138,8 @@ class HFLM(LM):
assert isinstance(batch_size, (int, str)) assert isinstance(batch_size, (int, str))
gpus = torch.cuda.device_count() gpus = torch.cuda.device_count()
accelerator = Accelerator() accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
if accelerator.num_processes > 1: if accelerator.num_processes > 1:
self.accelerator = accelerator self.accelerator = accelerator
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment