Commit 1e6825da (unverified), authored by Jeevan, committed by GitHub
Browse files

Fix watchdog timeout (#1404)

* Fix watchdog timeout

* Pre-commit fix

* Timedelta
parent 921eab86
import copy
import os
from datetime import timedelta
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from accelerate import (
Accelerator,
DistributedType,
InitProcessGroupKwargs,
find_executable_batch_size,
)
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
......@@ -132,7 +138,8 @@ class HFLM(LM):
assert isinstance(batch_size, (int, str))
gpus = torch.cuda.device_count()
accelerator = Accelerator()
accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
if accelerator.num_processes > 1:
self.accelerator = accelerator
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment