mirror of
https://github.com/THU-MIG/yolov10.git
synced 2025-05-24 06:14:55 +08:00
Increase NCCL timeout from 1 hour to 3 hours (#3343)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
parent
137552996a
commit
2ebd808b69
@@ -197,10 +197,11 @@ class BaseTrainer:
|
||||
self.device = torch.device('cuda', RANK)
|
||||
LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
|
||||
os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout
|
||||
dist.init_process_group('nccl' if dist.is_nccl_available() else 'gloo',
|
||||
timeout=timedelta(seconds=3600),
|
||||
rank=RANK,
|
||||
world_size=world_size)
|
||||
dist.init_process_group(
|
||||
'nccl' if dist.is_nccl_available() else 'gloo',
|
||||
timeout=timedelta(seconds=10800), # 3 hours
|
||||
rank=RANK,
|
||||
world_size=world_size)
|
||||
|
||||
def _setup_train(self, world_size):
|
||||
"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user