revert less related changes

This commit is contained in:
Thomas Friedel 2024-05-26 02:35:45 +02:00
parent 803f4013a2
commit 5108ff5c13

View File

@ -225,9 +225,9 @@ class BaseTrainer:
torch.cuda.set_device(RANK)
self.device = torch.device("cuda", RANK)
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
os.environ["NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
dist.init_process_group(
backend="nccl" if dist.is_nccl_available() else "gloo",
"nccl" if dist.is_nccl_available() else "gloo",
timeout=timedelta(seconds=10800), # 3 hours
rank=RANK,
world_size=world_size,