re-added commented out debug print

This commit is contained in:
Thomas Friedel 2024-05-26 02:14:09 +02:00
parent 99b3fac964
commit 803f4013a2

View File

@ -224,6 +224,7 @@ class BaseTrainer:
"""Initializes and sets the DistributedDataParallel parameters for training."""
torch.cuda.set_device(RANK)
self.device = torch.device("cuda", RANK)
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
dist.init_process_group(
    backend="nccl" if dist.is_nccl_available() else "gloo",