diff --git a/ultralytics/engine/trainer.py b/ultralytics/engine/trainer.py
index da34d65c..82a398c4 100644
--- a/ultralytics/engine/trainer.py
+++ b/ultralytics/engine/trainer.py
@@ -224,6 +224,7 @@ class BaseTrainer:
         """Initializes and sets the DistributedDataParallel parameters for training."""
         torch.cuda.set_device(RANK)
         self.device = torch.device("cuda", RANK)
+        # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
         os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
         dist.init_process_group(
             backend="nccl" if dist.is_nccl_available() else "gloo",
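
For context, the hunk touches the per-worker DDP setup. Below is a minimal, self-contained sketch (not the Ultralytics implementation) of the same pattern: pin the worker to its GPU, enable blocking NCCL waits so collectives honor the timeout, and initialize the default process group with an NCCL/Gloo fallback. The `RANK`/`WORLD_SIZE` environment variables are the ones set by `torchrun`; the timeout value is an assumption for illustration.

```python
import os
from datetime import timedelta

import torch
import torch.distributed as dist


def init_ddp() -> torch.device:
    """Initialize the default process group for one DDP worker (illustrative sketch)."""
    rank = int(os.environ["RANK"])              # set by torchrun for each worker
    world_size = int(os.environ["WORLD_SIZE"])  # total number of workers

    if torch.cuda.is_available():
        torch.cuda.set_device(rank)             # bind this process to its GPU
        device = torch.device("cuda", rank)
    else:
        device = torch.device("cpu")

    # Make NCCL collectives block and respect the timeout instead of hanging silently.
    os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"

    dist.init_process_group(
        backend="nccl" if dist.is_nccl_available() else "gloo",
        timeout=timedelta(hours=1),             # assumed value, not from the diff
        rank=rank,
        world_size=world_size,
    )
    return device
```

Launched under `torchrun --nproc_per_node=N train.py`, each worker would call `init_ddp()` once before wrapping its model in `torch.nn.parallel.DistributedDataParallel`.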