ultralytics 8.0.162 Multi-GPU DDP fix (#4544)

Co-authored-by: Yonghye Kwon <developer.0hye@gmail.com> Co-authored-by: andresinsitu <andres.rodriguez@ingenieriainsitu.com>
2025-07-04 11:44:22 +08:00 · 2023-08-24 13:13:49 +02:00 · 2023-08-24 13:13:49 +02:00 · 2bcee56e70
commit 2bcee56e70
parent 1db9afc2e5
9 changed files with 24 additions and 14 deletions
--- a/docs/reference/utils/callbacks/tensorboard.md
+++ b/docs/reference/utils/callbacks/tensorboard.md
@@ -21,6 +21,10 @@ keywords: Ultralytics, YOLO, documentation, callback utilities, log_scalars, on_
 ## ::: ultralytics.utils.callbacks.tensorboard.on_pretrain_routine_start
 <br><br>

+---
+## ::: ultralytics.utils.callbacks.tensorboard.on_train_start
+<br><br>
+
 ---
 ## ::: ultralytics.utils.callbacks.tensorboard.on_batch_end
 <br><br>
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -63,7 +63,7 @@ def test_export(model, format):

 def test_rtdetr(task='detect', model='yolov8n-rtdetr.yaml', data='coco8.yaml'):
    # Warning: MUST use imgsz=640
-    run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1 cache=disk')
+    run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1, cache = disk')  # add comma, space to args
    run(f"yolo predict {task} model={model} source={ASSETS / 'bus.jpg'} imgsz=640 save save_crop save_txt")


--- a/tests/test_python.py
+++ b/tests/test_python.py
@@ -145,13 +145,13 @@ def test_val():

 def test_train_scratch():
    model = YOLO(CFG)
-    model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1)
+    model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1, name='model')
    model(SOURCE)


 def test_train_pretrained():
    model = YOLO(WEIGHTS_DIR / 'yolov8n-seg.pt')
-    model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5)
+    model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5, name=0)
    model(SOURCE)


--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license

-__version__ = '8.0.161'
+__version__ = '8.0.162'

 from ultralytics.models import RTDETR, SAM, YOLO
 from ultralytics.models.fastsam import FastSAM
--- a/ultralytics/cfg/__init__.py
+++ b/ultralytics/cfg/__init__.py
@@ -110,6 +110,7 @@ def get_cfg(cfg: Union[str, Path, Dict, SimpleNamespace] = DEFAULT_CFG_DICT, ove
    # Merge overrides
    if overrides:
        overrides = cfg2dict(overrides)
+        if 'save_dir' not in cfg:
            overrides.pop('save_dir', None)  # special override keys to ignore
        check_dict_alignment(cfg, overrides)
        cfg = {**cfg, **overrides}  # merge cfg and overrides dicts (prefer overrides)
--- a/ultralytics/data/utils.py
+++ b/ultralytics/data/utils.py
@@ -343,7 +343,7 @@ def check_cls_dataset(dataset: str, split=''):

    # Print to console
    for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items():
-        prefix = f'{colorstr(k)} {v}...'
+        prefix = f'{colorstr(f"{k}:")} {v}...'
        if v is None:
            LOGGER.info(prefix)
        else:
--- a/ultralytics/engine/trainer.py
+++ b/ultralytics/engine/trainer.py
@@ -184,7 +184,7 @@ class BaseTrainer:
            # Command
            cmd, file = generate_ddp_command(world_size, self)
            try:
-                LOGGER.info(f'DDP command: {cmd}')
+                LOGGER.info(f'{colorstr("DDP:")} debug command {" ".join(cmd)}')
                subprocess.run(cmd, check=True)
            except Exception as e:
                raise e
@@ -197,7 +197,7 @@ class BaseTrainer:
        """Initializes and sets the DistributedDataParallel parameters for training."""
        torch.cuda.set_device(RANK)
        self.device = torch.device('cuda', RANK)
-        LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
+        # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
        os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
        dist.init_process_group(
            'nccl' if dist.is_nccl_available() else 'gloo',
@@ -299,8 +299,7 @@ class BaseTrainer:
        self.epoch_time_start = time.time()
        self.train_time_start = time.time()
        nb = len(self.train_loader)  # number of batches
-        nw = max(round(self.args.warmup_epochs *
-                       nb), 100) if self.args.warmup_epochs > 0 else -1  # number of warmup iterations
+        nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1  # warmup iterations
        last_opt_step = -1
        self.run_callbacks('on_train_start')
        LOGGER.info(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
@@ -557,7 +556,7 @@ class BaseTrainer:
        n = len(metrics) + 1  # number of cols
        s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n')  # header
        with open(self.csv, 'a') as f:
-            f.write(s + ('%23.5g,' * n % tuple([self.epoch] + vals)).rstrip(',') + '\n')
+            f.write(s + ('%23.5g,' * n % tuple([self.epoch + 1] + vals)).rstrip(',') + '\n')

    def plot_metrics(self):
        """Plot and display metrics visually."""
--- a/ultralytics/engine/validator.py
+++ b/ultralytics/engine/validator.py
@@ -222,7 +222,7 @@ class BaseValidator:
        Args:
            pred_classes (torch.Tensor): Predicted class indices of shape(N,).
            true_classes (torch.Tensor): Target class indices of shape(M,).
-            iou (torch.Tensor): IoU thresholds from 0.50 to 0.95 in space of 0.05.
+            iou (torch.Tensor): An NxM tensor containing the pairwise IoU values for predictions and ground truth

        Returns:
            (torch.Tensor): Correct tensor of shape(N,10) for 10 IoU thresholds.
--- a/ultralytics/utils/callbacks/tensorboard.py
+++ b/ultralytics/utils/callbacks/tensorboard.py
@@ -23,7 +23,7 @@ def _log_scalars(scalars, step=0):


 def _log_tensorboard_graph(trainer):
-    # Log model graph to TensorBoard
+    """Log model graph to TensorBoard."""
    try:
        import warnings

@@ -48,11 +48,16 @@ def on_pretrain_routine_start(trainer):
            WRITER = SummaryWriter(str(trainer.save_dir))
            prefix = colorstr('TensorBoard: ')
            LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/")
-            _log_tensorboard_graph(trainer)
        except Exception as e:
            LOGGER.warning(f'WARNING ⚠️ TensorBoard not initialized correctly, not logging this run. {e}')


+def on_train_start(trainer):
+    """Log TensorBoard graph."""
+    if WRITER:
+        _log_tensorboard_graph(trainer)
+
+
 def on_batch_end(trainer):
    """Logs scalar statistics at the end of a training batch."""
    _log_scalars(trainer.label_loss_items(trainer.tloss, prefix='train'), trainer.epoch + 1)
@@ -65,5 +70,6 @@ def on_fit_epoch_end(trainer):

 callbacks = {
    'on_pretrain_routine_start': on_pretrain_routine_start,
+    'on_train_start': on_train_start,
    'on_fit_epoch_end': on_fit_epoch_end,
    'on_batch_end': on_batch_end}