ultralytics 8.0.162 Multi-GPU DDP fix (#4544)

Co-authored-by: Yonghye Kwon <developer.0hye@gmail.com>
Co-authored-by: andresinsitu <andres.rodriguez@ingenieriainsitu.com>
This commit is contained in:
Glenn Jocher 2023-08-24 13:13:49 +02:00 committed by GitHub
parent 1db9afc2e5
commit 2bcee56e70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 24 additions and 14 deletions

View File

@ -21,6 +21,10 @@ keywords: Ultralytics, YOLO, documentation, callback utilities, log_scalars, on_
## ::: ultralytics.utils.callbacks.tensorboard.on_pretrain_routine_start ## ::: ultralytics.utils.callbacks.tensorboard.on_pretrain_routine_start
<br><br> <br><br>
---
## ::: ultralytics.utils.callbacks.tensorboard.on_train_start
<br><br>
--- ---
## ::: ultralytics.utils.callbacks.tensorboard.on_batch_end ## ::: ultralytics.utils.callbacks.tensorboard.on_batch_end
<br><br> <br><br>

View File

@ -63,7 +63,7 @@ def test_export(model, format):
def test_rtdetr(task='detect', model='yolov8n-rtdetr.yaml', data='coco8.yaml'): def test_rtdetr(task='detect', model='yolov8n-rtdetr.yaml', data='coco8.yaml'):
# Warning: MUST use imgsz=640 # Warning: MUST use imgsz=640
run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1 cache=disk') run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1, cache = disk') # add comma, space to args
run(f"yolo predict {task} model={model} source={ASSETS / 'bus.jpg'} imgsz=640 save save_crop save_txt") run(f"yolo predict {task} model={model} source={ASSETS / 'bus.jpg'} imgsz=640 save save_crop save_txt")

View File

@ -145,13 +145,13 @@ def test_val():
def test_train_scratch(): def test_train_scratch():
model = YOLO(CFG) model = YOLO(CFG)
model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1) model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1, name='model')
model(SOURCE) model(SOURCE)
def test_train_pretrained(): def test_train_pretrained():
model = YOLO(WEIGHTS_DIR / 'yolov8n-seg.pt') model = YOLO(WEIGHTS_DIR / 'yolov8n-seg.pt')
model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5) model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5, name=0)
model(SOURCE) model(SOURCE)

View File

@ -1,6 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license # Ultralytics YOLO 🚀, AGPL-3.0 license
__version__ = '8.0.161' __version__ = '8.0.162'
from ultralytics.models import RTDETR, SAM, YOLO from ultralytics.models import RTDETR, SAM, YOLO
from ultralytics.models.fastsam import FastSAM from ultralytics.models.fastsam import FastSAM

View File

@ -110,6 +110,7 @@ def get_cfg(cfg: Union[str, Path, Dict, SimpleNamespace] = DEFAULT_CFG_DICT, ove
# Merge overrides # Merge overrides
if overrides: if overrides:
overrides = cfg2dict(overrides) overrides = cfg2dict(overrides)
if 'save_dir' not in cfg:
overrides.pop('save_dir', None) # special override keys to ignore overrides.pop('save_dir', None) # special override keys to ignore
check_dict_alignment(cfg, overrides) check_dict_alignment(cfg, overrides)
cfg = {**cfg, **overrides} # merge cfg and overrides dicts (prefer overrides) cfg = {**cfg, **overrides} # merge cfg and overrides dicts (prefer overrides)

View File

@ -343,7 +343,7 @@ def check_cls_dataset(dataset: str, split=''):
# Print to console # Print to console
for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items(): for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items():
prefix = f'{colorstr(k)} {v}...' prefix = f'{colorstr(f"{k}:")} {v}...'
if v is None: if v is None:
LOGGER.info(prefix) LOGGER.info(prefix)
else: else:

View File

@ -184,7 +184,7 @@ class BaseTrainer:
# Command # Command
cmd, file = generate_ddp_command(world_size, self) cmd, file = generate_ddp_command(world_size, self)
try: try:
LOGGER.info(f'DDP command: {cmd}') LOGGER.info(f'{colorstr("DDP:")} debug command {" ".join(cmd)}')
subprocess.run(cmd, check=True) subprocess.run(cmd, check=True)
except Exception as e: except Exception as e:
raise e raise e
@ -197,7 +197,7 @@ class BaseTrainer:
"""Initializes and sets the DistributedDataParallel parameters for training.""" """Initializes and sets the DistributedDataParallel parameters for training."""
torch.cuda.set_device(RANK) torch.cuda.set_device(RANK)
self.device = torch.device('cuda', RANK) self.device = torch.device('cuda', RANK)
LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}') # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout
dist.init_process_group( dist.init_process_group(
'nccl' if dist.is_nccl_available() else 'gloo', 'nccl' if dist.is_nccl_available() else 'gloo',
@ -299,8 +299,7 @@ class BaseTrainer:
self.epoch_time_start = time.time() self.epoch_time_start = time.time()
self.train_time_start = time.time() self.train_time_start = time.time()
nb = len(self.train_loader) # number of batches nb = len(self.train_loader) # number of batches
nw = max(round(self.args.warmup_epochs * nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1 # warmup iterations
nb), 100) if self.args.warmup_epochs > 0 else -1 # number of warmup iterations
last_opt_step = -1 last_opt_step = -1
self.run_callbacks('on_train_start') self.run_callbacks('on_train_start')
LOGGER.info(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n' LOGGER.info(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
@ -557,7 +556,7 @@ class BaseTrainer:
n = len(metrics) + 1 # number of cols n = len(metrics) + 1 # number of cols
s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n') # header s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n') # header
with open(self.csv, 'a') as f: with open(self.csv, 'a') as f:
f.write(s + ('%23.5g,' * n % tuple([self.epoch] + vals)).rstrip(',') + '\n') f.write(s + ('%23.5g,' * n % tuple([self.epoch + 1] + vals)).rstrip(',') + '\n')
def plot_metrics(self): def plot_metrics(self):
"""Plot and display metrics visually.""" """Plot and display metrics visually."""

View File

@ -222,7 +222,7 @@ class BaseValidator:
Args: Args:
pred_classes (torch.Tensor): Predicted class indices of shape(N,). pred_classes (torch.Tensor): Predicted class indices of shape(N,).
true_classes (torch.Tensor): Target class indices of shape(M,). true_classes (torch.Tensor): Target class indices of shape(M,).
iou (torch.Tensor): IoU thresholds from 0.50 to 0.95 in space of 0.05. iou (torch.Tensor): An NxM tensor containing the pairwise IoU values for predictions and ground truth.
Returns: Returns:
(torch.Tensor): Correct tensor of shape(N,10) for 10 IoU thresholds. (torch.Tensor): Correct tensor of shape(N,10) for 10 IoU thresholds.

View File

@ -23,7 +23,7 @@ def _log_scalars(scalars, step=0):
def _log_tensorboard_graph(trainer): def _log_tensorboard_graph(trainer):
# Log model graph to TensorBoard """Log model graph to TensorBoard."""
try: try:
import warnings import warnings
@ -48,11 +48,16 @@ def on_pretrain_routine_start(trainer):
WRITER = SummaryWriter(str(trainer.save_dir)) WRITER = SummaryWriter(str(trainer.save_dir))
prefix = colorstr('TensorBoard: ') prefix = colorstr('TensorBoard: ')
LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/") LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/")
_log_tensorboard_graph(trainer)
except Exception as e: except Exception as e:
LOGGER.warning(f'WARNING ⚠️ TensorBoard not initialized correctly, not logging this run. {e}') LOGGER.warning(f'WARNING ⚠️ TensorBoard not initialized correctly, not logging this run. {e}')
def on_train_start(trainer):
"""Log TensorBoard graph."""
if WRITER:
_log_tensorboard_graph(trainer)
def on_batch_end(trainer): def on_batch_end(trainer):
"""Logs scalar statistics at the end of a training batch.""" """Logs scalar statistics at the end of a training batch."""
_log_scalars(trainer.label_loss_items(trainer.tloss, prefix='train'), trainer.epoch + 1) _log_scalars(trainer.label_loss_items(trainer.tloss, prefix='train'), trainer.epoch + 1)
@ -65,5 +70,6 @@ def on_fit_epoch_end(trainer):
callbacks = { callbacks = {
'on_pretrain_routine_start': on_pretrain_routine_start, 'on_pretrain_routine_start': on_pretrain_routine_start,
'on_train_start': on_train_start,
'on_fit_epoch_end': on_fit_epoch_end, 'on_fit_epoch_end': on_fit_epoch_end,
'on_batch_end': on_batch_end} 'on_batch_end': on_batch_end}