mirror of
https://github.com/THU-MIG/yolov10.git
synced 2025-05-23 13:34:23 +08:00
ultralytics 8.0.162
Multi-GPU DDP fix (#4544)
Co-authored-by: Yonghye Kwon <developer.0hye@gmail.com> Co-authored-by: andresinsitu <andres.rodriguez@ingenieriainsitu.com>
This commit is contained in:
parent
1db9afc2e5
commit
2bcee56e70
@ -21,6 +21,10 @@ keywords: Ultralytics, YOLO, documentation, callback utilities, log_scalars, on_
|
||||
## ::: ultralytics.utils.callbacks.tensorboard.on_pretrain_routine_start
|
||||
<br><br>
|
||||
|
||||
---
|
||||
## ::: ultralytics.utils.callbacks.tensorboard.on_train_start
|
||||
<br><br>
|
||||
|
||||
---
|
||||
## ::: ultralytics.utils.callbacks.tensorboard.on_batch_end
|
||||
<br><br>
|
||||
|
@ -63,7 +63,7 @@ def test_export(model, format):
|
||||
|
||||
def test_rtdetr(task='detect', model='yolov8n-rtdetr.yaml', data='coco8.yaml'):
|
||||
# Warning: MUST use imgsz=640
|
||||
run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1 cache=disk')
|
||||
run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1, cache = disk') # add comma, space to args
|
||||
run(f"yolo predict {task} model={model} source={ASSETS / 'bus.jpg'} imgsz=640 save save_crop save_txt")
|
||||
|
||||
|
||||
|
@ -145,13 +145,13 @@ def test_val():
|
||||
|
||||
def test_train_scratch():
|
||||
model = YOLO(CFG)
|
||||
model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1)
|
||||
model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1, name='model')
|
||||
model(SOURCE)
|
||||
|
||||
|
||||
def test_train_pretrained():
|
||||
model = YOLO(WEIGHTS_DIR / 'yolov8n-seg.pt')
|
||||
model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5)
|
||||
model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5, name=0)
|
||||
model(SOURCE)
|
||||
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
# Ultralytics YOLO 🚀, AGPL-3.0 license
|
||||
|
||||
__version__ = '8.0.161'
|
||||
__version__ = '8.0.162'
|
||||
|
||||
from ultralytics.models import RTDETR, SAM, YOLO
|
||||
from ultralytics.models.fastsam import FastSAM
|
||||
|
@ -110,6 +110,7 @@ def get_cfg(cfg: Union[str, Path, Dict, SimpleNamespace] = DEFAULT_CFG_DICT, ove
|
||||
# Merge overrides
|
||||
if overrides:
|
||||
overrides = cfg2dict(overrides)
|
||||
if 'save_dir' not in cfg:
|
||||
overrides.pop('save_dir', None) # special override keys to ignore
|
||||
check_dict_alignment(cfg, overrides)
|
||||
cfg = {**cfg, **overrides} # merge cfg and overrides dicts (prefer overrides)
|
||||
|
@ -343,7 +343,7 @@ def check_cls_dataset(dataset: str, split=''):
|
||||
|
||||
# Print to console
|
||||
for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items():
|
||||
prefix = f'{colorstr(k)} {v}...'
|
||||
prefix = f'{colorstr(f"{k}:")} {v}...'
|
||||
if v is None:
|
||||
LOGGER.info(prefix)
|
||||
else:
|
||||
|
@ -184,7 +184,7 @@ class BaseTrainer:
|
||||
# Command
|
||||
cmd, file = generate_ddp_command(world_size, self)
|
||||
try:
|
||||
LOGGER.info(f'DDP command: {cmd}')
|
||||
LOGGER.info(f'{colorstr("DDP:")} debug command {" ".join(cmd)}')
|
||||
subprocess.run(cmd, check=True)
|
||||
except Exception as e:
|
||||
raise e
|
||||
@ -197,7 +197,7 @@ class BaseTrainer:
|
||||
"""Initializes and sets the DistributedDataParallel parameters for training."""
|
||||
torch.cuda.set_device(RANK)
|
||||
self.device = torch.device('cuda', RANK)
|
||||
LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
|
||||
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
|
||||
os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout
|
||||
dist.init_process_group(
|
||||
'nccl' if dist.is_nccl_available() else 'gloo',
|
||||
@ -299,8 +299,7 @@ class BaseTrainer:
|
||||
self.epoch_time_start = time.time()
|
||||
self.train_time_start = time.time()
|
||||
nb = len(self.train_loader) # number of batches
|
||||
nw = max(round(self.args.warmup_epochs *
|
||||
nb), 100) if self.args.warmup_epochs > 0 else -1 # number of warmup iterations
|
||||
nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1 # warmup iterations
|
||||
last_opt_step = -1
|
||||
self.run_callbacks('on_train_start')
|
||||
LOGGER.info(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
|
||||
@ -557,7 +556,7 @@ class BaseTrainer:
|
||||
n = len(metrics) + 1 # number of cols
|
||||
s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n') # header
|
||||
with open(self.csv, 'a') as f:
|
||||
f.write(s + ('%23.5g,' * n % tuple([self.epoch] + vals)).rstrip(',') + '\n')
|
||||
f.write(s + ('%23.5g,' * n % tuple([self.epoch + 1] + vals)).rstrip(',') + '\n')
|
||||
|
||||
def plot_metrics(self):
|
||||
"""Plot and display metrics visually."""
|
||||
|
@ -222,7 +222,7 @@ class BaseValidator:
|
||||
Args:
|
||||
pred_classes (torch.Tensor): Predicted class indices of shape(N,).
|
||||
true_classes (torch.Tensor): Target class indices of shape(M,).
|
||||
iou (torch.Tensor): IoU thresholds from 0.50 to 0.95 in space of 0.05.
|
||||
iou (torch.Tensor): An NxM tensor containing the pairwise IoU values for predictions and ground truth.
|
||||
|
||||
Returns:
|
||||
(torch.Tensor): Correct tensor of shape(N,10) for 10 IoU thresholds.
|
||||
|
@ -23,7 +23,7 @@ def _log_scalars(scalars, step=0):
|
||||
|
||||
|
||||
def _log_tensorboard_graph(trainer):
|
||||
# Log model graph to TensorBoard
|
||||
"""Log model graph to TensorBoard."""
|
||||
try:
|
||||
import warnings
|
||||
|
||||
@ -48,11 +48,16 @@ def on_pretrain_routine_start(trainer):
|
||||
WRITER = SummaryWriter(str(trainer.save_dir))
|
||||
prefix = colorstr('TensorBoard: ')
|
||||
LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/")
|
||||
_log_tensorboard_graph(trainer)
|
||||
except Exception as e:
|
||||
LOGGER.warning(f'WARNING ⚠️ TensorBoard not initialized correctly, not logging this run. {e}')
|
||||
|
||||
|
||||
def on_train_start(trainer):
|
||||
"""Log TensorBoard graph."""
|
||||
if WRITER:
|
||||
_log_tensorboard_graph(trainer)
|
||||
|
||||
|
||||
def on_batch_end(trainer):
|
||||
"""Logs scalar statistics at the end of a training batch."""
|
||||
_log_scalars(trainer.label_loss_items(trainer.tloss, prefix='train'), trainer.epoch + 1)
|
||||
@ -65,5 +70,6 @@ def on_fit_epoch_end(trainer):
|
||||
|
||||
callbacks = {
|
||||
'on_pretrain_routine_start': on_pretrain_routine_start,
|
||||
'on_train_start': on_train_start,
|
||||
'on_fit_epoch_end': on_fit_epoch_end,
|
||||
'on_batch_end': on_batch_end}
|
||||
|
Loading…
x
Reference in New Issue
Block a user