ultralytics 8.0.162 Multi-GPU DDP fix (#4544)

Co-authored-by: Yonghye Kwon <developer.0hye@gmail.com>
Co-authored-by: andresinsitu <andres.rodriguez@ingenieriainsitu.com>

parent 1db9afc2e5
commit 2bcee56e70
@@ -21,6 +21,10 @@ keywords: Ultralytics, YOLO, documentation, callback utilities, log_scalars, on_
 ## ::: ultralytics.utils.callbacks.tensorboard.on_pretrain_routine_start
 <br><br>
 
+---
+## ::: ultralytics.utils.callbacks.tensorboard.on_train_start
+<br><br>
+
 ---
 ## ::: ultralytics.utils.callbacks.tensorboard.on_batch_end
 <br><br>
@@ -63,7 +63,7 @@ def test_export(model, format):
 
 def test_rtdetr(task='detect', model='yolov8n-rtdetr.yaml', data='coco8.yaml'):
     # Warning: MUST use imgsz=640
-    run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1 cache=disk')
+    run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1, cache = disk')  # add coma, space to args
     run(f"yolo predict {task} model={model} source={ASSETS / 'bus.jpg'} imgsz=640 save save_crop save_txt")
 
 
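The edited test now feeds the CLI deliberately messy arguments ('epochs=1,' with a trailing comma, 'cache = disk' with spaces around '='), checking that argument parsing tolerates them. A toy sketch of that kind of normalization, purely illustrative and not the ultralytics parser:

```python
def normalize_args(tokens):
    """Toy cleanup: drop stray commas and re-join 'key = value' into 'key=value'."""
    text = ' '.join(tokens).replace(' = ', '=')  # 'cache = disk' -> 'cache=disk'
    return [t.rstrip(',') for t in text.split() if t.rstrip(',')]  # 'epochs=1,' -> 'epochs=1'

print(normalize_args(['epochs=1,', 'cache', '=', 'disk']))  # ['epochs=1', 'cache=disk']
```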
@@ -145,13 +145,13 @@ def test_val():
 
 def test_train_scratch():
     model = YOLO(CFG)
-    model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1)
+    model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1, name='model')
     model(SOURCE)
 
 
 def test_train_pretrained():
     model = YOLO(WEIGHTS_DIR / 'yolov8n-seg.pt')
-    model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5)
+    model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5, name=0)
     model(SOURCE)
 
 
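These tests now pass explicit run names, including the integer 0, so name handling has to coerce non-string values before building the save directory. A minimal sketch of that coercion; the 'train' default and 'runs/detect' project are assumptions for illustration:

```python
from pathlib import Path

def build_save_dir(project='runs/detect', name=None):
    """Coerce the run name (possibly an int such as 0) to str before joining paths."""
    name = 'train' if name is None or name == '' else str(name)
    return Path(project) / name

print(build_save_dir(name='model'))  # runs/detect/model
print(build_save_dir(name=0))        # runs/detect/0
```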
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = '8.0.161'
+__version__ = '8.0.162'
 
 from ultralytics.models import RTDETR, SAM, YOLO
 from ultralytics.models.fastsam import FastSAM
@@ -110,6 +110,7 @@ def get_cfg(cfg: Union[str, Path, Dict, SimpleNamespace] = DEFAULT_CFG_DICT, ove
     # Merge overrides
     if overrides:
         overrides = cfg2dict(overrides)
-        overrides.pop('save_dir', None)  # special override keys to ignore
+        if 'save_dir' not in cfg:
+            overrides.pop('save_dir', None)  # special override keys to ignore
         check_dict_alignment(cfg, overrides)
         cfg = {**cfg, **overrides}  # merge cfg and overrides dicts (prefer overrides)
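The new guard means 'save_dir' is only stripped from overrides when the base cfg does not already define it, so a configuration that carries its own save_dir (as when a run is re-created for DDP) keeps it. A simplified sketch of the resulting merge behavior, not the full get_cfg logic:

```python
def merge_cfg(cfg: dict, overrides: dict) -> dict:
    """Simplified merge: drop 'save_dir' from overrides only if cfg lacks the key."""
    overrides = dict(overrides)
    if 'save_dir' not in cfg:
        overrides.pop('save_dir', None)  # special override key to ignore
    return {**cfg, **overrides}          # prefer overrides

print(merge_cfg({'epochs': 100}, {'save_dir': 'runs/x'}))                        # save_dir dropped
print(merge_cfg({'epochs': 100, 'save_dir': 'runs/x'}, {'save_dir': 'runs/y'}))  # kept -> runs/y
```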
@@ -343,7 +343,7 @@ def check_cls_dataset(dataset: str, split=''):
 
     # Print to console
     for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items():
-        prefix = f'{colorstr(k)} {v}...'
+        prefix = f'{colorstr(f"{k}:")} {v}...'
         if v is None:
             LOGGER.info(prefix)
         else:
@@ -184,7 +184,7 @@ class BaseTrainer:
             # Command
             cmd, file = generate_ddp_command(world_size, self)
             try:
-                LOGGER.info(f'DDP command: {cmd}')
+                LOGGER.info(f'{colorstr("DDP:")} debug command {" ".join(cmd)}')
                 subprocess.run(cmd, check=True)
             except Exception as e:
                 raise e
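generate_ddp_command returns the launch command as a list of argv tokens; the new log line joins them with spaces only to make the message readable, while subprocess.run still receives the list. A small illustration with a placeholder command shape:

```python
import subprocess
import sys

# Placeholder tokens for illustration; the real list comes from generate_ddp_command.
cmd = [sys.executable, '-m', 'torch.distributed.run', '--nproc_per_node', '2', 'train_script.py']
print(f'DDP: debug command {" ".join(cmd)}')  # joined only for logging
# subprocess.run(cmd, check=True)             # the list form is what actually gets executed
```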
@@ -197,7 +197,7 @@ class BaseTrainer:
         """Initializes and sets the DistributedDataParallel parameters for training."""
         torch.cuda.set_device(RANK)
         self.device = torch.device('cuda', RANK)
-        LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
+        # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
         os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
         dist.init_process_group(
             'nccl' if dist.is_nccl_available() else 'gloo',
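The DDP setup pins each process to its RANK GPU, sets NCCL_BLOCKING_WAIT so collectives time out instead of hanging, and initializes the process group with NCCL when available, falling back to Gloo. A hedged, standalone sketch of that pattern using standard torch.distributed calls; the timeout value and the env-var rendezvous are assumptions:

```python
import os
from datetime import timedelta

import torch
import torch.distributed as dist

def setup_ddp(rank: int, world_size: int) -> torch.device:
    """Pin this process to its GPU and join the process group (launcher must set MASTER_ADDR/PORT)."""
    torch.cuda.set_device(rank)
    device = torch.device('cuda', rank)
    os.environ['NCCL_BLOCKING_WAIT'] = '1'  # enforce timeouts on collectives
    dist.init_process_group(
        'nccl' if dist.is_nccl_available() else 'gloo',
        timeout=timedelta(hours=3),  # assumed timeout, for illustration
        rank=rank,
        world_size=world_size)
    return device
```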
@@ -299,8 +299,7 @@ class BaseTrainer:
         self.epoch_time_start = time.time()
         self.train_time_start = time.time()
         nb = len(self.train_loader)  # number of batches
-        nw = max(round(self.args.warmup_epochs *
-                       nb), 100) if self.args.warmup_epochs > 0 else -1  # number of warmup iterations
+        nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1  # warmup iterations
         last_opt_step = -1
         self.run_callbacks('on_train_start')
         LOGGER.info(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
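The collapsed expression computes the same warmup length as before: warmup_epochs times the number of batches, floored at 100 iterations, with a non-positive warmup_epochs disabling warmup. Shown standalone:

```python
def warmup_iterations(warmup_epochs: float, nb: int) -> int:
    """Warmup lasts warmup_epochs * nb iterations, never fewer than 100; -1 disables it."""
    return max(round(warmup_epochs * nb), 100) if warmup_epochs > 0 else -1

print(warmup_iterations(3.0, 20))   # 100  (60 < 100 floor)
print(warmup_iterations(3.0, 500))  # 1500
print(warmup_iterations(0, 500))    # -1   (warmup disabled)
```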
@@ -557,7 +556,7 @@ class BaseTrainer:
         n = len(metrics) + 1  # number of cols
         s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n')  # header
         with open(self.csv, 'a') as f:
-            f.write(s + ('%23.5g,' * n % tuple([self.epoch] + vals)).rstrip(',') + '\n')
+            f.write(s + ('%23.5g,' * n % tuple([self.epoch + 1] + vals)).rstrip(',') + '\n')
 
     def plot_metrics(self):
         """Plot and display metrics visually."""
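With self.epoch + 1 in the row, results.csv now reports 1-based epoch numbers while the header and format strings stay the same. A standalone illustration with made-up metric values:

```python
metrics = {'train/box_loss': 1.234, 'metrics/mAP50(B)': 0.567}  # example values only
keys, vals = list(metrics.keys()), list(metrics.values())
n = len(metrics) + 1  # number of columns, including 'epoch'
epoch = 0             # first epoch, zero-based internally

header = ('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n'
row = ('%23.5g,' * n % tuple([epoch + 1] + vals)).rstrip(',') + '\n'
print(header + row)   # the epoch column now shows 1, not 0
```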
@@ -222,7 +222,7 @@ class BaseValidator:
         Args:
             pred_classes (torch.Tensor): Predicted class indices of shape(N,).
             true_classes (torch.Tensor): Target class indices of shape(M,).
-            iou (torch.Tensor): IoU thresholds from 0.50 to 0.95 in space of 0.05.
+            iou (torch.Tensor): An NxM tensor containing the pairwise IoU values for predictions and ground of truth
 
         Returns:
             (torch.Tensor): Correct tensor of shape(N,10) for 10 IoU thresholds.
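The corrected docstring reflects that iou here is the N×M pairwise IoU matrix between predictions and ground truth, not the vector of thresholds. A rough sketch (not the ultralytics matching code) of how such a matrix yields the (N, 10) correctness tensor:

```python
import torch

N, M = 5, 3
iou = torch.rand(N, M)                               # pairwise IoU: predictions x ground truth
thresholds = torch.linspace(0.5, 0.95, 10)           # 0.50:0.95 in steps of 0.05
pred_cls = torch.zeros(N, dtype=torch.long)          # toy class labels
true_cls = torch.zeros(M, dtype=torch.long)

same_class = pred_cls[:, None] == true_cls[None, :]  # (N, M) class-match mask
best_iou = (iou * same_class).amax(dim=1)            # best same-class match per prediction
correct = best_iou[:, None] >= thresholds[None, :]   # (N, 10) boolean "correct" tensor
print(correct.shape)                                 # torch.Size([5, 10])
```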
@@ -23,7 +23,7 @@ def _log_scalars(scalars, step=0):
 
 
 def _log_tensorboard_graph(trainer):
-    # Log model graph to TensorBoard
+    """Log model graph to TensorBoard."""
     try:
         import warnings
 
@@ -48,11 +48,16 @@ def on_pretrain_routine_start(trainer):
             WRITER = SummaryWriter(str(trainer.save_dir))
             prefix = colorstr('TensorBoard: ')
             LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/")
-            _log_tensorboard_graph(trainer)
         except Exception as e:
             LOGGER.warning(f'WARNING ⚠️ TensorBoard not initialized correctly, not logging this run. {e}')
 
 
+def on_train_start(trainer):
+    """Log TensorBoard graph."""
+    if WRITER:
+        _log_tensorboard_graph(trainer)
+
+
 def on_batch_end(trainer):
     """Logs scalar statistics at the end of a training batch."""
     _log_scalars(trainer.label_loss_items(trainer.tloss, prefix='train'), trainer.epoch + 1)
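Graph logging moves out of on_pretrain_routine_start into the new on_train_start hook, so the model is only traced once training begins and the model is fully built; WRITER is the module-level SummaryWriter created earlier. A hedged sketch of that kind of graph logging, where the image size, device handling and writer path are assumptions:

```python
import torch
from torch.utils.tensorboard import SummaryWriter

def log_graph(writer: SummaryWriter, model: torch.nn.Module, imgsz: int = 640):
    """Trace the built model with a zero image and add the graph to TensorBoard."""
    device = next(model.parameters()).device
    dummy = torch.zeros(1, 3, imgsz, imgsz, device=device)
    with torch.no_grad():
        writer.add_graph(model, dummy)

# writer = SummaryWriter('runs/example')  # created in on_pretrain_routine_start
# log_graph(writer, model)                # called from the new on_train_start hook
```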
@@ -65,5 +70,6 @@ def on_fit_epoch_end(trainer):
 
 callbacks = {
     'on_pretrain_routine_start': on_pretrain_routine_start,
+    'on_train_start': on_train_start,
     'on_fit_epoch_end': on_fit_epoch_end,
     'on_batch_end': on_batch_end}
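The callbacks dict maps event names to handlers; registering 'on_train_start' here is what lets trainer.run_callbacks('on_train_start') (seen in the trainer hunk above) reach the new TensorBoard hook. A minimal sketch of that dispatch pattern, simplified from ultralytics' callbacks base module:

```python
from collections import defaultdict

registry = defaultdict(list)

def register_callbacks(callback_dict):
    """Merge an integration's {event: handler} dict into the shared registry."""
    for event, func in callback_dict.items():
        registry[event].append(func)

def run_callbacks(event, trainer=None):
    """Fire every handler registered for an event."""
    for func in registry[event]:
        func(trainer)

register_callbacks({'on_train_start': lambda trainer: print('TensorBoard graph logged')})
run_callbacks('on_train_start')  # -> TensorBoard graph logged
```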