mirror of
https://github.com/THU-MIG/yolov10.git
synced 2025-05-23 21:44:22 +08:00
Add TensorBoard support (#87)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
248d54ca03
commit
cb4f20f3cf
6
.github/workflows/ci.yaml
vendored
6
.github/workflows/ci.yaml
vendored
@ -91,15 +91,15 @@ jobs:
|
|||||||
shell: bash # for Windows compatibility
|
shell: bash # for Windows compatibility
|
||||||
run: |
|
run: |
|
||||||
yolo task=detect mode=train model=yolov5n.yaml data=coco128.yaml epochs=1 imgsz=64
|
yolo task=detect mode=train model=yolov5n.yaml data=coco128.yaml epochs=1 imgsz=64
|
||||||
yolo task=detect mode=val model=runs/exp/weights/last.pt imgsz=64
|
yolo task=detect mode=val model=runs/train/exp/weights/last.pt imgsz=64
|
||||||
- name: Test segmentation
|
- name: Test segmentation
|
||||||
shell: bash # for Windows compatibility
|
shell: bash # for Windows compatibility
|
||||||
# TODO: redo val test without hardcoded weights
|
# TODO: redo val test without hardcoded weights
|
||||||
run: |
|
run: |
|
||||||
yolo task=segment mode=train model=yolov5n-seg.yaml data=coco128-seg.yaml epochs=1 imgsz=64
|
yolo task=segment mode=train model=yolov5n-seg.yaml data=coco128-seg.yaml epochs=1 imgsz=64
|
||||||
yolo task=segment mode=val model=runs/exp2/weights/last.pt data=coco128-seg.yaml imgsz=64
|
yolo task=segment mode=val model=runs/train/exp2/weights/last.pt data=coco128-seg.yaml imgsz=64
|
||||||
- name: Test classification
|
- name: Test classification
|
||||||
shell: bash # for Windows compatibility
|
shell: bash # for Windows compatibility
|
||||||
run: |
|
run: |
|
||||||
yolo task=classify mode=train model=resnet18 data=mnist160 epochs=1 imgsz=32
|
yolo task=classify mode=train model=resnet18 data=mnist160 epochs=1 imgsz=32
|
||||||
yolo task=classify mode=val model=runs/exp3/weights/last.pt data=mnist160
|
yolo task=classify mode=val model=runs/train/exp3/weights/last.pt data=mnist160
|
||||||
|
@ -4,7 +4,6 @@ Simple training loop; Boilerplate that could apply to any arbitrary neural netwo
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
@ -128,6 +127,7 @@ class BaseTrainer:
|
|||||||
Builds dataloaders and optimizer on correct rank process
|
Builds dataloaders and optimizer on correct rank process
|
||||||
"""
|
"""
|
||||||
# model
|
# model
|
||||||
|
self.trigger_callbacks("on_pretrain_routine_start")
|
||||||
ckpt = self.setup_model()
|
ckpt = self.setup_model()
|
||||||
self.model = self.model.to(self.device)
|
self.model = self.model.to(self.device)
|
||||||
self.set_model_attributes()
|
self.set_model_attributes()
|
||||||
@ -159,13 +159,13 @@ class BaseTrainer:
|
|||||||
# metric_keys = self.validator.metric_keys + self.label_loss_items(prefix="val")
|
# metric_keys = self.validator.metric_keys + self.label_loss_items(prefix="val")
|
||||||
# self.metrics = dict(zip(metric_keys, [0] * len(metric_keys))) # TODO: init metrics for plot_results()?
|
# self.metrics = dict(zip(metric_keys, [0] * len(metric_keys))) # TODO: init metrics for plot_results()?
|
||||||
self.ema = ModelEMA(self.model)
|
self.ema = ModelEMA(self.model)
|
||||||
|
self.trigger_callbacks("on_pretrain_routine_end")
|
||||||
|
|
||||||
def _do_train(self, rank=-1, world_size=1):
|
def _do_train(self, rank=-1, world_size=1):
|
||||||
if world_size > 1:
|
if world_size > 1:
|
||||||
self._setup_ddp(rank, world_size)
|
self._setup_ddp(rank, world_size)
|
||||||
|
|
||||||
self._setup_train(rank, world_size)
|
self._setup_train(rank, world_size)
|
||||||
self.trigger_callbacks("before_train")
|
|
||||||
|
|
||||||
self.epoch_time = None
|
self.epoch_time = None
|
||||||
self.epoch_time_start = time.time()
|
self.epoch_time_start = time.time()
|
||||||
@ -173,9 +173,10 @@ class BaseTrainer:
|
|||||||
nb = len(self.train_loader) # number of batches
|
nb = len(self.train_loader) # number of batches
|
||||||
nw = max(round(self.args.warmup_epochs * nb), 100) # number of warmup iterations
|
nw = max(round(self.args.warmup_epochs * nb), 100) # number of warmup iterations
|
||||||
last_opt_step = -1
|
last_opt_step = -1
|
||||||
|
self.trigger_callbacks("on_train_start")
|
||||||
for epoch in range(self.start_epoch, self.epochs):
|
for epoch in range(self.start_epoch, self.epochs):
|
||||||
self.epoch = epoch
|
self.epoch = epoch
|
||||||
self.trigger_callbacks("on_epoch_start")
|
self.trigger_callbacks("on_train_epoch_start")
|
||||||
self.model.train()
|
self.model.train()
|
||||||
if rank != -1:
|
if rank != -1:
|
||||||
self.train_loader.sampler.set_epoch(epoch)
|
self.train_loader.sampler.set_epoch(epoch)
|
||||||
@ -186,7 +187,7 @@ class BaseTrainer:
|
|||||||
self.tloss = None
|
self.tloss = None
|
||||||
self.optimizer.zero_grad()
|
self.optimizer.zero_grad()
|
||||||
for i, batch in pbar:
|
for i, batch in pbar:
|
||||||
self.trigger_callbacks("on_batch_start")
|
self.trigger_callbacks("on_train_batch_start")
|
||||||
# forward
|
# forward
|
||||||
batch = self.preprocess_batch(batch)
|
batch = self.preprocess_batch(batch)
|
||||||
|
|
||||||
@ -207,7 +208,7 @@ class BaseTrainer:
|
|||||||
if rank != -1:
|
if rank != -1:
|
||||||
self.loss *= world_size
|
self.loss *= world_size
|
||||||
self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
|
self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
|
||||||
else self.loss_items
|
else self.loss_items
|
||||||
|
|
||||||
# backward
|
# backward
|
||||||
self.scaler.scale(self.loss).backward()
|
self.scaler.scale(self.loss).backward()
|
||||||
@ -229,8 +230,11 @@ class BaseTrainer:
|
|||||||
if self.args.plots and ni < 3:
|
if self.args.plots and ni < 3:
|
||||||
self.plot_training_samples(batch, ni)
|
self.plot_training_samples(batch, ni)
|
||||||
|
|
||||||
|
self.trigger_callbacks("on_train_batch_end")
|
||||||
|
|
||||||
lr = {f"lr{ir}": x['lr'] for ir, x in enumerate(self.optimizer.param_groups)} # for loggers
|
lr = {f"lr{ir}": x['lr'] for ir, x in enumerate(self.optimizer.param_groups)} # for loggers
|
||||||
self.scheduler.step()
|
self.scheduler.step()
|
||||||
|
self.trigger_callbacks("on_train_epoch_end")
|
||||||
|
|
||||||
if rank in [-1, 0]:
|
if rank in [-1, 0]:
|
||||||
# validation
|
# validation
|
||||||
@ -260,9 +264,11 @@ class BaseTrainer:
|
|||||||
if self.args.plots:
|
if self.args.plots:
|
||||||
self.plot_metrics()
|
self.plot_metrics()
|
||||||
self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours)")
|
self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours)")
|
||||||
|
self.log(f"Results saved to {colorstr('bold', self.save_dir)}")
|
||||||
self.trigger_callbacks('on_train_end')
|
self.trigger_callbacks('on_train_end')
|
||||||
dist.destroy_process_group() if world_size > 1 else None
|
dist.destroy_process_group() if world_size > 1 else None
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
self.trigger_callbacks('teardown')
|
||||||
|
|
||||||
def save_model(self):
|
def save_model(self):
|
||||||
ckpt = {
|
ckpt = {
|
||||||
|
@ -1,13 +1,36 @@
|
|||||||
def before_train(trainer):
|
def on_pretrain_routine_start(trainer):
|
||||||
# Initialize tensorboard logger
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def on_epoch_start(trainer):
|
def on_pretrain_routine_end(trainer):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def on_batch_start(trainer):
|
def on_train_start(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_train_epoch_start(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_train_batch_start(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def optimizer_step(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_before_zero_grad(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_train_batch_end(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_train_epoch_end(trainer):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@ -15,27 +38,68 @@ def on_val_start(trainer):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_val_batch_start(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_val_image_end(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_val_batch_end(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def on_val_end(trainer):
|
def on_val_end(trainer):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_fit_epoch_end(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def on_model_save(trainer):
|
def on_model_save(trainer):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_train_end(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def on_params_update(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def teardown(trainer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
default_callbacks = {
|
default_callbacks = {
|
||||||
"before_train": before_train,
|
'on_pretrain_routine_start': on_pretrain_routine_start,
|
||||||
"on_epoch_start": on_epoch_start,
|
'on_pretrain_routine_end': on_pretrain_routine_end,
|
||||||
"on_batch_start": on_batch_start,
|
'on_train_start': on_train_start,
|
||||||
"on_val_start": on_val_start,
|
'on_train_epoch_start': on_train_epoch_start,
|
||||||
"on_val_end": on_val_end,
|
'on_train_batch_start': on_train_batch_start,
|
||||||
"on_model_save": on_model_save}
|
'optimizer_step': optimizer_step,
|
||||||
|
'on_before_zero_grad': on_before_zero_grad,
|
||||||
|
'on_train_batch_end': on_train_batch_end,
|
||||||
|
'on_train_epoch_end': on_train_epoch_end,
|
||||||
|
'on_val_start': on_val_start,
|
||||||
|
'on_val_batch_start': on_val_batch_start,
|
||||||
|
'on_val_image_end': on_val_image_end,
|
||||||
|
'on_val_batch_end': on_val_batch_end,
|
||||||
|
'on_val_end': on_val_end,
|
||||||
|
'on_fit_epoch_end': on_fit_epoch_end, # fit = train + val
|
||||||
|
'on_model_save': on_model_save,
|
||||||
|
'on_train_end': on_train_end,
|
||||||
|
'on_params_update': on_params_update,
|
||||||
|
'teardown': teardown}
|
||||||
|
|
||||||
|
|
||||||
def add_integration_callbacks(trainer):
|
def add_integration_callbacks(trainer):
|
||||||
callbacks = {}
|
from .clearml import callbacks as clearml_callbacks
|
||||||
|
from .tb import callbacks as tb_callbacks
|
||||||
|
|
||||||
from .clearml import callbacks, clearml
|
for x in tb_callbacks, clearml_callbacks:
|
||||||
if clearml:
|
for k, v in x.items():
|
||||||
for callback, func in callbacks.items():
|
trainer.add_callback(k, v) # add_callback(name, func)
|
||||||
trainer.add_callback(callback, func)
|
|
||||||
|
@ -9,47 +9,33 @@ except (ImportError, AssertionError):
|
|||||||
clearml = None
|
clearml = None
|
||||||
|
|
||||||
|
|
||||||
def _log_scalers(metric_dict, group="", step=0):
|
def on_train_start(trainer):
|
||||||
task = Task.current_task()
|
|
||||||
if task:
|
|
||||||
for k, v in metric_dict.items():
|
|
||||||
task.get_logger().report_scalar(group, k, v, step)
|
|
||||||
|
|
||||||
|
|
||||||
def before_train(trainer):
|
|
||||||
# TODO: reuse existing task
|
# TODO: reuse existing task
|
||||||
task = Task.init(project_name=trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv5',
|
task = Task.init(project_name=trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv8',
|
||||||
task_name=trainer.args.name if trainer.args.name != 'exp' else 'Training',
|
task_name=trainer.args.name,
|
||||||
tags=['YOLOv5'],
|
tags=['YOLOv8'],
|
||||||
output_uri=True,
|
output_uri=True,
|
||||||
reuse_last_task_id=False,
|
reuse_last_task_id=False,
|
||||||
auto_connect_frameworks={'pytorch': False})
|
auto_connect_frameworks={'pytorch': False})
|
||||||
task.connect(dict(trainer.args), name='General')
|
task.connect(dict(trainer.args), name='General')
|
||||||
|
|
||||||
|
|
||||||
def on_batch_end(trainer):
|
|
||||||
_log_scalers(trainer.label_loss_items(trainer.tloss, prefix="train"), "train", trainer.epoch)
|
|
||||||
|
|
||||||
|
|
||||||
def on_val_end(trainer):
|
def on_val_end(trainer):
|
||||||
_log_scalers(trainer.label_loss_items(trainer.validator.loss, prefix="val"), "val", trainer.epoch)
|
|
||||||
_log_scalers({k: v for k, v in trainer.metrics.items() if k.startswith("metrics")}, "metrics", trainer.epoch)
|
|
||||||
if trainer.epoch == 0:
|
if trainer.epoch == 0:
|
||||||
model_info = {
|
model_info = {
|
||||||
"inference_speed": trainer.validator.speed[1],
|
"Inference speed (ms/img)": round(trainer.validator.speed[1], 1),
|
||||||
"flops@640": get_flops(trainer.model),
|
"GFLOPs": round(get_flops(trainer.model), 1),
|
||||||
"params": get_num_params(trainer.model)}
|
"Parameters": get_num_params(trainer.model)}
|
||||||
Task.current_task().connect(model_info, 'Model')
|
Task.current_task().connect(model_info, name='Model')
|
||||||
|
|
||||||
|
|
||||||
def on_train_end(trainer):
|
def on_train_end(trainer):
|
||||||
task = Task.current_task()
|
Task.current_task().update_output_model(model_path=str(trainer.best),
|
||||||
if task:
|
model_name=trainer.args.name,
|
||||||
task.update_output_model(model_path=str(trainer.best), model_name='Best Model', auto_delete_file=False)
|
auto_delete_file=False)
|
||||||
|
|
||||||
|
|
||||||
callbacks = {
|
callbacks = {
|
||||||
"before_train": before_train,
|
"on_train_start": on_train_start,
|
||||||
"on_val_end": on_val_end,
|
"on_val_end": on_val_end,
|
||||||
"on_batch_end": on_batch_end,
|
"on_train_end": on_train_end} if clearml else {}
|
||||||
"on_train_end": on_train_end}
|
|
||||||
|
26
ultralytics/yolo/utils/callbacks/tb.py
Normal file
26
ultralytics/yolo/utils/callbacks/tb.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
|
||||||
|
writer = None # TensorBoard SummaryWriter instance
|
||||||
|
|
||||||
|
|
||||||
|
def _log_scalars(scalars, step=0):
|
||||||
|
for k, v in scalars.items():
|
||||||
|
writer.add_scalar(k, v, step)
|
||||||
|
|
||||||
|
|
||||||
|
def on_train_start(trainer):
|
||||||
|
global writer
|
||||||
|
writer = SummaryWriter(str(trainer.save_dir))
|
||||||
|
trainer.console.info(f"Logging results to {trainer.save_dir}\n"
|
||||||
|
f"Starting training for {trainer.args.epochs} epochs...")
|
||||||
|
|
||||||
|
|
||||||
|
def on_batch_end(trainer):
|
||||||
|
_log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch)
|
||||||
|
|
||||||
|
|
||||||
|
def on_val_end(trainer):
|
||||||
|
_log_scalars(trainer.metrics, trainer.epoch)
|
||||||
|
|
||||||
|
|
||||||
|
callbacks = {"on_train_start": on_train_start, "on_val_end": on_val_end, "on_batch_end": on_batch_end}
|
@ -15,7 +15,7 @@ nosave: False
|
|||||||
cache: False # True/ram, disk or False
|
cache: False # True/ram, disk or False
|
||||||
device: '' # cuda device, i.e. 0 or 0,1,2,3 or cpu
|
device: '' # cuda device, i.e. 0 or 0,1,2,3 or cpu
|
||||||
workers: 8
|
workers: 8
|
||||||
project: 'runs'
|
project: 'runs/train'
|
||||||
name: 'exp'
|
name: 'exp'
|
||||||
exist_ok: False
|
exist_ok: False
|
||||||
pretrained: False
|
pretrained: False
|
||||||
|
Loading…
x
Reference in New Issue
Block a user