From a4fabfdacff289009c758de6f0933fa57c59c64b Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 2 Sep 2023 04:25:03 +0200 Subject: [PATCH] Set `workers=0` for MPS Train and Val modes (#4697) --- .github/workflows/codeql.yaml | 1 + ultralytics/engine/trainer.py | 2 +- ultralytics/engine/validator.py | 2 +- ultralytics/utils/callbacks/base.py | 36 +++++++++++++-------- ultralytics/utils/callbacks/clearml.py | 14 ++++---- ultralytics/utils/callbacks/comet.py | 27 ++++++++-------- ultralytics/utils/callbacks/dvc.py | 37 +++++++++++----------- ultralytics/utils/callbacks/hub.py | 2 +- ultralytics/utils/callbacks/mlflow.py | 14 ++++---- ultralytics/utils/callbacks/neptune.py | 13 ++++---- ultralytics/utils/callbacks/raytune.py | 1 + ultralytics/utils/callbacks/tensorboard.py | 6 ++-- ultralytics/utils/callbacks/wb.py | 5 +-- ultralytics/utils/downloads.py | 2 +- 14 files changed, 88 insertions(+), 74 deletions(-) diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml index 526f5e48..17dac1db 100644 --- a/.github/workflows/codeql.yaml +++ b/.github/workflows/codeql.yaml @@ -5,6 +5,7 @@ name: "CodeQL" on: schedule: - cron: '0 0 1 * *' + workflow_dispatch: jobs: analyze: diff --git a/ultralytics/engine/trainer.py b/ultralytics/engine/trainer.py index 0f300ed2..c95cde23 100644 --- a/ultralytics/engine/trainer.py +++ b/ultralytics/engine/trainer.py @@ -107,7 +107,7 @@ class BaseTrainer: print_args(vars(self.args)) # Device - if self.device.type == 'cpu': + if self.device.type in ('cpu', 'mps'): self.args.workers = 0 # faster CPU training as time dominated by inference, not dataloading # Model and Dataset diff --git a/ultralytics/engine/validator.py b/ultralytics/engine/validator.py index 97a3a9ac..811520b9 100644 --- a/ultralytics/engine/validator.py +++ b/ultralytics/engine/validator.py @@ -144,7 +144,7 @@ class BaseValidator: else: raise FileNotFoundError(emojis(f"Dataset '{self.args.data}' for task={self.args.task} not found ❌")) - if self.device.type == 'cpu': + if self.device.type in ('cpu', 'mps'): self.args.workers = 0 # faster CPU val as time dominated by inference, not dataloading if not pt: self.args.rect = False diff --git a/ultralytics/utils/callbacks/base.py b/ultralytics/utils/callbacks/base.py index 0b173479..c45d3121 100644 --- a/ultralytics/utils/callbacks/base.py +++ b/ultralytics/utils/callbacks/base.py @@ -196,17 +196,27 @@ def add_integration_callbacks(instance): instance (Trainer, Predictor, Validator, Exporter): An object with a 'callbacks' attribute that is a dictionary of callback lists. 
""" - from .clearml import callbacks as clearml_cb - from .comet import callbacks as comet_cb - from .dvc import callbacks as dvc_cb - from .hub import callbacks as hub_cb - from .mlflow import callbacks as mlflow_cb - from .neptune import callbacks as neptune_cb - from .raytune import callbacks as tune_cb - from .tensorboard import callbacks as tensorboard_cb - from .wb import callbacks as wb_cb - for x in clearml_cb, comet_cb, hub_cb, mlflow_cb, neptune_cb, tune_cb, tensorboard_cb, wb_cb, dvc_cb: - for k, v in x.items(): - if v not in instance.callbacks[k]: # prevent duplicate callbacks addition - instance.callbacks[k].append(v) # callback[name].append(func) + # Load HUB callbacks + from .hub import callbacks + + # Load training callbacks + if 'Trainer' in instance.__class__.__name__: + from .clearml import callbacks as clear_cb + from .comet import callbacks as comet_cb + from .dvc import callbacks as dvc_cb + from .mlflow import callbacks as mlflow_cb + from .neptune import callbacks as neptune_cb + from .raytune import callbacks as tune_cb + from .tensorboard import callbacks as tb_cb + from .wb import callbacks as wb_cb + callbacks.update({**clear_cb, **comet_cb, **dvc_cb, **mlflow_cb, **neptune_cb, **tune_cb, **tb_cb, **wb_cb}) + + # Load export callbacks (patch to avoid CoreML protobuf error) + if 'Exporter' in instance.__class__.__name__: + from .tensorboard import callbacks as tb_cb + callbacks.update(tb_cb) + + for k, v in callbacks.items(): + if v not in instance.callbacks[k]: # prevent duplicate callbacks addition + instance.callbacks[k].append(v) # callback[name].append(func) diff --git a/ultralytics/utils/callbacks/clearml.py b/ultralytics/utils/callbacks/clearml.py index ee251dd4..48e91ece 100644 --- a/ultralytics/utils/callbacks/clearml.py +++ b/ultralytics/utils/callbacks/clearml.py @@ -1,12 +1,6 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license -import re - -import matplotlib.image as mpimg -import matplotlib.pyplot as plt - from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING -from ultralytics.utils.torch_utils import model_info_for_loggers try: assert not TESTS_RUNNING # do not log pytest @@ -15,8 +9,8 @@ try: from clearml import Task from clearml.binding.frameworks.pytorch_bind import PatchPyTorchModelIO from clearml.binding.matplotlib_bind import PatchedMatplotlib - assert hasattr(clearml, '__version__') # verify package is not directory + except (ImportError, AssertionError): clearml = None @@ -29,6 +23,8 @@ def _log_debug_samples(files, title='Debug Samples') -> None: files (list): A list of file paths in PosixPath format. title (str): A title that groups together images with the same values. """ + import re + if task := Task.current_task(): for f in files: if f.exists(): @@ -48,6 +44,9 @@ def _log_plot(title, plot_path) -> None: title (str): The title of the plot. plot_path (str): The path to the saved image file. 
""" + import matplotlib.image as mpimg + import matplotlib.pyplot as plt + img = mpimg.imread(plot_path) fig = plt.figure() ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect='auto', xticks=[], yticks=[]) # no ticks @@ -103,6 +102,7 @@ def on_fit_epoch_end(trainer): value=trainer.epoch_time, iteration=trainer.epoch) if trainer.epoch == 0: + from ultralytics.utils.torch_utils import model_info_for_loggers for k, v in model_info_for_loggers(trainer).items(): task.get_logger().report_single_value(k, v) diff --git a/ultralytics/utils/callbacks/comet.py b/ultralytics/utils/callbacks/comet.py index c065b836..2da71a95 100644 --- a/ultralytics/utils/callbacks/comet.py +++ b/ultralytics/utils/callbacks/comet.py @@ -1,10 +1,6 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license -import os -from pathlib import Path - from ultralytics.utils import LOGGER, RANK, SETTINGS, TESTS_RUNNING, ops -from ultralytics.utils.torch_utils import model_info_for_loggers try: assert not TESTS_RUNNING # do not log pytest @@ -12,18 +8,22 @@ try: import comet_ml assert hasattr(comet_ml, '__version__') # verify package is not directory + + import os + from pathlib import Path + + # Ensures certain logging functions only run for supported tasks + COMET_SUPPORTED_TASKS = ['detect'] + + # Names of plots created by YOLOv8 that are logged to Comet + EVALUATION_PLOT_NAMES = 'F1_curve', 'P_curve', 'R_curve', 'PR_curve', 'confusion_matrix' + LABEL_PLOT_NAMES = 'labels', 'labels_correlogram' + + _comet_image_prediction_count = 0 + except (ImportError, AssertionError): comet_ml = None -# Ensures certain logging functions only run for supported tasks -COMET_SUPPORTED_TASKS = ['detect'] - -# Names of plots created by YOLOv8 that are logged to Comet -EVALUATION_PLOT_NAMES = 'F1_curve', 'P_curve', 'R_curve', 'PR_curve', 'confusion_matrix' -LABEL_PLOT_NAMES = 'labels', 'labels_correlogram' - -_comet_image_prediction_count = 0 - def _get_comet_mode(): return os.getenv('COMET_MODE', 'online') @@ -327,6 +327,7 @@ def on_fit_epoch_end(trainer): experiment.log_metrics(trainer.metrics, step=curr_step, epoch=curr_epoch) experiment.log_metrics(trainer.lr, step=curr_step, epoch=curr_epoch) if curr_epoch == 1: + from ultralytics.utils.torch_utils import model_info_for_loggers experiment.log_metrics(model_info_for_loggers(trainer), step=curr_step, epoch=curr_epoch) if not save_assets: diff --git a/ultralytics/utils/callbacks/dvc.py b/ultralytics/utils/callbacks/dvc.py index 90f6f444..b5bfa9de 100644 --- a/ultralytics/utils/callbacks/dvc.py +++ b/ultralytics/utils/callbacks/dvc.py @@ -1,37 +1,37 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license -import os -import re -from pathlib import Path - -import pkg_resources as pkg - from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING -from ultralytics.utils.torch_utils import model_info_for_loggers try: assert not TESTS_RUNNING # do not log pytest assert SETTINGS['dvc'] is True # verify integration is enabled - from importlib.metadata import version - import dvclive + assert hasattr(dvclive, '__version__') # verify package is not directory + + import os + import re + from importlib.metadata import version + from pathlib import Path + + import pkg_resources as pkg + ver = version('dvclive') if pkg.parse_version(ver) < pkg.parse_version('2.11.0'): LOGGER.debug(f'DVCLive is detected but version {ver} is incompatible (>=2.11 required).') dvclive = None # noqa: F811 + + # DVCLive logger instance + live = None + _processed_plots = {} + + # `on_fit_epoch_end` is called on final validation (probably need to be fixed) for 
now this is the way we + # distinguish final evaluation of the best model vs last epoch validation + _training_epoch = False + except (ImportError, AssertionError, TypeError): dvclive = None -# DVCLive logger instance -live = None -_processed_plots = {} - -# `on_fit_epoch_end` is called on final validation (probably need to be fixed) -# for now this is the way we distinguish final evaluation of the best model vs -# last epoch validation -_training_epoch = False - def _log_images(path, prefix=''): if live: @@ -103,6 +103,7 @@ def on_fit_epoch_end(trainer): live.log_metric(metric, value) if trainer.epoch == 0: + from ultralytics.utils.torch_utils import model_info_for_loggers for metric, value in model_info_for_loggers(trainer).items(): live.log_metric(metric, value, plot=False) diff --git a/ultralytics/utils/callbacks/hub.py b/ultralytics/utils/callbacks/hub.py index fba5a6b5..7171fb90 100644 --- a/ultralytics/utils/callbacks/hub.py +++ b/ultralytics/utils/callbacks/hub.py @@ -5,7 +5,6 @@ from time import time from ultralytics.hub.utils import HUB_WEB_ROOT, PREFIX, events from ultralytics.utils import LOGGER, SETTINGS -from ultralytics.utils.torch_utils import model_info_for_loggers def on_pretrain_routine_end(trainer): @@ -24,6 +23,7 @@ def on_fit_epoch_end(trainer): # Upload metrics after val end all_plots = {**trainer.label_loss_items(trainer.tloss, prefix='train'), **trainer.metrics} if trainer.epoch == 0: + from ultralytics.utils.torch_utils import model_info_for_loggers all_plots = {**all_plots, **model_info_for_loggers(trainer)} session.metrics_queue[trainer.epoch] = json.dumps(all_plots) if time() - session.timers['metrics'] > session.rate_limits['metrics']: diff --git a/ultralytics/utils/callbacks/mlflow.py b/ultralytics/utils/callbacks/mlflow.py index 1d9a0462..424c931d 100644 --- a/ultralytics/utils/callbacks/mlflow.py +++ b/ultralytics/utils/callbacks/mlflow.py @@ -1,17 +1,16 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license -import os -import re -from pathlib import Path - -from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, colorstr +from ultralytics.utils import LOGGER, ROOT, SETTINGS, TESTS_RUNNING, colorstr try: assert not TESTS_RUNNING # do not log pytest assert SETTINGS['mlflow'] is True # verify integration is enabled import mlflow - assert hasattr(mlflow, '__version__') # verify package is not directory + + import os + import re + except (ImportError, AssertionError): mlflow = None @@ -56,11 +55,10 @@ def on_fit_epoch_end(trainer): def on_train_end(trainer): """Called at end of train loop to log model artifact info.""" if mlflow: - root_dir = Path(__file__).resolve().parents[3] run.log_artifact(trainer.last) run.log_artifact(trainer.best) run.pyfunc.log_model(artifact_path=experiment_name, - code_path=[str(root_dir)], + code_path=[str(ROOT.parent)], artifacts={'model_path': str(trainer.save_dir)}, python_model=run.pyfunc.PythonModel()) diff --git a/ultralytics/utils/callbacks/neptune.py b/ultralytics/utils/callbacks/neptune.py index 3488c3e9..40916a3c 100644 --- a/ultralytics/utils/callbacks/neptune.py +++ b/ultralytics/utils/callbacks/neptune.py @@ -1,10 +1,6 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license -import matplotlib.image as mpimg -import matplotlib.pyplot as plt - from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING -from ultralytics.utils.torch_utils import model_info_for_loggers try: assert not TESTS_RUNNING # do not log pytest @@ -13,11 +9,12 @@ try: from neptune.types import File assert hasattr(neptune, '__version__') + + run = None # NeptuneAI 
experiment logger instance + except (ImportError, AssertionError): neptune = None -run = None # NeptuneAI experiment logger instance - def _log_scalars(scalars, step=0): """Log scalars to the NeptuneAI experiment logger.""" @@ -42,6 +39,9 @@ def _log_plot(title, plot_path): title (str) Title of the plot plot_path (PosixPath or str) Path to the saved image file """ + import matplotlib.image as mpimg + import matplotlib.pyplot as plt + img = mpimg.imread(plot_path) fig = plt.figure() ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect='auto', xticks=[], yticks=[]) # no ticks @@ -70,6 +70,7 @@ def on_train_epoch_end(trainer): def on_fit_epoch_end(trainer): """Callback function called at end of each fit (train+val) epoch.""" if run and trainer.epoch == 0: + from ultralytics.utils.torch_utils import model_info_for_loggers run['Configuration/Model'] = model_info_for_loggers(trainer) _log_scalars(trainer.metrics, trainer.epoch + 1) diff --git a/ultralytics/utils/callbacks/raytune.py b/ultralytics/utils/callbacks/raytune.py index f73c48cc..417b3314 100644 --- a/ultralytics/utils/callbacks/raytune.py +++ b/ultralytics/utils/callbacks/raytune.py @@ -7,6 +7,7 @@ try: import ray from ray import tune from ray.air import session + except (ImportError, AssertionError): tune = None diff --git a/ultralytics/utils/callbacks/tensorboard.py b/ultralytics/utils/callbacks/tensorboard.py index c82f63e0..c1fce53b 100644 --- a/ultralytics/utils/callbacks/tensorboard.py +++ b/ultralytics/utils/callbacks/tensorboard.py @@ -8,12 +8,12 @@ try: assert not TESTS_RUNNING # do not log pytest assert SETTINGS['tensorboard'] is True # verify integration is enabled + WRITER = None # TensorBoard SummaryWriter instance + except (ImportError, AssertionError, TypeError): # TypeError for handling 'Descriptors cannot not be created directly.' protobuf errors in Windows SummaryWriter = None -WRITER = None # TensorBoard SummaryWriter instance - def _log_scalars(scalars, step=0): """Logs scalar values to TensorBoard.""" @@ -72,4 +72,4 @@ callbacks = { 'on_pretrain_routine_start': on_pretrain_routine_start, 'on_train_start': on_train_start, 'on_fit_epoch_end': on_fit_epoch_end, - 'on_batch_end': on_batch_end} + 'on_batch_end': on_batch_end} if SummaryWriter else {} diff --git a/ultralytics/utils/callbacks/wb.py b/ultralytics/utils/callbacks/wb.py index 0a1b0c4e..27b38749 100644 --- a/ultralytics/utils/callbacks/wb.py +++ b/ultralytics/utils/callbacks/wb.py @@ -9,11 +9,12 @@ try: import wandb as wb assert hasattr(wb, '__version__') + + _processed_plots = {} + except (ImportError, AssertionError): wb = None -_processed_plots = {} - def _log_plots(plots, step): for name, params in plots.items(): diff --git a/ultralytics/utils/downloads.py b/ultralytics/utils/downloads.py index 4e494eba..c62095c2 100644 --- a/ultralytics/utils/downloads.py +++ b/ultralytics/utils/downloads.py @@ -273,7 +273,7 @@ def safe_download(url, """ # Check if the URL is a Google Drive link - gdrive = 'drive.google.com' in url + gdrive = url.startswith('https://drive.google.com/') if gdrive: url, file = get_google_drive_file_info(url)
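
Usage context: a minimal sketch of how the behavior changed by this patch looks from the caller's side, assuming the public `ultralytics` YOLO API and the standard yolov8n.pt / coco128.yaml assets (neither is part of this diff). On Apple silicon, passing device='mps' now forces workers=0 for both Train and Val, mirroring the existing device='cpu' branch.

    from ultralytics import YOLO

    model = YOLO('yolov8n.pt')

    # With this patch, device='mps' sets args.workers = 0 inside BaseTrainer,
    # exactly as device='cpu' already did, so no dataloader worker processes
    # are spawned on Apple-silicon machines.
    model.train(data='coco128.yaml', epochs=1, imgsz=640, device='mps')

    # BaseValidator applies the same rule, so standalone validation on MPS
    # also runs with workers=0.
    model.val(data='coco128.yaml', device='mps')

The rationale matches the comment already in the CPU branch: on these devices epoch time is dominated by inference rather than dataloading, so multiprocessing dataloader workers mostly add startup overhead.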