diff --git a/ultralytics/hub/session.py b/ultralytics/hub/session.py index 2d26b5a9..ebc6692b 100644 --- a/ultralytics/hub/session.py +++ b/ultralytics/hub/session.py @@ -52,6 +52,7 @@ class HUBTrainingSession: "heartbeat": 300.0, } # rate limits (seconds) self.metrics_queue = {} # holds metrics for each epoch until upload + self.metrics_upload_failed_queue = {} # holds metrics for each epoch if upload failed self.timers = {} # holds timers in ultralytics/utils/callbacks/hub.py # Parse input @@ -234,6 +235,9 @@ class HUBTrainingSession: self._show_upload_progress(progress_total, response) if HTTPStatus.OK <= response.status_code < HTTPStatus.MULTIPLE_CHOICES: + # if request related to metrics upload + if kwargs.get("metrics"): + self.metrics_upload_failed_queue = {} return response # Success, no need to retry if i == 0: @@ -249,6 +253,10 @@ class HUBTrainingSession: time.sleep(2**i) # Exponential backoff for retries + # if request related to metrics upload and exceed retries + if response is None and kwargs.get("metrics"): + self.metrics_upload_failed_queue.update(kwargs.get("metrics", None)) + return response if thread: diff --git a/ultralytics/utils/callbacks/hub.py b/ultralytics/utils/callbacks/hub.py index 587b4590..cdb42b9b 100644 --- a/ultralytics/utils/callbacks/hub.py +++ b/ultralytics/utils/callbacks/hub.py @@ -33,6 +33,11 @@ def on_fit_epoch_end(trainer): all_plots = {**all_plots, **model_info_for_loggers(trainer)} session.metrics_queue[trainer.epoch] = json.dumps(all_plots) + + # If any metrics fail to upload, add them to the queue to attempt uploading again. + if session.metrics_upload_failed_queue: + session.metrics_queue.update(session.metrics_upload_failed_queue) + if time() - session.timers["metrics"] > session.rate_limits["metrics"]: session.upload_metrics() session.timers["metrics"] = time() # reset timer