Hold failed upload metrics and upload with next upload metrics (#8513)

Co-authored-by: hassaanfarooq01 <hassaanfarooq01@gmail.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
Hassaan Farooq 2024-03-01 05:08:29 +05:00 committed by GitHub
parent 33fff69f3d
commit 31cf94e7f9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 13 additions and 0 deletions

View File

@ -52,6 +52,7 @@ class HUBTrainingSession:
"heartbeat": 300.0,
} # rate limits (seconds)
self.metrics_queue = {} # holds metrics for each epoch until upload
self.metrics_upload_failed_queue = {} # holds metrics for each epoch if upload failed
self.timers = {} # holds timers in ultralytics/utils/callbacks/hub.py
# Parse input
@ -234,6 +235,9 @@ class HUBTrainingSession:
self._show_upload_progress(progress_total, response)
if HTTPStatus.OK <= response.status_code < HTTPStatus.MULTIPLE_CHOICES:
# if request related to metrics upload
if kwargs.get("metrics"):
self.metrics_upload_failed_queue = {}
return response # Success, no need to retry
if i == 0:
@ -249,6 +253,10 @@ class HUBTrainingSession:
time.sleep(2**i) # Exponential backoff for retries
# if request related to metrics upload and exceed retries
if response is None and kwargs.get("metrics"):
self.metrics_upload_failed_queue.update(kwargs.get("metrics", None))
return response
if thread:

View File

@ -33,6 +33,11 @@ def on_fit_epoch_end(trainer):
all_plots = {**all_plots, **model_info_for_loggers(trainer)}
session.metrics_queue[trainer.epoch] = json.dumps(all_plots)
# If any metrics fail to upload, add them to the queue to attempt uploading again.
if session.metrics_upload_failed_queue:
session.metrics_queue.update(session.metrics_upload_failed_queue)
if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
session.upload_metrics()
session.timers["metrics"] = time() # reset timer