mirror of
https://github.com/THU-MIG/yolov10.git
synced 2025-05-23 05:24:22 +08:00
Hold failed upload metrics and upload with next upload metrics (#8513)
Co-authored-by: hassaanfarooq01 <hassaanfarooq01@gmail.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
parent
33fff69f3d
commit
31cf94e7f9
@ -52,6 +52,7 @@ class HUBTrainingSession:
|
||||
"heartbeat": 300.0,
|
||||
} # rate limits (seconds)
|
||||
self.metrics_queue = {} # holds metrics for each epoch until upload
|
||||
self.metrics_upload_failed_queue = {} # holds metrics for each epoch if upload failed
|
||||
self.timers = {} # holds timers in ultralytics/utils/callbacks/hub.py
|
||||
|
||||
# Parse input
|
||||
@ -234,6 +235,9 @@ class HUBTrainingSession:
|
||||
self._show_upload_progress(progress_total, response)
|
||||
|
||||
if HTTPStatus.OK <= response.status_code < HTTPStatus.MULTIPLE_CHOICES:
|
||||
# If the request was a metrics upload, clear the failed-upload queue now that it succeeded
|
||||
if kwargs.get("metrics"):
|
||||
self.metrics_upload_failed_queue = {}
|
||||
return response # Success, no need to retry
|
||||
|
||||
if i == 0:
|
||||
@ -249,6 +253,10 @@ class HUBTrainingSession:
|
||||
|
||||
time.sleep(2**i) # Exponential backoff for retries
|
||||
|
||||
# If the request was a metrics upload and retries were exceeded, hold the metrics in the failed-upload queue
|
||||
if response is None and kwargs.get("metrics"):
|
||||
self.metrics_upload_failed_queue.update(kwargs.get("metrics", None))
|
||||
|
||||
return response
|
||||
|
||||
if thread:
|
||||
|
@ -33,6 +33,11 @@ def on_fit_epoch_end(trainer):
|
||||
all_plots = {**all_plots, **model_info_for_loggers(trainer)}
|
||||
|
||||
session.metrics_queue[trainer.epoch] = json.dumps(all_plots)
|
||||
|
||||
# If any metrics failed to upload previously, add them back to the queue so they are uploaded again.
|
||||
if session.metrics_upload_failed_queue:
|
||||
session.metrics_queue.update(session.metrics_upload_failed_queue)
|
||||
|
||||
if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
|
||||
session.upload_metrics()
|
||||
session.timers["metrics"] = time() # reset timer
|
||||
|
Loading…
x
Reference in New Issue
Block a user