mirror of
https://github.com/THU-MIG/yolov10.git
synced 2025-05-23 21:44:22 +08:00
Hold failed upload metrics and upload with next upload metrics (#8513)
Co-authored-by: hassaanfarooq01 <hassaanfarooq01@gmail.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
parent
33fff69f3d
commit
31cf94e7f9
@ -52,6 +52,7 @@ class HUBTrainingSession:
|
|||||||
"heartbeat": 300.0,
|
"heartbeat": 300.0,
|
||||||
} # rate limits (seconds)
|
} # rate limits (seconds)
|
||||||
self.metrics_queue = {} # holds metrics for each epoch until upload
|
self.metrics_queue = {} # holds metrics for each epoch until upload
|
||||||
|
self.metrics_upload_failed_queue = {} # holds metrics for each epoch if upload failed
|
||||||
self.timers = {} # holds timers in ultralytics/utils/callbacks/hub.py
|
self.timers = {} # holds timers in ultralytics/utils/callbacks/hub.py
|
||||||
|
|
||||||
# Parse input
|
# Parse input
|
||||||
@ -234,6 +235,9 @@ class HUBTrainingSession:
|
|||||||
self._show_upload_progress(progress_total, response)
|
self._show_upload_progress(progress_total, response)
|
||||||
|
|
||||||
if HTTPStatus.OK <= response.status_code < HTTPStatus.MULTIPLE_CHOICES:
|
if HTTPStatus.OK <= response.status_code < HTTPStatus.MULTIPLE_CHOICES:
|
||||||
|
# If this request was a metrics upload, it has now succeeded, so clear the failed-upload queue
|
||||||
|
if kwargs.get("metrics"):
|
||||||
|
self.metrics_upload_failed_queue = {}
|
||||||
return response # Success, no need to retry
|
return response # Success, no need to retry
|
||||||
|
|
||||||
if i == 0:
|
if i == 0:
|
||||||
@ -249,6 +253,10 @@ class HUBTrainingSession:
|
|||||||
|
|
||||||
time.sleep(2**i) # Exponential backoff for retries
|
time.sleep(2**i) # Exponential backoff for retries
|
||||||
|
|
||||||
|
# If this request was a metrics upload and the retries ended with no response, hold its metrics for the next upload
|
||||||
|
if response is None and kwargs.get("metrics"):
|
||||||
|
self.metrics_upload_failed_queue.update(kwargs.get("metrics", None))
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
if thread:
|
if thread:
|
||||||
|
@ -33,6 +33,11 @@ def on_fit_epoch_end(trainer):
|
|||||||
all_plots = {**all_plots, **model_info_for_loggers(trainer)}
|
all_plots = {**all_plots, **model_info_for_loggers(trainer)}
|
||||||
|
|
||||||
session.metrics_queue[trainer.epoch] = json.dumps(all_plots)
|
session.metrics_queue[trainer.epoch] = json.dumps(all_plots)
|
||||||
|
|
||||||
|
# If any metrics fail to upload, add them to the queue to attempt uploading again.
|
||||||
|
if session.metrics_upload_failed_queue:
|
||||||
|
session.metrics_queue.update(session.metrics_upload_failed_queue)
|
||||||
|
|
||||||
if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
|
if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
|
||||||
session.upload_metrics()
|
session.upload_metrics()
|
||||||
session.timers["metrics"] = time() # reset timer
|
session.timers["metrics"] = time() # reset timer
|
||||||
|
Loading…
x
Reference in New Issue
Block a user