Mirror of https://github.com/THU-MIG/yolov10.git (synced 2025-05-24 06:14:55 +08:00)
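Fix KeyError: 'epoch' in BaseTrainer.resume_training by returning early when training is not being resumed (so ckpt["epoch"] is only read on resume)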
This commit is contained in:
parent
f19f3e521f
commit
e369c45a62
@ -224,10 +224,9 @@ class BaseTrainer:
|
|||||||
"""Initializes and sets the DistributedDataParallel parameters for training."""
|
"""Initializes and sets the DistributedDataParallel parameters for training."""
|
||||||
torch.cuda.set_device(RANK)
|
torch.cuda.set_device(RANK)
|
||||||
self.device = torch.device("cuda", RANK)
|
self.device = torch.device("cuda", RANK)
|
||||||
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
|
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
|
||||||
os.environ["NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
|
|
||||||
dist.init_process_group(
|
dist.init_process_group(
|
||||||
"nccl" if dist.is_nccl_available() else "gloo",
|
backend="nccl" if dist.is_nccl_available() else "gloo",
|
||||||
timeout=timedelta(seconds=10800), # 3 hours
|
timeout=timedelta(seconds=10800), # 3 hours
|
||||||
rank=RANK,
|
rank=RANK,
|
||||||
world_size=world_size,
|
world_size=world_size,
|
||||||
@ -659,7 +658,8 @@ class BaseTrainer:
|
|||||||
|
|
||||||
def resume_training(self, ckpt):
|
def resume_training(self, ckpt):
|
||||||
"""Resume YOLO training from given epoch and best fitness."""
|
"""Resume YOLO training from given epoch and best fitness."""
|
||||||
if ckpt is None:
|
print(f"resume_training: {ckpt}")
|
||||||
|
if ckpt is None or not self.resume:
|
||||||
return
|
return
|
||||||
best_fitness = 0.0
|
best_fitness = 0.0
|
||||||
start_epoch = ckpt["epoch"] + 1
|
start_epoch = ckpt["epoch"] + 1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user