Add docformatter to pre-commit (#5279)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Burhan <62214284+Burhan-Q@users.noreply.github.com>
Glenn Jocher 2023-10-09 02:25:22 +02:00 committed by GitHub
parent c7aa83da31
commit 7517667a33
90 changed files with 1396 additions and 497 deletions

View File

@@ -62,6 +62,11 @@ repos:
         args:
           - --ignore-words-list=crate,nd,strack,dota,ane,segway,fo
+  - repo: https://github.com/PyCQA/docformatter
+    rev: v1.7.5
+    hooks:
+      - id: docformatter
   # - repo: https://github.com/asottile/yesqa
   #   rev: v1.4.0
   #   hooks:
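For context, docformatter rewrites docstrings toward PEP 257 conventions. A minimal sketch of the kind of change the new hook applies on commit (illustrative function, not from this commit):

def area(r):
    """   compute circle area
    given radius r   """

becomes, roughly:

def area(r):
    """Compute circle area given radius r."""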

View File

@@ -18,7 +18,15 @@ CODE_DIR = ROOT
 REFERENCE_DIR = ROOT.parent / 'docs/reference'


-def extract_classes_and_functions(filepath):
+def extract_classes_and_functions(filepath: Path):
+    """Extracts class and function names from a given Python file.
+
+    Args:
+        filepath (Path): The path to the Python file.
+
+    Returns:
+        (tuple): A tuple containing lists of class and function names.
+    """
     with open(filepath, 'r') as file:
         content = file.read()
@@ -31,7 +39,15 @@ def extract_classes_and_functions(filepath):
     return classes, functions


-def create_markdown(py_filepath, module_path, classes, functions):
+def create_markdown(py_filepath: Path, module_path: str, classes: list, functions: list):
+    """Creates a Markdown file containing the API reference for the given Python module.
+
+    Args:
+        py_filepath (Path): The path to the Python file.
+        module_path (str): The import path for the Python module.
+        classes (list): A list of class names within the module.
+        functions (list): A list of function names within the module.
+    """
     md_filepath = py_filepath.with_suffix('.md')

     # Read existing content and keep header content between first two ---
@@ -64,17 +80,35 @@ def create_markdown(py_filepath, module_path, classes, functions):
 def nested_dict():
+    """Creates and returns a nested defaultdict.
+
+    Returns:
+        (defaultdict): A nested defaultdict object.
+    """
     return defaultdict(nested_dict)


-def sort_nested_dict(d):
+def sort_nested_dict(d: dict):
+    """Sorts a nested dictionary recursively.
+
+    Args:
+        d (dict): The dictionary to sort.
+
+    Returns:
+        (dict): The sorted dictionary.
+    """
     return {
         key: sort_nested_dict(value) if isinstance(value, dict) else value
         for key, value in sorted(d.items())
     }


-def create_nav_menu_yaml(nav_items):
+def create_nav_menu_yaml(nav_items: list):
+    """Creates a YAML file for the navigation menu based on the provided list of items.
+
+    Args:
+        nav_items (list): A list of relative file paths to Markdown files for the navigation menu.
+    """
     nav_tree = nested_dict()

     for item_str in nav_items:
@@ -90,6 +124,7 @@ def create_nav_menu_yaml(nav_items):
     nav_tree_sorted = sort_nested_dict(nav_tree)

     def _dict_to_yaml(d, level=0):
+        """Converts a nested dictionary to a YAML-formatted string with indentation."""
         yaml_str = ''
         indent = '  ' * level
         for k, v in d.items():
@@ -105,6 +140,7 @@ def create_nav_menu_yaml(nav_items):
 def main():
+    """Main function to extract class and function names, create Markdown files, and generate a YAML navigation menu."""
     nav_items = []
     for root, _, files in os.walk(CODE_DIR):
         for file in files:

View File

@@ -16,7 +16,3 @@ keywords: Ultralytics, YOLO, HungarianMatcher, inverse_sigmoid, detection models
 ---

 ## ::: ultralytics.models.utils.ops.get_cdn_group
 <br><br>
-
----
-## ::: ultralytics.models.utils.ops.inverse_sigmoid
-<br><br>

View File

@@ -9,11 +9,12 @@ from ultralytics.utils import ASSETS, yaml_load
 from ultralytics.utils.checks import check_requirements, check_yaml


-class Yolov8:
+class YOLOv8:
+    """YOLOv8 object detection model class for handling inference and visualization."""

     def __init__(self, onnx_model, input_image, confidence_thres, iou_thres):
         """
-        Initializes an instance of the Yolov8 class.
+        Initializes an instance of the YOLOv8 class.

         Args:
             onnx_model: Path to the ONNX model.
@@ -213,8 +214,8 @@ if __name__ == '__main__':
     # Check the requirements and select the appropriate backend (CPU or GPU)
     check_requirements('onnxruntime-gpu' if torch.cuda.is_available() else 'onnxruntime')

-    # Create an instance of the Yolov8 class with the specified arguments
-    detection = Yolov8(args.model, args.img, args.conf_thres, args.iou_thres)
+    # Create an instance of the YOLOv8 class with the specified arguments
+    detection = YOLOv8(args.model, args.img, args.conf_thres, args.iou_thres)

     # Perform object detection and obtain the output image
     output_image = detection.main()

View File

@@ -7,11 +7,22 @@ from ultralytics.utils import ASSETS, yaml_load
 from ultralytics.utils.checks import check_yaml

 CLASSES = yaml_load(check_yaml('coco128.yaml'))['names']
 colors = np.random.uniform(0, 255, size=(len(CLASSES), 3))


 def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h):
+    """
+    Draws bounding boxes on the input image based on the provided arguments.
+
+    Args:
+        img (numpy.ndarray): The input image to draw the bounding box on.
+        class_id (int): Class ID of the detected object.
+        confidence (float): Confidence score of the detected object.
+        x (int): X-coordinate of the top-left corner of the bounding box.
+        y (int): Y-coordinate of the top-left corner of the bounding box.
+        x_plus_w (int): X-coordinate of the bottom-right corner of the bounding box.
+        y_plus_h (int): Y-coordinate of the bottom-right corner of the bounding box.
+    """
     label = f'{CLASSES[class_id]} ({confidence:.2f})'
     color = colors[class_id]
     cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2)
@@ -19,18 +30,39 @@ def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h):
 def main(onnx_model, input_image):
+    """
+    Main function to load ONNX model, perform inference, draw bounding boxes, and display the output image.
+
+    Args:
+        onnx_model (str): Path to the ONNX model.
+        input_image (str): Path to the input image.
+
+    Returns:
+        list: List of dictionaries containing detection information such as class_id, class_name, confidence, etc.
+    """
+    # Load the ONNX model
     model: cv2.dnn.Net = cv2.dnn.readNetFromONNX(onnx_model)
+
+    # Read the input image
     original_image: np.ndarray = cv2.imread(input_image)
     [height, width, _] = original_image.shape
+
+    # Prepare a square image for inference
     length = max((height, width))
     image = np.zeros((length, length, 3), np.uint8)
     image[0:height, 0:width] = original_image
+
+    # Calculate scale factor
     scale = length / 640
+
+    # Preprocess the image and prepare blob for model
     blob = cv2.dnn.blobFromImage(image, scalefactor=1 / 255, size=(640, 640), swapRB=True)
     model.setInput(blob)
+
+    # Perform inference
     outputs = model.forward()
+
+    # Prepare output array
     outputs = np.array([cv2.transpose(outputs[0])])
     rows = outputs.shape[1]
@@ -38,6 +70,7 @@ def main(onnx_model, input_image):
     scores = []
     class_ids = []

+    # Iterate through output to collect bounding boxes, confidence scores, and class IDs
     for i in range(rows):
         classes_scores = outputs[0][i][4:]
         (minScore, maxScore, minClassLoc, (x, maxClassIndex)) = cv2.minMaxLoc(classes_scores)
@@ -49,9 +82,12 @@ def main(onnx_model, input_image):
             scores.append(maxScore)
             class_ids.append(maxClassIndex)

+    # Apply NMS (Non-maximum suppression)
     result_boxes = cv2.dnn.NMSBoxes(boxes, scores, 0.25, 0.45, 0.5)

     detections = []

+    # Iterate through NMS results to draw bounding boxes and labels
     for i in range(len(result_boxes)):
         index = result_boxes[i]
         box = boxes[index]
@@ -65,6 +101,7 @@ def main(onnx_model, input_image):
         draw_bounding_box(original_image, class_ids[index], scores[index], round(box[0] * scale), round(box[1] * scale),
                           round((box[0] + box[2]) * scale), round((box[1] + box[3]) * scale))

+    # Display the image with bounding boxes
     cv2.imshow('image', original_image)
     cv2.waitKey(0)
     cv2.destroyAllWindows()
@@ -74,7 +111,7 @@ def main(onnx_model, input_image):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model', default='yolov8n.onnx', help='Input your onnx model.')
+    parser.add_argument('--model', default='yolov8n.onnx', help='Input your ONNX model.')
     parser.add_argument('--img', default=str(ASSETS / 'bus.jpg'), help='Path to input image.')
     args = parser.parse_args()
     main(args.model, args.img)

View File

@@ -33,10 +33,6 @@ counting_regions = [
     }, ]


-def is_inside_polygon(point, polygon):
-    return polygon.contains(Point(point))


 def mouse_callback(event, x, y, flags, param):
     """Mouse call back event."""
     global current_region
@@ -44,7 +40,7 @@ def mouse_callback(event, x, y, flags, param):
     # Mouse left button down event
     if event == cv2.EVENT_LBUTTONDOWN:
         for region in counting_regions:
-            if is_inside_polygon((x, y), region['polygon']):
+            if region['polygon'].contains(Point((x, y))):
                 current_region = region
                 current_region['dragging'] = True
                 current_region['offset_x'] = x
@@ -150,7 +146,7 @@ def run(
             # Check if detection inside region
             for region in counting_regions:
-                if is_inside_polygon((x, y), region['polygon']):
+                if region['polygon'].contains(Point((x, y))):
                     region['counts'] += 1

     # Draw regions (Polygons/Rectangles)
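The removed is_inside_polygon helper is now inlined as a direct Shapely call. A minimal sketch of the check itself (region coordinates assumed):

from shapely.geometry import Point, Polygon

region = Polygon([(0, 0), (400, 0), (400, 300), (0, 300)])
region.contains(Point((100, 150)))  # True: the point falls inside the region
region.contains(Point((500, 150)))  # False: the point falls outside it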

View File

@@ -60,3 +60,12 @@ SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = True
 SPLIT_BEFORE_CLOSING_BRACKET = False
 SPLIT_BEFORE_FIRST_ARGUMENT = False
 # EACH_DICT_ENTRY_ON_SEPARATE_LINE = False
+
+[docformatter]
+wrap-summaries = 120
+wrap-descriptions = 120
+in-place = true
+make-summary-multi-line = false
+pre-summary-newline = true
+force-wrap = false
+close-quotes-on-newline = true
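These settings drive the docstring rewrites seen throughout this commit: summaries and descriptions wrap at 120 columns, files are edited in place, and pre-summary-newline pushes the body of a multi-line docstring below the opening quotes. A rough sketch of the effect (illustrative docstring):

def f():
    """A long summary that would spill past the configured width,
    followed by description text."""

becomes:

def f():
    """
    A long summary that would spill past the configured width, followed by description text.
    """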

View File

@@ -12,6 +12,12 @@ README = (PARENT / 'README.md').read_text(encoding='utf-8')
 def get_version():
+    """
+    Retrieve the version number from the 'ultralytics/__init__.py' file.
+
+    Returns:
+        (str): The version number extracted from the '__version__' attribute in the 'ultralytics/__init__.py' file.
+    """
     file = PARENT / 'ultralytics/__init__.py'
     return re.search(r'^__version__ = [\'"]([^\'"]*)[\'"]', file.read_text(encoding='utf-8'), re.M)[1]
@@ -24,7 +30,7 @@ def parse_requirements(file_path: Path):
         file_path (str | Path): Path to the requirements.txt file.

     Returns:
-        List[str]: List of parsed requirements.
+        (List[str]): List of parsed requirements.
     """

     requirements = []

View File

@@ -9,7 +9,8 @@ TMP = Path(__file__).resolve().parent / 'tmp'  # temp directory for test files
 def pytest_addoption(parser):
-    """Add custom command-line options to pytest.
+    """
+    Add custom command-line options to pytest.

     Args:
         parser (pytest.config.Parser): The pytest parser object.
@@ -18,7 +19,8 @@ def pytest_addoption(parser):
 def pytest_configure(config):
-    """Register custom markers to avoid pytest warnings.
+    """
+    Register custom markers to avoid pytest warnings.

     Args:
         config (pytest.config.Config): The pytest config object.
@@ -27,7 +29,8 @@ def pytest_configure(config):
 def pytest_runtest_setup(item):
-    """Setup hook to skip tests marked as slow if the --slow option is not provided.
+    """
+    Setup hook to skip tests marked as slow if the --slow option is not provided.

     Args:
         item (pytest.Item): The test item object.

View File

@@ -22,11 +22,12 @@ EXPORT_ARGS = [
 def run(cmd):
-    # Run a subprocess command with check=True
+    """Execute a shell command using subprocess."""
     subprocess.run(cmd.split(), check=True)


 def test_special_modes():
+    """Test various special command modes of YOLO."""
     run('yolo help')
     run('yolo checks')
     run('yolo version')
@@ -36,31 +37,37 @@ def test_special_modes():
 @pytest.mark.parametrize('task,model,data', TASK_ARGS)
 def test_train(task, model, data):
+    """Test YOLO training for a given task, model, and data."""
     run(f'yolo train {task} model={model}.yaml data={data} imgsz=32 epochs=1 cache=disk')


 @pytest.mark.parametrize('task,model,data', TASK_ARGS)
 def test_val(task, model, data):
+    """Test YOLO validation for a given task, model, and data."""
     run(f'yolo val {task} model={WEIGHTS_DIR / model}.pt data={data} imgsz=32 save_txt save_json')


 @pytest.mark.parametrize('task,model,data', TASK_ARGS)
 def test_predict(task, model, data):
+    """Test YOLO prediction on sample assets for a given task and model."""
     run(f'yolo predict model={WEIGHTS_DIR / model}.pt source={ASSETS} imgsz=32 save save_crop save_txt')


 @pytest.mark.parametrize('model,format', EXPORT_ARGS)
 def test_export(model, format):
+    """Test exporting a YOLO model to different formats."""
     run(f'yolo export model={WEIGHTS_DIR / model}.pt format={format} imgsz=32')


 def test_rtdetr(task='detect', model='yolov8n-rtdetr.yaml', data='coco8.yaml'):
+    """Test the RTDETR functionality with the Ultralytics framework."""
     # Warning: MUST use imgsz=640
     run(f'yolo train {task} model={model} data={data} --imgsz= 640 epochs =1, cache = disk')  # add comma, spaces to args
     run(f"yolo predict {task} model={model} source={ASSETS / 'bus.jpg'} imgsz=640 save save_crop save_txt")


 def test_fastsam(task='segment', model=WEIGHTS_DIR / 'FastSAM-s.pt', data='coco8-seg.yaml'):
+    """Test FastSAM segmentation functionality within Ultralytics."""
     source = ASSETS / 'bus.jpg'

     run(f'yolo segment val {task} model={model} data={data} imgsz=32')
@@ -97,6 +104,7 @@ def test_fastsam(task='segment', model=WEIGHTS_DIR / 'FastSAM-s.pt', data='coco8
 def test_mobilesam():
+    """Test MobileSAM segmentation functionality using Ultralytics."""
     from ultralytics import SAM

     # Load the model
@@ -121,5 +129,6 @@ def test_mobilesam():
 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 @pytest.mark.skipif(CUDA_DEVICE_COUNT < 2, reason='DDP is not available')
 def test_train_gpu(task, model, data):
+    """Test YOLO training on GPU(s) for various tasks and models."""
     run(f'yolo train {task} model={model}.yaml data={data} imgsz=32 epochs=1 device=0')  # single GPU
     run(f'yolo train {task} model={model}.pt data={data} imgsz=32 epochs=1 device=0,1')  # multi GPU

View File

@@ -1,4 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
+
 import contextlib

 import pytest
@@ -17,18 +18,21 @@ BUS = ASSETS / 'bus.jpg'
 def test_checks():
+    """Validate CUDA settings against torch CUDA functions."""
     assert torch.cuda.is_available() == CUDA_IS_AVAILABLE
     assert torch.cuda.device_count() == CUDA_DEVICE_COUNT


 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 def test_train():
+    """Test model training on a minimal dataset."""
     device = 0 if CUDA_DEVICE_COUNT == 1 else [0, 1]
     YOLO(MODEL).train(data=DATA, imgsz=64, epochs=1, device=device)  # requires imgsz>=64


 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 def test_predict_multiple_devices():
+    """Validate model prediction on multiple devices."""
     model = YOLO('yolov8n.pt')
     model = model.cpu()
     assert str(model.device) == 'cpu'
@@ -53,6 +57,7 @@ def test_predict_multiple_devices():
 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 def test_autobatch():
+    """Check batch size for YOLO model using autobatch."""
     from ultralytics.utils.autobatch import check_train_batch_size

     check_train_batch_size(YOLO(MODEL).model.cuda(), imgsz=128, amp=True)
@@ -60,6 +65,7 @@ def test_autobatch():
 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 def test_utils_benchmarks():
+    """Profile YOLO models for performance benchmarks."""
     from ultralytics.utils.benchmarks import ProfileModels

     # Pre-export a dynamic engine model to use dynamic inference
@@ -69,6 +75,7 @@ def test_utils_benchmarks():
 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 def test_predict_sam():
+    """Test SAM model prediction with various prompts."""
     from ultralytics import SAM
     from ultralytics.models.sam import Predictor as SAMPredictor
@@ -102,6 +109,7 @@ def test_predict_sam():
 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 def test_model_ray_tune():
+    """Tune YOLO model with Ray optimization library."""
     with contextlib.suppress(RuntimeError):  # RuntimeError may be caused by out-of-memory
         YOLO('yolov8n-cls.yaml').tune(use_ray=True,
                                       data='imagenet10',
@@ -115,12 +123,14 @@ def test_model_ray_tune():
 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 def test_model_tune():
+    """Tune YOLO model for performance."""
     YOLO('yolov8n-pose.pt').tune(data='coco8-pose.yaml', plots=False, imgsz=32, epochs=1, iterations=2, device='cpu')
     YOLO('yolov8n-cls.pt').tune(data='imagenet10', plots=False, imgsz=32, epochs=1, iterations=2, device='cpu')


 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason='CUDA is not available')
 def test_pycocotools():
+    """Validate model predictions using pycocotools."""
     from ultralytics.models.yolo.detect import DetectionValidator
     from ultralytics.models.yolo.pose import PoseValidator
     from ultralytics.models.yolo.segment import SegmentationValidator

View File

@@ -14,10 +14,12 @@ MODEL = WEIGHTS_DIR / 'yolov8n'
 def test_func(*args):  # noqa
+    """Test function callback."""
     print('callback test passed')


 def test_export():
+    """Test model exporting functionality."""
     exporter = Exporter()
     exporter.add_callback('on_export_start', test_func)
     assert test_func in exporter.callbacks['on_export_start'], 'callback test failed'
@@ -26,6 +28,7 @@ def test_export():
 def test_detect():
+    """Test object detection functionality."""
     overrides = {'data': 'coco8.yaml', 'model': CFG_DET, 'imgsz': 32, 'epochs': 1, 'save': False}
     CFG.data = 'coco8.yaml'
     CFG.imgsz = 32
@@ -61,6 +64,7 @@ def test_detect():
 def test_segment():
+    """Test image segmentation functionality."""
     overrides = {'data': 'coco8-seg.yaml', 'model': CFG_SEG, 'imgsz': 32, 'epochs': 1, 'save': False}
     CFG.data = 'coco8-seg.yaml'
     CFG.imgsz = 32
@@ -98,6 +102,7 @@ def test_segment():
 def test_classify():
+    """Test image classification functionality."""
     overrides = {'data': 'imagenet10', 'model': CFG_CLS, 'imgsz': 32, 'epochs': 1, 'save': False}
     CFG.data = 'imagenet10'
     CFG.imgsz = 32

View File

@@ -27,11 +27,13 @@ IS_TMP_WRITEABLE = is_dir_writeable(TMP)
 def test_model_forward():
+    """Test the forward pass of the YOLO model."""
     model = YOLO(CFG)
     model(source=None, imgsz=32, augment=True)  # also test no source and augment


 def test_model_methods():
+    """Test various methods and properties of the YOLO model."""
     model = YOLO(MODEL)

     # Model methods
@@ -51,7 +53,7 @@ def test_model_methods():
 def test_model_profile():
-    # Test profile=True model argument
+    """Test profiling of the YOLO model with 'profile=True' argument."""
     from ultralytics.nn.tasks import DetectionModel

     model = DetectionModel()  # build model
@@ -61,7 +63,7 @@ def test_model_profile():
 @pytest.mark.skipif(not IS_TMP_WRITEABLE, reason='directory is not writeable')
 def test_predict_txt():
-    # Write a list of sources (file, dir, glob, recursive glob) to a txt file
+    """Test YOLO predictions with sources (file, dir, glob, recursive glob) specified in a text file."""
     txt_file = TMP / 'sources.txt'
     with open(txt_file, 'w') as f:
         for x in [ASSETS / 'bus.jpg', ASSETS, ASSETS / '*', ASSETS / '**/*.jpg']:
@@ -70,6 +72,7 @@ def test_predict_txt():
 def test_predict_img():
+    """Test YOLO prediction on various types of image sources."""
     model = YOLO(MODEL)
     seg_model = YOLO(WEIGHTS_DIR / 'yolov8n-seg.pt')
     cls_model = YOLO(WEIGHTS_DIR / 'yolov8n-cls.pt')
@@ -105,7 +108,7 @@ def test_predict_img():
 def test_predict_grey_and_4ch():
-    # Convert SOURCE to greyscale and 4-ch
+    """Test YOLO prediction on SOURCE converted to greyscale and 4-channel images."""
     im = Image.open(SOURCE)
     directory = TMP / 'im4'
     directory.mkdir(parents=True, exist_ok=True)
@@ -132,8 +135,11 @@ def test_predict_grey_and_4ch():
 @pytest.mark.skipif(not ONLINE, reason='environment is offline')
 @pytest.mark.skipif(not IS_TMP_WRITEABLE, reason='directory is not writeable')
 def test_track_stream():
-    # Test YouTube streaming inference (short 10 frame video) with non-default ByteTrack tracker
-    # imgsz=160 required for tracking for higher confidence and better matches
+    """
+    Test YouTube streaming tracking (short 10 frame video) with non-default ByteTrack tracker.
+
+    Note imgsz=160 is required for tracking for higher confidence and better matches.
+    """
     import yaml

     model = YOLO(MODEL)
@@ -153,37 +159,44 @@ def test_track_stream():
 def test_val():
+    """Test the validation mode of the YOLO model."""
     YOLO(MODEL).val(data='coco8.yaml', imgsz=32, save_hybrid=True)


 def test_train_scratch():
+    """Test training the YOLO model from scratch."""
     model = YOLO(CFG)
     model.train(data='coco8.yaml', epochs=2, imgsz=32, cache='disk', batch=-1, close_mosaic=1, name='model')
     model(SOURCE)


 def test_train_pretrained():
+    """Test training the YOLO model from a pre-trained state."""
     model = YOLO(WEIGHTS_DIR / 'yolov8n-seg.pt')
     model.train(data='coco8-seg.yaml', epochs=1, imgsz=32, cache='ram', copy_paste=0.5, mixup=0.5, name=0)
     model(SOURCE)


 def test_export_torchscript():
+    """Test exporting the YOLO model to TorchScript format."""
     f = YOLO(MODEL).export(format='torchscript', optimize=False)
     YOLO(f)(SOURCE)  # exported model inference


 def test_export_onnx():
+    """Test exporting the YOLO model to ONNX format."""
     f = YOLO(MODEL).export(format='onnx', dynamic=True)
     YOLO(f)(SOURCE)  # exported model inference


 def test_export_openvino():
+    """Test exporting the YOLO model to OpenVINO format."""
     f = YOLO(MODEL).export(format='openvino')
     YOLO(f)(SOURCE)  # exported model inference


 def test_export_coreml():
+    """Test exporting the YOLO model to CoreML format."""
     if not WINDOWS:  # RuntimeError: BlobWriter not loaded with coremltools 7.0 on windows
         if MACOS:
             f = YOLO(MODEL).export(format='coreml')
@@ -193,7 +206,11 @@ def test_export_coreml():
 def test_export_tflite(enabled=False):
-    # TF suffers from install conflicts on Windows and macOS
+    """
+    Test exporting the YOLO model to TFLite format.
+
+    Note TF suffers from install conflicts on Windows and macOS.
+    """
     if enabled and LINUX:
         model = YOLO(MODEL)
         f = model.export(format='tflite')
@@ -201,7 +218,11 @@ def test_export_tflite(enabled=False):
 def test_export_pb(enabled=False):
-    # TF suffers from install conflicts on Windows and macOS
+    """
+    Test exporting the YOLO model to *.pb format.
+
+    Note TF suffers from install conflicts on Windows and macOS.
+    """
     if enabled and LINUX:
         model = YOLO(MODEL)
         f = model.export(format='pb')
@@ -209,18 +230,24 @@ def test_export_pb(enabled=False):
 def test_export_paddle(enabled=False):
-    # Paddle protobuf requirements conflicting with onnx protobuf requirements
+    """
+    Test exporting the YOLO model to Paddle format.
+
+    Note Paddle protobuf requirements conflict with onnx protobuf requirements.
+    """
     if enabled:
         YOLO(MODEL).export(format='paddle')


 @pytest.mark.slow
 def test_export_ncnn():
+    """Test exporting the YOLO model to NCNN format."""
     f = YOLO(MODEL).export(format='ncnn')
     YOLO(f)(SOURCE)  # exported model inference


 def test_all_model_yamls():
+    """Test YOLO model creation for all available YAML configurations."""
     for m in (ROOT / 'cfg' / 'models').rglob('*.yaml'):
         if 'rtdetr' in m.name:
             if TORCH_1_9:  # torch<=1.8 issue - TypeError: __init__() got an unexpected keyword argument 'batch_first'
@@ -230,6 +257,7 @@ def test_all_model_yamls():
 def test_workflow():
+    """Test the complete workflow including training, validation, prediction, and exporting."""
     model = YOLO(MODEL)
     model.train(data='coco8.yaml', epochs=1, imgsz=32, optimizer='SGD')
     model.val(imgsz=32)
@@ -238,12 +266,14 @@ def test_workflow():
 def test_predict_callback_and_setup():
-    # Test callback addition for prediction
+    """Test callback functionality during YOLO prediction."""

-    def on_predict_batch_end(predictor):  # results -> List[batch_size]
+    def on_predict_batch_end(predictor):
+        """Callback function that handles operations at the end of a prediction batch."""
         path, im0s, _, _ = predictor.batch
         im0s = im0s if isinstance(im0s, list) else [im0s]
         bs = [predictor.dataset.bs for _ in range(len(path))]
-        predictor.results = zip(predictor.results, im0s, bs)
+        predictor.results = zip(predictor.results, im0s, bs)  # results is List[batch_size]

     model = YOLO(MODEL)
     model.add_callback('on_predict_batch_end', on_predict_batch_end)
@@ -259,6 +289,7 @@ def test_predict_callback_and_setup():
 def test_results():
+    """Test various result formats for the YOLO model."""
     for m in 'yolov8n-pose.pt', 'yolov8n-seg.pt', 'yolov8n.pt', 'yolov8n-cls.pt':
         results = YOLO(WEIGHTS_DIR / m)([SOURCE, SOURCE], imgsz=160)
         for r in results:
@@ -274,7 +305,7 @@ def test_results():
 @pytest.mark.skipif(not ONLINE, reason='environment is offline')
 def test_data_utils():
-    # Test functions in ultralytics/data/utils.py
+    """Test utility functions in ultralytics/data/utils.py."""
     from ultralytics.data.utils import HUBDatasetStats, autosplit
     from ultralytics.utils.downloads import zip_directory
@@ -294,7 +325,7 @@ def test_data_utils():
 @pytest.mark.skipif(not ONLINE, reason='environment is offline')
 def test_data_converter():
-    # Test dataset converters
+    """Test dataset converters."""
     from ultralytics.data.converter import coco80_to_coco91_class, convert_coco

     file = 'instances_val2017.json'
@@ -304,6 +335,7 @@ def test_data_converter():
 def test_data_annotator():
+    """Test automatic data annotation."""
     from ultralytics.data.annotator import auto_annotate

     auto_annotate(ASSETS,
@@ -313,7 +345,7 @@ def test_data_annotator():
 def test_events():
-    # Test event sending
+    """Test event sending functionality."""
     from ultralytics.hub.utils import Events

     events = Events()
@@ -324,6 +356,7 @@ def test_events():
 def test_cfg_init():
+    """Test configuration initialization utilities."""
     from ultralytics.cfg import check_dict_alignment, copy_default_cfg, smart_value

     with contextlib.suppress(SyntaxError):
@@ -334,6 +367,7 @@ def test_cfg_init():
 def test_utils_init():
+    """Test initialization utilities."""
     from ultralytics.utils import get_git_branch, get_git_origin_url, get_ubuntu_version, is_github_actions_ci

     get_ubuntu_version()
@@ -343,6 +377,7 @@ def test_utils_init():
 def test_utils_checks():
+    """Test various utility checks."""
     checks.check_yolov5u_filename('yolov5n.pt')
     checks.git_describe(ROOT)
     checks.check_requirements()  # check requirements.txt
@@ -354,12 +389,14 @@ def test_utils_checks():
 def test_utils_benchmarks():
+    """Test model benchmarking."""
     from ultralytics.utils.benchmarks import ProfileModels

     ProfileModels(['yolov8n.yaml'], imgsz=32, min_time=1, num_timed_runs=3, num_warmup_runs=1).profile()


 def test_utils_torchutils():
+    """Test Torch utility functions."""
     from ultralytics.nn.modules.conv import Conv
     from ultralytics.utils.torch_utils import get_flops_with_torch_profiler, profile, time_sync
@@ -373,12 +410,14 @@ def test_utils_torchutils():
 @pytest.mark.skipif(not ONLINE, reason='environment is offline')
 def test_utils_downloads():
+    """Test file download utilities."""
     from ultralytics.utils.downloads import get_google_drive_file_info

     get_google_drive_file_info('https://drive.google.com/file/d/1cqT-cJgANNrhIHCrEufUYhQ4RqiWG_lJ/view?usp=drive_link')


 def test_utils_ops():
+    """Test various operations utilities."""
     from ultralytics.utils.ops import (ltwh2xywh, ltwh2xyxy, make_divisible, xywh2ltwh, xywh2xyxy, xywhn2xyxy,
                                        xywhr2xyxyxyxy, xyxy2ltwh, xyxy2xywh, xyxy2xywhn, xyxyxyxy2xywhr)
@@ -396,6 +435,7 @@ def test_utils_ops():
 def test_utils_files():
+    """Test file handling utilities."""
     from ultralytics.utils.files import file_age, file_date, get_latest_run, spaces_in_path

     file_age(SOURCE)
@@ -409,6 +449,7 @@ def test_utils_files():
 def test_nn_modules_conv():
+    """Test Convolutional Neural Network modules."""
     from ultralytics.nn.modules.conv import CBAM, Conv2, ConvTranspose, DWConvTranspose2d, Focus

     c1, c2 = 8, 16  # input and output channels
@@ -427,6 +468,7 @@ def test_nn_modules_conv():
 def test_nn_modules_block():
+    """Test Neural Network block modules."""
     from ultralytics.nn.modules.block import C1, C3TR, BottleneckCSP, C3Ghost, C3x

     c1, c2 = 8, 16  # input and output channels
@@ -442,6 +484,7 @@ def test_nn_modules_block():
 @pytest.mark.skipif(not ONLINE, reason='environment is offline')
 def test_hub():
+    """Test Ultralytics HUB functionalities."""
     from ultralytics.hub import export_fmts_hub, logout
     from ultralytics.hub.utils import smart_request
@@ -453,6 +496,7 @@ def test_hub():
 @pytest.mark.slow
 @pytest.mark.skipif(not ONLINE, reason='environment is offline')
 def test_triton():
+    """Test NVIDIA Triton Server functionalities."""
     checks.check_requirements('tritonclient[all]')

     import subprocess
     import time

View File

@@ -180,8 +180,8 @@ def _handle_deprecation(custom):
 def check_dict_alignment(base: Dict, custom: Dict, e=None):
     """
-    This function checks for any mismatched keys between a custom configuration list and a base configuration list.
-    If any mismatched keys are found, the function prints out similar keys from the base list and exits the program.
+    This function checks for any mismatched keys between a custom configuration list and a base configuration list. If
+    any mismatched keys are found, the function prints out similar keys from the base list and exits the program.

     Args:
         custom (dict): a dictionary of custom configuration options
@@ -205,9 +205,8 @@ def check_dict_alignment(base: Dict, custom: Dict, e=None):
 def merge_equals_args(args: List[str]) -> List[str]:
     """
-    Merges arguments around isolated '=' args in a list of strings.
-    The function considers cases where the first argument ends with '=' or the second starts with '=',
-    as well as when the middle one is an equals sign.
+    Merges arguments around isolated '=' args in a list of strings. The function considers cases where the first
+    argument ends with '=' or the second starts with '=', as well as when the middle one is an equals sign.

     Args:
         args (List[str]): A list of strings where each element is an argument.
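A quick sketch of the merging behavior the reflowed docstring describes (argument values assumed):

merge_equals_args(['imgsz', '=', '640'])  # -> ['imgsz=640']  isolated '='
merge_equals_args(['epochs=', '1'])       # -> ['epochs=1']   first argument ends with '='
merge_equals_args(['conf', '=0.25'])      # -> ['conf=0.25']  second argument starts with '='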

View File

@@ -20,16 +20,30 @@ from .utils import polygons2masks, polygons2masks_overlap
 # TODO: we might need a BaseTransform to make all these augments be compatible with both classification and semantic
 class BaseTransform:
+    """
+    Base class for image transformations.
+
+    This is a generic transformation class that can be extended for specific image processing needs.
+    The class is designed to be compatible with both classification and semantic segmentation tasks.
+
+    Methods:
+        __init__: Initializes the BaseTransform object.
+        apply_image: Applies image transformation to labels.
+        apply_instances: Applies transformations to object instances in labels.
+        apply_semantic: Applies semantic segmentation to an image.
+        __call__: Applies all label transformations to an image, instances, and semantic masks.
+    """

     def __init__(self) -> None:
+        """Initializes the BaseTransform object."""
         pass

     def apply_image(self, labels):
-        """Applies image transformation to labels."""
+        """Applies image transformations to labels."""
         pass

     def apply_instances(self, labels):
-        """Applies transformations to input 'labels' and returns object instances."""
+        """Applies transformations to object instances in labels."""
         pass

     def apply_semantic(self, labels):
@@ -37,13 +51,14 @@ class BaseTransform:
         pass

     def __call__(self, labels):
-        """Applies label transformations to an image, instances and semantic masks."""
+        """Applies all label transformations to an image, instances, and semantic masks."""
         self.apply_image(labels)
         self.apply_instances(labels)
         self.apply_semantic(labels)


 class Compose:
+    """Class for composing multiple image transformations."""

     def __init__(self, transforms):
         """Initializes the Compose object with a list of transforms."""
@@ -60,18 +75,23 @@ class Compose:
         self.transforms.append(transform)

     def tolist(self):
-        """Converts list of transforms to a standard Python list."""
+        """Converts the list of transforms to a standard Python list."""
         return self.transforms

     def __repr__(self):
-        """Return string representation of object."""
+        """Returns a string representation of the object."""
         return f"{self.__class__.__name__}({', '.join([f'{t}' for t in self.transforms])})"


 class BaseMixTransform:
-    """This implementation is from mmyolo."""
+    """
+    Class for base mix (MixUp/Mosaic) transformations.
+
+    This implementation is from mmyolo.
+    """

     def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+        """Initializes the BaseMixTransform object with dataset, pre_transform, and probability."""
         self.dataset = dataset
         self.pre_transform = pre_transform
         self.p = p
@@ -262,8 +282,10 @@ class Mosaic(BaseMixTransform):
 class MixUp(BaseMixTransform):
+    """Class for applying MixUp augmentation to the dataset."""

     def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+        """Initializes MixUp object with dataset, pre_transform, and probability of applying MixUp."""
         super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)

     def get_indexes(self):
@@ -271,7 +293,7 @@ class MixUp(BaseMixTransform):
         return random.randint(0, len(self.dataset) - 1)

     def _mix_transform(self, labels):
-        """Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf."""
+        """Applies MixUp augmentation as per https://arxiv.org/pdf/1710.09412.pdf."""
         r = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
         labels2 = labels['mix_labels'][0]
         labels['img'] = (labels['img'] * r + labels2['img'] * (1 - r)).astype(np.uint8)
@@ -281,6 +303,28 @@ class MixUp(BaseMixTransform):
 class RandomPerspective:
+    """
+    Implements random perspective and affine transformations on images and corresponding bounding boxes, segments, and
+    keypoints. These transformations include rotation, translation, scaling, and shearing. The class also offers the
+    option to apply these transformations conditionally with a specified probability.
+
+    Attributes:
+        degrees (float): Degree range for random rotations.
+        translate (float): Fraction of total width and height for random translation.
+        scale (float): Scaling factor interval, e.g., a scale factor of 0.1 allows a resize between 90%-110%.
+        shear (float): Shear intensity (angle in degrees).
+        perspective (float): Perspective distortion factor.
+        border (tuple): Tuple specifying mosaic border.
+        pre_transform (callable): A function/transform to apply to the image before starting the random transformation.
+
+    Methods:
+        affine_transform(img, border): Applies a series of affine transformations to the image.
+        apply_bboxes(bboxes, M): Transforms bounding boxes using the calculated affine matrix.
+        apply_segments(segments, M): Transforms segments and generates new bounding boxes.
+        apply_keypoints(keypoints, M): Transforms keypoints.
+        __call__(labels): Main method to apply transformations to both images and their corresponding annotations.
+        box_candidates(box1, box2): Filters out bounding boxes that don't meet certain criteria post-transformation.
+    """

     def __init__(self,
                  degrees=0.0,
@@ -290,17 +334,31 @@ class RandomPerspective:
                  perspective=0.0,
                  border=(0, 0),
                  pre_transform=None):
+        """Initializes RandomPerspective object with transformation parameters."""
         self.degrees = degrees
         self.translate = translate
         self.scale = scale
         self.shear = shear
         self.perspective = perspective
-        # Mosaic border
-        self.border = border
+        self.border = border  # mosaic border
         self.pre_transform = pre_transform

     def affine_transform(self, img, border):
-        """Center."""
+        """
+        Applies a sequence of affine transformations centered around the image center.
+
+        Args:
+            img (ndarray): Input image.
+            border (tuple): Border dimensions.
+
+        Returns:
+            img (ndarray): Transformed image.
+            M (ndarray): Transformation matrix.
+            s (float): Scale factor.
+        """
+
+        # Center
         C = np.eye(3, dtype=np.float32)

         C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
@@ -462,8 +520,22 @@ class RandomPerspective:
         labels['resized_shape'] = img.shape[:2]
         return labels

-    def box_candidates(self, box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):  # box1(4,n), box2(4,n)
-        # Compute box candidates: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio
+    def box_candidates(self, box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):
+        """
+        Compute box candidates based on a set of thresholds. This method compares the characteristics of the boxes
+        before and after augmentation to decide whether a box is a candidate for further processing.
+
+        Args:
+            box1 (numpy.ndarray): The 4,n bounding box before augmentation, represented as [x1, y1, x2, y2].
+            box2 (numpy.ndarray): The 4,n bounding box after augmentation, represented as [x1, y1, x2, y2].
+            wh_thr (float, optional): The width and height threshold in pixels. Default is 2.
+            ar_thr (float, optional): The aspect ratio threshold. Default is 100.
+            area_thr (float, optional): The area ratio threshold. Default is 0.1.
+            eps (float, optional): A small epsilon value to prevent division by zero. Default is 1e-16.
+
+        Returns:
+            (numpy.ndarray): A boolean array indicating which boxes are candidates based on the given thresholds.
+        """
         w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
         w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
         ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps))  # aspect ratio
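A small worked example of those thresholds (values assumed, boxes in the (4, n) layout the docstring describes):

import numpy as np

box1 = np.array([[0., 50.], [0., 50.], [40., 60.], [40., 60.]])  # two boxes before augmentation
box2 = np.array([[0., 50.], [0., 50.], [40., 51.], [40., 60.]])  # the same boxes afterwards
# The first box keeps w = h = 40 px and passes every threshold; the second ends up
# only 1 px wide (below wh_thr=2), so the method would return array([ True, False]).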
@@ -471,14 +543,32 @@ class RandomPerspective:
 class RandomHSV:
+    """
+    This class is responsible for performing random adjustments to the Hue, Saturation, and Value (HSV) channels of an
+    image.
+
+    The adjustments are random but within limits set by hgain, sgain, and vgain.
+    """

     def __init__(self, hgain=0.5, sgain=0.5, vgain=0.5) -> None:
+        """
+        Initialize RandomHSV class with gains for each HSV channel.
+
+        Args:
+            hgain (float, optional): Maximum variation for hue. Default is 0.5.
+            sgain (float, optional): Maximum variation for saturation. Default is 0.5.
+            vgain (float, optional): Maximum variation for value. Default is 0.5.
+        """
         self.hgain = hgain
         self.sgain = sgain
         self.vgain = vgain

     def __call__(self, labels):
-        """Applies image HSV augmentation"""
+        """
+        Applies random HSV augmentation to an image within the predefined limits.
+
+        The modified image replaces the original image in the input 'labels' dict.
+        """
         img = labels['img']
         if self.hgain or self.sgain or self.vgain:
             r = np.random.uniform(-1, 1, 3) * [self.hgain, self.sgain, self.vgain] + 1  # random gains
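With the default gains above, each per-channel multiplier r is drawn from [0.5, 1.5], matching the line shown in __call__ (output values illustrative):

r = np.random.uniform(-1, 1, 3) * [0.5, 0.5, 0.5] + 1  # e.g. array([0.87, 1.32, 1.05])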
@@ -496,9 +586,22 @@ class RandomHSV:
 class RandomFlip:
-    """Applies random horizontal or vertical flip to an image with a given probability."""
+    """
+    Applies a random horizontal or vertical flip to an image with a given probability.
+
+    Also updates any instances (bounding boxes, keypoints, etc.) accordingly.
+    """

     def __init__(self, p=0.5, direction='horizontal', flip_idx=None) -> None:
+        """
+        Initializes the RandomFlip class with probability and direction.
+
+        Args:
+            p (float, optional): The probability of applying the flip. Must be between 0 and 1. Default is 0.5.
+            direction (str, optional): The direction to apply the flip. Must be 'horizontal' or 'vertical'.
+                Default is 'horizontal'.
+            flip_idx (array-like, optional): Index mapping for flipping keypoints, if any.
+        """
         assert direction in ['horizontal', 'vertical'], f'Support direction `horizontal` or `vertical`, got {direction}'
         assert 0 <= p <= 1.0
@@ -507,7 +610,16 @@ class RandomFlip:
         self.flip_idx = flip_idx

     def __call__(self, labels):
-        """Resize image and padding for detection, instance segmentation, pose."""
+        """
+        Applies random flip to an image and updates any instances like bounding boxes or keypoints accordingly.
+
+        Args:
+            labels (dict): A dictionary containing the keys 'img' and 'instances'. 'img' is the image to be flipped.
+                'instances' is an object containing bounding boxes and optionally keypoints.
+
+        Returns:
+            (dict): The same dict with the flipped image and updated instances under the 'img' and 'instances' keys.
+        """
         img = labels['img']
         instances = labels.pop('instances')
         instances.convert_bbox(format='xywh')
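For pose models, a horizontal flip must also swap left/right keypoints, which is what flip_idx encodes. A sketch for the standard 17-keypoint COCO ordering (mapping assumed, not taken from this diff):

flip_idx = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
# nose stays at index 0; left_eye(1) <-> right_eye(2), left_ear(3) <-> right_ear(4), and so on,
# applied along the keypoint axis after the image itself is mirrored.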
@ -599,12 +711,38 @@ class LetterBox:
class CopyPaste: class CopyPaste:
"""
Implements the Copy-Paste augmentation as described in the paper https://arxiv.org/abs/2012.07177. This class is
responsible for applying the Copy-Paste augmentation on images and their corresponding instances.
"""
def __init__(self, p=0.5) -> None: def __init__(self, p=0.5) -> None:
"""
Initializes the CopyPaste class with a given probability.
Args:
p (float, optional): The probability of applying the Copy-Paste augmentation. Must be between 0 and 1.
Default is 0.5.
"""
self.p = p self.p = p
def __call__(self, labels): def __call__(self, labels):
"""Implement Copy-Paste augmentation https://arxiv.org/abs/2012.07177, labels as nx5 np.array(cls, xyxy).""" """
Applies the Copy-Paste augmentation to the given image and instances.
Args:
labels (dict): A dictionary containing:
- 'img': The image to augment.
- 'cls': Class labels associated with the instances.
- 'instances': Object containing bounding boxes, and optionally, keypoints and segments.
Returns:
(dict): Dict with augmented image and updated instances under the 'img', 'cls', and 'instances' keys.
Notes:
1. Instances are expected to have 'segments' as one of their attributes for this augmentation to work.
2. This method modifies the input dictionary 'labels' in place.
"""
im = labels['img']
cls = labels['cls']
h, w = im.shape[:2]
@ -639,9 +777,13 @@ class CopyPaste:
class Albumentations:
"""
Albumentations transformations.

Optional, uninstall package to disable. Applies Blur, Median Blur, convert to grayscale, Contrast Limited Adaptive
Histogram Equalization, random change of brightness and contrast, RandomGamma and lowering of image quality by
compression.
"""
def __init__(self, p=1.0):
"""Initialize the transform object for YOLO bbox formatted params."""
@ -690,6 +832,19 @@ class Albumentations:
# TODO: technically this is not an augmentation, maybe we should put this to another file

class Format:
"""
Formats image annotations for object detection, instance segmentation, and pose estimation tasks. The class
standardizes the image and instance annotations to be used by the `collate_fn` in PyTorch DataLoader.
Attributes:
bbox_format (str): Format for bounding boxes. Default is 'xywh'.
normalize (bool): Whether to normalize bounding boxes. Default is True.
return_mask (bool): Return instance masks for segmentation. Default is False.
return_keypoint (bool): Return keypoints for pose estimation. Default is False.
mask_ratio (int): Downsample ratio for masks. Default is 4.
mask_overlap (bool): Whether to overlap masks. Default is True.
batch_idx (bool): Keep batch indexes. Default is True.
"""
def __init__(self,
bbox_format='xywh',
@ -699,6 +854,7 @@ class Format:
mask_ratio=4,
mask_overlap=True,
batch_idx=True):
"""Initializes the Format class with given parameters."""
self.bbox_format = bbox_format
self.normalize = normalize
self.return_mask = return_mask  # set False when training detection only
@ -746,7 +902,7 @@ class Format:
return img

def _format_segments(self, instances, cls, w, h):
"""Convert polygon points to bitmap."""
segments = instances.segments
if self.mask_overlap:
masks, sorted_idx = polygons2masks_overlap((h, w), segments, downsample_ratio=self.mask_ratio)
@ -851,35 +1007,75 @@ def classify_albumentations(
class ClassifyLetterBox:
"""
YOLOv8 LetterBox class for image preprocessing, designed to be part of a transformation pipeline, e.g.,
T.Compose([LetterBox(size), ToTensor()]).
Attributes:
h (int): Target height of the image.
w (int): Target width of the image.
auto (bool): If True, automatically solves for short side using stride.
stride (int): The stride value, used when 'auto' is True.
"""
def __init__(self, size=(640, 640), auto=False, stride=32):
"""
Initializes the ClassifyLetterBox class with a target size, auto-flag, and stride.
Args:
size (Union[int, Tuple[int, int]]): The target dimensions (height, width) for the letterbox.
auto (bool): If True, automatically calculates the short side based on stride.
stride (int): The stride value, used when 'auto' is True.
"""
super().__init__()
self.h, self.w = (size, size) if isinstance(size, int) else size
self.auto = auto  # pass max size integer, automatically solve for short side using stride
self.stride = stride  # used with auto

def __call__(self, im):
"""
Resizes the image and pads it with a letterbox method.
Args:
im (numpy.ndarray): The input image as a numpy array of shape HWC.
Returns:
(numpy.ndarray): The letterboxed and resized image as a numpy array.
"""
imh, imw = im.shape[:2]
r = min(self.h / imh, self.w / imw)  # ratio of new/old dimensions
h, w = round(imh * r), round(imw * r)  # resized image dimensions
# Calculate padding dimensions
hs, ws = (math.ceil(x / self.stride) * self.stride for x in (h, w)) if self.auto else (self.h, self.w)
top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1)
# Create padded image
im_out = np.full((hs, ws, 3), 114, dtype=im.dtype)
im_out[top:top + h, left:left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
return im_out
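A short illustrative sketch of ClassifyLetterBox on a placeholder image (sizes are arbitrary):

```python
import numpy as np

letterbox = ClassifyLetterBox(size=(640, 640))
im = np.random.randint(0, 256, (480, 720, 3), dtype=np.uint8)  # HWC placeholder
out = letterbox(im)
print(out.shape)  # (640, 640, 3), borders padded with the value 114
```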
class CenterCrop:
"""YOLOv8 CenterCrop class for image preprocessing, designed to be part of a transformation pipeline, e.g.,
T.Compose([CenterCrop(size), ToTensor()]).
"""
def __init__(self, size=640):
"""Initializes the CenterCrop object with a target size."""
super().__init__()
self.h, self.w = (size, size) if isinstance(size, int) else size

def __call__(self, im):
"""
Resizes and crops the center of the image using a letterbox method.
Args:
im (numpy.ndarray): The input image as a numpy array of shape HWC.
Returns:
(numpy.ndarray): The center-cropped and resized image as a numpy array.
"""
imh, imw = im.shape[:2]
m = min(imh, imw)  # min dimension
top, left = (imh - m) // 2, (imw - m) // 2
@ -887,14 +1083,23 @@ class CenterCrop:
class ToTensor:
"""YOLOv8 ToTensor class for image preprocessing, i.e., T.Compose([LetterBox(size), ToTensor()])."""

def __init__(self, half=False):
"""Initialize YOLOv8 ToTensor object with optional half-precision support."""
super().__init__()
self.half = half

def __call__(self, im):
"""
Transforms an image from a numpy array to a PyTorch tensor, applying optional half-precision and normalization.
Args:
im (numpy.ndarray): Input image as a numpy array with shape (H, W, C) in BGR order.
Returns:
(torch.Tensor): The transformed image as a PyTorch tensor in float32 or float16, normalized to [0, 1].
"""
im = np.ascontiguousarray(im.transpose((2, 0, 1))[::-1])  # HWC to CHW -> BGR to RGB -> contiguous
im = torch.from_numpy(im)  # to torch
im = im.half() if self.half else im.float()  # uint8 to fp16/32
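Composed, these transforms form a classification preprocessing pipeline as the docstrings above suggest; a hedged sketch with placeholder sizes:

```python
import numpy as np
import torchvision.transforms as T

pipeline = T.Compose([CenterCrop(224), ToTensor(half=False)])
im = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # HWC BGR placeholder
tensor = pipeline(im)  # CHW RGB float32 tensor, normalized to [0, 1]
```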
View File
@ -62,6 +62,7 @@ class BaseDataset(Dataset):
classes=None,
fraction=1.0):
super().__init__()
"""Initialize BaseDataset with given configuration and options."""
self.img_path = img_path
self.imgsz = imgsz
self.augment = augment
@ -256,7 +257,7 @@ class BaseDataset(Dataset):
return len(self.labels)

def update_labels_info(self, label):
"""Customize your label format here."""
return label

def build_transforms(self, hyp=None):
View File
@ -20,7 +20,11 @@ from .utils import PIN_MEMORY
class InfiniteDataLoader(dataloader.DataLoader):
"""
Dataloader that reuses workers.
Uses same syntax as vanilla DataLoader.
"""
def __init__(self, *args, **kwargs):
"""Dataloader that infinitely recycles workers, inherits from DataLoader."""
@ -38,7 +42,9 @@ class InfiniteDataLoader(dataloader.DataLoader):
yield next(self.iterator)

def reset(self):
"""
Reset iterator.
This is useful when we want to modify settings of dataset while training.
"""
self.iterator = self._get_iterator()
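A hypothetical sketch of how InfiniteDataLoader behaves like a vanilla DataLoader while reusing its workers:

```python
import torch
from torch.utils.data import TensorDataset

dataset = TensorDataset(torch.arange(8).float())
loader = InfiniteDataLoader(dataset, batch_size=4, shuffle=True)
for epoch in range(2):
    for batch in loader:  # workers persist across epochs instead of being re-spawned
        pass
loader.reset()  # rebuild the iterator, e.g. after mutating dataset settings mid-training
```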
@ -70,7 +76,7 @@ def seed_worker(worker_id): # noqa
def build_yolo_dataset(cfg, img_path, batch, data, mode='train', rect=False, stride=32):
"""Build YOLO Dataset."""
return YOLODataset(
img_path=img_path,
imgsz=cfg.imgsz,
View File
@ -12,7 +12,8 @@ from ultralytics.utils import TQDM
def coco91_to_coco80_class():
"""
Converts 91-index COCO class IDs to 80-index COCO class IDs.
Returns:
(list): A list of 91 class IDs where the index represents the 80-index class ID and the value is the
@ -51,7 +52,8 @@ def convert_coco(labels_dir='../coco/annotations/',
use_segments=False,
use_keypoints=False,
cls91to80=True):
"""
Converts COCO dataset annotations to a format suitable for training YOLOv5 models.
Args:
labels_dir (str, optional): Path to directory containing COCO dataset annotation files.
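A hedged usage sketch of convert_coco based on the signature above (the paths are placeholders):

```python
from ultralytics.data.converter import convert_coco

convert_coco(labels_dir='../coco/annotations/', use_segments=True, use_keypoints=False, cls91to80=True)
```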
@ -203,6 +205,7 @@ def convert_dota_to_yolo_obb(dota_root_path: str):
'helipad': 17}

def convert_label(image_name, image_width, image_height, orig_label_dir, save_dir):
"""Converts a single image's DOTA annotation to YOLO OBB format and saves it to a specified directory."""
orig_label_path = orig_label_dir / f'{image_name}.txt'
save_path = save_dir / f'{image_name}.txt'
View File
@ -33,6 +33,7 @@ class YOLODataset(BaseDataset):
""" """
def __init__(self, *args, data=None, use_segments=False, use_keypoints=False, **kwargs): def __init__(self, *args, data=None, use_segments=False, use_keypoints=False, **kwargs):
"""Initializes the YOLODataset with optional configurations for segments and keypoints."""
self.use_segments = use_segments
self.use_keypoints = use_keypoints
self.data = data
@ -40,7 +41,9 @@ class YOLODataset(BaseDataset):
super().__init__(*args, **kwargs)

def cache_labels(self, path=Path('./labels.cache')):
"""
Cache dataset labels, check images and read shapes.
Args:
path (Path): path where to save the cache file (default: Path('./labels.cache')).

Returns:
@ -157,7 +160,7 @@ class YOLODataset(BaseDataset):
self.transforms = self.build_transforms(hyp)

def update_labels_info(self, label):
"""Customize your label format here."""
# NOTE: cls is not with bboxes now, classification and semantic segmentation need an independent cls label
# We can make it also support classification and semantic segmentation by adding or removing some dict keys there.
bboxes = label.pop('bboxes')
@ -254,6 +257,7 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
return {'img': sample, 'cls': j}

def __len__(self) -> int:
"""Return the total number of samples in the dataset."""
return len(self.samples)
def verify_images(self):
@ -320,6 +324,16 @@ def save_dataset_cache_file(prefix, path, x):
# TODO: support semantic segmentation
class SemanticDataset(BaseDataset):
"""
Semantic Segmentation Dataset.
This class is responsible for handling datasets used for semantic segmentation tasks. It inherits functionalities
from the BaseDataset class.
Note:
This class is currently a placeholder and needs to be populated with methods and attributes for supporting
semantic segmentation tasks.
"""
def __init__(self):
"""Initialize a SemanticDataset object."""
View File
@ -22,6 +22,7 @@ from ultralytics.utils.checks import check_requirements
@dataclass
class SourceTypes:
"""Class to represent various types of input sources for predictions."""
webcam: bool = False
screenshot: bool = False
from_img: bool = False
@ -29,7 +30,34 @@ class SourceTypes:
class LoadStreams:
"""
Stream Loader for various types of video streams.
Suitable for use with `yolo predict source='rtsp://example.com/media.mp4'`, supports RTSP, RTMP, HTTP, and TCP streams.
Attributes:
sources (str): The source input paths or URLs for the video streams.
imgsz (int): The image size for processing, defaults to 640.
vid_stride (int): Video frame-rate stride, defaults to 1.
buffer (bool): Whether to buffer input streams, defaults to False.
running (bool): Flag to indicate if the streaming thread is running.
mode (str): Set to 'stream' indicating real-time capture.
imgs (list): List of image frames for each stream.
fps (list): List of FPS for each stream.
frames (list): List of total frames for each stream.
threads (list): List of threads for each stream.
shape (list): List of shapes for each stream.
caps (list): List of cv2.VideoCapture objects for each stream.
bs (int): Batch size for processing.
Methods:
__init__: Initialize the stream loader.
update: Read stream frames in daemon thread.
close: Close stream loader and release resources.
__iter__: Returns an iterator object for the class.
__next__: Returns source paths, transformed, and original images for processing.
__len__: Return the length of the sources object.
"""
def __init__(self, sources='file.streams', imgsz=640, vid_stride=1, buffer=False):
"""Initialize instance variables and check for consistent input stream shapes."""
@ -149,10 +177,33 @@ class LoadStreams:
class LoadScreenshots:
"""
YOLOv8 screenshot dataloader.
This class manages the loading of screenshot images for processing with YOLOv8.
Suitable for use with `yolo predict source=screen`.
Attributes:
source (str): The source input indicating which screen to capture.
imgsz (int): The image size for processing, defaults to 640.
screen (int): The screen number to capture.
left (int): The left coordinate for screen capture area.
top (int): The top coordinate for screen capture area.
width (int): The width of the screen capture area.
height (int): The height of the screen capture area.
mode (str): Set to 'stream' indicating real-time capture.
frame (int): Counter for captured frames.
sct (mss.mss): Screen capture object from `mss` library.
bs (int): Batch size, set to 1.
monitor (dict): Monitor configuration details.
Methods:
__iter__: Returns an iterator object.
__next__: Captures the next screenshot and returns it.
"""
def __init__(self, source, imgsz=640):
"""Source = [screen_number left top width height] (pixels)."""
check_requirements('mss')
import mss  # noqa
@ -192,7 +243,28 @@ class LoadScreenshots:
class LoadImages:
"""
YOLOv8 image/video dataloader.
This class manages the loading and pre-processing of image and video data for YOLOv8. It supports loading from
various formats, including single image files, video files, and lists of image and video paths.
Attributes:
imgsz (int): Image size, defaults to 640.
files (list): List of image and video file paths.
nf (int): Total number of files (images and videos).
video_flag (list): Flags indicating whether a file is a video (True) or an image (False).
mode (str): Current mode, 'image' or 'video'.
vid_stride (int): Stride for video frame-rate, defaults to 1.
bs (int): Batch size, set to 1 for this class.
cap (cv2.VideoCapture): Video capture object for OpenCV.
frame (int): Frame counter for video.
frames (int): Total number of frames in the video.
count (int): Counter for iteration, initialized at 0 during `__iter__()`.
Methods:
_new_video(path): Create a new cv2.VideoCapture object for a given video path.
"""
def __init__(self, path, imgsz=640, vid_stride=1):
"""Initialize the Dataloader and raise FileNotFoundError if file not found."""
@ -285,6 +357,24 @@ class LoadImages:
class LoadPilAndNumpy:
"""
Load images from PIL and Numpy arrays for batch processing.
This class is designed to manage loading and pre-processing of image data from both PIL and Numpy formats.
It performs basic validation and format conversion to ensure that the images are in the required format for
downstream processing.
Attributes:
paths (list): List of image paths or autogenerated filenames.
im0 (list): List of images stored as Numpy arrays.
imgsz (int): Image size, defaults to 640.
mode (str): Type of data being processed, defaults to 'image'.
bs (int): Batch size, equivalent to the length of `im0`.
count (int): Counter for iteration, initialized at 0 during `__iter__()`.
Methods:
_single_check(im): Validate and format a single image to a Numpy array.
"""
def __init__(self, im0, imgsz=640):
"""Initialize PIL and Numpy Dataloader."""
@ -326,8 +416,24 @@ class LoadPilAndNumpy:
class LoadTensor:
"""
Load images from torch.Tensor data.
This class manages the loading and pre-processing of image data from PyTorch tensors for further processing.
Attributes:
im0 (torch.Tensor): The input tensor containing the image(s).
bs (int): Batch size, inferred from the shape of `im0`.
mode (str): Current mode, set to 'image'.
paths (list): List of image paths or filenames.
count (int): Counter for iteration, initialized at 0 during `__iter__()`.
Methods:
_single_check(im, stride): Validate and possibly modify the input tensor.
"""
def __init__(self, im0) -> None:
"""Initialize Tensor Dataloader."""
self.im0 = self._single_check(im0)
self.bs = self.im0.shape[0]
self.mode = 'image'
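An illustrative sketch of LoadTensor with a placeholder BCHW tensor:

```python
import torch

loader = LoadTensor(torch.zeros(1, 3, 640, 640))
print(loader.bs, loader.mode)  # 1 image
```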
@ -370,9 +476,7 @@ class LoadTensor:
def autocast_list(source):
"""Merges a list of sources of different types into a list of numpy arrays or PIL images."""
files = []
for im in source:
if isinstance(im, (str, Path)):  # filename or uri
View File
@ -547,9 +547,9 @@ class HUBDatasetStats:
def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
"""
Compresses a single image file to reduced size while preserving its aspect ratio and quality using either the Python
Imaging Library (PIL) or OpenCV library. If the input image is smaller than the maximum dimension, it will not be
resized.
Args:
f (str): The path to the input image file.
View File
@ -986,9 +986,7 @@ class Exporter:
return model

def add_callback(self, event: str, callback):
"""Appends the given callback."""
self.callbacks[event].append(callback)
def run_callbacks(self, event: str):
View File
@ -159,9 +159,7 @@ class Model(nn.Module):
self.overrides['task'] = self.task

def _check_is_pytorch_model(self):
"""Raises TypeError if model is not a PyTorch model."""
pt_str = isinstance(self.model, (str, Path)) and Path(self.model).suffix == '.pt'
pt_module = isinstance(self.model, nn.Module)
if not (pt_module or pt_str):
@ -173,9 +171,7 @@ class Model(nn.Module):
f"argument directly in your inference command, i.e. 'model.predict(source=..., device=0)'") f"argument directly in your inference command, i.e. 'model.predict(source=..., device=0)'")
def reset_weights(self): def reset_weights(self):
""" """Resets the model modules parameters to randomly initialized values, losing all training information."""
Resets the model modules parameters to randomly initialized values, losing all training information.
"""
self._check_is_pytorch_model() self._check_is_pytorch_model()
for m in self.model.modules(): for m in self.model.modules():
if hasattr(m, 'reset_parameters'): if hasattr(m, 'reset_parameters'):
@ -185,9 +181,7 @@ class Model(nn.Module):
return self

def load(self, weights='yolov8n.pt'):
"""Transfers parameters with matching names and shapes from 'weights' to model."""
self._check_is_pytorch_model()
if isinstance(weights, (str, Path)):
weights, self.ckpt = attempt_load_one_weight(weights)
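For example, load() supports transferring pretrained weights into a model built from YAML:

```python
from ultralytics import YOLO

model = YOLO('yolov8n.yaml')  # build from config, randomly initialized
model.load('yolov8n.pt')      # transfer parameters with matching names and shapes
```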
View File
@ -58,7 +58,7 @@ Example:
class BasePredictor:
"""
BasePredictor.

A base class for creating predictors.
@ -109,7 +109,8 @@ class BasePredictor:
callbacks.add_integration_callbacks(self)

def preprocess(self, im):
"""
Prepares input image before inference.
Args:
im (torch.Tensor | List(np.ndarray)): BCHW for tensor, [(HWC) x B] for list.
@ -128,6 +129,7 @@ class BasePredictor:
return im

def inference(self, im, *args, **kwargs):
"""Runs inference on a given image using the specified model and arguments."""
visualize = increment_path(self.save_dir / Path(self.batch[0][0]).stem,
mkdir=True) if self.args.visualize and (not self.source_type.tensor) else False
return self.model(im, augment=self.args.augment, visualize=visualize)
@ -194,7 +196,11 @@ class BasePredictor:
return list(self.stream_inference(source, model, *args, **kwargs))  # merge list of Result into one

def predict_cli(self, source=None, model=None):
"""
Method used for CLI prediction.
It always uses a generator for outputs, since accumulating results is not required in CLI mode.
"""
gen = self.stream_inference(source, model)
for _ in gen:  # running CLI inference without accumulating any outputs (do not modify)
pass
@ -352,7 +358,5 @@ class BasePredictor:
callback(self)

def add_callback(self, event: str, func):
"""Add callback."""
self.callbacks[event].append(func)
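A hedged sketch of the callback hook (the callback body is hypothetical):

```python
from ultralytics import YOLO

def on_predict_start(predictor):
    """Hypothetical callback invoked once when prediction starts."""
    print('Starting prediction with', type(predictor).__name__)

model = YOLO('yolov8n.pt')
model.add_callback('on_predict_start', on_predict_start)
results = model('https://ultralytics.com/images/bus.jpg')
```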
View File
@ -1,6 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Ultralytics Results, Boxes and Masks classes for handling inference results.

Usage: See https://docs.ultralytics.com/modes/predict/
"""
@ -19,12 +19,11 @@ from ultralytics.utils.torch_utils import smart_inference_mode
class BaseTensor(SimpleClass):
"""Base tensor class with additional methods for easy manipulation and device handling."""

def __init__(self, data, orig_shape) -> None:
"""
Initialize BaseTensor with data and original shape.
Args:
data (torch.Tensor | np.ndarray): Predictions, such as bboxes, masks and keypoints.
@ -126,6 +125,18 @@ class Results(SimpleClass):
self.probs = probs

def _apply(self, fn, *args, **kwargs):
"""
Applies a function to all non-empty attributes and returns a new Results object with modified attributes. This
function is internally called by methods like .to(), .cuda(), .cpu(), etc.
Args:
fn (str): The name of the function to apply.
*args: Variable length argument list to pass to the function.
**kwargs: Arbitrary keyword arguments to pass to the function.
Returns:
Results: A new Results object with attributes modified by the applied function.
"""
r = self.new()
for k in self._keys:
v = getattr(self, k)
@ -250,9 +261,7 @@ class Results(SimpleClass):
return annotator.result()

def verbose(self):
"""Return log string for each task."""
log_string = ''
probs = self.probs
boxes = self.boxes
@ -537,6 +546,7 @@ class Probs(BaseTensor):
""" """
def __init__(self, probs, orig_shape=None) -> None: def __init__(self, probs, orig_shape=None) -> None:
"""Initialize the Probs class with classification probabilities and optional original shape of the image."""
super().__init__(probs, orig_shape)

@property
View File
@ -1,6 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Train a model on a dataset.

Usage:
$ yolo mode=train model=yolov8n.pt data=coco128.yaml imgsz=640 epochs=100 batch=16
@ -37,7 +37,7 @@ from ultralytics.utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel,
class BaseTrainer:
"""
BaseTrainer.

A base class for creating trainers.
@ -143,15 +143,11 @@ class BaseTrainer:
callbacks.add_integration_callbacks(self)

def add_callback(self, event: str, callback):
"""Appends the given callback."""
self.callbacks[event].append(callback)

def set_callback(self, event: str, callback):
"""Overrides the existing callbacks with the given callback."""
self.callbacks[event] = [callback]
def run_callbacks(self, event: str):
@ -207,9 +203,7 @@ class BaseTrainer:
world_size=world_size)

def _setup_train(self, world_size):
"""Builds dataloaders and optimizer on correct rank process."""
# Model
self.run_callbacks('on_pretrain_routine_start')
@ -450,14 +444,14 @@ class BaseTrainer:
@staticmethod
def get_dataset(data):
"""
Get train, val path from data dict if it exists.

Returns None if data format is not recognized.
"""
return data['train'], data.get('val') or data.get('test')
def setup_model(self):
"""Load/create/download model for any task."""
if isinstance(self.model, torch.nn.Module):  # if model is loaded beforehand. No setup needed
return
@ -482,14 +476,14 @@ class BaseTrainer:
self.ema.update(self.model)

def preprocess_batch(self, batch):
"""Allows custom preprocessing of model inputs and ground truths depending on task type."""
return batch
def validate(self):
"""
Runs validation on test set using self.validator.

The returned dict is expected to contain "fitness" key.
"""
metrics = self.validator(self)
fitness = metrics.pop('fitness', -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
@ -506,26 +500,20 @@ class BaseTrainer:
raise NotImplementedError('get_validator function not implemented in trainer')

def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
"""Returns dataloader derived from torch.data.Dataloader."""
raise NotImplementedError('get_dataloader function not implemented in trainer')

def build_dataset(self, img_path, mode='train', batch=None):
"""Build dataset."""
raise NotImplementedError('build_dataset function not implemented in trainer')
def label_loss_items(self, loss_items=None, prefix='train'):
"""Returns a loss dict with labelled training loss items tensor."""
# Not needed for classification but necessary for segmentation & detection
return {'loss': loss_items} if loss_items is not None else ['loss']

def set_model_attributes(self):
"""To set or update model parameters before training."""
self.model.names = self.data['names']

def build_targets(self, preds, targets):
@ -632,8 +620,8 @@ class BaseTrainer:
def build_optimizer(self, model, name='auto', lr=0.001, momentum=0.9, decay=1e-5, iterations=1e5):
"""
Constructs an optimizer for the given model, based on the specified optimizer name, learning rate, Constructs an optimizer for the given model, based on the specified optimizer name, learning rate, momentum,
momentum, weight decay, and number of iterations. weight decay, and number of iterations.
Args:
model (torch.nn.Module): The model for which to build an optimizer.
View File
@ -31,32 +31,32 @@ from ultralytics.utils.plotting import plot_tune_results
class Tuner:
"""
Class responsible for hyperparameter tuning of YOLO models.

The class evolves YOLO model hyperparameters over a given number of iterations
by mutating them according to the search space and retraining the model to evaluate their performance.

Attributes:
space (dict): Hyperparameter search space containing bounds and scaling factors for mutation.
tune_dir (Path): Directory where evolution logs and results will be saved.
tune_csv (Path): Path to the CSV file where evolution logs are saved.

Methods:
_mutate(hyp: dict) -> dict:
Mutates the given hyperparameters within the bounds specified in `self.space`.

__call__():
Executes the hyperparameter evolution across multiple iterations.

Example:
Tune hyperparameters for YOLOv8n on COCO8 at imgsz=640 and epochs=30 for 300 tuning iterations.
```python
from ultralytics import YOLO

model = YOLO('yolov8n.pt')
model.tune(data='coco8.yaml', epochs=10, iterations=300, optimizer='AdamW', plots=False, save=False, val=False)
```
"""

def __init__(self, args=DEFAULT_CFG, _callbacks=None):
"""
View File
@ -36,7 +36,7 @@ from ultralytics.utils.torch_utils import de_parallel, select_device, smart_infe
class BaseValidator:
"""
BaseValidator.

A base class for creating validators.
@ -102,8 +102,7 @@ class BaseValidator:
@smart_inference_mode()
def __call__(self, trainer=None, model=None):
"""Supports validation of a pre-trained model if passed or a model being trained if trainer is passed (trainer
gets priority).
"""
self.training = trainer is not None
@ -260,7 +259,7 @@ class BaseValidator:
raise NotImplementedError('get_dataloader function not implemented for this validator')

def build_dataset(self, img_path):
"""Build dataset."""
raise NotImplementedError('build_dataset function not implemented in validator')

def preprocess(self, batch):
View File
@ -80,8 +80,8 @@ def get_export(model_id='', format='torchscript'):
def check_dataset(path='', task='detect'):
"""
Function for error-checking HUB dataset Zip file before upload. It checks a dataset for errors before it is uploaded
to the HUB. Usage examples are given below.
Args:
path (str, optional): Path to data.zip (with data.yaml inside data.zip). Defaults to ''.
View File
@ -9,6 +9,19 @@ API_KEY_URL = f'{HUB_WEB_ROOT}/settings?tab=api+keys'
class Auth:
"""
Manages authentication processes including API key handling, cookie-based authentication, and header generation.
The class supports different methods of authentication:
1. Directly using an API key.
2. Authenticating using browser cookies (specifically in Google Colab).
3. Prompting the user to enter an API key.
Attributes:
id_token (str or bool): Token used for identity verification, initialized as False.
api_key (str or bool): API key for authentication, initialized as False.
model_key (bool): Placeholder for model key, initialized as False.
"""
id_token = api_key = model_key = False

def __init__(self, api_key='', verbose=False):
@ -54,7 +67,9 @@ class Auth:
def request_api_key(self, max_attempts=3):
"""
Prompt the user to input their API key.

Returns the model ID.
"""
import getpass
for attempts in range(max_attempts):
@ -86,8 +101,8 @@ class Auth:
def auth_with_cookies(self) -> bool:
"""
Attempt to fetch authentication via cookies and set id_token. User must be logged in to HUB and running in a
supported browser.
Returns:
bool: True if authentication is successful, False otherwise.
View File
@ -84,6 +84,7 @@ class HUBTrainingSession:
def _handle_signal(self, signum, frame):
"""
Handle kill signals and prevent heartbeats from being sent on Colab after termination.

This method does not use frame, it is included as it is passed by signal.
"""
if self.alive is True:
View File
@ -161,9 +161,7 @@ class Events:
url = 'https://www.google-analytics.com/mp/collect?measurement_id=G-X8NCJYTQXM&api_secret=QLQrATrNSwGRFRLE-cbHJw'

def __init__(self):
"""Initializes the Events object with default values for events, rate_limit, and metadata."""
self.events = []  # events list
self.rate_limit = 60.0  # rate limit (seconds)
self.t = 0.0  # rate limit timer (seconds)
View File
@ -22,7 +22,7 @@ class FastSAM(Model):
""" """
def __init__(self, model='FastSAM-x.pt'): def __init__(self, model='FastSAM-x.pt'):
"""Call the __init__ method of the parent class (YOLO) with the updated default model""" """Call the __init__ method of the parent class (YOLO) with the updated default model."""
if str(model) == 'FastSAM.pt': if str(model) == 'FastSAM.pt':
model = 'FastSAM-x.pt' model = 'FastSAM-x.pt'
assert Path(model).suffix not in ('.yaml', '.yml'), 'FastSAM models only support pre-trained models.' assert Path(model).suffix not in ('.yaml', '.yml'), 'FastSAM models only support pre-trained models.'
@ -30,4 +30,5 @@ class FastSAM(Model):
@property
def task_map(self):
"""Returns a dictionary mapping segment task to corresponding predictor and validator classes."""
return {'segment': {'predictor': FastSAMPredictor, 'validator': FastSAMValidator}}
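A hedged usage sketch, assuming FastSAM is exported at the package level as in current releases:

```python
from ultralytics import FastSAM

model = FastSAM('FastSAM-x.pt')
results = model('path/to/image.jpg')  # 'segment' task resolved via the task_map above
```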
View File
@ -11,10 +11,12 @@ from ultralytics.utils import DEFAULT_CFG, ops
class FastSAMPredictor(DetectionPredictor):

def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""Initializes FastSAMPredictor class by inheriting from DetectionPredictor and setting task to 'segment'."""
super().__init__(cfg, overrides, _callbacks)
self.args.task = 'segment'

def postprocess(self, preds, img, orig_imgs):
"""Postprocesses the predictions, applies non-max suppression, scales the boxes, and returns the results."""
p = ops.non_max_suppression(
preds[0],
self.args.conf,
View File
@ -15,6 +15,7 @@ from ultralytics.utils import TQDM
class FastSAMPrompt:

def __init__(self, source, results, device='cuda') -> None:
"""Initializes FastSAMPrompt with given source, results and device, and assigns clip for linear assignment."""
self.device = device
self.results = results
self.source = source
@ -30,6 +31,7 @@ class FastSAMPrompt:
@staticmethod
def _segment_image(image, bbox):
"""Segments the given image according to the provided bounding box coordinates."""
image_array = np.array(image)
segmented_image_array = np.zeros_like(image_array)
x1, y1, x2, y2 = bbox
@ -45,6 +47,9 @@ class FastSAMPrompt:
@staticmethod
def _format_results(result, filter=0):
"""Formats detection results into list of annotations each containing ID, segmentation, bounding box, score and
area.
"""
annotations = []
n = len(result.masks.data) if result.masks is not None else 0
for i in range(n):
@ -61,6 +66,9 @@ class FastSAMPrompt:
@staticmethod
def _get_bbox_from_mask(mask):
"""Computes the bounding box of a mask by finding its contours and merging multiple contour boxes if
present.
"""
mask = mask.astype(np.uint8)
contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
x1, y1, w, h = cv2.boundingRect(contours[0])
@ -195,6 +203,7 @@ class FastSAMPrompt:
@torch.no_grad()
def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
"""Processes images and text with a model, calculates similarity, and returns softmax score."""
preprocessed_images = [preprocess(image).to(device) for image in elements]
tokenized_text = self.clip.tokenize([search_text]).to(device)
stacked_images = torch.stack(preprocessed_images)
@ -206,6 +215,7 @@ class FastSAMPrompt:
return probs[:, 0].softmax(dim=0)

def _crop_image(self, format_results):
"""Crops an image based on provided annotation format and returns cropped images and related data."""
if os.path.isdir(self.source):
raise ValueError(f"'{self.source}' is a directory, not a valid source for this function.")
image = Image.fromarray(cv2.cvtColor(self.results[0].orig_img, cv2.COLOR_BGR2RGB))
@ -229,6 +239,7 @@ class FastSAMPrompt:
return cropped_boxes, cropped_images, not_crop, filter_id, annotations

def box_prompt(self, bbox):
"""Modifies the bounding box properties and calculates IoU between masks and bounding box."""
if self.results[0].masks is not None:
assert (bbox[2] != 0 and bbox[3] != 0)
if os.path.isdir(self.source):
@ -261,7 +272,8 @@ class FastSAMPrompt:
self.results[0].masks.data = torch.tensor(np.array([masks[max_iou_index].cpu().numpy()]))
return self.results

def point_prompt(self, points, pointlabel):  # numpy
"""Adjusts points on detected masks based on user input and returns the modified results."""
if self.results[0].masks is not None:
if os.path.isdir(self.source):
raise ValueError(f"'{self.source}' is a directory, not a valid source for this function.")
@ -284,6 +296,7 @@ class FastSAMPrompt:
return self.results

def text_prompt(self, text):
"""Processes a text prompt, applies it to existing results and returns the updated results."""
if self.results[0].masks is not None:
format_results = self._format_results(self.results[0], 0)
cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
@ -296,4 +309,5 @@ class FastSAMPrompt:
return self.results

def everything_prompt(self):
"""Returns the processed results from the previous methods in the class."""
return self.results
View File
@ -25,12 +25,13 @@ from .val import NASValidator
class NAS(Model):

def __init__(self, model='yolo_nas_s.pt') -> None:
"""Initializes the NAS model with the provided or default 'yolo_nas_s.pt' model."""
assert Path(model).suffix not in ('.yaml', '.yml'), 'YOLO-NAS models only support pre-trained models.'
super().__init__(model, task='detect')

@smart_inference_mode()
def _load(self, weights: str, task: str):
"""Loads existing NAS model weights or creates a new NAS model with pretrained weights if not provided."""
import super_gradients
suffix = Path(weights).suffix
if suffix == '.pt':
@ -58,4 +59,5 @@ class NAS(Model):
@property
def task_map(self):
"""Returns a dictionary mapping tasks to respective predictor and validator classes."""
return {'detect': {'predictor': NASPredictor, 'validator': NASValidator}}
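A hedged usage sketch, assuming NAS is exported at the package level:

```python
from ultralytics import NAS

model = NAS('yolo_nas_s.pt')  # YOLO-NAS supports pre-trained *.pt weights only
results = model.predict('path/to/image.jpg')
```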
View File
@ -1,7 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""RT-DETR model interface."""

from ultralytics.engine.model import Model
from ultralytics.nn.tasks import RTDETRDetectionModel
@ -11,17 +9,17 @@ from .val import RTDETRValidator
class RTDETR(Model):
"""RTDETR model interface."""

def __init__(self, model='rtdetr-l.pt') -> None:
"""Initializes the RTDETR model with the given model file, defaulting to 'rtdetr-l.pt'."""
if model and model.split('.')[-1] not in ('pt', 'yaml', 'yml'):
raise NotImplementedError('RT-DETR only supports creating from *.pt file or *.yaml file.')
super().__init__(model=model, task='detect')

@property
def task_map(self):
"""Returns a dictionary mapping task names to corresponding Ultralytics task classes for RTDETR model."""
return {
'detect': {
'predictor': RTDETRPredictor,
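A hedged usage sketch mirroring the docstrings above (the image path is a placeholder):

```python
from ultralytics import RTDETR

model = RTDETR('rtdetr-l.pt')
results = model.predict('path/to/image.jpg')  # 'detect' task resolved via the task_map
```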
View File
@ -48,7 +48,8 @@ class RTDETRPredictor(BasePredictor):
return results

def pre_transform(self, im):
"""
Pre-transform input image before inference.
Args:
im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
View File
@ -37,7 +37,8 @@ class RTDETRTrainer(DetectionTrainer):
return model

def build_dataset(self, img_path, mode='val', batch=None):
"""
Build RTDETR Dataset.
Args:
img_path (str): Path to the folder containing images.
View File
@ -16,6 +16,7 @@ __all__ = 'RTDETRValidator', # tuple or list
class RTDETRDataset(YOLODataset):

def __init__(self, *args, data=None, **kwargs):
"""Initialize the RTDETRDataset class by inheriting from the YOLODataset class."""
super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs)

# NOTE: add stretch version load_image for rtdetr mosaic
View File
@ -32,9 +32,10 @@ def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
def calculate_stability_score(masks: torch.Tensor, mask_threshold: float, threshold_offset: float) -> torch.Tensor:
"""
Computes the stability score for a batch of masks.

The stability score is the IoU between the binary masks obtained by thresholding the predicted mask logits at high
and low values.
"""
# One mask is always contained inside the other.
# Save memory by preventing unnecessary cast to torch.int64
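An illustrative call with random logits (shapes and thresholds are placeholders):

```python
import torch

masks = torch.randn(4, 256, 256)  # predicted mask logits
scores = calculate_stability_score(masks, mask_threshold=0.0, threshold_offset=1.0)
print(scores.shape)  # torch.Size([4]), one stability score per mask
```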
@ -60,7 +61,11 @@ def build_all_layer_point_grids(n_per_side: int, n_layers: int, scale_per_layer:
def generate_crop_boxes(im_size: Tuple[int, ...], n_layers: int,
overlap_ratio: float) -> Tuple[List[List[int]], List[int]]:
"""
Generates a list of crop boxes of different sizes.

Each layer has (2**i)**2 boxes for the ith layer.
"""
crop_boxes, layer_idxs = [], []
im_h, im_w = im_size
short_side = min(im_h, im_w)
@ -145,8 +150,9 @@ def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> Tup
def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor: def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
""" """
Calculates boxes in XYXY format around masks. Return [0,0,0,0] for Calculates boxes in XYXY format around masks.
an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4.
Return [0,0,0,0] for an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4.
""" """
# torch.max below raises an error on empty inputs, just skip in this case # torch.max below raises an error on empty inputs, just skip in this case
if torch.numel(masks) == 0: if torch.numel(masks) == 0:
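The reflowed stability-score docstring above fully specifies the computation, so a brief standalone sketch may help readers checking the math. This illustrates the IoU-of-thresholded-masks formula only; it is not the library's exact implementation, which avoids unnecessary integer casts to save memory:

```python
import torch

def stability_score(mask_logits: torch.Tensor, mask_threshold: float, offset: float) -> torch.Tensor:
    """IoU between the binarizations at (threshold + offset) and (threshold - offset)."""
    # The high-threshold mask is always contained in the low-threshold mask,
    # so intersection = high area and union = low area.
    high = (mask_logits > (mask_threshold + offset)).sum(dim=(-1, -2)).float()
    low = (mask_logits > (mask_threshold - offset)).sum(dim=(-1, -2)).float()
    return high / low.clamp(min=1)  # clamp guards against empty low-threshold masks

scores = stability_score(torch.randn(4, 256, 256), mask_threshold=0.0, offset=1.0)
print(scores.shape)  # torch.Size([4])
```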

View File

@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-"""
-SAM model interface
-"""
+"""SAM model interface."""

 from pathlib import Path
@@ -13,16 +11,16 @@ from .predict import Predictor
 class SAM(Model):
-    """
-    SAM model interface.
-    """
+    """SAM model interface."""

     def __init__(self, model='sam_b.pt') -> None:
+        """Initializes the SAM model instance with the specified pre-trained model file."""
         if model and Path(model).suffix not in ('.pt', '.pth'):
             raise NotImplementedError('SAM prediction requires pre-trained *.pt or *.pth model.')
         super().__init__(model=model, task='segment')

     def _load(self, weights: str, task=None):
+        """Loads the provided weights into the SAM model."""
         self.model = build_sam(weights)

     def predict(self, source, stream=False, bboxes=None, points=None, labels=None, **kwargs):
@@ -48,4 +46,5 @@ class SAM(Model):
     @property
     def task_map(self):
+        """Returns a dictionary mapping the 'segment' task to its corresponding 'Predictor'."""
         return {'segment': {'predictor': Predictor}}
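With these hunks the SAM wrapper's public surface (constructor, `_load`, `predict`, `task_map`) is fully documented. A typical prompt-based call would look roughly like the following; the checkpoint and image paths are placeholders that must exist locally:

```python
from ultralytics import SAM

model = SAM('sam_b.pt')  # non-.pt/.pth weights raise NotImplementedError per __init__ above
# Prompt the predictor with one foreground point; 'bus.jpg' is a placeholder image path
results = model.predict('bus.jpg', points=[900, 370], labels=[1])
```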

View File

@@ -98,7 +98,11 @@ class MaskDecoder(nn.Module):
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Predicts masks. See 'forward' for more details."""
+        """
+        Predicts masks.
+
+        See 'forward' for more details.
+        """
         # Concatenate output tokens
         output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
         output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1)

View File

@@ -100,6 +100,9 @@ class ImageEncoderViT(nn.Module):
         )

     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Processes input through patch embedding, applies positional embedding if present, and passes through blocks
+        and neck.
+        """
         x = self.patch_embed(x)
         if self.pos_embed is not None:
             x = x + self.pos_embed
@@ -157,8 +160,8 @@ class PromptEncoder(nn.Module):
     def get_dense_pe(self) -> torch.Tensor:
         """
-        Returns the positional encoding used to encode point prompts,
-        applied to a dense set of points the shape of the image encoding.
+        Returns the positional encoding used to encode point prompts, applied to a dense set of points the shape of the
+        image encoding.

         Returns:
             torch.Tensor: Positional encoding with shape 1x(embed_dim)x(embedding_h)x(embedding_w)
@@ -204,9 +207,7 @@ class PromptEncoder(nn.Module):
         boxes: Optional[torch.Tensor],
         masks: Optional[torch.Tensor],
     ) -> int:
-        """
-        Gets the batch size of the output given the batch size of the input prompts.
-        """
+        """Gets the batch size of the output given the batch size of the input prompts."""
         if points is not None:
             return points[0].shape[0]
         elif boxes is not None:
@@ -217,6 +218,7 @@ class PromptEncoder(nn.Module):
         return 1

     def _get_device(self) -> torch.device:
+        """Returns the device of the first point embedding's weight tensor."""
         return self.point_embeddings[0].weight.device

     def forward(
@@ -259,11 +261,10 @@ class PromptEncoder(nn.Module):
 class PositionEmbeddingRandom(nn.Module):
-    """
-    Positional encoding using random spatial frequencies.
-    """
+    """Positional encoding using random spatial frequencies."""

     def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+        """Initializes a position embedding using random spatial frequencies."""
         super().__init__()
         if scale is None or scale <= 0.0:
             scale = 1.0
@@ -304,7 +305,7 @@ class PositionEmbeddingRandom(nn.Module):
 class Block(nn.Module):
-    """Transformer blocks with support of window attention and residual propagation blocks"""
+    """Transformer blocks with support of window attention and residual propagation blocks."""

     def __init__(
         self,
@@ -351,6 +352,7 @@ class Block(nn.Module):
         self.window_size = window_size

     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Executes a forward pass through the transformer block with window attention and non-overlapping windows."""
         shortcut = x
         x = self.norm1(x)
         # Window partition
@@ -404,6 +406,7 @@ class Attention(nn.Module):
         self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Applies the forward operation including attention, normalization, MLP, and indexing within window limits."""
         B, H, W, _ = x.shape
         # qkv with shape (3, B, nHead, H * W, C)
         qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
@@ -448,6 +451,7 @@ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[in
                        hw: Tuple[int, int]) -> torch.Tensor:
     """
     Window unpartition into original sequences and removing padding.
+
     Args:
         windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
         window_size (int): window size.
@@ -540,9 +544,7 @@ def add_decomposed_rel_pos(
 class PatchEmbed(nn.Module):
-    """
-    Image to Patch Embedding.
-    """
+    """Image to Patch Embedding."""

     def __init__(
         self,
@@ -565,4 +567,5 @@ class PatchEmbed(nn.Module):
         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Computes patch embedding by applying convolution and transposing resulting tensor."""
         return self.proj(x).permute(0, 2, 3, 1)  # B C H W -> B H W C
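The new `PatchEmbed.forward` docstring describes a convolution followed by a channel-last transpose. A quick shape check, using assumed ViT-B-style sizes (16x16 patches, 768 channels) rather than values taken from this diff:

```python
import torch
import torch.nn as nn

# Assumed ViT-B-style patch embedding: 16x16 patches, 768-dim embeddings
proj = nn.Conv2d(3, 768, kernel_size=16, stride=16)
x = torch.randn(1, 3, 1024, 1024)
out = proj(x).permute(0, 2, 3, 1)  # B C H W -> B H W C, as in the forward above
assert out.shape == (1, 64, 64, 768)  # 1024 / 16 = 64 patches per side
```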

View File

@@ -23,6 +23,9 @@ from ultralytics.utils.instance import to_2tuple
 class Conv2d_BN(torch.nn.Sequential):

     def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1):
+        """Initializes the MBConv model with given input channels, output channels, expansion ratio, activation, and
+        drop path.
+        """
         super().__init__()
         self.add_module('c', torch.nn.Conv2d(a, b, ks, stride, pad, dilation, groups, bias=False))
         bn = torch.nn.BatchNorm2d(b)
@@ -34,6 +37,9 @@ class Conv2d_BN(torch.nn.Sequential):
 class PatchEmbed(nn.Module):

     def __init__(self, in_chans, embed_dim, resolution, activation):
+        """Initialize the PatchMerging class with specified input, output dimensions, resolution and activation
+        function.
+        """
         super().__init__()
         img_size: Tuple[int, int] = to_2tuple(resolution)
         self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
@@ -48,12 +54,16 @@ class PatchEmbed(nn.Module):
         )

     def forward(self, x):
+        """Runs input tensor 'x' through the PatchMerging model's sequence of operations."""
         return self.seq(x)


 class MBConv(nn.Module):

     def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path):
+        """Initializes a convolutional layer with specified dimensions, input resolution, depth, and activation
+        function.
+        """
         super().__init__()
         self.in_chans = in_chans
         self.hidden_chans = int(in_chans * expand_ratio)
@@ -73,6 +83,7 @@ class MBConv(nn.Module):
         self.drop_path = nn.Identity()

     def forward(self, x):
+        """Implements the forward pass for the model architecture."""
         shortcut = x
         x = self.conv1(x)
         x = self.act1(x)
@@ -87,6 +98,9 @@ class MBConv(nn.Module):
 class PatchMerging(nn.Module):

     def __init__(self, input_resolution, dim, out_dim, activation):
+        """Initializes the ConvLayer with specific dimension, input resolution, depth, activation, drop path, and other
+        optional parameters.
+        """
         super().__init__()

         self.input_resolution = input_resolution
@@ -99,6 +113,7 @@ class PatchMerging(nn.Module):
         self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)

     def forward(self, x):
+        """Applies forward pass on the input utilizing convolution and activation layers, and returns the result."""
         if x.ndim == 3:
             H, W = self.input_resolution
             B = len(x)
@@ -149,6 +164,7 @@ class ConvLayer(nn.Module):
             input_resolution, dim=dim, out_dim=out_dim, activation=activation)

     def forward(self, x):
+        """Processes the input through a series of convolutional layers and returns the activated output."""
         for blk in self.blocks:
             x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
         return x if self.downsample is None else self.downsample(x)
@@ -157,6 +173,7 @@ class ConvLayer(nn.Module):
 class Mlp(nn.Module):

     def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        """Initializes Attention module with the given parameters including dimension, key_dim, number of heads, etc."""
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features
@@ -167,6 +184,7 @@ class Mlp(nn.Module):
         self.drop = nn.Dropout(drop)

     def forward(self, x):
+        """Applies operations on input x and returns modified x, runs downsample if not None."""
         x = self.norm(x)
         x = self.fc1(x)
         x = self.act(x)
@@ -216,6 +234,7 @@ class Attention(torch.nn.Module):
     @torch.no_grad()
     def train(self, mode=True):
+        """Sets the module in training mode and handles attribute 'ab' based on the mode."""
         super().train(mode)
         if mode and hasattr(self, 'ab'):
             del self.ab
@@ -298,6 +317,9 @@ class TinyViTBlock(nn.Module):
         self.local_conv = Conv2d_BN(dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim)

     def forward(self, x):
+        """Applies attention-based transformation or padding to input 'x' before passing it through a local
+        convolution.
+        """
         H, W = self.input_resolution
         B, L, C = x.shape
         assert L == H * W, 'input feature has wrong size'
@@ -337,6 +359,9 @@ class TinyViTBlock(nn.Module):
         return x + self.drop_path(self.mlp(x))

     def extra_repr(self) -> str:
+        """Returns a formatted string representing the TinyViTBlock's parameters: dimension, input resolution, number of
+        attentions heads, window size, and MLP ratio.
+        """
         return f'dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, ' \
                f'window_size={self.window_size}, mlp_ratio={self.mlp_ratio}'
@@ -402,23 +427,28 @@ class BasicLayer(nn.Module):
             input_resolution, dim=dim, out_dim=out_dim, activation=activation)

     def forward(self, x):
+        """Performs forward propagation on the input tensor and returns a normalized tensor."""
         for blk in self.blocks:
             x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
         return x if self.downsample is None else self.downsample(x)

     def extra_repr(self) -> str:
+        """Returns a string representation of the extra_repr function with the layer's parameters."""
         return f'dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}'


 class LayerNorm2d(nn.Module):
+    """A PyTorch implementation of Layer Normalization in 2D."""

     def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        """Initialize LayerNorm2d with the number of channels and an optional epsilon."""
         super().__init__()
         self.weight = nn.Parameter(torch.ones(num_channels))
         self.bias = nn.Parameter(torch.zeros(num_channels))
         self.eps = eps

     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Perform a forward pass, normalizing the input tensor."""
         u = x.mean(1, keepdim=True)
         s = (x - u).pow(2).mean(1, keepdim=True)
         x = (x - u) / torch.sqrt(s + self.eps)
@@ -518,6 +548,7 @@ class TinyViT(nn.Module):
         )

     def set_layer_lr_decay(self, layer_lr_decay):
+        """Sets the learning rate decay for each layer in the TinyViT model."""
         decay_rate = layer_lr_decay

         # layers -> blocks (depth)
@@ -525,6 +556,7 @@ class TinyViT(nn.Module):
         lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]

         def _set_lr_scale(m, scale):
+            """Sets the learning rate scale for each layer in the model based on the layer's depth."""
             for p in m.parameters():
                 p.lr_scale = scale
@@ -544,12 +576,14 @@ class TinyViT(nn.Module):
             p.param_name = k

         def _check_lr_scale(m):
+            """Checks if the learning rate scale attribute is present in module's parameters."""
             for p in m.parameters():
                 assert hasattr(p, 'lr_scale'), p.param_name

         self.apply(_check_lr_scale)

     def _init_weights(self, m):
+        """Initializes weights for linear layers and layer normalization in the given module."""
         if isinstance(m, nn.Linear):
             # NOTE: This initialization is needed only for training.
             # trunc_normal_(m.weight, std=.02)
@@ -561,11 +595,12 @@ class TinyViT(nn.Module):
     @torch.jit.ignore
     def no_weight_decay_keywords(self):
+        """Returns a dictionary of parameter names where weight decay should not be applied."""
         return {'attention_biases'}

     def forward_features(self, x):
-        # x: (N, C, H, W)
-        x = self.patch_embed(x)
+        """Runs the input through the model layers and returns the transformed output."""
+        x = self.patch_embed(x)  # x input is (N, C, H, W)
         x = self.layers[0](x)
         start_i = 1
@@ -579,4 +614,5 @@ class TinyViT(nn.Module):
         return self.neck(x)

     def forward(self, x):
+        """Executes a forward pass on the input tensor through the constructed model layers."""
         return self.forward_features(x)
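The `set_layer_lr_decay` docstrings added above describe a per-block learning-rate schedule, and the `lr_scales` line in the hunk makes the rule concrete. A tiny numeric illustration with assumed values for `layer_lr_decay` and total block depth:

```python
# Assumed example values; the real ones come from the TinyViT configuration
decay_rate, depth = 0.8, 12
lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]  # same rule as in the hunk
print(round(lr_scales[0], 4), lr_scales[-1])  # 0.0859 1.0 -> earliest block decayed the most
```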

View File

@@ -21,8 +21,7 @@ class TwoWayTransformer(nn.Module):
         attention_downsample_rate: int = 2,
     ) -> None:
         """
-        A transformer decoder that attends to an input image using
-        queries whose positional embedding is supplied.
+        A transformer decoder that attends to an input image using queries whose positional embedding is supplied.

         Args:
             depth (int): number of layers in the transformer
@@ -171,8 +170,7 @@ class TwoWayAttentionBlock(nn.Module):
 class Attention(nn.Module):
-    """
-    An attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
+    """An attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
     values.
     """

View File

@@ -19,6 +19,7 @@ from .build import build_sam
 class Predictor(BasePredictor):

     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
+        """Initializes the Predictor class with default or provided configuration, overrides, and callbacks."""
         if overrides is None:
             overrides = {}
         overrides.update(dict(task='segment', mode='predict', imgsz=1024))
@@ -34,7 +35,8 @@ class Predictor(BasePredictor):
         self.segment_all = False

     def preprocess(self, im):
-        """Prepares input image before inference.
+        """
+        Prepares input image before inference.

         Args:
             im (torch.Tensor | List(np.ndarray)): BCHW for tensor, [(HWC) x B] for list.
@@ -189,7 +191,8 @@ class Predictor(BasePredictor):
                      stability_score_thresh=0.95,
                      stability_score_offset=0.95,
                      crop_nms_thresh=0.7):
-        """Segment the whole image.
+        """
+        Segment the whole image.

         Args:
             im (torch.Tensor): The preprocessed image, (N, C, H, W).
@@ -360,14 +363,15 @@ class Predictor(BasePredictor):
         self.prompts = prompts

     def reset_image(self):
+        """Resets the image and its features to None."""
         self.im = None
         self.features = None

     @staticmethod
     def remove_small_regions(masks, min_area=0, nms_thresh=0.7):
         """
-        Removes small disconnected regions and holes in masks, then reruns
-        box NMS to remove any new duplicates. Requires open-cv as a dependency.
+        Removes small disconnected regions and holes in masks, then reruns box NMS to remove any new duplicates.
+
+        Requires open-cv as a dependency.

         Args:
             masks (torch.Tensor): Masks, (N, H, W).

View File

@@ -47,6 +47,7 @@ class DETRLoss(nn.Module):
         self.device = None

     def _get_loss_class(self, pred_scores, targets, gt_scores, num_gts, postfix=''):
+        """Computes the classification loss based on predictions, target values, and ground truth scores."""
         # logits: [b, query, num_classes], gt_class: list[[n, 1]]
         name_class = f'loss_class{postfix}'
         bs, nq = pred_scores.shape[:2]
@@ -68,6 +69,9 @@ class DETRLoss(nn.Module):
         return {name_class: loss_cls.squeeze() * self.loss_gain['class']}

     def _get_loss_bbox(self, pred_bboxes, gt_bboxes, postfix=''):
+        """Calculates and returns the bounding box loss and GIoU loss for the predicted and ground truth bounding
+        boxes.
+        """
         # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
         name_bbox = f'loss_bbox{postfix}'
         name_giou = f'loss_giou{postfix}'
@@ -125,7 +129,7 @@ class DETRLoss(nn.Module):
                       postfix='',
                       masks=None,
                       gt_mask=None):
-        """Get auxiliary losses"""
+        """Get auxiliary losses."""
         # NOTE: loss class, bbox, giou, mask, dice
         loss = torch.zeros(5 if masks is not None else 3, device=pred_bboxes.device)
         if match_indices is None and self.use_uni_match:
@@ -166,12 +170,14 @@ class DETRLoss(nn.Module):
     @staticmethod
     def _get_index(match_indices):
+        """Returns batch indices, source indices, and destination indices from provided match indices."""
         batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(match_indices)])
         src_idx = torch.cat([src for (src, _) in match_indices])
         dst_idx = torch.cat([dst for (_, dst) in match_indices])
         return (batch_idx, src_idx), dst_idx

     def _get_assigned_bboxes(self, pred_bboxes, gt_bboxes, match_indices):
+        """Assigns predicted bounding boxes to ground truth bounding boxes based on the match indices."""
         pred_assigned = torch.cat([
             t[I] if len(I) > 0 else torch.zeros(0, t.shape[-1], device=self.device)
             for t, (I, _) in zip(pred_bboxes, match_indices)])
@@ -190,7 +196,7 @@ class DETRLoss(nn.Module):
                 gt_mask=None,
                 postfix='',
                 match_indices=None):
-        """Get losses"""
+        """Get losses."""
         if match_indices is None:
             match_indices = self.matcher(pred_bboxes,
                                          pred_scores,
@@ -250,22 +256,43 @@ class DETRLoss(nn.Module):
 class RTDETRDetectionLoss(DETRLoss):
+    """
+    Real-Time DeepTracker (RT-DETR) Detection Loss class that extends the DETRLoss.
+
+    This class computes the detection loss for the RT-DETR model, which includes the standard detection loss as well as
+    an additional denoising training loss when provided with denoising metadata.
+    """

     def forward(self, preds, batch, dn_bboxes=None, dn_scores=None, dn_meta=None):
+        """
+        Forward pass to compute the detection loss.
+
+        Args:
+            preds (tuple): Predicted bounding boxes and scores.
+            batch (dict): Batch data containing ground truth information.
+            dn_bboxes (torch.Tensor, optional): Denoising bounding boxes. Default is None.
+            dn_scores (torch.Tensor, optional): Denoising scores. Default is None.
+            dn_meta (dict, optional): Metadata for denoising. Default is None.
+
+        Returns:
+            (dict): Dictionary containing the total loss and, if applicable, the denoising loss.
+        """
         pred_bboxes, pred_scores = preds
         total_loss = super().forward(pred_bboxes, pred_scores, batch)

+        # Check for denoising metadata to compute denoising training loss
         if dn_meta is not None:
             dn_pos_idx, dn_num_group = dn_meta['dn_pos_idx'], dn_meta['dn_num_group']
             assert len(batch['gt_groups']) == len(dn_pos_idx)

-            # Denoising match indices
+            # Get the match indices for denoising
             match_indices = self.get_dn_match_indices(dn_pos_idx, dn_num_group, batch['gt_groups'])

-            # Compute denoising training loss
+            # Compute the denoising training loss
             dn_loss = super().forward(dn_bboxes, dn_scores, batch, postfix='_dn', match_indices=match_indices)
             total_loss.update(dn_loss)
         else:
+            # If no denoising metadata is provided, set denoising loss to zero
             total_loss.update({f'{k}_dn': torch.tensor(0., device=self.device) for k in total_loss.keys()})

         return total_loss
@@ -276,12 +303,12 @@ class RTDETRDetectionLoss(DETRLoss):
         Get the match indices for denoising.

         Args:
-            dn_pos_idx (List[torch.Tensor]): A list includes positive indices of denoising.
-            dn_num_group (int): The number of groups of denoising.
-            gt_groups (List(int)): a list of batch size length includes the number of gts of each image.
+            dn_pos_idx (List[torch.Tensor]): List of tensors containing positive indices for denoising.
+            dn_num_group (int): Number of denoising groups.
+            gt_groups (List[int]): List of integers representing the number of ground truths for each image.

         Returns:
-            dn_match_indices (List(tuple)): Matched indices.
+            (List[tuple]): List of tuples containing matched indices for denoising.
         """
         dn_match_indices = []
         idx_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)
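The clarified `gt_groups` description maps directly onto the `idx_groups` line shown in this hunk, which turns per-image ground-truth counts into start offsets. A minimal illustration:

```python
import torch

gt_groups = [2, 0, 3]  # number of ground truths per image in a batch of 3
idx_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)  # same expression as in the hunk
print(idx_groups)  # tensor([0, 2, 2]) -> offset of each image's GTs in the flattened tensor
```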

View File

@@ -11,8 +11,8 @@ from ultralytics.utils.ops import xywh2xyxy, xyxy2xywh
 class HungarianMatcher(nn.Module):
     """
-    A module implementing the HungarianMatcher, which is a differentiable module to solve the assignment problem in
-    an end-to-end fashion.
+    A module implementing the HungarianMatcher, which is a differentiable module to solve the assignment problem in an
+    end-to-end fashion.

     HungarianMatcher performs optimal assignment over the predicted and ground truth bounding boxes using a cost
     function that considers classification scores, bounding box coordinates, and optionally, mask predictions.
@@ -32,6 +32,9 @@ class HungarianMatcher(nn.Module):
     """

     def __init__(self, cost_gain=None, use_fl=True, with_mask=False, num_sample_points=12544, alpha=0.25, gamma=2.0):
+        """Initializes HungarianMatcher with cost coefficients, Focal Loss, mask prediction, sample points, and alpha
+        gamma factors.
+        """
         super().__init__()
         if cost_gain is None:
             cost_gain = {'class': 1, 'bbox': 5, 'giou': 2, 'mask': 1, 'dice': 1}
@@ -45,8 +48,8 @@ class HungarianMatcher(nn.Module):
     def forward(self, pred_bboxes, pred_scores, gt_bboxes, gt_cls, gt_groups, masks=None, gt_mask=None):
         """
         Forward pass for HungarianMatcher. This function computes costs based on prediction and ground truth
-        (classification cost, L1 cost between boxes and GIoU cost between boxes) and finds the optimal matching
-        between predictions and ground truth based on these costs.
+        (classification cost, L1 cost between boxes and GIoU cost between boxes) and finds the optimal matching between
+        predictions and ground truth based on these costs.

         Args:
             pred_bboxes (Tensor): Predicted bounding boxes with shape [batch_size, num_queries, 4].
@@ -153,9 +156,9 @@ def get_cdn_group(batch,
                   box_noise_scale=1.0,
                   training=False):
     """
-    Get contrastive denoising training group. This function creates a contrastive denoising training group with
-    positive and negative samples from the ground truths (gt). It applies noise to the class labels and bounding
-    box coordinates, and returns the modified labels, bounding boxes, attention mask and meta information.
+    Get contrastive denoising training group. This function creates a contrastive denoising training group with positive
+    and negative samples from the ground truths (gt). It applies noise to the class labels and bounding box coordinates,
+    and returns the modified labels, bounding boxes, attention mask and meta information.

     Args:
         batch (dict): A dict that includes 'gt_cls' (torch.Tensor with shape [num_gts, ]), 'gt_bboxes'
@@ -191,12 +194,12 @@ def get_cdn_group(batch,
     gt_bbox = batch['bboxes']  # bs*num, 4
     b_idx = batch['batch_idx']

-    # each group has positive and negative queries.
+    # Each group has positive and negative queries.
     dn_cls = gt_cls.repeat(2 * num_group)  # (2*num_group*bs*num, )
     dn_bbox = gt_bbox.repeat(2 * num_group, 1)  # 2*num_group*bs*num, 4
     dn_b_idx = b_idx.repeat(2 * num_group).view(-1)  # (2*num_group*bs*num, )

-    # positive and negative mask
+    # Positive and negative mask
     # (bs*num*num_group, ), the second total_num*num_group part as negative samples
     neg_idx = torch.arange(total_num * num_group, dtype=torch.long, device=gt_bbox.device) + num_group * total_num
@@ -220,10 +223,9 @@ def get_cdn_group(batch,
         known_bbox += rand_part * diff
         known_bbox.clip_(min=0.0, max=1.0)
         dn_bbox = xyxy2xywh(known_bbox)
-        dn_bbox = inverse_sigmoid(dn_bbox)
+        dn_bbox = torch.logit(dn_bbox, eps=1e-6)  # inverse sigmoid

-    # total denoising queries
-    num_dn = int(max_nums * 2 * num_group)
+    num_dn = int(max_nums * 2 * num_group)  # total denoising queries
     # class_embed = torch.cat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)])
     dn_cls_embed = class_embed[dn_cls]  # bs*num * 2 * num_group, 256
     padding_cls = torch.zeros(bs, num_dn, dn_cls_embed.shape[-1], device=gt_cls.device)
@@ -256,9 +258,3 @@ def get_cdn_group(batch,
     return padding_cls.to(class_embed.device), padding_bbox.to(class_embed.device), attn_mask.to(
         class_embed.device), dn_meta
-
-
-def inverse_sigmoid(x, eps=1e-6):
-    """Inverse sigmoid function."""
-    x = x.clip(min=0., max=1.)
-    return torch.log(x / (1 - x + eps) + eps)
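The removed `inverse_sigmoid` helper is replaced by the built-in `torch.logit` call shown above. For values away from the 0/1 boundaries the two agree closely; a quick check (the tolerance is loose because the eps handling differs slightly between the two forms):

```python
import torch

x = torch.rand(4).clamp(0.01, 0.99)  # keep away from 0/1 where eps handling dominates
manual = torch.log(x.clip(min=0., max=1.) / (1 - x + 1e-6) + 1e-6)  # removed helper, inlined
builtin = torch.logit(x, eps=1e-6)  # replacement used in get_cdn_group
print(torch.allclose(manual, builtin, atol=1e-3))  # True
```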

View File

@@ -26,6 +26,7 @@ class ClassificationPredictor(BasePredictor):
     """

     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
+        """Initializes ClassificationPredictor setting the task to 'classify'."""
         super().__init__(cfg, overrides, _callbacks)
         self.args.task = 'classify'

View File

@@ -79,6 +79,7 @@ class ClassificationTrainer(BaseTrainer):
         return ckpt

     def build_dataset(self, img_path, mode='train', batch=None):
+        """Creates a ClassificationDataset instance given an image path, and mode (train/test etc.)."""
         return ClassificationDataset(root=img_path, args=self.args, augment=mode == 'train', prefix=mode)

     def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
@@ -113,8 +114,9 @@ class ClassificationTrainer(BaseTrainer):
     def label_loss_items(self, loss_items=None, prefix='train'):
         """
-        Returns a loss dict with labelled training loss items tensor. Not needed for classification but necessary for
-        segmentation & detection
+        Returns a loss dict with labelled training loss items tensor.
+
+        Not needed for classification but necessary for segmentation & detection
         """
         keys = [f'{prefix}/{x}' for x in self.loss_names]
         if loss_items is None:

View File

@@ -78,6 +78,7 @@ class ClassificationValidator(BaseValidator):
         return self.metrics.results_dict

     def build_dataset(self, img_path):
+        """Creates and returns a ClassificationDataset instance using given image path and preprocessing parameters."""
         return ClassificationDataset(root=img_path, args=self.args, augment=False, prefix=self.args.split)

     def get_dataloader(self, dataset_path, batch_size):

View File

@@ -57,7 +57,7 @@ class DetectionTrainer(BaseTrainer):
         return batch

     def set_model_attributes(self):
-        """nl = de_parallel(self.model).model[-1].nl  # number of detection layers (to scale hyps)."""
+        """Nl = de_parallel(self.model).model[-1].nl  # number of detection layers (to scale hyps)."""
         # self.args.box *= 3 / nl  # scale to layers
         # self.args.cls *= self.data["nc"] / 80 * 3 / nl  # scale to classes and layers
         # self.args.cls *= (self.args.imgsz / 640) ** 2 * 3 / nl  # scale to image size and layers
@@ -80,8 +80,9 @@ class DetectionTrainer(BaseTrainer):
     def label_loss_items(self, loss_items=None, prefix='train'):
         """
-        Returns a loss dict with labelled training loss items tensor. Not needed for classification but necessary for
-        segmentation & detection
+        Returns a loss dict with labelled training loss items tensor.
+
+        Not needed for classification but necessary for segmentation & detection
         """
         keys = [f'{prefix}/{x}' for x in self.loss_names]
         if loss_items is not None:

View File

@@ -6,13 +6,11 @@ from ultralytics.nn.tasks import ClassificationModel, DetectionModel, PoseModel,
 class YOLO(Model):
-    """
-    YOLO (You Only Look Once) object detection model.
-    """
+    """YOLO (You Only Look Once) object detection model."""

     @property
     def task_map(self):
-        """Map head to model, trainer, validator, and predictor classes"""
+        """Map head to model, trainer, validator, and predictor classes."""
         return {
             'classify': {
                 'model': ClassificationModel,
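The slimmed-down `YOLO` class docstring sits right above `task_map`, which routes each task name to its model, trainer, validator, and predictor classes. A rough usage sketch; treat the exact key set as illustrative rather than guaranteed by this diff:

```python
from ultralytics import YOLO

model = YOLO('yolov8n.pt')  # detection weights, so the active task is 'detect'
print(model.task)  # 'detect'
print(sorted(model.task_map[model.task]))  # e.g. ['model', 'predictor', 'trainer', 'validator']
```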

View File

@@ -21,6 +21,7 @@ class PosePredictor(DetectionPredictor):
     """

     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
+        """Initializes PosePredictor, sets task to 'pose' and logs a warning for using 'mps' as device."""
         super().__init__(cfg, overrides, _callbacks)
         self.args.task = 'pose'
         if isinstance(self.args.device, str) and self.args.device.lower() == 'mps':

View File

@@ -21,10 +21,12 @@ class SegmentationPredictor(DetectionPredictor):
     """

     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
+        """Initializes the SegmentationPredictor with the provided configuration, overrides, and callbacks."""
         super().__init__(cfg, overrides, _callbacks)
         self.args.task = 'segment'

     def postprocess(self, preds, img, orig_imgs):
+        """Applies non-max suppression and processes detections for each image in an input batch."""
         p = ops.non_max_suppression(preds[0],
                                     self.args.conf,
                                     self.args.iou,

View File

@@ -144,7 +144,7 @@ class SegmentationValidator(DetectionValidator):
     def _process_batch(self, detections, labels, pred_masks=None, gt_masks=None, overlap=False, masks=False):
         """
-        Return correct prediction matrix
+        Return correct prediction matrix.

         Args:
             detections (array[N, 6]), x1, y1, x2, y2, conf, class

View File

@@ -20,7 +20,11 @@ from ultralytics.utils.downloads import attempt_download_asset, is_url
 def check_class_names(names):
-    """Check class names. Map imagenet class codes to human-readable names if required. Convert lists to dicts."""
+    """
+    Check class names.
+
+    Map imagenet class codes to human-readable names if required. Convert lists to dicts.
+    """
     if isinstance(names, list):  # names is a list
         names = dict(enumerate(names))  # convert to dict
     if isinstance(names, dict):
@@ -37,6 +41,32 @@ def check_class_names(names):
 class AutoBackend(nn.Module):
+    """
+    Handles dynamic backend selection for running inference using Ultralytics YOLO models.
+
+    The AutoBackend class is designed to provide an abstraction layer for various inference engines. It supports a wide
+    range of formats, each with specific naming conventions as outlined below:
+
+        Supported Formats and Naming Conventions:
+            | Format                | File Suffix      |
+            |-----------------------|------------------|
+            | PyTorch               | *.pt             |
+            | TorchScript           | *.torchscript    |
+            | ONNX Runtime          | *.onnx           |
+            | ONNX OpenCV DNN       | *.onnx (dnn=True)|
+            | OpenVINO              | *openvino_model/ |
+            | CoreML                | *.mlpackage      |
+            | TensorRT              | *.engine         |
+            | TensorFlow SavedModel | *_saved_model    |
+            | TensorFlow GraphDef   | *.pb             |
+            | TensorFlow Lite       | *.tflite         |
+            | TensorFlow Edge TPU   | *_edgetpu.tflite |
+            | PaddlePaddle          | *_paddle_model   |
+            | ncnn                  | *_ncnn_model     |
+
+    This class offers dynamic backend switching capabilities based on the input model format, making it easier to deploy
+    models across various platforms.
+    """

     @torch.no_grad()
     def __init__(self,
@@ -48,33 +78,16 @@ class AutoBackend(nn.Module):
                  fuse=True,
                  verbose=True):
         """
-        MultiBackend class for python inference on various platforms using Ultralytics YOLO.
+        Initialize the AutoBackend for inference.

         Args:
-            weights (str): The path to the weights file. Default: 'yolov8n.pt'
-            device (torch.device): The device to run the model on.
-            dnn (bool): Use OpenCV DNN module for inference if True, defaults to False.
-            data (str | Path | optional): Additional data.yaml file for class names.
-            fp16 (bool): If True, use half precision. Default: False
-            fuse (bool): Whether to fuse the model or not. Default: True
-            verbose (bool): Whether to run in verbose mode or not. Default: True
-
-        Supported formats and their naming conventions:
-            | Format                | Suffix           |
-            |-----------------------|------------------|
-            | PyTorch               | *.pt             |
-            | TorchScript           | *.torchscript    |
-            | ONNX Runtime          | *.onnx           |
-            | ONNX OpenCV DNN       | *.onnx dnn=True  |
-            | OpenVINO              | *.xml            |
-            | CoreML                | *.mlpackage      |
-            | TensorRT              | *.engine         |
-            | TensorFlow SavedModel | *_saved_model    |
-            | TensorFlow GraphDef   | *.pb             |
-            | TensorFlow Lite       | *.tflite         |
-            | TensorFlow Edge TPU   | *_edgetpu.tflite |
-            | PaddlePaddle          | *_paddle_model   |
-            | ncnn                  | *_ncnn_model     |
+            weights (str): Path to the model weights file. Defaults to 'yolov8n.pt'.
+            device (torch.device): Device to run the model on. Defaults to CPU.
+            dnn (bool): Use OpenCV DNN module for ONNX inference. Defaults to False.
+            data (str | Path | optional): Path to the additional data.yaml file containing class names. Optional.
+            fp16 (bool): Enable half-precision inference. Supported only on specific backends. Defaults to False.
+            fuse (bool): Fuse Conv2D + BatchNorm layers for optimization. Defaults to True.
+            verbose (bool): Enable verbose logging. Defaults to True.
         """
         super().__init__()
         w = str(weights[0] if isinstance(weights, list) else weights)
@@ -440,14 +453,14 @@ class AutoBackend(nn.Module):
     def from_numpy(self, x):
         """
         Convert a numpy array to a tensor.

         Args:
             x (np.ndarray): The array to be converted.

         Returns:
             (torch.Tensor): The converted tensor
         """
         return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x

     def warmup(self, imgsz=(1, 3, 640, 640)):
@@ -476,7 +489,7 @@ class AutoBackend(nn.Module):
     @staticmethod
     def _model_type(p='path/to/model.pt'):
         """
-        This function takes a path to a model file and returns the model type
+        This function takes a path to a model file and returns the model type.

         Args:
             p: path to the model file. Defaults to path/to/model.pt
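Since `check_class_names` now documents its list-to-dict conversion on its own line, the behaviour is easy to pin down with a two-liner mirroring the `dict(enumerate(names))` call in the hunk above:

```python
names = ['person', 'bicycle', 'car']  # a list, as found in some older checkpoints
print(dict(enumerate(names)))  # {0: 'person', 1: 'bicycle', 2: 'car'}
```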

View File

@@ -1,16 +1,20 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 """
-Ultralytics modules. Visualize with:
+Ultralytics modules.

-from ultralytics.nn.modules import *
-import torch
-import os
+Example:
+    Visualize a module with Netron.
+    ```python
+    from ultralytics.nn.modules import *
+    import torch
+    import os

-x = torch.ones(1, 128, 40, 40)
-m = Conv(128, 128)
-f = f'{m._get_name()}.onnx'
-torch.onnx.export(m, x, f)
-os.system(f'onnxsim {f} {f} && open {f}')
+    x = torch.ones(1, 128, 40, 40)
+    m = Conv(128, 128)
+    f = f'{m._get_name()}.onnx'
+    torch.onnx.export(m, x, f)
+    os.system(f'onnxsim {f} {f} && open {f}')
+    ```
 """

 from .block import (C1, C2, C3, C3TR, DFL, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, GhostBottleneck,

View File

@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-"""
-Block modules
-"""
+"""Block modules."""

 import torch
 import torch.nn as nn
@@ -17,6 +15,7 @@ __all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', '
 class DFL(nn.Module):
     """
     Integral module of Distribution Focal Loss (DFL).
+
     Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
     """
@@ -51,11 +50,14 @@ class Proto(nn.Module):
 class HGStem(nn.Module):
-    """StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.
+    """
+    StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.
+
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
     """

     def __init__(self, c1, cm, c2):
+        """Initialize the SPP layer with input/output channels and specified kernel sizes for max pooling."""
         super().__init__()
         self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU())
         self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU())
@@ -79,11 +81,14 @@ class HGStem(nn.Module):
 class HGBlock(nn.Module):
-    """HG_Block of PPHGNetV2 with 2 convolutions and LightConv.
+    """
+    HG_Block of PPHGNetV2 with 2 convolutions and LightConv.
+
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
     """

     def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
+        """Initializes a CSP Bottleneck with 1 convolution using specified input and output channels."""
         super().__init__()
         block = LightConv if lightconv else Conv
         self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n))
@@ -218,6 +223,7 @@ class RepC3(nn.Module):
     """Rep C3."""

     def __init__(self, c1, c2, n=3, e=1.0):
+        """Initialize CSP Bottleneck with a single convolution using input channels, output channels, and number."""
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c2, 1, 1)
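The DFL docstring above (now with a blank line after its summary) describes an integral module: a discrete distribution over bins is reduced to its expectation. A sketch of that integral step, independent of the module's conv-based implementation; the bin count is an assumed common setting:

```python
import torch

c1 = 16  # number of DFL bins (assumed; a common YOLOv8 setting)
logits = torch.randn(c1)  # per-bin logits for one box-side offset
expected = (logits.softmax(0) * torch.arange(c1, dtype=torch.float)).sum()
print(float(expected))  # a continuous regression value in [0, c1 - 1]
```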

View File

@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-"""
-Convolution modules
-"""
+"""Convolution modules."""

 import math
@@ -69,7 +67,9 @@ class Conv2(Conv):
 class LightConv(nn.Module):
-    """Light convolution with args(ch_in, ch_out, kernel).
+    """
+    Light convolution with args(ch_in, ch_out, kernel).
+
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
     """
@@ -148,12 +148,15 @@ class GhostConv(nn.Module):
 class RepConv(nn.Module):
     """
-    RepConv is a basic rep-style block, including training and deploy status. This module is used in RT-DETR.
+    RepConv is a basic rep-style block, including training and deploy status.
+
+    This module is used in RT-DETR.
     Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
     """

     default_act = nn.SiLU()  # default activation

     def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
+        """Initializes Light Convolution layer with inputs, outputs & optional activation function."""
         super().__init__()
         assert k == 3 and p == 1
         self.g = g
@@ -166,27 +169,30 @@ class RepConv(nn.Module):
         self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False)

     def forward_fuse(self, x):
-        """Forward process"""
+        """Forward process."""
         return self.act(self.conv(x))

     def forward(self, x):
-        """Forward process"""
+        """Forward process."""
         id_out = 0 if self.bn is None else self.bn(x)
         return self.act(self.conv1(x) + self.conv2(x) + id_out)

     def get_equivalent_kernel_bias(self):
+        """Returns equivalent kernel and bias by adding 3x3 kernel, 1x1 kernel and identity kernel with their biases."""
         kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
         kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
         kernelid, biasid = self._fuse_bn_tensor(self.bn)
         return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

     def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        """Pads a 1x1 tensor to a 3x3 tensor."""
         if kernel1x1 is None:
             return 0
         else:
             return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

     def _fuse_bn_tensor(self, branch):
+        """Generates appropriate kernels and biases for convolution by fusing branches of the neural network."""
         if branch is None:
             return 0, 0
         if isinstance(branch, Conv):
@@ -214,6 +220,7 @@ class RepConv(nn.Module):
         return kernel * t, beta - running_mean * gamma / std

     def fuse_convs(self):
+        """Combines two convolution layers into a single layer and removes unused attributes from the class."""
         if hasattr(self, 'conv'):
             return
         kernel, bias = self.get_equivalent_kernel_bias()
@@ -243,12 +250,14 @@ class ChannelAttention(nn.Module):
     """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet."""

     def __init__(self, channels: int) -> None:
+        """Initializes the class and sets the basic configurations and instance variables required."""
         super().__init__()
         self.pool = nn.AdaptiveAvgPool2d(1)
         self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
         self.act = nn.Sigmoid()

     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Applies forward pass using activation on convolutions of the input, optionally using batch normalization."""
         return x * self.act(self.fc(self.pool(x)))
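The new `get_equivalent_kernel_bias` and `_pad_1x1_to_3x3_tensor` docstrings describe RepConv's re-parameterization: branch kernels are summed after the 1x1 branch is zero-padded to 3x3. A small shape-level sketch of that merge, with example channel counts only:

```python
import torch
import torch.nn.functional as F

k3 = torch.randn(8, 8, 3, 3)  # fused 3x3-branch kernel (example shapes)
k1 = torch.randn(8, 8, 1, 1)  # fused 1x1-branch kernel
k_equiv = k3 + F.pad(k1, [1, 1, 1, 1])  # same padding as _pad_1x1_to_3x3_tensor
print(k_equiv.shape)  # torch.Size([8, 8, 3, 3]) -> a single deployable 3x3 kernel
```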

View File

@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-"""
-Model head modules
-"""
+"""Model head modules."""

 import math
@@ -229,6 +227,7 @@ class RTDETRDecoder(nn.Module):
         self._reset_parameters()

     def forward(self, x, batch=None):
+        """Runs the forward pass of the module, returning bounding box and classification scores for the input."""
         from ultralytics.models.utils.ops import get_cdn_group

         # input projection and embedding
@@ -265,6 +264,7 @@ class RTDETRDecoder(nn.Module):
         return y if self.export else (y, x)

     def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
+        """Generates anchor bounding boxes for given shapes with specific grid size and validates them."""
         anchors = []
         for i, (h, w) in enumerate(shapes):
             sy = torch.arange(end=h, dtype=dtype, device=device)
@@ -284,6 +284,7 @@ class RTDETRDecoder(nn.Module):
         return anchors, valid_mask

     def _get_encoder_input(self, x):
+        """Processes and returns encoder inputs by getting projection features from input and concatenating them."""
         # get projection features
         x = [self.input_proj[i](feat) for i, feat in enumerate(x)]
         # get encoder inputs
@@ -301,6 +302,7 @@ class RTDETRDecoder(nn.Module):
         return feats, shapes

     def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
+        """Generates and prepares the input required for the decoder from the provided features and shapes."""
         bs = len(feats)
         # prepare input for decoder
         anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
@@ -339,6 +341,7 @@ class RTDETRDecoder(nn.Module):
     # TODO
     def _reset_parameters(self):
+        """Initializes or resets the parameters of the model's various components with predefined weights and biases."""
         # class and bbox head init
         bias_cls = bias_init_with_prob(0.01) / 80 * self.nc
         # NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.


@@ -1,7 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
- """
- Transformer modules
- """
+ """Transformer modules."""
import math
@@ -18,9 +16,10 @@ __all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'M
class TransformerEncoderLayer(nn.Module):
- """Transformer Encoder."""
+ """Defines a single layer of the transformer encoder."""
def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
+ """Initialize the TransformerEncoderLayer with specified parameters."""
super().__init__()
from ...utils.torch_utils import TORCH_1_9
if not TORCH_1_9:
@@ -41,10 +40,11 @@ class TransformerEncoderLayer(nn.Module):
self.normalize_before = normalize_before
def with_pos_embed(self, tensor, pos=None):
- """Add position embeddings if given."""
+ """Add position embeddings to the tensor if provided."""
return tensor if pos is None else tensor + pos
def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+ """Performs forward pass with post-normalization."""
q = k = self.with_pos_embed(src, pos)
src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
src = src + self.dropout1(src2)
@@ -54,6 +54,7 @@ class TransformerEncoderLayer(nn.Module):
return self.norm2(src)
def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+ """Performs forward pass with pre-normalization."""
src2 = self.norm1(src)
q = k = self.with_pos_embed(src2, pos)
src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
@@ -70,11 +71,14 @@ class TransformerEncoderLayer(nn.Module):
class AIFI(TransformerEncoderLayer):
+ """Defines the AIFI transformer layer."""
def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
+ """Initialize the AIFI instance with specified parameters."""
super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
def forward(self, x):
+ """Forward pass for the AIFI transformer layer."""
c, h, w = x.shape[1:]
pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
# flatten [B, C, H, W] to [B, HxW, C]
@@ -82,7 +86,8 @@ class AIFI(TransformerEncoderLayer):
return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()
@staticmethod
- def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.):
+ def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
+ """Builds 2D sine-cosine position embedding."""
grid_w = torch.arange(int(w), dtype=torch.float32)
grid_h = torch.arange(int(h), dtype=torch.float32)
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
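The hunk above cuts off mid-function, so as a point of reference, here is a self-contained sketch of a 2D sine-cosine position embedding of the kind AIFI builds; `sincos_pos_embed_2d` is an illustrative name, not the library API, and it assumes `dim` is divisible by 4:

import torch

def sincos_pos_embed_2d(w, h, dim=256, temp=10000.0):
    """Illustrative 2D sine-cosine position embedding; assumes dim is divisible by 4."""
    gw, gh = torch.meshgrid(torch.arange(w, dtype=torch.float32),
                            torch.arange(h, dtype=torch.float32), indexing='ij')
    pos_dim = dim // 4
    omega = 1.0 / temp ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = gw.flatten()[:, None] * omega[None]  # (H*W, dim/4)
    out_h = gh.flatten()[:, None] * omega[None]
    # Concatenate sin/cos of both axes -> (1, H*W, dim), matching the flattened [B, HxW, C] layout above
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]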
@@ -140,27 +145,32 @@ class TransformerBlock(nn.Module):
class MLPBlock(nn.Module):
+ """Implements a single block of a multi-layer perceptron."""
def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
+ """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function."""
super().__init__()
self.lin1 = nn.Linear(embedding_dim, mlp_dim)
self.lin2 = nn.Linear(mlp_dim, embedding_dim)
self.act = act()
def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Forward pass for the MLPBlock."""
return self.lin2(self.act(self.lin1(x)))
class MLP(nn.Module):
- """ Very simple multi-layer perceptron (also called FFN)"""
+ """Implements a simple multi-layer perceptron (also called FFN)."""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+ """Initialize the MLP with specified input, hidden, output dimensions and number of layers."""
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
+ """Forward pass for the entire MLP."""
for i, layer in enumerate(self.layers):
x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
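A brief usage sketch of the MLP above; the 256-to-4 box-regression shapes are an illustrative assumption, not taken from this diff:

import torch

mlp = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)  # class defined above
queries = torch.randn(2, 100, 256)   # (batch, num queries, embedding dim)
boxes = mlp(queries)                 # ReLU between layers, linear last -> (2, 100, 4)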
@@ -168,17 +178,22 @@ class MLP(nn.Module):
class LayerNorm2d(nn.Module):
"""
- LayerNorm2d module from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
+ 2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
+ Original implementation at
+ https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
"""
def __init__(self, num_channels, eps=1e-6):
+ """Initialize LayerNorm2d with the given parameters."""
super().__init__()
self.weight = nn.Parameter(torch.ones(num_channels))
self.bias = nn.Parameter(torch.zeros(num_channels))
self.eps = eps
def forward(self, x):
+ """Perform forward pass for 2D layer normalization."""
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
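Because the forward pass above normalizes over the channel dimension only, it can be sanity-checked against `torch.nn.functional.layer_norm` applied channel-last. A hedged sketch, assuming the module's default parameter init (ones and zeros, as shown above):

import torch
import torch.nn.functional as F

ln = LayerNorm2d(num_channels=64)    # class defined above, default weight/bias
x = torch.randn(2, 64, 8, 8)
ref = F.layer_norm(x.permute(0, 2, 3, 1), (64,), eps=1e-6).permute(0, 3, 1, 2)
assert torch.allclose(ln(x), ref, atol=1e-5)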
@@ -187,11 +202,13 @@ class LayerNorm2d(nn.Module):
class MSDeformAttn(nn.Module):
"""
- Original Multi-Scale Deformable Attention Module.
+ Multi-Scale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
"""
def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+ """Initialize MSDeformAttn with the given parameters."""
super().__init__()
if d_model % n_heads != 0:
raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
@@ -214,6 +231,7 @@ class MSDeformAttn(nn.Module):
self._reset_parameters()
def _reset_parameters(self):
+ """Reset module parameters."""
constant_(self.sampling_offsets.weight.data, 0.)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
@@ -232,7 +250,10 @@ class MSDeformAttn(nn.Module):
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
"""
+ Perform forward pass for multi-scale deformable attention.
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
Args:
query (torch.Tensor): [bs, query_length, C]
refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
@@ -272,24 +293,27 @@ class MSDeformAttn(nn.Module):
class DeformableTransformerDecoderLayer(nn.Module):
"""
+ Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
"""
def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
+ """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
super().__init__()
- # self attention
+ # Self attention
self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
self.dropout1 = nn.Dropout(dropout)
self.norm1 = nn.LayerNorm(d_model)
- # cross attention
+ # Cross attention
self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
self.dropout2 = nn.Dropout(dropout)
self.norm2 = nn.LayerNorm(d_model)
- # ffn
+ # FFN
self.linear1 = nn.Linear(d_model, d_ffn)
self.act = act
self.dropout3 = nn.Dropout(dropout)
@@ -299,37 +323,44 @@ class DeformableTransformerDecoderLayer(nn.Module):
@staticmethod
def with_pos_embed(tensor, pos):
+ """Add positional embeddings to the input tensor, if provided."""
return tensor if pos is None else tensor + pos
def forward_ffn(self, tgt):
+ """Perform forward pass through the Feed-Forward Network part of the layer."""
tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
tgt = tgt + self.dropout4(tgt2)
return self.norm3(tgt)
def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
+ """Perform the forward pass through the entire decoder layer."""
- # self attention
+ # Self attention
q = k = self.with_pos_embed(embed, query_pos)
tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
attn_mask=attn_mask)[0].transpose(0, 1)
embed = embed + self.dropout1(tgt)
embed = self.norm1(embed)
- # cross attention
+ # Cross attention
tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
padding_mask)
embed = embed + self.dropout2(tgt)
embed = self.norm2(embed)
- # ffn
+ # FFN
return self.forward_ffn(embed)
class DeformableTransformerDecoder(nn.Module):
"""
+ Implementation of Deformable Transformer Decoder based on PaddleDetection.
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
"""
def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+ """Initialize the DeformableTransformerDecoder with the given parameters."""
super().__init__()
self.layers = _get_clones(decoder_layer, num_layers)
self.num_layers = num_layers
@@ -347,6 +378,7 @@ class DeformableTransformerDecoder(nn.Module):
pos_mlp,
attn_mask=None,
padding_mask=None):
+ """Perform the forward pass through the entire decoder."""
output = embed
dec_bboxes = []
dec_cls = []


@@ -1,7 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
- """
- Module utils
- """
+ """Module utils."""
import copy
import math
@@ -16,15 +14,17 @@ __all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'
def _get_clones(module, n):
+ """Create a list of cloned modules from the given module."""
return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
def bias_init_with_prob(prior_prob=0.01):
- """initialize conv/fc bias value according to a given probability value."""
+ """Initialize conv/fc bias value according to a given probability value."""
return float(-np.log((1 - prior_prob) / prior_prob))  # return bias_init
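The returned bias is the logit of the prior probability, so a sigmoid-activated head starts out predicting roughly that prior. A quick worked check:

import numpy as np

b = bias_init_with_prob(0.01)                     # -log(0.99 / 0.01) ≈ -4.595
assert abs(1 / (1 + np.exp(-b)) - 0.01) < 1e-9    # sigmoid(b) recovers the prior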
def linear_init_(module):
+ """Initialize the weights and biases of a linear module."""
bound = 1 / math.sqrt(module.weight.shape[0])
uniform_(module.weight, -bound, bound)
if hasattr(module, 'bias') and module.bias is not None:
@@ -32,6 +32,7 @@ def linear_init_(module):
def inverse_sigmoid(x, eps=1e-5):
+ """Calculate the inverse sigmoid function for a tensor."""
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
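Away from the clamp boundaries this is the exact inverse of the sigmoid, which a short round-trip verifies:

import torch

x = torch.tensor([0.25, 0.5, 0.75])
assert torch.allclose(torch.sigmoid(inverse_sigmoid(x)), x, atol=1e-6)  # log(x / (1 - x)) round-trips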
@@ -43,6 +44,7 @@ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shape
attention_weights: torch.Tensor) -> torch.Tensor:
"""
Multi-scale deformable attention.
+
https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
"""


@@ -25,14 +25,11 @@ except ImportError:
class BaseModel(nn.Module):
- """
- The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family.
- """
+ """The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family."""
def forward(self, x, *args, **kwargs):
"""
- Forward pass of the model on a single scale.
- Wrapper for `_forward_once` method.
+ Forward pass of the model on a single scale. Wrapper for `_forward_once` method.
Args:
x (torch.Tensor | dict): The input image tensor or a dict including image tensor and gt labels.
@@ -93,8 +90,8 @@ class BaseModel(nn.Module):
def _profile_one_layer(self, m, x, dt):
"""
- Profile the computation time and FLOPs of a single layer of the model on a given input.
- Appends the results to the provided list.
+ Profile the computation time and FLOPs of a single layer of the model on a given input. Appends the results to
+ the provided list.
Args:
m (nn.Module): The layer to be profiled.
@@ -158,7 +155,7 @@ class BaseModel(nn.Module):
def info(self, detailed=False, verbose=True, imgsz=640):
"""
- Prints model information
+ Prints model information.
Args:
detailed (bool): if True, prints out detailed information about the model. Defaults to False
@@ -175,7 +172,7 @@ class BaseModel(nn.Module):
fn (function): the function to apply to the model
Returns:
- A model that is a Detect() object.
+ (BaseModel): An updated BaseModel object.
"""
self = super()._apply(fn)
m = self.model[-1]  # Detect()
@@ -202,7 +199,7 @@ class BaseModel(nn.Module):
def loss(self, batch, preds=None):
"""
- Compute loss
+ Compute loss.
Args:
batch (dict): Batch to compute loss on
@@ -215,6 +212,7 @@ class BaseModel(nn.Module):
return self.criterion(preds, batch)
def init_criterion(self):
+ """Initialize the loss criterion for the BaseModel."""
raise NotImplementedError('compute_loss() needs to be implemented by task heads')
@@ -222,6 +220,7 @@ class DetectionModel(BaseModel):
"""YOLOv8 detection model."""
def __init__(self, cfg='yolov8n.yaml', ch=3, nc=None, verbose=True):  # model, input channels, number of classes
+ """Initialize the YOLOv8 detection model with the given config and parameters."""
super().__init__()
self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict
@@ -289,6 +288,7 @@ class DetectionModel(BaseModel):
return y
def init_criterion(self):
+ """Initialize the loss criterion for the DetectionModel."""
return v8DetectionLoss(self)
@@ -300,6 +300,7 @@ class SegmentationModel(DetectionModel):
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
def init_criterion(self):
+ """Initialize the loss criterion for the SegmentationModel."""
return v8SegmentationLoss(self)
@@ -316,6 +317,7 @@ class PoseModel(DetectionModel):
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
def init_criterion(self):
+ """Initialize the loss criterion for the PoseModel."""
return v8PoseLoss(self)
@@ -365,22 +367,59 @@ class ClassificationModel(BaseModel):
m[i] = nn.Conv2d(m[i].in_channels, nc, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None)
def init_criterion(self):
- """Compute the classification loss between predictions and true labels."""
+ """Initialize the loss criterion for the ClassificationModel."""
return v8ClassificationLoss()
class RTDETRDetectionModel(DetectionModel):
+ """
+ RTDETR (Real-time DEtection and Tracking using Transformers) Detection Model class.
+ This class is responsible for constructing the RTDETR architecture, defining loss functions, and
+ facilitating both the training and inference processes. RTDETR is an object detection and tracking model
+ that extends from the DetectionModel base class.
+ Attributes:
+ cfg (str): The configuration file path or preset string. Default is 'rtdetr-l.yaml'.
+ ch (int): Number of input channels. Default is 3 (RGB).
+ nc (int, optional): Number of classes for object detection. Default is None.
+ verbose (bool): Specifies if summary statistics are shown during initialization. Default is True.
+ Methods:
+ init_criterion: Initializes the criterion used for loss calculation.
+ loss: Computes and returns the loss during training.
+ predict: Performs a forward pass through the network and returns the output.
+ """
def __init__(self, cfg='rtdetr-l.yaml', ch=3, nc=None, verbose=True):
+ """
+ Initialize the RTDETRDetectionModel.
+ Args:
+ cfg (str): Configuration file name or path.
+ ch (int): Number of input channels.
+ nc (int, optional): Number of classes. Defaults to None.
+ verbose (bool, optional): Print additional information during initialization. Defaults to True.
+ """
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
def init_criterion(self):
- """Compute the classification loss between predictions and true labels."""
+ """Initialize the loss criterion for the RTDETRDetectionModel."""
from ultralytics.models.utils.loss import RTDETRDetectionLoss
return RTDETRDetectionLoss(nc=self.nc, use_vfl=True)
def loss(self, batch, preds=None):
+ """
+ Compute the loss for the given batch of data.
+ Args:
+ batch (dict): Dictionary containing image and label data.
+ preds (torch.Tensor, optional): Precomputed model predictions. Defaults to None.
+ Returns:
+ tuple: A tuple containing the total loss and main three losses in a tensor.
+ """
if not hasattr(self, 'criterion'):
self.criterion = self.init_criterion()
@@ -417,16 +456,17 @@ class RTDETRDetectionModel(DetectionModel):
def predict(self, x, profile=False, visualize=False, batch=None, augment=False):
"""
- Perform a forward pass through the network.
+ Perform a forward pass through the model.
Args:
- x (torch.Tensor): The input tensor to the model
- profile (bool): Print the computation time of each layer if True, defaults to False.
- visualize (bool): Save the feature maps of the model if True, defaults to False
- batch (dict): A dict including gt boxes and labels from dataloader.
+ x (torch.Tensor): The input tensor.
+ profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
+ visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
+ batch (dict, optional): Ground truth data for evaluation. Defaults to None.
+ augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
Returns:
- (torch.Tensor): The last output of the model.
+ torch.Tensor: Model's output tensor.
"""
y, dt = [], []  # outputs
for m in self.model[:-1]:  # except the head part
@@ -708,9 +748,9 @@ def yaml_model_load(path):
def guess_model_scale(model_path):
"""
- Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale.
- The function uses regular expression matching to find the pattern of the model scale in the YAML file name,
- which is denoted by n, s, m, l, or x. The function returns the size character of the model scale as a string.
+ Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale. The function
+ uses regular expression matching to find the pattern of the model scale in the YAML file name, which is denoted by
+ n, s, m, l, or x. The function returns the size character of the model scale as a string.
Args:
model_path (str | Path): The path to the YOLO model's YAML file.
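An illustrative one-liner for the matching described above; the exact regex in the library may differ:

import re
from pathlib import Path

m = re.search(r'yolov\d+([nslmx])', Path('yolov8n.yaml').stem)  # hedged example pattern
print(m.group(1) if m else '')                                  # -> 'n'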


@@ -12,6 +12,33 @@ from .utils.kalman_filter import KalmanFilterXYWH
class BOTrack(STrack):
+ """
+ An extended version of the STrack class for YOLOv8, adding object tracking features.
+ Attributes:
+ shared_kalman (KalmanFilterXYWH): A shared Kalman filter for all instances of BOTrack.
+ smooth_feat (np.ndarray): Smoothed feature vector.
+ curr_feat (np.ndarray): Current feature vector.
+ features (deque): A deque to store feature vectors with a maximum length defined by `feat_history`.
+ alpha (float): Smoothing factor for the exponential moving average of features.
+ mean (np.ndarray): The mean state of the Kalman filter.
+ covariance (np.ndarray): The covariance matrix of the Kalman filter.
+ Methods:
+ update_features(feat): Update features vector and smooth it using exponential moving average.
+ predict(): Predicts the mean and covariance using Kalman filter.
+ re_activate(new_track, frame_id, new_id): Reactivates a track with updated features and optionally new ID.
+ update(new_track, frame_id): Update the YOLOv8 instance with new track and frame ID.
+ tlwh: Property that gets the current position in tlwh format `(top left x, top left y, width, height)`.
+ multi_predict(stracks): Predicts the mean and covariance of multiple object tracks using shared Kalman filter.
+ convert_coords(tlwh): Converts tlwh bounding box coordinates to xywh format.
+ tlwh_to_xywh(tlwh): Convert bounding box to xywh format `(center x, center y, width, height)`.
+ Usage:
+ bo_track = BOTrack(tlwh, score, cls, feat)
+ bo_track.predict()
+ bo_track.update(new_track, frame_id)
+ """
shared_kalman = KalmanFilterXYWH()
def __init__(self, tlwh, score, cls, feat=None, feat_history=50):
@@ -59,9 +86,7 @@ class BOTrack(STrack):
@property
def tlwh(self):
- """Get current position in bounding box format `(top left x, top left y,
- width, height)`.
- """
+ """Get current position in bounding box format `(top left x, top left y, width, height)`."""
if self.mean is None:
return self._tlwh.copy()
ret = self.mean[:4].copy()
@@ -90,15 +115,37 @@ class BOTrack(STrack):
@staticmethod
def tlwh_to_xywh(tlwh):
- """Convert bounding box to format `(center x, center y, width,
- height)`.
- """
+ """Convert bounding box to format `(center x, center y, width, height)`."""
ret = np.asarray(tlwh).copy()
ret[:2] += ret[2:] / 2
return ret
class BOTSORT(BYTETracker):
+ """
+ An extended version of the BYTETracker class for YOLOv8, designed for object tracking with ReID and GMC algorithm.
+ Attributes:
+ proximity_thresh (float): Threshold for spatial proximity (IoU) between tracks and detections.
+ appearance_thresh (float): Threshold for appearance similarity (ReID embeddings) between tracks and detections.
+ encoder (object): Object to handle ReID embeddings, set to None if ReID is not enabled.
+ gmc (GMC): An instance of the GMC algorithm for data association.
+ args (object): Parsed command-line arguments containing tracking parameters.
+ Methods:
+ get_kalmanfilter(): Returns an instance of KalmanFilterXYWH for object tracking.
+ init_track(dets, scores, cls, img): Initialize track with detections, scores, and classes.
+ get_dists(tracks, detections): Get distances between tracks and detections using IoU and (optionally) ReID.
+ multi_predict(tracks): Predict and track multiple objects with YOLOv8 model.
+ Usage:
+ bot_sort = BOTSORT(args, frame_rate)
+ bot_sort.init_track(dets, scores, cls, img)
+ bot_sort.multi_predict(tracks)
+ Note:
+ The class is designed to work with the YOLOv8 object detection model and supports ReID only if enabled via args.
+ """
def __init__(self, args, frame_rate=30):
"""Initialize YOLOv8 object with ReID module and GMC algorithm."""


@@ -8,10 +8,43 @@ from .utils.kalman_filter import KalmanFilterXYAH
class STrack(BaseTrack):
+ """
+ Single object tracking representation that uses Kalman filtering for state estimation.
+ This class is responsible for storing all the information regarding individual tracklets and performs state updates
+ and predictions based on Kalman filter.
+ Attributes:
+ shared_kalman (KalmanFilterXYAH): Shared Kalman filter that is used across all STrack instances for prediction.
+ _tlwh (np.ndarray): Private attribute to store top-left corner coordinates and width and height of bounding box.
+ kalman_filter (KalmanFilterXYAH): Instance of Kalman filter used for this particular object track.
+ mean (np.ndarray): Mean state estimate vector.
+ covariance (np.ndarray): Covariance of state estimate.
+ is_activated (bool): Boolean flag indicating if the track has been activated.
+ score (float): Confidence score of the track.
+ tracklet_len (int): Length of the tracklet.
+ cls (any): Class label for the object.
+ idx (int): Index or identifier for the object.
+ frame_id (int): Current frame ID.
+ start_frame (int): Frame where the object was first detected.
+ Methods:
+ predict(): Predict the next state of the object using Kalman filter.
+ multi_predict(stracks): Predict the next states for multiple tracks.
+ multi_gmc(stracks, H): Update multiple track states using a homography matrix.
+ activate(kalman_filter, frame_id): Activate a new tracklet.
+ re_activate(new_track, frame_id, new_id): Reactivate a previously lost tracklet.
+ update(new_track, frame_id): Update the state of a matched track.
+ convert_coords(tlwh): Convert bounding box to x-y-angle-height format.
+ tlwh_to_xyah(tlwh): Convert tlwh bounding box to xyah format.
+ tlbr_to_tlwh(tlbr): Convert tlbr bounding box to tlwh format.
+ tlwh_to_tlbr(tlwh): Convert tlwh bounding box to tlbr format.
+ """
shared_kalman = KalmanFilterXYAH()
def __init__(self, tlwh, score, cls):
- """wait activate."""
+ """Initialize new STrack instance."""
self._tlwh = np.asarray(self.tlbr_to_tlwh(tlwh[:-1]), dtype=np.float32)
self.kalman_filter = None
self.mean, self.covariance = None, None
@@ -92,10 +125,11 @@ class STrack(BaseTrack):
def update(self, new_track, frame_id):
"""
- Update a matched track
- :type new_track: STrack
- :type frame_id: int
- :return:
+ Update the state of a matched track.
+
+ Args:
+ new_track (STrack): The new track containing updated information.
+ frame_id (int): The ID of the current frame.
"""
self.frame_id = frame_id
self.tracklet_len += 1
@@ -116,9 +150,7 @@ class STrack(BaseTrack):
@property
def tlwh(self):
- """Get current position in bounding box format `(top left x, top left y,
- width, height)`.
- """
+ """Get current position in bounding box format (top left x, top left y, width, height)."""
if self.mean is None:
return self._tlwh.copy()
ret = self.mean[:4].copy()
@@ -128,17 +160,15 @@ class STrack(BaseTrack):
@property
def tlbr(self):
- """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
- `(top left, bottom right)`.
- """
+ """Convert bounding box to format (min x, min y, max x, max y), i.e., (top left, bottom right)."""
ret = self.tlwh.copy()
ret[2:] += ret[:2]
return ret
@staticmethod
def tlwh_to_xyah(tlwh):
- """Convert bounding box to format `(center x, center y, aspect ratio,
- height)`, where the aspect ratio is `width / height`.
+ """Convert bounding box to format (center x, center y, aspect ratio, height), where the aspect ratio is width /
+ height.
"""
ret = np.asarray(tlwh).copy()
ret[:2] += ret[2:] / 2
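A worked example of the conversion above, with values chosen for illustration (the aspect-ratio division happens in the lines trimmed from this hunk, per the docstring):

import numpy as np

tlwh = np.array([10., 20., 40., 80.])   # top-left x/y, width, height
xyah = STrack.tlwh_to_xyah(tlwh)        # -> [30., 60., 0.5, 80.]: center x/y, w/h ratio, height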
@@ -165,6 +195,33 @@ class STrack(BaseTrack):
class BYTETracker:
+ """
+ BYTETracker: A tracking algorithm built on top of YOLOv8 for object detection and tracking.
+ The class is responsible for initializing, updating, and managing the tracks for detected objects in a video
+ sequence. It maintains the state of tracked, lost, and removed tracks over frames, utilizes Kalman filtering for
+ predicting the new object locations, and performs data association.
+ Attributes:
+ tracked_stracks (list[STrack]): List of successfully activated tracks.
+ lost_stracks (list[STrack]): List of lost tracks.
+ removed_stracks (list[STrack]): List of removed tracks.
+ frame_id (int): The current frame ID.
+ args (namespace): Command-line arguments.
+ max_time_lost (int): The maximum frames for a track to be considered as 'lost'.
+ kalman_filter (object): Kalman Filter object.
+ Methods:
+ update(results, img=None): Updates object tracker with new detections.
+ get_kalmanfilter(): Returns a Kalman filter object for tracking bounding boxes.
+ init_track(dets, scores, cls, img=None): Initialize object tracking with detections.
+ get_dists(tracks, detections): Calculates the distance between tracks and detections.
+ multi_predict(tracks): Predicts the location of tracks.
+ reset_id(): Resets the ID counter of STrack.
+ joint_stracks(tlista, tlistb): Combines two lists of stracks.
+ sub_stracks(tlista, tlistb): Filters out the stracks present in the second list from the first list.
+ remove_duplicate_stracks(stracksa, stracksb): Removes duplicate stracks based on IOU.
+ """
def __init__(self, args, frame_rate=30):
"""Initialize a YOLOv8 object to track objects with given arguments and frame rate."""
@@ -234,8 +291,7 @@ class BYTETracker:
else:
track.re_activate(det, self.frame_id, new_id=False)
refind_stracks.append(track)
- # Step 3: Second association, with low score detection boxes
- # association the untrack to the low score detections
+ # Step 3: Second association, with low score detection boxes association the untrack to the low score detections
detections_second = self.init_track(dets_second, scores_second, cls_second, img)
r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state == TrackState.Tracked]
# TODO


@@ -60,7 +60,6 @@ def register_tracker(model, persist):
Args:
model (object): The model object to register tracking callbacks for.
persist (bool): Whether to persist the trackers if they already exist.
-
"""
model.add_callback('on_predict_start', partial(on_predict_start, persist=persist))
model.add_callback('on_predict_postprocess_end', on_predict_postprocess_end)


@@ -9,6 +9,29 @@ from ultralytics.utils import LOGGER
class GMC:
+ """
+ Generalized Motion Compensation (GMC) class for tracking and object detection in video frames.
+ This class provides methods for tracking and detecting objects based on several tracking algorithms including ORB,
+ SIFT, ECC, and Sparse Optical Flow. It also supports downscaling of frames for computational efficiency.
+ Attributes:
+ method (str): The method used for tracking. Options include 'orb', 'sift', 'ecc', 'sparseOptFlow', 'none'.
+ downscale (int): Factor by which to downscale the frames for processing.
+ prevFrame (np.array): Stores the previous frame for tracking.
+ prevKeyPoints (list): Stores the keypoints from the previous frame.
+ prevDescriptors (np.array): Stores the descriptors from the previous frame.
+ initializedFirstFrame (bool): Flag to indicate if the first frame has been processed.
+ Methods:
+ __init__(self, method='sparseOptFlow', downscale=2): Initializes a GMC object with the specified method
+ and downscale factor.
+ apply(self, raw_frame, detections=None): Applies the chosen method to a raw frame and optionally uses
+ provided detections.
+ applyEcc(self, raw_frame, detections=None): Applies the ECC algorithm to a raw frame.
+ applyFeatures(self, raw_frame, detections=None): Applies feature-based methods like ORB or SIFT to a raw frame.
+ applySparseOptFlow(self, raw_frame, detections=None): Applies the Sparse Optical Flow method to a raw frame.
+ """
def __init__(self, method='sparseOptFlow', downscale=2):
"""Initialize a video tracker with specified parameters."""


@@ -8,8 +8,8 @@ class KalmanFilterXYAH:
"""
For bytetrack. A simple Kalman filter for tracking bounding boxes in image space.
- The 8-dimensional state space (x, y, a, h, vx, vy, va, vh) contains the bounding box center position (x, y),
- aspect ratio a, height h, and their respective velocities.
+ The 8-dimensional state space (x, y, a, h, vx, vy, va, vh) contains the bounding box center position (x, y), aspect
+ ratio a, height h, and their respective velocities.
Object motion follows a constant velocity model. The bounding box location (x, y, a, h) is taken as direct
observation of the state space (linear observation model).
@@ -182,8 +182,8 @@ class KalmanFilterXYAH:
def gating_distance(self, mean, covariance, measurements, only_position=False, metric='maha'):
"""
Compute gating distance between state distribution and measurements. A suitable distance threshold can be
- obtained from `chi2inv95`. If `only_position` is False, the chi-square distribution has 4 degrees of
- freedom, otherwise 2.
+ obtained from `chi2inv95`. If `only_position` is False, the chi-square distribution has 4 degrees of freedom,
+ otherwise 2.
Parameters
----------
@@ -223,8 +223,8 @@ class KalmanFilterXYWH(KalmanFilterXYAH):
"""
For BoT-SORT. A simple Kalman filter for tracking bounding boxes in image space.
- The 8-dimensional state space (x, y, w, h, vx, vy, vw, vh) contains the bounding box center position (x, y),
- width w, height h, and their respective velocities.
+ The 8-dimensional state space (x, y, w, h, vx, vy, vw, vh) contains the bounding box center position (x, y), width
+ w, height h, and their respective velocities.
Object motion follows a constant velocity model. The bounding box location (x, y, w, h) is taken as direct
observation of the state space (linear observation model).


@@ -117,6 +117,7 @@ class TQDM(tqdm_original):
"""
def __init__(self, *args, **kwargs):
+ """Initialize custom Ultralytics tqdm class with different default arguments."""
# Set new default values (these can still be overridden when calling TQDM)
kwargs['disable'] = not VERBOSE or kwargs.get('disable', False)  # logical 'and' with default value if passed
kwargs.setdefault('bar_format', TQDM_BAR_FORMAT)  # override default value if passed
@@ -124,8 +125,7 @@ class TQDM(tqdm_original):
class SimpleClass:
- """
- Ultralytics SimpleClass is a base class providing helpful string representation, error reporting, and attribute
+ """Ultralytics SimpleClass is a base class providing helpful string representation, error reporting, and attribute
access methods for easier debugging and usage.
"""
@@ -154,8 +154,7 @@ class SimpleClass:
class IterableSimpleNamespace(SimpleNamespace):
- """
- Ultralytics IterableSimpleNamespace is an extension class of SimpleNamespace that adds iterable functionality and
+ """Ultralytics IterableSimpleNamespace is an extension class of SimpleNamespace that adds iterable functionality and
enables usage with dict() and for loops.
"""
@@ -256,8 +255,8 @@ class EmojiFilter(logging.Filter):
"""
A custom logging filter class for removing emojis in log messages.
- This filter is particularly useful for ensuring compatibility with Windows terminals
- that may not support the display of emojis in log messages.
+ This filter is particularly useful for ensuring compatibility with Windows terminals that may not support the
+ display of emojis in log messages.
"""
def filter(self, record):
@@ -275,9 +274,9 @@ if WINDOWS:  # emoji-safe logging
class ThreadingLocked:
"""
- A decorator class for ensuring thread-safe execution of a function or method.
- This class can be used as a decorator to make sure that if the decorated function
- is called from multiple threads, only one thread at a time will be able to execute the function.
+ A decorator class for ensuring thread-safe execution of a function or method. This class can be used as a decorator
+ to make sure that if the decorated function is called from multiple threads, only one thread at a time will be able
+ to execute the function.
Attributes:
lock (threading.Lock): A lock object used to manage access to the decorated function.
@@ -294,13 +293,16 @@ class ThreadingLocked:
"""
def __init__(self):
+ """Initializes the decorator class for thread-safe execution of a function or method."""
self.lock = threading.Lock()
def __call__(self, f):
+ """Run thread-safe execution of function or method."""
from functools import wraps
@wraps(f)
def decorated(*args, **kwargs):
+ """Applies thread-safety to the decorated function or method."""
with self.lock:
return f(*args, **kwargs)
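A usage sketch of the decorator above; `shared` and `append_item` are hypothetical names:

shared = []

@ThreadingLocked()
def append_item(value):
    """Only one thread at a time may run this body, so the append is race-free."""
    shared.append(value)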
@@ -424,8 +426,7 @@ def is_kaggle():
def is_jupyter():
"""
- Check if the current script is running inside a Jupyter Notebook.
- Verified on Colab, Jupyterlab, Kaggle, Paperspace.
+ Check if the current script is running inside a Jupyter Notebook. Verified on Colab, Jupyterlab, Kaggle, Paperspace.
Returns:
(bool): True if running inside a Jupyter Notebook, False otherwise.
@@ -529,8 +530,8 @@ def is_github_actions_ci() -> bool:
def is_git_dir():
"""
- Determines whether the current file is part of a git repository.
- If the current file is not part of a git repository, returns None.
+ Determines whether the current file is part of a git repository. If the current file is not part of a git
+ repository, returns None.
Returns:
(bool): True if current file is part of a git repository.
@@ -540,8 +541,8 @@ def is_git_dir():
def get_git_dir():
"""
- Determines whether the current file is part of a git repository and if so, returns the repository root directory.
- If the current file is not part of a git repository, returns None.
+ Determines whether the current file is part of a git repository and if so, returns the repository root directory. If
+ the current file is not part of a git repository, returns None.
Returns:
(Path | None): Git root directory if found or None if not found.
@@ -578,7 +579,8 @@ def get_git_branch():
def get_default_args(func):
- """Returns a dictionary of default arguments for a function.
+ """
+ Returns a dictionary of default arguments for a function.
Args:
func (callable): The function to inspect.
@@ -710,7 +712,11 @@ def remove_colorstr(input_string):
class TryExcept(contextlib.ContextDecorator):
- """YOLOv8 TryExcept class. Usage: @TryExcept() decorator or 'with TryExcept():' context manager."""
+ """
+ YOLOv8 TryExcept class.
+
+ Use as @TryExcept() decorator or 'with TryExcept():' context manager.
+ """
def __init__(self, msg='', verbose=True):
"""Initialize TryExcept class with optional message and verbosity settings."""
@@ -729,7 +735,11 @@ class TryExcept(contextlib.ContextDecorator):
def threaded(func):
- """Multi-threads a target function and returns thread. Usage: @threaded decorator."""
+ """
+ Multi-threads a target function and returns thread.
+
+ Use as @threaded decorator.
+ """
def wrapper(*args, **kwargs):
"""Multi-threads a given function and returns the thread."""
@@ -824,6 +834,9 @@ class SettingsManager(dict):
"""
def __init__(self, file=SETTINGS_YAML, version='0.0.4'):
+ """Initialize the SettingsManager with default settings, load and validate current settings from the YAML
+ file.
+ """
import copy
import hashlib


@@ -1,7 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
- """
- Functions for estimating the best YOLO batch size to use a fraction of the available CUDA memory in PyTorch.
- """
+ """Functions for estimating the best YOLO batch size to use a fraction of the available CUDA memory in PyTorch."""
from copy import deepcopy


@@ -1,6 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
- Benchmark a YOLO model formats for speed and accuracy
+ Benchmark a YOLO model formats for speed and accuracy.
Usage:
from ultralytics.utils.benchmarks import ProfileModels, benchmark
@@ -194,6 +194,7 @@ class ProfileModels:
self.device = device or torch.device(0 if torch.cuda.is_available() else 'cpu')
def profile(self):
+ """Logs the benchmarking results of a model, checks metrics against floor and returns the results."""
files = self.get_files()
if not files:
@@ -235,6 +236,7 @@ class ProfileModels:
return output
def get_files(self):
+ """Returns a list of paths for all relevant model files given by the user."""
files = []
for path in self.paths:
path = Path(path)
@@ -250,10 +252,14 @@ class ProfileModels:
return [Path(file) for file in sorted(files)]
def get_onnx_model_info(self, onnx_file: str):
+ """Retrieves the information including number of layers, parameters, gradients and FLOPs for an ONNX model
+ file.
+ """
# return (num_layers, num_params, num_gradients, num_flops)
return 0.0, 0.0, 0.0, 0.0
def iterative_sigma_clipping(self, data, sigma=2, max_iters=3):
+ """Applies an iterative sigma clipping algorithm to the given data times number of iterations."""
data = np.array(data)
for _ in range(max_iters):
mean, std = np.mean(data), np.std(data)
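To make the clipping loop concrete, a standalone sketch of the same idea; the in-range mask and early exit are a plausible completion of the truncated hunk, not verbatim library code:

import numpy as np

def sigma_clip(data, sigma=2, max_iters=3):
    """Iteratively drop points farther than sigma standard deviations from the mean."""
    data = np.array(data)
    for _ in range(max_iters):
        mean, std = np.mean(data), np.std(data)
        kept = data[(data > mean - sigma * std) & (data < mean + sigma * std)]
        if len(kept) == len(data):   # converged: nothing left to clip
            break
        data = kept
    return data

print(sigma_clip([1.0] * 9 + [10.0]))   # the 10.0 outlier run is clipped away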
@@ -264,6 +270,7 @@ class ProfileModels:
return data
def profile_tensorrt_model(self, engine_file: str, eps: float = 1e-3):
+ """Profiles the TensorRT model, measuring average run time and standard deviation among runs."""
if not self.trt or not Path(engine_file).is_file():
return 0.0, 0.0
@@ -292,6 +299,9 @@ class ProfileModels:
return np.mean(run_times), np.std(run_times)
def profile_onnx_model(self, onnx_file: str, eps: float = 1e-3):
+ """Profiles an ONNX model by executing it multiple times and returns the mean and standard deviation of run
+ times.
+ """
check_requirements('onnxruntime')
import onnxruntime as ort
@@ -344,10 +354,12 @@ class ProfileModels:
return np.mean(run_times), np.std(run_times)
def generate_table_row(self, model_name, t_onnx, t_engine, model_info):
+ """Generates a formatted string for a table row that includes model performance and metric details."""
layers, params, gradients, flops = model_info
return f'| {model_name:18s} | {self.imgsz} | - | {t_onnx[0]:.2f} ± {t_onnx[1]:.2f} ms | {t_engine[0]:.2f} ± {t_engine[1]:.2f} ms | {params / 1e6:.1f} | {flops:.1f} |'
def generate_results_dict(self, model_name, t_onnx, t_engine, model_info):
+ """Generates a dictionary of model details including name, parameters, GFLOPS and speed metrics."""
layers, params, gradients, flops = model_info
return {
'model/name': model_name,
@@ -357,6 +369,7 @@ class ProfileModels:
'model/speed_TensorRT(ms)': round(t_engine[0], 3)}
def print_table(self, table_rows):
+ """Formats and prints a comparison table for different models with given statistics and performance data."""
gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'GPU'
header = f'| Model | size<br><sup>(pixels) | mAP<sup>val<br>50-95 | Speed<br><sup>CPU ONNX<br>(ms) | Speed<br><sup>{gpu} TensorRT<br>(ms) | params<br><sup>(M) | FLOPs<br><sup>(B) |'
separator = '|-------------|---------------------|--------------------|------------------------------|-----------------------------------|------------------|-----------------|'


@@ -1,7 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
- """
- Base callbacks
- """
+ """Base callbacks."""
from collections import defaultdict
from copy import deepcopy


@@ -26,31 +26,38 @@ except (ImportError, AssertionError):

 def _get_comet_mode():
+    """Returns the mode of comet set in the environment variables, defaults to 'online' if not set."""
     return os.getenv('COMET_MODE', 'online')


 def _get_comet_model_name():
+    """Returns the model name for Comet from the environment variable 'COMET_MODEL_NAME' or defaults to 'YOLOv8'."""
     return os.getenv('COMET_MODEL_NAME', 'YOLOv8')


 def _get_eval_batch_logging_interval():
+    """Get the evaluation batch logging interval from environment variable or use default value 1."""
     return int(os.getenv('COMET_EVAL_BATCH_LOGGING_INTERVAL', 1))


 def _get_max_image_predictions_to_log():
+    """Get the maximum number of image predictions to log from the environment variables."""
     return int(os.getenv('COMET_MAX_IMAGE_PREDICTIONS', 100))


 def _scale_confidence_score(score):
+    """Scales the given confidence score by a factor specified in an environment variable."""
     scale = float(os.getenv('COMET_MAX_CONFIDENCE_SCORE', 100.0))
     return score * scale


 def _should_log_confusion_matrix():
+    """Determines if the confusion matrix should be logged based on the environment variable settings."""
     return os.getenv('COMET_EVAL_LOG_CONFUSION_MATRIX', 'false').lower() == 'true'


 def _should_log_image_predictions():
+    """Determines whether to log image predictions based on a specified environment variable."""
     return os.getenv('COMET_EVAL_LOG_IMAGE_PREDICTIONS', 'true').lower() == 'true'
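Note: all of these helpers share one pattern: read an environment variable, fall back to a default, and coerce the type at the call site. A minimal sketch of the same pattern, using a hypothetical MYAPP_LOG_INTERVAL variable for illustration:

    import os

    def get_log_interval() -> int:
        # hypothetical variable name; coerce to int with a default of 1
        return int(os.getenv('MYAPP_LOG_INTERVAL', 1))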
@@ -104,9 +111,10 @@ def _fetch_trainer_metadata(trainer):

 def _scale_bounding_box_to_original_image_shape(box, resized_image_shape, original_image_shape, ratio_pad):
-    """YOLOv8 resizes images during training and the label values
-    are normalized based on this resized shape. This function rescales the
-    bounding box labels to the original image shape.
+    """
+    YOLOv8 resizes images during training and the label values are normalized based on this resized shape.
+
+    This function rescales the bounding box labels to the original image shape.
     """
     resized_image_height, resized_image_width = resized_image_shape
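The rescaling described here is the usual letterbox inversion: denormalize by the resized shape, subtract the padding, then divide by the resize gain. A rough sketch, assuming a YOLO-style ratio_pad = (gain, (pad_w, pad_h)) convention (the function in this file may differ in detail):

    def scale_box_to_original(box, resized_shape, original_shape, ratio_pad):
        # box is (x, y, w, h) normalized to the resized image
        rh, rw = resized_shape
        gain, (pad_w, pad_h) = ratio_pad  # assumed letterbox convention
        x, y, w, h = box[0] * rw, box[1] * rh, box[2] * rw, box[3] * rh
        return [(x - pad_w) / gain, (y - pad_h) / gain, w / gain, h / gain]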

View File

@@ -25,6 +25,7 @@ except (ImportError, AssertionError, TypeError):

 def _log_images(path, prefix=''):
+    """Logs images at specified path with an optional prefix using DVCLive."""
     if live:
         name = path.name

@@ -38,6 +39,7 @@ def _log_images(path, prefix=''):

 def _log_plots(plots, prefix=''):
+    """Logs plot images for training progress if they have not been previously processed."""
     for name, params in plots.items():
         timestamp = params['timestamp']
         if _processed_plots.get(name) != timestamp:

@@ -46,6 +48,7 @@ def _log_plots(plots, prefix=''):

 def _log_confusion_matrix(validator):
+    """Logs the confusion matrix for the given validator using DVCLive."""
     targets = []
     preds = []
     matrix = validator.confusion_matrix.matrix

@@ -62,6 +65,7 @@ def _log_confusion_matrix(validator):

 def on_pretrain_routine_start(trainer):
+    """Initializes DVCLive logger for training metadata during pre-training routine."""
     try:
         global live
         live = dvclive.Live(save_dvc_exp=True, cache_images=True)

@@ -71,20 +75,24 @@ def on_pretrain_routine_start(trainer):

 def on_pretrain_routine_end(trainer):
+    """Logs plots related to the training process at the end of the pretraining routine."""
     _log_plots(trainer.plots, 'train')


 def on_train_start(trainer):
+    """Logs the training parameters if DVCLive logging is active."""
     if live:
         live.log_params(trainer.args)


 def on_train_epoch_start(trainer):
+    """Sets the global variable _training_epoch value to True at the start of training each epoch."""
     global _training_epoch
     _training_epoch = True


 def on_fit_epoch_end(trainer):
+    """Logs training metrics and model info, and advances to next step on the end of each fit epoch."""
     global _training_epoch
     if live and _training_epoch:
         all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix='train'), **trainer.metrics, **trainer.lr}

@@ -104,6 +112,7 @@ def on_fit_epoch_end(trainer):

 def on_train_end(trainer):
+    """Logs the best metrics, plots, and confusion matrix at the end of training if DVCLive is active."""
     if live:
         # At the end log the best metrics. It runs validator on the best model internally.
         all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix='train'), **trainer.metrics, **trainer.lr}
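These on_* functions are hook implementations; integration modules in this codebase typically expose them through a callbacks dictionary that is empty when the logger is unavailable. A sketch of that registration pattern, assuming the module-level dvclive import is None on failure (the exact dictionary in this file may differ):

    callbacks = {
        'on_pretrain_routine_start': on_pretrain_routine_start,
        'on_train_start': on_train_start,
        'on_fit_epoch_end': on_fit_epoch_end,
        'on_train_end': on_train_end} if dvclive else {}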

View File

@@ -31,14 +31,13 @@ def _log_images(imgs_dict, group=''):

 def _log_plot(title, plot_path):
-    """Log plots to the NeptuneAI experiment logger."""
     """
-    Log image as plot in the plot section of NeptuneAI
+    Log plots to the NeptuneAI experiment logger.

-    arguments:
-    title (str) Title of the plot
-    plot_path (PosixPath or str) Path to the saved image file
+    Args:
+        title (str): Title of the plot.
+        plot_path (PosixPath | str): Path to the saved image file.
     """
     import matplotlib.image as mpimg
     import matplotlib.pyplot as plt

View File

@@ -17,6 +17,7 @@ except (ImportError, AssertionError):

 def _log_plots(plots, step):
+    """Logs plots from the input dictionary if they haven't been logged already at the specified step."""
     for name, params in plots.items():
         timestamp = params['timestamp']
         if _processed_plots.get(name) != timestamp:

View File

@@ -64,8 +64,8 @@ def parse_requirements(file_path=ROOT.parent / 'requirements.txt', package=''):

 def parse_version(version='0.0.0') -> tuple:
     """
-    Convert a version string to a tuple of integers, ignoring any extra non-numeric string attached to the version.
-    This function replaces deprecated 'pkg_resources.parse_version(v)'
+    Convert a version string to a tuple of integers, ignoring any extra non-numeric string attached to the version. This
+    function replaces deprecated 'pkg_resources.parse_version(v)'.

     Args:
         version (str): Version string, i.e. '2.0.1+cpu'
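A function with this contract can be sketched with a regex that pulls out the leading numeric fields and discards suffixes like '+cpu' (a sketch, not necessarily the exact body in checks.py):

    import re

    def parse_version(version='0.0.0') -> tuple:
        # '2.0.1+cpu' -> (2, 0, 1); keep the first three numeric groups
        return tuple(map(int, re.findall(r'\d+', version)[:3]))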
@@ -372,8 +372,10 @@ def check_torchvision():
     Checks the installed versions of PyTorch and Torchvision to ensure they're compatible.

     This function checks the installed versions of PyTorch and Torchvision, and warns if they're incompatible according
-    to the provided compatibility table based on https://github.com/pytorch/vision#installation. The
-    compatibility table is a dictionary where the keys are PyTorch versions and the values are lists of compatible
+    to the provided compatibility table based on:
+    https://github.com/pytorch/vision#installation.
+
+    The compatibility table is a dictionary where the keys are PyTorch versions and the values are lists of compatible
     Torchvision versions.
     """
@@ -527,9 +529,9 @@ def collect_system_info():

 def check_amp(model):
     """
-    This function checks the PyTorch Automatic Mixed Precision (AMP) functionality of a YOLOv8 model.
-    If the checks fail, it means there are anomalies with AMP on the system that may cause NaN losses or zero-mAP
-    results, so AMP will be disabled during training.
+    This function checks the PyTorch Automatic Mixed Precision (AMP) functionality of a YOLOv8 model. If the checks
+    fail, it means there are anomalies with AMP on the system that may cause NaN losses or zero-mAP results, so AMP will
+    be disabled during training.

     Args:
         model (nn.Module): A YOLOv8 model instance.
@@ -606,7 +608,8 @@ def print_args(args: Optional[dict] = None, show_file=True, show_func=False):

 def cuda_device_count() -> int:
-    """Get the number of NVIDIA GPUs available in the environment.
+    """
+    Get the number of NVIDIA GPUs available in the environment.

     Returns:
         (int): The number of NVIDIA GPUs available.

@@ -626,7 +629,8 @@ def cuda_device_count() -> int:

 def cuda_is_available() -> bool:
-    """Check if CUDA is available in the environment.
+    """
+    Check if CUDA is available in the environment.

     Returns:
         (bool): True if one or more NVIDIA GPUs are available, False otherwise.
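One way to implement this contract without initializing torch.cuda is to shell out to nvidia-smi; a sketch under the assumption that the binary is on PATH (the bodies in checks.py may query differently):

    import subprocess

    def cuda_device_count() -> int:
        try:
            # nvidia-smi --list-gpus prints one line per GPU
            out = subprocess.check_output(['nvidia-smi', '--list-gpus'], text=True)
            return len(out.strip().splitlines())
        except (subprocess.CalledProcessError, FileNotFoundError):
            return 0

    def cuda_is_available() -> bool:
        return cuda_device_count() > 0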

View File

@@ -13,7 +13,8 @@ from .torch_utils import TORCH_1_9

 def find_free_network_port() -> int:
-    """Finds a free port on localhost.
+    """
+    Finds a free port on localhost.

     It is useful in single-node training when we don't want to connect to a real main node but have to set the
     `MASTER_PORT` environment variable.
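The standard trick for this is to bind to port 0 and let the OS pick a free port; a minimal sketch:

    import socket

    def find_free_network_port() -> int:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(('127.0.0.1', 0))  # port 0 asks the OS for any free port
            return s.getsockname()[1]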

View File

@@ -69,8 +69,8 @@ def delete_dsstore(path, files_to_delete=('.DS_Store', '__MACOSX')):

 def zip_directory(directory, compress=True, exclude=('.DS_Store', '__MACOSX'), progress=True):
     """
-    Zips the contents of a directory, excluding files containing strings in the exclude list.
-    The resulting zip file is named after the directory and placed alongside it.
+    Zips the contents of a directory, excluding files containing strings in the exclude list. The resulting zip file is
+    named after the directory and placed alongside it.

     Args:
         directory (str | Path): The path to the directory to be zipped.
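A sketch of a function matching this docstring, using only the standard library (the real body adds a progress bar and more checks):

    from pathlib import Path
    from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile

    def zip_directory(directory, compress=True, exclude=('.DS_Store', '__MACOSX')):
        directory = Path(directory)
        files = [f for f in directory.rglob('*') if f.is_file() and all(x not in f.name for x in exclude)]
        zip_file = directory.with_suffix('.zip')  # named after the directory, placed alongside it
        with ZipFile(zip_file, 'w', ZIP_DEFLATED if compress else ZIP_STORED) as zf:
            for f in files:
                zf.write(f, f.relative_to(directory))
        return zip_file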
@@ -341,7 +341,11 @@ def get_github_assets(repo='ultralytics/assets', version='latest', retry=False):

 def attempt_download_asset(file, repo='ultralytics/assets', release='v0.0.0'):
-    """Attempt file download from GitHub release assets if not found locally. release = 'latest', 'v6.2', etc."""
+    """
+    Attempt file download from GitHub release assets if not found locally.
+
+    release = 'latest', 'v6.2', etc.
+    """
     from ultralytics.utils import SETTINGS  # scoped for circular import

     # YOLOv3/5u updates

View File

@@ -30,9 +30,9 @@ class WorkingDirectory(contextlib.ContextDecorator):

 @contextmanager
 def spaces_in_path(path):
     """
-    Context manager to handle paths with spaces in their names.
-    If a path contains spaces, it replaces them with underscores, copies the file/directory to the new path,
-    executes the context code block, then copies the file/directory back to its original location.
+    Context manager to handle paths with spaces in their names. If a path contains spaces, it replaces them with
+    underscores, copies the file/directory to the new path, executes the context code block, then copies the
+    file/directory back to its original location.

     Args:
         path (str | Path): The original path.
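Typical usage, assuming the context manager yields the temporary underscored path (train() below is a hypothetical stand-in for the work done inside the block):

    with spaces_in_path('data/my dataset') as safe_path:
        # work with safe_path ('data/my_dataset'); results are copied back on exit
        train(data=safe_path)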

View File

@@ -32,9 +32,14 @@ __all__ = 'Bboxes',  # tuple or list

 class Bboxes:
-    """Bounding Boxes class. Only numpy variables are supported."""
+    """
+    Bounding Boxes class.
+
+    Only numpy variables are supported.
+    """

     def __init__(self, bboxes, format='xyxy') -> None:
+        """Initializes the Bboxes class with bounding box data in a specified format."""
         assert format in _formats, f'Invalid bounding box format: {format}, format must be one of {_formats}'
         bboxes = bboxes[None, :] if bboxes.ndim == 1 else bboxes
         assert bboxes.ndim == 2

@@ -194,7 +199,7 @@ class Instances:
         return self._bboxes.areas()

     def scale(self, scale_w, scale_h, bbox_only=False):
-        """this might be similar with denormalize func but without normalized sign."""
+        """This might be similar with denormalize func but without normalized sign."""
         self._bboxes.mul(scale=(scale_w, scale_h, scale_w, scale_h))
         if bbox_only:
             return

@@ -307,7 +312,11 @@ class Instances:
         self.keypoints[..., 1] = self.keypoints[..., 1].clip(0, h)

     def remove_zero_area_boxes(self):
-        """Remove zero-area boxes, i.e. after clipping some boxes may have zero width or height. This removes them."""
+        """
+        Remove zero-area boxes, i.e. after clipping some boxes may have zero width or height.
+
+        This removes them.
+        """
         good = self.bbox_areas > 0
         if not all(good):
             self._bboxes = self._bboxes[good]
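A short usage sketch for the Bboxes class, assuming it exposes a convert() method between the supported _formats (only __init__ is visible in this hunk):

    import numpy as np

    boxes = Bboxes(np.array([[10, 20, 110, 220]], dtype=np.float32), format='xyxy')
    boxes.convert('xywh')  # assumed API: rewrite coordinates in place to the new format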

View File

@@ -13,7 +13,11 @@ from .tal import bbox2dist

 class VarifocalLoss(nn.Module):
-    """Varifocal loss by Zhang et al. https://arxiv.org/abs/2008.13367."""
+    """
+    Varifocal loss by Zhang et al.
+
+    https://arxiv.org/abs/2008.13367.
+    """

     def __init__(self):
         """Initialize the VarifocalLoss class."""

@@ -33,6 +37,7 @@ class FocalLoss(nn.Module):
     """Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)."""

     def __init__(self, ):
+        """Initializer for FocalLoss class with no parameters."""
         super().__init__()

     @staticmethod

@@ -93,6 +98,7 @@ class KeypointLoss(nn.Module):
     """Criterion class for computing training losses."""

     def __init__(self, sigmas) -> None:
+        """Initialize the KeypointLoss class."""
         super().__init__()
         self.sigmas = sigmas

View File

@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-"""
-Model validation metrics
-"""
+"""Model validation metrics."""

 import math
 import warnings
@@ -195,7 +193,7 @@ class ConfusionMatrix:
     def process_cls_preds(self, preds, targets):
         """
-        Update confusion matrix for classification task
+        Update confusion matrix for classification task.

         Args:
             preds (Array[N, min(nc,5)]): Predicted class labels.
@@ -308,9 +306,7 @@ class ConfusionMatrix:
                 on_plot(plot_fname)

     def print(self):
-        """
-        Print the confusion matrix to the console.
-        """
+        """Print the confusion matrix to the console."""
         for i in range(self.nc + 1):
             LOGGER.info(' '.join(map(str, self.matrix[i])))
@@ -440,7 +436,6 @@ def ap_per_class(tp,
         f1 (np.ndarray): F1-score values at each confidence threshold.
         ap (np.ndarray): Average precision for each class at different IoU thresholds.
         unique_classes (np.ndarray): An array of unique classes that have data.
-
     """

     # Sort by objectness
@@ -498,32 +493,33 @@ class Metric(SimpleClass):

 class Metric(SimpleClass):
     """
     Class for computing evaluation metrics for YOLOv8 model.

     Attributes:
         p (list): Precision for each class. Shape: (nc,).
         r (list): Recall for each class. Shape: (nc,).
         f1 (list): F1 score for each class. Shape: (nc,).
         all_ap (list): AP scores for all classes and all IoU thresholds. Shape: (nc, 10).
         ap_class_index (list): Index of class for each AP score. Shape: (nc,).
         nc (int): Number of classes.

     Methods:
         ap50(): AP at IoU threshold of 0.5 for all classes. Returns: List of AP scores. Shape: (nc,) or [].
         ap(): AP at IoU thresholds from 0.5 to 0.95 for all classes. Returns: List of AP scores. Shape: (nc,) or [].
         mp(): Mean precision of all classes. Returns: Float.
         mr(): Mean recall of all classes. Returns: Float.
         map50(): Mean AP at IoU threshold of 0.5 for all classes. Returns: Float.
         map75(): Mean AP at IoU threshold of 0.75 for all classes. Returns: Float.
         map(): Mean AP at IoU thresholds from 0.5 to 0.95 for all classes. Returns: Float.
         mean_results(): Mean of results, returns mp, mr, map50, map.
         class_result(i): Class-aware result, returns p[i], r[i], ap50[i], ap[i].
         maps(): mAP of each class. Returns: Array of mAP scores, shape: (nc,).
         fitness(): Model fitness as a weighted combination of metrics. Returns: Float.
         update(results): Update metric attributes with new evaluation results.
     """

     def __init__(self) -> None:
+        """Initializes a Metric instance for computing evaluation metrics for the YOLOv8 model."""
         self.p = []  # (nc, )
         self.r = []  # (nc, )
         self.f1 = []  # (nc, )
@@ -606,12 +602,12 @@ class Metric(SimpleClass):
         return [self.mp, self.mr, self.map50, self.map]

     def class_result(self, i):
-        """class-aware result, return p[i], r[i], ap50[i], ap[i]."""
+        """Class-aware result, return p[i], r[i], ap50[i], ap[i]."""
         return self.p[i], self.r[i], self.ap50[i], self.ap[i]

     @property
     def maps(self):
-        """mAP of each class."""
+        """MAP of each class."""
         maps = np.zeros(self.nc) + self.map
         for i, c in enumerate(self.ap_class_index):
             maps[c] = self.ap[i]
@@ -672,6 +668,7 @@ class DetMetrics(SimpleClass):
     """

     def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None:
+        """Initialize a DetMetrics instance with a save directory, plot flag, callback function, and class names."""
         self.save_dir = save_dir
         self.plot = plot
         self.on_plot = on_plot

@@ -756,6 +753,7 @@ class SegmentMetrics(SimpleClass):
     """

     def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None:
+        """Initialize a SegmentMetrics instance with a save directory, plot flag, callback function, and class names."""
         self.save_dir = save_dir
         self.plot = plot
         self.on_plot = on_plot

@@ -865,6 +863,7 @@ class PoseMetrics(SegmentMetrics):
     """

     def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None:
+        """Initialize the PoseMetrics class with directory path, class names, and plotting options."""
         super().__init__(save_dir, plot, names)
         self.save_dir = save_dir
         self.plot = plot

@@ -954,6 +953,7 @@ class ClassifyMetrics(SimpleClass):
     """

     def __init__(self) -> None:
+        """Initialize a ClassifyMetrics instance."""
         self.top1 = 0
         self.top5 = 0
         self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0}

View File

@@ -50,6 +50,7 @@ class Profile(contextlib.ContextDecorator):
         self.t += self.dt  # accumulate dt

     def __str__(self):
+        """Returns a human-readable string representing the accumulated elapsed time in the profiler."""
         return f'Elapsed time is {self.t} s'

     def time(self):
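Since Profile subclasses contextlib.ContextDecorator, it works as both a context manager and a decorator; a usage sketch (run_inference() is a hypothetical workload):

    dt = Profile()
    with dt:
        run_inference()  # hypothetical: the code being timed
    print(dt)  # -> 'Elapsed time is ... s', via the __str__ above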
@@ -303,7 +304,7 @@ def clip_coords(coords, shape):

 def scale_image(masks, im0_shape, ratio_pad=None):
     """
-    Takes a mask, and resizes it to the original image size
+    Takes a mask, and resizes it to the original image size.

     Args:
         masks (np.ndarray): resized and padded masks/images, [h, w, num]/[h, w, 3].
@@ -403,8 +404,8 @@ def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):

 def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
     """
-    Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height, normalized) format.
-    x, y, width and height are normalized to image dimensions
+    Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height, normalized) format. x, y,
+    width and height are normalized to image dimensions.

     Args:
         x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x1, y1, x2, y2) format.
@@ -445,7 +446,7 @@ def xywh2ltwh(x):

 def xyxy2ltwh(x):
     """
-    Convert nx4 bounding boxes from [x1, y1, x2, y2] to [x1, y1, w, h], where xy1=top-left, xy2=bottom-right
+    Convert nx4 bounding boxes from [x1, y1, x2, y2] to [x1, y1, w, h], where xy1=top-left, xy2=bottom-right.

     Args:
         x (np.ndarray | torch.Tensor): The input tensor with the bounding boxes coordinates in the xyxy format
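These conversions are all small arithmetic rewrites of the four box columns; xyxy to ltwh, for instance, only turns the bottom-right corner into a width and height. A sketch for the numpy case:

    import numpy as np

    def xyxy2ltwh(x: np.ndarray) -> np.ndarray:
        y = x.copy()
        y[..., 2] = x[..., 2] - x[..., 0]  # width = x2 - x1
        y[..., 3] = x[..., 3] - x[..., 1]  # height = y2 - y1
        return y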
@@ -461,7 +462,7 @@ def xyxy2ltwh(x):

 def ltwh2xywh(x):
     """
-    Convert nx4 boxes from [x1, y1, w, h] to [x, y, w, h] where xy1=top-left, xy=center
+    Convert nx4 boxes from [x1, y1, w, h] to [x, y, w, h] where xy1=top-left, xy=center.

     Args:
         x (torch.Tensor): the input tensor
@@ -544,7 +545,7 @@ def xywhr2xyxyxyxy(center):

 def ltwh2xyxy(x):
     """
-    It converts the bounding box from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
+    It converts the bounding box from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right.

     Args:
         x (np.ndarray | torch.Tensor): the input image
@@ -616,8 +617,8 @@ def crop_mask(masks, boxes):

 def process_mask_upsample(protos, masks_in, bboxes, shape):
     """
-    Takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher
-    quality but is slower.
+    Takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher quality
+    but is slower.

     Args:
         protos (torch.Tensor): [mask_dim, mask_h, mask_w]
@@ -713,7 +714,7 @@ def scale_masks(masks, shape, padding=True):

 def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize=False, padding=True):
     """
-    Rescale segment coordinates (xy) from img1_shape to img0_shape
+    Rescale segment coordinates (xy) from img1_shape to img0_shape.

     Args:
         img1_shape (tuple): The shape of the image that the coords are from.

View File

@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-"""
-Monkey patches to update/extend functionality of existing functions
-"""
+"""Monkey patches to update/extend functionality of existing functions."""

 from pathlib import Path
@@ -14,7 +12,8 @@ _imshow = cv2.imshow  # copy to avoid recursion errors

 def imread(filename: str, flags: int = cv2.IMREAD_COLOR):
-    """Read an image from a file.
+    """
+    Read an image from a file.

     Args:
         filename (str): Path to the file to read.
@@ -27,7 +26,8 @@ def imread(filename: str, flags: int = cv2.IMREAD_COLOR):

 def imwrite(filename: str, img: np.ndarray, params=None):
-    """Write an image to a file.
+    """
+    Write an image to a file.

     Args:
         filename (str): Path to the file to write.
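The usual reason to patch imread/imwrite at all is non-ASCII path support, which plain cv2.imread/cv2.imwrite lack on some platforms. A sketch of that approach, assumed here since the diff only shows the docstrings:

    from pathlib import Path

    import cv2
    import numpy as np

    def imread(filename: str, flags: int = cv2.IMREAD_COLOR):
        # np.fromfile + imdecode tolerates paths that cv2.imread cannot open
        return cv2.imdecode(np.fromfile(filename, np.uint8), flags)

    def imwrite(filename: str, img: np.ndarray, params=None) -> bool:
        try:
            # imencode + tofile sidesteps cv2.imwrite's path limitations
            cv2.imencode(Path(filename).suffix, img, params or [])[1].tofile(filename)
            return True
        except Exception:
            return False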
@@ -45,7 +45,8 @@ def imwrite(filename: str, img: np.ndarray, params=None):

 def imshow(winname: str, mat: np.ndarray):
-    """Displays an image in the specified window.
+    """
+    Displays an image in the specified window.

     Args:
         winname (str): Name of the window.
@@ -59,7 +60,8 @@ _torch_save = torch.save  # copy to avoid recursion errors

 def torch_save(*args, **kwargs):
-    """Use dill (if exists) to serialize the lambda functions where pickle does not do this.
+    """
+    Use dill (if exists) to serialize the lambda functions where pickle does not do this.

     Args:
         *args (tuple): Positional arguments to pass to torch.save.
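A sketch of the dill fallback the docstring describes, wrapping the saved-off _torch_save reference from above (assumed shape; the real body may differ):

    def torch_save(*args, **kwargs):
        try:
            import dill as pickle  # dill can serialize lambdas, which pickle cannot
        except ImportError:
            import pickle
        kwargs.setdefault('pickle_module', pickle)
        return _torch_save(*args, **kwargs)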

View File

@@ -316,7 +316,8 @@ def plot_labels(boxes, cls, names=(), save_dir=Path(''), on_plot=None):

 def save_one_box(xyxy, im, file=Path('im.jpg'), gain=1.02, pad=10, square=False, BGR=False, save=True):
-    """Save image crop as {file} with crop size multiple {gain} and {pad} pixels. Save and/or return crop.
+    """
+    Save image crop as {file} with crop size multiple {gain} and {pad} pixels. Save and/or return crop.

     This function takes a bounding box and an image, and then saves a cropped portion of the image according
     to the bounding box. Optionally, the crop can be squared, and the function allows for gain and padding

View File

@@ -205,7 +205,11 @@ def fuse_deconv_and_bn(deconv, bn):

 def model_info(model, detailed=False, verbose=True, imgsz=640):
-    """Model information. imgsz may be int or list, i.e. imgsz=640 or imgsz=[640, 320]."""
+    """
+    Model information.
+
+    imgsz may be int or list, i.e. imgsz=640 or imgsz=[640, 320].
+    """
     if not verbose:
         return
     n_p = get_num_params(model)  # number of parameters
@@ -517,13 +521,11 @@ def profile(input, ops, n=10, device=None):

 class EarlyStopping:
-    """
-    Early stopping class that stops training when a specified number of epochs have passed without improvement.
-    """
+    """Early stopping class that stops training when a specified number of epochs have passed without improvement."""

     def __init__(self, patience=50):
         """
-        Initialize early stopping object
+        Initialize early stopping object.

         Args:
             patience (int, optional): Number of epochs to wait after fitness stops improving before stopping.
@@ -535,7 +537,7 @@ class EarlyStopping:

     def __call__(self, epoch, fitness):
         """
-        Check whether to stop training
+        Check whether to stop training.

         Args:
             epoch (int): Current epoch of training
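Usage follows the callable pattern: construct once, call with the current epoch and fitness each epoch, and break when it returns True. A sketch, with validate() as a hypothetical stand-in for the fitness computation:

    stopper = EarlyStopping(patience=50)
    for epoch in range(300):
        fitness = validate(model)  # hypothetical: returns a scalar fitness score
        if stopper(epoch, fitness):
            break  # no improvement for `patience` epochs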

View File

@@ -7,7 +7,8 @@ import numpy as np

 class TritonRemoteModel:
-    """Client for interacting with a remote Triton Inference Server model.
+    """
+    Client for interacting with a remote Triton Inference Server model.

     Attributes:
         endpoint (str): The name of the model on the Triton server.
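A plausible usage sketch for such a client, assuming the constructor accepts a server URL whose path names the endpoint; this signature is an assumption, since only the Attributes section is visible in this hunk:

    import numpy as np

    model = TritonRemoteModel('http://localhost:8000/yolov8')  # assumed constructor signature
    outputs = model(np.random.rand(1, 3, 640, 640).astype(np.float32))  # assumed __call__ on numpy inputs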