diff --git a/ultralytics/engine/__init__.py b/ultralytics/engine/__init__.py
index e69de29b..9e68dc12 100644
--- a/ultralytics/engine/__init__.py
+++ b/ultralytics/engine/__init__.py
@@ -0,0 +1 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
diff --git a/ultralytics/engine/exporter.py b/ultralytics/engine/exporter.py
index 66932a65..522c049b 100644
--- a/ultralytics/engine/exporter.py
+++ b/ultralytics/engine/exporter.py
@@ -140,7 +140,7 @@ class Exporter:
         Args:
             cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
             overrides (dict, optional): Configuration overrides. Defaults to None.
-            _callbacks (list, optional): List of callback functions. Defaults to None.
+            _callbacks (dict, optional): Dictionary of callback functions. Defaults to None.
         """
         self.args = get_cfg(cfg, overrides)
         if self.args.format.lower() in ('coreml', 'mlmodel'):  # fix attempt for protobuf<3.20.x errors
diff --git a/ultralytics/models/fastsam/predict.py b/ultralytics/models/fastsam/predict.py
index b64d2d6e..4a3c2e9e 100644
--- a/ultralytics/models/fastsam/predict.py
+++ b/ultralytics/models/fastsam/predict.py
@@ -9,14 +9,45 @@ from ultralytics.utils import DEFAULT_CFG, ops
 
 
 class FastSAMPredictor(DetectionPredictor):
+    """
+    FastSAMPredictor is specialized for fast SAM (Segment Anything Model) segmentation prediction tasks in the
+    Ultralytics YOLO framework.
+
+    This class extends the DetectionPredictor, customizing the prediction pipeline specifically for fast SAM.
+    It adjusts post-processing steps to incorporate mask prediction and non-max suppression while optimizing
+    for single-class segmentation.
+
+    Attributes:
+        cfg (dict): Configuration parameters for prediction.
+        overrides (dict, optional): Optional parameter overrides for custom behavior.
+        _callbacks (dict, optional): Optional dictionary of callback functions to be invoked during prediction.
+    """
 
     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
-        """Initializes FastSAMPredictor class by inheriting from DetectionPredictor and setting task to 'segment'."""
+        """
+        Initializes the FastSAMPredictor class, inheriting from DetectionPredictor and setting the task to 'segment'.
+
+        Args:
+            cfg (dict): Configuration parameters for prediction.
+            overrides (dict, optional): Optional parameter overrides for custom behavior.
+            _callbacks (dict, optional): Optional dictionary of callback functions to be invoked during prediction.
+        """
        super().__init__(cfg, overrides, _callbacks)
        self.args.task = 'segment'
 
     def postprocess(self, preds, img, orig_imgs):
-        """Postprocesses the predictions, applies non-max suppression, scales the boxes, and returns the results."""
+        """
+        Perform post-processing steps on predictions, including non-max suppression and scaling boxes to the original
+        image size, and return the final results.
+
+        Args:
+            preds (list): The raw output predictions from the model.
+            img (torch.Tensor): The processed image tensor.
+            orig_imgs (list | torch.Tensor): The original image or list of images.
+
+        Returns:
+            (list): A list of Results objects, each containing processed boxes, masks, and other metadata.
+        """
         p = ops.non_max_suppression(
             preds[0],
             self.args.conf,
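A minimal usage sketch for the predictor documented above (assumes an `ultralytics` install; the `FastSAM-s.pt` weights name and the asset path are illustrative, not part of this diff):

```python
# Run FastSAM end to end; FastSAMPredictor is created internally by the model.
from ultralytics import FastSAM

model = FastSAM('FastSAM-s.pt')  # hypothetical local weights file
results = model.predict('ultralytics/assets/bus.jpg', conf=0.4, retina_masks=True)
for r in results:
    print(r.masks.shape if r.masks is not None else 'no masks')
```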
diff --git a/ultralytics/models/fastsam/prompt.py b/ultralytics/models/fastsam/prompt.py
index 5eb581e9..d381075f 100644
--- a/ultralytics/models/fastsam/prompt.py
+++ b/ultralytics/models/fastsam/prompt.py
@@ -13,6 +13,15 @@ from ultralytics.utils import TQDM
 
 
 class FastSAMPrompt:
+    """
+    Fast Segment Anything Model class for image annotation and visualization.
+
+    Attributes:
+        device (str): Computing device ('cuda' or 'cpu').
+        results: Object detection or segmentation results.
+        source: Source image or image path.
+        clip: CLIP model for linear assignment.
+    """
 
     def __init__(self, source, results, device='cuda') -> None:
         """Initializes FastSAMPrompt with given source, results and device, and assigns clip for linear assignment."""
@@ -92,6 +101,20 @@ class FastSAMPrompt:
              better_quality=True,
              retina=False,
              with_contours=True):
+        """
+        Plots annotations, bounding boxes, and points on images and saves the output.
+
+        Args:
+            annotations (list): Annotations to be plotted.
+            output (str | Path): Output directory for saving the plots.
+            bbox (list, optional): Bounding box coordinates [x1, y1, x2, y2]. Defaults to None.
+            points (list, optional): Points to be plotted. Defaults to None.
+            point_label (list, optional): Labels for the points. Defaults to None.
+            mask_random_color (bool, optional): Whether to use random color for masks. Defaults to True.
+            better_quality (bool, optional): Whether to apply morphological transformations for better mask quality. Defaults to True.
+            retina (bool, optional): Whether to use retina mask. Defaults to False.
+            with_contours (bool, optional): Whether to plot contours. Defaults to True.
+        """
         pbar = TQDM(annotations, total=len(annotations))
         for ann in pbar:
             result_name = os.path.basename(ann.path)
@@ -160,6 +183,20 @@ class FastSAMPrompt:
                        target_height=960,
                        target_width=960,
     ):
+        """
+        Quickly shows the mask annotations on the given matplotlib axis.
+
+        Args:
+            annotation (array-like): Mask annotation.
+            ax (matplotlib.axes.Axes): Matplotlib axis.
+            random_color (bool, optional): Whether to use random color for masks. Defaults to False.
+            bbox (list, optional): Bounding box coordinates [x1, y1, x2, y2]. Defaults to None.
+            points (list, optional): Points to be plotted. Defaults to None.
+            pointlabel (list, optional): Labels for the points. Defaults to None.
+            retinamask (bool, optional): Whether to use retina mask. Defaults to True.
+            target_height (int, optional): Target height for resizing. Defaults to 960.
+            target_width (int, optional): Target width for resizing. Defaults to 960.
+        """
         n, h, w = annotation.shape  # batch, height, width
 
         areas = np.sum(annotation, axis=(1, 2))
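A hedged sketch of the prompt-and-plot flow this class documents; the method names follow the existing FastSAM API in this repo, while the weights file, image path, and box coordinates are illustrative assumptions:

```python
# Segment everything, then refine with a box prompt and save the annotated plot.
from ultralytics import FastSAM
from ultralytics.models.fastsam import FastSAMPrompt

model = FastSAM('FastSAM-s.pt')
everything_results = model('ultralytics/assets/bus.jpg', device='cpu', retina_masks=True)
prompt_process = FastSAMPrompt('ultralytics/assets/bus.jpg', everything_results, device='cpu')
ann = prompt_process.everything_prompt()                     # all masks
ann = prompt_process.box_prompt(bbox=[200, 200, 300, 300])   # or keep only the boxed object
prompt_process.plot(annotations=ann, output='./output/')
```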
diff --git a/ultralytics/models/fastsam/val.py b/ultralytics/models/fastsam/val.py
index fa25e49f..4e1e0b01 100644
--- a/ultralytics/models/fastsam/val.py
+++ b/ultralytics/models/fastsam/val.py
@@ -5,9 +5,35 @@ from ultralytics.utils.metrics import SegmentMetrics
 
 
 class FastSAMValidator(SegmentationValidator):
+    """
+    Custom validation class for fast SAM (Segment Anything Model) segmentation in the Ultralytics YOLO framework.
+
+    Extends the SegmentationValidator class, customizing the validation process specifically for fast SAM. This class
+    sets the task to 'segment' and uses the SegmentMetrics for evaluation. Additionally, plotting features are disabled
+    to avoid errors during validation.
+
+    Attributes:
+        dataloader: The data loader object used for validation.
+        save_dir (str): The directory where validation results will be saved.
+        pbar: A progress bar object.
+        args: Additional arguments for customization.
+        _callbacks: Dictionary of callback functions to be invoked during validation.
+    """
 
     def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
-        """Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics."""
+        """
+        Initialize the FastSAMValidator class, setting the task to 'segment' and metrics to SegmentMetrics.
+
+        Args:
+            dataloader (torch.utils.data.DataLoader): Dataloader to be used for validation.
+            save_dir (Path, optional): Directory to save results.
+            pbar (tqdm.tqdm): Progress bar for displaying progress.
+            args (SimpleNamespace): Configuration for the validator.
+            _callbacks (dict): Dictionary to store various callback functions.
+
+        Notes:
+            Plots for ConfusionMatrix and other related metrics are disabled in this class to avoid errors.
+        """
         super().__init__(dataloader, save_dir, pbar, args, _callbacks)
         self.args.task = 'segment'
         self.args.plots = False  # disable ConfusionMatrix and other plots to avoid errors
diff --git a/ultralytics/models/nas/model.py b/ultralytics/models/nas/model.py
index 9a770c4c..00d0b6ed 100644
--- a/ultralytics/models/nas/model.py
+++ b/ultralytics/models/nas/model.py
@@ -23,6 +23,26 @@ from .val import NASValidator
 
 
 class NAS(Model):
+    """
+    YOLO NAS model for object detection.
+
+    This class provides an interface for the YOLO-NAS models and extends the `Model` class from the Ultralytics
+    engine. It is designed to facilitate the task of object detection using pre-trained or custom-trained YOLO-NAS
+    models.
+
+    Example:
+        ```python
+        from ultralytics import NAS
+
+        model = NAS('yolo_nas_s')
+        results = model.predict('ultralytics/assets/bus.jpg')
+        ```
+
+    Attributes:
+        model (str): Path to the pre-trained model or model name. Defaults to 'yolo_nas_s.pt'.
+
+    Note:
+        YOLO-NAS models only support pre-trained models. Do not provide YAML configuration files.
+    """
 
     def __init__(self, model='yolo_nas_s.pt') -> None:
         """Initializes the NAS model with the provided or default 'yolo_nas_s.pt' model."""
diff --git a/ultralytics/models/nas/predict.py b/ultralytics/models/nas/predict.py
index fe06c298..0118527a 100644
--- a/ultralytics/models/nas/predict.py
+++ b/ultralytics/models/nas/predict.py
@@ -8,6 +8,29 @@ from ultralytics.utils import ops
 
 
 class NASPredictor(BasePredictor):
+    """
+    Ultralytics YOLO NAS Predictor for object detection.
+
+    This class extends the `BasePredictor` from the Ultralytics engine and is responsible for post-processing the
+    raw predictions generated by the YOLO NAS models. It applies operations like non-maximum suppression and
+    scaling the bounding boxes to fit the original image dimensions.
+
+    Attributes:
+        args (Namespace): Namespace containing various configurations for post-processing.
+
+    Example:
+        ```python
+        from ultralytics import NAS
+
+        model = NAS('yolo_nas_s')
+        predictor = model.predictor
+        # Assumes that raw_preds, img, orig_imgs are available
+        results = predictor.postprocess(raw_preds, img, orig_imgs)
+        ```
+
+    Note:
+        Typically, this class is not instantiated directly. It is used internally within the `NAS` class.
+    """
 
     def postprocess(self, preds_in, img, orig_imgs):
         """Postprocess predictions and returns a list of Results objects."""
diff --git a/ultralytics/models/nas/val.py b/ultralytics/models/nas/val.py
index 5c39171a..41f60c19 100644
--- a/ultralytics/models/nas/val.py
+++ b/ultralytics/models/nas/val.py
@@ -9,6 +9,30 @@ __all__ = ['NASValidator']
 
 
 class NASValidator(DetectionValidator):
+    """
+    Ultralytics YOLO NAS Validator for object detection.
+
+    Extends `DetectionValidator` from the Ultralytics models package and is designed to post-process the raw
+    predictions generated by YOLO NAS models. It performs non-maximum suppression to remove overlapping and
+    low-confidence boxes, ultimately producing the final detections.
+
+    Attributes:
+        args (Namespace): Namespace containing various configurations for post-processing, such as confidence and IoU
+            thresholds.
+        lb (torch.Tensor): Optional tensor for multilabel NMS.
+
+    Example:
+        ```python
+        from ultralytics import NAS
+
+        model = NAS('yolo_nas_s')
+        validator = model.validator
+        # Assumes that raw_preds are available
+        final_preds = validator.postprocess(raw_preds)
+        ```
+
+    Note:
+        This class is generally not instantiated directly but is used internally within the `NAS` class.
+    """
 
     def postprocess(self, preds_in):
         """Apply Non-maximum suppression to prediction outputs."""
diff --git a/ultralytics/models/rtdetr/val.py b/ultralytics/models/rtdetr/val.py
index d8e5fb69..a6af67ab 100644
--- a/ultralytics/models/rtdetr/val.py
+++ b/ultralytics/models/rtdetr/val.py
@@ -12,14 +12,19 @@ from ultralytics.utils import colorstr, ops
 
 __all__ = 'RTDETRValidator',  # tuple or list
 
 
-# TODO: Temporarily RT-DETR does not need padding.
 class RTDETRDataset(YOLODataset):
+    """
+    Real-Time DEtection and TRacking (RT-DETR) dataset class extending the base YOLODataset class.
+
+    This specialized dataset class is designed for use with the RT-DETR object detection model and is optimized for
+    real-time detection and tracking tasks.
+    """
 
     def __init__(self, *args, data=None, **kwargs):
         """Initialize the RTDETRDataset class by inheriting from the YOLODataset class."""
         super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs)
 
-    # NOTE: add stretch version load_image for rtdetr mosaic
+    # NOTE: add stretch version load_image for RTDETR mosaic
     def load_image(self, i, rect_mode=False):
         """Loads 1 image from dataset index 'i', returns (im, resized hw)."""
         return super().load_image(i=i, rect_mode=rect_mode)
@@ -46,7 +51,11 @@ class RTDETRDataset(YOLODataset):
 
 class RTDETRValidator(DetectionValidator):
     """
-    A class extending the DetectionValidator class for validation based on an RT-DETR detection model.
+    RTDETRValidator extends the DetectionValidator class to provide validation capabilities specifically tailored for
+    the RT-DETR (Real-Time DETR) object detection model.
+
+    The class allows building of an RT-DETR-specific dataset for validation, applies Non-maximum suppression for
+    post-processing, and updates evaluation metrics accordingly.
 
     Example:
         ```python
@@ -56,6 +65,9 @@ class RTDETRValidator(DetectionValidator):
         validator = RTDETRValidator(args=args)
         validator()
         ```
+
+    Note:
+        For further details on the attributes and methods, refer to the parent DetectionValidator class.
     """
 
     def build_dataset(self, img_path, mode='val', batch=None):
diff --git a/ultralytics/models/sam/modules/decoders.py b/ultralytics/models/sam/modules/decoders.py
index a9a3a319..999b5be3 100644
--- a/ultralytics/models/sam/modules/decoders.py
+++ b/ultralytics/models/sam/modules/decoders.py
@@ -10,6 +10,21 @@ from ultralytics.nn.modules import LayerNorm2d
 
 
 class MaskDecoder(nn.Module):
+    """
+    Decoder module for generating masks and their associated quality scores, using a transformer architecture to predict
+    masks given image and prompt embeddings.
+
+    Attributes:
+        transformer_dim (int): Channel dimension for the transformer module.
+        transformer (nn.Module): The transformer module used for mask prediction.
+        num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
+        iou_token (nn.Embedding): Embedding for the IoU token.
+        num_mask_tokens (int): Number of mask tokens.
+        mask_tokens (nn.Embedding): Embedding for the mask tokens.
+        output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
+        output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
+        iou_prediction_head (nn.Module): MLP for predicting mask quality.
+    """
 
     def __init__(
         self,
@@ -136,7 +151,7 @@ class MaskDecoder(nn.Module):
 
 class MLP(nn.Module):
     """
-    Lightly adapted from
+    MLP (Multi-Layer Perceptron) model lightly adapted from
     https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py
     """
 
@@ -148,6 +163,16 @@ class MLP(nn.Module):
         num_layers: int,
         sigmoid_output: bool = False,
     ) -> None:
+        """
+        Initializes the MLP (Multi-Layer Perceptron) model.
+
+        Args:
+            input_dim (int): The dimensionality of the input features.
+            hidden_dim (int): The dimensionality of the hidden layers.
+            output_dim (int): The dimensionality of the output layer.
+            num_layers (int): The number of hidden layers.
+            sigmoid_output (bool, optional): Whether to apply a sigmoid activation to the output layer. Defaults to False.
+        """
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
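A quick sanity sketch of the MLP wrapper documented above; the layer sizes are arbitrary and only serve to show the dimensionality contract:

```python
# Construct the SAM MLP and verify input/output shapes.
import torch
from ultralytics.models.sam.modules.decoders import MLP

mlp = MLP(input_dim=256, hidden_dim=256, output_dim=32, num_layers=3)
x = torch.randn(4, 256)      # batch of 4 feature vectors
print(mlp(x).shape)          # -> torch.Size([4, 32])
```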
diff --git a/ultralytics/models/sam/modules/encoders.py b/ultralytics/models/sam/modules/encoders.py
index b4c07744..67b35970 100644
--- a/ultralytics/models/sam/modules/encoders.py
+++ b/ultralytics/models/sam/modules/encoders.py
@@ -12,6 +12,18 @@ from ultralytics.nn.modules import LayerNorm2d, MLPBlock
 
 # This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
 class ImageEncoderViT(nn.Module):
+    """
+    An image encoder using Vision Transformer (ViT) architecture for encoding an image into a compact latent space. The
+    encoder takes an image, splits it into patches, and processes these patches through a series of transformer blocks.
+    The encoded patches are then processed through a neck to generate the final encoded representation.
+
+    Attributes:
+        img_size (int): Dimension of input images, assumed to be square.
+        patch_embed (PatchEmbed): Module for patch embedding.
+        pos_embed (nn.Parameter, optional): Absolute positional embedding for patches.
+        blocks (nn.ModuleList): List of transformer blocks for processing patch embeddings.
+        neck (nn.Sequential): Neck module to further process the output.
+    """
 
     def __init__(
         self,
@@ -112,6 +124,22 @@ class ImageEncoderViT(nn.Module):
 
 
 class PromptEncoder(nn.Module):
+    """
+    Encodes different types of prompts, including points, boxes, and masks, for input to SAM's mask decoder. The encoder
+    produces both sparse and dense embeddings for the input prompts.
+
+    Attributes:
+        embed_dim (int): Dimension of the embeddings.
+        input_image_size (Tuple[int, int]): Size of the input image as (H, W).
+        image_embedding_size (Tuple[int, int]): Spatial size of the image embedding as (H, W).
+        pe_layer (PositionEmbeddingRandom): Module for random position embedding.
+        num_point_embeddings (int): Number of point embeddings for different types of points.
+        point_embeddings (nn.ModuleList): List of point embeddings.
+        not_a_point_embed (nn.Embedding): Embedding for points that are not a part of any label.
+        mask_input_size (Tuple[int, int]): Size of the input mask.
+        mask_downscaling (nn.Sequential): Neural network for downscaling the mask.
+        no_mask_embed (nn.Embedding): Embedding for cases where no mask is provided.
+    """
 
     def __init__(
         self,
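A hedged construction sketch for PromptEncoder; the argument names follow the upstream SAM signature (in particular `mask_in_chans` is an assumption here, as this diff truncates the `__init__` signature), and all sizes are illustrative:

```python
# Encode a single point prompt into sparse and dense embeddings.
import torch
from ultralytics.models.sam.modules.encoders import PromptEncoder

pe = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),   # H/16 x W/16 for a 1024x1024 input
    input_image_size=(1024, 1024),
    mask_in_chans=16,                # assumed from the upstream SAM signature
)
points = (torch.randn(1, 1, 2), torch.ones(1, 1, dtype=torch.int64))  # (coords, labels)
sparse, dense = pe(points=points, boxes=None, masks=None)
print(sparse.shape, dense.shape)  # point + padding token, plus the dense grid
```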
""" super().__init__() self.image_encoder = image_encoder diff --git a/ultralytics/models/sam/modules/tiny_encoder.py b/ultralytics/models/sam/modules/tiny_encoder.py index d96b3032..595286fe 100644 --- a/ultralytics/models/sam/modules/tiny_encoder.py +++ b/ultralytics/models/sam/modules/tiny_encoder.py @@ -21,6 +21,7 @@ from ultralytics.utils.instance import to_2tuple class Conv2d_BN(torch.nn.Sequential): + """A sequential container that performs 2D convolution followed by batch normalization.""" def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1): """Initializes the MBConv model with given input channels, output channels, expansion ratio, activation, and @@ -35,6 +36,7 @@ class Conv2d_BN(torch.nn.Sequential): class PatchEmbed(nn.Module): + """Embeds images into patches and projects them into a specified embedding dimension.""" def __init__(self, in_chans, embed_dim, resolution, activation): """Initialize the PatchMerging class with specified input, output dimensions, resolution and activation @@ -59,6 +61,7 @@ class PatchEmbed(nn.Module): class MBConv(nn.Module): + """Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture.""" def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path): """Initializes a convolutional layer with specified dimensions, input resolution, depth, and activation @@ -96,6 +99,7 @@ class MBConv(nn.Module): class PatchMerging(nn.Module): + """Merges neighboring patches in the feature map and projects to a new dimension.""" def __init__(self, input_resolution, dim, out_dim, activation): """Initializes the ConvLayer with specific dimension, input resolution, depth, activation, drop path, and other @@ -130,6 +134,11 @@ class PatchMerging(nn.Module): class ConvLayer(nn.Module): + """ + Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv). + + Optionally applies downsample operations to the output, and provides support for gradient checkpointing. + """ def __init__( self, @@ -143,6 +152,20 @@ class ConvLayer(nn.Module): out_dim=None, conv_expand_ratio=4., ): + """ + Initializes the ConvLayer with the given dimensions and settings. + + Args: + dim (int): The dimensionality of the input and output. + input_resolution (Tuple[int, int]): The resolution of the input image. + depth (int): The number of MBConv layers in the block. + activation (Callable): Activation function applied after each convolution. + drop_path (Union[float, List[float]]): Drop path rate. Single float or a list of floats for each MBConv. + downsample (Optional[Callable]): Function for downsampling the output. None to skip downsampling. + use_checkpoint (bool): Whether to use gradient checkpointing to save memory. + out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`. + conv_expand_ratio (float): Expansion ratio for the MBConv layers. + """ super().__init__() self.dim = dim self.input_resolution = input_resolution @@ -171,6 +194,11 @@ class ConvLayer(nn.Module): class Mlp(nn.Module): + """ + Multi-layer Perceptron (MLP) for transformer architectures. + + This layer takes an input with in_features, applies layer normalization and two fully-connected layers. 
diff --git a/ultralytics/models/sam/modules/tiny_encoder.py b/ultralytics/models/sam/modules/tiny_encoder.py
index d96b3032..595286fe 100644
--- a/ultralytics/models/sam/modules/tiny_encoder.py
+++ b/ultralytics/models/sam/modules/tiny_encoder.py
@@ -21,6 +21,7 @@ from ultralytics.utils.instance import to_2tuple
 
 
 class Conv2d_BN(torch.nn.Sequential):
+    """A sequential container that performs 2D convolution followed by batch normalization."""
 
     def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1):
         """Initializes the MBConv model with given input channels, output channels, expansion ratio, activation, and
@@ -35,6 +36,7 @@ class Conv2d_BN(torch.nn.Sequential):
 
 
 class PatchEmbed(nn.Module):
+    """Embeds images into patches and projects them into a specified embedding dimension."""
 
     def __init__(self, in_chans, embed_dim, resolution, activation):
         """Initialize the PatchMerging class with specified input, output dimensions, resolution and activation
@@ -59,6 +61,7 @@ class PatchEmbed(nn.Module):
 
 
 class MBConv(nn.Module):
+    """Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture."""
 
     def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path):
         """Initializes a convolutional layer with specified dimensions, input resolution, depth, and activation
@@ -96,6 +99,7 @@ class MBConv(nn.Module):
 
 
 class PatchMerging(nn.Module):
+    """Merges neighboring patches in the feature map and projects to a new dimension."""
 
     def __init__(self, input_resolution, dim, out_dim, activation):
         """Initializes the ConvLayer with specific dimension, input resolution, depth, activation, drop path, and other
@@ -130,6 +134,11 @@ class PatchMerging(nn.Module):
 
 
 class ConvLayer(nn.Module):
+    """
+    Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).
+
+    Optionally applies downsample operations to the output, and provides support for gradient checkpointing.
+    """
 
     def __init__(
         self,
@@ -143,6 +152,20 @@ class ConvLayer(nn.Module):
         out_dim=None,
         conv_expand_ratio=4.,
     ):
+        """
+        Initializes the ConvLayer with the given dimensions and settings.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): The resolution of the input image.
+            depth (int): The number of MBConv layers in the block.
+            activation (Callable): Activation function applied after each convolution.
+            drop_path (Union[float, List[float]]): Drop path rate. Single float or a list of floats for each MBConv.
+            downsample (Optional[Callable]): Function for downsampling the output. None to skip downsampling.
+            use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
+            out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`.
+            conv_expand_ratio (float): Expansion ratio for the MBConv layers.
+        """
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution
@@ -171,6 +194,11 @@ class ConvLayer(nn.Module):
 
 
 class Mlp(nn.Module):
+    """
+    Multi-layer Perceptron (MLP) for transformer architectures.
+
+    This layer takes an input with in_features, applies layer normalization, and passes it through two
+    fully-connected layers.
+    """
 
     def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
         """Initializes Attention module with the given parameters including dimension, key_dim, number of heads, etc."""
@@ -194,6 +222,14 @@ class Mlp(nn.Module):
 
 
 class Attention(torch.nn.Module):
+    """
+    Multi-head attention module with support for spatial awareness, applying attention biases based on spatial
+    resolution. Implements trainable attention biases for each unique offset between spatial positions in the resolution
+    grid.
+
+    Attributes:
+        ab (Tensor, optional): Cached attention biases for inference, deleted during training.
+    """
 
     def __init__(
         self,
@@ -203,8 +239,21 @@ class Attention(torch.nn.Module):
         attn_ratio=4,
         resolution=(14, 14),
     ):
+        """
+        Initializes the Attention module.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            key_dim (int): The dimensionality of the keys and queries.
+            num_heads (int, optional): Number of attention heads. Default is 8.
+            attn_ratio (float, optional): Attention ratio, affecting the dimensions of the value vectors. Default is 4.
+            resolution (Tuple[int, int], optional): Spatial resolution of the input feature map. Default is (14, 14).
+
+        Raises:
+            AssertionError: If `resolution` is not a tuple of length 2.
+        """
         super().__init__()
-        # (h, w)
+        assert isinstance(resolution, tuple) and len(resolution) == 2
         self.num_heads = num_heads
         self.scale = key_dim ** -0.5
@@ -241,8 +290,9 @@ class Attention(torch.nn.Module):
         else:
             self.ab = self.attention_biases[:, self.attention_bias_idxs]
 
-    def forward(self, x):  # x (B,N,C)
-        B, N, _ = x.shape
+    def forward(self, x):  # x
+        """Performs forward pass over the input tensor 'x' by applying normalization and querying keys/values."""
+        B, N, _ = x.shape  # B, N, C
 
         # Normalization
         x = self.norm(x)
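The "unique offset" bookkeeping described in the Attention docstring can be reproduced in a few lines; an illustrative sketch of the idea, not the module's exact code:

```python
# Count unique spatial offsets on a small grid: biases are learned per unique
# offset, then indexed once per (position, position) pair.
import itertools

resolution = (4, 4)  # illustrative; the module defaults to (14, 14)
points = list(itertools.product(range(resolution[0]), range(resolution[1])))
offsets = {(abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) for p1 in points for p2 in points}
print(len(points) ** 2, 'position pairs share', len(offsets), 'unique offsets')
# -> 256 position pairs share 16 unique offsets for a 4x4 grid
```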
@@ -264,20 +314,7 @@
 
 
 class TinyViTBlock(nn.Module):
-    """
-    TinyViT Block.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int, int]): Input resolution.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        drop (float, optional): Dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        local_conv_size (int): the kernel size of the convolution between Attention and MLP. Default: 3
-        activation (torch.nn): the activation function. Default: nn.GELU
-    """
+    """TinyViT Block that applies self-attention and a local convolution to the input."""
 
     def __init__(
         self,
@@ -291,6 +328,24 @@ class TinyViTBlock(nn.Module):
         local_conv_size=3,
         activation=nn.GELU,
     ):
+        """
+        Initializes the TinyViTBlock.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
+            num_heads (int): Number of attention heads.
+            window_size (int, optional): Window size for attention. Default is 7.
+            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
+            drop (float, optional): Dropout rate. Default is 0.
+            drop_path (float, optional): Stochastic depth rate. Default is 0.
+            local_conv_size (int, optional): The kernel size of the local convolution. Default is 3.
+            activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
+
+        Raises:
+            AssertionError: If `window_size` is not greater than 0.
+            AssertionError: If `dim` is not divisible by `num_heads`.
+        """
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution
@@ -367,24 +422,7 @@ class TinyViTBlock(nn.Module):
 
 
 class BasicLayer(nn.Module):
-    """
-    A basic TinyViT layer for one stage.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        drop (float, optional): Dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        local_conv_size (int): the kernel size of the depthwise convolution between attention and MLP. Default: 3
-        activation (torch.nn): the activation function. Default: nn.GELU
-        out_dim (int | optional): the output dimension of the layer. Default: None
-    """
+    """A basic TinyViT layer for one stage in a TinyViT architecture."""
 
     def __init__(
         self,
@@ -402,6 +440,27 @@ class BasicLayer(nn.Module):
         activation=nn.GELU,
         out_dim=None,
     ):
+        """
+        Initializes the BasicLayer.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
+            depth (int): Number of TinyViT blocks.
+            num_heads (int): Number of attention heads.
+            window_size (int): Local window size.
+            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
+            drop (float, optional): Dropout rate. Default is 0.
+            drop_path (float | tuple[float], optional): Stochastic depth rate. Default is 0.
+            downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default is None.
+            use_checkpoint (bool, optional): Whether to use checkpointing to save memory. Default is False.
+            local_conv_size (int, optional): Kernel size of the local convolution. Default is 3.
+            activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
+            out_dim (int | None, optional): The output dimension of the layer. Default is None.
+
+        Raises:
+            ValueError: If `drop_path` is a list of float but its length doesn't match `depth`.
+        """
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution
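A common way to build the per-block `drop_path` list that BasicLayer accepts is a linear stochastic-depth schedule; a sketch of the usual TinyViT recipe, with the stage depths taken from the documented defaults:

```python
# One drop-path rate per block, increasing linearly across the whole network.
import torch

depths = [2, 2, 6, 2]                 # blocks per stage (TinyViT defaults)
drop_path_rate = 0.1
dpr = torch.linspace(0, drop_path_rate, sum(depths)).tolist()
per_stage = [dpr[sum(depths[:i]):sum(depths[:i + 1])] for i in range(len(depths))]
print(per_stage[2])                   # the 6 rates handed to stage 3's blocks
```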
@@ -456,6 +515,30 @@ class LayerNorm2d(nn.Module):
 
 
 class TinyViT(nn.Module):
+    """
+    The TinyViT architecture for vision tasks.
+
+    Attributes:
+        img_size (int): Input image size.
+        in_chans (int): Number of input channels.
+        num_classes (int): Number of classification classes.
+        embed_dims (List[int]): List of embedding dimensions for each layer.
+        depths (List[int]): List of depths for each layer.
+        num_heads (List[int]): List of number of attention heads for each layer.
+        window_sizes (List[int]): List of window sizes for each layer.
+        mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
+        drop_rate (float): Dropout rate for drop layers.
+        drop_path_rate (float): Drop path rate for stochastic depth.
+        use_checkpoint (bool): Use checkpointing for efficient memory usage.
+        mbconv_expand_ratio (float): Expansion ratio for MBConv layer.
+        local_conv_size (int): Local convolution kernel size.
+        layer_lr_decay (float): Layer-wise learning rate decay.
+
+    Note:
+        This implementation is generalized to accept a list of depths, attention heads,
+        embedding dimensions and window sizes, which allows you to create a
+        "stack" of TinyViT models of varying configurations.
+    """
 
     def __init__(
         self,
@@ -474,6 +557,25 @@ class TinyViT(nn.Module):
         local_conv_size=3,
         layer_lr_decay=1.0,
     ):
+        """
+        Initializes the TinyViT model.
+
+        Args:
+            img_size (int, optional): The input image size. Defaults to 224.
+            in_chans (int, optional): Number of input channels. Defaults to 3.
+            num_classes (int, optional): Number of classification classes. Defaults to 1000.
+            embed_dims (List[int], optional): List of embedding dimensions for each layer. Defaults to [96, 192, 384, 768].
+            depths (List[int], optional): List of depths for each layer. Defaults to [2, 2, 6, 2].
+            num_heads (List[int], optional): List of number of attention heads for each layer. Defaults to [3, 6, 12, 24].
+            window_sizes (List[int], optional): List of window sizes for each layer. Defaults to [7, 7, 14, 7].
+            mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension. Defaults to 4.
+            drop_rate (float, optional): Dropout rate. Defaults to 0.
+            drop_path_rate (float, optional): Drop path rate for stochastic depth. Defaults to 0.1.
+            use_checkpoint (bool, optional): Whether to use checkpointing for efficient memory usage. Defaults to False.
+            mbconv_expand_ratio (float, optional): Expansion ratio for MBConv layer. Defaults to 4.0.
+            local_conv_size (int, optional): Local convolution kernel size. Defaults to 3.
+            layer_lr_decay (float, optional): Layer-wise learning rate decay. Defaults to 1.0.
+        """
         super().__init__()
         self.img_size = img_size
         self.num_classes = num_classes
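A hedged sketch of the "stack of varying configurations" the Note above describes, reducing the documented defaults to two stages; the sizes are illustrative and forward-pass semantics are not exercised here, only construction:

```python
# Build a small two-stage TinyViT and report its size.
from ultralytics.models.sam.modules.tiny_encoder import TinyViT

model = TinyViT(img_size=224, in_chans=3, num_classes=10,
                embed_dims=[64, 128], depths=[2, 2],
                num_heads=[2, 4], window_sizes=[7, 7])
n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params / 1e6:.1f}M parameters')
```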
diff --git a/ultralytics/models/sam/modules/transformer.py b/ultralytics/models/sam/modules/transformer.py
index 95a04666..5c06acd9 100644
--- a/ultralytics/models/sam/modules/transformer.py
+++ b/ultralytics/models/sam/modules/transformer.py
@@ -10,6 +10,21 @@ from ultralytics.nn.modules import MLPBlock
 
 
 class TwoWayTransformer(nn.Module):
+    """
+    A Two-Way Transformer module that enables the simultaneous attention to both image and query points. This class
+    serves as a specialized transformer decoder that attends to an input image using queries whose positional embedding
+    is supplied. This is particularly useful for tasks like object detection, image segmentation, and point cloud
+    processing.
+
+    Attributes:
+        depth (int): The number of layers in the transformer.
+        embedding_dim (int): The channel dimension for the input embeddings.
+        num_heads (int): The number of heads for multihead attention.
+        mlp_dim (int): The internal channel dimension for the MLP block.
+        layers (nn.ModuleList): The list of TwoWayAttentionBlock layers that make up the transformer.
+        final_attn_token_to_image (Attention): The final attention layer applied from the queries to the image.
+        norm_final_attn (nn.LayerNorm): The layer normalization applied to the final queries.
+    """
 
     def __init__(
         self,
@@ -98,6 +113,23 @@ class TwoWayTransformer(nn.Module):
 
 
 class TwoWayAttentionBlock(nn.Module):
+    """
+    An attention block that performs both self-attention and cross-attention in two directions: queries to keys and
+    keys to queries. This block consists of four main layers: (1) self-attention on sparse inputs, (2) cross-attention
+    of sparse inputs to dense inputs, (3) an MLP block on sparse inputs, and (4) cross-attention of dense inputs to
+    sparse inputs.
+
+    Attributes:
+        self_attn (Attention): The self-attention layer for the queries.
+        norm1 (nn.LayerNorm): Layer normalization following the first attention block.
+        cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
+        norm2 (nn.LayerNorm): Layer normalization following the second attention block.
+        mlp (MLPBlock): MLP block that transforms the query embeddings.
+        norm3 (nn.LayerNorm): Layer normalization following the MLP block.
+        norm4 (nn.LayerNorm): Layer normalization following the third attention block.
+        cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
+        skip_first_layer_pe (bool): Whether to skip the positional encoding in the first layer.
+    """
 
     def __init__(
         self,
@@ -180,6 +212,17 @@ class Attention(nn.Module):
         num_heads: int,
         downsample_rate: int = 1,
     ) -> None:
+        """
+        Initializes the Attention model with the given dimensions and settings.
+
+        Args:
+            embedding_dim (int): The dimensionality of the input embeddings.
+            num_heads (int): The number of attention heads.
+            downsample_rate (int, optional): The factor by which the internal dimensions are downsampled. Defaults to 1.
+
+        Raises:
+            AssertionError: If 'num_heads' does not evenly divide the internal dimension (embedding_dim / downsample_rate).
+        """
         super().__init__()
         self.embedding_dim = embedding_dim
         self.internal_dim = embedding_dim // downsample_rate
@@ -191,13 +234,15 @@ class Attention(nn.Module):
         self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
         self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
 
-    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+    @staticmethod
+    def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
         """Separate the input tensor into the specified number of attention heads."""
         b, n, c = x.shape
         x = x.reshape(b, n, num_heads, c // num_heads)
         return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
 
-    def _recombine_heads(self, x: Tensor) -> Tensor:
+    @staticmethod
+    def _recombine_heads(x: Tensor) -> Tensor:
         """Recombine the separated attention heads into a single tensor."""
         b, n_heads, n_tokens, c_per_head = x.shape
         x = x.transpose(1, 2)
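A shape walk-through of the `_separate_heads`/`_recombine_heads` pair made static above, in plain torch and independent of the class:

```python
# Split (B, N, C) into heads and recombine losslessly.
import torch

b, n, c, num_heads = 2, 5, 256, 8
x = torch.randn(b, n, c)
heads = x.reshape(b, n, num_heads, c // num_heads).transpose(1, 2)
print(heads.shape)                    # -> torch.Size([2, 8, 5, 32])
back = heads.transpose(1, 2).reshape(b, n, c)
print(torch.equal(back, x))           # -> True
```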
diff --git a/ultralytics/models/sam/predict.py b/ultralytics/models/sam/predict.py
index 31e0da93..1a3b8dfb 100644
--- a/ultralytics/models/sam/predict.py
+++ b/ultralytics/models/sam/predict.py
@@ -17,6 +17,24 @@ from .build import build_sam
 
 
 class Predictor(BasePredictor):
+    """
+    A prediction class for segmentation tasks, extending the BasePredictor.
+
+    This class serves as an interface for model inference for segmentation tasks.
+    It can preprocess input images, perform inference, and postprocess the output.
+    It also supports handling various types of input prompts including bounding boxes,
+    points, and low-resolution masks for better prediction results.
+
+    Attributes:
+        cfg (dict): Configuration dictionary.
+        overrides (dict): Dictionary of overriding values.
+        _callbacks (dict): Dictionary of callback functions.
+        args (namespace): Argument namespace.
+        im (torch.Tensor): Preprocessed image for current prediction.
+        features (torch.Tensor): Image features.
+        prompts (dict): Dictionary of prompts like bboxes, points, masks.
+        segment_all (bool): Whether to perform segmentation on all objects or not.
+    """
 
     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
         """Initializes the Predictor class with default or provided configuration, overrides, and callbacks."""
diff --git a/ultralytics/models/utils/loss.py b/ultralytics/models/utils/loss.py
index 77eadce7..a61d20a9 100644
--- a/ultralytics/models/utils/loss.py
+++ b/ultralytics/models/utils/loss.py
@@ -11,6 +11,24 @@ from .ops import HungarianMatcher
 
 
 class DETRLoss(nn.Module):
+    """
+    DETR (DEtection TRansformer) Loss class. This class calculates and returns the different loss components for the
+    DETR object detection model. It computes classification loss, bounding box loss, GIoU loss, and optionally auxiliary
+    losses.
+
+    Attributes:
+        nc (int): The number of classes.
+        loss_gain (dict): Coefficients for different loss components.
+        aux_loss (bool): Whether to compute auxiliary losses.
+        use_fl (bool): Whether to use FocalLoss.
+        use_vfl (bool): Whether to use VarifocalLoss.
+        use_uni_match (bool): Whether to use a fixed layer to assign labels for the auxiliary branch.
+        uni_match_ind (int): The fixed index of the layer to use if `use_uni_match` is True.
+        matcher (HungarianMatcher): Object to compute matching cost and indices.
+        fl (FocalLoss | None): Focal Loss object if `use_fl` is True, otherwise None.
+        vfl (VarifocalLoss | None): Varifocal Loss object if `use_vfl` is True, otherwise None.
+        device (torch.device): Device on which tensors are stored.
+    """
 
     def __init__(self,
                  nc=80,
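A hedged construction sketch for the criterion documented above; only `nc`, `aux_loss`, and `use_fl` come from the documented signature, and the default `loss_gain` coefficients printed here are whatever this version of the class ships with:

```python
# Build a standalone DETR criterion and inspect its loss coefficients.
from ultralytics.models.utils.loss import DETRLoss

criterion = DETRLoss(nc=80, aux_loss=True, use_fl=True)
print(criterion.loss_gain)  # coefficients for the class/bbox/giou terms
```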
diff --git a/ultralytics/nn/modules/block.py b/ultralytics/nn/modules/block.py
index 593ae24c..9a65395f 100644
--- a/ultralytics/nn/modules/block.py
+++ b/ultralytics/nn/modules/block.py
@@ -37,7 +37,12 @@ class DFL(nn.Module):
 class Proto(nn.Module):
     """YOLOv8 mask Proto module for segmentation models."""
 
-    def __init__(self, c1, c_=256, c2=32):  # ch_in, number of protos, number of masks
+    def __init__(self, c1, c_=256, c2=32):
+        """
+        Initializes the YOLOv8 mask Proto module with specified number of protos and masks.
+
+        Input arguments are ch_in, number of protos, number of masks.
+        """
         super().__init__()
         self.cv1 = Conv(c1, c_, k=3)
         self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True)  # nn.Upsample(scale_factor=2, mode='nearest')
@@ -124,7 +129,12 @@ class SPP(nn.Module):
 class SPPF(nn.Module):
     """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
 
-    def __init__(self, c1, c2, k=5):  # equivalent to SPP(k=(5, 9, 13))
+    def __init__(self, c1, c2, k=5):
+        """
+        Initializes the SPPF layer with given input/output channels and kernel size.
+
+        This module is equivalent to SPP(k=(5, 9, 13)).
+        """
         super().__init__()
         c_ = c1 // 2  # hidden channels
         self.cv1 = Conv(c1, c_, 1, 1)
@@ -142,7 +152,8 @@ class SPPF(nn.Module):
 class C1(nn.Module):
     """CSP Bottleneck with 1 convolution."""
 
-    def __init__(self, c1, c2, n=1):  # ch_in, ch_out, number
+    def __init__(self, c1, c2, n=1):
+        """Initializes the CSP Bottleneck with configurations for 1 convolution with arguments ch_in, ch_out, number."""
         super().__init__()
         self.cv1 = Conv(c1, c2, 1, 1)
         self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
@@ -156,7 +167,10 @@ class C1(nn.Module):
 class C2(nn.Module):
     """CSP Bottleneck with 2 convolutions."""
 
-    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
+        """Initializes the CSP Bottleneck with 2 convolutions; arguments are ch_in, ch_out, number, shortcut, groups,
+        expansion.
+        """
         super().__init__()
         self.c = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@@ -173,7 +187,10 @@ class C2(nn.Module):
 class C2f(nn.Module):
     """Faster Implementation of CSP Bottleneck with 2 convolutions."""
 
-    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
+    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
+        """Initializes a CSP bottleneck layer with two convolutions; arguments are ch_in, ch_out, number, shortcut,
+        groups, expansion.
+        """
         super().__init__()
         self.c = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@@ -196,7 +213,8 @@ class C2f(nn.Module):
 class C3(nn.Module):
     """CSP Bottleneck with 3 convolutions."""
 
-    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
+        """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values."""
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c_, 1, 1)
@@ -259,7 +277,8 @@ class C3Ghost(C3):
 class GhostBottleneck(nn.Module):
     """Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""
 
-    def __init__(self, c1, c2, k=3, s=1):  # ch_in, ch_out, kernel, stride
+    def __init__(self, c1, c2, k=3, s=1):
+        """Initializes GhostBottleneck module with arguments ch_in, ch_out, kernel, stride."""
         super().__init__()
         c_ = c2 // 2
         self.conv = nn.Sequential(
@@ -277,7 +296,10 @@ class GhostBottleneck(nn.Module):
 class Bottleneck(nn.Module):
     """Standard bottleneck."""
 
-    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):  # ch_in, ch_out, shortcut, groups, kernels, expand
+    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
+        """Initializes a bottleneck module with given input/output channels, shortcut option, group, kernels, and
+        expansion.
+        """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c_, k[0], 1)
@@ -292,7 +314,8 @@ class Bottleneck(nn.Module):
 class BottleneckCSP(nn.Module):
     """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
 
-    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
+        """Initializes the CSP Bottleneck given arguments for ch_in, ch_out, number, shortcut, groups, expansion."""
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c_, 1, 1)
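A quick shape check for the blocks whose docstrings were added above; channel sizes are arbitrary, and the sketch only assumes a working `ultralytics` install:

```python
# Both blocks preserve spatial size; C2f maps c1 -> c2 channels.
import torch
from ultralytics.nn.modules.block import C2f, SPPF

x = torch.randn(1, 64, 32, 32)
print(C2f(64, 128, n=2)(x).shape)   # -> torch.Size([1, 128, 32, 32])
print(SPPF(64, 64, k=5)(x).shape)   # -> torch.Size([1, 64, 32, 32])
```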
diff --git a/ultralytics/nn/modules/conv.py b/ultralytics/nn/modules/conv.py
index 21a27009..7fe615d9 100644
--- a/ultralytics/nn/modules/conv.py
+++ b/ultralytics/nn/modules/conv.py
@@ -88,6 +88,7 @@ class DWConv(Conv):
     """Depth-wise convolution."""
 
     def __init__(self, c1, c2, k=1, s=1, d=1, act=True):  # ch_in, ch_out, kernel, stride, dilation, activation
+        """Initialize Depth-wise convolution with given parameters."""
         super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)
 
 
@@ -95,6 +96,7 @@ class DWConvTranspose2d(nn.ConvTranspose2d):
     """Depth-wise transpose convolution."""
 
     def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0):  # ch_in, ch_out, kernel, stride, padding, padding_out
+        """Initialize DWConvTranspose2d class with given parameters."""
         super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))
 
 
@@ -121,12 +123,18 @@ class ConvTranspose(nn.Module):
 class Focus(nn.Module):
     """Focus wh information into c-space."""
 
-    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
+    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
+        """Initializes Focus object with user defined channel, convolution, padding, group and activation values."""
         super().__init__()
         self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act)
         # self.contract = Contract(gain=2)
 
-    def forward(self, x):  # x(b,c,w,h) -> y(b,4c,w/2,h/2)
+    def forward(self, x):
+        """
+        Applies convolution to concatenated tensor and returns the output.
+
+        Input shape is (b,c,w,h); the four pixel slices are concatenated into (b,4c,w/2,h/2) before the convolution.
+        """
         return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1))
         # return self.conv(self.contract(x))
 
@@ -134,7 +142,10 @@ class Focus(nn.Module):
 class GhostConv(nn.Module):
     """Ghost Convolution https://github.com/huawei-noah/ghostnet."""
 
-    def __init__(self, c1, c2, k=1, s=1, g=1, act=True):  # ch_in, ch_out, kernel, stride, groups
+    def __init__(self, c1, c2, k=1, s=1, g=1, act=True):
+        """Initializes the GhostConv object with input channels, output channels, kernel size, stride, groups and
+        activation.
+        """
         super().__init__()
         c_ = c2 // 2  # hidden channels
         self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
@@ -280,7 +291,8 @@ class SpatialAttention(nn.Module):
 class CBAM(nn.Module):
     """Convolutional Block Attention Module."""
 
-    def __init__(self, c1, kernel_size=7):  # ch_in, kernels
+    def __init__(self, c1, kernel_size=7):
+        """Initialize CBAM with given input channel (c1) and kernel size."""
         super().__init__()
         self.channel_attention = ChannelAttention(c1)
         self.spatial_attention = SpatialAttention(kernel_size)
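The space-to-depth slicing inside `Focus.forward`, spelled out on a toy tensor; a standalone sketch showing only the concatenation, not the trailing Conv:

```python
# Four offset slices stack the 2x2 neighborhoods into the channel dimension.
import torch

x = torch.randn(1, 3, 8, 8)
y = torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1)
print(y.shape)  # -> torch.Size([1, 12, 4, 4]): (b,c,w,h) -> (b,4c,w/2,h/2)
```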
diff --git a/ultralytics/nn/modules/head.py b/ultralytics/nn/modules/head.py
index 9e993d79..5ac4e73c 100644
--- a/ultralytics/nn/modules/head.py
+++ b/ultralytics/nn/modules/head.py
@@ -25,7 +25,8 @@ class Detect(nn.Module):
     anchors = torch.empty(0)  # init
     strides = torch.empty(0)  # init
 
-    def __init__(self, nc=80, ch=()):  # detection layer
+    def __init__(self, nc=80, ch=()):
+        """Initializes the YOLOv8 detection layer with specified number of classes and channels."""
         super().__init__()
         self.nc = nc  # number of classes
         self.nl = len(ch)  # number of detection layers
@@ -149,7 +150,10 @@ class Pose(Detect):
 class Classify(nn.Module):
     """YOLOv8 classification head, i.e. x(b,c1,20,20) to x(b,c2)."""
 
-    def __init__(self, c1, c2, k=1, s=1, p=None, g=1):  # ch_in, ch_out, kernel, stride, padding, groups
+    def __init__(self, c1, c2, k=1, s=1, p=None, g=1):
+        """Initializes YOLOv8 classification head with specified input and output channels, kernel size, stride,
+        padding, and groups.
+        """
         super().__init__()
         c_ = 1280  # efficientnet_b0 size
         self.conv = Conv(c1, c_, k, s, p, g)
@@ -166,6 +170,13 @@ class Classify(nn.Module):
 
 
 class RTDETRDecoder(nn.Module):
+    """
+    Real-Time Deformable Transformer Decoder (RTDETRDecoder) module for object detection.
+
+    This decoder module utilizes Transformer architecture along with deformable convolutions to predict bounding boxes
+    and class labels for objects in an image. It integrates features from multiple layers and runs through a series of
+    Transformer decoder layers to output the final predictions.
+    """
     export = False  # export mode
 
     def __init__(
@@ -186,6 +197,26 @@ class RTDETRDecoder(nn.Module):
             label_noise_ratio=0.5,
             box_noise_scale=1.0,
             learnt_init_query=False):
+        """
+        Initializes the RTDETRDecoder module with the given parameters.
+
+        Args:
+            nc (int): Number of classes. Default is 80.
+            ch (tuple): Channels in the backbone feature maps. Default is (512, 1024, 2048).
+            hd (int): Dimension of hidden layers. Default is 256.
+            nq (int): Number of query points. Default is 300.
+            ndp (int): Number of decoder points. Default is 4.
+            nh (int): Number of heads in multi-head attention. Default is 8.
+            ndl (int): Number of decoder layers. Default is 6.
+            d_ffn (int): Dimension of the feed-forward networks. Default is 1024.
+            dropout (float): Dropout rate. Default is 0.
+            act (nn.Module): Activation function. Default is nn.ReLU.
+            eval_idx (int): Evaluation index. Default is -1.
+            nd (int): Number of denoising queries. Default is 100.
+            label_noise_ratio (float): Label noise ratio. Default is 0.5.
+            box_noise_scale (float): Box noise scale. Default is 1.0.
+            learnt_init_query (bool): Whether to learn initial query embeddings. Default is False.
+        """
         super().__init__()
         self.hidden_dim = hd
         self.nhead = nh
diff --git a/ultralytics/nn/tasks.py b/ultralytics/nn/tasks.py
index 6145146b..7633a73c 100644
--- a/ultralytics/nn/tasks.py
+++ b/ultralytics/nn/tasks.py
@@ -375,9 +375,9 @@ class RTDETRDetectionModel(DetectionModel):
     """
     RTDETR (Real-time DEtection and Tracking using Transformers) Detection Model class.
 
-    This class is responsible for constructing the RTDETR architecture, defining loss functions, and
-    facilitating both the training and inference processes. RTDETR is an object detection and tracking model
-    that extends from the DetectionModel base class.
+    This class is responsible for constructing the RTDETR architecture, defining loss functions, and facilitating both
+    the training and inference processes. RTDETR is an object detection and tracking model that extends from the
+    DetectionModel base class.
 
     Attributes:
         cfg (str): The configuration file path or preset string. Default is 'rtdetr-l.yaml'.
@@ -418,7 +418,7 @@ class RTDETRDetectionModel(DetectionModel):
             preds (torch.Tensor, optional): Precomputed model predictions. Defaults to None.
 
         Returns:
-            tuple: A tuple containing the total loss and main three losses in a tensor.
+            (tuple): A tuple containing the total loss and main three losses in a tensor.
         """
         if not hasattr(self, 'criterion'):
             self.criterion = self.init_criterion()
@@ -466,7 +466,7 @@ class RTDETRDetectionModel(DetectionModel):
             augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
 
         Returns:
-            torch.Tensor: Model's output tensor.
+            (torch.Tensor): Model's output tensor.
         """
         y, dt = [], []  # outputs
         for m in self.model[:-1]:  # except the head part
diff --git a/ultralytics/utils/benchmarks.py b/ultralytics/utils/benchmarks.py
index bf86b535..4842ff5b 100644
--- a/ultralytics/utils/benchmarks.py
+++ b/ultralytics/utils/benchmarks.py
@@ -184,6 +184,19 @@ class ProfileModels:
                  half=True,
                  trt=True,
                  device=None):
+        """
+        Initialize the ProfileModels class for profiling models.
+
+        Args:
+            paths (list): List of paths of the models to be profiled.
+            num_timed_runs (int, optional): Number of timed runs for the profiling. Default is 100.
+            num_warmup_runs (int, optional): Number of warmup runs before the actual profiling starts. Default is 10.
+            min_time (float, optional): Minimum time in seconds for profiling a model. Default is 60.
+            imgsz (int, optional): Size of the image used during profiling. Default is 640.
+            half (bool, optional): Flag to indicate whether to use half-precision floating point for profiling. Default is True.
+            trt (bool, optional): Flag to indicate whether to profile using TensorRT. Default is True.
+            device (torch.device, optional): Device used for profiling. If None, it is determined automatically. Default is None.
+        """
         self.paths = paths
         self.num_timed_runs = num_timed_runs
         self.num_warmup_runs = num_warmup_runs
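A hedged usage sketch for the profiler documented above; `profile()` is the entry point this utility exposes in the current codebase, and the model path is illustrative:

```python
# Profile a model's ONNX (and optionally TensorRT) speed at 640px.
from ultralytics.utils.benchmarks import ProfileModels

ProfileModels(['yolov8n.pt'], imgsz=640, half=False, trt=False).profile()
```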
diff --git a/ultralytics/utils/errors.py b/ultralytics/utils/errors.py
index 5a764318..745ca0a4 100644
--- a/ultralytics/utils/errors.py
+++ b/ultralytics/utils/errors.py
@@ -4,6 +4,18 @@ from ultralytics.utils import emojis
 
 
 class HUBModelError(Exception):
+    """
+    Custom exception class for handling errors related to model fetching in Ultralytics YOLO.
+
+    This exception is raised when a requested model is not found or cannot be retrieved.
+    The message is also processed to include emojis for a better user experience.
+
+    Attributes:
+        message (str): The error message displayed when the exception is raised.
+
+    Note:
+        The message is automatically processed through the 'emojis' function from the 'ultralytics.utils' package.
+    """
 
     def __init__(self, message='Model not found. Please check model URL and try again.'):
         """Create an exception for when a model is not found."""
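A minimal sketch of raising the documented exception; the emoji processing happens inside the class itself:

```python
# Raise and catch HUBModelError with its default message.
from ultralytics.utils.errors import HUBModelError

try:
    raise HUBModelError('Model not found. Please check model URL and try again.')
except HUBModelError as e:
    print(e)
```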
diff --git a/ultralytics/utils/instance.py b/ultralytics/utils/instance.py
index 28f1f654..3f57a094 100644
--- a/ultralytics/utils/instance.py
+++ b/ultralytics/utils/instance.py
@@ -33,9 +33,17 @@ __all__ = 'Bboxes',  # tuple or list
 
 class Bboxes:
     """
-    Bounding Boxes class.
+    A class for handling bounding boxes.
 
-    Only numpy variables are supported.
+    The class supports various bounding box formats like 'xyxy', 'xywh', and 'ltwh'.
+    Bounding box data should be provided in numpy arrays.
+
+    Attributes:
+        bboxes (numpy.ndarray): The bounding boxes stored in a 2D numpy array.
+        format (str): The format of the bounding boxes ('xyxy', 'xywh', or 'ltwh').
+
+    Note:
+        This class does not handle normalization or denormalization of bounding boxes.
     """
 
     def __init__(self, bboxes, format='xyxy') -> None:
@@ -166,6 +174,36 @@ class Bboxes:
 
 
 class Instances:
+    """
+    Container for bounding boxes, segments, and keypoints of detected objects in an image.
+
+    Attributes:
+        _bboxes (Bboxes): Internal object for handling bounding box operations.
+        keypoints (ndarray): Keypoints with shape [N, 17, 3] in (x, y, visible) format. Default is None.
+        normalized (bool): Flag indicating whether the bounding box coordinates are normalized.
+        segments (ndarray): Segments array with shape [N, 1000, 2] after resampling.
+
+    Args:
+        bboxes (ndarray): An array of bounding boxes with shape [N, 4].
+        segments (list | ndarray, optional): A list or array of object segments. Default is None.
+        keypoints (ndarray, optional): An array of keypoints with shape [N, 17, 3]. Default is None.
+        bbox_format (str, optional): The format of bounding boxes ('xywh' or 'xyxy'). Default is 'xywh'.
+        normalized (bool, optional): Whether the bounding box coordinates are normalized. Default is True.
+
+    Examples:
+        ```python
+        # Create an Instances object
+        instances = Instances(
+            bboxes=np.array([[10, 10, 30, 30], [20, 20, 40, 40]]),
+            segments=[np.array([[5, 5], [10, 10]]), np.array([[15, 15], [20, 20]])],
+            keypoints=np.array([[[5, 5, 1], [10, 10, 1]], [[15, 15, 1], [20, 20, 1]]])
+        )
+        ```
+
+    Note:
+        The bounding box format is either 'xywh' or 'xyxy', and is determined by the `bbox_format` argument.
+        This class does not perform input validation, and it assumes the inputs are well-formed.
+    """
 
     def __init__(self, bboxes, segments=None, keypoints=None, bbox_format='xywh', normalized=True) -> None:
         """
diff --git a/ultralytics/utils/loss.py b/ultralytics/utils/loss.py
index 13a7fb72..f76960d6 100644
--- a/ultralytics/utils/loss.py
+++ b/ultralytics/utils/loss.py
@@ -59,6 +59,7 @@ class FocalLoss(nn.Module):
 
 
 class BboxLoss(nn.Module):
+    """Criterion class for computing training losses for bounding boxes."""
 
     def __init__(self, reg_max, use_dfl=False):
         """Initialize the BboxLoss module with regularization maximum and DFL settings."""
@@ -115,7 +116,7 @@ class v8DetectionLoss:
     """Criterion class for computing training losses."""
 
     def __init__(self, model):  # model must be de-paralleled
-
+        """Initializes v8DetectionLoss with the model, defining model-related properties and BCE loss function."""
         device = next(model.parameters()).device  # get model device
         h = model.args  # hyperparameters
 
@@ -211,6 +212,7 @@ class v8SegmentationLoss(v8DetectionLoss):
     """Criterion class for computing training losses."""
 
     def __init__(self, model):  # model must be de-paralleled
+        """Initializes the v8SegmentationLoss class, taking a de-paralleled model as argument."""
         super().__init__(model)
         self.overlap = model.args.overlap_mask
 
@@ -375,6 +377,7 @@ class v8PoseLoss(v8DetectionLoss):
     """Criterion class for computing training losses."""
 
     def __init__(self, model):  # model must be de-paralleled
+        """Initializes v8PoseLoss with model, sets keypoint variables and declares a keypoint loss instance."""
         super().__init__(model)
         self.kpt_shape = model.model[-1].kpt_shape
         self.bce_pose = nn.BCEWithLogitsLoss()
diff --git a/ultralytics/utils/metrics.py b/ultralytics/utils/metrics.py
index 36957e9f..ad0168d5 100644
--- a/ultralytics/utils/metrics.py
+++ b/ultralytics/utils/metrics.py
@@ -166,8 +166,19 @@ def kpt_iou(kpt1, kpt2, area, sigma, eps=1e-7):
     return (torch.exp(-e) * kpt_mask[:, None]).sum(-1) / (kpt_mask.sum(-1)[:, None] + eps)
 
 
-def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
-    # return positive, negative label smoothing BCE targets
+def smooth_BCE(eps=0.1):
+    """
+    Computes smoothed positive and negative Binary Cross-Entropy targets.
+
+    This function calculates positive and negative label smoothing BCE targets based on a given epsilon value.
+    For implementation details, refer to https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441.
+
+    Args:
+        eps (float, optional): The epsilon value for label smoothing. Defaults to 0.1.
+
+    Returns:
+        (tuple): A tuple containing the positive and negative label smoothing BCE targets.
+    """
     return 1.0 - 0.5 * eps, 0.5 * eps
diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py
index ea6d7a6b..be8aa3b2 100644
--- a/ultralytics/utils/torch_utils.py
+++ b/ultralytics/utils/torch_utils.py
@@ -311,8 +311,10 @@ def initialize_weights(model):
             m.inplace = True
 
 
-def scale_img(img, ratio=1.0, same_shape=False, gs=32):  # img(16,3,256,416)
-    # Scales img(bs,3,y,x) by ratio constrained to gs-multiple
+def scale_img(img, ratio=1.0, same_shape=False, gs=32):
+    """Scales and pads an image tensor of shape img(bs,3,y,x) based on given ratio and grid size gs, optionally
+    retaining the original shape.
+    """
     if ratio == 1.0:
         return img
     h, w = img.shape[2:]
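A worked check of the two utilities documented above: `smooth_BCE`'s closed-form targets (1 - 0.5*eps and 0.5*eps) and `scale_img` padding the rescaled tensor back up to a gs-multiple shape; the input size is illustrative:

```python
# Verify the label-smoothing targets and the scale-then-pad behavior.
import torch
from ultralytics.utils.metrics import smooth_BCE
from ultralytics.utils.torch_utils import scale_img

print(smooth_BCE(eps=0.1))              # -> (0.95, 0.05)
img = torch.zeros(1, 3, 256, 416)
print(scale_img(img, ratio=0.5).shape)  # -> torch.Size([1, 3, 128, 224])
```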