Implement all missing docstrings (#5298)

Co-authored-by: snyk-bot <snyk-bot@snyk.io>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Glenn Jocher 2023-10-10 20:07:13 +02:00 committed by GitHub
parent e7f0658744
commit 7fd5dcbd86
26 changed files with 649 additions and 79 deletions

View File

@ -0,0 +1 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

View File

@ -140,7 +140,7 @@ class Exporter:
Args:
cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
overrides (dict, optional): Configuration overrides. Defaults to None.
_callbacks (list, optional): List of callback functions. Defaults to None.
_callbacks (dict, optional): Dictionary of callback functions. Defaults to None.
"""
self.args = get_cfg(cfg, overrides)
if self.args.format.lower() in ('coreml', 'mlmodel'): # fix attempt for protobuf<3.20.x errors

View File

@ -9,14 +9,45 @@ from ultralytics.utils import DEFAULT_CFG, ops
class FastSAMPredictor(DetectionPredictor):
"""
FastSAMPredictor is specialized for fast SAM (Segment Anything Model) segmentation prediction tasks in the
Ultralytics YOLO framework.
This class extends the DetectionPredictor, customizing the prediction pipeline specifically for fast SAM.
It adjusts post-processing steps to incorporate mask prediction and non-max suppression while optimizing
for single-class segmentation.
Attributes:
cfg (dict): Configuration parameters for prediction.
overrides (dict, optional): Optional parameter overrides for custom behavior.
_callbacks (dict, optional): Optional dictionary of callback functions to be invoked during prediction.
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""Initializes FastSAMPredictor class by inheriting from DetectionPredictor and setting task to 'segment'."""
"""
Initializes the FastSAMPredictor class, inheriting from DetectionPredictor and setting the task to 'segment'.
Args:
cfg (dict): Configuration parameters for prediction.
overrides (dict, optional): Optional parameter overrides for custom behavior.
_callbacks (dict, optional): Optional dictionary of callback functions to be invoked during prediction.
"""
super().__init__(cfg, overrides, _callbacks)
self.args.task = 'segment'
def postprocess(self, preds, img, orig_imgs):
"""Postprocesses the predictions, applies non-max suppression, scales the boxes, and returns the results."""
"""
Perform post-processing steps on predictions, including non-max suppression and scaling boxes to the original
image size, and return the final results.
Args:
preds (list): The raw output predictions from the model.
img (torch.Tensor): The processed image tensor.
orig_imgs (list | torch.Tensor): The original image or list of images.
Returns:
(list): A list of Results objects, each containing processed boxes, masks, and other metadata.
"""
p = ops.non_max_suppression(
preds[0],
self.args.conf,

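The docstrings above describe the FastSAM prediction pipeline; a minimal usage sketch through the high-level API follows. The checkpoint name 'FastSAM-s.pt' and image 'bus.jpg' are assumptions, not part of this diff.

```python
from ultralytics import FastSAM

# Hedged usage sketch: FastSAMPredictor is driven internally by the FastSAM model class.
model = FastSAM('FastSAM-s.pt')  # assumed local checkpoint
results = model('bus.jpg', imgsz=1024, conf=0.4, iou=0.9)  # list of Results with boxes and masks
```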
View File

@ -13,6 +13,15 @@ from ultralytics.utils import TQDM
class FastSAMPrompt:
"""
Fast Segment Anything Model class for image annotation and visualization.
Attributes:
device (str): Computing device ('cuda' or 'cpu').
results: Object detection or segmentation results.
source: Source image or image path.
clip: CLIP model for linear assignment.
"""
def __init__(self, source, results, device='cuda') -> None:
"""Initializes FastSAMPrompt with given source, results and device, and assigns clip for linear assignment."""
@ -92,6 +101,20 @@ class FastSAMPrompt:
better_quality=True,
retina=False,
with_contours=True):
"""
Plots annotations, bounding boxes, and points on images and saves the output.
Args:
annotations (list): Annotations to be plotted.
output (str or Path): Output directory for saving the plots.
bbox (list, optional): Bounding box coordinates [x1, y1, x2, y2]. Defaults to None.
points (list, optional): Points to be plotted. Defaults to None.
point_label (list, optional): Labels for the points. Defaults to None.
mask_random_color (bool, optional): Whether to use random color for masks. Defaults to True.
better_quality (bool, optional): Whether to apply morphological transformations for better mask quality. Defaults to True.
retina (bool, optional): Whether to use retina mask. Defaults to False.
with_contours (bool, optional): Whether to plot contours. Defaults to True.
"""
pbar = TQDM(annotations, total=len(annotations))
for ann in pbar:
result_name = os.path.basename(ann.path)
@ -160,6 +183,20 @@ class FastSAMPrompt:
target_height=960,
target_width=960,
):
"""
Quickly shows the mask annotations on the given matplotlib axis.
Args:
annotation (array-like): Mask annotation.
ax (matplotlib.axes.Axes): Matplotlib axis.
random_color (bool, optional): Whether to use random color for masks. Defaults to False.
bbox (list, optional): Bounding box coordinates [x1, y1, x2, y2]. Defaults to None.
points (list, optional): Points to be plotted. Defaults to None.
pointlabel (list, optional): Labels for the points. Defaults to None.
retinamask (bool, optional): Whether to use retina mask. Defaults to True.
target_height (int, optional): Target height for resizing. Defaults to 960.
target_width (int, optional): Target width for resizing. Defaults to 960.
"""
n, h, w = annotation.shape # batch, height, width
areas = np.sum(annotation, axis=(1, 2))

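To make the prompt/plot workflow documented above concrete, here is a hedged end-to-end sketch; file names and the output directory are placeholders.

```python
from ultralytics import FastSAM
from ultralytics.models.fastsam import FastSAMPrompt

# Hedged sketch of prompt-based annotation and plotting.
model = FastSAM('FastSAM-s.pt')
everything_results = model('bus.jpg', retina_masks=True, imgsz=1024, conf=0.4, iou=0.9)

prompt = FastSAMPrompt('bus.jpg', everything_results, device='cpu')
ann = prompt.everything_prompt()                      # all masks
# ann = prompt.box_prompt(bbox=[200, 200, 300, 300])  # or restrict results to a box prompt
prompt.plot(annotations=ann, output='./output/')      # saves the annotated image
```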
View File

@ -5,9 +5,35 @@ from ultralytics.utils.metrics import SegmentMetrics
class FastSAMValidator(SegmentationValidator):
"""
Custom validation class for fast SAM (Segment Anything Model) segmentation in the Ultralytics YOLO framework.
Extends the SegmentationValidator class, customizing the validation process specifically for fast SAM. This class
sets the task to 'segment' and uses the SegmentMetrics for evaluation. Additionally, plotting features are disabled
to avoid errors during validation.
Attributes:
dataloader: The data loader object used for validation.
save_dir (str): The directory where validation results will be saved.
pbar: A progress bar object.
args: Additional arguments for customization.
_callbacks (dict): Dictionary of callback functions to be invoked during validation.
"""
def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
"""Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics."""
"""
Initialize the FastSAMValidator class, setting the task to 'segment' and metrics to SegmentMetrics.
Args:
dataloader (torch.utils.data.DataLoader): Dataloader to be used for validation.
save_dir (Path, optional): Directory to save results.
pbar (tqdm.tqdm): Progress bar for displaying progress.
args (SimpleNamespace): Configuration for the validator.
_callbacks (dict): Dictionary to store various callback functions.
Notes:
Plots for ConfusionMatrix and other related metrics are disabled in this class to avoid errors.
"""
super().__init__(dataloader, save_dir, pbar, args, _callbacks)
self.args.task = 'segment'
self.args.plots = False # disable ConfusionMatrix and other plots to avoid errors

View File

@ -23,6 +23,26 @@ from .val import NASValidator
class NAS(Model):
"""
YOLO NAS model for object detection.
This class provides an interface for the YOLO-NAS models and extends the `Model` class from Ultralytics engine.
It is designed to facilitate the task of object detection using pre-trained or custom-trained YOLO-NAS models.
Example:
```python
from ultralytics import NAS
model = NAS('yolo_nas_s')
results = model.predict('ultralytics/assets/bus.jpg')
```
Attributes:
model (str): Path to the pre-trained model or model name. Defaults to 'yolo_nas_s.pt'.
Note:
YOLO-NAS models only support pre-trained models. Do not provide YAML configuration files.
"""
def __init__(self, model='yolo_nas_s.pt') -> None:
"""Initializes the NAS model with the provided or default 'yolo_nas_s.pt' model."""

View File

@ -8,6 +8,29 @@ from ultralytics.utils import ops
class NASPredictor(BasePredictor):
"""
Ultralytics YOLO NAS Predictor for object detection.
This class extends the `BasePredictor` from Ultralytics engine and is responsible for post-processing the
raw predictions generated by the YOLO NAS models. It applies operations like non-maximum suppression and
scaling the bounding boxes to fit the original image dimensions.
Attributes:
args (Namespace): Namespace containing various configurations for post-processing.
Example:
```python
from ultralytics import NAS
model = NAS('yolo_nas_s')
predictor = model.predictor
# Assumes that raw_preds, img, orig_imgs are available
results = predictor.postprocess(raw_preds, img, orig_imgs)
```
Note:
Typically, this class is not instantiated directly. It is used internally within the `NAS` class.
"""
def postprocess(self, preds_in, img, orig_imgs):
"""Postprocess predictions and returns a list of Results objects."""

View File

@ -9,6 +9,30 @@ __all__ = ['NASValidator']
class NASValidator(DetectionValidator):
"""
Ultralytics YOLO NAS Validator for object detection.
Extends `DetectionValidator` from the Ultralytics models package and is designed to post-process the raw predictions
generated by YOLO NAS models. It performs non-maximum suppression to remove overlapping and low-confidence boxes,
ultimately producing the final detections.
Attributes:
args (Namespace): Namespace containing various configurations for post-processing, such as confidence and IoU thresholds.
lb (torch.Tensor): Optional tensor for multilabel NMS.
Example:
```python
from ultralytics import NAS
model = NAS('yolo_nas_s')
validator = model.validator
# Assumes that raw_preds are available
final_preds = validator.postprocess(raw_preds)
```
Note:
This class is generally not instantiated directly but is used internally within the `NAS` class.
"""
def postprocess(self, preds_in):
"""Apply Non-maximum suppression to prediction outputs."""

View File

@ -12,14 +12,19 @@ from ultralytics.utils import colorstr, ops
__all__ = 'RTDETRValidator', # tuple or list
# TODO: Temporarily RT-DETR does not need padding.
class RTDETRDataset(YOLODataset):
"""
Real-Time DEtection and TRacking (RT-DETR) dataset class extending the base YOLODataset class.
This specialized dataset class is designed for use with the RT-DETR object detection model and is optimized for
real-time detection and tracking tasks.
"""
def __init__(self, *args, data=None, **kwargs):
"""Initialize the RTDETRDataset class by inheriting from the YOLODataset class."""
super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs)
# NOTE: add stretch version load_image for rtdetr mosaic
# NOTE: add stretch version load_image for RTDETR mosaic
def load_image(self, i, rect_mode=False):
"""Loads 1 image from dataset index 'i', returns (im, resized hw)."""
return super().load_image(i=i, rect_mode=rect_mode)
@ -46,7 +51,11 @@ class RTDETRDataset(YOLODataset):
class RTDETRValidator(DetectionValidator):
"""
A class extending the DetectionValidator class for validation based on an RT-DETR detection model.
RTDETRValidator extends the DetectionValidator class to provide validation capabilities specifically tailored for
the RT-DETR (Real-Time DETR) object detection model.
The class allows building of an RTDETR-specific dataset for validation, applies Non-maximum suppression for
post-processing, and updates evaluation metrics accordingly.
Example:
```python
@ -56,6 +65,9 @@ class RTDETRValidator(DetectionValidator):
validator = RTDETRValidator(args=args)
validator()
```
Note:
For further details on the attributes and methods, refer to the parent DetectionValidator class.
"""
def build_dataset(self, img_path, mode='val', batch=None):

View File

@ -10,6 +10,21 @@ from ultralytics.nn.modules import LayerNorm2d
class MaskDecoder(nn.Module):
"""
Decoder module for generating masks and their associated quality scores, using a transformer architecture to predict
masks given image and prompt embeddings.
Attributes:
transformer_dim (int): Channel dimension for the transformer module.
transformer (nn.Module): The transformer module used for mask prediction.
num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
iou_token (nn.Embedding): Embedding for the IoU token.
num_mask_tokens (int): Number of mask tokens.
mask_tokens (nn.Embedding): Embedding for the mask tokens.
output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
iou_prediction_head (nn.Module): MLP for predicting mask quality.
"""
def __init__(
self,
@ -136,7 +151,7 @@ class MaskDecoder(nn.Module):
class MLP(nn.Module):
"""
Lightly adapted from
MLP (Multi-Layer Perceptron) model lightly adapted from
https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py
"""
@ -148,6 +163,16 @@ class MLP(nn.Module):
num_layers: int,
sigmoid_output: bool = False,
) -> None:
"""
Initializes the MLP (Multi-Layer Perceptron) model.
Args:
input_dim (int): The dimensionality of the input features.
hidden_dim (int): The dimensionality of the hidden layers.
output_dim (int): The dimensionality of the output layer.
num_layers (int): The number of hidden layers.
sigmoid_output (bool, optional): Whether to apply a sigmoid activation to the output layer. Defaults to False.
"""
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)

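The `h = [hidden_dim] * (num_layers - 1)` line above hints at how the MLP chains its Linear layers. The standalone sketch below mirrors that pattern as an assumption; it is not the module's exact code.

```python
import torch
from torch import nn

# Standalone sketch of the layer chain implied by h = [hidden_dim] * (num_layers - 1).
input_dim, hidden_dim, output_dim, num_layers = 256, 256, 32, 3
h = [hidden_dim] * (num_layers - 1)
layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

x = torch.randn(1, input_dim)
for i, layer in enumerate(layers):
    x = torch.relu(layer(x)) if i < num_layers - 1 else layer(x)  # ReLU on all but the last layer
print(x.shape)  # torch.Size([1, 32])
```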
View File

@ -12,6 +12,18 @@ from ultralytics.nn.modules import LayerNorm2d, MLPBlock
# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Module):
"""
An image encoder using Vision Transformer (ViT) architecture for encoding an image into a compact latent space. The
encoder takes an image, splits it into patches, and processes these patches through a series of transformer blocks.
The encoded patches are then processed through a neck to generate the final encoded representation.
Attributes:
img_size (int): Dimension of input images, assumed to be square.
patch_embed (PatchEmbed): Module for patch embedding.
pos_embed (nn.Parameter, optional): Absolute positional embedding for patches.
blocks (nn.ModuleList): List of transformer blocks for processing patch embeddings.
neck (nn.Sequential): Neck module to further process the output.
"""
def __init__(
self,
@ -112,6 +124,22 @@ class ImageEncoderViT(nn.Module):
class PromptEncoder(nn.Module):
"""
Encodes different types of prompts, including points, boxes, and masks, for input to SAM's mask decoder. The encoder
produces both sparse and dense embeddings for the input prompts.
Attributes:
embed_dim (int): Dimension of the embeddings.
input_image_size (Tuple[int, int]): Size of the input image as (H, W).
image_embedding_size (Tuple[int, int]): Spatial size of the image embedding as (H, W).
pe_layer (PositionEmbeddingRandom): Module for random position embedding.
num_point_embeddings (int): Number of point embeddings for different types of points.
point_embeddings (nn.ModuleList): List of point embeddings.
not_a_point_embed (nn.Embedding): Embedding for points that are not a part of any label.
mask_input_size (Tuple[int, int]): Size of the input mask.
mask_downscaling (nn.Sequential): Neural network for downscaling the mask.
no_mask_embed (nn.Embedding): Embedding for cases where no mask is provided.
"""
def __init__(
self,

View File

@ -16,6 +16,20 @@ from .encoders import ImageEncoderViT, PromptEncoder
class Sam(nn.Module):
"""
Sam (Segment Anything Model) is designed for object segmentation tasks. It uses image encoders to generate image
embeddings, and prompt encoders to encode various types of input prompts. These embeddings are then used by the mask
decoder to predict object masks.
Attributes:
mask_threshold (float): Threshold value for mask prediction.
image_format (str): Format of the input image, default is 'RGB'.
image_encoder (ImageEncoderViT): The backbone used to encode the image into embeddings.
prompt_encoder (PromptEncoder): Encodes various types of input prompts.
mask_decoder (MaskDecoder): Predicts object masks from the image and prompt embeddings.
pixel_mean (List[float]): Mean pixel values for image normalization.
pixel_std (List[float]): Standard deviation values for image normalization.
"""
mask_threshold: float = 0.0
image_format: str = 'RGB'
@ -28,18 +42,19 @@ class Sam(nn.Module):
pixel_std: List[float] = (58.395, 57.12, 57.375)
) -> None:
"""
SAM predicts object masks from an image and input prompts.
Initialize the Sam class to predict object masks from an image and input prompts.
Note:
All forward() operations moved to SAMPredictor.
Args:
image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings that allow for
efficient mask prediction.
prompt_encoder (PromptEncoder): Encodes various types of input prompts.
mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
pixel_std (list(float)): Std values for normalizing pixels in the input image.
image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
prompt_encoder (PromptEncoder): Encodes various types of input prompts.
mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
pixel_mean (List[float], optional): Mean values for normalizing pixels in the input image. Defaults to
(123.675, 116.28, 103.53).
pixel_std (List[float], optional): Std values for normalizing pixels in the input image. Defaults to
(58.395, 57.12, 57.375).
"""
super().__init__()
self.image_encoder = image_encoder

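The pixel_mean/pixel_std defaults listed above are used to normalize input images. A small standalone sketch of that per-channel normalization is shown below; padding details are omitted and only noted as an assumption.

```python
import torch

# Per-channel normalization with the default statistics from the docstring above (sketch only).
pixel_mean = torch.tensor([123.675, 116.28, 103.53]).view(-1, 1, 1)
pixel_std = torch.tensor([58.395, 57.12, 57.375]).view(-1, 1, 1)

x = torch.randint(0, 256, (3, 768, 1024), dtype=torch.float32)  # stand-in RGB image tensor
x = (x - pixel_mean) / pixel_std  # the model additionally pads inputs to a square size (not shown here)
```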
View File

@ -21,6 +21,7 @@ from ultralytics.utils.instance import to_2tuple
class Conv2d_BN(torch.nn.Sequential):
"""A sequential container that performs 2D convolution followed by batch normalization."""
def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1):
"""Initializes the MBConv model with given input channels, output channels, expansion ratio, activation, and
@ -35,6 +36,7 @@ class Conv2d_BN(torch.nn.Sequential):
class PatchEmbed(nn.Module):
"""Embeds images into patches and projects them into a specified embedding dimension."""
def __init__(self, in_chans, embed_dim, resolution, activation):
"""Initialize the PatchMerging class with specified input, output dimensions, resolution and activation
@ -59,6 +61,7 @@ class PatchEmbed(nn.Module):
class MBConv(nn.Module):
"""Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture."""
def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path):
"""Initializes a convolutional layer with specified dimensions, input resolution, depth, and activation
@ -96,6 +99,7 @@ class MBConv(nn.Module):
class PatchMerging(nn.Module):
"""Merges neighboring patches in the feature map and projects to a new dimension."""
def __init__(self, input_resolution, dim, out_dim, activation):
"""Initializes the ConvLayer with specific dimension, input resolution, depth, activation, drop path, and other
@ -130,6 +134,11 @@ class PatchMerging(nn.Module):
class ConvLayer(nn.Module):
"""
Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).
Optionally applies downsample operations to the output, and provides support for gradient checkpointing.
"""
def __init__(
self,
@ -143,6 +152,20 @@ class ConvLayer(nn.Module):
out_dim=None,
conv_expand_ratio=4.,
):
"""
Initializes the ConvLayer with the given dimensions and settings.
Args:
dim (int): The dimensionality of the input and output.
input_resolution (Tuple[int, int]): The resolution of the input image.
depth (int): The number of MBConv layers in the block.
activation (Callable): Activation function applied after each convolution.
drop_path (Union[float, List[float]]): Drop path rate. Single float or a list of floats for each MBConv.
downsample (Optional[Callable]): Function for downsampling the output. None to skip downsampling.
use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`.
conv_expand_ratio (float): Expansion ratio for the MBConv layers.
"""
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
@ -171,6 +194,11 @@ class ConvLayer(nn.Module):
class Mlp(nn.Module):
"""
Multi-layer Perceptron (MLP) for transformer architectures.
This layer takes an input with in_features, applies layer normalization and two fully-connected layers.
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
"""Initializes Attention module with the given parameters including dimension, key_dim, number of heads, etc."""
@ -194,6 +222,14 @@ class Mlp(nn.Module):
class Attention(torch.nn.Module):
"""
Multi-head attention module with support for spatial awareness, applying attention biases based on spatial
resolution. Implements trainable attention biases for each unique offset between spatial positions in the resolution
grid.
Attributes:
ab (Tensor, optional): Cached attention biases for inference, deleted during training.
"""
def __init__(
self,
@ -203,8 +239,21 @@ class Attention(torch.nn.Module):
attn_ratio=4,
resolution=(14, 14),
):
"""
Initializes the Attention module.
Args:
dim (int): The dimensionality of the input and output.
key_dim (int): The dimensionality of the keys and queries.
num_heads (int, optional): Number of attention heads. Default is 8.
attn_ratio (float, optional): Attention ratio, affecting the dimensions of the value vectors. Default is 4.
resolution (Tuple[int, int], optional): Spatial resolution of the input feature map. Default is (14, 14).
Raises:
AssertionError: If `resolution` is not a tuple of length 2.
"""
super().__init__()
# (h, w)
assert isinstance(resolution, tuple) and len(resolution) == 2
self.num_heads = num_heads
self.scale = key_dim ** -0.5
@ -241,8 +290,9 @@ class Attention(torch.nn.Module):
else:
self.ab = self.attention_biases[:, self.attention_bias_idxs]
def forward(self, x): # x (B,N,C)
B, N, _ = x.shape
def forward(self, x): # x
"""Performs forward pass over the input tensor 'x' by applying normalization and querying keys/values."""
B, N, _ = x.shape # B, N, C
# Normalization
x = self.norm(x)
@ -264,20 +314,7 @@ class Attention(torch.nn.Module):
class TinyViTBlock(nn.Module):
"""
TinyViT Block.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int, int]): Input resolution.
num_heads (int): Number of attention heads.
window_size (int): Window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
local_conv_size (int): the kernel size of the convolution between Attention and MLP. Default: 3
activation (torch.nn): the activation function. Default: nn.GELU
"""
"""TinyViT Block that applies self-attention and a local convolution to the input."""
def __init__(
self,
@ -291,6 +328,24 @@ class TinyViTBlock(nn.Module):
local_conv_size=3,
activation=nn.GELU,
):
"""
Initializes the TinyViTBlock.
Args:
dim (int): The dimensionality of the input and output.
input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
num_heads (int): Number of attention heads.
window_size (int, optional): Window size for attention. Default is 7.
mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
drop (float, optional): Dropout rate. Default is 0.
drop_path (float, optional): Stochastic depth rate. Default is 0.
local_conv_size (int, optional): The kernel size of the local convolution. Default is 3.
activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
Raises:
AssertionError: If `window_size` is not greater than 0.
AssertionError: If `dim` is not divisible by `num_heads`.
"""
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
@ -367,24 +422,7 @@ class TinyViTBlock(nn.Module):
class BasicLayer(nn.Module):
"""
A basic TinyViT layer for one stage.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
local_conv_size (int): the kernel size of the depthwise convolution between attention and MLP. Default: 3
activation (torch.nn): the activation function. Default: nn.GELU
out_dim (int | optional): the output dimension of the layer. Default: None
"""
"""A basic TinyViT layer for one stage in a TinyViT architecture."""
def __init__(
self,
@ -402,6 +440,27 @@ class BasicLayer(nn.Module):
activation=nn.GELU,
out_dim=None,
):
"""
Initializes the BasicLayer.
Args:
dim (int): The dimensionality of the input and output.
input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
depth (int): Number of TinyViT blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
drop (float, optional): Dropout rate. Default is 0.
drop_path (float | tuple[float], optional): Stochastic depth rate. Default is 0.
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default is None.
use_checkpoint (bool, optional): Whether to use checkpointing to save memory. Default is False.
local_conv_size (int, optional): Kernel size of the local convolution. Default is 3.
activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
out_dim (int | None, optional): The output dimension of the layer. Default is None.
Raises:
ValueError: If `drop_path` is a list of float but its length doesn't match `depth`.
"""
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
@ -456,6 +515,30 @@ class LayerNorm2d(nn.Module):
class TinyViT(nn.Module):
"""
The TinyViT architecture for vision tasks.
Attributes:
img_size (int): Input image size.
in_chans (int): Number of input channels.
num_classes (int): Number of classification classes.
embed_dims (List[int]): List of embedding dimensions for each layer.
depths (List[int]): List of depths for each layer.
num_heads (List[int]): List of number of attention heads for each layer.
window_sizes (List[int]): List of window sizes for each layer.
mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
drop_rate (float): Dropout rate for drop layers.
drop_path_rate (float): Drop path rate for stochastic depth.
use_checkpoint (bool): Use checkpointing for efficient memory usage.
mbconv_expand_ratio (float): Expansion ratio for MBConv layer.
local_conv_size (int): Local convolution kernel size.
layer_lr_decay (float): Layer-wise learning rate decay.
Note:
This implementation is generalized to accept a list of depths, attention heads,
embedding dimensions and window sizes, which allows you to create a
"stack" of TinyViT models of varying configurations.
"""
def __init__(
self,
@ -474,6 +557,25 @@ class TinyViT(nn.Module):
local_conv_size=3,
layer_lr_decay=1.0,
):
"""
Initializes the TinyViT model.
Args:
img_size (int, optional): The input image size. Defaults to 224.
in_chans (int, optional): Number of input channels. Defaults to 3.
num_classes (int, optional): Number of classification classes. Defaults to 1000.
embed_dims (List[int], optional): List of embedding dimensions for each layer. Defaults to [96, 192, 384, 768].
depths (List[int], optional): List of depths for each layer. Defaults to [2, 2, 6, 2].
num_heads (List[int], optional): List of number of attention heads for each layer. Defaults to [3, 6, 12, 24].
window_sizes (List[int], optional): List of window sizes for each layer. Defaults to [7, 7, 14, 7].
mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension. Defaults to 4.
drop_rate (float, optional): Dropout rate. Defaults to 0.
drop_path_rate (float, optional): Drop path rate for stochastic depth. Defaults to 0.1.
use_checkpoint (bool, optional): Whether to use checkpointing for efficient memory usage. Defaults to False.
mbconv_expand_ratio (float, optional): Expansion ratio for MBConv layer. Defaults to 4.0.
local_conv_size (int, optional): Local convolution kernel size. Defaults to 3.
layer_lr_decay (float, optional): Layer-wise learning rate decay. Defaults to 1.0.
"""
super().__init__()
self.img_size = img_size
self.num_classes = num_classes

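For reference, a hedged instantiation-only sketch of TinyViT with the defaults documented above. The import path is assumed from this repository layout, and within SAM the encoder is typically built with img_size=1024; no forward pass is exercised here.

```python
from ultralytics.models.sam.modules.tiny_encoder import TinyViT  # assumed import path

# Instantiation-only sketch using the documented defaults for dims, depths, heads and windows.
model = TinyViT(img_size=1024,
                embed_dims=[96, 192, 384, 768],
                depths=[2, 2, 6, 2],
                num_heads=[3, 6, 12, 24],
                window_sizes=[7, 7, 14, 7])
print(sum(p.numel() for p in model.parameters()))  # parameter count of the configured stack
```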
View File

@ -10,6 +10,21 @@ from ultralytics.nn.modules import MLPBlock
class TwoWayTransformer(nn.Module):
"""
A Two-Way Transformer module that enables the simultaneous attention to both image and query points. This class
serves as a specialized transformer decoder that attends to an input image using queries whose positional embedding
is supplied. This is particularly useful for tasks like object detection, image segmentation, and point cloud
processing.
Attributes:
depth (int): The number of layers in the transformer.
embedding_dim (int): The channel dimension for the input embeddings.
num_heads (int): The number of heads for multihead attention.
mlp_dim (int): The internal channel dimension for the MLP block.
layers (nn.ModuleList): The list of TwoWayAttentionBlock layers that make up the transformer.
final_attn_token_to_image (Attention): The final attention layer applied from the queries to the image.
norm_final_attn (nn.LayerNorm): The layer normalization applied to the final queries.
"""
def __init__(
self,
@ -98,6 +113,23 @@ class TwoWayTransformer(nn.Module):
class TwoWayAttentionBlock(nn.Module):
"""
An attention block that performs both self-attention and cross-attention in two directions: queries to keys and
keys to queries. This block consists of four main layers: (1) self-attention on sparse inputs, (2) cross-attention
of sparse inputs to dense inputs, (3) an MLP block on sparse inputs, and (4) cross-attention of dense inputs to
sparse inputs.
Attributes:
self_attn (Attention): The self-attention layer for the queries.
norm1 (nn.LayerNorm): Layer normalization following the first attention block.
cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
norm2 (nn.LayerNorm): Layer normalization following the second attention block.
mlp (MLPBlock): MLP block that transforms the query embeddings.
norm3 (nn.LayerNorm): Layer normalization following the MLP block.
norm4 (nn.LayerNorm): Layer normalization following the third attention block.
cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
skip_first_layer_pe (bool): Whether to skip the positional encoding in the first layer.
"""
def __init__(
self,
@ -180,6 +212,17 @@ class Attention(nn.Module):
num_heads: int,
downsample_rate: int = 1,
) -> None:
"""
Initializes the Attention model with the given dimensions and settings.
Args:
embedding_dim (int): The dimensionality of the input embeddings.
num_heads (int): The number of attention heads.
downsample_rate (int, optional): The factor by which the internal dimensions are downsampled. Defaults to 1.
Raises:
AssertionError: If 'num_heads' does not evenly divide the internal dimension (embedding_dim / downsample_rate).
"""
super().__init__()
self.embedding_dim = embedding_dim
self.internal_dim = embedding_dim // downsample_rate
@ -191,13 +234,15 @@ class Attention(nn.Module):
self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
@staticmethod
def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
"""Separate the input tensor into the specified number of attention heads."""
b, n, c = x.shape
x = x.reshape(b, n, num_heads, c // num_heads)
return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head
def _recombine_heads(self, x: Tensor) -> Tensor:
@staticmethod
def _recombine_heads(x: Tensor) -> Tensor:
"""Recombine the separated attention heads into a single tensor."""
b, n_heads, n_tokens, c_per_head = x.shape
x = x.transpose(1, 2)

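The `_separate_heads`/`_recombine_heads` static methods above are pure tensor reshapes; this standalone round-trip sketch shows the shapes involved.

```python
import torch

# Round-trip sketch of the head split/merge reshapes shown in the diff above.
b, n, c, num_heads = 2, 16, 64, 8
x = torch.randn(b, n, c)

heads = x.reshape(b, n, num_heads, c // num_heads).transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
merged = heads.transpose(1, 2).reshape(b, n, c)                     # back to B x N_tokens x C
assert torch.equal(x, merged)
```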
View File

@ -17,6 +17,24 @@ from .build import build_sam
class Predictor(BasePredictor):
"""
A prediction class for segmentation tasks, extending the BasePredictor.
This class serves as an interface for model inference for segmentation tasks.
It can preprocess input images, perform inference, and postprocess the output.
It also supports handling various types of input prompts including bounding boxes,
points, and low-resolution masks for better prediction results.
Attributes:
cfg (dict): Configuration dictionary.
overrides (dict): Dictionary of overriding values.
_callbacks (dict): Dictionary of callback functions.
args (namespace): Argument namespace.
im (torch.Tensor): Preprocessed image for current prediction.
features (torch.Tensor): Image features.
prompts (dict): Dictionary of prompts like bboxes, points, masks.
segment_all (bool): Whether to perform segmentation on all objects or not.
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""Initializes the Predictor class with default or provided configuration, overrides, and callbacks."""

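The prompt handling described above (boxes, points, masks) is most easily exercised through the high-level SAM interface. A hedged sketch follows; the checkpoint and image names are placeholders.

```python
from ultralytics import SAM

# Hedged usage sketch of prompt-driven prediction; the Predictor above is created internally.
model = SAM('sam_b.pt')                                    # assumed local checkpoint
results = model('bus.jpg', bboxes=[100, 100, 400, 400])    # box prompt
results = model('bus.jpg', points=[300, 300], labels=[1])  # point prompt (label 1 = foreground)
```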
View File

@ -11,6 +11,24 @@ from .ops import HungarianMatcher
class DETRLoss(nn.Module):
"""
DETR (DEtection TRansformer) Loss class. This class calculates and returns the different loss components for the
DETR object detection model. It computes classification loss, bounding box loss, GIoU loss, and optionally auxiliary
losses.
Attributes:
nc (int): The number of classes.
loss_gain (dict): Coefficients for different loss components.
aux_loss (bool): Whether to compute auxiliary losses.
use_fl (bool): Use FocalLoss or not.
use_vfl (bool): Use VarifocalLoss or not.
use_uni_match (bool): Whether to use a fixed layer to assign labels for the auxiliary branch.
uni_match_ind (int): Index of the fixed layer to use if `use_uni_match` is True.
matcher (HungarianMatcher): Object to compute matching cost and indices.
fl (FocalLoss or None): Focal Loss object if `use_fl` is True, otherwise None.
vfl (VarifocalLoss or None): Varifocal Loss object if `use_vfl` is True, otherwise None.
device (torch.device): Device on which tensors are stored.
"""
def __init__(self,
nc=80,

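To make the `loss_gain` attribute concrete, here is a toy weighted-sum sketch; the coefficient and component values are illustrative assumptions, not the model defaults.

```python
import torch

# Illustrative combination of loss components via a gain dictionary (values are assumptions).
loss_gain = {'class': 1.0, 'bbox': 5.0, 'giou': 2.0}
components = {'class': torch.tensor(0.80), 'bbox': torch.tensor(0.12), 'giou': torch.tensor(0.30)}
total_loss = sum(loss_gain[k] * components[k] for k in components)  # scalar tensor
```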
View File

@ -37,7 +37,12 @@ class DFL(nn.Module):
class Proto(nn.Module):
"""YOLOv8 mask Proto module for segmentation models."""
def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks
def __init__(self, c1, c_=256, c2=32):
"""
Initializes the YOLOv8 mask Proto module with specified number of protos and masks.
Input arguments are ch_in, number of protos, number of masks.
"""
super().__init__()
self.cv1 = Conv(c1, c_, k=3)
self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True) # nn.Upsample(scale_factor=2, mode='nearest')
@ -124,7 +129,12 @@ class SPP(nn.Module):
class SPPF(nn.Module):
"""Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13))
def __init__(self, c1, c2, k=5):
"""
Initializes the SPPF layer with given input/output channels and kernel size.
This module is equivalent to SPP(k=(5, 9, 13)).
"""
super().__init__()
c_ = c1 // 2 # hidden channels
self.cv1 = Conv(c1, c_, 1, 1)
@ -142,7 +152,8 @@ class SPPF(nn.Module):
class C1(nn.Module):
"""CSP Bottleneck with 1 convolution."""
def __init__(self, c1, c2, n=1): # ch_in, ch_out, number
def __init__(self, c1, c2, n=1):
"""Initializes the CSP Bottleneck with configurations for 1 convolution with arguments ch_in, ch_out, number."""
super().__init__()
self.cv1 = Conv(c1, c2, 1, 1)
self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
@ -156,7 +167,10 @@ class C1(nn.Module):
class C2(nn.Module):
"""CSP Bottleneck with 2 convolutions."""
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
"""Initializes the CSP Bottleneck with 2 convolutions module with arguments ch_in, ch_out, number, shortcut,
groups, expansion.
"""
super().__init__()
self.c = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@ -173,7 +187,10 @@ class C2(nn.Module):
class C2f(nn.Module):
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
"""Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
expansion.
"""
super().__init__()
self.c = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@ -196,7 +213,8 @@ class C2f(nn.Module):
class C3(nn.Module):
"""CSP Bottleneck with 3 convolutions."""
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
"""Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values."""
super().__init__()
c_ = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, c_, 1, 1)
@ -259,7 +277,8 @@ class C3Ghost(C3):
class GhostBottleneck(nn.Module):
"""Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""
def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride
def __init__(self, c1, c2, k=3, s=1):
"""Initializes GhostBottleneck module with arguments ch_in, ch_out, kernel, stride."""
super().__init__()
c_ = c2 // 2
self.conv = nn.Sequential(
@ -277,7 +296,10 @@ class GhostBottleneck(nn.Module):
class Bottleneck(nn.Module):
"""Standard bottleneck."""
def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, groups, kernels, expand
def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
"""Initializes a bottleneck module with given input/output channels, shortcut option, group, kernels, and
expansion.
"""
super().__init__()
c_ = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, c_, k[0], 1)
@ -292,7 +314,8 @@ class Bottleneck(nn.Module):
class BottleneckCSP(nn.Module):
"""CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
"""Initializes the CSP Bottleneck given arguments for ch_in, ch_out, number, shortcut, groups, expansion."""
super().__init__()
c_ = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, c_, 1, 1)

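A quick shape check for one of the blocks above (C2f); the channel sizes are arbitrary and the import uses the public `ultralytics.nn.modules` namespace.

```python
import torch
from ultralytics.nn.modules import C2f

# Shape-check sketch: C2f preserves spatial size and maps c1 -> c2 channels.
m = C2f(c1=64, c2=128, n=2, shortcut=False)
y = m(torch.randn(1, 64, 40, 40))
print(y.shape)  # torch.Size([1, 128, 40, 40])
```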
View File

@ -88,6 +88,7 @@ class DWConv(Conv):
"""Depth-wise convolution."""
def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation
"""Initialize Depth-wise convolution with given parameters."""
super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)
@ -95,6 +96,7 @@ class DWConvTranspose2d(nn.ConvTranspose2d):
"""Depth-wise transpose convolution."""
def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out
"""Initialize DWConvTranspose2d class with given parameters."""
super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))
@ -121,12 +123,18 @@ class ConvTranspose(nn.Module):
class Focus(nn.Module):
"""Focus wh information into c-space."""
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
"""Initializes Focus object with user defined channel, convolution, padding, group and activation values."""
super().__init__()
self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act)
# self.contract = Contract(gain=2)
def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
def forward(self, x):
"""
Applies convolution to concatenated tensor and returns the output.
Input shape is (b,c,w,h) and output shape is (b,4c,w/2,h/2).
"""
return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1))
# return self.conv(self.contract(x))
@ -134,7 +142,10 @@ class Focus(nn.Module):
class GhostConv(nn.Module):
"""Ghost Convolution https://github.com/huawei-noah/ghostnet."""
def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups
def __init__(self, c1, c2, k=1, s=1, g=1, act=True):
"""Initializes the GhostConv object with input channels, output channels, kernel size, stride, groups and
activation.
"""
super().__init__()
c_ = c2 // 2 # hidden channels
self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
@ -280,7 +291,8 @@ class SpatialAttention(nn.Module):
class CBAM(nn.Module):
"""Convolutional Block Attention Module."""
def __init__(self, c1, kernel_size=7): # ch_in, kernels
def __init__(self, c1, kernel_size=7):
"""Initialize CBAM with given input channel (c1) and kernel size."""
super().__init__()
self.channel_attention = ChannelAttention(c1)
self.spatial_attention = SpatialAttention(kernel_size)

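The Focus forward docstring above describes a space-to-depth step; the slicing itself can be checked standalone.

```python
import torch

# Standalone check of the Focus slicing: (b, c, w, h) -> (b, 4c, w/2, h/2) before the convolution.
x = torch.randn(1, 3, 64, 64)
y = torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1)
print(y.shape)  # torch.Size([1, 12, 32, 32])
```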
View File

@ -25,7 +25,8 @@ class Detect(nn.Module):
anchors = torch.empty(0) # init
strides = torch.empty(0) # init
def __init__(self, nc=80, ch=()): # detection layer
def __init__(self, nc=80, ch=()):
"""Initializes the YOLOv8 detection layer with specified number of classes and channels."""
super().__init__()
self.nc = nc # number of classes
self.nl = len(ch) # number of detection layers
@ -149,7 +150,10 @@ class Pose(Detect):
class Classify(nn.Module):
"""YOLOv8 classification head, i.e. x(b,c1,20,20) to x(b,c2)."""
def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups
def __init__(self, c1, c2, k=1, s=1, p=None, g=1):
"""Initializes YOLOv8 classification head with specified input and output channels, kernel size, stride,
padding, and groups.
"""
super().__init__()
c_ = 1280 # efficientnet_b0 size
self.conv = Conv(c1, c_, k, s, p, g)
@ -166,6 +170,13 @@ class Classify(nn.Module):
class RTDETRDecoder(nn.Module):
"""
Real-Time Deformable Transformer Decoder (RTDETRDecoder) module for object detection.
This decoder module utilizes Transformer architecture along with deformable convolutions to predict bounding boxes
and class labels for objects in an image. It integrates features from multiple layers and runs through a series of
Transformer decoder layers to output the final predictions.
"""
export = False # export mode
def __init__(
@ -186,6 +197,26 @@ class RTDETRDecoder(nn.Module):
label_noise_ratio=0.5,
box_noise_scale=1.0,
learnt_init_query=False):
"""
Initializes the RTDETRDecoder module with the given parameters.
Args:
nc (int): Number of classes. Default is 80.
ch (tuple): Channels in the backbone feature maps. Default is (512, 1024, 2048).
hd (int): Dimension of hidden layers. Default is 256.
nq (int): Number of query points. Default is 300.
ndp (int): Number of decoder points. Default is 4.
nh (int): Number of heads in multi-head attention. Default is 8.
ndl (int): Number of decoder layers. Default is 6.
d_ffn (int): Dimension of the feed-forward networks. Default is 1024.
dropout (float): Dropout rate. Default is 0.
act (nn.Module): Activation function. Default is nn.ReLU.
eval_idx (int): Evaluation index. Default is -1.
nd (int): Number of denoising queries. Default is 100.
label_noise_ratio (float): Label noise ratio. Default is 0.5.
box_noise_scale (float): Box noise scale. Default is 1.0.
learnt_init_query (bool): Whether to learn initial query embeddings. Default is False.
"""
super().__init__()
self.hidden_dim = hd
self.nhead = nh

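The Classify head docstring above maps x(b,c1,20,20) to x(b,c2); a hedged shape check (import path assumed):

```python
import torch
from ultralytics.nn.modules import Classify  # assumed public export

# Shape-check sketch for the classification head described above.
head = Classify(c1=256, c2=1000)
y = head(torch.randn(1, 256, 20, 20))
print(y.shape)  # torch.Size([1, 1000])
```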
View File

@ -375,9 +375,9 @@ class RTDETRDetectionModel(DetectionModel):
"""
RTDETR (Real-time DEtection and Tracking using Transformers) Detection Model class.
This class is responsible for constructing the RTDETR architecture, defining loss functions, and
facilitating both the training and inference processes. RTDETR is an object detection and tracking model
that extends from the DetectionModel base class.
This class is responsible for constructing the RTDETR architecture, defining loss functions, and facilitating both
the training and inference processes. RTDETR is an object detection and tracking model that extends from the
DetectionModel base class.
Attributes:
cfg (str): The configuration file path or preset string. Default is 'rtdetr-l.yaml'.
@ -418,7 +418,7 @@ class RTDETRDetectionModel(DetectionModel):
preds (torch.Tensor, optional): Precomputed model predictions. Defaults to None.
Returns:
tuple: A tuple containing the total loss and main three losses in a tensor.
(tuple): A tuple containing the total loss and main three losses in a tensor.
"""
if not hasattr(self, 'criterion'):
self.criterion = self.init_criterion()
@ -466,7 +466,7 @@ class RTDETRDetectionModel(DetectionModel):
augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
Returns:
torch.Tensor: Model's output tensor.
(torch.Tensor): Model's output tensor.
"""
y, dt = [], [] # outputs
for m in self.model[:-1]: # except the head part

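For context, RTDETRDetectionModel above backs the user-facing RTDETR class; a hedged usage sketch follows (the weights name is a placeholder).

```python
from ultralytics import RTDETR

# Hedged usage sketch; RTDETRDetectionModel is constructed internally from the weights/config.
model = RTDETR('rtdetr-l.pt')  # assumed local checkpoint
results = model('bus.jpg')     # inference returns a list of Results
```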
View File

@ -184,6 +184,19 @@ class ProfileModels:
half=True,
trt=True,
device=None):
"""
Initialize the ProfileModels class for profiling models.
Args:
paths (list): List of paths of the models to be profiled.
num_timed_runs (int, optional): Number of timed runs for the profiling. Default is 100.
num_warmup_runs (int, optional): Number of warmup runs before the actual profiling starts. Default is 10.
min_time (float, optional): Minimum time in seconds for profiling a model. Default is 60.
imgsz (int, optional): Size of the image used during profiling. Default is 640.
half (bool, optional): Flag to indicate whether to use half-precision floating point for profiling. Default is True.
trt (bool, optional): Flag to indicate whether to profile using TensorRT. Default is True.
device (torch.device, optional): Device used for profiling. If None, it is determined automatically. Default is None.
"""
self.paths = paths
self.num_timed_runs = num_timed_runs
self.num_warmup_runs = num_warmup_runs

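A hedged usage sketch for the profiler described above; the file names are placeholders, and exports (e.g. ONNX/TensorRT) happen as part of profiling.

```python
from ultralytics.utils.benchmarks import ProfileModels

# Hedged usage sketch: profile a weights file and a YAML config at 640x640.
ProfileModels(['yolov8n.pt', 'yolov8s.yaml'], imgsz=640).profile()
```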
View File

@ -4,6 +4,18 @@ from ultralytics.utils import emojis
class HUBModelError(Exception):
"""
Custom exception class for handling errors related to model fetching in Ultralytics YOLO.
This exception is raised when a requested model is not found or cannot be retrieved.
The message is also processed to include emojis for better user experience.
Attributes:
message (str): The error message displayed when the exception is raised.
Note:
The message is automatically processed through the 'emojis' function from the 'ultralytics.utils' package.
"""
def __init__(self, message='Model not found. Please check model URL and try again.'):
"""Create an exception for when a model is not found."""

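A minimal sketch of raising and handling the exception described above:

```python
from ultralytics.utils.errors import HUBModelError

# Minimal raise/handle sketch; the message is passed through emojis() before display.
try:
    raise HUBModelError('Model not found. Please check model URL and try again.')
except HUBModelError as err:
    print(err)
```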
View File

@ -33,9 +33,17 @@ __all__ = 'Bboxes', # tuple or list
class Bboxes:
"""
Bounding Boxes class.
A class for handling bounding boxes.
Only numpy variables are supported.
The class supports various bounding box formats like 'xyxy', 'xywh', and 'ltwh'.
Bounding box data should be provided in numpy arrays.
Attributes:
bboxes (numpy.ndarray): The bounding boxes stored in a 2D numpy array.
format (str): The format of the bounding boxes ('xyxy', 'xywh', or 'ltwh').
Note:
This class does not handle normalization or denormalization of bounding boxes.
"""
def __init__(self, bboxes, format='xyxy') -> None:
@ -166,6 +174,36 @@ class Bboxes:
class Instances:
"""
Container for bounding boxes, segments, and keypoints of detected objects in an image.
Attributes:
_bboxes (Bboxes): Internal object for handling bounding box operations.
keypoints (ndarray): keypoints(x, y, visible) with shape [N, 17, 3]. Default is None.
normalized (bool): Flag indicating whether the bounding box coordinates are normalized.
segments (ndarray): Segments array with shape [N, 1000, 2] after resampling.
Args:
bboxes (ndarray): An array of bounding boxes with shape [N, 4].
segments (list | ndarray, optional): A list or array of object segments. Default is None.
keypoints (ndarray, optional): An array of keypoints with shape [N, 17, 3]. Default is None.
bbox_format (str, optional): The format of bounding boxes ('xywh' or 'xyxy'). Default is 'xywh'.
normalized (bool, optional): Whether the bounding box coordinates are normalized. Default is True.
Examples:
```python
# Create an Instances object
instances = Instances(
bboxes=np.array([[10, 10, 30, 30], [20, 20, 40, 40]]),
segments=[np.array([[5, 5], [10, 10]]), np.array([[15, 15], [20, 20]])],
keypoints=np.array([[[5, 5, 1], [10, 10, 1]], [[15, 15, 1], [20, 20, 1]]])
)
```
Note:
The bounding box format is either 'xywh' or 'xyxy', and is determined by the `bbox_format` argument.
This class does not perform input validation, and it assumes the inputs are well-formed.
"""
def __init__(self, bboxes, segments=None, keypoints=None, bbox_format='xywh', normalized=True) -> None:
"""

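The Instances docstring above already carries an example; for the Bboxes class, here is a hedged conversion sketch (the in-place `convert` call is assumed from the class description).

```python
import numpy as np
from ultralytics.utils.instance import Bboxes

# Hedged sketch: convert a single 'xyxy' box to center-based 'xywh' in place.
boxes = Bboxes(np.array([[10.0, 10.0, 30.0, 40.0]]), format='xyxy')
boxes.convert('xywh')
print(boxes.bboxes)  # [[20. 25. 20. 30.]]
```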
View File

@ -59,6 +59,7 @@ class FocalLoss(nn.Module):
class BboxLoss(nn.Module):
"""Criterion class for computing training losses during training."""
def __init__(self, reg_max, use_dfl=False):
"""Initialize the BboxLoss module with regularization maximum and DFL settings."""
@ -115,7 +116,7 @@ class v8DetectionLoss:
"""Criterion class for computing training losses."""
def __init__(self, model): # model must be de-paralleled
"""Initializes v8DetectionLoss with the model, defining model-related properties and BCE loss function."""
device = next(model.parameters()).device # get model device
h = model.args # hyperparameters
@ -211,6 +212,7 @@ class v8SegmentationLoss(v8DetectionLoss):
"""Criterion class for computing training losses."""
def __init__(self, model): # model must be de-paralleled
"""Initializes the v8SegmentationLoss class, taking a de-paralleled model as argument."""
super().__init__(model)
self.overlap = model.args.overlap_mask
@ -375,6 +377,7 @@ class v8PoseLoss(v8DetectionLoss):
"""Criterion class for computing training losses."""
def __init__(self, model): # model must be de-paralleled
"""Initializes v8PoseLoss with model, sets keypoint variables and declares a keypoint loss instance."""
super().__init__(model)
self.kpt_shape = model.model[-1].kpt_shape
self.bce_pose = nn.BCEWithLogitsLoss()

View File

@ -166,8 +166,19 @@ def kpt_iou(kpt1, kpt2, area, sigma, eps=1e-7):
return (torch.exp(-e) * kpt_mask[:, None]).sum(-1) / (kpt_mask.sum(-1)[:, None] + eps)
def smooth_BCE(eps=0.1): # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
# return positive, negative label smoothing BCE targets
def smooth_BCE(eps=0.1):
"""
Computes smoothed positive and negative Binary Cross-Entropy targets.
This function calculates positive and negative label smoothing BCE targets based on a given epsilon value.
For implementation details, refer to https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441.
Args:
eps (float, optional): The epsilon value for label smoothing. Defaults to 0.1.
Returns:
(tuple): A tuple containing the positive and negative label smoothing BCE targets.
"""
return 1.0 - 0.5 * eps, 0.5 * eps

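A worked example of the return value above:

```python
from ultralytics.utils.metrics import smooth_BCE

print(smooth_BCE(eps=0.1))  # (0.95, 0.05) -> positive and negative label-smoothing BCE targets
```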
View File

@ -311,8 +311,10 @@ def initialize_weights(model):
m.inplace = True
def scale_img(img, ratio=1.0, same_shape=False, gs=32): # img(16,3,256,416)
# Scales img(bs,3,y,x) by ratio constrained to gs-multiple
def scale_img(img, ratio=1.0, same_shape=False, gs=32):
"""Scales and pads an image tensor of shape img(bs,3,y,x) based on given ratio and grid size gs, optionally
retaining the original shape.
"""
if ratio == 1.0:
return img
h, w = img.shape[2:]
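A hedged shape check for scale_img; the padded size in the comment follows from gs=32 rounding and is an expectation, not taken from the diff.

```python
import torch
from ultralytics.utils.torch_utils import scale_img

# Hedged shape-check sketch: scale by 0.5, then pad up to a multiple of gs=32.
img = torch.zeros(16, 3, 256, 416)
out = scale_img(img, ratio=0.5, gs=32)
print(out.shape)  # expected torch.Size([16, 3, 128, 224])
```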