Mirror of https://github.com/THU-MIG/yolov10.git, synced 2025-07-07 22:04:53 +08:00

Tests and docstrings improvements (#4475)

This commit is contained in: parent c659c0fa7b, commit 615ddc9d97
@@ -46,7 +46,7 @@ the benchmarks to their specific needs and compare the performance of different
 | Key       | Value   | Description                                                             |
 |-----------|---------|-----------------------------------------------------------------------|
 | `model`   | `None`  | path to model file, i.e. yolov8n.pt, yolov8n.yaml                       |
-| `data`    | `None`  | path to yaml referencing the benchmarking dataset (under `val` label)   |
+| `data`    | `None`  | path to YAML referencing the benchmarking dataset (under `val` label)   |
 | `imgsz`   | `640`   | image size as scalar or (h, w) list, i.e. (640, 480)                    |
 | `half`    | `False` | FP16 quantization                                                       |
 | `int8`    | `False` | INT8 quantization                                                       |
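A minimal sketch of how these keys might be passed, assuming the `benchmark` helper in `ultralytics.utils.benchmarks` accepts the arguments listed in the table above (values are illustrative only):

```python
from ultralytics.utils.benchmarks import benchmark

# Benchmark a model; keyword names mirror the table above
benchmark(model='yolov8n.pt',      # path to model file
          data='coco128.yaml',     # YAML referencing the benchmarking dataset (under the `val` label)
          imgsz=640,               # image size as scalar or (h, w) list
          half=False,              # FP16 quantization
          int8=False)              # INT8 quantization
```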
@@ -93,7 +93,7 @@ of the model to improve its performance.
 from ultralytics import YOLO
 
 model = YOLO("model.pt")
-# It'll use the data yaml file in model.pt if you don't set data.
+# It'll use the data YAML file in model.pt if you don't set data.
 model.val()
 # or you can set the data you want to val
 model.val(data='coco128.yaml')
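As a follow-on, the object returned by `val()` can be inspected; a short sketch, assuming the detection metrics expose `box.map` and `box.map50` attributes as in the Ultralytics docs:

```python
from ultralytics import YOLO

model = YOLO("model.pt")
metrics = model.val(data='coco128.yaml')  # run validation and keep the metrics object
print(metrics.box.map)    # mAP50-95
print(metrics.box.map50)  # mAP50
```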
@@ -107,7 +107,7 @@ Versioning your data separately from your code is generally a good idea and make
 
 ### Prepare Your Dataset
 
-The YOLOv5 repository supports a number of different datasets by using yaml files containing their information. By default datasets are downloaded to the `../datasets` folder in relation to the repository root folder. So if you downloaded the `coco128` dataset using the link in the yaml or with the scripts provided by yolov5, you get this folder structure:
+The YOLOv5 repository supports a number of different datasets by using YAML files containing their information. By default datasets are downloaded to the `../datasets` folder in relation to the repository root folder. So if you downloaded the `coco128` dataset using the link in the YAML or with the scripts provided by yolov5, you get this folder structure:
 
 ```
 ..

@@ -122,7 +122,7 @@ The YOLOv5 repository supports a number of different datasets by using yaml file
 
 But this can be any dataset you wish. Feel free to use your own, as long as you keep to this folder structure.
 
-Next, ⚠️**copy the corresponding yaml file to the root of the dataset folder**⚠️. This yaml files contains the information ClearML will need to properly use the dataset. You can make this yourself too, of course, just follow the structure of the example yamls.
+Next, ⚠️**copy the corresponding YAML file to the root of the dataset folder**⚠️. This YAML files contains the information ClearML will need to properly use the dataset. You can make this yourself too, of course, just follow the structure of the example YAMLs.
 
 Basically we need the following keys: `path`, `train`, `test`, `val`, `nc`, `names`.
 
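To make the required keys concrete, here is a hedged sketch that writes such a dataset description programmatically; the paths and class names are illustrative only, and the file would normally live at the dataset root:

```python
import yaml  # requires PyYAML

# Illustrative dataset description with the keys ClearML needs: path, train, test, val, nc, names
dataset_info = {
    'path': '../datasets/coco128',   # dataset root (assumed location)
    'train': 'images/train2017',     # train images, relative to 'path'
    'val': 'images/train2017',       # val images, relative to 'path'
    'test': '',                      # optional test images
    'nc': 2,                         # number of classes
    'names': ['person', 'bicycle'],  # class names
}

with open('coco128.yaml', 'w') as f:
    yaml.safe_dump(dataset_info, f, sort_keys=False)
```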
@@ -41,7 +41,7 @@ python train.py --data custom.yaml --weights yolov5s.pt
                          custom_pretrained.pt
 ```
 
-- **Start from Scratch.** Recommended for large datasets (i.e. [COCO](https://github.com/ultralytics/yolov5/blob/master/data/coco.yaml), [Objects365](https://github.com/ultralytics/yolov5/blob/master/data/Objects365.yaml), [OIv6](https://storage.googleapis.com/openimages/web/index.html)). Pass the model architecture yaml you are interested in, along with an empty `--weights ''` argument:
+- **Start from Scratch.** Recommended for large datasets (i.e. [COCO](https://github.com/ultralytics/yolov5/blob/master/data/coco.yaml), [Objects365](https://github.com/ultralytics/yolov5/blob/master/data/Objects365.yaml), [OIv6](https://storage.googleapis.com/openimages/web/index.html)). Pass the model architecture YAML you are interested in, along with an empty `--weights ''` argument:
 
   ```bash
   python train.py --data custom.yaml --weights '' --cfg yolov5s.yaml
@@ -35,28 +35,19 @@ def test_model_methods():
     model = model.reset_weights()
     model = model.load(MODEL)
     model.to('cpu')
+    model.fuse()
     _ = model.names
     _ = model.device
 
 
-def test_model_fuse():
-    model = YOLO(MODEL)
-    model.fuse()
-
-
-def test_predict_dir():
-    model = YOLO(MODEL)
-    model(source=ASSETS, imgsz=32)
-
-
 def test_predict_txt():
-    # Write a list of sources to a txt file
+    # Write a list of sources (file, dir, glob, recursive glob) to a txt file
     txt_file = TMP / 'sources.txt'
     with open(txt_file, 'w') as f:
-        for x in [ASSETS / 'bus.jpg', ASSETS / 'zidane.jpg']:
+        for x in [ASSETS / 'bus.jpg', ASSETS, ASSETS / '*', ASSETS / '**/*.jpg']:
            f.write(f'{x}\n')
     model = YOLO(MODEL)
-    model(source=txt_file, imgsz=640)
+    model(source=txt_file, imgsz=32)
 
 
 def test_predict_img():
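For context, a hedged sketch of the pattern the updated test exercises: a plain text file where each line is a source (an image file, a directory, a glob, or a recursive glob), passed straight to the model as `source`; the weights file is illustrative:

```python
from pathlib import Path
from ultralytics import YOLO
from ultralytics.utils import ASSETS

txt_file = Path('sources.txt')
# One source per line: file, directory, glob, recursive glob
txt_file.write_text('\n'.join(str(s) for s in [ASSETS / 'bus.jpg', ASSETS, ASSETS / '*', ASSETS / '**/*.jpg']))

model = YOLO('yolov8n.pt')                    # any detection weights work here
results = model(source=txt_file, imgsz=320)   # each listed source is expanded and predicted
```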
@@ -16,7 +16,7 @@ import torch
 from PIL import Image
 
 from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
-from ultralytics.utils import ASSETS, LOGGER, is_colab, is_kaggle, ops
+from ultralytics.utils import LOGGER, is_colab, is_kaggle, ops
 from ultralytics.utils.checks import check_requirements
 
 

@@ -167,7 +167,7 @@ class LoadScreenshots:
 
     def __next__(self):
         """mss screen capture: get raw pixels from the screen as np array."""
-        im0 = np.array(self.sct.grab(self.monitor))[:, :, :3]  # [:, :, :3] BGRA to BGR
+        im0 = np.asarray(self.sct.grab(self.monitor))[:, :, :3]  # BGRA to BGR
         s = f'screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: '
 
         self.frame += 1
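A brief hedged illustration of the screen-grab conversion touched above (assumes the third-party `mss` package is installed; the monitor index is illustrative):

```python
import numpy as np
import mss

with mss.mss() as sct:
    monitor = sct.monitors[1]         # primary monitor
    raw = sct.grab(monitor)           # BGRA screenshot
    im = np.asarray(raw)[:, :, :3]    # drop the alpha channel: BGRA -> BGR
    print(im.shape)                   # (height, width, 3)
```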
@@ -400,10 +400,3 @@ def get_best_youtube_url(url, use_pafy=False):
             good_size = (f.get('width') or 0) >= 1920 or (f.get('height') or 0) >= 1080
             if good_size and f['vcodec'] != 'none' and f['acodec'] == 'none' and f['ext'] == 'mp4':
                 return f.get('url')
-
-
-if __name__ == '__main__':
-    img = cv2.imread(str(ASSETS / 'bus.jpg'))
-    dataset = LoadPilAndNumpy(im0=img)
-    for d in dataset:
-        print(d[0])
@@ -204,7 +204,7 @@ def check_det_dataset(dataset, autodownload=True):
         data = next((DATASETS_DIR / new_dir).rglob('*.yaml'))
         extract_dir, autodownload = data.parent, False
 
-    # Read yaml (optional)
+    # Read YAML (optional)
     if isinstance(data, (str, Path)):
         data = yaml_load(data, append_filename=True)  # dictionary
 

@@ -244,7 +244,7 @@ def check_det_dataset(dataset, autodownload=True):
             else:
                 data[k] = [str((path / x).resolve()) for x in data[k]]
 
-    # Parse yaml
+    # Parse YAML
     train, val, test, s = (data.get(x) for x in ('train', 'val', 'test', 'download'))
     if val:
         val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path

@@ -321,12 +321,12 @@ def check_cls_dataset(dataset: str, split=''):
     # Print to console
     for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items():
         if v is None:
-            LOGGER.info(colorstr(k) + f': {v}')
+            LOGGER.info(f'{colorstr(k)}: {v}')
         else:
             files = [path for path in v.rglob('*.*') if path.suffix[1:].lower() in IMG_FORMATS]
             nf = len(files)  # number of files
             nd = len({file.parent for file in files})  # number of directories
-            LOGGER.info(colorstr(k) + f': {v}... found {nf} images in {nd} classes ✅ ')  # keep trailing space
+            LOGGER.info(f'{colorstr(k)}: {v}... found {nf} images in {nd} classes ✅ ')  # keep trailing space
 
     return {'train': train_set, 'val': val_set or test_set, 'test': test_set or val_set, 'nc': nc, 'names': names}
 
@@ -122,7 +122,7 @@ class Model:
         self.model = model(cfg_dict, verbose=verbose and RANK == -1)  # build model
         self.overrides['model'] = self.cfg
 
-        # Below added to allow export from yamls
+        # Below added to allow export from YAMLs
         args = {**DEFAULT_CFG_DICT, **self.overrides}  # combine model and default args, preferring model args
         self.model.args = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS}  # attach args to model
         self.model.task = self.task
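The `{**DEFAULT_CFG_DICT, **self.overrides}` line above relies on standard Python dict-merge precedence: when the same key appears in both mappings, the right-hand one wins. A tiny self-contained illustration with made-up keys:

```python
defaults = {'imgsz': 640, 'device': 'cpu', 'half': False}
overrides = {'imgsz': 320}

args = {**defaults, **overrides}  # later mapping takes precedence for duplicate keys
print(args)  # {'imgsz': 320, 'device': 'cpu', 'half': False}
```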
@@ -24,7 +24,7 @@ class MaskDecoder(nn.Module):
     """
     Predicts masks given an image and prompt embeddings, using a transformer architecture.
 
-    Arguments:
+    Args:
       transformer_dim (int): the channel dimension of the transformer module
       transformer (nn.Module): the transformer used to predict masks
       num_multimask_outputs (int): the number of masks to predict when disambiguating masks

@@ -65,7 +65,7 @@ class MaskDecoder(nn.Module):
         """
         Predict masks given image and prompt embeddings.
 
-        Arguments:
+        Args:
           image_embeddings (torch.Tensor): the embeddings from the image encoder
           image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
           sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
@@ -103,13 +103,9 @@ class ImageEncoderViT(nn.Module):
         x = self.patch_embed(x)
         if self.pos_embed is not None:
             x = x + self.pos_embed
 
         for blk in self.blocks:
             x = blk(x)
-
-        x = self.neck(x.permute(0, 3, 1, 2))
-
-        return x
+        return self.neck(x.permute(0, 3, 1, 2))
 
 
 class PromptEncoder(nn.Module):
@@ -125,7 +121,7 @@ class PromptEncoder(nn.Module):
     """
     Encodes prompts for input to SAM's mask decoder.
 
-    Arguments:
+    Args:
       embed_dim (int): The prompts' embedding dimension
       image_embedding_size (tuple(int, int)): The spatial size of the
         image embedding, as (H, W).

@@ -165,8 +161,7 @@ class PromptEncoder(nn.Module):
        applied to a dense set of points the shape of the image encoding.
 
        Returns:
-          torch.Tensor: Positional encoding with shape
-            1x(embed_dim)x(embedding_h)x(embedding_w)
+          torch.Tensor: Positional encoding with shape 1x(embed_dim)x(embedding_h)x(embedding_w)
        """
        return self.pe_layer(self.image_embedding_size).unsqueeze(0)
 

@@ -231,21 +226,17 @@ class PromptEncoder(nn.Module):
        masks: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
-        Embeds different types of prompts, returning both sparse and dense
-        embeddings.
+        Embeds different types of prompts, returning both sparse and dense embeddings.
 
-        Arguments:
-          points (tuple(torch.Tensor, torch.Tensor), None): point coordinates
-            and labels to embed.
+        Args:
+          points (tuple(torch.Tensor, torch.Tensor), None): point coordinates and labels to embed.
          boxes (torch.Tensor, None): boxes to embed
          masks (torch.Tensor, None): masks to embed
 
        Returns:
-          torch.Tensor: sparse embeddings for the points and boxes, with shape
-            BxNx(embed_dim), where N is determined by the number of input points
-            and boxes.
-          torch.Tensor: dense embeddings for the masks, in the shape
-            Bx(embed_dim)x(embed_H)x(embed_W)
+          torch.Tensor: sparse embeddings for the points and boxes, with shape BxNx(embed_dim), where N is determined
+            by the number of input points and boxes.
+          torch.Tensor: dense embeddings for the masks, in the shape Bx(embed_dim)x(embed_H)x(embed_W)
        """
        bs = self._get_batch_size(points, boxes, masks)
        sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
@@ -372,9 +363,7 @@ class Block(nn.Module):
             x = window_unpartition(x, self.window_size, pad_hw, (H, W))
 
         x = shortcut + x
-        x = x + self.mlp(self.norm2(x))
-
-        return x
+        return x + self.mlp(self.norm2(x))
 
 
 class Attention(nn.Module):

@@ -427,9 +416,7 @@ class Attention(nn.Module):
 
         attn = attn.softmax(dim=-1)
         x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
-        x = self.proj(x)
-
-        return x
+        return self.proj(x)
 
 
 def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:

@@ -577,7 +564,4 @@ class PatchEmbed(nn.Module):
         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.proj(x)
-        # B C H W -> B H W C
-        x = x.permute(0, 2, 3, 1)
-        return x
+        return self.proj(x).permute(0, 2, 3, 1)  # B C H W -> B H W C
@@ -29,7 +29,7 @@ class Sam(nn.Module):
     """
     SAM predicts object masks from an image and input prompts.
 
-    Arguments:
+    Args:
      image_encoder (ImageEncoderViT): The backbone used to encode the
        image into image embeddings that allow for efficient mask prediction.
      prompt_encoder (PromptEncoder): Encodes various types of input prompts.

@@ -60,14 +60,12 @@ class Sam(nn.Module):
        multimask_output: bool,
    ) -> List[Dict[str, torch.Tensor]]:
        """
-        Predicts masks end-to-end from provided images and prompts.
-        If prompts are not known in advance, using SamPredictor is
-        recommended over calling the model directly.
+        Predicts masks end-to-end from provided images and prompts. If prompts are not known in advance, using
+        SamPredictor is recommended over calling the model directly.
 
-        Arguments:
-          batched_input (list(dict)): A list over input images, each a
-            dictionary with the following keys. A prompt key can be
-            excluded if it is not present.
+        Args:
+          batched_input (list(dict)): A list over input images, each a dictionary with the following keys. A prompt
+            key can be excluded if it is not present.
              'image': The image as a torch tensor in 3xHxW format,
                already transformed for input to the model.
              'original_size': (tuple(int, int)) The original size of

@@ -81,12 +79,11 @@ class Sam(nn.Module):
                Already transformed to the input frame of the model.
              'mask_inputs': (torch.Tensor) Batched mask inputs to the model,
                in the form Bx1xHxW.
-          multimask_output (bool): Whether the model should predict multiple
-            disambiguating masks, or return a single mask.
+          multimask_output (bool): Whether the model should predict multiple disambiguating masks, or return a single
+            mask.
 
        Returns:
-          (list(dict)): A list over input images, where each element is
-            as dictionary with the following keys.
+          (list(dict)): A list over input images, where each element is as dictionary with the following keys.
              'masks': (torch.Tensor) Batched binary mask predictions,
                with shape BxCxHxW, where B is the number of input prompts,
                C is determined by multimask_output, and (H, W) is the

@@ -139,7 +136,7 @@ class Sam(nn.Module):
        """
        Remove padding and upscale masks to the original image size.
 
-        Arguments:
+        Args:
          masks (torch.Tensor): Batched masks from the mask_decoder,
            in BxCxHxW format.
          input_size (tuple(int, int)): The size of the image input to the

@@ -158,8 +155,7 @@ class Sam(nn.Module):
            align_corners=False,
        )
        masks = masks[..., :input_size[0], :input_size[1]]
-        masks = F.interpolate(masks, original_size, mode='bilinear', align_corners=False)
-        return masks
+        return F.interpolate(masks, original_size, mode='bilinear', align_corners=False)
 
    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize pixel values and pad to a square input."""
@@ -35,8 +35,7 @@ class Conv2d_BN(torch.nn.Sequential):
         c, bn = self._modules.values()
         w = bn.weight / (bn.running_var + bn.eps) ** 0.5
         w = c.weight * w[:, None, None, None]
-        b = bn.bias - bn.running_mean * bn.weight / \
-            (bn.running_var + bn.eps)**0.5
+        b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5
         m = torch.nn.Conv2d(w.size(1) * self.c.groups,
                             w.size(0),
                             w.shape[2:],
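The `fuse` code above is standard BatchNorm folding: scale the conv weights by gamma / sqrt(var + eps) and set the bias to beta - running_mean * gamma / sqrt(var + eps). A self-contained sanity check of that identity (module sizes are illustrative, eval mode so running statistics are used):

```python
import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, 3, padding=1, bias=False).eval()
bn = nn.BatchNorm2d(8).eval()

x = torch.randn(1, 3, 16, 16)
ref = bn(conv(x))

# Fold BN into the conv: w_fused = w * gamma/sqrt(var+eps), b_fused = beta - mean * gamma/sqrt(var+eps)
scale = bn.weight / (bn.running_var + bn.eps) ** 0.5
fused = nn.Conv2d(3, 8, 3, padding=1, bias=True)
fused.weight.data = conv.weight * scale[:, None, None, None]
fused.bias.data = bn.bias - bn.running_mean * scale

print(torch.allclose(ref, fused(x), atol=1e-6))  # True
```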
@@ -72,8 +71,7 @@ class PatchEmbed(nn.Module):
         super().__init__()
         img_size: Tuple[int, int] = to_2tuple(resolution)
         self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
-        self.num_patches = self.patches_resolution[0] * \
-            self.patches_resolution[1]
+        self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
         self.in_chans = in_chans
         self.embed_dim = embed_dim
         n = embed_dim
@@ -110,21 +108,14 @@ class MBConv(nn.Module):
 
     def forward(self, x):
         shortcut = x
-
         x = self.conv1(x)
         x = self.act1(x)
-
         x = self.conv2(x)
         x = self.act2(x)
-
         x = self.conv3(x)
-
         x = self.drop_path(x)
-
         x += shortcut
-        x = self.act3(x)
-
-        return x
+        return self.act3(x)
 
 
 class PatchMerging(nn.Module):
@@ -137,9 +128,7 @@ class PatchMerging(nn.Module):
         self.out_dim = out_dim
         self.act = activation()
         self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
-        stride_c = 2
-        if (out_dim == 320 or out_dim == 448 or out_dim == 576):
-            stride_c = 1
+        stride_c = 1 if out_dim in [320, 448, 576] else 2
         self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
         self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
 

@@ -156,8 +145,7 @@ class PatchMerging(nn.Module):
         x = self.conv2(x)
         x = self.act(x)
         x = self.conv3(x)
-        x = x.flatten(2).transpose(1, 2)
-        return x
+        return x.flatten(2).transpose(1, 2)
 
 
 class ConvLayer(nn.Module):
@@ -174,7 +162,6 @@ class ConvLayer(nn.Module):
                  out_dim=None,
                  conv_expand_ratio=4.,
                  ):
-
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution

@@ -192,20 +179,13 @@ class ConvLayer(nn.Module):
             ) for i in range(depth)])
 
         # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, out_dim=out_dim, activation=activation)
-        else:
-            self.downsample = None
+        self.downsample = None if downsample is None else downsample(
+            input_resolution, dim=dim, out_dim=out_dim, activation=activation)
 
     def forward(self, x):
         for blk in self.blocks:
-            if self.use_checkpoint:
-                x = checkpoint.checkpoint(blk, x)
-            else:
-                x = blk(x)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
+            x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
+        return x if self.downsample is None else self.downsample(x)
 
 
 class Mlp(nn.Module):
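The collapsed `forward` above keeps the gradient-checkpointing option as a one-line conditional. A minimal, self-contained sketch of the same pattern with a toy block (names and sizes are illustrative):

```python
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint

blocks = nn.ModuleList([nn.Sequential(nn.Linear(16, 16), nn.ReLU()) for _ in range(4)])
use_checkpoint = True  # trade compute for memory: activations are recomputed during backward

x = torch.randn(2, 16, requires_grad=True)
for blk in blocks:
    # Same conditional as in ConvLayer/BasicLayer.forward
    x = checkpoint.checkpoint(blk, x) if use_checkpoint else blk(x)
x.sum().backward()
print(x.shape)  # torch.Size([2, 16])
```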
@@ -222,13 +202,11 @@ class Mlp(nn.Module):
 
     def forward(self, x):
         x = self.norm(x)
-
         x = self.fc1(x)
         x = self.act(x)
         x = self.drop(x)
         x = self.fc2(x)
-        x = self.drop(x)
-        return x
+        return self.drop(x)
 
 
 class Attention(torch.nn.Module):
@@ -297,12 +275,12 @@ class Attention(torch.nn.Module):
                 (self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab))
         attn = attn.softmax(dim=-1)
         x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
-        x = self.proj(x)
-        return x
+        return self.proj(x)
 
 
 class TinyViTBlock(nn.Module):
-    r""" TinyViT Block.
+    """
+    TinyViT Block.
 
     Args:
         dim (int): Number of input channels.
@@ -312,8 +290,7 @@ class TinyViTBlock(nn.Module):
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
         drop (float, optional): Dropout rate. Default: 0.0
         drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        local_conv_size (int): the kernel size of the convolution between
-            Attention and MLP. Default: 3
+        local_conv_size (int): the kernel size of the convolution between Attention and MLP. Default: 3
         activation (torch.nn): the activation function. Default: nn.GELU
     """
 

@@ -391,8 +368,7 @@ class TinyViTBlock(nn.Module):
         x = self.local_conv(x)
         x = x.view(B, C, L).transpose(1, 2)
 
-        x = x + self.drop_path(self.mlp(x))
-        return x
+        return x + self.drop_path(self.mlp(x))
 
     def extra_repr(self) -> str:
         return f'dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, ' \
@@ -400,7 +376,8 @@ class TinyViTBlock(nn.Module):
 
 
 class BasicLayer(nn.Module):
-    """ A basic TinyViT layer for one stage.
+    """
+    A basic TinyViT layer for one stage.
 
     Args:
         dim (int): Number of input channels.
@@ -434,7 +411,6 @@ class BasicLayer(nn.Module):
                  activation=nn.GELU,
                  out_dim=None,
                  ):
-
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution

@@ -456,20 +432,13 @@ class BasicLayer(nn.Module):
             ) for i in range(depth)])
 
         # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, out_dim=out_dim, activation=activation)
-        else:
-            self.downsample = None
+        self.downsample = None if downsample is None else downsample(
+            input_resolution, dim=dim, out_dim=out_dim, activation=activation)
 
     def forward(self, x):
         for blk in self.blocks:
-            if self.use_checkpoint:
-                x = checkpoint.checkpoint(blk, x)
-            else:
-                x = blk(x)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
+            x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
+        return x if self.downsample is None else self.downsample(x)
 
     def extra_repr(self) -> str:
         return f'dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}'
@@ -487,8 +456,7 @@ class LayerNorm2d(nn.Module):
         u = x.mean(1, keepdim=True)
         s = (x - u).pow(2).mean(1, keepdim=True)
         x = (x - u) / torch.sqrt(s + self.eps)
-        x = self.weight[:, None, None] * x + self.bias[:, None, None]
-        return x
+        return self.weight[:, None, None] * x + self.bias[:, None, None]
 
 
 class TinyViT(nn.Module):
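The forward above normalizes each spatial position over the channel dimension of an NCHW tensor. A quick numerical check, with illustrative tensor sizes, that this matches `F.layer_norm` applied channel-last:

```python
import torch
import torch.nn.functional as F

N, C, H, W = 2, 8, 4, 4
x = torch.randn(N, C, H, W)
weight, bias, eps = torch.rand(C), torch.rand(C), 1e-6

# Manual channel-wise normalization, as in LayerNorm2d.forward
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
manual = weight[:, None, None] * (x - u) / torch.sqrt(s + eps) + bias[:, None, None]

# Equivalent: layer_norm over the channel dim after moving channels last
ref = F.layer_norm(x.permute(0, 2, 3, 1), (C,), weight, bias, eps).permute(0, 3, 1, 2)
print(torch.allclose(manual, ref, atol=1e-5))  # True
```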
@@ -548,10 +516,7 @@ class TinyViT(nn.Module):
                 activation=activation,
             )
             if i_layer == 0:
-                layer = ConvLayer(
-                    conv_expand_ratio=mbconv_expand_ratio,
-                    **kwargs,
-                )
+                layer = ConvLayer(conv_expand_ratio=mbconv_expand_ratio, **kwargs)
             else:
                 layer = BasicLayer(num_heads=num_heads[i_layer],
                                    window_size=window_sizes[i_layer],

@@ -622,7 +587,7 @@ class TinyViT(nn.Module):
         if isinstance(m, nn.Linear):
             # NOTE: This initialization is needed only for training.
             # trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
+            if m.bias is not None:
                 nn.init.constant_(m.bias, 0)
         elif isinstance(m, nn.LayerNorm):
             nn.init.constant_(m.bias, 0)

@@ -645,9 +610,7 @@ class TinyViT(nn.Module):
         B, _, C = x.size()
         x = x.view(B, 64, 64, C)
         x = x.permute(0, 3, 1, 2)
-        x = self.neck(x)
-        return x
+        return self.neck(x)
 
     def forward(self, x):
-        x = self.forward_features(x)
-        return x
+        return self.forward_features(x)
@@ -61,16 +61,14 @@ class TwoWayTransformer(nn.Module):
     ) -> Tuple[Tensor, Tensor]:
         """
         Args:
-          image_embedding (torch.Tensor): image to attend to. Should be shape
-            B x embedding_dim x h x w for any h and w.
-          image_pe (torch.Tensor): the positional encoding to add to the image. Must
-            have the same shape as image_embedding.
+          image_embedding (torch.Tensor): image to attend to. Should be shape B x embedding_dim x h x w for any h and w.
+          image_pe (torch.Tensor): the positional encoding to add to the image. Must have same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.
 
        Returns:
-          torch.Tensor: the processed point_embedding
-          torch.Tensor: the processed image_embedding
+          (torch.Tensor): the processed point_embedding
+          (torch.Tensor): the processed image_embedding
        """
        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        bs, c, h, w = image_embedding.shape

@@ -112,12 +110,11 @@ class TwoWayAttentionBlock(nn.Module):
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
-        A transformer block with four layers: (1) self-attention of sparse
-        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
-        block on sparse inputs, and (4) cross attention of dense inputs to sparse
+        A transformer block with four layers: (1) self-attention of sparse inputs, (2) cross attention of sparse
+        inputs to dense inputs, (3) mlp block on sparse inputs, and (4) cross attention of dense inputs to sparse
        inputs.
 
-        Arguments:
+        Args:
          embedding_dim (int): the channel dimension of the embeddings
          num_heads (int): the number of heads in the attention layers
          mlp_dim (int): the hidden dimension of the mlp block

@@ -175,8 +172,8 @@ class TwoWayAttentionBlock(nn.Module):
 
 class Attention(nn.Module):
     """
-    An attention layer that allows for downscaling the size of the embedding
-    after projection to queries, keys, and values.
+    An attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
+    values.
     """
 
     def __init__(

@@ -230,6 +227,4 @@ class Attention(nn.Module):
         # Get output
         out = attn @ v
         out = self._recombine_heads(out)
-        out = self.out_proj(out)
-
-        return out
+        return self.out_proj(out)
@@ -145,9 +145,11 @@ class SegmentationValidator(DetectionValidator):
     def _process_batch(self, detections, labels, pred_masks=None, gt_masks=None, overlap=False, masks=False):
         """
         Return correct prediction matrix
-        Arguments:
+
+        Args:
             detections (array[N, 6]), x1, y1, x2, y2, conf, class
             labels (array[M, 5]), class, x1, y1, x2, y2
 
         Returns:
             correct (array[N, 10]), for 10 IoU levels
         """
@@ -97,8 +97,7 @@ class AIFI(TransformerEncoderLayer):
         out_w = grid_w.flatten()[..., None] @ omega[None]
         out_h = grid_h.flatten()[..., None] @ omega[None]
 
-        return torch.concat([torch.sin(out_w), torch.cos(out_w),
-                             torch.sin(out_h), torch.cos(out_h)], axis=1)[None, :, :]
+        return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]
 
 
 class TransformerLayer(nn.Module):
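The return statement above is the tail of a standard 2D sine-cosine positional embedding. A self-contained, hedged sketch of the full construction, assuming a w x h grid, an embedding dimension divisible by 4, and a temperature of 10000:

```python
import torch

def sincos_pos_embed_2d(w=20, h=20, embed_dim=256, temperature=10000.0):
    """Build a (1, w*h, embed_dim) 2D sine-cosine positional embedding (illustrative re-implementation)."""
    grid_w = torch.arange(w, dtype=torch.float32)
    grid_h = torch.arange(h, dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = grid_w.flatten()[..., None] @ omega[None]  # (w*h, pos_dim)
    out_h = grid_h.flatten()[..., None] @ omega[None]  # (w*h, pos_dim)
    return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]

print(sincos_pos_embed_2d().shape)  # torch.Size([1, 400, 256])
```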
@@ -170,9 +169,11 @@ class MLP(nn.Module):
         return x
 
 
-# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
-# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
 class LayerNorm2d(nn.Module):
+    """
+    LayerNorm2d module from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
+    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
+    """
 
     def __init__(self, num_channels, eps=1e-6):
         super().__init__()
@@ -229,7 +229,7 @@ class DetectionModel(BaseModel):
         ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
         if nc and nc != self.yaml['nc']:
             LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
-            self.yaml['nc'] = nc  # override yaml value
+            self.yaml['nc'] = nc  # override YAML value
         self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
         self.names = {i: f'{i}' for i in range(self.yaml['nc'])}  # default names dict
         self.inplace = self.yaml.get('inplace', True)

@@ -329,7 +329,7 @@ class ClassificationModel(BaseModel):
                  ch=3,
                  nc=None,
                  cutoff=10,
-                 verbose=True):  # yaml, model, channels, number of classes, cutoff index, verbose flag
+                 verbose=True):  # YAML, model, channels, number of classes, cutoff index, verbose flag
         super().__init__()
         self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg, ch, nc, verbose)
 

@@ -357,7 +357,7 @@ class ClassificationModel(BaseModel):
         ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
         if nc and nc != self.yaml['nc']:
             LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
-            self.yaml['nc'] = nc  # override yaml value
+            self.yaml['nc'] = nc  # override YAML value
         elif not nc and not self.yaml.get('nc', None):
             raise ValueError('nc not specified. Must specify nc in model.yaml or function arguments.')
         self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
@@ -341,10 +341,10 @@ def yaml_load(file='data.yaml', append_filename=False):
 
 def yaml_print(yaml_file: Union[str, Path, dict]) -> None:
     """
-    Pretty prints a yaml file or a yaml-formatted dictionary.
+    Pretty prints a YAML file or a YAML-formatted dictionary.
 
     Args:
-        yaml_file: The file path of the yaml file or a yaml-formatted dictionary.
+        yaml_file: The file path of the YAML file or a YAML-formatted dictionary.
 
     Returns:
         None
@@ -29,8 +29,7 @@ def _log_debug_samples(files, title='Debug Samples') -> None:
         files (list): A list of file paths in PosixPath format.
         title (str): A title that groups together images with the same values.
     """
-    task = Task.current_task()
-    if task:
+    if task := Task.current_task():
         for f in files:
             if f.exists():
                 it = re.search(r'_batch(\d+)', f.name)
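The `:=` (walrus) assignment used here binds and tests a value in one expression, which is why the two-step `task = ...; if task:` pattern collapses to a single `if`. A tiny illustration with a stand-in lookup (names are made up):

```python
def current_task():
    """Stand-in for Task.current_task(); returns None when no task is active."""
    return None

# Before: assign, then test
task = current_task()
if task:
    print('logging to', task)

# After: assignment expression (Python 3.8+), identical behaviour in one line
if task := current_task():
    print('logging to', task)
```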
@@ -63,8 +62,7 @@ def _log_plot(title, plot_path) -> None:
 def on_pretrain_routine_start(trainer):
     """Runs at start of pretraining routine; initializes and connects/ logs task to ClearML."""
     try:
-        task = Task.current_task()
-        if task:
+        if task := Task.current_task():
             # Make sure the automatic pytorch and matplotlib bindings are disabled!
             # We are logging these plots and model files manually in the integration
             PatchPyTorchModelIO.update_current_task(None)
@@ -86,21 +84,19 @@ def on_pretrain_routine_start(trainer):
 
 
 def on_train_epoch_end(trainer):
-    task = Task.current_task()
-
-    if task:
-        """Logs debug samples for the first epoch of YOLO training."""
+    """Logs debug samples for the first epoch of YOLO training and report current training progress."""
+    if task := Task.current_task():
+        # Log debug samples
         if trainer.epoch == 1:
             _log_debug_samples(sorted(trainer.save_dir.glob('train_batch*.jpg')), 'Mosaic')
-        """Report the current training progress."""
+        # Report the current training progress
         for k, v in trainer.validator.metrics.results_dict.items():
             task.get_logger().report_scalar('train', k, v, iteration=trainer.epoch)
 
 
 def on_fit_epoch_end(trainer):
     """Reports model information to logger at the end of an epoch."""
-    task = Task.current_task()
-    if task:
+    if task := Task.current_task():
         # You should have access to the validation bboxes under jdict
         task.get_logger().report_scalar(title='Epoch Time',
                                         series='Epoch Time',
@@ -120,8 +116,7 @@ def on_val_end(validator):
 
 def on_train_end(trainer):
     """Logs final model and its name on training completion."""
-    task = Task.current_task()
-    if task:
+    if task := Task.current_task():
         # Log final results, CM matrix + PR plots
         files = [
             'results.png', 'confusion_matrix.png', 'confusion_matrix_normalized.png',
@@ -40,7 +40,7 @@ def _log_images(path, prefix=''):
 
         # Group images by batch to enable sliders in UI
         if m := re.search(r'_batch(\d+)', name):
-            ni = m.group(1)
+            ni = m[1]
             new_stem = re.sub(r'_batch(\d+)', '_batch', path.stem)
             name = (Path(new_stem) / ni).with_suffix(path.suffix)
 
@@ -93,7 +93,7 @@ def zip_directory(directory, compress=True, exclude=('.DS_Store', '__MACOSX'), p
         raise FileNotFoundError(f"Directory '{directory}' does not exist.")
 
     # Unzip with progress bar
-    files_to_zip = [f for f in directory.rglob('*') if f.is_file() and not any(x in f.name for x in exclude)]
+    files_to_zip = [f for f in directory.rglob('*') if f.is_file() and all(x not in f.name for x in exclude)]
     zip_file = directory.with_suffix('.zip')
     compression = ZIP_DEFLATED if compress else ZIP_STORED
     with ZipFile(zip_file, 'w', compression) as f:
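The filter rewrite above is a straight De Morgan swap: `not any(x in name ...)` and `all(x not in name ...)` accept exactly the same names. A quick check with made-up file names:

```python
exclude = ('.DS_Store', '__MACOSX')
names = ['image.jpg', '.DS_Store', '__MACOSX/._image.jpg', 'labels.txt']

old = [n for n in names if not any(x in n for x in exclude)]
new = [n for n in names if all(x not in n for x in exclude)]
print(old == new)  # True -> ['image.jpg', 'labels.txt'] either way
```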
@@ -185,11 +185,9 @@ def check_disk_space(url='https://ultralytics.com/assets/coco128.zip', sf=1.5, h
                f'Please free {data * sf - free:.1f} GB additional disk space and try again.')
         if hard:
             raise MemoryError(text)
-        else:
-            LOGGER.warning(text)
-            return False
+        LOGGER.warning(text)
+        return False
 
-    # Pass if error
     return True
 
 
@@ -332,6 +330,9 @@ def get_github_assets(repo='ultralytics/assets', version='latest', retry=False):
     r = requests.get(url)  # github api
     if r.status_code != 200 and retry:
         r = requests.get(url)  # try again
+    if r.status_code != 200:
+        LOGGER.warning(f'⚠️ GitHub assets check failure for {url}: {r.status_code} {r.reason}')
+        return '', []
     data = r.json()
     return data['tag_name'], [x['name'] for x in data['assets']]  # tag, assets
 
|
@ -382,7 +382,7 @@ def compute_ap(recall, precision):
|
|||||||
"""
|
"""
|
||||||
Compute the average precision (AP) given the recall and precision curves.
|
Compute the average precision (AP) given the recall and precision curves.
|
||||||
|
|
||||||
Arguments:
|
Args:
|
||||||
recall (list): The recall curve.
|
recall (list): The recall curve.
|
||||||
precision (list): The precision curve.
|
precision (list): The precision curve.
|
||||||
|
|
||||||
|
@@ -140,7 +140,7 @@ def non_max_suppression(
     """
     Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
 
-    Arguments:
+    Args:
         prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
             containing the predicted boxes, classes, and masks. The tensor should be in the format
            output by a model, such as YOLO.
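A hedged sketch of feeding a tensor in the layout this docstring describes into NMS; the tensor contents are random and only the shape matters, and the `conf_thres`/`iou_thres` keyword names are an assumption about the current signature:

```python
import torch
from ultralytics.utils import ops

# Dummy prediction in the documented layout: (batch_size, 4 + num_classes, num_boxes)
# Here: 1 image, 80 classes, no masks, 8400 candidate boxes.
prediction = torch.rand(1, 4 + 80, 8400)

out = ops.non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45)
print(len(out), out[0].shape)  # one (n, 6) tensor per image: x1, y1, x2, y2, conf, cls
```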