From 52afadcb9772fb17933f02dc646b00074c184fba Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Tue, 29 Aug 2023 16:17:59 +0200
Subject: [PATCH] `ultralytics 8.0.166` expand `HUBDatasetStats` to Classify tasks (#4635)

---
 .github/workflows/ci.yaml   |  2 +-
 tests/test_python.py        |  2 +-
 ultralytics/__init__.py     |  2 +-
 ultralytics/cfg/__init__.py |  2 +-
 ultralytics/data/utils.py   | 96 ++++++++++++++++++++++++-------------
 5 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index ea3bab90..f7f39757 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -133,7 +133,7 @@ jobs:
         run: coverage run -a --source=ultralytics -m ultralytics.cfg.__init__ benchmark model='path with spaces/${{ matrix.model }}-seg.pt' imgsz=160 verbose=0.30
       - name: Benchmark ClassificationModel
         shell: bash
-        run: coverage run -a --source=ultralytics -m ultralytics.cfg.__init__ benchmark model='path with spaces/${{ matrix.model }}-cls.pt' imgsz=160 verbose=0.36
+        run: coverage run -a --source=ultralytics -m ultralytics.cfg.__init__ benchmark model='path with spaces/${{ matrix.model }}-cls.pt' imgsz=160 verbose=0.16
       - name: Benchmark PoseModel
         shell: bash
         run: coverage run -a --source=ultralytics -m ultralytics.cfg.__init__ benchmark model='path with spaces/${{ matrix.model }}-pose.pt' imgsz=160 verbose=0.17
diff --git a/tests/test_python.py b/tests/test_python.py
index 89077805..98ee4de0 100644
--- a/tests/test_python.py
+++ b/tests/test_python.py
@@ -277,7 +277,7 @@ def test_data_utils():
     # from ultralytics.utils.files import WorkingDirectory
     # with WorkingDirectory(ROOT.parent / 'tests'):
 
-    for task in 'detect', 'segment', 'pose':
+    for task in 'detect', 'segment', 'pose', 'classify':
         file = Path(TASK2DATA[task]).with_suffix('.zip')  # i.e. coco8.zip
         download(f'https://github.com/ultralytics/hub/raw/main/example_datasets/{file}', unzip=False, dir=TMP)
         stats = HUBDatasetStats(TMP / file, task=task)
diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index b8a41c7d..4de51cc2 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = '8.0.165'
+__version__ = '8.0.166'
 
 from ultralytics.models import RTDETR, SAM, YOLO
 from ultralytics.models.fastsam import FastSAM
diff --git a/ultralytics/cfg/__init__.py b/ultralytics/cfg/__init__.py
index fb938f32..7bc48f27 100644
--- a/ultralytics/cfg/__init__.py
+++ b/ultralytics/cfg/__init__.py
@@ -15,7 +15,7 @@ from ultralytics.utils import (ASSETS, DEFAULT_CFG, DEFAULT_CFG_DICT, DEFAULT_CF
 # Define valid tasks and modes
 MODES = 'train', 'val', 'predict', 'export', 'track', 'benchmark'
 TASKS = 'detect', 'segment', 'classify', 'pose'
-TASK2DATA = {'detect': 'coco8.yaml', 'segment': 'coco8-seg.yaml', 'classify': 'imagenet100', 'pose': 'coco8-pose.yaml'}
+TASK2DATA = {'detect': 'coco8.yaml', 'segment': 'coco8-seg.yaml', 'classify': 'imagenet10', 'pose': 'coco8-pose.yaml'}
 TASK2MODEL = {
     'detect': 'yolov8n.pt',
     'segment': 'yolov8n-seg.pt',
diff --git a/ultralytics/data/utils.py b/ultralytics/data/utils.py
index 552b0784..bf5535a1 100644
--- a/ultralytics/data/utils.py
+++ b/ultralytics/data/utils.py
@@ -296,7 +296,7 @@ def check_det_dataset(dataset, autodownload=True):
     return data  # dictionary
 
 
-def check_cls_dataset(dataset: str, split=''):
+def check_cls_dataset(dataset, split=''):
     """
     Checks a classification dataset such as Imagenet.
 
@@ -304,7 +304,7 @@ def check_cls_dataset(dataset, split=''):
     If the dataset is not found locally, it attempts to download the dataset from the internet and save it locally.
 
     Args:
-        dataset (str): The name of the dataset.
+        dataset (str | Path): The name of the dataset.
         split (str, optional): The split of the dataset. Either 'val', 'test', or ''. Defaults to ''.
 
     Returns:
@@ -360,7 +360,7 @@ def check_cls_dataset(dataset, split=''):
     else:
         LOGGER.info(f'{prefix} found {nf} images in {nd} classes ✅ ')
 
-    return {'train': train_set, 'val': val_set or test_set, 'test': test_set or val_set, 'nc': nc, 'names': names}
+    return {'train': train_set, 'val': val_set, 'test': test_set, 'nc': nc, 'names': names}
 
 
 class HUBDatasetStats:
@@ -373,14 +373,17 @@ class HUBDatasetStats:
         autodownload (bool): Attempt to download dataset if not found locally. Default is False.
 
     Example:
-        Download *.zip files from i.e. https://github.com/ultralytics/hub/raw/main/example_datasets/coco8.zip.
+        Download *.zip files from https://github.com/ultralytics/hub/tree/main/example_datasets
+            i.e. https://github.com/ultralytics/hub/raw/main/example_datasets/coco8.zip for coco8.zip.
         ```python
         from ultralytics.data.utils import HUBDatasetStats
 
         stats = HUBDatasetStats('path/to/coco8.zip', task='detect')  # detect dataset
         stats = HUBDatasetStats('path/to/coco8-seg.zip', task='segment')  # segment dataset
         stats = HUBDatasetStats('path/to/coco8-pose.zip', task='pose')  # pose dataset
-        stats.get_json(save=False)
+        stats = HUBDatasetStats('path/to/imagenet10.zip', task='classify')  # classification dataset
+
+        stats.get_json(save=True)
         stats.process_images()
         ```
     """
@@ -389,21 +392,27 @@ class HUBDatasetStats:
         """Initialize class."""
         path = Path(path).resolve()
         LOGGER.info(f'Starting HUB dataset checks for {path}....')
-        zipped, data_dir, yaml_path = self._unzip(path)
-        try:
-            # data = yaml_load(check_yaml(yaml_path))  # data dict
-            data = check_det_dataset(yaml_path, autodownload)  # data dict
-            if zipped:
-                data['path'] = data_dir
-        except Exception as e:
-            raise Exception('error/HUB/dataset_stats/yaml_load') from e
-
-        self.hub_dir = Path(str(data['path']) + '-hub')
+
+        self.task = task  # detect, segment, pose, classify
+        if self.task == 'classify':
+            unzip_dir = unzip_file(path)
+            data = check_cls_dataset(unzip_dir)
+            data['path'] = unzip_dir
+        else:  # detect, segment, pose
+            zipped, data_dir, yaml_path = self._unzip(Path(path))
+            try:
+                # data = yaml_load(check_yaml(yaml_path))  # data dict
+                data = check_det_dataset(yaml_path, autodownload)  # data dict
+                if zipped:
+                    data['path'] = data_dir
+            except Exception as e:
+                raise Exception('error/HUB/dataset_stats/init') from e
+
+        self.hub_dir = Path(f'{data["path"]}-hub')
         self.im_dir = self.hub_dir / 'images'
         self.im_dir.mkdir(parents=True, exist_ok=True)  # makes /images
         self.stats = {'nc': len(data['names']), 'names': list(data['names'].values())}  # statistics dictionary
         self.data = data
-        self.task = task  # detect, segment, pose, classify
 
     @staticmethod
     def _find_yaml(dir):
@@ -430,7 +439,6 @@ class HUBDatasetStats:
 
     def get_json(self, save=False, verbose=False):
         """Return dataset JSON for Ultralytics HUB."""
-        from ultralytics.data import YOLODataset  # ClassificationDataset
 
         def _round(labels):
             """Update labels to integer class and 4 decimal place floats."""
@@ -458,23 +466,45 @@ class HUBDatasetStats:
                 continue
 
             # Get dataset statistics
-            dataset = YOLODataset(img_path=self.data[split],
-                                  data=self.data,
-                                  use_segments=self.task == 'segment',
-                                  use_keypoints=self.task == 'pose')
-            x = np.array([
-                np.bincount(label['cls'].astype(int).flatten(), minlength=self.data['nc'])
-                for label in tqdm(dataset.labels, total=len(dataset), desc='Statistics')])  # shape(128x80)
-            self.stats[split] = {
-                'instance_stats': {
-                    'total': int(x.sum()),
-                    'per_class': x.sum(0).tolist()},
-                'image_stats': {
-                    'total': len(dataset),
-                    'unlabelled': int(np.all(x == 0, 1).sum()),
-                    'per_class': (x > 0).sum(0).tolist()},
-                'labels': [{
-                    Path(k).name: _round(v)} for k, v in zip(dataset.im_files, dataset.labels)]}
+            if self.task == 'classify':
+                from torchvision.datasets import ImageFolder
+
+                dataset = ImageFolder(self.data[split])
+
+                x = np.zeros(len(dataset.classes)).astype(int)
+                for im in dataset.imgs:
+                    x[im[1]] += 1
+
+                self.stats[split] = {
+                    'instance_stats': {
+                        'total': len(dataset),
+                        'per_class': x.tolist()},
+                    'image_stats': {
+                        'total': len(dataset),
+                        'unlabelled': 0,
+                        'per_class': x.tolist()},
+                    'labels': [{
+                        Path(k).name: v} for k, v in dataset.imgs]}
+            else:
+                from ultralytics.data import YOLODataset
+
+                dataset = YOLODataset(img_path=self.data[split],
+                                      data=self.data,
+                                      use_segments=self.task == 'segment',
+                                      use_keypoints=self.task == 'pose')
+                x = np.array([
+                    np.bincount(label['cls'].astype(int).flatten(), minlength=self.data['nc'])
+                    for label in tqdm(dataset.labels, total=len(dataset), desc='Statistics')])  # shape(128x80)
+                self.stats[split] = {
+                    'instance_stats': {
+                        'total': int(x.sum()),
+                        'per_class': x.sum(0).tolist()},
+                    'image_stats': {
+                        'total': len(dataset),
+                        'unlabelled': int(np.all(x == 0, 1).sum()),
+                        'per_class': (x > 0).sum(0).tolist()},
+                    'labels': [{
+                        Path(k).name: _round(v)} for k, v in zip(dataset.im_files, dataset.labels)]}
 
         # Save, print and return
         if save: