From 67eeb0468d5a710d118f4107d28876aef8e27968 Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Wed, 23 Aug 2023 14:19:07 +0200
Subject: [PATCH] `ultralytics 8.0.161` fix Classify dataset scanning bug
 (#4515)

---
 ultralytics/__init__.py        |  2 +-
 ultralytics/data/dataset.py    |  8 ++++----
 ultralytics/data/utils.py      | 23 ++++++++++++++++-------
 ultralytics/utils/downloads.py | 14 ++++++++------
 4 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index 6a3bfa70..96d0cacb 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = '8.0.160'
+__version__ = '8.0.161'
 
 from ultralytics.models import RTDETR, SAM, YOLO
 from ultralytics.models.fastsam import FastSAM
diff --git a/ultralytics/data/dataset.py b/ultralytics/data/dataset.py
index 5318ca06..cf3a7ce9 100644
--- a/ultralytics/data/dataset.py
+++ b/ultralytics/data/dataset.py
@@ -17,7 +17,7 @@ from .base import BaseDataset
 from .utils import HELP_URL, LOGGER, get_hash, img2label_paths, verify_image, verify_image_label
 
 # Ultralytics dataset *.cache version, >= 1.0.0 for YOLOv8
-DATASET_CACHE_VERSION = '1.0.2'
+DATASET_CACHE_VERSION = '1.0.3'
 
 
 class YOLODataset(BaseDataset):
@@ -279,11 +279,11 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
         # Run scan if *.cache retrieval failed
         nf, nc, msgs, samples, x = 0, 0, [], [], {}
         with ThreadPool(NUM_THREADS) as pool:
-            results = pool.imap(func=verify_image, iterable=zip([x[0] for x in self.samples], repeat(self.prefix)))
+            results = pool.imap(func=verify_image, iterable=zip(self.samples, repeat(self.prefix)))
             pbar = tqdm(results, desc=desc, total=len(self.samples), bar_format=TQDM_BAR_FORMAT)
-            for im_file, nf_f, nc_f, msg in pbar:
+            for sample, nf_f, nc_f, msg in pbar:
                 if nf_f:
-                    samples.append((im_file, nf))
+                    samples.append(sample)
                 if msg:
                     msgs.append(msg)
                 nf += nf_f
diff --git a/ultralytics/data/utils.py b/ultralytics/data/utils.py
index c8d5e99e..428ffc95 100644
--- a/ultralytics/data/utils.py
+++ b/ultralytics/data/utils.py
@@ -59,7 +59,7 @@ def exif_size(img: Image.Image):
 
 def verify_image(args):
     """Verify one image."""
-    im_file, prefix = args
+    (im_file, cls), prefix = args
     # Number (found, corrupt), message
     nf, nc, msg = 0, 0, ''
     try:
@@ -79,7 +79,7 @@ def verify_image(args):
     except Exception as e:
         nc = 1
         msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}'
-    return im_file, nf, nc, msg
+    return (im_file, cls), nf, nc, msg
 
 
 def verify_image_label(args):
@@ -321,7 +321,7 @@ def check_cls_dataset(dataset: str, split=''):
     dataset = Path(dataset)
     data_dir = (dataset if dataset.is_dir() else (DATASETS_DIR / dataset)).resolve()
     if not data_dir.is_dir():
-        LOGGER.info(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...')
+        LOGGER.warning(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...')
         t = time.time()
         if str(dataset) == 'imagenet':
             subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
@@ -335,9 +335,9 @@ def check_cls_dataset(dataset: str, split=''):
         data_dir / 'validation').exists() else None  # data/test or data/val
     test_set = data_dir / 'test' if (data_dir / 'test').exists() else None  # data/val or data/test
     if split == 'val' and not val_set:
-        LOGGER.info("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
+        LOGGER.warning("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
     elif split == 'test' and not test_set:
-        LOGGER.info("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")
+        LOGGER.warning("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")
 
     nc = len([x for x in (data_dir / 'train').glob('*') if x.is_dir()])  # number of classes
     names = [x.name for x in (data_dir / 'train').iterdir() if x.is_dir()]  # class names list
@@ -345,13 +345,22 @@ def check_cls_dataset(dataset: str, split=''):
 
     # Print to console
     for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items():
+        prefix = f'{colorstr(k)} {v}...'
         if v is None:
-            LOGGER.info(f'{colorstr(k)}: {v}')
+            LOGGER.info(prefix)
         else:
             files = [path for path in v.rglob('*.*') if path.suffix[1:].lower() in IMG_FORMATS]
             nf = len(files)  # number of files
             nd = len({file.parent for file in files})  # number of directories
-            LOGGER.info(f'{colorstr(k)}: {v}... found {nf} images in {nd} classes ✅ ')  # keep trailing space
+            if nf == 0:
+                if k == 'train':
+                    raise FileNotFoundError(emojis(f"{dataset} '{k}:' no training images found ❌ "))
+                else:
+                    LOGGER.warning(f'{prefix} found {nf} images in {nd} classes: WARNING ⚠️ no images found')
+            elif nd != nc:
+                LOGGER.warning(f'{prefix} found {nf} images in {nd} classes: ERROR ❌️ requires {nc} classes, not {nd}')
+            else:
+                LOGGER.info(f'{prefix} found {nf} images in {nd} classes ✅ ')
 
     return {'train': train_set, 'val': val_set or test_set, 'test': test_set or val_set, 'nc': nc, 'names': names}
 
diff --git a/ultralytics/utils/downloads.py b/ultralytics/utils/downloads.py
index 3171c038..08243250 100644
--- a/ultralytics/utils/downloads.py
+++ b/ultralytics/utils/downloads.py
@@ -39,16 +39,17 @@ def is_url(url, check=True):
     return False
 
 
-def delete_dsstore(path):
+def delete_dsstore(path, files_to_delete=('.DS_Store', '__MACOSX')):
     """
     Deletes all ".DS_store" files under a specified directory.
 
     Args:
         path (str, optional): The directory path where the ".DS_store" files should be deleted.
+        files_to_delete (tuple): The files to be deleted.
 
     Example:
         ```python
-        from ultralytics.data.utils import delete_dsstore
+        from ultralytics.utils.downloads import delete_dsstore
 
         delete_dsstore('path/to/dir')
         ```
@@ -58,10 +59,11 @@ def delete_dsstore(path):
         are hidden system files and can cause issues when transferring files between different operating systems.
     """
     # Delete Apple .DS_store files
-    files = list(Path(path).rglob('.DS_store'))
-    LOGGER.info(f'Deleting *.DS_store files: {files}')
-    for f in files:
-        f.unlink()
+    for file in files_to_delete:
+        matches = list(Path(path).rglob(file))
+        LOGGER.info(f'Deleting {file} files: {matches}')
+        for f in matches:
+            f.unlink()
 
 
 def zip_directory(directory, compress=True, exclude=('.DS_Store', '__MACOSX'), progress=True):