diff --git a/app.py b/app.py index 6fddcbaa..c8162d39 100644 --- a/app.py +++ b/app.py @@ -2,13 +2,48 @@ import gradio as gr import cv2 import tempfile from ultralytics import YOLOv10 +import supervision as sv +from huggingface_hub import hf_hub_download -def yolov10_inference(image, video, model_id, image_size, conf_threshold): - model = YOLOv10.from_pretrained(f'jameslahm/{model_id}') +def download_models(model_id): + hf_hub_download("kadirnar/Yolov10", filename=f"{model_id}", local_dir=f"./") + return f"./{model_id}" + +box_annotator = sv.BoxAnnotator() +category_dict = { + 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', + 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', + 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', + 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', + 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', + 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', + 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', + 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', + 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', + 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', + 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', + 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', + 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', + 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', + 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', + 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush' +} + + +def yolov10_inference(image, video, model_id, image_size, conf_threshold, iou_threshold): + model_path = download_models(model_id) + model = YOLOv10(model_path) + if image: - results = model.predict(source=image, imgsz=image_size, conf=conf_threshold) - annotated_image = results[0].plot() + results = model(source=image, imgsz=image_size, iou=iou_threshold, conf=conf_threshold, verbose=False)[0] + detections = sv.Detections.from_ultralytics(results) + + labels = [ + f"{category_dict[class_id]} {confidence:.2f}" + for class_id, confidence in zip(detections.class_id, detections.confidence) + ] + annotated_image = box_annotator.annotate(image, detections=detections, labels=labels) return annotated_image[:, :, ::-1], None else: video_path = tempfile.mktemp(suffix=".webm") @@ -29,8 +64,14 @@ def yolov10_inference(image, video, model_id, image_size, conf_threshold): if not ret: break - results = model.predict(source=frame, imgsz=image_size, conf=conf_threshold) - annotated_frame = results[0].plot() + results = model(source=frame, imgsz=image_size, iou=iou_threshold, conf=conf_threshold, verbose=False)[0] + detections = sv.Detections.from_ultralytics(results) + + labels = [ + f"{category_dict[class_id]} {confidence:.2f}" + for class_id, confidence in zip(detections.class_id, detections.confidence) + ] + annotated_frame = box_annotator.annotate(frame, detections=detections, labels=labels) out.write(annotated_frame) cap.release() @@ -39,8 +80,8 @@ def yolov10_inference(image, video, model_id, image_size, conf_threshold): return None, output_video_path -def yolov10_inference_for_examples(image, model_path, image_size, conf_threshold): - annotated_image, _ = yolov10_inference(image, None, model_path, image_size, conf_threshold) +def yolov10_inference_for_examples(image, model_id, image_size, conf_threshold, iou_threshold): + annotated_image, _ = yolov10_inference(image, None, model_id, image_size, conf_threshold, iou_threshold) return annotated_image @@ -58,14 +99,14 @@ def app(): model_id = gr.Dropdown( label="Model", choices=[ - "yolov10n", - "yolov10s", - "yolov10m", - "yolov10b", - "yolov10l", - "yolov10x", + "yolov10n.pt", + "yolov10s.pt", + "yolov10m.pt", + "yolov10b.pt", + "yolov10l.pt", + "yolov10x.pt", ], - value="yolov10m", + value="yolov10m.pt", ) image_size = gr.Slider( label="Image Size", @@ -76,11 +117,18 @@ def app(): ) conf_threshold = gr.Slider( label="Confidence Threshold", - minimum=0.0, + minimum=0.1, maximum=1.0, - step=0.05, + step=0.1, value=0.25, ) + iou_threshold = gr.Slider( + label="IoU Threshold", + minimum=0.1, + maximum=1.0, + step=0.1, + value=0.45, + ) yolov10_infer = gr.Button(value="Detect Objects") with gr.Column(): @@ -88,12 +136,13 @@ def app(): output_video = gr.Video(label="Annotated Video", visible=False) def update_visibility(input_type): - image = gr.update(visible=True) if input_type == "Image" else gr.update(visible=False) - video = gr.update(visible=False) if input_type == "Image" else gr.update(visible=True) - output_image = gr.update(visible=True) if input_type == "Image" else gr.update(visible=False) - output_video = gr.update(visible=False) if input_type == "Image" else gr.update(visible=True) - - return image, video, output_image, output_video + image_visibility = input_type == "Image" + return ( + gr.update(visible=image_visibility), + gr.update(visible=not image_visibility), + gr.update(visible=image_visibility), + gr.update(visible=not image_visibility), + ) input_type.change( fn=update_visibility, @@ -101,16 +150,15 @@ def app(): outputs=[image, video, output_image, output_video], ) - def run_inference(image, video, model_id, image_size, conf_threshold, input_type): + def run_inference(image, video, model_id, image_size, conf_threshold, iou_threshold, input_type): if input_type == "Image": - return yolov10_inference(image, None, model_id, image_size, conf_threshold) + return yolov10_inference(image, None, model_id, image_size, conf_threshold, iou_threshold) else: - return yolov10_inference(None, video, model_id, image_size, conf_threshold) - + return yolov10_inference(None, video, model_id, image_size, conf_threshold, iou_threshold) yolov10_infer.click( fn=run_inference, - inputs=[image, video, model_id, image_size, conf_threshold, input_type], + inputs=[image, video, model_id, image_size, conf_threshold, iou_threshold, input_type], outputs=[output_image, output_video], ) @@ -118,15 +166,17 @@ def app(): examples=[ [ "ultralytics/assets/bus.jpg", - "yolov10s", + "yolov10s.pt", 640, 0.25, + 0.45, ], [ "ultralytics/assets/zidane.jpg", - "yolov10s", + "yolov10s.pt", 640, 0.25, + 0.45, ], ], fn=yolov10_inference_for_examples, @@ -135,11 +185,13 @@ def app(): model_id, image_size, conf_threshold, + iou_threshold, ], outputs=[output_image], cache_examples='lazy', ) + gradio_app = gr.Blocks() with gradio_app: gr.HTML( @@ -157,5 +209,6 @@ with gradio_app: with gr.Row(): with gr.Column(): app() + if __name__ == '__main__': gradio_app.launch()