diff --git a/README.md b/README.md
index fbb74d4c..72c72144 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # [YOLOv10: Real-Time End-to-End Object Detection](https://arxiv.org/abs/2405.14458)
 
 
-Official PyTorch implementation of **YOLOv10**.
+Official PyTorch implementation of **YOLOv10**. NeurIPS 2024.
 
 <p align="center">
   <img src="figures/latency.svg" width=48%>
diff --git a/docker/Dockerfile b/docker/Dockerfile
index b96173ee..f6ef8b45 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -14,7 +14,7 @@ ADD https://github.com/ultralytics/assets/releases/download/v0.0.0/Arial.ttf \
 # Install linux packages
 # g++ required to build 'tflite_support' and 'lap' packages, libusb-1.0-0 required for 'tflite_support' package
 RUN apt update \
-    && apt install --no-install-recommends -y gcc git zip curl htop libgl1 libglib2.0-0 libpython3-dev gnupg g++ libusb-1.0-0
+    && apt install --no-install-recommends -y gcc git zip curl htop libgl1 libglib2.0-0 libpython3-dev gnupg g++ libusb-1.0-0 build-essential
 
 # Security updates
 # https://security.snyk.io/vuln/SNYK-UBUNTU1804-OPENSSL-3314796
diff --git a/docs/en/modes/export.md b/docs/en/modes/export.md
index 5859b18b..1b0827ad 100644
--- a/docs/en/modes/export.md
+++ b/docs/en/modes/export.md
@@ -83,7 +83,7 @@ This table details the configurations and options available for exporting YOLO m
 | `half`      | `bool`           | `False`         | Enables FP16 (half-precision) quantization, reducing model size and potentially speeding up inference on supported hardware.                                     |
 | `int8`      | `bool`           | `False`         | Activates INT8 quantization, further compressing the model and speeding up inference with minimal accuracy loss, primarily for edge devices.                     |
 | `dynamic`   | `bool`           | `False`         | Allows dynamic input sizes for ONNX and TensorRT exports, enhancing flexibility in handling varying image dimensions.                                            |
-| `simplify`  | `bool`           | `False`         | Simplifies the model graph for ONNX exports, potentially improving performance and compatibility.                                                                |
+| `simplify`  | `bool`           | `False`         | Simplifies the model graph for ONNX exports with `onnxslim`, potentially improving performance and compatibility.                                                |
 | `opset`     | `int`            | `None`          | Specifies the ONNX opset version for compatibility with different ONNX parsers and runtimes. If not set, uses the latest supported version.                      |
 | `workspace` | `float`          | `4.0`           | Sets the maximum workspace size in GB for TensorRT optimizations, balancing memory usage and performance.                                                        |
 | `nms`       | `bool`           | `False`         | Adds Non-Maximum Suppression (NMS) to the CoreML export, essential for accurate and efficient detection post-processing.                                         |
diff --git a/requirements.txt b/requirements.txt
index 680357a7..019d1ae4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ onnxruntime==1.15.1
 pycocotools==2.0.7
 PyYAML==6.0.1
 scipy==1.13.0
-onnxsim==0.4.36
+onnxslim==0.1.31
 gradio==4.31.5
 opencv-python==4.9.0.80
 psutil==5.9.8
diff --git a/ultralytics/cfg/default.yaml b/ultralytics/cfg/default.yaml
index bc64897e..bd074b10 100644
--- a/ultralytics/cfg/default.yaml
+++ b/ultralytics/cfg/default.yaml
@@ -82,7 +82,7 @@ keras: False # (bool) use Kera=s
 optimize: False # (bool) TorchScript: optimize for mobile
 int8: False # (bool) CoreML/TF INT8 quantization
 dynamic: False # (bool) ONNX/TF/TensorRT: dynamic axes
-simplify: False # (bool) ONNX: simplify model
+simplify: False # (bool) ONNX: simplify model using `onnxslim`
 opset: # (int, optional) ONNX: opset version
 workspace: 4 # (int) TensorRT: workspace size (GB)
 nms: False # (bool) CoreML: add NMS
diff --git a/ultralytics/engine/exporter.py b/ultralytics/engine/exporter.py
index 1fa3f2e1..6ac170c5 100644
--- a/ultralytics/engine/exporter.py
+++ b/ultralytics/engine/exporter.py
@@ -355,9 +355,7 @@ class Exporter:
         """YOLOv8 ONNX export."""
         requirements = ["onnx>=1.12.0"]
         if self.args.simplify:
-            requirements += ["onnxsim>=0.4.33", "onnxruntime-gpu" if torch.cuda.is_available() else "onnxruntime"]
-            if ARM64:
-                check_requirements("cmake")  # 'cmake' is needed to build onnxsim on aarch64
+            requirements += ["onnxslim==0.1.31", "onnxruntime" + ("-gpu" if torch.cuda.is_available() else "")]
         check_requirements(requirements)
         import onnx  # noqa
 
@@ -394,14 +392,17 @@ class Exporter:
         # Simplify
         if self.args.simplify:
             try:
-                import onnxsim
+                import onnxslim
 
-                LOGGER.info(f"{prefix} simplifying with onnxsim {onnxsim.__version__}...")
-                # subprocess.run(f'onnxsim "{f}" "{f}"', shell=True)
-                model_onnx, check = onnxsim.simplify(model_onnx)
-                assert check, "Simplified ONNX model could not be validated"
+                LOGGER.info(f"{prefix} simplifying with onnxslim {onnxslim.__version__}...")
+                model_onnx = onnxslim.slim(model_onnx)
+                
+                # ONNX Simplifier (deprecated because it must be compiled with 'cmake' on aarch64 and in Conda CI environments)
+                # import onnxsim
+                # model_onnx, check = onnxsim.simplify(model_onnx)
+                # assert check, "Simplified ONNX model could not be validated"
             except Exception as e:
-                LOGGER.info(f"{prefix} simplifier failure: {e}")
+                LOGGER.warning(f"{prefix} simplifier failure: {e}")
 
         # Metadata
         for k, v in self.metadata.items():
@@ -656,7 +657,7 @@ class Exporter:
     def export_engine(self, prefix=colorstr("TensorRT:")):
         """YOLOv8 TensorRT export https://developer.nvidia.com/tensorrt."""
         assert self.im.device.type != "cpu", "export running on CPU but must be on GPU, i.e. use 'device=0'"
-        f_onnx, _ = self.export_onnx()  # run before trt import https://github.com/ultralytics/ultralytics/issues/7016
+        f_onnx, _ = self.export_onnx()  # run before TRT import https://github.com/ultralytics/ultralytics/issues/7016
 
         try:
             import tensorrt as trt  # noqa
@@ -741,7 +742,7 @@ class Exporter:
                 "onnx>=1.12.0",
                 "onnx2tf>=1.15.4,<=1.17.5",
                 "sng4onnx>=1.0.1",
-                "onnxsim>=0.4.33",
+                "onnxslim==0.1.31",
                 "onnx_graphsurgeon>=0.3.26",
                 "tflite_support",
                 "flatbuffers>=23.5.26,<100",  # update old 'flatbuffers' included inside tensorflow package
diff --git a/ultralytics/nn/modules/__init__.py b/ultralytics/nn/modules/__init__.py
index 4a99bf59..7f4c4fed 100644
--- a/ultralytics/nn/modules/__init__.py
+++ b/ultralytics/nn/modules/__init__.py
@@ -13,7 +13,7 @@ Example:
     m = Conv(128, 128)
     f = f'{m._get_name()}.onnx'
     torch.onnx.export(m, x, f)
-    os.system(f'onnxsim {f} {f} && open {f}')
+    os.system(f'onnxslim {f} {f} && open {f}')  # pip install onnxslim
     ```
 """
 
diff --git a/ultralytics/utils/benchmarks.py b/ultralytics/utils/benchmarks.py
index 3bc63510..02869906 100644
--- a/ultralytics/utils/benchmarks.py
+++ b/ultralytics/utils/benchmarks.py
@@ -323,6 +323,8 @@ class ProfileModels:
 
         input_tensor = sess.get_inputs()[0]
         input_type = input_tensor.type
+        dynamic = not all(isinstance(dim, int) and dim >= 0 for dim in input_tensor.shape)  # dynamic input shape
+        input_shape = (1, 3, self.imgsz, self.imgsz) if dynamic else input_tensor.shape
 
         # Mapping ONNX datatype to numpy datatype
         if "float16" in input_type:
@@ -338,7 +340,7 @@ class ProfileModels:
         else:
             raise ValueError(f"Unsupported ONNX datatype {input_type}")
 
-        input_data = np.random.rand(*input_tensor.shape).astype(input_dtype)
+        input_data = np.random.rand(*input_shape).astype(input_dtype)
         input_name = input_tensor.name
         output_name = sess.get_outputs()[0].name