From 487ade952d1dc8c1f916e156c503ac7f748241cb Mon Sep 17 00:00:00 2001
From: Nyall Dawson <nyall.dawson@gmail.com>
Date: Mon, 4 Nov 2024 15:43:54 +1000
Subject: [PATCH] Use gdal_minmax_element.hpp for optimised min/max element
 retrieval

---
 scripts/astyle.sh                             |    2 +-
 .../providers/gdal/gdal_minmax_element.hpp    | 1408 +++++++++++++++++
 .../providers/gdal/gdal_priv_templates.hpp    |  787 +++++++++
 src/core/raster/qgsrasterblock.cpp            |  288 +---
 4 files changed, 2226 insertions(+), 259 deletions(-)
 create mode 100644 src/core/providers/gdal/gdal_minmax_element.hpp
 create mode 100644 src/core/providers/gdal/gdal_priv_templates.hpp
diff --git a/scripts/astyle.sh b/scripts/astyle.sh
index 414a7f4fb3f..30fe1baf5c2 100755
--- a/scripts/astyle.sh
+++ b/scripts/astyle.sh
@@ -109,7 +109,7 @@ astyleit() {
 
 for f in "$@"; do
 	case "$f" in
-                external/libdxfrw/*|external/untwine/*|external/qwt*|external/o2/*|external/odbccpp/*|external/qt-unix-signals/*|external/rtree/*|external/astyle/*|external/kdbush/*|external/PDF4QT/*|external/poly2tri/*|external/wintoast/*|external/qt3dextra-headers/*|external/lazperf/*|external/meshOptimizer/*|external/mapbox-vector-tile/*|external/pdal_wrench/*|external/tinygltf/*|python/ext-libs/*|ui_*.py|*.astyle|tests/testdata/*|editors/*)
+                external/libdxfrw/*|external/untwine/*|external/qwt*|external/o2/*|external/odbccpp/*|external/qt-unix-signals/*|external/rtree/*|external/astyle/*|external/kdbush/*|external/PDF4QT/*|external/poly2tri/*|external/wintoast/*|external/qt3dextra-headers/*|external/lazperf/*|external/meshOptimizer/*|external/mapbox-vector-tile/*|external/pdal_wrench/*|external/tinygltf/*|python/ext-libs/*|ui_*.py|*.astyle|src/core/providers/gdal/gdal_minmax_element.hpp|src/core/providers/gdal/gdal_priv_templates.hpp|tests/testdata/*|editors/*)
 			echo -ne "$f skipped $elcr"
 			continue
 			;;
diff --git a/src/core/providers/gdal/gdal_minmax_element.hpp b/src/core/providers/gdal/gdal_minmax_element.hpp
new file mode 100644
index 00000000000..893471d22c7
--- /dev/null
+++ b/src/core/providers/gdal/gdal_minmax_element.hpp
@@ -0,0 +1,1408 @@
+/******************************************************************************
+ * Project:  GDAL Core
+ * Purpose:  Utility functions to find minimum and maximum values in a buffer
+ * Author:   Even Rouault, <even dot rouault at spatialys.com>
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault <even dot rouault at spatialys.com>
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#ifndef GDAL_MINMAX_ELEMENT_INCLUDED
+#define GDAL_MINMAX_ELEMENT_INCLUDED
+
+// NOTE: This header requires C++17
+
+// This file may be vendored by other applications than GDAL
+// WARNING: if modifying this file, please also update the upstream GDAL version
+// at https://github.com/OSGeo/gdal/blob/master/gcore/gdal_minmax_element.hpp
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <utility>
+
+#include "gdal.h"
+
+#ifdef GDAL_COMPILATION
+#define GDAL_MINMAXELT_NS gdal
+#elif !defined(GDAL_MINMAXELT_NS)
+#error "Please define the GDAL_MINMAXELT_NS macro to define the namespace"
+#endif
+
+#if defined(__x86_64) || defined(_M_X64)
+#define GDAL_MINMAX_ELEMENT_USE_SSE2
+#endif
+
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+// SSE2 header
+#include <emmintrin.h>
+#endif
+
+#include "gdal_priv_templates.hpp"
+
+namespace GDAL_MINMAXELT_NS
+{
+namespace detail
+{
+
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+/************************************************************************/
+/*                            compScalar()                              */
+/************************************************************************/
+
+template <class T, bool IS_MAX> inline static bool compScalar(T x, T y)
+{
+    if constexpr (IS_MAX)
+        return x > y;
+    else
+        return x < y;
+}
+
+/************************************************************************/
+/*                         extremum_element()                           */
+/************************************************************************/
+
+template <class T, bool IS_MAX>
+size_t extremum_element(const T *v, size_t size, T noDataValue)
+{
+    static_assert(!(std::is_floating_point_v<T>));
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    T extremum = v[0];
+    bool extremum_is_nodata = extremum == noDataValue;
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (v[i] != noDataValue &&
+            (compScalar<T, IS_MAX>(v[i], extremum) || extremum_is_nodata))
+        {
+            extremum = v[i];
+            idx_of_extremum = i;
+            extremum_is_nodata = false;
+        }
+    }
+    return idx_of_extremum;
+}
+
+/************************************************************************/
+/*                         extremum_element()                           */
+/************************************************************************/
+
+template <class T, bool IS_MAX> size_t extremum_element(const T *v, size_t size)
+{
+    static_assert(!(std::is_floating_point_v<T>));
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    T extremum = v[0];
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (compScalar<T, IS_MAX>(v[i], extremum))
+        {
+            extremum = v[i];
+            idx_of_extremum = i;
+        }
+    }
+    return idx_of_extremum;
+}
+
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+
+/************************************************************************/
+/*                   extremum_element_with_nan()                        */
+/************************************************************************/
+
+static inline int8_t Shift8(uint8_t x)
+{
+    return static_cast<int8_t>(x + std::numeric_limits<int8_t>::min());
+}
+
+static inline int16_t Shift16(uint16_t x)
+{
+    return static_cast<int16_t>(x + std::numeric_limits<int16_t>::min());
+}
+
+CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW
+    static inline int32_t Shift32(uint32_t x)
+{
+    x += static_cast<uint32_t>(std::numeric_limits<int32_t>::min());
+    int32_t ret;
+    memcpy(&ret, &x, sizeof(x));
+    return ret;
+}
+
+// Return a _mm128[i|d] register with all its elements set to x
+template <class T> static inline auto set1(T x)
+{
+    if constexpr (std::is_same_v<T, uint8_t>)
+        return _mm_set1_epi8(Shift8(x));
+    else if constexpr (std::is_same_v<T, int8_t>)
+        return _mm_set1_epi8(x);
+    else if constexpr (std::is_same_v<T, uint16_t>)
+        return _mm_set1_epi16(Shift16(x));
+    else if constexpr (std::is_same_v<T, int16_t>)
+        return _mm_set1_epi16(x);
+    else if constexpr (std::is_same_v<T, uint32_t>)
+        return _mm_set1_epi32(Shift32(x));
+    else if constexpr (std::is_same_v<T, int32_t>)
+        return _mm_set1_epi32(x);
+    else if constexpr (std::is_same_v<T, float>)
+        return _mm_set1_ps(x);
+    else
+        return _mm_set1_pd(x);
+}
+
+// Return a _mm128[i|d] register with all its elements set to x
+template <class T> static inline auto set1_unshifted(T x)
+{
+    if constexpr (std::is_same_v<T, uint8_t>)
+    {
+        int8_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi8(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int8_t>)
+        return _mm_set1_epi8(x);
+    else if constexpr (std::is_same_v<T, uint16_t>)
+    {
+        int16_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi16(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int16_t>)
+        return _mm_set1_epi16(x);
+    else if constexpr (std::is_same_v<T, uint32_t>)
+    {
+        int32_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi32(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int32_t>)
+        return _mm_set1_epi32(x);
+    else if constexpr (std::is_same_v<T, float>)
+        return _mm_set1_ps(x);
+    else
+        return _mm_set1_pd(x);
+}
+
+// Load as many values of type T at a _mm128[i|d] register can contain from x
+template <class T> static inline auto loadv(const T *x)
+{
+    if constexpr (std::is_same_v<T, float>)
+        return _mm_loadu_ps(x);
+    else if constexpr (std::is_same_v<T, double>)
+        return _mm_loadu_pd(x);
+    else
+        return _mm_loadu_si128(reinterpret_cast<const __m128i *>(x));
+}
+
+// Return a __m128i register with bits set when x[i] < y[i] when !IS_MAX
+// or x[i] > y[i] when IS_MAX
+template <class T, bool IS_MAX, class SSE_T>
+static inline __m128i comp(SSE_T x, SSE_T y)
+{
+    if constexpr (IS_MAX)
+    {
+        if constexpr (std::is_same_v<T, uint8_t>)
+            return _mm_cmpgt_epi8(
+                _mm_add_epi8(x,
+                             _mm_set1_epi8(std::numeric_limits<int8_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int8_t>)
+            return _mm_cmpgt_epi8(x, y);
+        else if constexpr (std::is_same_v<T, uint16_t>)
+            return _mm_cmpgt_epi16(
+                _mm_add_epi16(
+                    x, _mm_set1_epi16(std::numeric_limits<int16_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int16_t>)
+            return _mm_cmpgt_epi16(x, y);
+        else if constexpr (std::is_same_v<T, uint32_t>)
+            return _mm_cmpgt_epi32(
+                _mm_add_epi32(
+                    x, _mm_set1_epi32(std::numeric_limits<int32_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int32_t>)
+            return _mm_cmpgt_epi32(x, y);
+        // We could use _mm_cmpgt_pX() if there was no NaN values
+        else if constexpr (std::is_same_v<T, float>)
+            return _mm_castps_si128(_mm_cmpnle_ps(x, y));
+        else
+            return _mm_castpd_si128(_mm_cmpnle_pd(x, y));
+    }
+    else
+    {
+        if constexpr (std::is_same_v<T, uint8_t>)
+            return _mm_cmplt_epi8(
+                _mm_add_epi8(x,
+                             _mm_set1_epi8(std::numeric_limits<int8_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int8_t>)
+            return _mm_cmplt_epi8(x, y);
+        else if constexpr (std::is_same_v<T, uint16_t>)
+            return _mm_cmplt_epi16(
+                _mm_add_epi16(
+                    x, _mm_set1_epi16(std::numeric_limits<int16_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int16_t>)
+            return _mm_cmplt_epi16(x, y);
+        else if constexpr (std::is_same_v<T, uint32_t>)
+            return _mm_cmplt_epi32(
+                _mm_add_epi32(
+                    x, _mm_set1_epi32(std::numeric_limits<int32_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int32_t>)
+            return _mm_cmplt_epi32(x, y);
+        // We could use _mm_cmplt_pX() if there was no NaN values
+        else if constexpr (std::is_same_v<T, float>)
+            return _mm_castps_si128(_mm_cmpnge_ps(x, y));
+        else
+            return _mm_castpd_si128(_mm_cmpnge_pd(x, y));
+    }
+}
+
+// Using SSE2
+template <class T, bool IS_MAX, bool HAS_NODATA>
+inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue)
+{
+    static_assert(std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t> ||
+                  std::is_same_v<T, uint16_t> || std::is_same_v<T, int16_t> ||
+                  std::is_same_v<T, uint32_t> || std::is_same_v<T, int32_t> ||
+                  std::is_floating_point_v<T>);
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    T extremum = v[0];
+    [[maybe_unused]] bool extremum_is_invalid = false;
+    if constexpr (std::is_floating_point_v<T>)
+    {
+        extremum_is_invalid = std::isnan(extremum);
+    }
+    if constexpr (HAS_NODATA)
+    {
+        if (extremum == noDataValue)
+            extremum_is_invalid = true;
+    }
+    size_t i = 1;
+
+    constexpr size_t VALS_PER_REG = sizeof(set1(extremum)) / sizeof(extremum);
+    constexpr int LOOP_UNROLLING = 4;
+    // If changing the value, then we need to adjust the number of sse_valX
+    // loading in the loop.
+    static_assert(LOOP_UNROLLING == 4);
+    constexpr size_t VALS_PER_ITER = VALS_PER_REG * LOOP_UNROLLING;
+
+    const auto update = [v, noDataValue, &extremum, &idx_of_extremum,
+                         &extremum_is_invalid](size_t idx)
+    {
+        if constexpr (HAS_NODATA)
+        {
+            if (v[idx] == noDataValue)
+                return;
+            if (extremum_is_invalid)
+            {
+                if constexpr (std::is_floating_point_v<T>)
+                {
+                    if (std::isnan(v[idx]))
+                        return;
+                }
+                extremum = v[idx];
+                idx_of_extremum = idx;
+                extremum_is_invalid = false;
+                return;
+            }
+        }
+        else
+        {
+            CPL_IGNORE_RET_VAL(noDataValue);
+        }
+        if (compScalar<T, IS_MAX>(v[idx], extremum))
+        {
+            extremum = v[idx];
+            idx_of_extremum = idx;
+            extremum_is_invalid = false;
+        }
+        else if constexpr (std::is_floating_point_v<T>)
+        {
+            if (extremum_is_invalid && !std::isnan(v[idx]))
+            {
+                extremum = v[idx];
+                idx_of_extremum = idx;
+                extremum_is_invalid = false;
+            }
+        }
+    };
+
+    for (; i < VALS_PER_ITER && i < size; ++i)
+    {
+        update(i);
+    }
+
+    [[maybe_unused]] auto sse_neutral = set1_unshifted(static_cast<T>(0));
+    [[maybe_unused]] auto sse_nodata = set1_unshifted(noDataValue);
+    if constexpr (HAS_NODATA)
+    {
+        for (; i < size && extremum_is_invalid; ++i)
+        {
+            update(i);
+        }
+        if (!extremum_is_invalid)
+        {
+            for (; i < size && (i % VALS_PER_ITER) != 0; ++i)
+            {
+                update(i);
+            }
+            sse_neutral = set1_unshifted(extremum);
+        }
+    }
+
+    auto sse_extremum = set1(extremum);
+
+    [[maybe_unused]] size_t hits = 0;
+    const auto sse_iter_count = (size / VALS_PER_ITER) * VALS_PER_ITER;
+    for (; i < sse_iter_count; i += VALS_PER_ITER)
+    {
+        // A bit of loop unrolling to save 3/4 of slow movemask operations.
+        auto sse_val0 = loadv(v + i + 0 * VALS_PER_REG);
+        auto sse_val1 = loadv(v + i + 1 * VALS_PER_REG);
+        auto sse_val2 = loadv(v + i + 2 * VALS_PER_REG);
+        auto sse_val3 = loadv(v + i + 3 * VALS_PER_REG);
+
+        if constexpr (HAS_NODATA)
+        {
+            // Replace all components that are at the nodata value by a
+            // neutral value (current minimum)
+            if constexpr (std::is_same_v<T, uint8_t> ||
+                          std::is_same_v<T, int8_t>)
+            {
+                const auto replaceNoDataByNeutral =
+                    [sse_neutral, sse_nodata](auto sse_val)
+                {
+                    const auto eq_nodata = _mm_cmpeq_epi8(sse_val, sse_nodata);
+                    return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral),
+                                        _mm_andnot_si128(eq_nodata, sse_val));
+                };
+
+                sse_val0 = replaceNoDataByNeutral(sse_val0);
+                sse_val1 = replaceNoDataByNeutral(sse_val1);
+                sse_val2 = replaceNoDataByNeutral(sse_val2);
+                sse_val3 = replaceNoDataByNeutral(sse_val3);
+            }
+            else if constexpr (std::is_same_v<T, uint16_t> ||
+                               std::is_same_v<T, int16_t>)
+            {
+                const auto replaceNoDataByNeutral =
+                    [sse_neutral, sse_nodata](auto sse_val)
+                {
+                    const auto eq_nodata = _mm_cmpeq_epi16(sse_val, sse_nodata);
+                    return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral),
+                                        _mm_andnot_si128(eq_nodata, sse_val));
+                };
+
+                sse_val0 = replaceNoDataByNeutral(sse_val0);
+                sse_val1 = replaceNoDataByNeutral(sse_val1);
+                sse_val2 = replaceNoDataByNeutral(sse_val2);
+                sse_val3 = replaceNoDataByNeutral(sse_val3);
+            }
+            else if constexpr (std::is_same_v<T, uint32_t> ||
+                               std::is_same_v<T, int32_t>)
+            {
+                const auto replaceNoDataByNeutral =
+                    [sse_neutral, sse_nodata](auto sse_val)
+                {
+                    const auto eq_nodata = _mm_cmpeq_epi32(sse_val, sse_nodata);
+                    return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral),
+                                        _mm_andnot_si128(eq_nodata, sse_val));
+                };
+
+                sse_val0 = replaceNoDataByNeutral(sse_val0);
+                sse_val1 = replaceNoDataByNeutral(sse_val1);
+                sse_val2 = replaceNoDataByNeutral(sse_val2);
+                sse_val3 = replaceNoDataByNeutral(sse_val3);
+            }
+            else if constexpr (std::is_same_v<T, float>)
+            {
+                const auto replaceNoDataByNeutral =
+                    [sse_neutral, sse_nodata](auto sse_val)
+                {
+                    const auto eq_nodata = _mm_cmpeq_ps(sse_val, sse_nodata);
+                    return _mm_or_ps(_mm_and_ps(eq_nodata, sse_neutral),
+                                     _mm_andnot_ps(eq_nodata, sse_val));
+                };
+
+                sse_val0 = replaceNoDataByNeutral(sse_val0);
+                sse_val1 = replaceNoDataByNeutral(sse_val1);
+                sse_val2 = replaceNoDataByNeutral(sse_val2);
+                sse_val3 = replaceNoDataByNeutral(sse_val3);
+            }
+            else if constexpr (std::is_same_v<T, double>)
+            {
+                const auto replaceNoDataByNeutral =
+                    [sse_neutral, sse_nodata](auto sse_val)
+                {
+                    const auto eq_nodata = _mm_cmpeq_pd(sse_val, sse_nodata);
+                    return _mm_or_pd(_mm_and_pd(eq_nodata, sse_neutral),
+                                     _mm_andnot_pd(eq_nodata, sse_val));
+                };
+
+                sse_val0 = replaceNoDataByNeutral(sse_val0);
+                sse_val1 = replaceNoDataByNeutral(sse_val1);
+                sse_val2 = replaceNoDataByNeutral(sse_val2);
+                sse_val3 = replaceNoDataByNeutral(sse_val3);
+            }
+            else
+            {
+                static_assert(
+                    std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t> ||
+                    std::is_same_v<T, uint16_t> || std::is_same_v<T, int16_t> ||
+                    std::is_same_v<T, uint32_t> || std::is_same_v<T, int32_t> ||
+                    std::is_same_v<T, float> || std::is_same_v<T, double>);
+            }
+        }
+
+        if (_mm_movemask_epi8(_mm_or_si128(
+                _mm_or_si128(comp<T, IS_MAX>(sse_val0, sse_extremum),
+                             comp<T, IS_MAX>(sse_val1, sse_extremum)),
+                _mm_or_si128(comp<T, IS_MAX>(sse_val2, sse_extremum),
+                             comp<T, IS_MAX>(sse_val3, sse_extremum)))) != 0)
+        {
+            if constexpr (!std::is_same_v<T, int8_t> &&
+                          !std::is_same_v<T, uint8_t>)
+            {
+                // The above tests excluding int8_t/uint8_t is due to the fact
+                // with those small ranges of values we will quickly converge
+                // to the minimum, so no need to do the below "smart" test.
+
+                if (++hits == size / 16)
+                {
+                    // If we have an almost sorted array, then using this code path
+                    // will hurt performance. Arbitrary give up if we get here
+                    // more than 1. / 16 of the size of the array.
+                    // fprintf(stderr, "going to non-vector path\n");
+                    break;
+                }
+            }
+            for (size_t j = 0; j < VALS_PER_ITER; j++)
+            {
+                update(i + j);
+            }
+            sse_extremum = set1(extremum);
+            if constexpr (HAS_NODATA)
+            {
+                sse_neutral = set1_unshifted(extremum);
+            }
+        }
+    }
+    for (; i < size; ++i)
+    {
+        update(i);
+    }
+    return idx_of_extremum;
+}
+
+#else
+
+/************************************************************************/
+/*                    extremum_element_with_nan()                       */
+/************************************************************************/
+
+template <class T, bool IS_MAX, bool HAS_NODATA>
+inline size_t extremum_element_with_nan(const T *v, size_t size, T /* nodata */)
+{
+    static_assert(!HAS_NODATA);
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    auto extremum = v[0];
+    bool extremum_is_nan = std::isnan(extremum);
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (compScalar<T, IS_MAX>(v[i], extremum) ||
+            (extremum_is_nan && !std::isnan(v[i])))
+        {
+            extremum = v[i];
+            idx_of_extremum = i;
+            extremum_is_nan = false;
+        }
+    }
+    return idx_of_extremum;
+}
+#endif
+
+/************************************************************************/
+/*                         extremum_element()                           */
+/************************************************************************/
+
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+
+template <>
+size_t extremum_element<uint8_t, true>(const uint8_t *v, size_t size,
+                                       uint8_t noDataValue)
+{
+    return extremum_element_with_nan<uint8_t, true, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<uint8_t, false>(const uint8_t *v, size_t size,
+                                        uint8_t noDataValue)
+{
+    return extremum_element_with_nan<uint8_t, false, true>(v, size,
+                                                           noDataValue);
+}
+
+template <>
+size_t extremum_element<uint8_t, true>(const uint8_t *v, size_t size)
+{
+    return extremum_element_with_nan<uint8_t, true, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<uint8_t, false>(const uint8_t *v, size_t size)
+{
+    return extremum_element_with_nan<uint8_t, false, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<int8_t, true>(const int8_t *v, size_t size,
+                                      int8_t noDataValue)
+{
+    return extremum_element_with_nan<int8_t, true, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<int8_t, false>(const int8_t *v, size_t size,
+                                       int8_t noDataValue)
+{
+    return extremum_element_with_nan<int8_t, false, true>(v, size, noDataValue);
+}
+
+template <> size_t extremum_element<int8_t, true>(const int8_t *v, size_t size)
+{
+    return extremum_element_with_nan<int8_t, true, false>(v, size, 0);
+}
+
+template <> size_t extremum_element<int8_t, false>(const int8_t *v, size_t size)
+{
+    return extremum_element_with_nan<int8_t, false, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<uint16_t, true>(const uint16_t *v, size_t size,
+                                        uint16_t noDataValue)
+{
+    return extremum_element_with_nan<uint16_t, true, true>(v, size,
+                                                           noDataValue);
+}
+
+template <>
+size_t extremum_element<uint16_t, false>(const uint16_t *v, size_t size,
+                                         uint16_t noDataValue)
+{
+    return extremum_element_with_nan<uint16_t, false, true>(v, size,
+                                                            noDataValue);
+}
+
+template <>
+size_t extremum_element<uint16_t, true>(const uint16_t *v, size_t size)
+{
+    return extremum_element_with_nan<uint16_t, true, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<uint16_t, false>(const uint16_t *v, size_t size)
+{
+    return extremum_element_with_nan<uint16_t, false, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<int16_t, true>(const int16_t *v, size_t size,
+                                       int16_t noDataValue)
+{
+    return extremum_element_with_nan<int16_t, true, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<int16_t, false>(const int16_t *v, size_t size,
+                                        int16_t noDataValue)
+{
+    return extremum_element_with_nan<int16_t, false, true>(v, size,
+                                                           noDataValue);
+}
+
+template <>
+size_t extremum_element<int16_t, true>(const int16_t *v, size_t size)
+{
+    return extremum_element_with_nan<int16_t, true, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<int16_t, false>(const int16_t *v, size_t size)
+{
+    return extremum_element_with_nan<int16_t, false, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<uint32_t, true>(const uint32_t *v, size_t size,
+                                        uint32_t noDataValue)
+{
+    return extremum_element_with_nan<uint32_t, true, true>(v, size,
+                                                           noDataValue);
+}
+
+template <>
+size_t extremum_element<uint32_t, false>(const uint32_t *v, size_t size,
+                                         uint32_t noDataValue)
+{
+    return extremum_element_with_nan<uint32_t, false, true>(v, size,
+                                                            noDataValue);
+}
+
+template <>
+size_t extremum_element<uint32_t, true>(const uint32_t *v, size_t size)
+{
+    return extremum_element_with_nan<uint32_t, true, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<uint32_t, false>(const uint32_t *v, size_t size)
+{
+    return extremum_element_with_nan<uint32_t, false, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<int32_t, true>(const int32_t *v, size_t size,
+                                       int32_t noDataValue)
+{
+    return extremum_element_with_nan<int32_t, true, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<int32_t, false>(const int32_t *v, size_t size,
+                                        int32_t noDataValue)
+{
+    return extremum_element_with_nan<int32_t, false, true>(v, size,
+                                                           noDataValue);
+}
+
+template <>
+size_t extremum_element<int32_t, true>(const int32_t *v, size_t size)
+{
+    return extremum_element_with_nan<int32_t, true, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<int32_t, false>(const int32_t *v, size_t size)
+{
+    return extremum_element_with_nan<int32_t, false, false>(v, size, 0);
+}
+
+template <>
+size_t extremum_element<float, true>(const float *v, size_t size,
+                                     float noDataValue)
+{
+    return extremum_element_with_nan<float, true, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<float, false>(const float *v, size_t size,
+                                      float noDataValue)
+{
+    return extremum_element_with_nan<float, false, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<double, true>(const double *v, size_t size,
+                                      double noDataValue)
+{
+    return extremum_element_with_nan<double, true, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<double, false>(const double *v, size_t size,
+                                       double noDataValue)
+{
+    return extremum_element_with_nan<double, false, true>(v, size, noDataValue);
+}
+
+#endif
+
+template <> size_t extremum_element<float, true>(const float *v, size_t size)
+{
+    return extremum_element_with_nan<float, true, false>(v, size, 0);
+}
+
+template <> size_t extremum_element<double, true>(const double *v, size_t size)
+{
+    return extremum_element_with_nan<double, true, false>(v, size, 0);
+}
+
+template <> size_t extremum_element<float, false>(const float *v, size_t size)
+{
+    return extremum_element_with_nan<float, false, false>(v, size, 0);
+}
+
+template <> size_t extremum_element<double, false>(const double *v, size_t size)
+{
+    return extremum_element_with_nan<double, false, false>(v, size, 0);
+}
+
+/************************************************************************/
+/*                       extremum_element_with_nan()                    */
+/************************************************************************/
+
+template <class T, bool IS_MAX>
+inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue)
+{
+    if (std::isnan(noDataValue))
+        return extremum_element_with_nan<T, IS_MAX, false>(v, size, 0);
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    auto extremum = v[0];
+    bool extremum_is_nan_or_nodata =
+        std::isnan(extremum) || (extremum == noDataValue);
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (v[i] != noDataValue &&
+            (compScalar<T, IS_MAX>(v[i], extremum) ||
+             (extremum_is_nan_or_nodata && !std::isnan(v[i]))))
+        {
+            extremum = v[i];
+            idx_of_extremum = i;
+            extremum_is_nan_or_nodata = false;
+        }
+    }
+    return idx_of_extremum;
+}
+
+/************************************************************************/
+/*                            extremum_element()                        */
+/************************************************************************/
+
+#if !defined(GDAL_MINMAX_ELEMENT_USE_SSE2)
+
+template <>
+size_t extremum_element<float, true>(const float *v, size_t size,
+                                     float noDataValue)
+{
+    return extremum_element_with_nan<float, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<float, false>(const float *v, size_t size,
+                                      float noDataValue)
+{
+    return extremum_element_with_nan<float, false>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<double, true>(const double *v, size_t size,
+                                      double noDataValue)
+{
+    return extremum_element_with_nan<double, true>(v, size, noDataValue);
+}
+
+template <>
+size_t extremum_element<double, false>(const double *v, size_t size,
+                                       double noDataValue)
+{
+    return extremum_element_with_nan<double, false>(v, size, noDataValue);
+}
+
+#endif
+
+template <class T, bool IS_MAX>
+inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData,
+                               T noDataValue)
+{
+    if (bHasNoData)
+        return extremum_element<T, IS_MAX>(buffer, size, noDataValue);
+    else
+        return extremum_element<T, IS_MAX>(buffer, size);
+}
+
+#else
+
+template <class T, bool IS_MAX>
+inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData,
+                               T noDataValue)
+{
+    if (bHasNoData)
+    {
+        if constexpr (std::is_floating_point_v<T>)
+        {
+            if (std::isnan(noDataValue))
+            {
+                if constexpr (IS_MAX)
+                {
+                    return std::max_element(buffer, buffer + size,
+                                            [](T a, T b) {
+                                                return std::isnan(b)   ? false
+                                                       : std::isnan(a) ? true
+                                                                       : a < b;
+                                            }) -
+                           buffer;
+                }
+                else
+                {
+                    return std::min_element(buffer, buffer + size,
+                                            [](T a, T b) {
+                                                return std::isnan(b)   ? true
+                                                       : std::isnan(a) ? false
+                                                                       : a < b;
+                                            }) -
+                           buffer;
+                }
+            }
+            else
+            {
+                if constexpr (IS_MAX)
+                {
+                    return std::max_element(buffer, buffer + size,
+                                            [noDataValue](T a, T b)
+                                            {
+                                                return std::isnan(b)   ? false
+                                                       : std::isnan(a) ? true
+                                                       : (b == noDataValue)
+                                                           ? false
+                                                           : (a == noDataValue)
+                                                                 ? true
+                                                                 : a < b;
+                                            }) -
+                           buffer;
+                }
+                else
+                {
+                    return std::min_element(buffer, buffer + size,
+                                            [noDataValue](T a, T b)
+                                            {
+                                                return std::isnan(b)   ? true
+                                                       : std::isnan(a) ? false
+                                                       : (b == noDataValue)
+                                                           ? true
+                                                           : (a == noDataValue)
+                                                                 ? false
+                                                                 : a < b;
+                                            }) -
+                           buffer;
+                }
+            }
+        }
+        else
+        {
+            if constexpr (IS_MAX)
+            {
+                return std::max_element(buffer, buffer + size,
+                                        [noDataValue](T a, T b) {
+                                            return (b == noDataValue)   ? false
+                                                   : (a == noDataValue) ? true
+                                                                        : a < b;
+                                        }) -
+                       buffer;
+            }
+            else
+            {
+                return std::min_element(buffer, buffer + size,
+                                        [noDataValue](T a, T b) {
+                                            return (b == noDataValue)   ? true
+                                                   : (a == noDataValue) ? false
+                                                                        : a < b;
+                                        }) -
+                       buffer;
+            }
+        }
+    }
+    else
+    {
+        if constexpr (std::is_floating_point_v<T>)
+        {
+            if constexpr (IS_MAX)
+            {
+                return std::max_element(buffer, buffer + size,
+                                        [](T a, T b) {
+                                            return std::isnan(b)   ? false
+                                                   : std::isnan(a) ? true
+                                                                   : a < b;
+                                        }) -
+                       buffer;
+            }
+            else
+            {
+                return std::min_element(buffer, buffer + size,
+                                        [](T a, T b) {
+                                            return std::isnan(b)   ? true
+                                                   : std::isnan(a) ? false
+                                                                   : a < b;
+                                        }) -
+                       buffer;
+            }
+        }
+        else
+        {
+            if constexpr (IS_MAX)
+            {
+                return std::max_element(buffer, buffer + size) - buffer;
+            }
+            else
+            {
+                return std::min_element(buffer, buffer + size) - buffer;
+            }
+        }
+    }
+}
+#endif
+
+template <bool IS_MAX>
+size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT,
+                        bool bHasNoData, double dfNoDataValue)
+{
+    switch (eDT)
+    {
+#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 7, 0)
+    case GDT_Int8:
+    {
+        using T = int8_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+#endif
+    case GDT_Byte:
+    {
+        using T = uint8_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_Int16:
+    {
+        using T = int16_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_UInt16:
+    {
+        using T = uint16_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_Int32:
+    {
+        using T = int32_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_UInt32:
+    {
+        using T = uint32_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 5, 0)
+    case GDT_Int64:
+    {
+        using T = int64_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_UInt64:
+    {
+        using T = uint64_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+#endif
+    case GDT_Float32:
+    {
+        using T = float;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_Float64:
+    {
+        using T = double;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return extremum_element<T, IS_MAX>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    default:
+        break;
+    }
+    CPLError(CE_Failure, CPLE_NotSupported,
+             "%s not supported for this data type.", __FUNCTION__);
+    return 0;
+}
+
+}  // namespace detail
+
+/************************************************************************/
+/*                            max_element()                             */
+/************************************************************************/
+
+/** Return the index of the element where the maximum value is hit.
+ *
+ * If it is hit in several locations, it is not specified which one will be
+ * returned.
+ *
+ * @param buffer Vector of nElts elements of type eDT.
+ * @param nElts Number of elements in buffer.
+ * @param eDT Data type of the elements of buffer.
+ * @param bHasNoData Whether dfNoDataValue is valid.
+ * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true
+ *
+ * @since GDAL 3.11
+ */
+inline size_t max_element(const void *buffer, size_t nElts, GDALDataType eDT,
+                          bool bHasNoData, double dfNoDataValue)
+{
+    return detail::extremum_element<true>(buffer, nElts, eDT, bHasNoData,
+                                          dfNoDataValue);
+}
+
+/************************************************************************/
+/*                            min_element()                             */
+/************************************************************************/
+
+/** Return the index of the element where the minimum value is hit.
+ *
+ * If it is hit in several locations, it is not specified which one will be
+ * returned.
+ *
+ * @param buffer Vector of nElts elements of type eDT.
+ * @param nElts Number of elements in buffer.
+ * @param eDT Data type of the elements of buffer.
+ * @param bHasNoData Whether dfNoDataValue is valid.
+ * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true
+ *
+ * @since GDAL 3.11
+ */
+inline size_t min_element(const void *buffer, size_t nElts, GDALDataType eDT,
+                          bool bHasNoData, double dfNoDataValue)
+{
+    return detail::extremum_element<false>(buffer, nElts, eDT, bHasNoData,
+                                           dfNoDataValue);
+}
+
+namespace detail
+{
+
+#ifdef NOT_EFFICIENT
+
+/************************************************************************/
+/*                         minmax_element()                             */
+/************************************************************************/
+
+template <class T>
+std::pair<size_t, size_t> minmax_element(const T *v, size_t size, T noDataValue)
+{
+    static_assert(!(std::is_floating_point_v<T>));
+    if (size == 0)
+        return std::pair(0, 0);
+    size_t idx_of_min = 0;
+    size_t idx_of_max = 0;
+    T vmin = v[0];
+    T vmax = v[0];
+    bool extremum_is_nodata = vmin == noDataValue;
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (v[i] != noDataValue && (v[i] < vmin || extremum_is_nodata))
+        {
+            vmin = v[i];
+            idx_of_min = i;
+            extremum_is_nodata = false;
+        }
+        if (v[i] != noDataValue && (v[i] > vmax || extremum_is_nodata))
+        {
+            vmax = v[i];
+            idx_of_max = i;
+            extremum_is_nodata = false;
+        }
+    }
+    return std::pair(idx_of_min, idx_of_max);
+}
+
+template <class T>
+std::pair<size_t, size_t> minmax_element(const T *v, size_t size)
+{
+    static_assert(!(std::is_floating_point_v<T>));
+    if (size == 0)
+        return std::pair(0, 0);
+    size_t idx_of_min = 0;
+    size_t idx_of_max = 0;
+    T vmin = v[0];
+    T vmax = v[0];
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (v[i] < vmin)
+        {
+            vmin = v[i];
+            idx_of_min = i;
+        }
+        if (v[i] > vmax)
+        {
+            vmax = v[i];
+            idx_of_max = i;
+        }
+    }
+    return std::pair(idx_of_min, idx_of_max);
+}
+
+template <class T>
+inline std::pair<size_t, size_t> minmax_element_with_nan(const T *v,
+                                                         size_t size)
+{
+    if (size == 0)
+        return std::pair(0, 0);
+    size_t idx_of_min = 0;
+    size_t idx_of_max = 0;
+    T vmin = v[0];
+    T vmax = v[0];
+    size_t i = 1;
+    if (std::isnan(v[0]))
+    {
+        for (; i < size; ++i)
+        {
+            if (!std::isnan(v[i]))
+            {
+                vmin = v[i];
+                idx_of_min = i;
+                vmax = v[i];
+                idx_of_max = i;
+                break;
+            }
+        }
+    }
+    for (; i < size; ++i)
+    {
+        if (v[i] < vmin)
+        {
+            vmin = v[i];
+            idx_of_min = i;
+        }
+        if (v[i] > vmax)
+        {
+            vmax = v[i];
+            idx_of_max = i;
+        }
+    }
+    return std::pair(idx_of_min, idx_of_max);
+}
+
+template <>
+std::pair<size_t, size_t> minmax_element<float>(const float *v, size_t size)
+{
+    return minmax_element_with_nan<float>(v, size);
+}
+
+template <>
+std::pair<size_t, size_t> minmax_element<double>(const double *v, size_t size)
+{
+    return minmax_element_with_nan<double>(v, size);
+}
+
+template <class T>
+inline std::pair<size_t, size_t> minmax_element(const T *buffer, size_t size,
+                                                bool bHasNoData, T noDataValue)
+{
+    if (bHasNoData)
+    {
+        return minmax_element<T>(buffer, size, noDataValue);
+    }
+    else
+    {
+        return minmax_element<T>(buffer, size);
+    }
+}
+#else
+
+/************************************************************************/
+/*                         minmax_element()                             */
+/************************************************************************/
+
+template <class T>
+inline std::pair<size_t, size_t> minmax_element(const T *buffer, size_t size,
+                                                bool bHasNoData, T noDataValue)
+{
+#ifdef NOT_EFFICIENT
+    if (bHasNoData)
+    {
+        return minmax_element<T>(buffer, size, noDataValue);
+    }
+    else
+    {
+        return minmax_element<T>(buffer, size);
+        //auto [imin, imax] = std::minmax_element(buffer, buffer + size);
+        //return std::pair(imin - buffer, imax - buffer);
+    }
+#else
+
+#if !defined(GDAL_MINMAX_ELEMENT_USE_SSE2)
+    if constexpr (!std::is_floating_point_v<T>)
+    {
+        if (!bHasNoData)
+        {
+            auto [min_iter, max_iter] =
+                std::minmax_element(buffer, buffer + size);
+            return std::pair(min_iter - buffer, max_iter - buffer);
+        }
+    }
+#endif
+
+    // Using separately min and max is more efficient than computing them
+    // within the same loop
+    return std::pair(
+        extremum_element<T, false>(buffer, size, bHasNoData, noDataValue),
+        extremum_element<T, true>(buffer, size, bHasNoData, noDataValue));
+
+#endif
+}
+#endif
+
+}  // namespace detail
+
+/************************************************************************/
+/*                          minmax_element()                            */
+/************************************************************************/
+
+/** Return the index of the elements where the minimum and maximum values are hit.
+ *
+ * If they are hit in several locations, it is not specified which one will be
+ * returned (contrary to std::minmax_element).
+ *
+ * @param buffer Vector of nElts elements of type eDT.
+ * @param nElts Number of elements in buffer.
+ * @param eDT Data type of the elements of buffer.
+ * @param bHasNoData Whether dfNoDataValue is valid.
+ * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true
+ *
+ * @since GDAL 3.11
+ */
+inline std::pair<size_t, size_t> minmax_element(const void *buffer,
+                                                size_t nElts, GDALDataType eDT,
+                                                bool bHasNoData,
+                                                double dfNoDataValue)
+{
+    switch (eDT)
+    {
+#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 7, 0)
+    case GDT_Int8:
+    {
+        using T = int8_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+#endif
+    case GDT_Byte:
+    {
+        using T = uint8_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_Int16:
+    {
+        using T = int16_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_UInt16:
+    {
+        using T = uint16_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_Int32:
+    {
+        using T = int32_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_UInt32:
+    {
+        using T = uint32_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 5, 0)
+    case GDT_Int64:
+    {
+        using T = int64_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_UInt64:
+    {
+        using T = uint64_t;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+#endif
+    case GDT_Float32:
+    {
+        using T = float;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    case GDT_Float64:
+    {
+        using T = double;
+        bHasNoData = bHasNoData && GDALIsValueExactAs<T>(dfNoDataValue);
+        return detail::minmax_element<T>(
+            static_cast<const T *>(buffer), nElts, bHasNoData,
+            bHasNoData ? static_cast<T>(dfNoDataValue) : 0);
+    }
+    default:
+        break;
+    }
+    CPLError(CE_Failure, CPLE_NotSupported,
+             "%s not supported for this data type.", __FUNCTION__);
+    return std::pair(0, 0);
+}
+
+}  // namespace GDAL_MINMAXELT_NS
+
+#endif  // GDAL_MINMAX_ELEMENT_INCLUDED
diff --git a/src/core/providers/gdal/gdal_priv_templates.hpp b/src/core/providers/gdal/gdal_priv_templates.hpp
new file mode 100644
index 00000000000..3c20c055687
--- /dev/null
+++ b/src/core/providers/gdal/gdal_priv_templates.hpp
@@ -0,0 +1,787 @@
+/******************************************************************************
+ * $Id$
+ *
+ * Project:  GDAL Core
+ * Purpose:  Inline C++ templates
+ * Author:   Phil Vachon, <philippe at cowpig.ca>
+ *
+ ******************************************************************************
+ * Copyright (c) 2009, Phil Vachon, <philippe at cowpig.ca>
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#ifndef GDAL_PRIV_TEMPLATES_HPP_INCLUDED
+#define GDAL_PRIV_TEMPLATES_HPP_INCLUDED
+
+#include "cpl_port.h"
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+/************************************************************************/
+/*                        GDALGetDataLimits()                           */
+/************************************************************************/
+/**
+ * Compute the limits of values that can be placed in Tout in terms of
+ * Tin. Usually used for output clamping, when the output data type's
+ * limits are stable relative to the input type (i.e. no roundoff error).
+ *
+ * @param tMaxValue the returned maximum value
+ * @param tMinValue the returned minimum value
+ */
+
+template <class Tin, class Tout>
+inline void GDALGetDataLimits(Tin &tMaxValue, Tin &tMinValue)
+{
+    tMaxValue = std::numeric_limits<Tin>::max();
+    tMinValue = std::numeric_limits<Tin>::min();
+
+    // Compute the actual minimum value of Tout in terms of Tin.
+    if constexpr (std::numeric_limits<Tout>::is_signed &&
+                  std::numeric_limits<Tout>::is_integer)
+    {
+        // the minimum value is less than zero
+        if constexpr (std::numeric_limits<Tout>::digits <
+                          std::numeric_limits<Tin>::digits ||
+                      !std::numeric_limits<Tin>::is_integer)
+        {
+            // Tout is smaller than Tin, so we need to clamp values in input
+            // to the range of Tout's min/max values
+            if (std::numeric_limits<Tin>::is_signed)
+            {
+                tMinValue = static_cast<Tin>(std::numeric_limits<Tout>::min());
+            }
+            tMaxValue = static_cast<Tin>(std::numeric_limits<Tout>::max());
+        }
+    }
+    else if constexpr (std::numeric_limits<Tout>::is_integer)
+    {
+        // the output is unsigned, so we just need to determine the max
+        /* coverity[same_on_both_sides] */
+        if constexpr (std::numeric_limits<Tout>::digits <=
+                      std::numeric_limits<Tin>::digits)
+        {
+            // Tout is smaller than Tin, so we need to clamp the input values
+            // to the range of Tout's max
+            tMaxValue = static_cast<Tin>(std::numeric_limits<Tout>::max());
+        }
+        tMinValue = 0;
+    }
+}
+
+/************************************************************************/
+/*                          GDALClampValue()                            */
+/************************************************************************/
+/**
+ * Clamp values of type T to a specified range
+ *
+ * @param tValue the value
+ * @param tMax the max value
+ * @param tMin the min value
+ */
+template <class T>
+inline T GDALClampValue(const T tValue, const T tMax, const T tMin)
+{
+    return tValue > tMax ? tMax : tValue < tMin ? tMin : tValue;
+}
+
+/************************************************************************/
+/*                          GDALClampDoubleValue()                            */
+/************************************************************************/
+/**
+ * Clamp double values to a specified range, this uses the same
+ * argument ordering as std::clamp, returns TRUE if the value was clamped.
+ *
+ * @param tValue the value
+ * @param tMin the min value
+ * @param tMax the max value
+ *
+ */
+template <class T2, class T3>
+inline bool GDALClampDoubleValue(double &tValue, const T2 tMin, const T3 tMax)
+{
+    const double tMin2{static_cast<double>(tMin)};
+    const double tMax2{static_cast<double>(tMax)};
+    if (tValue > tMax2 || tValue < tMin2)
+    {
+        tValue = tValue > tMax2 ? tMax2 : tValue < tMin2 ? tMin2 : tValue;
+        return true;
+    }
+    else
+    {
+        return false;
+    }
+}
+
+/************************************************************************/
+/*                         GDALIsValueInRange()                         */
+/************************************************************************/
+/**
+ * Returns whether a value is in the type range.
+ * NaN is considered not to be in type range.
+ *
+ * @param dfValue the value
+ * @return whether the value is in the type range.
+ */
+template <class T> inline bool GDALIsValueInRange(double dfValue)
+{
+    return dfValue >= static_cast<double>(std::numeric_limits<T>::lowest()) &&
+           dfValue <= static_cast<double>(std::numeric_limits<T>::max());
+}
+
+template <> inline bool GDALIsValueInRange<double>(double dfValue)
+{
+    return !std::isnan(dfValue);
+}
+
+template <> inline bool GDALIsValueInRange<float>(double dfValue)
+{
+    return std::isinf(dfValue) ||
+           (dfValue >= -std::numeric_limits<float>::max() &&
+            dfValue <= std::numeric_limits<float>::max());
+}
+
+template <> inline bool GDALIsValueInRange<int64_t>(double dfValue)
+{
+    // Values in the range [INT64_MAX - 1023, INT64_MAX - 1]
+    // get converted to a double that once cast to int64_t is
+    // INT64_MAX + 1, hence the < strict comparison.
+    return dfValue >=
+               static_cast<double>(std::numeric_limits<int64_t>::min()) &&
+           dfValue < static_cast<double>(std::numeric_limits<int64_t>::max());
+}
+
+template <> inline bool GDALIsValueInRange<uint64_t>(double dfValue)
+{
+    // Values in the range [UINT64_MAX - 2047, UINT64_MAX - 1]
+    // get converted to a double that once cast to uint64_t is
+    // UINT64_MAX + 1, hence the < strict comparison.
+    return dfValue >= 0 &&
+           dfValue < static_cast<double>(std::numeric_limits<uint64_t>::max());
+}
+
+/************************************************************************/
+/*                         GDALIsValueExactAs()                         */
+/************************************************************************/
+/**
+ * Returns whether a value can be exactly represented on type T.
+ *
+ * That is static_cast\<double\>(static_cast\<T\>(dfValue)) is legal and is
+ * equal to dfValue.
+ *
+ * Note: for T=float or double, a NaN input leads to true
+ *
+ * @param dfValue the value
+ * @return whether the value can be exactly represented on type T.
+ */
+template <class T> inline bool GDALIsValueExactAs(double dfValue)
+{
+    return GDALIsValueInRange<T>(dfValue) &&
+           static_cast<double>(static_cast<T>(dfValue)) == dfValue;
+}
+
+template <> inline bool GDALIsValueExactAs<float>(double dfValue)
+{
+    return std::isnan(dfValue) ||
+           (GDALIsValueInRange<float>(dfValue) &&
+            static_cast<double>(static_cast<float>(dfValue)) == dfValue);
+}
+
+template <> inline bool GDALIsValueExactAs<double>(double)
+{
+    return true;
+}
+
+/************************************************************************/
+/*                          GDALCopyWord()                              */
+/************************************************************************/
+
+template <class Tin, class Tout> struct sGDALCopyWord
+{
+    static inline void f(const Tin tValueIn, Tout &tValueOut)
+    {
+        Tin tMaxVal, tMinVal;
+        GDALGetDataLimits<Tin, Tout>(tMaxVal, tMinVal);
+        tValueOut =
+            static_cast<Tout>(GDALClampValue(tValueIn, tMaxVal, tMinVal));
+    }
+};
+
+template <class Tin> struct sGDALCopyWord<Tin, float>
+{
+    static inline void f(const Tin tValueIn, float &fValueOut)
+    {
+        fValueOut = static_cast<float>(tValueIn);
+    }
+};
+
+template <class Tin> struct sGDALCopyWord<Tin, double>
+{
+    static inline void f(const Tin tValueIn, double &dfValueOut)
+    {
+        dfValueOut = static_cast<double>(tValueIn);
+    }
+};
+
+template <> struct sGDALCopyWord<double, double>
+{
+    static inline void f(const double dfValueIn, double &dfValueOut)
+    {
+        dfValueOut = dfValueIn;
+    }
+};
+
+template <> struct sGDALCopyWord<float, float>
+{
+    static inline void f(const float fValueIn, float &fValueOut)
+    {
+        fValueOut = fValueIn;
+    }
+};
+
+template <> struct sGDALCopyWord<float, double>
+{
+    static inline void f(const float fValueIn, double &dfValueOut)
+    {
+        dfValueOut = fValueIn;
+    }
+};
+
+template <> struct sGDALCopyWord<double, float>
+{
+    static inline void f(const double dfValueIn, float &fValueOut)
+    {
+        if (dfValueIn > std::numeric_limits<float>::max())
+        {
+            fValueOut = std::numeric_limits<float>::infinity();
+            return;
+        }
+        if (dfValueIn < -std::numeric_limits<float>::max())
+        {
+            fValueOut = -std::numeric_limits<float>::infinity();
+            return;
+        }
+
+        fValueOut = static_cast<float>(dfValueIn);
+    }
+};
+
+template <class Tout> struct sGDALCopyWord<float, Tout>
+{
+    static inline void f(const float fValueIn, Tout &tValueOut)
+    {
+        if (std::isnan(fValueIn))
+        {
+            tValueOut = 0;
+            return;
+        }
+        float fMaxVal, fMinVal;
+        GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
+        tValueOut = static_cast<Tout>(
+            GDALClampValue(fValueIn + 0.5f, fMaxVal, fMinVal));
+    }
+};
+
+template <> struct sGDALCopyWord<float, short>
+{
+    static inline void f(const float fValueIn, short &nValueOut)
+    {
+        if (std::isnan(fValueIn))
+        {
+            nValueOut = 0;
+            return;
+        }
+        float fMaxVal, fMinVal;
+        GDALGetDataLimits<float, short>(fMaxVal, fMinVal);
+        float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
+        nValueOut =
+            static_cast<short>(GDALClampValue(fValue, fMaxVal, fMinVal));
+    }
+};
+
+template <> struct sGDALCopyWord<float, signed char>
+{
+    static inline void f(const float fValueIn, signed char &nValueOut)
+    {
+        if (std::isnan(fValueIn))
+        {
+            nValueOut = 0;
+            return;
+        }
+        float fMaxVal, fMinVal;
+        GDALGetDataLimits<float, signed char>(fMaxVal, fMinVal);
+        float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
+        nValueOut =
+            static_cast<signed char>(GDALClampValue(fValue, fMaxVal, fMinVal));
+    }
+};
+
+template <class Tout> struct sGDALCopyWord<double, Tout>
+{
+    static inline void f(const double dfValueIn, Tout &tValueOut)
+    {
+        if (std::isnan(dfValueIn))
+        {
+            tValueOut = 0;
+            return;
+        }
+        double dfMaxVal, dfMinVal;
+        GDALGetDataLimits<double, Tout>(dfMaxVal, dfMinVal);
+        tValueOut = static_cast<Tout>(
+            GDALClampValue(dfValueIn + 0.5, dfMaxVal, dfMinVal));
+    }
+};
+
+template <> struct sGDALCopyWord<double, int>
+{
+    static inline void f(const double dfValueIn, int &nValueOut)
+    {
+        if (std::isnan(dfValueIn))
+        {
+            nValueOut = 0;
+            return;
+        }
+        double dfMaxVal, dfMinVal;
+        GDALGetDataLimits<double, int>(dfMaxVal, dfMinVal);
+        double dfValue = dfValueIn >= 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
+        nValueOut =
+            static_cast<int>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
+    }
+};
+
+template <> struct sGDALCopyWord<double, std::int64_t>
+{
+    static inline void f(const double dfValueIn, std::int64_t &nValueOut)
+    {
+        if (std::isnan(dfValueIn))
+        {
+            nValueOut = 0;
+        }
+        else if (dfValueIn >=
+                 static_cast<double>(std::numeric_limits<std::int64_t>::max()))
+        {
+            nValueOut = std::numeric_limits<std::int64_t>::max();
+        }
+        else if (dfValueIn <=
+                 static_cast<double>(std::numeric_limits<std::int64_t>::min()))
+        {
+            nValueOut = std::numeric_limits<std::int64_t>::min();
+        }
+        else
+        {
+            nValueOut = static_cast<std::int64_t>(
+                dfValueIn > 0.0f ? dfValueIn + 0.5f : dfValueIn - 0.5f);
+        }
+    }
+};
+
+template <> struct sGDALCopyWord<double, std::uint64_t>
+{
+    static inline void f(const double dfValueIn, std::uint64_t &nValueOut)
+    {
+        if (!(dfValueIn > 0))
+        {
+            nValueOut = 0;
+        }
+        else if (dfValueIn >
+                 static_cast<double>(std::numeric_limits<uint64_t>::max()))
+        {
+            nValueOut = std::numeric_limits<uint64_t>::max();
+        }
+        else
+        {
+            nValueOut = static_cast<std::uint64_t>(dfValueIn + 0.5);
+        }
+    }
+};
+
+template <> struct sGDALCopyWord<double, short>
+{
+    static inline void f(const double dfValueIn, short &nValueOut)
+    {
+        if (std::isnan(dfValueIn))
+        {
+            nValueOut = 0;
+            return;
+        }
+        double dfMaxVal, dfMinVal;
+        GDALGetDataLimits<double, short>(dfMaxVal, dfMinVal);
+        double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
+        nValueOut =
+            static_cast<short>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
+    }
+};
+
+template <> struct sGDALCopyWord<double, signed char>
+{
+    static inline void f(const double dfValueIn, signed char &nValueOut)
+    {
+        if (std::isnan(dfValueIn))
+        {
+            nValueOut = 0;
+            return;
+        }
+        double dfMaxVal, dfMinVal;
+        GDALGetDataLimits<double, signed char>(dfMaxVal, dfMinVal);
+        double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
+        nValueOut = static_cast<signed char>(
+            GDALClampValue(dfValue, dfMaxVal, dfMinVal));
+    }
+};
+
+// Roundoff occurs for Float32 -> int32 for max/min. Overload GDALCopyWord
+// specifically for this case.
+template <> struct sGDALCopyWord<float, int>
+{
+    static inline void f(const float fValueIn, int &nValueOut)
+    {
+        if (std::isnan(fValueIn))
+        {
+            nValueOut = 0;
+        }
+        else if (fValueIn >=
+                 static_cast<float>(std::numeric_limits<int>::max()))
+        {
+            nValueOut = std::numeric_limits<int>::max();
+        }
+        else if (fValueIn <=
+                 static_cast<float>(std::numeric_limits<int>::min()))
+        {
+            nValueOut = std::numeric_limits<int>::min();
+        }
+        else
+        {
+            nValueOut = static_cast<int>(fValueIn > 0.0f ? fValueIn + 0.5f
+                                                         : fValueIn - 0.5f);
+        }
+    }
+};
+
+// Roundoff occurs for Float32 -> uint32 for max. Overload GDALCopyWord
+// specifically for this case.
+template <> struct sGDALCopyWord<float, unsigned int>
+{
+    static inline void f(const float fValueIn, unsigned int &nValueOut)
+    {
+        if (!(fValueIn > 0))
+        {
+            nValueOut = 0;
+        }
+        else if (fValueIn >=
+                 static_cast<float>(std::numeric_limits<unsigned int>::max()))
+        {
+            nValueOut = std::numeric_limits<unsigned int>::max();
+        }
+        else
+        {
+            nValueOut = static_cast<unsigned int>(fValueIn + 0.5f);
+        }
+    }
+};
+
+// Roundoff occurs for Float32 -> std::int64_t for max/min. Overload
+// GDALCopyWord specifically for this case.
+template <> struct sGDALCopyWord<float, std::int64_t>
+{
+    static inline void f(const float fValueIn, std::int64_t &nValueOut)
+    {
+        if (std::isnan(fValueIn))
+        {
+            nValueOut = 0;
+        }
+        else if (fValueIn >=
+                 static_cast<float>(std::numeric_limits<std::int64_t>::max()))
+        {
+            nValueOut = std::numeric_limits<std::int64_t>::max();
+        }
+        else if (fValueIn <=
+                 static_cast<float>(std::numeric_limits<std::int64_t>::min()))
+        {
+            nValueOut = std::numeric_limits<std::int64_t>::min();
+        }
+        else
+        {
+            nValueOut = static_cast<std::int64_t>(
+                fValueIn > 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f);
+        }
+    }
+};
+
+// Roundoff occurs for Float32 -> std::uint64_t for max. Overload GDALCopyWord
+// specifically for this case.
+template <> struct sGDALCopyWord<float, std::uint64_t>
+{
+    static inline void f(const float fValueIn, std::uint64_t &nValueOut)
+    {
+        if (!(fValueIn > 0))
+        {
+            nValueOut = 0;
+        }
+        else if (fValueIn >=
+                 static_cast<float>(std::numeric_limits<std::uint64_t>::max()))
+        {
+            nValueOut = std::numeric_limits<std::uint64_t>::max();
+        }
+        else
+        {
+            nValueOut = static_cast<std::uint64_t>(fValueIn + 0.5f);
+        }
+    }
+};
+
+/**
+ * Copy a single word, optionally rounding if appropriate (i.e. going
+ * from the float to the integer case). Note that this is the function
+ * you should specialize if you're adding a new data type.
+ *
+ * @param tValueIn value of type Tin; the input value to be converted
+ * @param tValueOut value of type Tout; the output value
+ */
+
+template <class Tin, class Tout>
+inline void GDALCopyWord(const Tin tValueIn, Tout &tValueOut)
+{
+    sGDALCopyWord<Tin, Tout>::f(tValueIn, tValueOut);
+}
+
+/************************************************************************/
+/*                         GDALCopy4Words()                             */
+/************************************************************************/
+/**
+ * Copy 4 packed words to 4 packed words, optionally rounding if appropriate
+ * (i.e. going from the float to the integer case).
+ *
+ * @param pValueIn pointer to 4 input values of type Tin.
+ * @param pValueOut pointer to 4 output values of type Tout.
+ */
+
+template <class Tin, class Tout>
+inline void GDALCopy4Words(const Tin *pValueIn, Tout *const pValueOut)
+{
+    GDALCopyWord(pValueIn[0], pValueOut[0]);
+    GDALCopyWord(pValueIn[1], pValueOut[1]);
+    GDALCopyWord(pValueIn[2], pValueOut[2]);
+    GDALCopyWord(pValueIn[3], pValueOut[3]);
+}
+
+/************************************************************************/
+/*                         GDALCopy8Words()                             */
+/************************************************************************/
+/**
+ * Copy 8 packed words to 8 packed words, optionally rounding if appropriate
+ * (i.e. going from the float to the integer case).
+ *
+ * @param pValueIn pointer to 8 input values of type Tin.
+ * @param pValueOut pointer to 8 output values of type Tout.
+ */
+
+template <class Tin, class Tout>
+inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut)
+{
+    GDALCopy4Words(pValueIn, pValueOut);
+    GDALCopy4Words(pValueIn + 4, pValueOut + 4);
+}
+
+// Needs SSE2
+#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)
+
+#include <emmintrin.h>
+
+static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
+{
+    int n32 = _mm_cvtsi128_si32(xmm);  // Extract lower 32 bit word
+    memcpy(pDest, &n32, sizeof(n32));
+}
+
+static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
+{
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(pDest), xmm);
+}
+
+#if __SSSE3__
+#include <tmmintrin.h>
+#endif
+
+#if __SSE4_1__
+#include <smmintrin.h>
+#endif
+
+template <>
+inline void GDALCopy4Words(const float *pValueIn, GByte *const pValueOut)
+{
+    __m128 xmm = _mm_loadu_ps(pValueIn);
+
+    // The following clamping would be useless due to the final saturating
+    // packing if we could guarantee the input range in [INT_MIN,INT_MAX]
+    const __m128 p0d5 = _mm_set1_ps(0.5f);
+    const __m128 xmm_max = _mm_set1_ps(255);
+    xmm = _mm_add_ps(xmm, p0d5);
+    xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
+
+    __m128i xmm_i = _mm_cvttps_epi32(xmm);
+
+#if __SSSE3__
+    xmm_i = _mm_shuffle_epi8(
+        xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
+#else
+    xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
+    xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
+#endif
+    GDALCopyXMMToInt32(xmm_i, pValueOut);
+}
+
+template <>
+inline void GDALCopy4Words(const float *pValueIn, GInt16 *const pValueOut)
+{
+    __m128 xmm = _mm_loadu_ps(pValueIn);
+
+    const __m128 xmm_min = _mm_set1_ps(-32768);
+    const __m128 xmm_max = _mm_set1_ps(32767);
+    xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
+
+    const __m128 p0d5 = _mm_set1_ps(0.5f);
+    const __m128 m0d5 = _mm_set1_ps(-0.5f);
+    const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
+    // f >= 0.5f ? f + 0.5f : f - 0.5f
+    xmm = _mm_add_ps(
+        xmm, _mm_or_ps(_mm_and_ps(mask, p0d5), _mm_andnot_ps(mask, m0d5)));
+
+    __m128i xmm_i = _mm_cvttps_epi32(xmm);
+
+    xmm_i = _mm_packs_epi32(xmm_i, xmm_i);  // Pack int32 to int16
+    GDALCopyXMMToInt64(xmm_i, pValueOut);
+}
+
+template <>
+inline void GDALCopy4Words(const float *pValueIn, GUInt16 *const pValueOut)
+{
+    __m128 xmm = _mm_loadu_ps(pValueIn);
+
+    const __m128 p0d5 = _mm_set1_ps(0.5f);
+    const __m128 xmm_max = _mm_set1_ps(65535);
+    xmm = _mm_add_ps(xmm, p0d5);
+    xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
+
+    __m128i xmm_i = _mm_cvttps_epi32(xmm);
+
+#if __SSE4_1__
+    xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack int32 to uint16
+#else
+    // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
+    xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
+    xmm_i = _mm_packs_epi32(xmm_i, xmm_i);  // Pack int32 to int16
+    // Translate back to uint16 range (actually -32768==32768 in int16)
+    xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
+#endif
+    GDALCopyXMMToInt64(xmm_i, pValueOut);
+}
+
+#ifdef __AVX2__
+
+#include <immintrin.h>
+
+template <>
+inline void GDALCopy8Words(const float *pValueIn, GByte *const pValueOut)
+{
+    __m256 ymm = _mm256_loadu_ps(pValueIn);
+
+    const __m256 p0d5 = _mm256_set1_ps(0.5f);
+    const __m256 ymm_max = _mm256_set1_ps(255);
+    ymm = _mm256_add_ps(ymm, p0d5);
+    ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
+
+    __m256i ymm_i = _mm256_cvttps_epi32(ymm);
+
+    ymm_i = _mm256_packus_epi32(ymm_i, ymm_i);  // Pack int32 to uint16
+    ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2));  // AVX2
+
+    __m128i xmm_i = _mm256_castsi256_si128(ymm_i);
+    xmm_i = _mm_packus_epi16(xmm_i, xmm_i);
+    GDALCopyXMMToInt64(xmm_i, pValueOut);
+}
+
+template <>
+inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
+{
+    __m256 ymm = _mm256_loadu_ps(pValueIn);
+
+    const __m256 p0d5 = _mm256_set1_ps(0.5f);
+    const __m256 ymm_max = _mm256_set1_ps(65535);
+    ymm = _mm256_add_ps(ymm, p0d5);
+    ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
+
+    __m256i ymm_i = _mm256_cvttps_epi32(ymm);
+
+    ymm_i = _mm256_packus_epi32(ymm_i, ymm_i);  // Pack int32 to uint16
+    ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2));  // AVX2
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
+                     _mm256_castsi256_si128(ymm_i));
+}
+#else
+template <>
+inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
+{
+    __m128 xmm = _mm_loadu_ps(pValueIn);
+    __m128 xmm1 = _mm_loadu_ps(pValueIn + 4);
+
+    const __m128 p0d5 = _mm_set1_ps(0.5f);
+    const __m128 xmm_max = _mm_set1_ps(65535);
+    xmm = _mm_add_ps(xmm, p0d5);
+    xmm1 = _mm_add_ps(xmm1, p0d5);
+    xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
+    xmm1 = _mm_min_ps(_mm_max_ps(xmm1, p0d5), xmm_max);
+
+    __m128i xmm_i = _mm_cvttps_epi32(xmm);
+    __m128i xmm1_i = _mm_cvttps_epi32(xmm1);
+
+#if __SSE4_1__
+    xmm_i = _mm_packus_epi32(xmm_i, xmm1_i);  // Pack int32 to uint16
+#else
+    // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
+    xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
+    xmm1_i = _mm_add_epi32(xmm1_i, _mm_set1_epi32(-32768));
+    xmm_i = _mm_packs_epi32(xmm_i, xmm1_i);  // Pack int32 to int16
+    // Translate back to uint16 range (actually -32768==32768 in int16)
+    xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
+#endif
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
+}
+#endif
+
+#ifdef notdef_because_slightly_slower_than_default_implementation
+template <>
+inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
+{
+    __m128d float_posmax = _mm_set1_pd(std::numeric_limits<float>::max());
+    __m128d float_negmax = _mm_set1_pd(-std::numeric_limits<float>::max());
+    __m128d float_posinf = _mm_set1_pd(std::numeric_limits<float>::infinity());
+    __m128d float_neginf = _mm_set1_pd(-std::numeric_limits<float>::infinity());
+    __m128d val01 = _mm_loadu_pd(pValueIn);
+    __m128d val23 = _mm_loadu_pd(pValueIn + 2);
+    __m128d mask_max = _mm_cmpge_pd(val01, float_posmax);
+    __m128d mask_max23 = _mm_cmpge_pd(val23, float_posmax);
+    val01 = _mm_or_pd(_mm_and_pd(mask_max, float_posinf),
+                      _mm_andnot_pd(mask_max, val01));
+    val23 = _mm_or_pd(_mm_and_pd(mask_max23, float_posinf),
+                      _mm_andnot_pd(mask_max23, val23));
+    __m128d mask_min = _mm_cmple_pd(val01, float_negmax);
+    __m128d mask_min23 = _mm_cmple_pd(val23, float_negmax);
+    val01 = _mm_or_pd(_mm_and_pd(mask_min, float_neginf),
+                      _mm_andnot_pd(mask_min, val01));
+    val23 = _mm_or_pd(_mm_and_pd(mask_min23, float_neginf),
+                      _mm_andnot_pd(mask_min23, val23));
+    __m128 val01_s = _mm_cvtpd_ps(val01);
+    __m128 val23_s = _mm_cvtpd_ps(val23);
+    __m128i val01_i = _mm_castps_si128(val01_s);
+    __m128i val23_i = _mm_castps_si128(val23_s);
+    GDALCopyXMMToInt64(val01_i, pValueOut);
+    GDALCopyXMMToInt64(val23_i, pValueOut + 2);
+}
+#endif
+
+#endif  //  defined(__x86_64) || defined(_M_X64)
+
+#endif  // GDAL_PRIV_TEMPLATES_HPP_INCLUDED
diff --git a/src/core/raster/qgsrasterblock.cpp b/src/core/raster/qgsrasterblock.cpp
index 2abc8d9c6ec..55fbf5b9a17 100644
--- a/src/core/raster/qgsrasterblock.cpp
+++ b/src/core/raster/qgsrasterblock.cpp
@@ -24,6 +24,10 @@
 #include "qgslogger.h"
 #include "qgsrasterblock.h"
 #include "qgsrectangle.h"
+#include "qgsgdalutils.h"
+
+#define GDAL_MINMAXELT_NS qgis_gdal
+#include "gdal_minmax_element.hpp"
 
 // See #9101 before any change of NODATA_COLOR!
 const QRgb QgsRasterBlock::NO_DATA_COLOR = qRgba( 0, 0, 0, 0 );
@@ -873,292 +877,60 @@ QRect QgsRasterBlock::subRect( const QgsRectangle &extent, int width, int height
   return subRect;
 }
 
-template<class T> bool findMaximum( const void *data, bool hasNoData, double noDataValue, int width, int height, double &maximum, std::size_t &offset )
-{
-  const T *castData = static_cast< const T * >( data );
-  auto end = castData + static_cast< std::size_t >( width ) * static_cast< std::size_t >( height );
-  if ( !hasNoData )
-  {
-    const auto maxElem = std::max_element( castData, end );
-    if ( maxElem != end )
-    {
-      offset = maxElem - castData;
-      maximum = *maxElem;
-      return true;
-    }
-    return false;
-  }
-  else
-  {
-    maximum = std::numeric_limits< double >::lowest();
-    bool found = false;
-    for ( auto it = castData; it != end; ++it )
-    {
-      const double value = static_cast< double >( *it );
-      if ( std::isnan( value ) || qgsDoubleNear( value, noDataValue ) )
-        continue;
-
-      if ( value > maximum || !found )
-      {
-        maximum = value;
-        offset = it - castData;
-        found = true;
-      }
-    }
-    return found;
-  }
-}
-
-template<class T> bool findMinimum( const void *data, bool hasNoData, double noDataValue, int width, int height, double &minimum, std::size_t &offset )
-{
-  const T *castData = static_cast< const T * >( data );
-  auto end = castData + static_cast< std::size_t >( width ) * static_cast< std::size_t >( height );
-  if ( !hasNoData )
-  {
-    const auto minElem = std::min_element( castData, end );
-    if ( minElem != end )
-    {
-      offset = minElem - castData;
-      minimum = *minElem;
-      return true;
-    }
-    return false;
-  }
-  else
-  {
-    minimum = std::numeric_limits< double >::max();
-    bool found = false;
-    for ( auto it = castData; it != end; ++it )
-    {
-      const double value = static_cast< double >( *it );
-      if ( std::isnan( value ) || qgsDoubleNear( value, noDataValue ) )
-        continue;
-
-      if ( value < minimum || !found )
-      {
-        minimum = value;
-        offset = it - castData;
-        found = true;
-      }
-    }
-    return found;
-  }
-}
-
-template<class T> bool findMinMax( const void *data, bool hasNoData, double noDataValue, int width, int height, double &minimum, std::size_t &minOffset, double &maximum, std::size_t &maxOffset )
-{
-  const T *castData = static_cast< const T * >( data );
-  auto end = castData + static_cast< std::size_t >( width ) * static_cast< std::size_t >( height );
-  if ( !hasNoData )
-  {
-    const auto minMaxElem = std::minmax_element( castData, end );
-    if ( minMaxElem.first != end )
-    {
-      minOffset = minMaxElem.first - castData;
-      minimum = *minMaxElem.first;
-      maxOffset = minMaxElem.second - castData;
-      maximum = *minMaxElem.second;
-      return true;
-    }
-    return false;
-  }
-  else
-  {
-    minimum = std::numeric_limits< double >::max();
-    maximum = std::numeric_limits< double >::lowest();
-    bool found = false;
-    for ( auto it = castData; it != end; ++it )
-    {
-      const double value = static_cast< double >( *it );
-      if ( std::isnan( value ) || qgsDoubleNear( value, noDataValue ) )
-        continue;
-
-      if ( value < minimum || !found )
-      {
-        minimum = value;
-        minOffset = it - castData;
-      }
-      if ( value > maximum || !found )
-      {
-        maximum = value;
-        maxOffset = it - castData;
-      }
-      found = true;
-    }
-    return found;
-  }
-}
-
 bool QgsRasterBlock::minimum( double &minimum, int &row, int &column ) const
 {
-  minimum = std::numeric_limits<double>::quiet_NaN();
   if ( !mData )
   {
+    minimum = std::numeric_limits<double>::quiet_NaN();
     return false;
   }
 
-  std::size_t offset = 0;
-  bool found = false;
-  switch ( mDataType )
-  {
-    case Qgis::DataType::Byte:
-      found = findMinimum< quint8 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, offset );
-      break;
-    case Qgis::DataType::Int8:
-      found = findMinimum< qint8 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, offset );
-      break;
-    case Qgis::DataType::UInt16:
-      found = findMinimum< quint16 >( mData, mHasNoDataValue, mNoDataValue,  mWidth, mHeight, minimum, offset );
-      break;
-    case Qgis::DataType::Int16:
-      found = findMinimum< qint16 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, offset );
-      break;
-    case Qgis::DataType::UInt32:
-      found = findMinimum< quint32 >( mData, mHasNoDataValue, mNoDataValue,  mWidth, mHeight, minimum, offset );
-      break;
-    case Qgis::DataType::Int32:
-      found = findMinimum< qint32 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, offset );
-      break;
-    case Qgis::DataType::Float32:
-      found = findMinimum< float >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, offset );
-      break;
-    case Qgis::DataType::Float64:
-      found = findMinimum< double >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, offset );
-      break;
+  const std::size_t offset = qgis_gdal::min_element( mData, static_cast<std::size_t>( mWidth ) * static_cast< std::size_t>( mHeight ),
+                             QgsGdalUtils::gdalDataTypeFromQgisDataType( mDataType ), mHasNoDataValue, mNoDataValue );
 
-    case Qgis::DataType::CInt16:
-    case Qgis::DataType::CInt32:
-    case Qgis::DataType::CFloat32:
-    case Qgis::DataType::CFloat64:
-    case Qgis::DataType::ARGB32:
-    case Qgis::DataType::ARGB32_Premultiplied:
-    case Qgis::DataType::UnknownDataType:
-      QgsDebugError( QStringLiteral( "Data type %1 is not supported" ).arg( qgsEnumValueToKey< Qgis::DataType >( mDataType ) ) );
-      return false;
-  }
+  row = static_cast< int >( offset / mWidth );
+  column = static_cast< int >( offset % mWidth );
+  minimum = value( offset );
 
-  if ( found )
-  {
-    row = static_cast< int >( offset / mWidth );
-    column = static_cast< int >( offset % mWidth );
-  }
-
-  return found;
+  return true;
 }
+
 bool QgsRasterBlock::maximum( double &maximum SIP_OUT, int &row SIP_OUT, int &column SIP_OUT ) const
 {
-  maximum = std::numeric_limits<double>::quiet_NaN();
   if ( !mData )
   {
+    maximum = std::numeric_limits<double>::quiet_NaN();
     return false;
   }
+  const std::size_t offset = qgis_gdal::max_element( mData, static_cast<std::size_t>( mWidth ) * static_cast< std::size_t>( mHeight ),
+                             QgsGdalUtils::gdalDataTypeFromQgisDataType( mDataType ), mHasNoDataValue, mNoDataValue );
 
-  std::size_t offset = 0;
-  bool found = false;
-  switch ( mDataType )
-  {
-    case Qgis::DataType::Byte:
-      found = findMaximum< quint8 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, maximum, offset );
-      break;
-    case Qgis::DataType::Int8:
-      found = findMaximum< qint8 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, maximum, offset );
-      break;
-    case Qgis::DataType::UInt16:
-      found = findMaximum< quint16 >( mData, mHasNoDataValue, mNoDataValue,  mWidth, mHeight, maximum, offset );
-      break;
-    case Qgis::DataType::Int16:
-      found = findMaximum< qint16 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, maximum, offset );
-      break;
-    case Qgis::DataType::UInt32:
-      found = findMaximum< quint32 >( mData, mHasNoDataValue, mNoDataValue,  mWidth, mHeight, maximum, offset );
-      break;
-    case Qgis::DataType::Int32:
-      found = findMaximum< qint32 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, maximum, offset );
-      break;
-    case Qgis::DataType::Float32:
-      found = findMaximum< float >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, maximum, offset );
-      break;
-    case Qgis::DataType::Float64:
-      found = findMaximum< double >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, maximum, offset );
-      break;
+  row = static_cast< int >( offset / mWidth );
+  column = static_cast< int >( offset % mWidth );
+  maximum = value( offset );
 
-    case Qgis::DataType::CInt16:
-    case Qgis::DataType::CInt32:
-    case Qgis::DataType::CFloat32:
-    case Qgis::DataType::CFloat64:
-    case Qgis::DataType::ARGB32:
-    case Qgis::DataType::ARGB32_Premultiplied:
-    case Qgis::DataType::UnknownDataType:
-      QgsDebugError( QStringLiteral( "Data type %1 is not supported" ).arg( qgsEnumValueToKey< Qgis::DataType >( mDataType ) ) );
-      return false;
-  }
-
-  if ( found )
-  {
-    row = static_cast< int >( offset / mWidth );
-    column = static_cast< int >( offset % mWidth );
-  }
-
-  return found;
+  return true;
 }
 
 bool QgsRasterBlock::minimumMaximum( double &minimum, int &minimumRow, int &minimumColumn, double &maximum, int &maximumRow, int &maximumColumn ) const
 {
-  minimum = std::numeric_limits<double>::quiet_NaN();
-  maximum = std::numeric_limits<double>::quiet_NaN();
   if ( !mData )
   {
+    minimum = std::numeric_limits<double>::quiet_NaN();
+    maximum = std::numeric_limits<double>::quiet_NaN();
     return false;
   }
 
-  std::size_t minOffset = 0;
-  std::size_t maxOffset = 0;
-  bool found = false;
-  switch ( mDataType )
-  {
-    case Qgis::DataType::Byte:
-      found = findMinMax< quint8 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, minOffset, maximum, maxOffset );
-      break;
-    case Qgis::DataType::Int8:
-      found = findMinMax< qint8 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, minOffset, maximum, maxOffset );
-      break;
-    case Qgis::DataType::UInt16:
-      found = findMinMax< quint16 >( mData, mHasNoDataValue, mNoDataValue,  mWidth, mHeight, minimum, minOffset, maximum, maxOffset );
-      break;
-    case Qgis::DataType::Int16:
-      found = findMinMax< qint16 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, minOffset, maximum, maxOffset );
-      break;
-    case Qgis::DataType::UInt32:
-      found = findMinMax< quint32 >( mData, mHasNoDataValue, mNoDataValue,  mWidth, mHeight, minimum, minOffset, maximum, maxOffset );
-      break;
-    case Qgis::DataType::Int32:
-      found = findMinMax< qint32 >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, minOffset, maximum, maxOffset );
-      break;
-    case Qgis::DataType::Float32:
-      found = findMinMax< float >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, minOffset, maximum, maxOffset );
-      break;
-    case Qgis::DataType::Float64:
-      found = findMinMax< double >( mData, mHasNoDataValue, mNoDataValue, mWidth, mHeight, minimum, minOffset, maximum, maxOffset );
-      break;
+  const auto [minOffset, maxOffset] = qgis_gdal::minmax_element( mData, static_cast<std::size_t>( mWidth ) * static_cast< std::size_t>( mHeight ),
+                                      QgsGdalUtils::gdalDataTypeFromQgisDataType( mDataType ), mHasNoDataValue, mNoDataValue );
 
-    case Qgis::DataType::CInt16:
-    case Qgis::DataType::CInt32:
-    case Qgis::DataType::CFloat32:
-    case Qgis::DataType::CFloat64:
-    case Qgis::DataType::ARGB32:
-    case Qgis::DataType::ARGB32_Premultiplied:
-    case Qgis::DataType::UnknownDataType:
-      QgsDebugError( QStringLiteral( "Data type %1 is not supported" ).arg( qgsEnumValueToKey< Qgis::DataType >( mDataType ) ) );
-      return false;
-  }
+  minimumRow = static_cast< int >( minOffset / mWidth );
+  minimumColumn = static_cast< int >( minOffset % mWidth );
+  minimum = value( minOffset );
 
-  if ( found )
-  {
-    minimumRow = static_cast< int >( minOffset / mWidth );
-    minimumColumn = static_cast< int >( minOffset % mWidth );
-    maximumRow = static_cast< int >( maxOffset / mWidth );
-    maximumColumn = static_cast< int >( maxOffset % mWidth );
-  }
+  maximumRow = static_cast< int >( maxOffset / mWidth );
+  maximumColumn = static_cast< int >( maxOffset % mWidth );
+  maximum = value( maxOffset );
 
-  return found;
+  return true;
 }