From 6e974d7f6ebe78e9f51313d9ffbbbf9083ad0e78 Mon Sep 17 00:00:00 2001
From: Aleksandr Voron <aleksandr.voron@intel.com>
Date: Wed, 17 Jun 2026 17:33:48 +0200
Subject: [PATCH] feat: optimized QASYMM8_SIGNED->F32 direct convolution path

Signed-off-by: Aleksandr Voron <aleksandr.voron@intel.com>
---
 .../kernels/arm_gemm/gemm_interleaved.hpp     |  70 ++-
 .../NEON/kernels/arm_gemm/quantized-fp16.cpp  |   3 +-
 src/core/NEON/kernels/arm_gemm/quantized.cpp  |  56 ++-
 .../arm_common/internal/quantized.hpp         |  12 +-
 .../kernels/assembly/arm_gemm/arm_gemm.hpp    |  14 +-
 src/cpu/operators/CpuConv2d.cpp               |  24 +-
 src/cpu/operators/CpuGemmDirectConv2d.cpp     |  37 +-
 src/cpu/operators/CpuGemmDirectConv2d.h       |   3 +-
 .../NEON/functions/NEConvolutionLayer.cpp     |   4 +-
 tests/validation/NEON/ConvolutionLayer.cpp    | 439 ++++++++++++++++++
 .../fixtures/ConvolutionLayerFixture.h        | 138 ++++++
 11 files changed, 773 insertions(+), 27 deletions(-)
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 5a2b83f527a..2a8779f933e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -281,6 +281,14 @@ void kernel_and_merge<false, false, DequantizeFloat>::run(
         offset_bias = bias + n_0;
     }
 
+    // When b_offset != 0, row sums of A are packed at the end of the A panel
+    // (appended by the quantized PrepareA transform with multiplier=1).  Read them
+    // to pass to dequantize_block_32 for per-row offset correction.
+    const int32_t *row_sum = nullptr;
+    if (dq.b_offset != 0) {
+        row_sum = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);
+    }
+
     strat.kernel(// A and B pointers are just the packed panels.
                  a_ptr, b_panel,
                  // Provide relevant part of output array and row stride.
@@ -288,7 +296,7 @@ void kernel_and_merge<false, false, DequantizeFloat>::run(
                  // M, N, K sizes
                  m_max-m_0, n_max - n_0, kern_k,
                  // Bias, activation, accumulation.  Need to offset the bias as needed.
-                 offset_col_bias, dq, offset_bias, act, accumulate, acc_buff);
+                 offset_col_bias, dq, offset_bias, act, accumulate, acc_buff, row_sum, kern_k);
 }
 
 template<>
@@ -300,7 +308,7 @@ void kernel_and_merge<true, false, DequantizeFloat>::run(
         strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *c_panel,
         Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
         unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *bias,
-        const Activation &act, bool not_first_pass, const DequantizeFloat &qp, const int32_t *,
+        const Activation &act, bool not_first_pass, const DequantizeFloat &qp, const int32_t *col_bias,
         Tab *)
 {
     const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
@@ -317,6 +325,11 @@ void kernel_and_merge<true, false, DequantizeFloat>::run(
 #ifdef CYCLE_PROFILING
         auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
 #endif
+        // When b_offset != 0, row sums are packed after the A panel data
+        const int32_t *row_sum = (qp.b_offset != 0)
+            ? reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k)
+            : nullptr;
+
         for (int i=0; i<bblocks; i++) {
             unsigned int n_start = n_0 + (strategy::out_width() * i);
             unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);
@@ -324,7 +337,8 @@ void kernel_and_merge<true, false, DequantizeFloat>::run(
             dequantize_block_32(qp, (n_end - n_start), (m_max - m_0),
                             c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
                             c_ptr + m_0 * ldc + n_start, ldc,
-                            bias != nullptr ? bias + n_start : nullptr, not_first_pass, act);
+                            bias != nullptr ? bias + n_start : nullptr, not_first_pass, act,
+                            col_bias != nullptr ? col_bias + n_start : nullptr, row_sum, kern_k);
 
         }
     }
@@ -475,6 +489,13 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
             return _Nsize * _nmulti * sizeof(int32_t);
         }
 
+        if (std::is_same<OutputStage, DequantizeFloat>::value) {
+            const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
+            if (dq->a_offset != 0) {
+                return _Nsize * _nmulti * sizeof(int32_t);
+            }
+        }
+
         return 0;
     }
 
@@ -557,6 +578,12 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
             k_depth += sizeof(int32_t) / sizeof(Tloi);
         }
 
+        if (std::is_same<OutputStage, DequantizeFloat>::value && MergeStep) {
+            // transforms_quantized always packs row sum slots (zeros when multiplier=0, actual
+            // sums when b_offset != 0).  Reserve space unconditionally when MergeStep is enabled.
+            k_depth += sizeof(int32_t) / sizeof(Tloi);
+        }
+
         return k_depth;
     }
 
@@ -647,6 +674,13 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
             return -qp->b_offset;
         }
 
+        if (std::is_same<OutputStage, DequantizeFloat>::value) {
+            const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
+            // Pack row sums into the A panel when b_offset is non-zero so that the
+            // merge step can apply the b_offset correction per output position.
+            return (dq->b_offset != 0) ? 1 : 0;
+        }
+
         return 0;
     }
 
@@ -693,6 +727,14 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
             return get_ktotal(args);
         }
 
+        // K blocking is not supported for DequantizeFloat with MergeStep when b_offset != 0,
+        // because row sums of A must cover the full K depth.  We cannot check b_offset here
+        // (static function), so we conservatively disable K-blocking for all DequantizeFloat
+        // MergeStep cases.  The working-memory cost is minimal and correctness is guaranteed.
+        if (std::is_same<OutputStage, DequantizeFloat>::value && MergeStep) {
+            return get_ktotal(args);
+        }
+
         // We can't K block non-fast FP16 cases without an accumulation buffer.
 #if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16))
         if (std::is_same<Tlo, __fp16>::value && std::is_same<Tr, __fp16>::value && !args._fast_mode && MergeStep) {
@@ -937,7 +979,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
 #endif
                             // See comment above on transform_type<> class: this extracts either 'transforms' or
                             // 'transforms_quantized' as appropriate.
-                            typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+                            typename transform_type<strategy, MergeStep && (std::is_same<OutputStage, Requantize32>::value || std::is_same<OutputStage, DequantizeFloat>::value)>::type transforms;
 
                             if (_indirect_buf != nullptr) {
                                 transforms.PrepareA_indirect(a_panel,
@@ -1027,7 +1069,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
 #endif
                     // See comment above on transform_type<> class: this extracts either 'transforms' or
                     // 'transforms_quantized' as appropriate.
-                    typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+                    typename transform_type<strategy, MergeStep && (std::is_same<OutputStage, Requantize32>::value || std::is_same<OutputStage, DequantizeFloat>::value)>::type transforms;
 
                     for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                         unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
@@ -1060,6 +1102,10 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
 
                     if(std::is_same<OutputStage, Requantize32>::value) {
                         a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Tloi));
+                    } else if (std::is_same<OutputStage, DequantizeFloat>::value) {
+                        // transforms_quantized always packs row-sum slots (zeros when b_offset=0,
+                        // actual sums when b_offset != 0), so the stride must include the slot.
+                        a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Tloi));
                     } else {
                         a_panel_stride = kern_k;
                     }
@@ -1212,6 +1258,20 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
                 compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
             }
         }
+
+        if (std::is_same<OutputStage, DequantizeFloat>::value) {
+            const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
+            if (dq->a_offset != 0) {
+                // Compute raw column sums of B (weight matrix) for use in a_offset correction.
+                // dequantize_block_32 applies: -a_offset * col_sums[n] * scale per output channel.
+                col_bias = reinterpret_cast<int32_t *>(in_buffer);
+                for (unsigned int i = 0; i < _nmulti; ++i) {
+                    compute_raw_col_sums(_Nsize, _Ksize * _Ksections,
+                                         B + (i * B_multi_stride), ldb,
+                                         col_bias + (i * _Nsize));
+                }
+            }
+        }
     }
 
     // Support for transposed B is a property of the strategy::transpose type
diff --git a/src/core/NEON/kernels/arm_gemm/quantized-fp16.cpp b/src/core/NEON/kernels/arm_gemm/quantized-fp16.cpp
index 3647fbc8a12..a66bc3b3957 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized-fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized-fp16.cpp
@@ -35,7 +35,8 @@ namespace arm_gemm {
 template<>
 void dequantize_block_32<__fp16>(const DequantizeFloat &qp, unsigned int width, unsigned int height,
                                  const int32_t * in_ptr, unsigned int in_stride, __fp16 *out_ptr, unsigned int out_stride,
-                                 const __fp16 * bias_ptr, bool not_first_pass, const Activation &act)
+                                 const __fp16 * bias_ptr, bool not_first_pass, const Activation &act,
+                                 const int32_t * /*col_bias*/, const int32_t * /*row_sum*/, int32_t /*k_total*/)
 {
     const float32x4_t vscale = vdupq_n_f32(qp.scale);
     float maxval = std::numeric_limits<float>::infinity();
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index 0243ee14a3b..ae47c157621 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -973,10 +973,28 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
 template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
 template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
 
+template<typename T>
+void compute_raw_col_sums(unsigned int width, unsigned int height,
+                          const T *input, unsigned int in_stride, int32_t *col_sums)
+{
+    memset(reinterpret_cast<void *>(col_sums), 0, width * sizeof(int32_t));
+    for (unsigned int row = 0; row < height; ++row)
+    {
+        for (unsigned int col = 0; col < width; ++col)
+        {
+            col_sums[col] += static_cast<int32_t>(input[row * in_stride + col]);
+        }
+    }
+}
+
+template void compute_raw_col_sums(unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_sums);
+template void compute_raw_col_sums(unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_sums);
+
 template<>
 void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, unsigned int height,
                          const int32_t* in_ptr, unsigned int in_stride, float *out_ptr, unsigned int out_stride,
-                         const float* bias_ptr, bool accumulate, const Activation &act)
+                         const float* bias_ptr, bool accumulate, const Activation &act,
+                         const int32_t *col_bias, const int32_t *row_sum, int32_t k_total)
 {
     const float32x4_t vscale = vdupq_n_f32(qp.scale);
     float maxval = std::numeric_limits<float>::infinity();
@@ -1000,14 +1018,38 @@ void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, u
     for(unsigned int row=0; row<height; row++) {
         auto row_in_ptr = in_ptr + (row * in_stride);
         auto row_out_ptr = out_ptr + (row * out_stride);
+
+        // Per-row addend from b_offset correction: -b_offset * sum_a_row[m] * scale
+        // row_sum values are packed with multiplier=1, so entry is plain sum(a_row[k]).
+        // Also add the cross-term: +a_offset * b_offset * K * scale (constant per tile).
+        float row_offset = 0.0f;
+        if (row_sum != nullptr) {
+            row_offset += static_cast<float>(-qp.b_offset * row_sum[row]) * qp.scale;
+        }
+        if (col_bias != nullptr && row_sum != nullptr && k_total != 0) {
+            // Cross-term: +a_offset * b_offset * K * scale
+            row_offset += static_cast<float>(qp.a_offset) * static_cast<float>(qp.b_offset)
+                          * static_cast<float>(k_total) * qp.scale;
+        }
+        const float32x4_t vrow_offset = vdupq_n_f32(row_offset);
+
         unsigned int col=0;
         if (width >= 4) {
             for(; col <= (width - 4); col+= 4) {
                 const int32x4_t vin = vld1q_s32(row_in_ptr + col);
                 float32x4_t vdeq = vmulq_f32(vcvtq_f32_s32(vin), vscale);
                 if(bias_ptr) {
-                    const float32x4_t bin = vld1q_f32(bias_ptr + col);
-                    vdeq = vaddq_f32(vdeq, bin);
+                    vdeq = vaddq_f32(vdeq, vld1q_f32(bias_ptr + col));
+                }
+                if(col_bias) {
+                    // a_offset correction: -a_offset * sum_b_col[n] * scale
+                    const float32x4_t vcol_corr = vmulq_f32(
+                        vcvtq_f32_s32(vld1q_s32(col_bias + col)),
+                        vdupq_n_f32(static_cast<float>(-qp.a_offset) * qp.scale));
+                    vdeq = vaddq_f32(vdeq, vcol_corr);
+                }
+                if(row_sum) {
+                    vdeq = vaddq_f32(vdeq, vrow_offset);
                 }
                 if(accumulate) {
                     vdeq = vaddq_f32(vdeq, vld1q_f32(row_out_ptr + col));
@@ -1019,10 +1061,16 @@ void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, u
         // left-over elements
         for(; col < width; ++col) {
             const int32_t val = *(row_in_ptr + col);
-            float res = static_cast<float>(val * qp.scale);
+            float res = static_cast<float>(val) * qp.scale;
             if(bias_ptr) {
                 res += static_cast<float>(*(bias_ptr + col));
             }
+            if(col_bias) {
+                res += static_cast<float>(-qp.a_offset * col_bias[col]) * qp.scale;
+            }
+            if(row_sum) {
+                res += row_offset;
+            }
             if(accumulate) {
                 res += *(row_out_ptr + col);
             }
diff --git a/src/cpu/kernels/assembly/arm_common/internal/quantized.hpp b/src/cpu/kernels/assembly/arm_common/internal/quantized.hpp
index 796da4e2e19..64985e7b9ba 100644
--- a/src/cpu/kernels/assembly/arm_common/internal/quantized.hpp
+++ b/src/cpu/kernels/assembly/arm_common/internal/quantized.hpp
@@ -41,6 +41,14 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
                       const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth,
                       unsigned int multi, unsigned int first_col);
 
+/** Compute raw column sums of a matrix: col_sums[n] = sum_{k} input[k * in_stride + n].
+ *  Unlike compute_col_sums(), this does not apply any quantization offsets or scaling —
+ *  it stores the plain integer sums for use as weight column reductions in the
+ *  DequantizeFloat a_offset correction path. */
+template<typename T>
+void compute_raw_col_sums(unsigned int width, unsigned int height,
+                          const T *input, unsigned int in_stride, int32_t *col_sums);
+
 template<typename T>
 void row_sums_indirect(size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
                        size_t M, int32_t *output_ptr, const Requantize32 *qp);
@@ -48,6 +56,8 @@ void row_sums_indirect(size_t num_strings, const unsigned int *string_lengths, I
 template<typename T>
 void dequantize_block_32(const DequantizeFloat &qp, unsigned int width, unsigned int height,
                          const int32_t* input, unsigned int in_stride, T *output, unsigned int out_stride,
-                         const T *row_bias, bool not_first_pass, const Activation &act);
+                         const T *row_bias, bool not_first_pass, const Activation &act,
+                         const int32_t *col_bias = nullptr, const int32_t *row_sum = nullptr,
+                         int32_t k_total = 0);
 
 } // namespace arm_gemm
diff --git a/src/cpu/kernels/assembly/arm_gemm/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm/arm_gemm.hpp
index 8bae014a757..cdc2a42682d 100644
--- a/src/cpu/kernels/assembly/arm_gemm/arm_gemm.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm/arm_gemm.hpp
@@ -240,12 +240,20 @@ struct Requantize32
 struct DequantizeFloat
 {
 public:
-    float scale = 0;
+    float   scale    = 0;
+    int32_t a_offset = 0; // input quantization zero-point  (subtract from each input value)
+    int32_t b_offset = 0; // weight quantization zero-point (subtract from each weight value)
 
     DequantizeFloat() = default;
 
-    // Constructor
-    DequantizeFloat(const float scale) : scale(scale)
+    // Constructor without offset (symmetric quantization)
+    DequantizeFloat(const float scale) : scale(scale), a_offset(0), b_offset(0)
+    {
+    }
+
+    // Constructor with asymmetric quantization offsets
+    DequantizeFloat(const float scale, int32_t a_offset, int32_t b_offset)
+        : scale(scale), a_offset(a_offset), b_offset(b_offset)
     {
     }
 };
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
index adc4486889c..a2c950792e2 100644
--- a/src/cpu/operators/CpuConv2d.cpp
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023-2025 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2026 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,13 +59,14 @@ void CpuConv2d::configure(ITensorInfo               *input,
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_UNUSED(num_groups);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation,
-                                                   act_info, enable_fast_math, num_groups));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                            enable_fast_math, num_groups));
 
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
                            enable_fast_math, num_groups);
 
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups, weights_info);
     switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
                                               enable_fast_math))
     {
@@ -119,7 +120,8 @@ Status CpuConv2d::validate(const ITensorInfo         *input,
     ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::validate");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
 
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups, weights_info);
+
     switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
                                               enable_fast_math))
     {
@@ -155,7 +157,17 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo         *i
                                                     bool                       enable_fast_math)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
-    ARM_COMPUTE_UNUSED(weights_info);
+
+    // For QASYMM8_SIGNED→F32 with NHWC and no dilation, automatically select the single-kernel
+    // CpuGemmDirectConv2d path when it validates successfully.
+    if (input->data_type() == DataType::QASYMM8_SIGNED && output->data_type() == DataType::F32)
+    {
+        const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1, weights_info);
+        if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
+        {
+            return ConvolutionMethod::GEMM_CONV2D;
+        }
+    }
 
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp
index caac7761e27..4024207add5 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -44,6 +44,11 @@ using namespace arm_compute::utils::cast;
 
 namespace
 {
+inline bool is_direct_i8_s8_f32_path(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return src->data_type() == DataType::QASYMM8_SIGNED && dst->data_type() == DataType::F32;
+}
+
 GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo         *src,
                                                         const ITensorInfo         *weights,
                                                         const ITensorInfo         *dst,
@@ -130,7 +135,15 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src,
 
     // Configure assembly dispatch
     cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
-    if (is_data_type_quantized(src->data_type()))
+    if (is_direct_i8_s8_f32_path(src, dst))
+    {
+        // Provide the quantization zero-points via AsmGemmInfo so that create_arm_gemm_dequant
+        // can bake them into the DequantizeFloat output stage.  The assembly kernel then handles
+        // all offset corrections (col sums, row sums, cross-term) natively.
+        asm_info.dequant_a_offset = src->quantization_info().uniform().offset;
+        asm_info.dequant_b_offset = weights->quantization_info().uniform().offset;
+    }
+    else if (is_data_type_quantized(src->data_type()))
     {
         asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
     }
@@ -189,6 +202,17 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
     ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
+    const bool direct_i8_f32 = is_direct_i8_s8_f32_path(src, dst);
+
+    if (direct_i8_f32)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F32);
+        // Offset correction (a_offset, b_offset) is handled inside the assembly kernel via the
+        // extended DequantizeFloat output stage — no separate operator-level validation needed.
+    }
+
     // Validate Permute
     TensorInfo perm_weights;
     ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &perm_weights, PermutationVector{3, 0, 1, 2}));
@@ -204,7 +228,11 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
     // Validate biases
     if (biases != nullptr)
     {
-        if (is_data_type_quantized_asymmetric(data_type))
+        if (direct_i8_f32)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        }
+        else if (is_data_type_quantized_asymmetric(data_type))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -216,7 +244,10 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    if (!direct_i8_f32)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    }
 
     cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
     ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info));
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h
index a7365615b94..2ab9ef531a6 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.h
+++ b/src/cpu/operators/CpuGemmDirectConv2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2026 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,6 +55,7 @@ class CpuGemmDirectConv2d : public ICpuOperator
      * |:--------------|:--------------|:--------------|:--------------|
      * |QASYMM8        |QASYMM8        |S32            |QASYMM8        |
      * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32            |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |F32            |F32            |
      * |F16            |F16            |F16            |F16            |
      * |F32            |F32            |F32            |F32            |
      * |BFLOAT16       |BFLOAT16       |BFLOAT16       |BFLOAT16       |
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index c4f28079852..d9061b4c355 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023-2025 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2026 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -83,7 +83,6 @@ void NEConvolutionLayer::configure(ITensor                   *input,
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
                            enable_fast_math, num_groups);
 
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
     switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
                                                    weights_info, dilation, act_info, enable_fast_math))
     {
@@ -134,7 +133,6 @@ Status NEConvolutionLayer::validate(const ITensorInfo         *input,
                                     unsigned int               num_groups)
 {
     ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "NEConvolutionLayer::validate");
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_DYNAMIC_SHAPE(input, weights, biases, output);
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 16eff388d6c..8371e0c5981 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -311,6 +311,445 @@ TEST_SUITE_END() // QASYMM8_SIGNED
 
 // clang-format on
 // *INDENT-ON*
+
+#ifdef __aarch64__
+// *INDENT-OFF*
+// clang-format off
+TEST_SUITE(DirectI8S8F32)
+
+/** Validate accepts QASYMM8_SIGNED→F32 with both zero and non-zero offsets.
+ *  Shapes are in NHWC format: [C, W, H] for input/output, [Cin, Kw, Kh, Cout] for weights. */
+DATA_TEST_CASE(
+    Validate,
+    framework::DatasetMode::ALL,
+    zip(make("SrcInfo",   { TensorInfo(TensorShape(16U, 8U, 8U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),
+                            TensorInfo(TensorShape(32U, 4U, 4U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),
+                            TensorInfo(TensorShape(16U, 8U, 8U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),
+                            TensorInfo(TensorShape(16U, 8U, 8U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC) }),
+        make("WgtInfo",   { TensorInfo(TensorShape(16U, 3U, 3U, 8U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),
+                            TensorInfo(TensorShape(32U, 1U, 1U, 16U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),
+                            TensorInfo(TensorShape(16U, 3U, 3U, 8U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),
+                            TensorInfo(TensorShape(16U, 3U, 3U, 8U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC) }),
+        make("BiasInfo",  { TensorInfo(TensorShape(8U), 1, DataType::F32),
+                            TensorInfo(TensorShape(16U), 1, DataType::F32),
+                            TensorInfo(TensorShape(8U), 1, DataType::F32),
+                            TensorInfo(TensorShape(8U), 1, DataType::F32) }),
+        make("DstInfo",   { TensorInfo(TensorShape(8U, 6U, 6U), 1, DataType::F32, DataLayout::NHWC),
+                            TensorInfo(TensorShape(16U, 4U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                            TensorInfo(TensorShape(8U, 6U, 6U), 1, DataType::F32, DataLayout::NHWC),
+                            TensorInfo(TensorShape(8U, 6U, 6U), 1, DataType::F32, DataLayout::NHWC) }),
+        make("SrcOffset", { 0,  0,  5, -10 }),
+        make("WgtOffset", { 0,  0,  0,   3 }),
+        make("Expected",  { true, true, true, true })),
+    src_info_const, wgt_info_const, bias_info_const, dst_info_const,
+    src_offset, wgt_offset, expected)
+{
+    TensorInfo src_info  = src_info_const;
+    TensorInfo wgt_info  = wgt_info_const;
+    TensorInfo bias_info = bias_info_const;
+    TensorInfo dst_info  = dst_info_const;
+    src_info.set_quantization_info(QuantizationInfo(0.25f, src_offset));
+    wgt_info.set_quantization_info(QuantizationInfo(0.125f, wgt_offset));
+
+    const Status s = NEConvolutionLayer::validate(&src_info, &wgt_info, &bias_info, &dst_info,
+                                                  PadStrideInfo(1, 1, 0, 0),
+                                                  WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(),
+                                                  false /*fast_math*/, 1 /*num_groups*/);
+    ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
+}
+
+/** Verify GEMM_CONV2D is returned automatically for NHWC QASYMM8_SIGNED→F32. */
+TEST_CASE(GetConvMethodDirectI8F32, framework::DatasetMode::ALL)
+{
+    const QuantizationInfo qi(0.25f, 0);
+    TensorInfo src_info(TensorShape(16U, 8U, 8U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC);
+    TensorInfo wgt_info(TensorShape(16U, 3U, 3U, 8U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC);
+    TensorInfo dst_info(TensorShape(8U, 6U, 6U), 1, DataType::F32, DataLayout::NHWC);
+    src_info.set_quantization_info(qi);
+    wgt_info.set_quantization_info(qi);
+
+    const ConvolutionMethod m = NEConvolutionLayer::get_convolution_method(
+        &src_info.clone()->set_is_resizable(true),
+        &wgt_info.clone()->set_is_resizable(true),
+        &dst_info.clone()->set_is_resizable(true),
+        PadStrideInfo(1, 1, 0, 0),
+        WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(),
+        false /*fast_math*/);
+    ARM_COMPUTE_EXPECT(m == ConvolutionMethod::GEMM_CONV2D, framework::LogLevel::ERRORS);
+}
+
+using NEDirectI8S8F32ConvolutionFixture = DirectI8S8F32ConvolutionFixture<Tensor, Accessor, NEConvolutionLayer>;
+
+/** Small dataset: verify direct i8->f32 path produces correct output vs CPU reference.
+ *  Shapes are in NCHW format [W, H, C] for input/output, [Kw, Kh, Cin, Cout] for weights.
+ *  InputShape/WeightsShape/BiasShape/OutputShape are zipped — each row is one consistent config. */
+FIXTURE_DATA_TEST_CASE(
+    RunSmall,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(16U, 16U, 32U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(3U, 3U, 32U, 16U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(16U) }),
+            make("OutputShape",  { TensorShape(6U, 6U, 8U), TensorShape(14U, 14U, 16U) })
+        ),
+        make("ConvInfo",     { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",     { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",   { DataLayout::NHWC }),
+        ActivationFunctionsDataset,
+        make("InputQI",      { QuantizationInfo(0.25f, 0) }),
+        make("WeightsQI",    { QuantizationInfo(0.125f, 0) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Stride-2 3x3 convolution. */
+FIXTURE_DATA_TEST_CASE(
+    RunStride2,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        make("InputShape",   { TensorShape(14U, 14U, 32U) }),
+        make("WeightsShape", { TensorShape(3U, 3U, 32U, 16U) }),
+        make("BiasShape",    { TensorShape(16U) }),
+        make("OutputShape",  { TensorShape(6U, 6U, 16U) }),
+        make("ConvInfo",     { PadStrideInfo(2, 2, 0, 0) }),
+        make("Dilation",     { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",   { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }),
+        make("InputQI",      { QuantizationInfo(0.25f, 0) }),
+        make("WeightsQI",    { QuantizationInfo(0.125f, 0) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** 1x1 convolution. */
+FIXTURE_DATA_TEST_CASE(
+    Run1x1,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        make("InputShape",   { TensorShape(8U, 8U, 32U) }),
+        make("WeightsShape", { TensorShape(1U, 1U, 32U, 16U) }),
+        make("BiasShape",    { TensorShape(16U) }),
+        make("OutputShape",  { TensorShape(8U, 8U, 16U) }),
+        make("ConvInfo",     { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",     { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",   { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo() }),
+        make("InputQI",      { QuantizationInfo(0.25f, 0) }),
+        make("WeightsQI",    { QuantizationInfo(0.125f, 0) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Padded convolution. */
+FIXTURE_DATA_TEST_CASE(
+    RunPadded,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        make("InputShape",   { TensorShape(8U, 8U, 32U) }),
+        make("WeightsShape", { TensorShape(3U, 3U, 32U, 16U) }),
+        make("BiasShape",    { TensorShape(16U) }),
+        make("OutputShape",  { TensorShape(8U, 8U, 16U) }),
+        make("ConvInfo",     { PadStrideInfo(1, 1, 1, 1) }),
+        make("Dilation",     { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",   { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f) }),
+        make("InputQI",      { QuantizationInfo(0.25f, 0) }),
+        make("WeightsQI",    { QuantizationInfo(0.125f, 0) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Non-zero offsets: input offset only.
+ *  Uses the same fixture with QuantizationInfo passed explicitly.
+ *  Shapes in NCHW: [W,H,C] / [Kw,Kh,Cin,Cout]. */
+FIXTURE_DATA_TEST_CASE(
+    RunWithInputOffset,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(8U, 8U, 16U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(1U, 1U, 16U, 8U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(8U) }),
+            make("OutputShape",  { TensorShape(6U, 6U, 8U), TensorShape(8U, 8U, 8U) })
+        ),
+        make("ConvInfo",     { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",     { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",   { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo() }),
+        make("InputQI",      { QuantizationInfo(0.25f, -10) }),   // non-zero input offset
+        make("WeightsQI",    { QuantizationInfo(0.125f, 0) })     // zero weight offset
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Non-zero offsets: weight offset only. */
+FIXTURE_DATA_TEST_CASE(
+    RunWithWeightOffset,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(8U, 8U, 16U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(1U, 1U, 16U, 8U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(8U) }),
+            make("OutputShape",  { TensorShape(6U, 6U, 8U), TensorShape(8U, 8U, 8U) })
+        ),
+        make("ConvInfo",     { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",     { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",   { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo() }),
+        make("InputQI",      { QuantizationInfo(0.25f, 0) }),     // zero input offset
+        make("WeightsQI",    { QuantizationInfo(0.125f, 5) })     // non-zero weight offset
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Non-zero offsets on both tensors, with padding and activation. */
+FIXTURE_DATA_TEST_CASE(
+    RunWithBothOffsets,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(8U, 8U, 32U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(3U, 3U, 32U, 16U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(16U) }),
+            make("OutputShape",  { TensorShape(8U, 8U, 8U), TensorShape(8U, 8U, 16U) })
+        ),
+        make("ConvInfo",     { PadStrideInfo(1, 1, 1, 1) }),  // same-size output via padding
+        make("Dilation",     { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",   { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }),
+        make("InputQI",      { QuantizationInfo(0.25f, -20) }),   // non-zero input offset
+        make("WeightsQI",    { QuantizationInfo(0.125f, 10) })    // non-zero weight offset
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** BOUNDED_RELU activation with non-zero input offset.
+ *  Exercises that the activation clips the post-correction float value correctly. */
+FIXTURE_DATA_TEST_CASE(
+    RunWithInputOffsetAndBoundedRelu,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(8U, 8U, 16U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(1U, 1U, 16U, 8U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(8U) }),
+            make("OutputShape",  { TensorShape(6U, 6U, 8U), TensorShape(8U, 8U, 8U) })
+        ),
+        make("ConvInfo",       { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",       { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",     { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, -6.0f) }),
+        make("InputQI",        { QuantizationInfo(0.25f, -10) }),
+        make("WeightsQI",      { QuantizationInfo(0.125f, 0) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** BOUNDED_RELU activation with non-zero weight offset. */
+FIXTURE_DATA_TEST_CASE(
+    RunWithWeightOffsetAndActivation,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(8U, 8U, 32U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(3U, 3U, 32U, 16U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(16U) }),
+            make("OutputShape",  { TensorShape(6U, 6U, 8U), TensorShape(6U, 6U, 16U) })
+        ),
+        make("ConvInfo",       { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",       { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",     { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 4.0f) }),
+        make("InputQI",        { QuantizationInfo(0.25f, 0) }),
+        make("WeightsQI",      { QuantizationInfo(0.125f, 5) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Stride-2 convolution with non-zero offsets.
+ *  Stride changes which input patches contribute to each output position, exercising the
+ *  per-output row-sum computation path with stride != 1. */
+FIXTURE_DATA_TEST_CASE(
+    RunStride2WithOffsets,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(14U, 14U, 16U), TensorShape(14U, 14U, 32U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(3U, 3U, 32U, 16U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(16U) }),
+            make("OutputShape",  { TensorShape(6U, 6U, 8U), TensorShape(6U, 6U, 16U) })
+        ),
+        make("ConvInfo",       { PadStrideInfo(2, 2, 0, 0) }),
+        make("Dilation",       { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",     { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }),
+        make("InputQI",        { QuantizationInfo(0.25f, -15) }),
+        make("WeightsQI",      { QuantizationInfo(0.125f, 8) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** 1×1 kernel with both non-zero offsets.
+ *  K = Cin only (no spatial footprint), ensuring the K-total used in the cross-term is correct. */
+FIXTURE_DATA_TEST_CASE(
+    Run1x1WithBothOffsets,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(8U, 8U, 32U) }),
+            make("WeightsShape", { TensorShape(1U, 1U, 16U, 8U), TensorShape(1U, 1U, 32U, 16U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(16U) }),
+            make("OutputShape",  { TensorShape(8U, 8U, 8U), TensorShape(8U, 8U, 16U) })
+        ),
+        make("ConvInfo",       { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",       { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",     { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }),
+        make("InputQI",        { QuantizationInfo(0.25f, -20) }),
+        make("WeightsQI",      { QuantizationInfo(0.125f, 10) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Positive input offset + stride-2 + padding: exercises the row-sum computation
+ *  for a strided conv where each output position sees a distinct input patch.
+ *  Also covers a positive (not negative) input offset sign. */
+FIXTURE_DATA_TEST_CASE(
+    RunStride2PaddedWithInputOffset,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(14U, 14U, 16U), TensorShape(14U, 14U, 32U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(3U, 3U, 32U, 16U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(16U) }),
+            make("OutputShape",  { TensorShape(7U, 7U, 8U), TensorShape(7U, 7U, 16U) })
+        ),
+        make("ConvInfo",       { PadStrideInfo(2, 2, 1, 1) }),
+        make("Dilation",       { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",     { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }),
+        make("InputQI",        { QuantizationInfo(0.25f, 12) }),  // positive input offset
+        make("WeightsQI",      { QuantizationInfo(0.125f, 0) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Negative weight offset — exercises the sign of the b_offset correction term. */
+FIXTURE_DATA_TEST_CASE(
+    RunWithNegativeWeightOffset,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(8U, 8U, 32U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(3U, 3U, 32U, 16U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(16U) }),
+            make("OutputShape",  { TensorShape(6U, 6U, 8U), TensorShape(6U, 6U, 16U) })
+        ),
+        make("ConvInfo",       { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",       { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",     { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo(),
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }),
+        make("InputQI",        { QuantizationInfo(0.25f, 0) }),
+        make("WeightsQI",      { QuantizationInfo(0.125f, -8) })
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+/** Large offsets (both near ±127) — stress-tests the cross-term magnitude.
+ *  Large a_offset * b_offset * K can be a large fraction of the raw accumulator;
+ *  any sign or magnitude error is highly visible. */
+FIXTURE_DATA_TEST_CASE(
+    RunWithLargeOffsets,
+    NEDirectI8S8F32ConvolutionFixture,
+    framework::DatasetMode::ALL,
+    combine(
+        zip(
+            make("InputShape",   { TensorShape(8U, 8U, 16U), TensorShape(8U, 8U, 32U) }),
+            make("WeightsShape", { TensorShape(3U, 3U, 16U, 8U), TensorShape(3U, 3U, 32U, 16U) }),
+            make("BiasShape",    { TensorShape(8U), TensorShape(16U) }),
+            make("OutputShape",  { TensorShape(6U, 6U, 8U), TensorShape(6U, 6U, 16U) })
+        ),
+        make("ConvInfo",       { PadStrideInfo(1, 1, 0, 0) }),
+        make("Dilation",       { Size2D(1U, 1U) }),
+        make("ReshapeWeights", { true }),
+        make("DataLayout",     { DataLayout::NHWC }),
+        make("ActivationInfo", { ActivationLayerInfo() }),
+        make("InputQI",        { QuantizationInfo(0.01f, 100) }),   // large positive input offset
+        make("WeightsQI",      { QuantizationInfo(0.01f, -100) })   // large negative weight offset
+    )
+)
+{
+    validate(Accessor(_target), _reference, rel_tolerance_f32, tolerance_num_dequantize_f32, float(abs_tolerance_f32));
+}
+
+TEST_SUITE_END() // DirectI8S8F32
+// clang-format on
+// *INDENT-ON*
+#endif // #ifdef __aarch64__
+
 TEST_SUITE_END() // ConvolutionLayer
 
 /*
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index 2ccefa54575..1f3c7e6c319 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -1320,6 +1320,144 @@ class HasOptImplFixture : public framework::Fixture
 };
 #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
 
+/** Fixture for testing the single-kernel i8->fp32 direct convolution path.
+ *
+ *  Supports NHWC, no dilation, arbitrary quantization offsets.
+ *  Compares target (direct single-kernel) against CPU float reference.
+ */
+template <typename TensorType, typename AccessorType, typename FunctionType>
+class DirectI8S8F32ConvolutionFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape         input_shape_nchw,
+               TensorShape         weights_shape_nchw,
+               TensorShape         bias_shape,
+               TensorShape         output_shape_nchw,
+               PadStrideInfo       conv_info,
+               Size2D              dilation,
+               bool                reshape_weights,
+               DataLayout          data_layout,
+               ActivationLayerInfo act_info,
+               QuantizationInfo    input_qi   = QuantizationInfo(0.25f, 0),
+               QuantizationInfo    weights_qi = QuantizationInfo(0.125f, 0))
+    {
+        ARM_COMPUTE_EXPECT_EQUAL(data_layout, DataLayout::NHWC, framework::LogLevel::ERRORS);
+        _data_layout = data_layout;
+        _input_qi    = input_qi;
+        _weights_qi  = weights_qi;
+
+        _hash = static_cast<int32_t>(input_shape_nchw[0] + input_shape_nchw[1] + input_shape_nchw[2] +
+                                     weights_shape_nchw[0] + weights_shape_nchw[1] +
+                                     static_cast<int32_t>(input_qi.uniform().offset) +
+                                     static_cast<int32_t>(weights_qi.uniform().offset));
+
+        _target    = compute_target(input_shape_nchw, weights_shape_nchw, bias_shape, output_shape_nchw, conv_info,
+                                    reshape_weights, dilation, act_info);
+        _reference = compute_reference(input_shape_nchw, weights_shape_nchw, bias_shape, output_shape_nchw, conv_info,
+                                       dilation, act_info);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch (tensor.data_type())
+        {
+            case DataType::QASYMM8_SIGNED:
+            {
+                std::uniform_int_distribution<int32_t> dist(-127, 127);
+                library->fill(tensor, dist, i);
+                break;
+            }
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+                library->fill(tensor, dist, i);
+                break;
+            }
+            default:
+                library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    TensorType compute_target(TensorShape               input_shape,
+                              TensorShape               weights_shape,
+                              const TensorShape        &bias_shape,
+                              TensorShape               output_shape,
+                              const PadStrideInfo      &conv_info,
+                              bool                      reshape_weights,
+                              const Size2D             &dilation,
+                              const ActivationLayerInfo act_info)
+    {
+        // Convert NCHW→NHWC
+        permute(input_shape, PermutationVector(2U, 0U, 1U));
+        permute(weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(output_shape, PermutationVector(2U, 0U, 1U));
+
+        const int idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+        const int idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+        WeightsInfo weights_info(!reshape_weights, weights_shape[idx_w], weights_shape[idx_h], weights_shape[3]);
+
+        TensorType src = create_tensor<TensorType>(input_shape, DataType::QASYMM8_SIGNED, 1, _input_qi, _data_layout);
+        TensorType wgt = create_tensor<TensorType>(weights_shape, DataType::QASYMM8_SIGNED, 1, _weights_qi,
+                                                   _data_layout);
+        TensorType bias = create_tensor<TensorType>(bias_shape, DataType::F32, 1, QuantizationInfo(), _data_layout);
+        TensorType dst  = create_tensor<TensorType>(output_shape, DataType::F32, 1, QuantizationInfo(), _data_layout);
+
+        FunctionType conv;
+        conv.configure(&src, &wgt, &bias, &dst, conv_info, weights_info, dilation, act_info,
+                       false /*enable_fast_math*/, 1 /*num_groups*/);
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        src.allocator()->allocate();
+        wgt.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        fill(AccessorType(src), 0 + _hash);
+        fill(AccessorType(wgt), 1 + _hash);
+        fill(AccessorType(bias), 2 + _hash);
+
+        conv.run();
+        return dst;
+    }
+
+    SimpleTensor<float> compute_reference(TensorShape               input_shape,
+                                          TensorShape               weights_shape,
+                                          const TensorShape        &bias_shape,
+                                          TensorShape               output_shape,
+                                          const PadStrideInfo      &conv_info,
+                                          const Size2D             &dilation,
+                                          const ActivationLayerInfo act_info)
+    {
+        // Reference runs in NCHW with float input (dequantized) to match what the kernel should produce.
+        SimpleTensor<int8_t> src_q{input_shape, DataType::QASYMM8_SIGNED, 1, _input_qi};
+        SimpleTensor<int8_t> wgt_q{weights_shape, DataType::QASYMM8_SIGNED, 1, _weights_qi};
+        SimpleTensor<float>  bias_f{bias_shape, DataType::F32};
+
+        fill(src_q, 0 + _hash);
+        fill(wgt_q, 1 + _hash);
+        fill(bias_f, 2 + _hash);
+
+        auto conv = reference::convolution_layer<int8_t, int8_t, float, float>(src_q, wgt_q, bias_f, output_shape,
+                                                                                conv_info, dilation, 1 /*num_groups*/,
+                                                                                QuantizationInfo());
+        if (act_info.enabled())
+        {
+            return reference::activation_layer<float>(conv, act_info);
+        }
+        return conv;
+    }
+
+    TensorType          _target{};
+    SimpleTensor<float> _reference{};
+    DataLayout          _data_layout{DataLayout::NHWC};
+    QuantizationInfo    _input_qi{};
+    QuantizationInfo    _weights_qi{};
+    int32_t             _hash{0};
+};
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute