Skip to content

Commit b545e2f

Browse files
authored
Merge pull request #4055 from pratham-mcw:sgm-neon-optimization
stereo: performance optimization of SGBM on Windows ARM64 #4055 ### Pull Request Readiness Checklist - This PR adds an ARM64 NEON intrinsics-based optimization for the computeDisparityBinarySGBM function in stereo_binary_sgbm.cpp. - The new implementation uses NEON vector instructions (e.g., vld1q_s16, vminq_s16, vqaddq_s16), allowing for efficient parallel computation. This is guarded under the CV_NEON macro and does not affect other platforms. - This change is similar to existing SSE2 optimizations for x64 and brings the same performance benefits to ARM64. **Performance Improvements:** - The optimization significantly improves the performance of SGBM on Windows ARM64 targets. - The table below shows timing comparisons before and after the optimization: <img width="1047" height="199" alt="image" src="https://github.com/user-attachments/assets/0752cfc0-3c82-4595-8e3f-5d87cbdfdf96" /> See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch
1 parent 45dd594 commit b545e2f

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

modules/stereo/src/stereo_binary_sgbm.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ namespace cv
147147
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
148148
#endif
149149

150+
#if CV_NEON
151+
volatile bool useSIMD = checkHardwareSupport(CV_CPU_NEON);
152+
#endif
153+
150154
const int ALIGN = 16;
151155
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
152156
const int DISP_SCALE = (1 << DISP_SHIFT);
@@ -420,6 +424,69 @@ namespace cv
420424
_mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
421425
}
422426
else
427+
#elif CV_NEON
428+
if ( useSIMD )
429+
{
430+
int16x8_t vP1 = vdupq_n_s16((short)P1);
431+
int16x8_t vDelta0 = vdupq_n_s16((short)delta0);
432+
int16x8_t vDelta1 = vdupq_n_s16((short)delta1);
433+
int16x8_t vDelta2 = vdupq_n_s16((short)delta2);
434+
int16x8_t vDelta3 = vdupq_n_s16((short)delta3);
435+
int16x8_t vMinL0 = vdupq_n_s16((short)MAX_COST);
436+
int16x8_t vCpd, vL0, vL1, vL2, vL3, vL0m1, vL0p1;
437+
int16x8_t vL1m1, vL1p1, vL2m1, vL2p1, vL3m1, vL3p1;
438+
for ( d = 0; d < D; d += 8 )
439+
{
440+
vCpd = vld1q_s16(Cp + d);
441+
vL0 = vld1q_s16(Lr_p0 + d);
442+
vL1 = vld1q_s16(Lr_p1 + d);
443+
vL2 = vld1q_s16(Lr_p2 + d);
444+
vL3 = vld1q_s16(Lr_p3 + d);
445+
vL0m1 = vld1q_s16(Lr_p0 + d - 1);
446+
vL0p1 = vld1q_s16(Lr_p0 + d + 1);
447+
vL0 = vminq_s16(vL0, vqaddq_s16(vL0m1, vP1));
448+
vL0 = vminq_s16(vL0, vqaddq_s16(vL0p1, vP1));
449+
vL1m1 = vld1q_s16(Lr_p1 + d - 1);
450+
vL1p1 = vld1q_s16(Lr_p1 + d + 1);
451+
vL1 = vminq_s16(vL1, vqaddq_s16(vL1m1, vP1));
452+
vL1 = vminq_s16(vL1, vqaddq_s16(vL1p1, vP1));
453+
vL2m1 = vld1q_s16(Lr_p2 + d - 1);
454+
vL2p1 = vld1q_s16(Lr_p2 + d + 1);
455+
vL2 = vminq_s16(vL2, vqaddq_s16(vL2m1, vP1));
456+
vL2 = vminq_s16(vL2, vqaddq_s16(vL2p1, vP1));
457+
vL3m1 = vld1q_s16(Lr_p3 + d - 1);
458+
vL3p1 = vld1q_s16(Lr_p3 + d + 1);
459+
vL3 = vminq_s16(vL3, vqaddq_s16(vL3m1, vP1));
460+
vL3 = vminq_s16(vL3, vqaddq_s16(vL3p1, vP1));
461+
vL0 = vminq_s16(vL0, vDelta0);
462+
vL0 = vqaddq_s16(vCpd, vqsubq_s16(vL0, vDelta0));
463+
vL1 = vminq_s16(vL1, vDelta1);
464+
vL1 = vqaddq_s16(vCpd, vqsubq_s16(vL1, vDelta1));
465+
vL2 = vminq_s16(vL2, vDelta2);
466+
vL2 = vqaddq_s16(vCpd, vqsubq_s16(vL2, vDelta2));
467+
vL3 = vminq_s16(vL3, vDelta3);
468+
vL3 = vqaddq_s16(vCpd, vqsubq_s16(vL3, vDelta3));
469+
vst1q_s16(Lr_p + d, vL0);
470+
vst1q_s16(Lr_p + d + D2, vL1);
471+
vst1q_s16(Lr_p + d + D2 * 2, vL2);
472+
vst1q_s16(Lr_p + d + D2 * 3, vL3);
473+
int16x8_t t0 = vminq_s16(vcombine_s16(vget_low_s16(vL0), vget_low_s16(vL2)),
474+
vcombine_s16(vget_high_s16(vL0), vget_high_s16(vL2)));
475+
int16x8_t t1 = vminq_s16(vcombine_s16(vget_low_s16(vL1), vget_low_s16(vL3)),
476+
vcombine_s16(vget_high_s16(vL1), vget_high_s16(vL3)));
477+
int16x8_t t2 = vminq_s16(t0, t1);
478+
vMinL0 = vminq_s16(vMinL0, t2);
479+
int16x8_t Sval = vld1q_s16(Sp + d);
480+
int16x8_t L01 = vqaddq_s16(vL0, vL1);
481+
int16x8_t L23 = vqaddq_s16(vL2, vL3);
482+
Sval = vqaddq_s16(Sval, L01);
483+
Sval = vqaddq_s16(Sval, L23);
484+
vst1q_s16(Sp + d, Sval);
485+
}
486+
int16x4_t minL = vpmin_s16(vget_low_s16(vMinL0), vget_high_s16(vMinL0));
487+
minLr[0][xm] = vget_lane_s16(minL, 0);
488+
}
489+
else
423490
#endif
424491
{
425492
int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;

0 commit comments

Comments
 (0)