Skip to content

Commit b545e2f

Browse files
authored
Merge pull request #4055 from pratham-mcw:sgm-neon-optimization
stereo: performance optimization of SGBM on Windows ARM64 #4055 ### Pull Request Readiness Checklist - This PR adds an ARM64 NEON intrinsics-based optimization for the computeDisparityBinarySGBM function in stereo_binary_sgbm.cpp. - The new implementation uses NEON vector instructions (e.g., vld1q_s16, vminq_s16, vqaddq_s16), allowing for efficient parallel computation. This is guarded under the CV_NEON macro and does not affect other platforms. - This change is similar to existing SSE2 optimizations for x64 and brings the same performance benefits to ARM64. **Performance Improvements:** - The optimization significantly improves the performance of SGBM on Windows ARM64 targets. - The table below shows timing comparisons before and after the optimization: <img width="1047" height="199" alt="image" src="https://github.com/user-attachments/assets/0752cfc0-3c82-4595-8e3f-5d87cbdfdf96" /> See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch
1 parent 45dd594 commit b545e2f

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

modules/stereo/src/stereo_binary_sgbm.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ namespace cv
147147
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
148148
#endif
149149

150+
#if CV_NEON
151+
volatile bool useSIMD = checkHardwareSupport(CV_CPU_NEON);
152+
#endif
153+
150154
const int ALIGN = 16;
151155
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
152156
const int DISP_SCALE = (1 << DISP_SHIFT);
@@ -420,6 +424,69 @@ namespace cv
420424
_mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
421425
}
422426
else
427+
#elif CV_NEON
428+
if ( useSIMD )
429+
{
430+
int16x8_t vP1 = vdupq_n_s16((short)P1);
431+
int16x8_t vDelta0 = vdupq_n_s16((short)delta0);
432+
int16x8_t vDelta1 = vdupq_n_s16((short)delta1);
433+
int16x8_t vDelta2 = vdupq_n_s16((short)delta2);
434+
int16x8_t vDelta3 = vdupq_n_s16((short)delta3);
435+
int16x8_t vMinL0 = vdupq_n_s16((short)MAX_COST);
436+
int16x8_t vCpd, vL0, vL1, vL2, vL3, vL0m1, vL0p1;
437+
int16x8_t vL1m1, vL1p1, vL2m1, vL2p1, vL3m1, vL3p1;
438+
for ( d = 0; d < D; d += 8 )
439+
{
440+
vCpd = vld1q_s16(Cp + d);
441+
vL0 = vld1q_s16(Lr_p0 + d);
442+
vL1 = vld1q_s16(Lr_p1 + d);
443+
vL2 = vld1q_s16(Lr_p2 + d);
444+
vL3 = vld1q_s16(Lr_p3 + d);
445+
vL0m1 = vld1q_s16(Lr_p0 + d - 1);
446+
vL0p1 = vld1q_s16(Lr_p0 + d + 1);
447+
vL0 = vminq_s16(vL0, vqaddq_s16(vL0m1, vP1));
448+
vL0 = vminq_s16(vL0, vqaddq_s16(vL0p1, vP1));
449+
vL1m1 = vld1q_s16(Lr_p1 + d - 1);
450+
vL1p1 = vld1q_s16(Lr_p1 + d + 1);
451+
vL1 = vminq_s16(vL1, vqaddq_s16(vL1m1, vP1));
452+
vL1 = vminq_s16(vL1, vqaddq_s16(vL1p1, vP1));
453+
vL2m1 = vld1q_s16(Lr_p2 + d - 1);
454+
vL2p1 = vld1q_s16(Lr_p2 + d + 1);
455+
vL2 = vminq_s16(vL2, vqaddq_s16(vL2m1, vP1));
456+
vL2 = vminq_s16(vL2, vqaddq_s16(vL2p1, vP1));
457+
vL3m1 = vld1q_s16(Lr_p3 + d - 1);
458+
vL3p1 = vld1q_s16(Lr_p3 + d + 1);
459+
vL3 = vminq_s16(vL3, vqaddq_s16(vL3m1, vP1));
460+
vL3 = vminq_s16(vL3, vqaddq_s16(vL3p1, vP1));
461+
vL0 = vminq_s16(vL0, vDelta0);
462+
vL0 = vqaddq_s16(vCpd, vqsubq_s16(vL0, vDelta0));
463+
vL1 = vminq_s16(vL1, vDelta1);
464+
vL1 = vqaddq_s16(vCpd, vqsubq_s16(vL1, vDelta1));
465+
vL2 = vminq_s16(vL2, vDelta2);
466+
vL2 = vqaddq_s16(vCpd, vqsubq_s16(vL2, vDelta2));
467+
vL3 = vminq_s16(vL3, vDelta3);
468+
vL3 = vqaddq_s16(vCpd, vqsubq_s16(vL3, vDelta3));
469+
vst1q_s16(Lr_p + d, vL0);
470+
vst1q_s16(Lr_p + d + D2, vL1);
471+
vst1q_s16(Lr_p + d + D2 * 2, vL2);
472+
vst1q_s16(Lr_p + d + D2 * 3, vL3);
473+
int16x8_t t0 = vminq_s16(vcombine_s16(vget_low_s16(vL0), vget_low_s16(vL2)),
474+
vcombine_s16(vget_high_s16(vL0), vget_high_s16(vL2)));
475+
int16x8_t t1 = vminq_s16(vcombine_s16(vget_low_s16(vL1), vget_low_s16(vL3)),
476+
vcombine_s16(vget_high_s16(vL1), vget_high_s16(vL3)));
477+
int16x8_t t2 = vminq_s16(t0, t1);
478+
vMinL0 = vminq_s16(vMinL0, t2);
479+
int16x8_t Sval = vld1q_s16(Sp + d);
480+
int16x8_t L01 = vqaddq_s16(vL0, vL1);
481+
int16x8_t L23 = vqaddq_s16(vL2, vL3);
482+
Sval = vqaddq_s16(Sval, L01);
483+
Sval = vqaddq_s16(Sval, L23);
484+
vst1q_s16(Sp + d, Sval);
485+
}
486+
int16x4_t minL = vpmin_s16(vget_low_s16(vMinL0), vget_high_s16(vMinL0));
487+
minLr[0][xm] = vget_lane_s16(minL, 0);
488+
}
489+
else
423490
#endif
424491
{
425492
int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;

0 commit comments

Comments
 (0)