Skip to content

Commit 530bebd

Browse files
authored
Merge pull request #1630 from nico/neon-reversible
NEON-optimize 5-3 IDWT
2 parents 33d594d + 0186472 commit 530bebd

1 file changed

Lines changed: 24 additions & 14 deletions

File tree

src/lib/openjp2/dwt.c

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
/** Number of int32 values in an AVX2 register */
7777
#define VREG_INT_COUNT 8
7878
#else
79-
/** Number of int32 values in a SSE2 register */
79+
/** Number of int32 values in an SSE2 or NEON register */
8080
#define VREG_INT_COUNT 4
8181
#endif
8282

@@ -702,7 +702,7 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
702702
#endif
703703
}
704704

705-
#if (defined(__SSE2__) || defined(__AVX2__) || defined(__AVX512F__)) && !defined(STANDARD_SLOW_VERSION)
705+
#if (defined(__ARM_NEON) || defined(__SSE2__) || defined(__AVX2__) || defined(__AVX512F__)) && !defined(STANDARD_SLOW_VERSION)
706706

707707
/* Convenience macros to improve the readability of the formulas */
708708
#if defined(__AVX512F__)
@@ -725,6 +725,16 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
725725
#define ADD(x,y) _mm256_add_epi32((x),(y))
726726
#define SUB(x,y) _mm256_sub_epi32((x),(y))
727727
#define SAR(x,y) _mm256_srai_epi32((x),(y))
728+
#elif defined(__ARM_NEON)
729+
#define VREG int32x4_t
730+
#define LOAD_CST(x) vdupq_n_s32(x)
731+
#define LOAD(x) vld1q_s32((const int32_t*)(x))
732+
#define LOADU(x) vld1q_s32((const int32_t*)(x))
733+
#define STORE(x,y) vst1q_s32((int32_t*)(x),(y))
734+
#define STOREU(x,y) vst1q_s32((int32_t*)(x),(y))
735+
#define ADD(x,y) vaddq_s32((x),(y))
736+
#define SUB(x,y) vsubq_s32((x),(y))
737+
#define SAR(x,y) vshrq_n_s32((x),(y))
728738
#else
729739
#define VREG __m128i
730740
#define LOAD_CST(x) _mm_set1_epi32(x)
@@ -758,9 +768,9 @@ void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
758768
}
759769
}
760770

761-
/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
762-
* 16 in AVX2, when top-most pixel is on even coordinate */
763-
static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
771+
/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2 and NEON,
772+
* or 16 in AVX2, when top-most pixel is on even coordinate */
773+
static void opj_idwt53_v_cas0_mcols_SIMD(
764774
OPJ_INT32* tmp,
765775
const OPJ_INT32 sn,
766776
const OPJ_INT32 len,
@@ -865,9 +875,9 @@ static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
865875
}
866876

867877

868-
/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
869-
* 16 in AVX2, when top-most pixel is on odd coordinate */
870-
static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
878+
/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2 and NEON,
879+
* or 16 in AVX2, when top-most pixel is on odd coordinate */
880+
static void opj_idwt53_v_cas1_mcols_SIMD(
871881
OPJ_INT32* tmp,
872882
const OPJ_INT32 sn,
873883
const OPJ_INT32 len,
@@ -1107,11 +1117,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
11071117
if (dwt->cas == 0) {
11081118
/* If len == 1, unmodified value */
11091119

1110-
#if (defined(__SSE2__) || defined(__AVX2__))
1120+
#if (defined(__ARM_NEON) || defined(__SSE2__) || defined(__AVX2__))
11111121
if (len > 1 && nb_cols == PARALLEL_COLS_53) {
1112-
/* Same as below general case, except that thanks to SSE2/AVX2 */
1122+
/* Same as below general case, except that thanks to SIMD */
11131123
/* we can efficiently process 8/16 columns in parallel */
1114-
opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
1124+
opj_idwt53_v_cas0_mcols_SIMD(dwt->mem, sn, len, tiledp_col, stride);
11151125
return;
11161126
}
11171127
#endif
@@ -1150,11 +1160,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
11501160
return;
11511161
}
11521162

1153-
#if (defined(__SSE2__) || defined(__AVX2__))
1163+
#if (defined(__ARM_NEON) || defined(__SSE2__) || defined(__AVX2__))
11541164
if (len > 2 && nb_cols == PARALLEL_COLS_53) {
1155-
/* Same as below general case, except that thanks to SSE2/AVX2 */
1165+
/* Same as below general case, except that thanks to SIMD */
11561166
/* we can efficiently process 8/16 columns in parallel */
1157-
opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
1167+
opj_idwt53_v_cas1_mcols_SIMD(dwt->mem, sn, len, tiledp_col, stride);
11581168
return;
11591169
}
11601170
#endif

0 commit comments

Comments
 (0)