7676/** Number of int32 values in a AVX2 register */
7777#define VREG_INT_COUNT 8
7878#else
79- /** Number of int32 values in a SSE2 register */
79+ /** Number of int32 values in a SSE2 or NEON register */
8080#define VREG_INT_COUNT 4
8181#endif
8282
@@ -702,7 +702,7 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
702702#endif
703703}
704704
705- #if (defined(__SSE2__ ) || defined(__AVX2__ ) || defined(__AVX512F__ )) && !defined(STANDARD_SLOW_VERSION )
705+ #if (defined(__ARM_NEON ) || defined( __SSE2__ ) || defined(__AVX2__ ) || defined(__AVX512F__ )) && !defined(STANDARD_SLOW_VERSION )
706706
707707/* Conveniency macros to improve the readability of the formulas */
708708#if defined(__AVX512F__ )
@@ -725,6 +725,16 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
725725#define ADD (x ,y ) _mm256_add_epi32((x),(y))
726726#define SUB (x ,y ) _mm256_sub_epi32((x),(y))
727727#define SAR (x ,y ) _mm256_srai_epi32((x),(y))
728+ #elif defined(__ARM_NEON )
729+ #define VREG int32x4_t
730+ #define LOAD_CST (x ) vdupq_n_s32(x)
731+ #define LOAD (x ) vld1q_s32((const int32_t*)(x))
732+ #define LOADU (x ) vld1q_s32((const int32_t*)(x))
733+ #define STORE (x ,y ) vst1q_s32((int32_t*)(x),(y))
734+ #define STOREU (x ,y ) vst1q_s32((int32_t*)(x),(y))
735+ #define ADD (x ,y ) vaddq_s32((x),(y))
736+ #define SUB (x ,y ) vsubq_s32((x),(y))
737+ #define SAR (x ,y ) vshrq_n_s32((x),(y))
728738#else
729739#define VREG __m128i
730740#define LOAD_CST (x ) _mm_set1_epi32(x)
@@ -758,9 +768,9 @@ void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
758768 }
759769}
760770
761- /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
762- * 16 in AVX2, when top-most pixel is on even coordinate */
763- static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2 (
771+ /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2 and NEON,
772+ * or 16 in AVX2, when top-most pixel is on even coordinate */
773+ static void opj_idwt53_v_cas0_mcols_SIMD (
764774 OPJ_INT32 * tmp ,
765775 const OPJ_INT32 sn ,
766776 const OPJ_INT32 len ,
@@ -865,9 +875,9 @@ static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
865875}
866876
867877
868- /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
869- * 16 in AVX2, when top-most pixel is on odd coordinate */
870- static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2 (
878+ /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2 and NEON,
879+ * or 16 in AVX2, when top-most pixel is on odd coordinate */
880+ static void opj_idwt53_v_cas1_mcols_SIMD (
871881 OPJ_INT32 * tmp ,
872882 const OPJ_INT32 sn ,
873883 const OPJ_INT32 len ,
@@ -1107,11 +1117,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
11071117 if (dwt -> cas == 0 ) {
11081118 /* If len == 1, unmodified value */
11091119
1110- #if (defined(__SSE2__ ) || defined(__AVX2__ ))
1120+ #if (defined(__ARM_NEON ) || defined( __SSE2__ ) || defined(__AVX2__ ))
11111121 if (len > 1 && nb_cols == PARALLEL_COLS_53 ) {
1112- /* Same as below general case, except that thanks to SSE2/AVX2 */
1122+ /* Same as below general case, except that thanks to SIMD */
11131123 /* we can efficiently process 8/16 columns in parallel */
1114- opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2 (dwt -> mem , sn , len , tiledp_col , stride );
1124+ opj_idwt53_v_cas0_mcols_SIMD (dwt -> mem , sn , len , tiledp_col , stride );
11151125 return ;
11161126 }
11171127#endif
@@ -1150,11 +1160,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
11501160 return ;
11511161 }
11521162
1153- #if (defined(__SSE2__ ) || defined(__AVX2__ ))
1163+ #if (defined(__ARM_NEON ) || defined( __SSE2__ ) || defined(__AVX2__ ))
11541164 if (len > 2 && nb_cols == PARALLEL_COLS_53 ) {
1155- /* Same as below general case, except that thanks to SSE2/AVX2 */
1165+ /* Same as below general case, except that thanks to SIMD */
11561166 /* we can efficiently process 8/16 columns in parallel */
1157- opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2 (dwt -> mem , sn , len , tiledp_col , stride );
1167+ opj_idwt53_v_cas1_mcols_SIMD (dwt -> mem , sn , len , tiledp_col , stride );
11581168 return ;
11591169 }
11601170#endif
0 commit comments