Skip to content

Commit 4d5e75f

Browse files
authored
[libsleef] Add modified Payne Hanek argument reduction (#197)
This patch adds a modified Payne Hanek argument reduction that can be used for very large arguments. The Payne–Hanek reduction algorithm can handle very large arguments via table look-ups. In this patch, a vectorized version of the algorithm is implemented. The argument range for DP and SP trig functions will become [-1e+299, 1e+299] and [-1e+28, 1e+28], respectively. In order to avoid using 64-bit or 128-bit multiplication, the algorithm is modified to use DD computation. Gather instructions are used for table look-ups. The reduction subroutine is tested to confirm that it correctly handles the worst case, 6381956970095103.0 * 2.0^797.
1 parent bb9f00d commit 4d5e75f

34 files changed

+2285
-532
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1717
https://github.com/shibatch/sleef/pull/192
1818
- Power VSX target support is added to libsleef.
1919
https://github.com/shibatch/sleef/pull/195
20+
- Payne-Hanek-like argument reduction is added to libsleef.
21+
https://github.com/shibatch/sleef/pull/197
2022
## 3.2 - 2018-02-26
2123
### Added
2224
- The whole build system of the project migrated from makefiles to

Jenkinsfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ pipeline {
1010
sh '''
1111
echo "AArch64 SVE on" `hostname`
1212
export PATH=$PATH:/opt/arm/arm-instruction-emulator-1.2.1_Generic-AArch64_Ubuntu-14.04_aarch64-linux/bin
13-
export LD_LIBRARY_PATH=/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/lib
13+
export LD_LIBRARY_PATH=/opt/arm/arm-instruction-emulator-1.2.1_Generic-AArch64_Ubuntu-14.04_aarch64-linux/lib:/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/lib
1414
export CC=/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/bin/armclang
1515
rm -rf build
1616
mkdir build
@@ -24,7 +24,7 @@ pipeline {
2424
'''
2525
}
2626
}
27-
27+
2828
stage('Intel Compiler') {
2929
agent { label 'icc' }
3030
steps {

src/arch/helperadvsimd.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,19 @@ static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
7272
static INLINE vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
7373
static INLINE void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }
7474

75+
// Gather two doubles from ptr at the two 32-bit lane indices held in vi.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int32_t i0 = vget_lane_s32(vi, 0);
  int32_t i1 = vget_lane_s32(vi, 1);
  return ((vdouble) { ptr[i0], ptr[i1] });
}
78+
79+
// Gather four floats from ptr at the four 32-bit lane indices held in vi2.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int32_t i0 = vgetq_lane_s32(vi2, 0);
  int32_t i1 = vgetq_lane_s32(vi2, 1);
  int32_t i2 = vgetq_lane_s32(vi2, 2);
  int32_t i3 = vgetq_lane_s32(vi2, 3);
  return ((vfloat) { ptr[i0], ptr[i1], ptr[i2], ptr[i3] });
}
87+
7588
// Basic logical operations for mask
7689
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
7790
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {

src/arch/helperavx.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
290290
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
291291
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
292292

293+
// AVX (pre-AVX2) has no hardware gather: spill the indices to a scratch
// array and assemble the vector element by element.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int idx[VECTLENDP];
  vstoreu_v_p_vi(idx, vi);
  // _mm256_setr_pd takes lanes in ascending order (equivalent to the
  // reversed-argument _mm256_set_pd).
  return _mm256_setr_pd(ptr[idx[0]], ptr[idx[1]], ptr[idx[2]], ptr[idx[3]]);
}
298+
293299
#if defined(_MSC_VER)
294300
// This function is needed when debugging on MSVC.
295301
static INLINE double vcast_d_vd(vdouble v) {
@@ -477,6 +483,13 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
477483
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
478484
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
479485

486+
// AVX (pre-AVX2) has no hardware gather: spill the indices to a scratch
// array and assemble the vector element by element.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int idx[VECTLENSP];
  vstoreu_v_p_vi2(idx, vi2);
  // _mm256_setr_ps takes lanes in ascending order (equivalent to the
  // reversed-argument _mm256_set_ps).
  return _mm256_setr_ps(ptr[idx[0]], ptr[idx[1]], ptr[idx[2]], ptr[idx[3]],
			ptr[idx[4]], ptr[idx[5]], ptr[idx[6]], ptr[idx[7]]);
}
492+
480493
#ifdef _MSC_VER
481494
// This function is needed when debugging on MSVC.
482495
static INLINE float vcast_f_vf(vfloat v) {

src/arch/helperavx2.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
255255
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
256256
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
257257

258+
// AVX2 hardware gather: load 4 doubles from ptr at the 32-bit indices in vi (scale = 8 bytes per element).
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }
259+
258260
//
259261

260262
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
@@ -359,6 +361,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
359361
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
360362
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
361363

364+
// AVX2 hardware gather: load 8 floats from ptr at the 32-bit indices in vi2 (scale = 4 bytes per element).
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }
365+
362366
//
363367

364368
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })

src/arch/helperavx2_128.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
228228
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
229229
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
230230

231+
// AVX2 hardware gather (128-bit): load 2 doubles from ptr at the 32-bit indices in vi (scale = 8 bytes per element).
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }
232+
231233
#if defined(_MSC_VER)
232234
// This function is needed when debugging on MSVC.
233235
static INLINE double vcast_d_vd(vdouble v) {
@@ -330,6 +332,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
330332
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
331333
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
332334

335+
// AVX2 hardware gather (128-bit): load 4 floats from ptr at the 32-bit indices in vi2 (scale = 4 bytes per element).
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }
336+
333337
#ifdef _MSC_VER
334338
// This function is needed when debugging on MSVC.
335339
static INLINE float vcast_f_vf(vfloat v) {

src/arch/helperavx512f.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(pt
290290
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
291291
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
292292

293+
// AVX-512 hardware gather: load 8 doubles from ptr at the 32-bit indices in vi.
// Note the AVX-512 intrinsic takes the index vector FIRST, unlike the AVX2 variant.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }
294+
293295
//
294296

295297
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
@@ -417,6 +419,7 @@ static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_
417419
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
418420

419421
static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
422+
// vilogb2k: extract the exponent via getexp, same implementation as vilogbk_vi2_vf above.
// NOTE(review): presumably getexp already covers the restricted inputs vilogb2k is
// called with on this target — confirm against the generic helper implementations.
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
420423

421424
#ifdef _MSC_VER
422425
// This function is needed when debugging on MSVC.
@@ -433,6 +436,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr)
433436
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
434437
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }
435438

439+
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }
440+
436441
//
437442

438443
static INLINE vdouble vposneg_vd_vd(vdouble d) {

src/arch/helperneon32.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,8 @@ static INLINE int vtestallones_i_vo32(vopmask g) {
4343
return vget_lane_u32(x1, 0);
4444
}
4545

46-
static vfloat vloaduf(float *p) { return vld1q_f32(p); }
47-
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }
48-
4946
// Unaligned load of four 32-bit integers from p into a vint2.
static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
50-
static void vstoreu_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
47+
// Unaligned store of the four 32-bit lanes of v to p.
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
5148

5249
//
5350

@@ -210,6 +207,15 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
210207
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
211208
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
212209

210+
// Gather four floats from ptr at the four 32-bit lane indices held in vi2.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int32_t i0 = vgetq_lane_s32(vi2, 0);
  int32_t i1 = vgetq_lane_s32(vi2, 1);
  int32_t i2 = vgetq_lane_s32(vi2, 2);
  int32_t i3 = vgetq_lane_s32(vi2, 3);
  return ((vfloat) { ptr[i0], ptr[i1], ptr[i2], ptr[i3] });
}
218+
213219
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
214220
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
215221

src/arch/helperpower_128.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,18 @@ static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { ptr[0] = v[0]; ptr[1]
7474

7575
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
7676

77+
// Gather two doubles from ptr at the indices held in vi; indices are
// spilled to a scratch array first since VSX has no gather instruction.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int idx[VECTLENDP];
  vstoreu_v_p_vi(idx, vi);
  double lo = ptr[idx[0]], hi = ptr[idx[1]];
  return ((vdouble) { lo, hi });
}
82+
83+
// Gather four floats from ptr at the indices held in vi2; indices are
// spilled to a scratch array first since VSX has no gather instruction.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int idx[VECTLENSP];
  vstoreu_v_p_vi2(idx, vi2);
  return ((vfloat) { ptr[idx[0]], ptr[idx[1]], ptr[idx[2]], ptr[idx[3]] });
}
88+
7789
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
7890
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
7991
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }

src/arch/helperpurec.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,12 @@ static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
304304
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
305305
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }
306306

307+
// Portable scalar gather: r.d[lane] = ptr[vi.i[lane]] for each lane.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  vdouble r;
  int lane = 0;
  while (lane < VECTLENDP) {
    r.d[lane] = ptr[vi.i[lane]];
    lane++;
  }
  return r;
}
312+
307313
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
308314
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
309315
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
@@ -418,6 +424,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
418424
return vf;
419425
}
420426

427+
// Portable scalar gather: r.f[lane] = ptr[vi2.i[lane]] for each lane.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  vfloat r;
  int lane = 0;
  while (lane < VECTLENSP) {
    r.f[lane] = ptr[vi2.i[lane]];
    lane++;
  }
  return r;
}
432+
421433
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
422434
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
423435
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];

0 commit comments

Comments
 (0)