Skip to content
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
https://github.com/shibatch/sleef/pull/192
- Power VSX target support is added to libsleef.
https://github.com/shibatch/sleef/pull/195
- Payne-Hanek like argument reduction is added to libsleef.
https://github.com/shibatch/sleef/pull/???
## 3.2 - 2018-02-26
### Added
- The whole build system of the project migrated from makefiles to
Expand Down
5 changes: 4 additions & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ pipeline {
stages {
stage('Preamble') {
parallel {
/*
// SVE testing is temporarily disabled due to compiler bug
stage('AArch64 SVE') {
agent { label 'aarch64' }
steps {
Expand All @@ -24,7 +26,8 @@ pipeline {
'''
}
}

*/

stage('Intel Compiler') {
agent { label 'icc' }
steps {
Expand Down
13 changes: 13 additions & 0 deletions src/arch/helperadvsimd.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,19 @@ static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static INLINE vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
static INLINE void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}

// Basic logical operations for mask
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {
Expand Down
13 changes: 13 additions & 0 deletions src/arch/helperavx.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this beint a[VECLENSP]?

vstoreu_v_p_vi(a, vi);
return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
Expand Down Expand Up @@ -477,6 +483,13 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
int a[8];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, shouldn't it be int a[VECLENSP]?

vstoreu_v_p_vi2(a, vi2);
return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],
ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
Expand Down
4 changes: 4 additions & 0 deletions src/arch/helperavx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }

//

static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
Expand Down Expand Up @@ -359,6 +361,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }

//

#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
Expand Down
4 changes: 4 additions & 0 deletions src/arch/helperavx2_128.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
Expand Down Expand Up @@ -330,6 +332,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
Expand Down
5 changes: 5 additions & 0 deletions src/arch/helperavx512f.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }

//

static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
Expand Down Expand Up @@ -417,6 +419,7 @@ static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
Expand All @@ -433,6 +436,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }

//

static INLINE vdouble vposneg_vd_vd(vdouble d) {
Expand Down
14 changes: 10 additions & 4 deletions src/arch/helperneon32.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,8 @@ static INLINE int vtestallones_i_vo32(vopmask g) {
return vget_lane_u32(x1, 0);
}

static vfloat vloaduf(float *p) { return vld1q_f32(p); }
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }

static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static void vstoreu_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }

//

Expand Down Expand Up @@ -210,6 +207,15 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}

#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

Expand Down
12 changes: 12 additions & 0 deletions src/arch/helperpower_128.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,18 @@ static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { ptr[0] = v[0]; ptr[1]

static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this also be int[VECLENDP]?

vstoreu_v_p_vi(a, vi);
return ((vdouble) { ptr[a[0]], ptr[a[1]] });
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
int a[4];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this also be int[VECLENSP]?

vstoreu_v_p_vi2(a, vi2);
return ((vfloat) { ptr[a[0]], ptr[a[1]], ptr[a[2]], ptr[a[3]] });
}

static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
Expand Down
12 changes: 12 additions & 0 deletions src/arch/helperpurec.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,12 @@ static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[vi.i[i]];
return vd;
}

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
Expand Down Expand Up @@ -418,6 +424,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
return vf;
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[vi2.i[i]];
return vf;
}

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
Expand Down
12 changes: 12 additions & 0 deletions src/arch/helpersse2.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int a[VECLENDP];?

vstoreu_v_p_vi(a, vi);
return _mm_set_pd(ptr[a[1]], ptr[a[0]]);
}

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
Expand Down Expand Up @@ -373,6 +379,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {
int a[4];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int a[VECLENSP]?

vstoreu_v_p_vi2(a, vi);
return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is useful when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
Expand Down
37 changes: 37 additions & 0 deletions src/arch/helpersve.h
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,9 @@ static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {
static INLINE vint vand_vi_vi_vi(vint x, vint y) {
return svand_s32_x(ptrue, x, y);
}
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) {
return svbic_s32_x(ptrue, y, x);
}
static INLINE vint vxor_vi_vi_vi(vint x, vint y) {
return sveor_s32_x(ptrue, x, y);
}
Expand Down Expand Up @@ -657,6 +660,15 @@ static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);
}

// Gather

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return svldff1_gather_s64offset_f64(ptrue, ptr, svreinterpret_s64_s32(svzip1_s32(vi, svdup_n_s32(0))));
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return svldff1_gather_s32offset_f32(ptrue, ptr, vi2);
}

// Operations for DFT

Expand Down Expand Up @@ -713,3 +725,28 @@ static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vstoreu_v_p_vf(ptr, v);
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }

// These functions are for debugging
static double vcast_d_vd(vdouble v) {
double a[32];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

double a[svcntd()]. We have done it for the gnu compatibility tests too: https://github.com/shibatch/sleef/blob/master/src/libm-tester/gnuabi_compatibility.c#L50

It requires the code to be build with the C11 standard.

vstoreu_v_p_vd(a, v);
return a[0];
}

static float vcast_f_vf(vfloat v) {
float a[64];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int a[svcntw()];

vstoreu_v_p_vf(a, v);
return a[0];
}

static int vcast_i_vi(vint v) {
int a[64];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int a[svcntw()];

vstoreu_v_p_vi(a, v);
return a[0];
}

static int vcast_i_vi2(vint2 v) {
int a[64];
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int a[svcntw()];

vstoreu_v_p_vi2(a, v);
return a[0];
}
12 changes: 12 additions & 0 deletions src/arch/helpervecext.h
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) {
return vd;
}

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[vi[i]];
return vd;
}

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
Expand Down Expand Up @@ -777,6 +783,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
return vf;
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[vi2[i]];
return vf;
}

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
Expand Down
4 changes: 2 additions & 2 deletions src/common/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@
bits. So, the maximum argument that could be correctly reduced
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
double precision calculation, the actual maximum argument that can
be correctly reduced is around 2^50 = 1.1e+15.
be correctly reduced is around 2^47.
*/

#define PI_A 3.1415926218032836914
#define PI_B 3.1786509424591713469e-08
#define PI_C 1.2246467864107188502e-16
#define PI_D 1.2736634327021899816e-24
#define TRIGRANGEMAX 1e+15
#define TRIGRANGEMAX 1e+14

/*
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
Expand Down
Loading