Skip to content

Commit 4d5e75f

Browse files
authored
[libsleef] Add modified Payne Hanek argument reduction (#197)
This patch adds a modified Payne Hanek argument reduction that can be used for very large arguments. The Payne–Hanek reduction algorithm can handle very large arguments via table look-ups. In this patch, a vectorized version of the algorithm is implemented. The argument range for DP and SP trig functions will become [-1e+299, 1e+299] and [-1e+28, 1e+28], respectively. In order to avoid using 64-bit or 128-bit multiplication, the algorithm is modified to use DD computation. Gather instructions are used for table look-ups. The reduction subroutine is tested to confirm that it correctly handles the worst case, 6381956970095103.0 * 2.0^797.
1 parent bb9f00d commit 4d5e75f

34 files changed

+2285
-532
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1717
https://github.com/shibatch/sleef/pull/192
1818
- Power VSX target support is added to libsleef.
1919
https://github.com/shibatch/sleef/pull/195
20+
- Payne-Hanek-like argument reduction is added to libsleef.
21+
https://github.com/shibatch/sleef/pull/197
2022
## 3.2 - 2018-02-26
2123
### Added
2224
- The whole build system of the project migrated from makefiles to

Jenkinsfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ pipeline {
1010
sh '''
1111
echo "AArch64 SVE on" `hostname`
1212
export PATH=$PATH:/opt/arm/arm-instruction-emulator-1.2.1_Generic-AArch64_Ubuntu-14.04_aarch64-linux/bin
13-
export LD_LIBRARY_PATH=/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/lib
13+
export LD_LIBRARY_PATH=/opt/arm/arm-instruction-emulator-1.2.1_Generic-AArch64_Ubuntu-14.04_aarch64-linux/lib:/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/lib
1414
export CC=/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/bin/armclang
1515
rm -rf build
1616
mkdir build
@@ -24,7 +24,7 @@ pipeline {
2424
'''
2525
}
2626
}
27-
27+
2828
stage('Intel Compiler') {
2929
agent { label 'icc' }
3030
steps {

src/arch/helperadvsimd.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,19 @@ static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
7272
static INLINE vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
7373
static INLINE void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }
7474

75+
// Gather two doubles from ptr at the two 32-bit lane indices held in vi.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int32_t i0 = vget_lane_s32(vi, 0);
  int32_t i1 = vget_lane_s32(vi, 1);
  return ((vdouble) { ptr[i0], ptr[i1] });
}
78+
79+
// Gather four floats from ptr at the four 32-bit lane indices held in vi2.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int32_t i0 = vgetq_lane_s32(vi2, 0);
  int32_t i1 = vgetq_lane_s32(vi2, 1);
  int32_t i2 = vgetq_lane_s32(vi2, 2);
  int32_t i3 = vgetq_lane_s32(vi2, 3);
  return ((vfloat) { ptr[i0], ptr[i1], ptr[i2], ptr[i3] });
}
87+
7588
// Basic logical operations for mask
7689
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
7790
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {

src/arch/helperavx.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
290290
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
291291
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
292292

293+
// AVX (pre-AVX2) has no hardware gather: spill the indices to a scratch
// array and assemble the vector element by element.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int idx[VECTLENDP];
  vstoreu_v_p_vi(idx, vi);
  // _mm256_setr_pd takes lanes in ascending order (equivalent to the
  // reversed-argument _mm256_set_pd).
  return _mm256_setr_pd(ptr[idx[0]], ptr[idx[1]], ptr[idx[2]], ptr[idx[3]]);
}
298+
293299
#if defined(_MSC_VER)
294300
// This function is needed when debugging on MSVC.
295301
static INLINE double vcast_d_vd(vdouble v) {
@@ -477,6 +483,13 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
477483
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
478484
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
479485

486+
// AVX (pre-AVX2) has no hardware gather: spill the indices to a scratch
// array and assemble the vector element by element.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int idx[VECTLENSP];
  vstoreu_v_p_vi2(idx, vi2);
  // _mm256_setr_ps takes lanes in ascending order (equivalent to the
  // reversed-argument _mm256_set_ps).
  return _mm256_setr_ps(ptr[idx[0]], ptr[idx[1]], ptr[idx[2]], ptr[idx[3]],
			ptr[idx[4]], ptr[idx[5]], ptr[idx[6]], ptr[idx[7]]);
}
492+
480493
#ifdef _MSC_VER
481494
// This function is needed when debugging on MSVC.
482495
static INLINE float vcast_f_vf(vfloat v) {

src/arch/helperavx2.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
255255
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
256256
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
257257

258+
// AVX2 hardware gather: load 4 doubles from ptr at the 32-bit indices in vi (scale = 8 bytes per element).
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }
259+
258260
//
259261

260262
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
@@ -359,6 +361,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
359361
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
360362
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
361363

364+
// AVX2 hardware gather: load 8 floats from ptr at the 32-bit indices in vi2 (scale = 4 bytes per element).
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }
365+
362366
//
363367

364368
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })

src/arch/helperavx2_128.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
228228
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
229229
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
230230

231+
// AVX2 hardware gather (128-bit): load 2 doubles from ptr at the 32-bit indices in vi (scale = 8 bytes per element).
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }
232+
231233
#if defined(_MSC_VER)
232234
// This function is needed when debugging on MSVC.
233235
static INLINE double vcast_d_vd(vdouble v) {
@@ -330,6 +332,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
330332
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
331333
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
332334

335+
// AVX2 hardware gather (128-bit): load 4 floats from ptr at the 32-bit indices in vi2 (scale = 4 bytes per element).
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }
336+
333337
#ifdef _MSC_VER
334338
// This function is needed when debugging on MSVC.
335339
static INLINE float vcast_f_vf(vfloat v) {

src/arch/helperavx512f.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(pt
290290
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
291291
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
292292

293+
// AVX-512 hardware gather: load 8 doubles from ptr at the 32-bit indices in vi.
// Note the AVX-512 intrinsic takes the index vector FIRST, unlike the AVX2 variant.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }
294+
293295
//
294296

295297
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
@@ -417,6 +419,7 @@ static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_
417419
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
418420

419421
static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
422+
// vilogb2k: extract the exponent via getexp, same implementation as vilogbk_vi2_vf above.
// NOTE(review): presumably getexp already covers the restricted inputs vilogb2k is
// called with on this target — confirm against the generic helper implementations.
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
420423

421424
#ifdef _MSC_VER
422425
// This function is needed when debugging on MSVC.
@@ -433,6 +436,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr)
433436
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
434437
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }
435438

439+
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }
440+
436441
//
437442

438443
static INLINE vdouble vposneg_vd_vd(vdouble d) {

src/arch/helperneon32.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,8 @@ static INLINE int vtestallones_i_vo32(vopmask g) {
4343
return vget_lane_u32(x1, 0);
4444
}
4545

46-
static vfloat vloaduf(float *p) { return vld1q_f32(p); }
47-
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }
48-
4946
// Unaligned load of four 32-bit integers from p into a vint2.
static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
50-
static void vstoreu_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
47+
// Unaligned store of the four 32-bit lanes of v to p.
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
5148

5249
//
5350

@@ -210,6 +207,15 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
210207
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
211208
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
212209

210+
// Gather four floats from ptr at the four 32-bit lane indices held in vi2.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int32_t i0 = vgetq_lane_s32(vi2, 0);
  int32_t i1 = vgetq_lane_s32(vi2, 1);
  int32_t i2 = vgetq_lane_s32(vi2, 2);
  int32_t i3 = vgetq_lane_s32(vi2, 3);
  return ((vfloat) { ptr[i0], ptr[i1], ptr[i2], ptr[i3] });
}
218+
213219
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
214220
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
215221

src/arch/helperpower_128.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,18 @@ static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { ptr[0] = v[0]; ptr[1]
7474

7575
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
7676

77+
// Gather two doubles from ptr at the indices held in vi; indices are
// spilled to a scratch array first since VSX has no gather instruction.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int idx[VECTLENDP];
  vstoreu_v_p_vi(idx, vi);
  double lo = ptr[idx[0]], hi = ptr[idx[1]];
  return ((vdouble) { lo, hi });
}
82+
83+
// Gather four floats from ptr at the indices held in vi2; indices are
// spilled to a scratch array first since VSX has no gather instruction.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int idx[VECTLENSP];
  vstoreu_v_p_vi2(idx, vi2);
  return ((vfloat) { ptr[idx[0]], ptr[idx[1]], ptr[idx[2]], ptr[idx[3]] });
}
88+
7789
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
7890
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
7991
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }

src/arch/helperpurec.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,12 @@ static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
304304
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
305305
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }
306306

307+
// Portable scalar gather: r.d[lane] = ptr[vi.i[lane]] for each lane.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  vdouble r;
  int lane = 0;
  while (lane < VECTLENDP) {
    r.d[lane] = ptr[vi.i[lane]];
    lane++;
  }
  return r;
}
312+
307313
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
308314
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
309315
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
@@ -418,6 +424,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
418424
return vf;
419425
}
420426

427+
// Portable scalar gather: r.f[lane] = ptr[vi2.i[lane]] for each lane.
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  vfloat r;
  int lane = 0;
  while (lane < VECTLENSP) {
    r.f[lane] = ptr[vi2.i[lane]];
    lane++;
  }
  return r;
}
432+
421433
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
422434
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
423435
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];

0 commit comments

Comments
 (0)