Skip to content

Commit 7784b68

Browse files
authored
[Quad] Add trigonometric functions (#240)
This patch adds sin, cos, tan, exp, exp2, exp10, expm1, log, log2, log10, log1p, asin, acos, atan, comparison functions and cast functions between quad and double to libsleefquad.
1 parent 1230817 commit 7784b68

18 files changed

+6839
-86
lines changed

src/arch/helperadvsimd.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,20 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
665665
vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };
666666
}
667667

668+
static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
669+
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
670+
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
671+
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
672+
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }
673+
674+
static vmask2 vloadu_vm2_p(void *p) {
675+
vmask2 vm2 = {
676+
vld1q_u32((uint32_t *)p),
677+
vld1q_u32((uint32_t *)((uint8_t *)p + sizeof(vmask)))
678+
};
679+
return vm2;
680+
}
681+
668682
static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
669683
union {
670684
vargquad aq;
@@ -705,3 +719,9 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
705719

706720
#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
707721
#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))
722+
723+
static INLINE vmask vcast_vm_vi(vint vi) {
724+
vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)));
725+
return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi))))), m);
726+
}
727+
static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); }

src/arch/helperavx.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,38 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
575575
vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) };
576576
}
577577

578+
static INLINE vint vuninterleave_vi_vi(vint v) {
579+
return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
580+
}
581+
582+
static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
583+
double tmp[4];
584+
vstoreu_v_p_vd(tmp, vd);
585+
double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;
586+
return vloadu_vd_p(tmp);
587+
}
588+
589+
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
590+
double tmp[4];
591+
vstoreu_v_p_vd(tmp, vd);
592+
double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;
593+
return vloadu_vd_p(tmp);
594+
}
595+
596+
static INLINE vmask vinterleave_vm_vm(vmask vm) {
597+
double tmp[4];
598+
vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm));
599+
double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;
600+
return vreinterpret_vm_vd(vloadu_vd_p(tmp));
601+
}
602+
603+
static INLINE vmask vuninterleave_vm_vm(vmask vm) {
604+
double tmp[4];
605+
vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm));
606+
double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;
607+
return vreinterpret_vm_vd(vloadu_vd_p(tmp));
608+
}
609+
578610
static vmask2 vloadu_vm2_p(void *p) {
579611
vmask2 vm2 = {
580612
vcast_vm_vi2(vloadu_vi2_p((int32_t *)p)),
@@ -645,3 +677,14 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
645677
#define vsrl64_vm_vm_i(x, c) \
646678
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \
647679
_mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
680+
681+
static INLINE vmask vcast_vm_vi(vint vi) {
682+
vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1));
683+
vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1));
684+
vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1);
685+
return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1)))), m);
686+
}
687+
static INLINE vint vcast_vi_vm(vmask vm) {
688+
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
689+
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
690+
}

src/arch/helperavx2.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,26 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
428428
return (vmask2) { _mm256_unpacklo_epi64(v.x, v.y), _mm256_unpackhi_epi64(v.x, v.y) };
429429
}
430430

431+
static INLINE vint vuninterleave_vi_vi(vint v) {
432+
return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
433+
}
434+
435+
static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
436+
return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)));
437+
}
438+
439+
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
440+
return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)));
441+
}
442+
443+
static INLINE vmask vinterleave_vm_vm(vmask vm) {
444+
return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0));
445+
}
446+
447+
static INLINE vmask vuninterleave_vm_vm(vmask vm) {
448+
return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0));
449+
}
450+
431451
static vmask2 vloadu_vm2_p(void *p) {
432452
vmask2 vm2 = {
433453
vloadu_vi2_p((int32_t *)p),
@@ -481,3 +501,9 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi
481501

482502
#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
483503
#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
504+
505+
static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); }
506+
static INLINE vint vcast_vi_vm(vmask vm) {
507+
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
508+
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
509+
}

src/arch/helperavx512f.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,26 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
545545
return (vmask2) { _mm512_unpacklo_epi64(v.x, v.y), _mm512_unpackhi_epi64(v.x, v.y) };
546546
}
547547

548+
static INLINE vint vuninterleave_vi_vi(vint v) {
549+
return _mm256_permutevar8x32_epi32(v, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
550+
}
551+
552+
static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
553+
return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vreinterpret_vm_vd(vd)));
554+
}
555+
556+
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
557+
return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vreinterpret_vm_vd(vd)));
558+
}
559+
560+
static INLINE vmask vinterleave_vm_vm(vmask vm) {
561+
return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vm);
562+
}
563+
564+
static INLINE vmask vuninterleave_vm_vm(vmask vm) {
565+
return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vm);
566+
}
567+
548568
static vmask2 vloadu_vm2_p(void *p) {
549569
vmask2 vm2 = {
550570
vloadu_vi2_p((int32_t *)p),
@@ -600,3 +620,10 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64
600620

601621
#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
602622
#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
623+
624+
static INLINE vmask vcast_vm_vi(vint vi) {
625+
return _mm512_cvtepi32_epi64(vi);
626+
}
627+
static INLINE vint vcast_vi_vm(vmask vm) {
628+
return _mm512_cvtepi64_epi32(vm);
629+
}

src/arch/helperpurec_scalar.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,11 @@ typedef Sleef_quad1 vargquad;
379379

380380
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return v; }
381381
static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return v; }
382+
static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
383+
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
384+
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
385+
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
386+
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }
382387

383388
static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
384389
union {
@@ -408,3 +413,6 @@ static INLINE vmask vneg64_vm_vm(vmask x) { return -(int64_t)x; }
408413
#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
409414

410415
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return (int64_t)x > (int64_t)y ? ~(uint32_t)0 : 0; }
416+
417+
static INLINE vmask vcast_vm_vi(vint vi) { return vi; }
418+
static INLINE vint vcast_vi_vm(vmask vm) { return vm; }

src/arch/helpersse2.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,12 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
454454
return (vmask2) { _mm_unpacklo_epi64(v.x, v.y), _mm_unpackhi_epi64(v.x, v.y) };
455455
}
456456

457+
static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
458+
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
459+
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
460+
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
461+
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }
462+
457463
static vmask2 vloadu_vm2_p(void *p) {
458464
vmask2 vm2 = {
459465
vloadu_vi2_p((int32_t *)p),
@@ -513,3 +519,9 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
513519
_mm_storeu_si128((__m128i *)ay, y);
514520
return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? -1 : 0);
515521
}
522+
523+
static INLINE vmask vcast_vm_vi(vint vi) {
524+
vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
525+
return vor_vm_vm_vm(vcastu_vi2_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
526+
}
527+
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }

src/arch/helpersve.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,36 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
791791
svreinterpret_s32_u64(svtrn2_u64(svreinterpret_u64_s32(v.x), svreinterpret_u64_s32(v.y))) };
792792
}
793793

794+
static INLINE vint vuninterleave_vi_vi(vint v) {
795+
return svreinterpret_s32_u64(svuzp1_u64(svtrn1_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v)),
796+
svtrn2_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v))));
797+
}
798+
799+
static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
800+
return svtrn1_f64(svzip1_f64(vd, vd), svzip2_f64(vd, vd));
801+
}
802+
803+
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
804+
return svuzp1_f64(svtrn1_f64(vd, vd), svtrn2_f64(vd, vd));
805+
}
806+
807+
static INLINE vmask vinterleave_vm_vm(vmask vm) {
808+
return svreinterpret_s32_u64(svtrn1_u64(svzip1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),
809+
svzip2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));
810+
}
811+
static INLINE vmask vuninterleave_vm_vm(vmask vm) {
812+
return svreinterpret_s32_u64(svuzp1_u64(svtrn1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),
813+
svtrn2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));
814+
}
815+
816+
static vmask2 vloadu_vm2_p(void *p) {
817+
vmask2 vm2 = {
818+
svld1_s32(ptrue, (int32_t *)p),
819+
svld1_s32(ptrue, (int32_t *)((uint8_t *)p + 8 * svcntd()))
820+
};
821+
return vm2;
822+
}
823+
794824
static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
795825
return vinterleave_vm2_vm2((vmask2) { svld1_s32(ptrue, (int32_t *)&aq), svld1_s32(ptrue, (int32_t *)&(aq.s[svcntd()/2])) });
796826
}
@@ -827,3 +857,6 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
827857

828858
#define vsll64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
829859
#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
860+
861+
static INLINE vmask vcast_vm_vi(vint vi) { return svreinterpret_s32_s64(svextw_s64_z(ptrue, svreinterpret_s64_s32(vi))); }
862+
static INLINE vint vcast_vi_vm(vmask vm) { return vand_vm_vm_vm(vm, vcast_vm_i_i(0, 0xffffffff)); }

src/common/misc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ typedef struct {
172172
#if defined(ENABLEFLOAT128)
173173
typedef __float128 Sleef_quad;
174174
#else
175-
typedef struct { uint64_t x, y; } Sleef_quad;
175+
typedef struct { double x, y; } Sleef_quad;
176176
#endif
177177
#endif
178178

src/quad-tester/qiutsimd.c

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,61 @@ typedef union {
171171
} \
172172
}
173173

174+
#define func_i_q_q(funcStr, funcName) { \
175+
while (startsWith(buf, funcStr " ")) { \
176+
sentinel = 0; \
177+
int lane = xrand() % VECTLENDP; \
178+
cnv128 c0, c1; \
179+
sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l, &c1.h, &c1.l); \
180+
vargquad a0, a1; \
181+
memrand(&a0, sizeof(vargquad)); \
182+
memrand(&a1, sizeof(vargquad)); \
183+
a0.s[lane] = c0.q; \
184+
a1.s[lane] = c1.q; \
185+
vint vi = funcName(a0, a1); \
186+
int t[VECTLENDP]; \
187+
vstoreu_v_p_vi(t, vi); \
188+
printf("%d\n", t[lane]); \
189+
fflush(stdout); \
190+
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
191+
} \
192+
}
193+
194+
#define func_d_q(funcStr, funcName) { \
195+
while (startsWith(buf, funcStr " ")) { \
196+
sentinel = 0; \
197+
int lane = xrand() % VECTLENDP; \
198+
cnv128 c0; \
199+
sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \
200+
vargquad a0; \
201+
memrand(&a0, sizeof(vargquad)); \
202+
a0.s[lane] = c0.q; \
203+
double d[VECTLENDP]; \
204+
vstoreu_v_p_vd(d, funcName(a0)); \
205+
printf("%" PRIx64 "\n", d2u(d[lane])); \
206+
fflush(stdout); \
207+
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
208+
} \
209+
}
210+
211+
#define func_q_d(funcStr, funcName) { \
212+
while (startsWith(buf, funcStr " ")) { \
213+
sentinel = 0; \
214+
int lane = xrand() % VECTLENDP; \
215+
uint64_t u; \
216+
sscanf(buf, funcStr " %" PRIx64, &u); \
217+
double s[VECTLENDP]; \
218+
memrand(s, sizeof(s)); \
219+
s[lane] = u2d(u); \
220+
vargquad a0 = funcName(vloadu_vd_p(s)); \
221+
cnv128 c0; \
222+
c0.q = a0.s[lane]; \
223+
printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \
224+
fflush(stdout); \
225+
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
226+
} \
227+
}
228+
174229
#define func_strtoq(funcStr) { \
175230
while (startsWith(buf, funcStr " ")) { \
176231
sentinel = 0; \
@@ -224,7 +279,30 @@ int do_test(int argc, char **argv) {
224279
func_q_q_q("mulq_u05", xmulq_u05);
225280
func_q_q_q("divq_u05", xdivq_u05);
226281
func_q_q("sqrtq_u05", xsqrtq_u05);
282+
func_q_q("sinq_u10", xsinq_u10);
283+
func_q_q("cosq_u10", xcosq_u10);
284+
func_q_q("tanq_u10", xtanq_u10);
285+
func_q_q("asinq_u10", xasinq_u10);
286+
func_q_q("acosq_u10", xacosq_u10);
287+
func_q_q("atanq_u10", xatanq_u10);
288+
func_q_q("expq_u10", xexpq_u10);
289+
func_q_q("exp2q_u10", xexp2q_u10);
290+
func_q_q("exp10q_u10", xexp10q_u10);
291+
func_q_q("expm1q_u10", xexpm1q_u10);
292+
func_q_q("logq_u10", xlogq_u10);
293+
func_q_q("log2q_u10", xlog2q_u10);
294+
func_q_q("log10q_u10", xlog10q_u10);
295+
func_q_q("log1pq_u10", xlog1pq_u10);
227296
func_q_q("negq", xnegq);
297+
func_q_d("cast_from_doubleq", xcast_from_doubleq);
298+
func_d_q("cast_to_doubleq", xcast_to_doubleq);
299+
func_i_q_q("cmpltq", xcmpltq);
300+
func_i_q_q("cmpgtq", xcmpgtq);
301+
func_i_q_q("cmpleq", xcmpleq);
302+
func_i_q_q("cmpgeq", xcmpgeq);
303+
func_i_q_q("cmpeqq", xcmpeqq);
304+
func_i_q_q("cmpneqq", xcmpneqq);
305+
func_i_q_q("unordq", xunordq);
228306
func_strtoq("strtoq");
229307
func_qtostr("qtostr");
230308
sentinel++;

0 commit comments

Comments
 (0)