Skip to content

Commit a053716

Browse files
authored
Add quadprecision math library (#235)
This is part of the implementation of issue #233 (#233). At this point, add, mul, div and sqrt are implemented, together with their testers. The remaining functions will be committed in succeeding PRs. As for vector extensions, SSE2, AVX, FMA4, AVX2, AVX2_128, AVX512F, AdvSIMD and SVE are supported. This quad-precision math library is built only if the -DBUILD_QUAD option is given to cmake. For some time (about a year?), this sub-project will be positioned at the alpha development stage.
1 parent 8e6e52f commit a053716

36 files changed

+3774
-48
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
# Options
22

33
# Standard CMake switch: ON builds shared libraries, OFF builds static ones.
option(BUILD_SHARED_LIBS "Build shared libs" ON)
4+
# Per-component switches; each help string names the library it controls.
option(BUILD_LIBM "libsleef will be built." ON)
45
option(BUILD_DFT "libsleefdft will be built." ON)
6+
# Among the options listed here, the quad-precision library is the only one
# that defaults to OFF; pass -DBUILD_QUAD=ON (or TRUE) to cmake to enable it.
option(BUILD_QUAD "libsleefquad will be built." OFF)
57
option(BUILD_GNUABI_LIBS "libsleefgnuabi will be built." ON)
68
option(BUILD_TESTS "Tests will be built." ON)
79

Configure.cmake

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS
6161
SSE2 AVX AVX2 AVX512F ADVSIMD SVE
6262
CACHE STRING "List of SIMD architectures supported by libsleef for GNU ABI."
6363
)
64-
64+
# List of extension names for the quad-precision library (libsleefquad).
# Unlike SLEEF_SUPPORTED_GNUABI_EXTENSIONS above, this is a normal variable,
# not a CACHE entry, so it is not user-editable from the cmake command line.
# NOTE(review): presumably iterated by the quad subdirectory to decide which
# per-extension objects to build -- confirm against src/quad/CMakeLists.txt.
set(SLEEFQUAD_SUPPORTED_EXT
65+
PUREC_SCALAR PURECFMA_SCALAR SSE2 AVX FMA4 AVX2 AVX512F ADVSIMD SVE)
6566
# Force set default build type if none was specified
6667
# Note: some sleef code requires the optimisation flags turned on
6768
if(NOT CMAKE_BUILD_TYPE)
@@ -329,8 +330,8 @@ elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")
329330
set(FLAGS_ENABLE_AVX512F "-xCOMMON-AVX512")
330331
set(FLAGS_ENABLE_AVX512FNOFMA "-xCOMMON-AVX512")
331332
set(FLAGS_ENABLE_PURECFMA_SCALAR "-march=core-avx2")
332-
set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_type -qoverride-limits")
333-
set(FLAGS_FASTMATH "-fp-model fast=2 -Qoption,cpp,--extended_float_type -qoverride-limits")
333+
set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_type")
334+
set(FLAGS_FASTMATH "-fp-model fast=2 -Qoption,cpp,--extended_float_type")
334335
set(FLAGS_WALL "-fmax-errors=3 -Wall -Wno-unused -Wno-attributes")
335336
set(FLAGS_NO_ERRNO "")
336337
endif()

Jenkinsfile

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pipeline {
1414
rm -rf build
1515
mkdir build
1616
cd build
17-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE ..
17+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
1818
make -j 6 all
1919
export OMP_WAIT_POLICY=passive
2020
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -34,7 +34,7 @@ pipeline {
3434
rm -rf build
3535
mkdir build
3636
cd build
37-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DFORCE_AAVPCS=On -DENABLE_GNUABI=On ..
37+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DFORCE_AAVPCS=On -DENABLE_GNUABI=On -DBUILD_QUAD=TRUE ..
3838
make -j 6 all
3939
export OMP_WAIT_POLICY=passive
4040
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -55,7 +55,7 @@ pipeline {
5555
rm -rf build
5656
mkdir build
5757
cd build
58-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE ..
58+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
5959
make -j 4 all
6060
export OMP_WAIT_POLICY=passive
6161
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -76,7 +76,7 @@ pipeline {
7676
rm -rf build
7777
mkdir build
7878
cd build
79-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE ..
79+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
8080
make -j 4 all
8181
export OMP_WAIT_POLICY=passive
8282
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -96,7 +96,7 @@ pipeline {
9696
rm -rf build
9797
mkdir build
9898
cd build
99-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE ..
99+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
100100
make -j 4 all
101101
export OMP_WAIT_POLICY=passive
102102
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -116,7 +116,7 @@ pipeline {
116116
rm -rf build
117117
mkdir build
118118
cd build
119-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DBUILD_SHARED_LIBS=FALSE -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DENFORCE_TESTER3=TRUE ..
119+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DBUILD_SHARED_LIBS=FALSE -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
120120
make -j 2 all
121121
export OMP_WAIT_POLICY=passive
122122
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -134,14 +134,14 @@ pipeline {
134134
set "ORG_PATH=%PATH%"
135135
PATH C:/Cygwin64/bin;C:/Cygwin64/usr/bin;%PROJECT_DIR%/build-cygwin/bin;%PATH%
136136
rmdir /S /Q build-cygwin
137-
C:/Cygwin64/bin/bash -c 'mkdir build-cygwin;cd build-cygwin;cmake -g"Unix Makefiles" ..;make -j 4'
137+
C:/Cygwin64/bin/bash -c 'mkdir build-cygwin;cd build-cygwin;cmake -g"Unix Makefiles" .. -DBUILD_QUAD=TRUE;make -j 4'
138138
del /Q /F %PROJECT_DIR%/build-cygwin/bin/iut*
139139
PATH %ORG_PATH%;C:/Cygwin64/bin;C:/Cygwin64/usr/bin;%PROJECT_DIR%/build-cygwin/bin;%PROJECT_DIR%/build/bin
140140
cd %PROJECT_DIR%
141141
rmdir /S /Q build
142142
mkdir build
143143
cd build
144-
cmake -G"Visual Studio 15 2017 Win64" .. -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DBUILD_SHARED_LIBS=FALSE -DENFORCE_TESTER3=TRUE
144+
cmake -G"Visual Studio 15 2017 Win64" .. -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DBUILD_SHARED_LIBS=FALSE -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE
145145
cmake --build . --target install --config Release
146146
ctest --output-on-failure -j 4 -C Release
147147
'''
@@ -156,7 +156,7 @@ pipeline {
156156
rm -rf build
157157
mkdir build
158158
cd build
159-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE ..
159+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
160160
make -j 4 all
161161
export OMP_WAIT_POLICY=passive
162162
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -174,7 +174,7 @@ pipeline {
174174
rm -rf build-native
175175
mkdir build-native
176176
cd build-native
177-
cmake -DSLEEF_SHOW_CONFIG=1 ..
177+
cmake -DSLEEF_SHOW_CONFIG=1 .. -DBUILD_QUAD=TRUE
178178
make -j 4 all
179179
cd ..
180180
export PATH=$PATH:`pwd`/travis
@@ -183,7 +183,7 @@ pipeline {
183183
rm -rf build
184184
mkdir build
185185
cd build
186-
cmake -DCMAKE_TOOLCHAIN_FILE=../travis/toolchain-ppc64el.cmake -DNATIVE_BUILD_DIR=`pwd`/../build-native -DEMULATOR=qemu-ppc64le-static -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE ..
186+
cmake -DCMAKE_TOOLCHAIN_FILE=../travis/toolchain-ppc64el.cmake -DNATIVE_BUILD_DIR=`pwd`/../build-native -DEMULATOR=qemu-ppc64le-static -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
187187
make -j 4 all
188188
export OMP_WAIT_POLICY=passive
189189
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -201,7 +201,7 @@ pipeline {
201201
rm -rf build
202202
mkdir build
203203
cd build
204-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE ..
204+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
205205
make -j 4 all
206206
export OMP_WAIT_POLICY=passive
207207
export CTEST_OUTPUT_ON_FAILURE=TRUE
@@ -219,7 +219,7 @@ pipeline {
219219
rm -rf build
220220
mkdir build
221221
cd build
222-
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE ..
222+
cmake -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE ..
223223
make -j 3 all
224224
export OMP_WAIT_POLICY=passive
225225
export CTEST_OUTPUT_ON_FAILURE=TRUE

appveyor.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@ install:
1212
- if "%DO_TEST%" == "TRUE" set ORGPATH="%PATH%"
1313
- if "%DO_TEST%" == "TRUE" "C:\\Cygwin64\\setup-x86_64.exe" -q -g -P libmpfr-devel,libgmp-devel,cmake
1414
- if "%DO_TEST%" == "TRUE" PATH c:\Cygwin64\bin;c:\Cygwin64\usr\bin;c:\projects\sleef\build-cygwin\bin;"%PATH%"
15-
- if "%DO_TEST%" == "TRUE" "C:\\Cygwin64\\bin\\bash" -c 'mkdir build-mingw;cd build-mingw;CC=x86_64-w64-mingw32-gcc cmake -g\"Unix Makefiles\" .. -DBUILD_SHARED_LIBS=FALSE;make -j 2'
15+
- if "%DO_TEST%" == "TRUE" "C:\\Cygwin64\\bin\\bash" -c 'mkdir build-mingw;cd build-mingw;CC=x86_64-w64-mingw32-gcc cmake -g\"Unix Makefiles\" .. -DBUILD_SHARED_LIBS=FALSE -DBUILD_QUAD=TRUE;make -j 2'
1616
- if "%DO_TEST%" == "TRUE" cd "c:\\projects\\sleef"
17-
- if "%DO_TEST%" == "TRUE" "C:\\Cygwin64\\bin\\bash" -c 'mkdir build-cygwin;cd build-cygwin;cmake -g\"Unix Makefiles\" ..;make -j 2'
17+
- if "%DO_TEST%" == "TRUE" "C:\\Cygwin64\\bin\\bash" -c 'mkdir build-cygwin;cd build-cygwin;cmake -g\"Unix Makefiles\" -DBUILD_QUAD=TRUE ..;make -j 2'
1818
- if "%DO_TEST%" == "TRUE" del /Q /F c:\projects\sleef\build-cygwin\bin\iut*
1919
- if "%DO_TEST%" == "TRUE" PATH "%ORGPATH%";c:\Cygwin64\bin;c:\Cygwin64\usr\bin;c:\projects\sleef\build-cygwin\bin;c:\projects\sleef\build\bin
2020
- if "%DO_TEST%" == "TRUE" cd "c:\\projects\\sleef"
2121
- mkdir build
2222
- cd build
23-
- cmake -G"Visual Studio 15 2017 Win64" .. -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_SHOW_ERROR_LOG=1 -DENFORCE_TESTER3=TRUE %ENV_BUILD_STATIC%
23+
- cmake -G"Visual Studio 15 2017 Win64" .. -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_SHOW_ERROR_LOG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE %ENV_BUILD_STATIC%
2424
build_script:
2525
- cmake --build . --target install --config Release
2626
test_script:

src/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,10 @@ if (BUILD_DFT AND NOT MINGW)
1313
add_subdirectory("dft-tester")
1414
endif()
1515
endif()
16+
17+
# Quad-precision library: only configured when BUILD_QUAD is enabled.
# Its tester directory is added only when BUILD_TESTS is also enabled.
if (BUILD_QUAD)
18+
add_subdirectory("quad")
19+
if (BUILD_TESTS)
20+
add_subdirectory("quad-tester")
21+
endif()
22+
endif()

src/arch/helperadvsimd.h

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ typedef int32x4_t vint2;
4545
typedef float64x2_t vdouble;
4646
typedef int32x2_t vint;
4747

48+
// Pair of vmask vectors. The quad-precision helpers in this file pass and
// return it by value (e.g. vinterleave_vm2_vm2, vcast_vm2_aq).
typedef struct {
49+
vmask x, y;
50+
} vmask2;
51+
4852
#define DFTPRIORITY 10
4953

5054
static INLINE int vavailability_i(int name) { return 3; }
@@ -644,3 +648,60 @@ static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int s
644648
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
645649
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
646650
}
651+
652+
//
653+
654+
// Type used for quad-precision arguments/results on this backend.
// Sleef_quad2 is declared elsewhere; presumably a pack of two quad-precision
// values matching the two 64-bit lanes of a vmask -- TODO confirm.
typedef Sleef_quad2 vargquad;
655+
656+
// Transposes the 2x2 matrix of 64-bit lanes held in v:
//   result.x = { v.x[0], v.y[0] }   (vtrn1q_u64: even-indexed lanes)
//   result.y = { v.x[1], v.y[1] }   (vtrn2q_u64: odd-indexed lanes)
// The vmask values are reinterpreted u32 <-> u64 around the transpose; no bits change.
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
657+
return (vmask2) {
658+
vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))),
659+
vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };
660+
}
661+
662+
// Inverse of vinterleave_vm2_vm2. The body is the same 2x2 transpose of
// 64-bit lanes (vtrn1q_u64 / vtrn2q_u64); applying that transpose twice
// restores the original lane order, so one implementation serves both directions.
static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
663+
return (vmask2) {
664+
vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))),
665+
vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };
666+
}
667+
668+
// Converts a vargquad argument to the internal vmask2 form.
// The bytes of aq are reinterpreted as a vmask2 through a union (assumes the
// two types have the same size -- TODO confirm), then passed through
// vinterleave_vm2_vm2 to rearrange the 64-bit lanes.
static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
669+
union {
670+
vargquad aq;
671+
vmask2 vm2;
672+
} c;
673+
c.aq = aq;
674+
return vinterleave_vm2_vm2(c.vm2);
675+
}
676+
677+
// Converts an internal vmask2 back to a vargquad; mirror of vcast_vm2_aq.
// The lanes are first rearranged with vuninterleave_vm2_vm2, then the bytes
// are reinterpreted as a vargquad through a union (assumes equal sizes -- TODO confirm).
static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
678+
union {
679+
vargquad aq;
680+
vmask2 vm2;
681+
} c;
682+
c.vm2 = vuninterleave_vm2_vm2(vm2);
683+
return c.aq;
684+
}
685+
686+
// Reduces the mask g to a single int: ORs its low and high 64-bit halves,
// takes the pairwise max of the two resulting 32-bit lanes, and returns the
// bitwise complement of that value.
// For a mask whose lanes are all-zeros the result is non-zero; if any lane is
// all-ones (0xFFFFFFFF) the result is 0.
// NOTE(review): a lane holding any other bit pattern would also yield a
// non-zero result, so this relies on g being a proper all-ones/all-zeros
// mask -- confirm that callers guarantee this.
static INLINE int vtestallzeros_i_vo64(vopmask g) {
687+
uint32x2_t x0 = vorr_u32(vget_low_u32(g), vget_high_u32(g));
688+
uint32x2_t x1 = vpmax_u32(x0, x0);
689+
return ~vget_lane_u32(x1, 0);
690+
}
691+
692+
// Bitwise select (vbslq_u32): each result bit comes from x where the
// corresponding bit of m is set, and from y where it is clear.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return vbslq_u32(m, x, y); }
693+
694+
// Lane-wise 64-bit integer subtraction x - y. The u32-typed vmask operands
// are reinterpreted as s64 vectors for vsubq_s64 and the result is cast back.
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
695+
return vreinterpretq_u32_s64(vsubq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
696+
}
697+
698+
// Lane-wise negation of x, treating it as a vector of signed 64-bit integers
// (vnegq_s64); the result is reinterpreted back to the u32-typed vmask.
static INLINE vmask vneg64_vm_vm(vmask x) {
699+
return vreinterpretq_u32_s64(vnegq_s64(vreinterpretq_s64_u32(x)));
700+
}
701+
702+
// Lane-wise signed 64-bit comparison x > y (vcgtq_s64). Each 64-bit lane of
// the result is all-ones where the comparison holds and all-zeros otherwise;
// the u64 result is reinterpreted as the u32-typed vopmask.
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
703+
return vreinterpretq_u32_u64(vcgtq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
704+
}
705+
706+
// Logical shift of each 64-bit lane of x by c bits: left (vshlq_n_u64) and
// right (vshrq_n_u64). Written as macros, presumably because the _n_
// intrinsics require c to be a compile-time constant.
#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
707+
#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))

src/arch/helperavx.h

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ typedef __m128i vint;
4848
typedef __m256 vfloat;
4949
typedef struct { __m128i x, y; } vint2;
5050

51+
// Pair of vmask vectors. The quad-precision helpers in this file pass and
// return it by value (e.g. vinterleave_vm2_vm2, vloadu_vm2_p, vcast_vm2_aq).
typedef struct {
52+
vmask x, y;
53+
} vmask2;
54+
5155
//
5256

5357
#ifndef __SLEEF_H__
@@ -552,3 +556,90 @@ static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat
552556
}
553557

554558
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
559+
560+
//
561+
562+
// Type used for quad-precision arguments/results on this backend.
// Sleef_quad4 is declared elsewhere; presumably a pack of four quad-precision
// values matching the four 64-bit elements of a vmask -- TODO confirm.
typedef Sleef_quad4 vargquad;
563+
564+
// Regroups the 64-bit elements of v.x and v.y, independently within each
// 128-bit half of the 256-bit vectors:
//   result.x = { v.x[0], v.y[0], v.x[2], v.y[2] }   (_mm256_unpacklo_pd)
//   result.y = { v.x[1], v.y[1], v.x[3], v.y[3] }   (_mm256_unpackhi_pd)
// The masks are reinterpreted as double vectors only to use the unpack
// instructions; no bits are changed.
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
565+
return (vmask2) {
566+
vreinterpret_vm_vd(_mm256_unpacklo_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))),
567+
vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) };
568+
}
569+
570+
// Inverse of vinterleave_vm2_vm2. The body is the same unpacklo/unpackhi
// pair: within each 128-bit half it is a 2x2 transpose of 64-bit elements,
// and applying a transpose twice restores the original order.
static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
571+
return (vmask2) {
572+
vreinterpret_vm_vd(_mm256_unpacklo_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))),
573+
vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) };
574+
}
575+
576+
// Loads a vmask2 from memory at p: x from p, y from p + sizeof(vmask) bytes.
// Both loads go through vloadu_vi2_p (defined elsewhere; presumably an
// unaligned load, as the "u" suggests -- TODO confirm).
// NOTE(review): declared plain `static`, without INLINE, unlike the
// neighbouring helpers -- confirm whether that is intentional.
static vmask2 vloadu_vm2_p(void *p) {
577+
vmask2 vm2 = {
578+
vcast_vm_vi2(vloadu_vi2_p((int32_t *)p)),
579+
vcast_vm_vi2(vloadu_vi2_p((int32_t *)((uint8_t *)p + sizeof(vmask))))
580+
};
581+
return vm2;
582+
}
583+
584+
// Stores a vmask2 to memory at p: x at p, y at p + sizeof(vmask) bytes.
// Both stores go through vstoreu_v_p_vi2 (defined elsewhere; presumably an
// unaligned store -- TODO confirm).
// NOTE(review): declared plain `static`, without INLINE, unlike the
// neighbouring helpers -- confirm whether that is intentional.
static void vstoreu_v_p_vm2(void *p, vmask2 vm2) {
585+
vstoreu_v_p_vi2((int32_t *)p, vcast_vi2_vm(vm2.x));
586+
vstoreu_v_p_vi2((int32_t *)((uint8_t *)p + sizeof(vmask)), vcast_vi2_vm(vm2.y));
587+
}
588+
589+
// Converts a vargquad argument to the internal vmask2 form, then rearranges
// its 64-bit elements with vinterleave_vm2_vm2.
// Non-MSVC compilers reinterpret the bytes through a union (assumes both
// types have the same size -- TODO confirm). MSVC instead reloads the
// argument from memory with vloadu_vm2_p.
// NOTE(review): the reason for the MSVC-specific path is not visible here;
// presumably a limitation with unions holding vector types -- confirm.
static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
590+
#if !defined(_MSC_VER)
591+
union {
592+
vargquad aq;
593+
vmask2 vm2;
594+
} c;
595+
c.aq = aq;
596+
return vinterleave_vm2_vm2(c.vm2);
597+
#else
598+
return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));
599+
#endif
600+
}
601+
602+
// Converts an internal vmask2 back to a vargquad; mirror of vcast_vm2_aq.
// The elements are first rearranged with vuninterleave_vm2_vm2. Non-MSVC
// compilers then reinterpret the bytes through a union (assumes equal
// sizes -- TODO confirm); MSVC writes them into a local vargquad with
// vstoreu_v_p_vm2 and returns that.
static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
603+
#if !defined(_MSC_VER)
604+
union {
605+
vargquad aq;
606+
vmask2 vm2;
607+
} c;
608+
c.vm2 = vuninterleave_vm2_vm2(vm2);
609+
return c.aq;
610+
#else
611+
vargquad a;
612+
vstoreu_v_p_vm2(&a, vuninterleave_vm2_vm2(vm2));
613+
return a;
614+
#endif
615+
}
616+
617+
// Returns 1 when no byte of g has its most-significant bit set, 0 otherwise.
// The two 128-bit halves of g are ORed together and _mm_movemask_epi8
// collects the top bit of each byte; the result is compared against 0.
static INLINE int vtestallzeros_i_vo64(vopmask g) {
618+
return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
619+
}
620+
621+
// Per-64-bit-element select via _mm256_blendv_pd: an element is taken from x
// where the top (sign) bit of the corresponding element of o is set, and
// from y otherwise. Note the operand order passed to blendv is (y, x, o).
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
622+
return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o)));
623+
}
624+
625+
// Element-wise 64-bit integer subtraction x - y.
// Each 256-bit operand is split into its two 128-bit halves, the halves are
// subtracted with _mm_sub_epi64, and the results are reassembled.
// NOTE(review): presumably done per half because plain AVX provides no
// 256-bit integer subtract -- confirm.
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
626+
__m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
627+
__m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
628+
vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl));
629+
return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1);
630+
}
631+
632+
// Element-wise 64-bit negation, computed as 0 - x with vsub64_vm_vm_vm.
static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); }
633+
// Element-wise signed 64-bit comparison x > y.
// Each operand is split into 128-bit halves, the halves are compared with
// _mm_cmpgt_epi64, and the two results are reassembled into one 256-bit mask.
// Each 64-bit element of the result is all-ones where x > y, else all-zeros.
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
634+
__m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
635+
__m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
636+
vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl));
637+
return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1);
638+
}
639+
640+
// Logical shift of each 64-bit element of x by c bits: left (vsll64, via
// _mm_slli_epi64) and right (vsrl64, via _mm_srli_epi64). Each 128-bit half
// is shifted separately and the halves are recombined. The argument x is
// expanded twice in each macro, so it should be free of side effects.
#define vsll64_vm_vm_i(x, c) \
641+
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \
642+
_mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
643+
#define vsrl64_vm_vm_i(x, c) \
644+
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \
645+
_mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)

0 commit comments

Comments
 (0)