Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## Next Release
### Added
- SVE target support is added to libm.
https://github.com/shibatch/sleef/pull/180
- SVE target support is added to DFT. With this patch, DFT operations
can be carried out using 256, 512, 1024 and 2048-bit wide vectors
according to runtime availability of vector registers and operators.
https://github.com/shibatch/sleef/pull/182
## 3.2 - 2018-02-26
### Added
- The whole build system of the project migrated from makefiles to
Expand Down
120 changes: 112 additions & 8 deletions src/arch/helpersve.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,67 @@

#include "misc.h"

#define ENABLE_SP
#if defined(VECTLENDP) || defined(VECTLENSP)
#error VECTLENDP or VECTLENSP already defined
#endif

#if CONFIG == 1
// Vector length agnostic
#define VECTLENSP (svcntw())
#define VECTLENDP (svcntd())
#define ISANAME "AArch64 SVE"
#define ptrue svptrue_b8()
#elif CONFIG == 8
// 256-bit vector length
#define ISANAME "AArch64 SVE 256-bit"
#define LOG2VECTLENDP 2
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is misleading to me. LOG2VECTLENDP is not the log-size of the vectors, but of the partial vectors you are using. I think you should add a comment saying that these VLENs are used for the DFT of the partial vectors.

#define ptrue svptrue_pat_b8(SV_VL32)
#define DFTPRIORITY 20
#elif CONFIG == 9
// 512-bit vector length
#define ISANAME "AArch64 SVE 512-bit"
#define LOG2VECTLENDP 3
#define ptrue svptrue_pat_b8(SV_VL64)
#define DFTPRIORITY 21
#elif CONFIG == 10
// 1024-bit vector length
#define ISANAME "AArch64 SVE 1024-bit"
#define LOG2VECTLENDP 4
#define ptrue svptrue_pat_b8(SV_VL128)
#define DFTPRIORITY 22
#elif CONFIG == 11
// 2048-bit vector length
#define ISANAME "AArch64 SVE 2048-bit"
#define LOG2VECTLENDP 5
#define ptrue svptrue_pat_b8(SV_VL256)
#define DFTPRIORITY 23
#else
#error CONFIG macro invalid or not defined
#endif

#ifdef LOG2VECTLENDP
// For DFT, VECTLENDP and VECTLENSP are not the size of the available
// vector length, but the size of the partial vectors utilized in the
// computation. The appropriate VECTLENDP and VECTLENSP are chosen by
// the dispatcher according to the value of svcntd().

#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENDP (1 << LOG2VECTLENDP)
#define VECTLENSP (1 << LOG2VECTLENSP)
static INLINE int vavailability_i(int name) { return svcntd() >= VECTLENDP ? 3 : 0; }
#else
static INLINE int vavailability_i(int name) { return 3; }
#endif

#define ENABLE_SP
#define ENABLE_FMA_SP

#define ENABLE_DP
#define VECTLENDP (svcntd())
#define ENABLE_FMA_DP

#define FULL_FP_ROUNDING
#define ACCURATE_SQRT

#define ISANAME "AArch64 SVE"

// Mask definition
typedef svint32_t vmask;
typedef svbool_t vopmask;
Expand All @@ -40,13 +88,9 @@ typedef svfloat64_t vdouble;
typedef svint32_t vint;

// masking predicates
#define ptrue svptrue_b8()
#define ALL_TRUE_MASK svdup_n_s32(0xffffffff)
#define ALL_FALSE_MASK svdup_n_s32(0x0)

#define DFTPRIORITY 10

static INLINE int vavailability_i(int name) { return 3; }
static INLINE void vprefetch_v_p(const void *ptr) {}

//
Expand Down Expand Up @@ -375,6 +419,9 @@ static INLINE vmask vcast_vm_i_i(int i0, int i1) {
/*********************************/

// Vector load/store
static INLINE vdouble vload_vd_p(const double *ptr) {
return svld1_f64(ptrue, ptr);
}
static INLINE vdouble vloadu_vd_p(const double *ptr) {
return svld1_f64(ptrue, ptr);
}
Expand Down Expand Up @@ -609,3 +656,60 @@ static INLINE vfloat vrev21_vf_vf(vfloat vf) {
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);
}


// Operations for DFT

static INLINE vdouble vposneg_vd_vd(vdouble d) {
return svneg_f64_m(d, svdupq_n_b64(false, true), d);
}

static INLINE vdouble vnegpos_vd_vd(vdouble d) {
return svneg_f64_m(d, svdupq_n_b64(true, false), d);
}

static INLINE vfloat vposneg_vf_vf(vfloat d) {
return svneg_f32_m(d, svdupq_n_b32(false, true, false, true), d);
}

static INLINE vfloat vnegpos_vf_vf(vfloat d) {
return svneg_f32_m(d, svdupq_n_b32(true, false, true, false), d);
}

static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfma_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfma_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }

//

static INLINE vdouble vrev21_vd_vd(vdouble x) { return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x)); }

static INLINE vdouble vreva2_vd_vd(vdouble vd) {
svint64_t x = svindex_s64((VECTLENDP-1), -1);
x = svzip1_s64(svuzp2_s64(x, x), svuzp1_s64(x, x));
return svtbl_f64(vd, svreinterpret_u64_s64(x));
}

static INLINE vfloat vreva2_vf_vf(vfloat vf) {
svint32_t x = svindex_s32((VECTLENSP-1), -1);
x = svzip1_s32(svuzp2_s32(x, x), svuzp1_s32(x, x));
return svtbl_f32(vf, svreinterpret_u32_s32(x));
}

//

static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
svst1_scatter_u64index_f64(ptrue, ptr + offset*2, svzip1_u64(svindex_u64(0, step*2), svindex_u64(1, step*2)), v);
}

static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
svst1_scatter_u32index_f32(ptrue, ptr + offset*2, svzip1_u32(svindex_u32(0, step*2), svindex_u32(1, step*2)), v);
}

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { vstoreu_v_p_vd(ptr, v); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vstoreu_v_p_vf(ptr, v); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
2 changes: 2 additions & 0 deletions src/common/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ typedef struct {
#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)

#define INLINE __attribute__((always_inline))
#define RESTRICT __restrict__

#ifndef __INTEL_COMPILER
#define CONST const
Expand Down Expand Up @@ -220,6 +221,7 @@ typedef struct {

#define INLINE __forceinline
#define CONST
#define RESTRICT

#ifndef SLEEF_STATIC_LIBS
#define EXPORT __declspec(dllexport)
Expand Down
4 changes: 3 additions & 1 deletion src/dft-tester/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ set(COMMON_TARGET_PROPERTIES
#

function(add_test_dft TESTNAME)
if (NOT EMULATOR AND NOT SDE_COMMAND)
if (ARMIE_COMMAND)
add_test(NAME ${TESTNAME} COMMAND ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS} ${ARGN})
elseif (NOT EMULATOR AND NOT SDE_COMMAND)
add_test(NAME ${TESTNAME} COMMAND ${ARGN})
elseif(NOT EMULATOR)
add_test(NAME ${TESTNAME} COMMAND ${SDE_COMMAND} "--" ${ARGN})
Expand Down
27 changes: 26 additions & 1 deletion src/dft/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Option MAXBUTWIDTH

set(MAXBUTWIDTH 4 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
if (COMPILER_SUPPORTS_SVE)
set(MAXBUTWIDTH 6 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
else()
set(MAXBUTWIDTH 4 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
endif()

# Compiler properties

Expand Down Expand Up @@ -77,6 +81,22 @@ set(MACRODEF_advsimdsp BASETYPEID=2 ENABLE_ADVSIMD CONFIG=1)
set(CFLAGS_advsimdsp ${FLAGS_ENABLE_ADVSIMD})
set(MACRODEF_neon32sp BASETYPEID=2 ENABLE_NEON32 CONFIG=1)
set(CFLAGS_neon32sp ${FLAGS_ENABLE_NEON32})
set(MACRODEF_sve256dp BASETYPEID=1 ENABLE_SVE CONFIG=8)
set(CFLAGS_sve256dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve256sp BASETYPEID=2 ENABLE_SVE CONFIG=8)
set(CFLAGS_sve256sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve512dp BASETYPEID=1 ENABLE_SVE CONFIG=9)
set(CFLAGS_sve512dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve512sp BASETYPEID=2 ENABLE_SVE CONFIG=9)
set(CFLAGS_sve512sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve1024dp BASETYPEID=1 ENABLE_SVE CONFIG=10)
set(CFLAGS_sve1024dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve1024sp BASETYPEID=2 ENABLE_SVE CONFIG=10)
set(CFLAGS_sve1024sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve2048dp BASETYPEID=1 ENABLE_SVE CONFIG=11)
set(CFLAGS_sve2048dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve2048sp BASETYPEID=2 ENABLE_SVE CONFIG=11)
set(CFLAGS_sve2048sp ${FLAGS_ENABLE_SVE})

# List all available scalar data types

Expand Down Expand Up @@ -132,6 +152,11 @@ if (COMPILER_SUPPORTS_ADVSIMD)
set(ISALIST_DP ${ISALIST_DP} advsimddp)
endif(COMPILER_SUPPORTS_ADVSIMD)

if (COMPILER_SUPPORTS_SVE)
set(ISALIST_SP ${ISALIST_SP} sve256sp sve512sp sve1024sp sve2048sp)
set(ISALIST_DP ${ISALIST_DP} sve256dp sve512dp sve1024dp sve2048dp)
endif(COMPILER_SUPPORTS_SVE)

if (COMPILER_SUPPORTS_NEON32)
set(ISALIST_SP ${ISALIST_SP} neon32sp)
endif(COMPILER_SUPPORTS_NEON32)
Expand Down
Loading