Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## Next Release
### Added
- SVE target support is added to libm.
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please mention these two items under section ### Added, as requested by the "keep a changelog" format

https://github.com/shibatch/sleef/pull/180
- SVE target support is added to DFT. With this patch, DFT operations
can be carried out using 256, 512, 1024 and 2048-bit wide vectors
according to runtime availability of vector registers and operators.
https://github.com/shibatch/sleef/pull/182
## 3.2 - 2018-02-26
### Added
- The whole build system of the project migrated from makefiles to
Expand Down
115 changes: 107 additions & 8 deletions src/arch/helpersve.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,58 @@

#include "misc.h"

#define ENABLE_SP
#if CONFIG == 1
// Vector length agnostic
#define VECTLENSP (svcntw())
#define VECTLENDP (svcntd())
#define ISANAME "AArch64 SVE"
#define ptrue svptrue_b8()
#elif CONFIG == 8
// 256-bit vector length
#define ISANAME "AArch64 SVE 256-bit"
#define LOG2VECTLENDP 2
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is misleading to me. LOG2VECTLENDP is not the log-size of the vectors, but of the partial vectors you are using. I think you should add a comment saying that these VLENs are used for the DFT of the partial vectors.

#define ptrue svptrue_pat_b8(SV_VL32)
#define DFTPRIORITY 20
#elif CONFIG == 9
// 512-bit vector length
#define ISANAME "AArch64 SVE 512-bit"
#define LOG2VECTLENDP 3
#define ptrue svptrue_pat_b8(SV_VL64)
#define DFTPRIORITY 21
#elif CONFIG == 10
// 1024-bit vector length
#define ISANAME "AArch64 SVE 1024-bit"
#define LOG2VECTLENDP 4
#define ptrue svptrue_pat_b8(SV_VL128)
#define DFTPRIORITY 22
#elif CONFIG == 11
// 2048-bit vector length
#define ISANAME "AArch64 SVE 2048-bit"
#define LOG2VECTLENDP 5
#define ptrue svptrue_pat_b8(SV_VL256)
#define DFTPRIORITY 23
#else
#error CONFIG macro invalid or not defined
#endif

#ifdef LOG2VECTLENDP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENDP (1 << LOG2VECTLENDP)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also want to make sure that the VECTLENDP = svcntd() and VECTLENSP = svcntw() definitions don't get overwritten when building libsleef and libsleefgnuabi. Could you please raise an #error at this point if either VECTLENDP or VECTLENSP is already defined?

#define VECTLENSP (1 << LOG2VECTLENSP)
// Fixed-width CONFIGs: report availability (3) only when the hardware SVE
// register holds at least VECTLENDP doubles (svcntd() = lanes per register);
// otherwise 0 so the dispatcher skips this sub-library at runtime.
static INLINE int vavailability_i(int name) { return svcntd() >= VECTLENDP ? 3 : 0; }
#else
// Vector-length-agnostic CONFIG: unconditionally available (3).
static INLINE int vavailability_i(int name) { return 3; }
#endif

#define ENABLE_SP
#define ENABLE_FMA_SP

#define ENABLE_DP
#define VECTLENDP (svcntd())
#define ENABLE_FMA_DP

#define FULL_FP_ROUNDING
#define ACCURATE_SQRT

#define ISANAME "AArch64 SVE"

// Mask definition
typedef svint32_t vmask;
typedef svbool_t vopmask;
Expand All @@ -40,13 +79,9 @@ typedef svfloat64_t vdouble;
typedef svint32_t vint;

// masking predicates
#define ptrue svptrue_b8()
#define ALL_TRUE_MASK svdup_n_s32(0xffffffff)
#define ALL_FALSE_MASK svdup_n_s32(0x0)

#define DFTPRIORITY 10

static INLINE int vavailability_i(int name) { return 3; }
// Prefetch hint: intentionally a no-op in the SVE helper (no prefetch
// intrinsic is issued here).
static INLINE void vprefetch_v_p(const void *ptr) {}

//
Expand Down Expand Up @@ -375,6 +410,9 @@ static INLINE vmask vcast_vm_i_i(int i0, int i1) {
/*********************************/

// Vector load/store
// Aligned vector load of doubles. SVE's svld1 has no alignment-qualified
// variant, so this is identical to the unaligned load below.
static INLINE vdouble vload_vd_p(const double *ptr) {
return svld1_f64(ptrue, ptr);
}
// Unaligned vector load of one vector's worth of doubles starting at ptr.
static INLINE vdouble vloadu_vd_p(const double *ptr) {
return svld1_f64(ptrue, ptr);
}
Expand Down Expand Up @@ -609,3 +647,64 @@ static INLINE vfloat vrev21_vf_vf(vfloat vf) {
// Lane-wise 32-bit integer equality. Returns ALL_TRUE_MASK (0xffffffff) in
// lanes where x == y and ALL_FALSE_MASK (0) elsewhere, i.e. the predicate
// result materialized as an integer mask vector.
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);
}


// Operations for DFT

static INLINE vdouble vposneg_vd_vd(vdouble d) {
vmask pnmask = svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svindex_u64(0, 1), 63));
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand you are creating a positive/negative pattern on even/odd lanes here. Any chance you could avoid using vmask for these operations (and the v*subadd operations) and use native predication by building the repeated predicate patterns with DUPQ?

See 6.21.4.4 of https://static.docs.arm.com/100987/0000/acle_sve_100987_0000_00_en.pdf

Something like:

vsubadd (x,y) = vadd(ptrue, vadd(dupq(true,false), x, y),
                            vsub(dupq(false,true), x, y)

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's not a good idea since it involves three (or two) FP add operations, and FP operations are considered to be expensive and slow. Each of those operations is also dependent on the output of the previous instruction. We assume that the ALUs for the unmasked elements are not used, but that might not be the case since power-gating may take some time to kick in.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I came up with a good way to remove vmask without additional FP operation.
This should reduce register pressure.

static INLINE vdouble vposneg_vd_vd(vdouble d) {
  return svneg_f64_m(d, svdupq_n_b64(false, true), d);
}

return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), pnmask));
}

// Flip the sign of even-indexed lanes: (-d0, d1, -d2, d3, ...).
// svindex_u64(1, 1) yields 1,2,3,...; shifting left by 63 leaves only the
// sign bit of the odd values, i.e. lanes 0,2,4,..., which the XOR negates.
static INLINE vdouble vnegpos_vd_vd(vdouble d) {
vmask pnmask = svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svindex_u64(1, 1), 63));
return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), pnmask));
}

// Flip the sign of odd-indexed float lanes: (d0, -d1, d2, -d3, ...).
// svindex_u32(0, 1) yields 0,1,2,...; shifted left by 31, only the odd
// values keep their low bit as the sign bit, so odd lanes get negated.
static INLINE vfloat vposneg_vf_vf(vfloat d) {
vmask pnmask = svreinterpret_s32_u32(svlsl_n_u32_x(ptrue, svindex_u32(0, 1), 31));
return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), pnmask));
}

// Flip the sign of even-indexed float lanes: (-d0, d1, -d2, d3, ...).
// Mirror of vposneg_vf_vf with the index sequence starting at 1, which
// places the sign bit in lanes 0,2,4,... after the shift by 31.
static INLINE vfloat vnegpos_vf_vf(vfloat d) {
vmask pnmask = svreinterpret_s32_u32(svlsl_n_u32_x(ptrue, svindex_u32(1, 1), 31));
return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), pnmask));
}

// Alternating subtract/add used by the DFT butterflies:
//   vsubadd(x, y)       = (x0-y0, x1+y1, x2-y2, ...)   via x + negpos(y)
//   vmlsubadd(x, y, z)  = x*y + (-z0, z1, -z2, ...)    fused multiply variant
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfma_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfma_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }

//

// Swap the two elements of every lane pair: (x1, x0, x3, x2, ...).
// uzp2 gathers the odd lanes, uzp1 the even lanes; zip1 re-interleaves them
// in swapped order.
static INLINE vdouble vrev21_vd_vd(vdouble x) { return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x)); }

// Reverse the order of the lane PAIRS while keeping the order inside each
// pair, e.g. for 4 lanes: (x2, x3, x0, x1). The descending index sequence
// (VECTLENDP-1 .. 0) is pair-swapped by the zip/uzp step, producing the
// table-lookup indices (2, 3, 0, 1, ...) for svtbl.
static INLINE vdouble vreva2_vd_vd(vdouble vd) {
svint64_t x = svindex_s64((VECTLENDP-1), -1);
x = svzip1_s64(svuzp2_s64(x, x), svuzp1_s64(x, x));
return svtbl_f64(vd, svreinterpret_u64_s64(x));
}

// Single-precision counterpart of vreva2_vd_vd: reverses the order of lane
// pairs (keeping in-pair order) using the same pair-swapped descending
// index sequence fed to svtbl.
static INLINE vfloat vreva2_vf_vf(vfloat vf) {
svint32_t x = svindex_s32((VECTLENSP-1), -1);
x = svzip1_s32(svuzp2_s32(x, x), svuzp1_s32(x, x));
return svtbl_f32(vf, svreinterpret_u32_s32(x));
}

//

// Scatter-store lane pairs: pair i, i.e. (v[2i], v[2i+1]), is written to
// ptr + (offset + step*i)*2. The index vector interleaves the sequences
// (0, 2*step, 4*step, ...) and (1, 2*step+1, ...) to address both elements
// of each pair.
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
svst1_scatter_u64index_f64(ptrue, ptr + offset*2, svzip1_u64(svindex_u64(0, step*2), svindex_u64(1, step*2)), v);
}

// Single-precision counterpart of vscatter2_v_p_i_i_vd: pair i of v is
// stored at ptr + (offset + step*i)*2 via an interleaved index scatter.
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
svst1_scatter_u32index_f32(ptrue, ptr + offset*2, svzip1_u32(svindex_u32(0, step*2), svindex_u32(1, step*2)), v);
}

// Aligned-store, streaming-store and streaming-scatter entry points required
// by the DFT framework. The SVE path provides no distinct aligned or
// non-temporal forms, so each simply forwards to the plain variant.
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { vstoreu_v_p_vd(ptr, v); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vstoreu_v_p_vf(ptr, v); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
2 changes: 2 additions & 0 deletions src/common/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ typedef struct {
#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)

#define INLINE __attribute__((always_inline))
#define RESTRICT __restrict__

#ifndef __INTEL_COMPILER
#define CONST const
Expand Down Expand Up @@ -220,6 +221,7 @@ typedef struct {

#define INLINE __forceinline
#define CONST
#define RESTRICT

#ifndef SLEEF_STATIC_LIBS
#define EXPORT __declspec(dllexport)
Expand Down
4 changes: 3 additions & 1 deletion src/dft-tester/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ set(COMMON_TARGET_PROPERTIES
#

function(add_test_dft TESTNAME)
if (NOT EMULATOR AND NOT SDE_COMMAND)
if (ARMIE_COMMAND)
add_test(NAME ${TESTNAME} COMMAND ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS} ${ARGN})
elseif (NOT EMULATOR AND NOT SDE_COMMAND)
add_test(NAME ${TESTNAME} COMMAND ${ARGN})
elseif(NOT EMULATOR)
add_test(NAME ${TESTNAME} COMMAND ${SDE_COMMAND} "--" ${ARGN})
Expand Down
27 changes: 26 additions & 1 deletion src/dft/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Option MAXBUTWIDTH

set(MAXBUTWIDTH 4 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
# MAXBUTWIDTH is log2 of the largest butterfly the DFT kernels generate.
# SVE builds default to wider butterflies (6) than other targets (4).
# Compute the default once so the cache declaration appears a single time.
if (COMPILER_SUPPORTS_SVE)
  set(_dft_maxbutwidth_default 6)
else()
  set(_dft_maxbutwidth_default 4)
endif()
set(MAXBUTWIDTH ${_dft_maxbutwidth_default} CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
unset(_dft_maxbutwidth_default)

# Compiler properties

Expand Down Expand Up @@ -77,6 +81,22 @@ set(MACRODEF_advsimdsp BASETYPEID=2 ENABLE_ADVSIMD CONFIG=1)
set(CFLAGS_advsimdsp ${FLAGS_ENABLE_ADVSIMD})
set(MACRODEF_neon32sp BASETYPEID=2 ENABLE_NEON32 CONFIG=1)
set(CFLAGS_neon32sp ${FLAGS_ENABLE_NEON32})
# Declare the fixed-width SVE DFT sub-libraries. Each vector width maps to
# a CONFIG value understood by helpersve.h (8/9/10/11 select 256/512/1024/
# 2048-bit vectors); dp and sp variants differ only in BASETYPEID.
foreach(_sve_entry "256;8" "512;9" "1024;10" "2048;11")
  list(GET _sve_entry 0 _sve_width)
  list(GET _sve_entry 1 _sve_config)
  set(MACRODEF_sve${_sve_width}dp BASETYPEID=1 ENABLE_SVE CONFIG=${_sve_config})
  set(CFLAGS_sve${_sve_width}dp ${FLAGS_ENABLE_SVE})
  set(MACRODEF_sve${_sve_width}sp BASETYPEID=2 ENABLE_SVE CONFIG=${_sve_config})
  set(CFLAGS_sve${_sve_width}sp ${FLAGS_ENABLE_SVE})
endforeach()
unset(_sve_entry)
unset(_sve_width)
unset(_sve_config)

# List all available scalar data types

Expand Down Expand Up @@ -132,6 +152,11 @@ if (COMPILER_SUPPORTS_ADVSIMD)
set(ISALIST_DP ${ISALIST_DP} advsimddp)
endif(COMPILER_SUPPORTS_ADVSIMD)

# Make every fixed-width SVE variant selectable for both precisions.
if (COMPILER_SUPPORTS_SVE)
  list(APPEND ISALIST_SP sve256sp sve512sp sve1024sp sve2048sp)
  list(APPEND ISALIST_DP sve256dp sve512dp sve1024dp sve2048dp)
endif(COMPILER_SUPPORTS_SVE)

if (COMPILER_SUPPORTS_NEON32)
set(ISALIST_SP ${ISALIST_SP} neon32sp)
endif(COMPILER_SUPPORTS_NEON32)
Expand Down
Loading