Skip to content

Commit 832cfd9

Browse files
committed
dsp(simd,resampler): optimize RTL widen, FIR scratch paths, and resampler layout
1 parent 47a70e4 commit 832cfd9

16 files changed

Lines changed: 1248 additions & 329 deletions

include/dsd-neo/dsp/demod_state.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,8 @@ struct demod_state {
8383
dsd_thread_t thread;
8484
float* lowpassed;
8585
double squelch_running_power;
86-
float* resamp_taps; /* normalized taps, length = K*L */
87-
float* resamp_hist; /* circular history, length = K */
86+
float* resamp_taps; /* normalized taps as L contiguous phase blocks, length = K*L */
87+
float* resamp_hist; /* mirrored history window, length = 2*K */
8888
int (*discriminator)(int, int, int, int);
8989
void (*mode_demod)(struct demod_state*);
9090
struct output_state* output_target;
@@ -167,7 +167,7 @@ struct demod_state {
167167
int resamp_phase; /* 0..L-1 accumulator */
168168
int resamp_taps_len; /* prototype taps length (padded to K*L) */
169169
int resamp_taps_per_phase; /* K = ceil(taps_len/L) */
170-
int resamp_hist_head; /* head index into circular history [0..K-1] */
170+
int resamp_hist_head; /* next write index into base history window [0..K-1] */
171171

172172
/* Legacy FM FLL state (for non-CQPSK FM/C4FM paths).
173173
* Used by fll_update_error() and fll_mix_and_update() in demod_pipeline.cpp.

include/dsd-neo/dsp/resampler.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,10 @@ struct demod_state;
2323
/**
2424
* @brief Design windowed-sinc low-pass prototype for polyphase upfirdn (runs at L*Fs_in).
2525
*
26-
* Taps are stored phase-major with stride L (k*L + phase). The function allocates
27-
* aligned storage for taps and history inside the provided demod_state and
28-
* initializes the resampler bookkeeping fields.
26+
* Taps are stored as contiguous per-phase blocks with oldest-to-newest sample
27+
* order inside each block. The function allocates aligned storage for taps and
28+
* mirrored history inside the provided demod_state and initializes the
29+
* resampler bookkeeping fields.
2930
*
3031
* @param s Demodulator state to receive resampler taps/history.
3132
* @param L Upsampling factor.

src/dsp/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@ target_sources(dsd-neo_dsp PRIVATE
2525
# x86-64 SIMD sources with arch-specific flags
2626
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64")
2727
target_sources(dsd-neo_dsp PRIVATE simd_fir_sse2.cpp)
28+
target_sources(dsd-neo_dsp PRIVATE simd_widen_sse2.cpp)
2829
# Explicit SSE2 flag for clarity and cross-compilation safety
2930
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
3031
set_source_files_properties(simd_fir_sse2.cpp PROPERTIES COMPILE_FLAGS "-msse2")
32+
set_source_files_properties(simd_widen_sse2.cpp PROPERTIES COMPILE_FLAGS "-msse2")
3133
endif()
3234

3335
# AVX2+FMA needs explicit flag; add source only if the compiler accepts it
@@ -46,14 +48,17 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64")
4648
endif()
4749
if(_avx2_flags)
4850
target_sources(dsd-neo_dsp PRIVATE simd_fir_avx2.cpp)
51+
target_sources(dsd-neo_dsp PRIVATE simd_widen_avx2.cpp)
4952
set_source_files_properties(simd_fir_avx2.cpp PROPERTIES COMPILE_FLAGS "${_avx2_flags}")
53+
set_source_files_properties(simd_widen_avx2.cpp PROPERTIES COMPILE_FLAGS "${_avx2_flags}")
5054
target_compile_definitions(dsd-neo_dsp PRIVATE $<$<COMPILE_LANGUAGE:CXX>:DSD_NEO_DSP_HAVE_AVX2_IMPL=1>)
5155
endif()
5256
endif()
5357

5458
# ARM64 NEON (always available on AArch64)
5559
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
5660
target_sources(dsd-neo_dsp PRIVATE simd_fir_neon.cpp)
61+
target_sources(dsd-neo_dsp PRIVATE simd_widen_neon.cpp)
5762
endif()
5863

5964
target_include_directories(dsd-neo_dsp

src/dsp/resampler.cpp

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#endif
3636

3737
static const double kPi = 3.14159265358979323846;
38+
static const int kDefaultTapsPerPhase = 16;
3839

3940
template <typename T>
4041
static inline T*
@@ -64,9 +65,37 @@ dsd_neo_sinc(double x) {
6465
return sin(kPi * x) / (kPi * x);
6566
}
6667

68+
static inline float
69+
resamp_dot_contiguous(const float* DSD_NEO_RESTRICT samples, const float* DSD_NEO_RESTRICT taps, int len) {
70+
float acc = 0.0f;
71+
DSD_NEO_IVDEP
72+
for (int k = 0; k < len; k++) {
73+
acc += samples[k] * taps[k];
74+
}
75+
return acc;
76+
}
77+
78+
static inline float
79+
resamp_dot16_contiguous(const float* DSD_NEO_RESTRICT samples, const float* DSD_NEO_RESTRICT taps) {
80+
float acc0 = 0.0f;
81+
float acc1 = 0.0f;
82+
float acc2 = 0.0f;
83+
float acc3 = 0.0f;
84+
85+
DSD_NEO_IVDEP
86+
for (int k = 0; k < 16; k += 4) {
87+
acc0 += samples[k + 0] * taps[k + 0];
88+
acc1 += samples[k + 1] * taps[k + 1];
89+
acc2 += samples[k + 2] * taps[k + 2];
90+
acc3 += samples[k + 3] * taps[k + 3];
91+
}
92+
93+
return (acc0 + acc1) + (acc2 + acc3);
94+
}
95+
6796
void
6897
resamp_design(struct demod_state* s, int L, int M) {
69-
int taps_per_phase = 16; /* K */
98+
int taps_per_phase = kDefaultTapsPerPhase; /* K */
7099
if (taps_per_phase < 8) {
71100
taps_per_phase = 8;
72101
}
@@ -92,22 +121,22 @@ resamp_design(struct demod_state* s, int L, int M) {
92121
s->resamp_taps = (float*)mem_ptr;
93122
}
94123
{
95-
void* mem_ptr = dsd_neo_aligned_malloc((size_t)taps_per_phase * sizeof(float));
124+
void* mem_ptr = dsd_neo_aligned_malloc((size_t)taps_per_phase * 2U * sizeof(float));
96125
s->resamp_hist = (float*)mem_ptr;
97126
}
98127
if (!s->resamp_taps || !s->resamp_hist) {
99128
if (s->resamp_taps) {
100-
free(s->resamp_taps);
129+
dsd_neo_aligned_free(s->resamp_taps);
101130
s->resamp_taps = NULL;
102131
}
103132
if (s->resamp_hist) {
104-
free(s->resamp_hist);
133+
dsd_neo_aligned_free(s->resamp_hist);
105134
s->resamp_hist = NULL;
106135
}
107136
s->resamp_enabled = 0;
108137
return;
109138
}
110-
memset(s->resamp_hist, 0, (size_t)taps_per_phase * sizeof(float));
139+
memset(s->resamp_hist, 0, (size_t)taps_per_phase * 2U * sizeof(float));
111140
s->resamp_hist_head = 0;
112141

113142
double gain = 0.0;
@@ -122,12 +151,16 @@ resamp_design(struct demod_state* s, int L, int M) {
122151
}
123152

124153
const double phase_gain_comp = (double)L;
125-
for (int n = 0; n < N; n++) {
126-
int m = n - mid;
127-
double w = 0.54 - 0.46 * cos(2.0 * kPi * (double)n / (double)(N - 1));
128-
double h = 2.0 * fc * dsd_neo_sinc(2.0 * fc * (double)m);
129-
double t = (h * w / gain) * phase_gain_comp;
130-
s->resamp_taps[n] = (float)t;
154+
for (int phase = 0; phase < L; phase++) {
155+
float* phase_taps = s->resamp_taps + (size_t)phase * (size_t)taps_per_phase;
156+
for (int k = 0; k < taps_per_phase; k++) {
157+
int src_index = phase + ((taps_per_phase - 1 - k) * L);
158+
int m = src_index - mid;
159+
double w = 0.54 - 0.46 * cos(2.0 * kPi * (double)src_index / (double)(N - 1));
160+
double h = 2.0 * fc * dsd_neo_sinc(2.0 * fc * (double)m);
161+
double t = (h * w / gain) * phase_gain_comp;
162+
phase_taps[k] = (float)t;
163+
}
131164
}
132165

133166
s->resamp_L = L;
@@ -159,26 +192,17 @@ resamp_process_block(struct demod_state* s, const float* DSD_NEO_RESTRICT in, in
159192

160193
for (int n = 0; n < in_len; n++) {
161194
hist[head] = in_al[n];
195+
hist[head + K] = in_al[n];
162196
head++;
163197
if (head == K) {
164198
head = 0;
165199
}
166200
int local_phase = phase;
201+
const float* DSD_NEO_RESTRICT hist_window = hist + head;
167202
while (local_phase < L) {
168-
float acc = 0.0f;
169-
const float* DSD_NEO_RESTRICT tk = taps_al + local_phase;
170-
int idx = head - 1;
171-
if (idx < 0) {
172-
idx += K;
173-
}
174-
for (int k = 0; k < K; k++) {
175-
acc += hist[idx] * tk[0];
176-
tk += L;
177-
idx--;
178-
if (idx < 0) {
179-
idx += K;
180-
}
181-
}
203+
const float* DSD_NEO_RESTRICT phase_taps = taps_al + (size_t)local_phase * (size_t)K;
204+
float acc = (K == kDefaultTapsPerPhase) ? resamp_dot16_contiguous(hist_window, phase_taps)
205+
: resamp_dot_contiguous(hist_window, phase_taps, K);
182206
out_al[out_len++] = acc;
183207
local_phase += M;
184208
}

src/dsp/simd_fir.cpp

Lines changed: 2 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,7 @@
1616
#include <atomic>
1717
#include <cstring>
1818

19-
/* Platform-specific CPU feature detection for optional AVX2 dispatch */
20-
#if defined(__x86_64__) || defined(_M_X64)
21-
#if defined(DSD_NEO_DSP_HAVE_AVX2_IMPL)
22-
#if defined(_MSC_VER)
23-
#include <intrin.h>
24-
#else
25-
#include <cpuid.h>
26-
27-
/* Use inline assembly for _xgetbv to avoid target-specific option issues */
28-
static inline unsigned long long
29-
dsd_xgetbv(unsigned int xcr) {
30-
unsigned int eax, edx;
31-
__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr));
32-
return ((unsigned long long)edx << 32) | eax;
33-
}
34-
#endif
35-
#endif
36-
#endif
19+
#include "simd_x86_cpu.h"
3720

3821
/* Forward declarations for SIMD specializations (defined in arch-specific TUs) */
3922
#if defined(__x86_64__) || defined(_M_X64)
@@ -305,61 +288,6 @@ simd_hb_decim2_real_scalar(const float* in, int in_len, float* out, float* hist,
305288
return out_len;
306289
}
307290

308-
/* -------------------------------------------------------------------------- */
309-
/* CPU Feature Detection */
310-
/* -------------------------------------------------------------------------- */
311-
312-
#if (defined(__x86_64__) || defined(_M_X64)) && defined(DSD_NEO_DSP_HAVE_AVX2_IMPL)
313-
314-
static bool
315-
cpu_has_avx2_with_os_support() {
316-
#if defined(__GNUC__) || defined(__clang__)
317-
unsigned int eax, ebx, ecx, edx;
318-
if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
319-
return false;
320-
}
321-
bool osxsave = (ecx & (1u << 27)) != 0;
322-
bool avx = (ecx & (1u << 28)) != 0;
323-
bool fma = (ecx & (1u << 12)) != 0;
324-
if (!osxsave || !avx) {
325-
return false;
326-
}
327-
/* Check OS has enabled YMM state saving via XGETBV */
328-
unsigned long long xcr0 = dsd_xgetbv(0);
329-
bool ymm_enabled = (xcr0 & 0x6) == 0x6; /* XMM + YMM state enabled */
330-
if (!ymm_enabled) {
331-
return false;
332-
}
333-
/* Check AVX2 support in extended features */
334-
if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
335-
return false;
336-
}
337-
bool has_avx2 = (ebx & (1u << 5)) != 0;
338-
return has_avx2 && fma;
339-
#elif defined(_MSC_VER)
340-
int cpuInfo[4];
341-
__cpuid(cpuInfo, 1);
342-
bool osxsave = (cpuInfo[2] & (1 << 27)) != 0;
343-
bool avx = (cpuInfo[2] & (1 << 28)) != 0;
344-
bool fma = (cpuInfo[2] & (1 << 12)) != 0;
345-
if (!osxsave || !avx) {
346-
return false;
347-
}
348-
unsigned long long xcr0 = _xgetbv(0);
349-
bool ymm_enabled = (xcr0 & 0x6) == 0x6;
350-
if (!ymm_enabled) {
351-
return false;
352-
}
353-
__cpuidex(cpuInfo, 7, 0);
354-
bool has_avx2 = (cpuInfo[1] & (1 << 5)) != 0;
355-
return has_avx2 && fma;
356-
#else
357-
return false;
358-
#endif
359-
}
360-
361-
#endif /* x86_64 && DSD_NEO_DSP_HAVE_AVX2_IMPL */
362-
363291
/* -------------------------------------------------------------------------- */
364292
/* Function Pointer Dispatch */
365293
/* -------------------------------------------------------------------------- */
@@ -390,7 +318,7 @@ simd_fir_init_dispatch() {
390318
/* Perform one-time initialization */
391319
#if defined(__x86_64__) || defined(_M_X64)
392320
#if defined(DSD_NEO_DSP_HAVE_AVX2_IMPL)
393-
if (cpu_has_avx2_with_os_support()) {
321+
if (dsd_neo_cpu_has_avx2_with_os_support()) {
394322
g_fir_complex_impl = simd_fir_complex_apply_avx2;
395323
g_hb_decim2_complex_impl = simd_hb_decim2_complex_avx2;
396324
g_hb_decim2_real_impl = simd_hb_decim2_real_avx2;

0 commit comments

Comments
 (0)