[perf](trx-backend-soapysdr): use circular buffer for wfm resampler history

Replace shift_append (O(N) rotate_left per sample) with a circular buffer
index for O(1) writes. The polyphase resampler now reads from the ring
buffer directly, eliminating 3 × 32-element memmoves per composite sample.
Remove unused dot_product functions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Stan Grams <sjg@haxx.space>
This commit is contained in:
2026-03-01 09:46:35 +01:00
parent 132cd5b950
commit 7eb88b6669
@@ -81,61 +81,23 @@ fn build_wfm_resample_bank(cutoff: f32) -> [[f32; WFM_RESAMP_TAPS]; WFM_RESAMP_P
bank
}
/// Polyphase FIR resample from a circular buffer.
/// `pos` points to the oldest sample (next write position).
#[inline]
fn shift_append<const N: usize>(hist: &mut [f32; N], sample: f32) {
hist.rotate_left(1);
hist[N - 1] = sample;
}
#[inline]
fn polyphase_resample(
fn polyphase_resample_ring(
hist: &[f32; WFM_RESAMP_TAPS],
pos: usize,
bank: &[[f32; WFM_RESAMP_TAPS]; WFM_RESAMP_PHASES],
frac: f32,
) -> f32 {
let phase = (frac.clamp(0.0, 0.999_999) * WFM_RESAMP_PHASES as f32).round() as usize;
let phase = phase.min(WFM_RESAMP_PHASES - 1);
dot_product(&hist[..], &bank[phase][..])
let coeffs = &bank[phase];
let mut acc = 0.0_f32;
for tap in 0..WFM_RESAMP_TAPS {
acc += hist[(pos + tap) % WFM_RESAMP_TAPS] * coeffs[tap];
}
#[inline]
fn dot_product_scalar(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn dot_product_avx2(a: &[f32], b: &[f32]) -> f32 {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
let len = a.len().min(b.len());
let mut acc = _mm256_setzero_ps();
let mut i = 0usize;
while i + 8 <= len {
let av = unsafe { _mm256_loadu_ps(a.as_ptr().add(i)) };
let bv = unsafe { _mm256_loadu_ps(b.as_ptr().add(i)) };
acc = _mm256_add_ps(acc, _mm256_mul_ps(av, bv));
i += 8;
}
let mut lanes = [0.0_f32; 8];
unsafe { _mm256_storeu_ps(lanes.as_mut_ptr(), acc) };
lanes.iter().sum::<f32>() + dot_product_scalar(&a[i..len], &b[i..len])
}
#[inline]
fn dot_product(a: &[f32], b: &[f32]) -> f32 {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if std::arch::is_x86_feature_detected!("avx2") {
return unsafe { dot_product_avx2(a, b) };
}
}
dot_product_scalar(a, b)
acc
}
#[inline]
@@ -650,12 +612,14 @@ pub struct WfmStereoDecoder {
fm_gain: f32,
/// Shared coefficient bank for the polyphase fractional audio resampler.
resample_bank: [[f32; WFM_RESAMP_TAPS]; WFM_RESAMP_PHASES],
/// History ring for polyphase FIR resampling of the sum channel.
/// Circular buffer for polyphase FIR resampling of the sum channel.
sum_hist: [f32; WFM_RESAMP_TAPS],
/// History ring for polyphase FIR resampling of the diff channel.
/// Circular buffer for polyphase FIR resampling of the diff channel.
diff_hist: [f32; WFM_RESAMP_TAPS],
/// History ring for polyphase FIR resampling of the quadrature diff channel.
/// Circular buffer for polyphase FIR resampling of the quadrature diff channel.
diff_q_hist: [f32; WFM_RESAMP_TAPS],
/// Write position for the circular history buffers.
hist_pos: usize,
/// Previous pilot blend sample for simple linear interpolation.
prev_blend: f32,
/// Fractional phase increment per composite sample = audio_rate / composite_rate.
@@ -719,6 +683,7 @@ impl WfmStereoDecoder {
sum_hist: [0.0; WFM_RESAMP_TAPS],
diff_hist: [0.0; WFM_RESAMP_TAPS],
diff_q_hist: [0.0; WFM_RESAMP_TAPS],
hist_pos: 0,
prev_blend: 0.0,
output_phase_inc,
output_phase: 0.0,
@@ -807,9 +772,12 @@ impl WfmStereoDecoder {
// --- Polyphase FIR fractional resampling ---
// This uses a short windowed-sinc bank instead of cubic interpolation
// to reduce top-end overshoot/ringing near the audio cutoff.
shift_append(&mut self.sum_hist, sum);
shift_append(&mut self.diff_hist, diff_i);
shift_append(&mut self.diff_q_hist, diff_q);
// Circular buffer: O(1) write instead of O(N) shift.
let pos = self.hist_pos;
self.sum_hist[pos] = sum;
self.diff_hist[pos] = diff_i;
self.diff_q_hist[pos] = diff_q;
self.hist_pos = (pos + 1) % WFM_RESAMP_TAPS;
let prev_phase = self.output_phase;
self.output_phase += self.output_phase_inc;
@@ -823,9 +791,10 @@ impl WfmStereoDecoder {
// interval. The FIR bank reconstructs a band-limited sample using
// a fixed two-sample lookahead in the decoder.
let frac = ((1.0 - prev_phase) / self.output_phase_inc) as f32;
let sum_i = polyphase_resample(&self.sum_hist, &self.resample_bank, frac);
let diff_i = polyphase_resample(&self.diff_hist, &self.resample_bank, frac);
let diff_q = polyphase_resample(&self.diff_q_hist, &self.resample_bank, frac);
let ring_pos = self.hist_pos; // oldest sample = next write position
let sum_i = polyphase_resample_ring(&self.sum_hist, ring_pos, &self.resample_bank, frac);
let diff_i = polyphase_resample_ring(&self.diff_hist, ring_pos, &self.resample_bank, frac);
let diff_q = polyphase_resample_ring(&self.diff_q_hist, ring_pos, &self.resample_bank, frac);
let blend_i =
(self.prev_blend + frac * (stereo_blend_target - self.prev_blend)).clamp(0.0, 1.0);
self.prev_blend = stereo_blend_target;
@@ -919,6 +888,7 @@ impl WfmStereoDecoder {
self.sum_hist = [0.0; WFM_RESAMP_TAPS];
self.diff_hist = [0.0; WFM_RESAMP_TAPS];
self.diff_q_hist = [0.0; WFM_RESAMP_TAPS];
self.hist_pos = 0;
self.prev_blend = 0.0;
self.output_phase = 0.0;
}