[fix](trx-wefax): block-based DSP for realtime decode performance
Replace per-sample circular-buffer processing with block-based linear buffers in the FM discriminator and polyphase resampler. This eliminates modular indexing in FIR inner loops, enabling compiler auto-vectorisation. Also fix O(n²) drain pattern in the line slicer. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Stan Grams <sjg@haxx.space>
This commit is contained in:
@@ -7,6 +7,9 @@
|
|||||||
//! Computes instantaneous frequency from the analytic signal produced by a
|
//! Computes instantaneous frequency from the analytic signal produced by a
|
||||||
//! Hilbert transform FIR, then maps the frequency to a 0.0–1.0 luminance
|
//! Hilbert transform FIR, then maps the frequency to a 0.0–1.0 luminance
|
||||||
//! value (1500 Hz = black, 2300 Hz = white).
|
//! value (1500 Hz = black, 2300 Hz = white).
|
||||||
|
//!
|
||||||
|
//! Uses block-based linear processing for auto-vectorisation of the FIR
|
||||||
|
//! convolution, consistent with `docs/Optimization-Guidelines.md`.
|
||||||
|
|
||||||
use std::f32::consts::PI;
|
use std::f32::consts::PI;
|
||||||
|
|
||||||
@@ -18,85 +21,88 @@ const HILBERT_DELAY: usize = HILBERT_TAPS / 2;
|
|||||||
|
|
||||||
/// FM discriminator producing luminance values from audio samples.
|
/// FM discriminator producing luminance values from audio samples.
|
||||||
pub struct FmDiscriminator {
|
pub struct FmDiscriminator {
|
||||||
sample_rate: f32,
|
|
||||||
/// Hilbert FIR coefficients (odd-length, anti-symmetric).
|
/// Hilbert FIR coefficients (odd-length, anti-symmetric).
|
||||||
hilbert_coeffs: Vec<f32>,
|
hilbert_coeffs: [f32; HILBERT_TAPS],
|
||||||
/// Input sample delay line for FIR convolution.
|
/// Tail buffer: last `HILBERT_TAPS - 1` input samples from the previous
|
||||||
delay_line: Vec<f32>,
|
/// block (used to prime the next convolution without modular indexing).
|
||||||
/// Write position in delay line (circular buffer).
|
tail: Vec<f32>,
|
||||||
write_pos: usize,
|
|
||||||
/// Previous analytic signal sample for frequency differentiation.
|
/// Previous analytic signal sample for frequency differentiation.
|
||||||
prev_i: f32,
|
prev_i: f32,
|
||||||
prev_q: f32,
|
prev_q: f32,
|
||||||
/// Centre frequency for luminance mapping.
|
/// Pre-computed constants.
|
||||||
center_hz: f32,
|
inv_2pi_ts: f32,
|
||||||
/// Deviation for luminance mapping.
|
black_hz: f32,
|
||||||
deviation_hz: f32,
|
inv_range_hz: f32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FmDiscriminator {
|
impl FmDiscriminator {
|
||||||
pub fn new(sample_rate: u32, center_hz: f32, deviation_hz: f32) -> Self {
|
pub fn new(sample_rate: u32, center_hz: f32, deviation_hz: f32) -> Self {
|
||||||
let coeffs = design_hilbert_fir(HILBERT_TAPS);
|
let coeffs = design_hilbert_fir();
|
||||||
|
let sr = sample_rate as f32;
|
||||||
Self {
|
Self {
|
||||||
sample_rate: sample_rate as f32,
|
|
||||||
hilbert_coeffs: coeffs,
|
hilbert_coeffs: coeffs,
|
||||||
delay_line: vec![0.0; HILBERT_TAPS],
|
tail: vec![0.0; HILBERT_TAPS - 1],
|
||||||
write_pos: 0,
|
|
||||||
prev_i: 0.0,
|
prev_i: 0.0,
|
||||||
prev_q: 0.0,
|
prev_q: 0.0,
|
||||||
center_hz,
|
inv_2pi_ts: sr / (2.0 * PI),
|
||||||
deviation_hz,
|
black_hz: center_hz - deviation_hz,
|
||||||
|
inv_range_hz: 1.0 / (2.0 * deviation_hz),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Process a block of real-valued audio samples, returning luminance
|
/// Process a block of real-valued audio samples, returning luminance
|
||||||
/// values in the range 0.0 (black / 1500 Hz) to 1.0 (white / 2300 Hz).
|
/// values in the range 0.0 (black / 1500 Hz) to 1.0 (white / 2300 Hz).
|
||||||
|
///
|
||||||
|
/// The Hilbert FIR is evaluated on a contiguous linear buffer
|
||||||
|
/// (`[tail | samples]`) so the inner loop uses straight indexing—no
|
||||||
|
/// modular arithmetic—and the compiler can auto-vectorise.
|
||||||
pub fn process(&mut self, samples: &[f32]) -> Vec<f32> {
|
pub fn process(&mut self, samples: &[f32]) -> Vec<f32> {
|
||||||
let mut output = Vec::with_capacity(samples.len());
|
|
||||||
let n = HILBERT_TAPS;
|
let n = HILBERT_TAPS;
|
||||||
let half = HILBERT_DELAY;
|
let half = HILBERT_DELAY;
|
||||||
let inv_2pi_ts = self.sample_rate / (2.0 * PI);
|
let tail_len = n - 1;
|
||||||
let black_hz = self.center_hz - self.deviation_hz; // 1500
|
|
||||||
let range_hz = 2.0 * self.deviation_hz; // 800
|
|
||||||
|
|
||||||
for &sample in samples {
|
// Build contiguous work buffer: [tail from previous block | new samples].
|
||||||
// Write into circular delay line.
|
let work_len = tail_len + samples.len();
|
||||||
self.delay_line[self.write_pos] = sample;
|
let mut work = Vec::with_capacity(work_len);
|
||||||
self.write_pos = (self.write_pos + 1) % n;
|
work.extend_from_slice(&self.tail);
|
||||||
|
work.extend_from_slice(samples);
|
||||||
|
|
||||||
// Compute Hilbert-transformed (quadrature) output via FIR.
|
let mut output = Vec::with_capacity(samples.len());
|
||||||
|
let coeffs = &self.hilbert_coeffs;
|
||||||
|
|
||||||
|
for i in 0..samples.len() {
|
||||||
|
// Linear FIR convolution — window is work[i..i+n].
|
||||||
|
let window = &work[i..i + n];
|
||||||
let mut q = 0.0f32;
|
let mut q = 0.0f32;
|
||||||
for k in 0..n {
|
for k in 0..n {
|
||||||
let idx = (self.write_pos + k) % n;
|
q += coeffs[k] * window[n - 1 - k];
|
||||||
q += self.hilbert_coeffs[k] * self.delay_line[idx];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// The in-phase component is the delayed input (centre tap of the
|
// In-phase component is the delayed input (group delay = half).
|
||||||
// Hilbert FIR corresponds to the group delay).
|
let i_val = work[i + half];
|
||||||
let i = self.delay_line[(self.write_pos + half) % n];
|
|
||||||
|
|
||||||
// Instantaneous frequency via phase differentiation:
|
// Instantaneous frequency via phase differentiation:
|
||||||
// f = arg(z[n] · conj(z[n-1])) / (2π·Ts)
|
// f = |arg(z[n] · conj(z[n-1]))| / (2π·Ts)
|
||||||
// z[n] · conj(z[n-1]) = (i + jq)(prev_i - j·prev_q)
|
let di = i_val * self.prev_i + q * self.prev_q;
|
||||||
let di = i * self.prev_i + q * self.prev_q;
|
let dq = q * self.prev_i - i_val * self.prev_q;
|
||||||
let dq = q * self.prev_i - i * self.prev_q;
|
let freq = dq.atan2(di).abs() * self.inv_2pi_ts;
|
||||||
let phase_diff = dq.atan2(di);
|
|
||||||
let freq = phase_diff.abs() * inv_2pi_ts;
|
|
||||||
|
|
||||||
// Map frequency to luminance.
|
// Map frequency to luminance.
|
||||||
let lum = ((freq - black_hz) / range_hz).clamp(0.0, 1.0);
|
let lum = ((freq - self.black_hz) * self.inv_range_hz).clamp(0.0, 1.0);
|
||||||
output.push(lum);
|
output.push(lum);
|
||||||
|
|
||||||
self.prev_i = i;
|
self.prev_i = i_val;
|
||||||
self.prev_q = q;
|
self.prev_q = q;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Save tail for next call.
|
||||||
|
self.tail.copy_from_slice(&work[work_len - tail_len..]);
|
||||||
|
|
||||||
output
|
output
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn reset(&mut self) {
|
pub fn reset(&mut self) {
|
||||||
self.delay_line.fill(0.0);
|
self.tail.fill(0.0);
|
||||||
self.write_pos = 0;
|
|
||||||
self.prev_i = 0.0;
|
self.prev_i = 0.0;
|
||||||
self.prev_q = 0.0;
|
self.prev_q = 0.0;
|
||||||
}
|
}
|
||||||
@@ -106,26 +112,26 @@ impl FmDiscriminator {
|
|||||||
///
|
///
|
||||||
/// The impulse response is: h[n] = 2/(πn) for odd n (relative to centre),
|
/// The impulse response is: h[n] = 2/(πn) for odd n (relative to centre),
|
||||||
/// 0 for even n, windowed with a Blackman window.
|
/// 0 for even n, windowed with a Blackman window.
|
||||||
#[allow(clippy::needless_range_loop)]
|
fn design_hilbert_fir() -> [f32; HILBERT_TAPS] {
|
||||||
fn design_hilbert_fir(num_taps: usize) -> Vec<f32> {
|
let num_taps = HILBERT_TAPS;
|
||||||
assert!(num_taps % 2 == 1, "Hilbert FIR must have odd length");
|
let mut coeffs = [0.0f32; HILBERT_TAPS];
|
||||||
let mut coeffs = vec![0.0f32; num_taps];
|
let m = (num_taps - 1) as f64;
|
||||||
let mid = (num_taps - 1) as f64 / 2.0;
|
let mid = m / 2.0;
|
||||||
|
|
||||||
for i in 0..num_taps {
|
let mut i = 0;
|
||||||
|
while i < num_taps {
|
||||||
let n = i as f64 - mid;
|
let n = i as f64 - mid;
|
||||||
let ni = n.round() as i64;
|
let ni = n.round() as i64;
|
||||||
if ni == 0 {
|
if ni != 0 && ni % 2 != 0 {
|
||||||
coeffs[i] = 0.0;
|
|
||||||
} else if ni % 2 != 0 {
|
|
||||||
// Hilbert kernel: 2/(π·n) for odd offsets.
|
// Hilbert kernel: 2/(π·n) for odd offsets.
|
||||||
let h = 2.0 / (std::f64::consts::PI * n);
|
let h = 2.0 / (std::f64::consts::PI * n);
|
||||||
// Blackman window.
|
// Blackman window.
|
||||||
let w = 0.42
|
let w = 0.42
|
||||||
- 0.5 * (2.0 * std::f64::consts::PI * i as f64 / (num_taps - 1) as f64).cos()
|
- 0.5 * (2.0 * std::f64::consts::PI * i as f64 / m).cos()
|
||||||
+ 0.08 * (4.0 * std::f64::consts::PI * i as f64 / (num_taps - 1) as f64).cos();
|
+ 0.08 * (4.0 * std::f64::consts::PI * i as f64 / m).cos();
|
||||||
coeffs[i] = (h * w) as f32;
|
coeffs[i] = (h * w) as f32;
|
||||||
}
|
}
|
||||||
|
i += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
coeffs
|
coeffs
|
||||||
|
|||||||
@@ -58,14 +58,18 @@ impl LineSlicer {
|
|||||||
self.aligned = true;
|
self.aligned = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract complete lines.
|
// Extract complete lines (single drain at the end to avoid O(n²)).
|
||||||
while self.buffer.len() >= self.samples_per_line {
|
let mut offset = 0;
|
||||||
let line_samples = &self.buffer[..self.samples_per_line];
|
while offset + self.samples_per_line <= self.buffer.len() {
|
||||||
|
let line_samples = &self.buffer[offset..offset + self.samples_per_line];
|
||||||
let pixels = self.resample_line(line_samples);
|
let pixels = self.resample_line(line_samples);
|
||||||
lines.push(pixels);
|
lines.push(pixels);
|
||||||
self.buffer.drain(..self.samples_per_line);
|
offset += self.samples_per_line;
|
||||||
self.consumed += self.samples_per_line;
|
self.consumed += self.samples_per_line;
|
||||||
}
|
}
|
||||||
|
if offset > 0 {
|
||||||
|
self.buffer.drain(..offset);
|
||||||
|
}
|
||||||
|
|
||||||
lines
|
lines
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,9 +4,13 @@
|
|||||||
|
|
||||||
//! Polyphase rational resampler: 48000 Hz → 11025 Hz.
|
//! Polyphase rational resampler: 48000 Hz → 11025 Hz.
|
||||||
//!
|
//!
|
||||||
//! Ratio: 11025/48000 = 441/1920 (after GCD reduction).
|
//! Ratio: 11025/48000 = 147/640 (after GCD reduction).
|
||||||
//! Uses a polyphase FIR filter bank to avoid computing the full upsampled
|
//! Uses a polyphase FIR filter bank to avoid computing the full upsampled
|
||||||
//! signal, consistent with `docs/Optimization-Guidelines.md`.
|
//! signal, consistent with `docs/Optimization-Guidelines.md`.
|
||||||
|
//!
|
||||||
|
//! Block-based: builds a linear `[history | input]` work buffer so the inner
|
||||||
|
//! FIR convolution loop uses straight indexing (no modular arithmetic) and
|
||||||
|
//! benefits from auto-vectorisation.
|
||||||
|
|
||||||
/// Internal processing sample rate.
|
/// Internal processing sample rate.
|
||||||
pub const INTERNAL_RATE: u32 = 11025;
|
pub const INTERNAL_RATE: u32 = 11025;
|
||||||
@@ -24,7 +28,7 @@ pub struct Resampler {
|
|||||||
taps_per_phase: usize,
|
taps_per_phase: usize,
|
||||||
/// Polyphase filter bank: `up` sub-filters, each with `taps_per_phase` taps.
|
/// Polyphase filter bank: `up` sub-filters, each with `taps_per_phase` taps.
|
||||||
bank: Vec<Vec<f32>>,
|
bank: Vec<Vec<f32>>,
|
||||||
/// Input history buffer for FIR convolution.
|
/// Input history buffer (`taps_per_phase` samples from the previous block).
|
||||||
history: Vec<f32>,
|
history: Vec<f32>,
|
||||||
/// Current phase accumulator (tracks position in the up-sampled domain).
|
/// Current phase accumulator (tracks position in the up-sampled domain).
|
||||||
phase: usize,
|
phase: usize,
|
||||||
@@ -82,23 +86,28 @@ impl Resampler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Process a block of input samples, returning resampled output.
|
/// Process a block of input samples, returning resampled output.
|
||||||
|
///
|
||||||
|
/// Uses a linear `[history | input]` work buffer so the inner FIR
|
||||||
|
/// convolution runs on contiguous memory with plain indexing.
|
||||||
#[allow(clippy::needless_range_loop)]
|
#[allow(clippy::needless_range_loop)]
|
||||||
pub fn process(&mut self, input: &[f32]) -> Vec<f32> {
|
pub fn process(&mut self, input: &[f32]) -> Vec<f32> {
|
||||||
|
let tpp = self.taps_per_phase;
|
||||||
let mut output = Vec::with_capacity(input.len() * self.up / self.down + 2);
|
let mut output = Vec::with_capacity(input.len() * self.up / self.down + 2);
|
||||||
|
|
||||||
for &sample in input {
|
// Contiguous work buffer: [previous history | new input].
|
||||||
// Shift sample into history (newest at end).
|
let mut work = Vec::with_capacity(tpp + input.len());
|
||||||
self.history.copy_within(1.., 0);
|
work.extend_from_slice(&self.history);
|
||||||
self.history[self.taps_per_phase - 1] = sample;
|
work.extend_from_slice(input);
|
||||||
|
|
||||||
|
for p in 0..input.len() {
|
||||||
// Generate output samples for all phases that map to this input.
|
// Generate output samples for all phases that map to this input.
|
||||||
while self.phase < self.up {
|
while self.phase < self.up {
|
||||||
let coeffs = &self.bank[self.phase];
|
let coeffs = &self.bank[self.phase];
|
||||||
let mut acc = 0.0f32;
|
let mut acc = 0.0f32;
|
||||||
for k in 0..self.taps_per_phase {
|
// Newest sample is at work[p + tpp], oldest at work[p + 1].
|
||||||
// History is stored newest-last, coefficients are indexed
|
// coeffs[k] corresponds to the (k+1)-th newest sample.
|
||||||
// from newest to oldest (matching the polyphase decomposition).
|
for k in 0..tpp {
|
||||||
acc += coeffs[k] * self.history[self.taps_per_phase - 1 - k];
|
acc += coeffs[k] * work[p + tpp - k];
|
||||||
}
|
}
|
||||||
output.push(acc);
|
output.push(acc);
|
||||||
self.phase += self.down;
|
self.phase += self.down;
|
||||||
@@ -106,6 +115,10 @@ impl Resampler {
|
|||||||
self.phase -= self.up;
|
self.phase -= self.up;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Save last `tpp` samples as history for next block.
|
||||||
|
let work_len = work.len();
|
||||||
|
self.history.copy_from_slice(&work[work_len - tpp..]);
|
||||||
|
|
||||||
output
|
output
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user