[refactor](trx-ftx): optimize hot paths and deduplicate decoder internals

- Cache generator matrix with OnceLock (P0.1)
- Store raw complex in WfElem, eliminate powf round-trip (P0.2)
- Reuse FFT planners across decode cycles in Ft2Pipeline (P0.3)
- Deduplicate fast_atanh/ldpc_check into ldpc.rs (P1.1)
- Gate unused sum-product ldpc_decode behind #[cfg(test)] (P1.2)
- Eliminate double pack_bits in verify_crc_and_build_message (P1.3)
- Remove unnecessary unsafe impl Send for Ft8Decoder (P1.4)
- Convert key loops to iterator/zip patterns (P2.1)
- Remove resolved clippy::manual_memcpy suppressions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Stan Grams <sjg@haxx.space>
2026-03-19 23:22:58 +01:00
parent 3dc6918082
commit 2da749b978
8 changed files with 113 additions and 87 deletions
@@ -38,8 +38,7 @@ pub struct FtxMessage {
 }

 fn wf_elem_to_complex(elem: WfElem) -> Complex32 {
-    let amplitude = 10.0_f32.powf(elem.mag / 20.0);
-    Complex32::from_polar(amplitude, elem.phase)
+    Complex32::new(elem.re, elem.im)
 }

 fn get_cand_offset(wf: &Waterfall, cand: &Candidate) -> usize {
@@ -55,10 +54,7 @@ fn wf_mag_at(wf: &Waterfall, base: usize, idx: isize) -> &WfElem {
    if i < wf.mag.len() {
        &wf.mag[i]
    } else {
-        &WfElem {
-            mag: -120.0,
-            phase: 0.0,
-        }
+        &DEFAULT_WF_ELEM
    }
 }

@@ -66,6 +62,8 @@ fn wf_mag_at(wf: &Waterfall, base: usize, idx: isize) -> &WfElem {
 static DEFAULT_WF_ELEM: WfElem = WfElem {
    mag: -120.0,
    phase: 0.0,
+    re: 0.0,
+    im: 0.0,
 };

 fn wf_mag_safe(wf: &Waterfall, idx: usize) -> &WfElem {
@@ -534,6 +532,7 @@ pub(crate) fn verify_crc_and_build_message(
    let mut a91 = [0u8; crate::protocol::FTX_LDPC_K_BYTES];
    pack_bits(plain174, crate::protocol::FTX_LDPC_K, &mut a91);

+    let a91_orig = a91;
    let crc_extracted = crate::crc::ftx_extract_crc(&a91);
    a91[9] &= 0xF8;
    a91[10] = 0x00;
@@ -543,8 +542,7 @@ pub(crate) fn verify_crc_and_build_message(
        return None;
    }

-    // Re-read a91 since we modified it for CRC check
-    pack_bits(plain174, crate::protocol::FTX_LDPC_K, &mut a91);
+    let a91 = a91_orig;

    let mut message = FtxMessage {
        hash: crc_calculated,
@@ -45,9 +45,6 @@ pub struct Ft8Decoder {
    ft2_pipeline: Option<crate::ft2::Ft2Pipeline>,
 }

-// Ft8Decoder is not shared across threads, but may be moved between tasks.
-unsafe impl Send for Ft8Decoder {}
-
 impl Ft8Decoder {
    /// Create a new FT8 decoder.
    pub fn new(sample_rate: u32) -> Result<Self, String> {
@@ -9,6 +9,8 @@
 //! around that frequency, applies a spectral window, and inverse-FFTs to produce
 //! a complex baseband signal at a reduced sample rate (12000/NDOWN = 1333.3 Hz).

+use std::sync::Arc;
+
 use num_complex::Complex32;
 use rustfft::FftPlanner;

@@ -64,8 +66,22 @@ impl DownsampleContext {
    /// Initialize the downsample context by computing the forward FFT of
    /// the raw audio and preparing the spectral window.
    ///
+    /// If `real_fft` and `ifft` are provided, they are reused instead of
+    /// creating fresh planners. The real FFT must be a forward plan of length
+    /// `nraw` and the IFFT must be an inverse plan of length `nraw / NDOWN`.
+    ///
    /// Returns `None` if the raw audio is too short or allocation fails.
    pub fn new(raw_audio: &[f32], sample_rate: f32) -> Option<Self> {
+        Self::new_with_plans(raw_audio, sample_rate, None, None)
+    }
+
+    /// Initialize with optional pre-built FFT plans for reuse across decode cycles.
+    pub fn new_with_plans(
+        raw_audio: &[f32],
+        sample_rate: f32,
+        real_fft: Option<Arc<dyn realfft::RealToComplex<f32>>>,
+        ifft: Option<Arc<dyn rustfft::Fft<f32>>>,
+    ) -> Option<Self> {
        let nraw = raw_audio.len();
        if nraw == 0 {
            return None;
@@ -85,8 +101,13 @@ impl DownsampleContext {
        }

        // Forward real FFT of raw audio
-        let mut real_planner = realfft::RealFftPlanner::<f32>::new();
-        let fft = real_planner.plan_fft_forward(nraw);
+        let fft = match real_fft {
+            Some(f) => f,
+            None => {
+                let mut real_planner = realfft::RealFftPlanner::<f32>::new();
+                real_planner.plan_fft_forward(nraw)
+            }
+        };
        let mut input = fft.make_input_vec();
        let mut output = fft.make_output_vec();
        let mut scratch = fft.make_scratch_vec();
@@ -98,8 +119,13 @@ impl DownsampleContext {
        let spectrum = output;

        // IFFT plan for downsampled length
-        let mut planner = FftPlanner::<f32>::new();
-        let ifft = planner.plan_fft_inverse(nfft2);
+        let ifft = match ifft {
+            Some(f) => f,
+            None => {
+                let mut planner = FftPlanner::<f32>::new();
+                planner.plan_fft_inverse(nfft2)
+            }
+        };
        let ifft_scratch_len = ifft.get_inplace_scratch_len();

        Some(Self {
@@ -13,8 +13,11 @@ pub mod downsample;
 pub mod osd;
 pub mod sync;

+use std::sync::Arc;
+
 use num_complex::Complex32;
 use realfft::RealFftPlanner;
+use rustfft::FftPlanner;

 use crate::decode::{verify_crc_and_build_message, FtxMessage};
 use crate::protocol::*;
@@ -117,6 +120,9 @@ pub struct Ft2Pipeline {
    raw_capacity: usize,
    waveforms: SyncWaveforms,
    peak_search: PeakSearchWorkspace,
+    // Cached FFT plans reused across decode cycles
+    ds_real_fft: Arc<dyn realfft::RealToComplex<f32>>,
+    ds_ifft: Arc<dyn rustfft::Fft<f32>>,
 }

 struct Ft2DecodeWorkspace {
@@ -176,12 +182,21 @@ impl PeakSearchWorkspace {
 impl Ft2Pipeline {
    /// Create a new FT2 pipeline for the given sample rate.
    pub fn new(sample_rate: i32) -> Self {
+        // Pre-build FFT plans for the downsample context (reused every decode cycle)
+        let nfft2 = FT2_NMAX / FT2_NDOWN;
+        let mut real_planner = RealFftPlanner::<f32>::new();
+        let ds_real_fft = real_planner.plan_fft_forward(FT2_NMAX);
+        let mut fft_planner = FftPlanner::<f32>::new();
+        let ds_ifft = fft_planner.plan_fft_inverse(nfft2);
+
        Self {
            sample_rate: sample_rate as f32,
            raw_audio: Vec::with_capacity(FT2_NMAX),
            raw_capacity: FT2_NMAX,
            waveforms: prepare_sync_waveforms(),
            peak_search: PeakSearchWorkspace::new(),
+            ds_real_fft,
+            ds_ifft,
        }
    }

@@ -216,7 +231,12 @@ impl Ft2Pipeline {
            return Vec::new();
        }

-        let ctx = match DownsampleContext::new(&self.raw_audio, self.sample_rate) {
+        let ctx = match DownsampleContext::new_with_plans(
+            &self.raw_audio,
+            self.sample_rate,
+            Some(Arc::clone(&self.ds_real_fft)),
+            Some(Arc::clone(&self.ds_ifft)),
+        ) {
            Some(ctx) => ctx,
            None => return Vec::new(),
        };
@@ -16,39 +16,15 @@
 //! 3. Exhaustive search over bit-flip patterns of increasing weight
 //! 4. Pattern hashing (OSD-2) to efficiently search two-bit-flip corrections

+use std::sync::OnceLock;
+
 use crate::constants::{FTX_LDPC_GENERATOR, FTX_LDPC_MN, FTX_LDPC_NM, FTX_LDPC_NUM_ROWS};
 use crate::crc::{ftx_compute_crc, ftx_extract_crc};
 use crate::decode::pack_bits;
 use crate::encode::parity8;
+use crate::ldpc::ldpc_check;
 use crate::protocol::{FTX_LDPC_K, FTX_LDPC_K_BYTES, FTX_LDPC_M, FTX_LDPC_N};

-/// Check LDPC parity of a 174-bit codeword. Returns number of parity errors.
-pub fn ft2_ldpc_check(codeword: &[u8]) -> i32 {
-    let mut errors = 0i32;
-    for m in 0..FTX_LDPC_M {
-        let mut x: u8 = 0;
-        let num_rows = FTX_LDPC_NUM_ROWS[m] as usize;
-        for i in 0..num_rows {
-            let idx = FTX_LDPC_NM[m][i] as usize;
-            if idx > 0 && idx - 1 < codeword.len() {
-                x ^= codeword[idx - 1];
-            }
-        }
-        if x != 0 {
-            errors += 1;
-        }
-    }
-    errors
-}
-
-/// Fast rational approximation of `atanh(x)`.
-fn fast_atanh(x: f32) -> f32 {
-    let x2 = x * x;
-    let a = x * (945.0 + x2 * (-735.0 + x2 * 64.0));
-    let b = 945.0 + x2 * (-1050.0 + x2 * 225.0);
-    a / b
-}
-
 /// Piecewise linear approximation of `atanh(x)` used in BP message passing.
 fn platanh(x: f32) -> f32 {
    let isign: f32 = if x < 0.0 { -1.0 } else { 1.0 };
@@ -102,23 +78,23 @@ fn encode174_91_nocrc_bits(message91: &[u8], codeword: &mut [u8; FTX_LDPC_N]) {

 /// XOR two byte slices.
 fn xor_rows(dst: &mut [u8], src: &[u8], len: usize) {
-    for i in 0..len {
-        dst[i] ^= src[i];
-    }
+    dst[..len]
+        .iter_mut()
+        .zip(&src[..len])
+        .for_each(|(d, s)| *d ^= s);
 }

 /// Matrix-vector multiply for re-encoding in OSD.
 fn mrbencode91(me: &[u8], codeword: &mut [u8], g2: &[u8], n: usize, k: usize) {
-    for c in codeword[..n].iter_mut() {
-        *c = 0;
-    }
+    codeword[..n].fill(0);
    for i in 0..k {
        if me[i] == 0 {
            continue;
        }
-        for j in 0..n {
-            codeword[j] ^= g2[j * k + i];
-        }
+        codeword[..n]
+            .iter_mut()
+            .enumerate()
+            .for_each(|(j, c)| *c ^= g2[j * k + i]);
    }
 }

@@ -269,9 +245,9 @@ pub fn osd174_91(
    let n = FTX_LDPC_N;
    let ndeep = ndeep.min(6);

-    // Build per-bit generator matrix (each row i generates codeword from
+    // Cached per-bit generator matrix (each row i generates codeword from
    // unit vector e_i)
-    let gen = build_generator_matrix();
+    let gen = generator_matrix();

    // Stack-allocated working buffers (k=91, n=174, n-k=83).
    let mut genmrb = [0u8; FTX_LDPC_K * FTX_LDPC_N];
@@ -608,22 +584,24 @@ fn reorder_result(
    }
 }

-/// Build the full per-bit generator matrix.
-/// Each row `i` contains the 174-bit codeword produced by encoding
-/// a unit vector with bit `i` set.
-fn build_generator_matrix() -> Box<[[u8; FTX_LDPC_N]; FTX_LDPC_K]> {
-    let mut gen = Box::new([[0u8; FTX_LDPC_N]; FTX_LDPC_K]);
-    for i in 0..FTX_LDPC_K {
-        let mut msg = [0u8; FTX_LDPC_K];
-        msg[i] = 1;
-        if i < 77 {
-            for j in 77..FTX_LDPC_K {
-                msg[j] = 0;
+/// Get a reference to the cached generator matrix.
+/// The matrix is computed once on first call and reused thereafter.
+fn generator_matrix() -> &'static [[u8; FTX_LDPC_N]; FTX_LDPC_K] {
+    static GEN: OnceLock<Box<[[u8; FTX_LDPC_N]; FTX_LDPC_K]>> = OnceLock::new();
+    GEN.get_or_init(|| {
+        let mut gen = Box::new([[0u8; FTX_LDPC_N]; FTX_LDPC_K]);
+        for i in 0..FTX_LDPC_K {
+            let mut msg = [0u8; FTX_LDPC_K];
+            msg[i] = 1;
+            if i < 77 {
+                for j in 77..FTX_LDPC_K {
+                    msg[j] = 0;
+                }
            }
+            encode174_91_nocrc_bits(&msg, &mut gen[i]);
        }
-        encode174_91_nocrc_bits(&msg, &mut gen[i]);
-    }
-    gen
+        gen
+    })
 }

 /// Full iterative BP decoder with OSD refinement.
@@ -698,7 +676,7 @@ pub fn ft2_decode174_91_osd(
        for i in 0..FTX_LDPC_N {
            best_cw[i] = if zn[i] > 0.0 { 1 } else { 0 };
        }
-        let ncheck = ft2_ldpc_check(&best_cw);
+        let ncheck = ldpc_check(&best_cw);

        if ncheck == 0 && check_crc91(&best_cw) {
            message91.copy_from_slice(&best_cw[..FTX_LDPC_K]);
@@ -815,18 +793,19 @@ pub fn ft2_decode174_91_osd(
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::ldpc::fast_atanh;

    #[test]
    fn ldpc_check_all_zeros() {
        let cw = [0u8; FTX_LDPC_N];
-        assert_eq!(ft2_ldpc_check(&cw), 0);
+        assert_eq!(ldpc_check(&cw), 0);
    }

    #[test]
    fn ldpc_check_single_bit_error() {
        let mut cw = [0u8; FTX_LDPC_N];
        cw[0] = 1;
-        assert!(ft2_ldpc_check(&cw) > 0);
+        assert!(ldpc_check(&cw) > 0);
    }

    #[test]
@@ -922,7 +901,7 @@ mod tests {

    #[test]
    fn generator_matrix_row_zero() {
-        let gen = build_generator_matrix();
+        let gen = generator_matrix();
        // Row 0 should encode unit vector e_0
        assert_eq!(gen[0][0], 1);
        // Some parity bits should be non-zero
@@ -13,7 +13,7 @@ use crate::constants::{FTX_LDPC_MN, FTX_LDPC_NM, FTX_LDPC_NUM_ROWS};
 use crate::protocol::{FTX_LDPC_M, FTX_LDPC_N};

 /// Fast rational approximation of `tanh(x)`, clamped at +/-4.97.
-fn fast_tanh(x: f32) -> f32 {
+pub(crate) fn fast_tanh(x: f32) -> f32 {
    if x < -4.97f32 {
        return -1.0f32;
    }
@@ -27,7 +27,7 @@ fn fast_tanh(x: f32) -> f32 {
 }

 /// Fast rational approximation of `atanh(x)`.
-fn fast_atanh(x: f32) -> f32 {
+pub(crate) fn fast_atanh(x: f32) -> f32 {
    let x2 = x * x;
    let a = x * (945.0f32 + x2 * (-735.0f32 + x2 * 64.0f32));
    let b = 945.0f32 + x2 * (-1050.0f32 + x2 * 225.0f32);
@@ -37,7 +37,7 @@ fn fast_atanh(x: f32) -> f32 {
 /// Count the number of LDPC parity errors in a 174-bit codeword.
 ///
 /// Returns 0 if all parity checks pass (valid codeword).
-pub fn ldpc_check(codeword: &[u8; FTX_LDPC_N]) -> i32 {
+pub(crate) fn ldpc_check(codeword: &[u8; FTX_LDPC_N]) -> i32 {
    let mut errors = 0i32;
    for m in 0..FTX_LDPC_M {
        let mut x: u8 = 0;
@@ -59,6 +59,7 @@ pub fn ldpc_check(codeword: &[u8; FTX_LDPC_N]) -> i32 {
 /// `max_iters` controls how many iterations to attempt.
 ///
 /// Returns the number of remaining parity errors (0 = success).
+#[cfg(test)]
 pub fn ldpc_decode(
    codeword: &mut [f32; FTX_LDPC_N],
    max_iters: usize,
@@ -10,14 +10,9 @@ pub mod decode;
 mod decoder;
 #[allow(clippy::needless_range_loop)]
 pub mod encode;
-#[allow(
-    dead_code,
-    clippy::manual_memcpy,
-    clippy::needless_range_loop,
-    clippy::too_many_arguments
-)]
+#[allow(dead_code, clippy::needless_range_loop, clippy::too_many_arguments)]
 pub mod ft2;
-#[allow(clippy::manual_memcpy, clippy::needless_range_loop)]
+#[allow(clippy::needless_range_loop)]
 pub mod ldpc;
 #[allow(clippy::explicit_counter_loop, clippy::needless_range_loop)]
 pub mod message;
@@ -12,11 +12,13 @@ use rustfft::FftPlanner;

 use crate::protocol::FtxProtocol;

-/// Waterfall element storing magnitude (dB) and phase (radians).
+/// Waterfall element storing magnitude (dB), phase (radians), and raw complex components.
 #[derive(Clone, Copy, Default)]
 pub struct WfElem {
    pub mag: f32,
    pub phase: f32,
+    pub re: f32,
+    pub im: f32,
 }

 impl WfElem {
@@ -192,9 +194,10 @@ impl Monitor {
            }

            // Windowed FFT
-            for pos in 0..self.nfft {
-                self.fft_input[pos] = self.window[pos] * self.last_frame[pos];
-            }
+            self.fft_input
+                .iter_mut()
+                .zip(self.window.iter().zip(self.last_frame.iter()))
+                .for_each(|(dst, (w, f))| *dst = w * f);
            self.real_fft
                .process_with_scratch(
                    &mut self.fft_input,
@@ -214,7 +217,12 @@ impl Monitor {
                        let phase = c.im.atan2(c.re);

                        if offset < self.wf.mag.len() {
-                            self.wf.mag[offset] = WfElem { mag: db, phase };
+                            self.wf.mag[offset] = WfElem {
+                                mag: db,
+                                phase,
+                                re: c.re,
+                                im: c.im,
+                            };
                        }
                        offset += 1;

@@ -226,6 +234,8 @@ impl Monitor {
                            self.wf.mag[offset] = WfElem {
                                mag: -120.0,
                                phase: 0.0,
+                                re: 0.0,
+                                im: 0.0,
                            };
                        }
                        offset += 1;