[refactor](trx-ftx): eliminate heap allocations in LDPC and OSD decoders

Replace Vec<Vec<f32>> with flat stack arrays in ldpc_decode (~114KB), convert 19+ Vec allocations to stack arrays in osd174_91, eliminate the per-call temporary Vec in nextpat91 via in-place mutation, and replace norm() with norm_sqr() in the bitmetrics hot loop (~5.4M calls/frame).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Stan Grams <sjg@haxx.space>
2026-03-19 19:41:34 +01:00
parent 9b49b41fb3
commit 9c9026e7ca
3 changed files with 54 additions and 62 deletions
@@ -134,7 +134,7 @@ impl BitMetricsWorkspace {
                        }
                        _ => Complex32::new(0.0, 0.0),
                    };
-                    let coherent = sum.norm();
+                    let coherent = sum.norm_sqr();
                    for ib in 0..=ibmax {
                        if ((i >> (ibmax - ib)) & 1) != 0 {
@@ -136,25 +136,22 @@ fn nextpat91(mi: &mut [u8], k: usize, iorder: usize, iflag: &mut i32) {
        return;
    }
-    let mut ms = vec![0u8; k];
+    // Build new pattern in-place: zero out after ind, set the swap, pack remaining 1s at end
-    for i in 0..ind as usize {
+    let ind_u = ind as usize;
-        ms[i] = mi[i];
+    for i in (ind_u + 1)..k {
        mi[i] = 0;
    }
-    ms[ind as usize] = 1;
+    mi[ind_u] = 1;
    ms[ind as usize + 1] = 0;
    if (ind as usize + 1) < k {
    let mut nz = iorder as i32;
    for i in 0..k {
-            nz -= ms[i] as i32;
+        nz -= mi[i] as i32;
    }
    if nz > 0 {
        for i in (k - nz as usize)..k {
-                ms[i] = 1;
+            mi[i] = 1;
        }
    }
    }
    mi[..k].copy_from_slice(&ms[..k]);
    *iflag = -1;
    for i in 0..k {
@@ -276,40 +273,36 @@ pub fn osd174_91(
    // unit vector e_i)
    let gen = build_generator_matrix();
-    // Allocate working buffers
+    // Stack-allocated working buffers (k=91, n=174, n-k=83).
-    let mut genmrb = vec![0u8; k * n];
+    let mut genmrb = [0u8; FTX_LDPC_K * FTX_LDPC_N];
-    let mut g2 = vec![0u8; n * k];
+    let mut g2 = [0u8; FTX_LDPC_N * FTX_LDPC_K];
-    let mut m0 = vec![0u8; k];
+    let mut m0 = [0u8; FTX_LDPC_K];
-    let mut me = vec![0u8; k];
+    let mut me = [0u8; FTX_LDPC_K];
-    let mut mi = vec![0u8; k];
+    let mut mi = [0u8; FTX_LDPC_K];
-    let mut misub = vec![0u8; k];
+    let mut misub = [0u8; FTX_LDPC_K];
-    let mut e2sub = vec![0u8; n - k];
+    let mut e2sub = [0u8; FTX_LDPC_M];
-    let mut e2 = vec![0u8; n - k];
+    let mut e2 = [0u8; FTX_LDPC_M];
-    let mut ui = vec![0u8; n - k];
+    let mut ui = [0u8; FTX_LDPC_M];
-    let mut r2pat = vec![0u8; n - k];
+    let mut r2pat = [0u8; FTX_LDPC_M];
-    let mut hdec = vec![0u8; n];
+    let mut hdec = [0u8; FTX_LDPC_N];
-    let mut c0 = vec![0u8; n];
+    let mut c0 = [0u8; FTX_LDPC_N];
-    let mut ce = vec![0u8; n];
+    let mut ce = [0u8; FTX_LDPC_N];
-    let mut nxor = vec![0u8; n];
+    let mut nxor = [0u8; FTX_LDPC_N];
-    let mut apmaskr = vec![0u8; n];
+    let mut apmaskr = [0u8; FTX_LDPC_N];
-    let mut rx = vec![0.0f32; n];
+    let mut rx = [0.0f32; FTX_LDPC_N];
-    let mut absrx = vec![0.0f32; n];
+    let mut absrx = [0.0f32; FTX_LDPC_N];
-    let mut indices = vec![0usize; n];
+    let mut indices = [0usize; FTX_LDPC_N];
    // Sort bits by reliability (descending)
-    struct RelEntry {
+    let mut rel_indices = [0usize; FTX_LDPC_N];
-        index: usize,
+    let mut rel_abs = [0.0f32; FTX_LDPC_N];
-        abs_llr: f32,
+    for i in 0..n {
        rel_indices[i] = i;
        rel_abs[i] = llr[i].abs();
    }
-    let mut rel: Vec<RelEntry> = (0..n)
+    rel_indices[..n].sort_by(|&a, &b| {
-        .map(|i| RelEntry {
+        rel_abs[b]
-            index: i,
+            .partial_cmp(&rel_abs[a])
            abs_llr: llr[i].abs(),
        })
        .collect();
    rel.sort_by(|a, b| {
        b.abs_llr
            .partial_cmp(&a.abs_llr)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
@@ -322,7 +315,7 @@ pub fn osd174_91(
    // Reorder by reliability
    for i in 0..n {
-        indices[i] = rel[i].index;
+        indices[i] = rel_indices[i];
        for row in 0..k {
            genmrb[row * n + i] = gen[row][indices[i]];
        }
@@ -618,8 +611,8 @@ fn reorder_result(
 /// Build the full per-bit generator matrix.
 /// Each row `i` contains the 174-bit codeword produced by encoding
 /// a unit vector with bit `i` set.
-fn build_generator_matrix() -> Vec<[u8; FTX_LDPC_N]> {
+fn build_generator_matrix() -> Box<[[u8; FTX_LDPC_N]; FTX_LDPC_K]> {
-    let mut gen = vec![[0u8; FTX_LDPC_N]; FTX_LDPC_K];
+    let mut gen = Box::new([[0u8; FTX_LDPC_N]; FTX_LDPC_K]);
    for i in 0..FTX_LDPC_K {
        let mut msg = [0u8; FTX_LDPC_K];
        msg[i] = 1;
@@ -674,7 +667,7 @@ pub fn ft2_decode174_91_osd(
    let nosd = if maxosd == 0 { 1 } else { maxosd };
-    let mut zsave = vec![[0.0f32; FTX_LDPC_N]; 3];
+    let mut zsave = [[0.0f32; FTX_LDPC_N]; 3];
    if maxosd == 0 {
        zsave[0].copy_from_slice(llr);
    }
@@ -64,15 +64,13 @@ pub fn ldpc_decode(
    max_iters: usize,
    plain: &mut [u8; FTX_LDPC_N],
 ) -> i32 {
-    // Allocate m[][] and e[][] on the heap (~60 kB each) to avoid stack overflow.
+    // Flat arrays for m[][] and e[][] (~57 kB each, ~114 kB total on stack).
-    let mut m_matrix: Vec<Vec<f32>> = vec![vec![0.0f32; FTX_LDPC_N]; FTX_LDPC_M];
+    let mut m_matrix = [0.0f32; FTX_LDPC_M * FTX_LDPC_N];
-    let mut e_matrix: Vec<Vec<f32>> = vec![vec![0.0f32; FTX_LDPC_N]; FTX_LDPC_M];
+    let mut e_matrix = [0.0f32; FTX_LDPC_M * FTX_LDPC_N];
    // Initialize m[][] with the channel LLRs.
    for j in 0..FTX_LDPC_M {
-        for i in 0..FTX_LDPC_N {
+        m_matrix[j * FTX_LDPC_N..][..FTX_LDPC_N].copy_from_slice(codeword);
            m_matrix[j][i] = codeword[i];
        }
    }
    let mut min_errors = FTX_LDPC_M as i32;
@@ -81,16 +79,17 @@ pub fn ldpc_decode(
        // Update e[][] from m[][]
        for j in 0..FTX_LDPC_M {
            let num_rows = FTX_LDPC_NUM_ROWS[j] as usize;
            let m_row = j * FTX_LDPC_N;
            for ii1 in 0..num_rows {
                let i1 = FTX_LDPC_NM[j][ii1] as usize - 1;
                let mut a = 1.0f32;
                for ii2 in 0..num_rows {
                    let i2 = FTX_LDPC_NM[j][ii2] as usize - 1;
                    if i2 != i1 {
-                        a *= fast_tanh(-m_matrix[j][i2] / 2.0f32);
+                        a *= fast_tanh(-m_matrix[m_row + i2] / 2.0f32);
                    }
                }
-                e_matrix[j][i1] = -2.0f32 * fast_atanh(a);
+                e_matrix[j * FTX_LDPC_N + i1] = -2.0f32 * fast_atanh(a);
            }
        }
@@ -98,7 +97,7 @@ pub fn ldpc_decode(
        for i in 0..FTX_LDPC_N {
            let mut l = codeword[i];
            for j in 0..3 {
-                l += e_matrix[FTX_LDPC_MN[i][j] as usize - 1][i];
+                l += e_matrix[(FTX_LDPC_MN[i][j] as usize - 1) * FTX_LDPC_N + i];
            }
            plain[i] = if l > 0.0 { 1 } else { 0 };
        }
@@ -119,10 +118,10 @@ pub fn ldpc_decode(
                for ji2 in 0..3 {
                    if ji1 != ji2 {
                        let j2 = FTX_LDPC_MN[i][ji2] as usize - 1;
-                        l += e_matrix[j2][i];
+                        l += e_matrix[j2 * FTX_LDPC_N + i];
                    }
                }
-                m_matrix[j1][i] = l;
+                m_matrix[j1 * FTX_LDPC_N + i] = l;
            }
        }
    }