[refactor](trx-ftx): eliminate heap allocations in LDPC and OSD decoders

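Replace Vec<Vec<f32>> with flat stack arrays in ldpc_decode (~114 kB of
stack in total), convert 19+ Vec allocations to stack arrays in osd174_91,
and eliminate the per-call temporary Vec in nextpat91 by mutating the
pattern in place. build_generator_matrix still allocates on the heap, but
now as a single boxed array instead of a Vec. Also replace norm() with
norm_sqr() in the bitmetrics hot loop (~5.4M calls/frame); note that the
coherent metric is therefore the squared magnitude rather than the magnitude.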

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Stan Grams <sjg@haxx.space>
This commit is contained in:
2026-03-19 19:41:34 +01:00
parent 9b49b41fb3
commit 9c9026e7ca
3 changed files with 54 additions and 62 deletions
+1 -1
View File
@@ -134,7 +134,7 @@ impl BitMetricsWorkspace {
} }
_ => Complex32::new(0.0, 0.0), _ => Complex32::new(0.0, 0.0),
}; };
let coherent = sum.norm(); let coherent = sum.norm_sqr();
for ib in 0..=ibmax { for ib in 0..=ibmax {
if ((i >> (ibmax - ib)) & 1) != 0 { if ((i >> (ibmax - ib)) & 1) != 0 {
+38 -45
View File
@@ -136,25 +136,22 @@ fn nextpat91(mi: &mut [u8], k: usize, iorder: usize, iflag: &mut i32) {
return; return;
} }
let mut ms = vec![0u8; k]; // Build new pattern in-place: zero out after ind, set the swap, pack remaining 1s at end
for i in 0..ind as usize { let ind_u = ind as usize;
ms[i] = mi[i]; for i in (ind_u + 1)..k {
mi[i] = 0;
} }
ms[ind as usize] = 1; mi[ind_u] = 1;
ms[ind as usize + 1] = 0;
if (ind as usize + 1) < k {
let mut nz = iorder as i32; let mut nz = iorder as i32;
for i in 0..k { for i in 0..k {
nz -= ms[i] as i32; nz -= mi[i] as i32;
} }
if nz > 0 { if nz > 0 {
for i in (k - nz as usize)..k { for i in (k - nz as usize)..k {
ms[i] = 1; mi[i] = 1;
} }
} }
}
mi[..k].copy_from_slice(&ms[..k]);
*iflag = -1; *iflag = -1;
for i in 0..k { for i in 0..k {
@@ -276,40 +273,36 @@ pub fn osd174_91(
// unit vector e_i) // unit vector e_i)
let gen = build_generator_matrix(); let gen = build_generator_matrix();
// Allocate working buffers // Stack-allocated working buffers (k=91, n=174, n-k=83).
let mut genmrb = vec![0u8; k * n]; let mut genmrb = [0u8; FTX_LDPC_K * FTX_LDPC_N];
let mut g2 = vec![0u8; n * k]; let mut g2 = [0u8; FTX_LDPC_N * FTX_LDPC_K];
let mut m0 = vec![0u8; k]; let mut m0 = [0u8; FTX_LDPC_K];
let mut me = vec![0u8; k]; let mut me = [0u8; FTX_LDPC_K];
let mut mi = vec![0u8; k]; let mut mi = [0u8; FTX_LDPC_K];
let mut misub = vec![0u8; k]; let mut misub = [0u8; FTX_LDPC_K];
let mut e2sub = vec![0u8; n - k]; let mut e2sub = [0u8; FTX_LDPC_M];
let mut e2 = vec![0u8; n - k]; let mut e2 = [0u8; FTX_LDPC_M];
let mut ui = vec![0u8; n - k]; let mut ui = [0u8; FTX_LDPC_M];
let mut r2pat = vec![0u8; n - k]; let mut r2pat = [0u8; FTX_LDPC_M];
let mut hdec = vec![0u8; n]; let mut hdec = [0u8; FTX_LDPC_N];
let mut c0 = vec![0u8; n]; let mut c0 = [0u8; FTX_LDPC_N];
let mut ce = vec![0u8; n]; let mut ce = [0u8; FTX_LDPC_N];
let mut nxor = vec![0u8; n]; let mut nxor = [0u8; FTX_LDPC_N];
let mut apmaskr = vec![0u8; n]; let mut apmaskr = [0u8; FTX_LDPC_N];
let mut rx = vec![0.0f32; n]; let mut rx = [0.0f32; FTX_LDPC_N];
let mut absrx = vec![0.0f32; n]; let mut absrx = [0.0f32; FTX_LDPC_N];
let mut indices = vec![0usize; n]; let mut indices = [0usize; FTX_LDPC_N];
// Sort bits by reliability (descending) // Sort bits by reliability (descending)
struct RelEntry { let mut rel_indices = [0usize; FTX_LDPC_N];
index: usize, let mut rel_abs = [0.0f32; FTX_LDPC_N];
abs_llr: f32, for i in 0..n {
rel_indices[i] = i;
rel_abs[i] = llr[i].abs();
} }
let mut rel: Vec<RelEntry> = (0..n) rel_indices[..n].sort_by(|&a, &b| {
.map(|i| RelEntry { rel_abs[b]
index: i, .partial_cmp(&rel_abs[a])
abs_llr: llr[i].abs(),
})
.collect();
rel.sort_by(|a, b| {
b.abs_llr
.partial_cmp(&a.abs_llr)
.unwrap_or(std::cmp::Ordering::Equal) .unwrap_or(std::cmp::Ordering::Equal)
}); });
@@ -322,7 +315,7 @@ pub fn osd174_91(
// Reorder by reliability // Reorder by reliability
for i in 0..n { for i in 0..n {
indices[i] = rel[i].index; indices[i] = rel_indices[i];
for row in 0..k { for row in 0..k {
genmrb[row * n + i] = gen[row][indices[i]]; genmrb[row * n + i] = gen[row][indices[i]];
} }
@@ -618,8 +611,8 @@ fn reorder_result(
/// Build the full per-bit generator matrix. /// Build the full per-bit generator matrix.
/// Each row `i` contains the 174-bit codeword produced by encoding /// Each row `i` contains the 174-bit codeword produced by encoding
/// a unit vector with bit `i` set. /// a unit vector with bit `i` set.
fn build_generator_matrix() -> Vec<[u8; FTX_LDPC_N]> { fn build_generator_matrix() -> Box<[[u8; FTX_LDPC_N]; FTX_LDPC_K]> {
let mut gen = vec![[0u8; FTX_LDPC_N]; FTX_LDPC_K]; let mut gen = Box::new([[0u8; FTX_LDPC_N]; FTX_LDPC_K]);
for i in 0..FTX_LDPC_K { for i in 0..FTX_LDPC_K {
let mut msg = [0u8; FTX_LDPC_K]; let mut msg = [0u8; FTX_LDPC_K];
msg[i] = 1; msg[i] = 1;
@@ -674,7 +667,7 @@ pub fn ft2_decode174_91_osd(
let nosd = if maxosd == 0 { 1 } else { maxosd }; let nosd = if maxosd == 0 { 1 } else { maxosd };
let mut zsave = vec![[0.0f32; FTX_LDPC_N]; 3]; let mut zsave = [[0.0f32; FTX_LDPC_N]; 3];
if maxosd == 0 { if maxosd == 0 {
zsave[0].copy_from_slice(llr); zsave[0].copy_from_slice(llr);
} }
+10 -11
View File
@@ -64,15 +64,13 @@ pub fn ldpc_decode(
max_iters: usize, max_iters: usize,
plain: &mut [u8; FTX_LDPC_N], plain: &mut [u8; FTX_LDPC_N],
) -> i32 { ) -> i32 {
// Allocate m[][] and e[][] on the heap (~60 kB each) to avoid stack overflow. // Flat arrays for m[][] and e[][] (~57 kB each, ~114 kB total on stack).
let mut m_matrix: Vec<Vec<f32>> = vec![vec![0.0f32; FTX_LDPC_N]; FTX_LDPC_M]; let mut m_matrix = [0.0f32; FTX_LDPC_M * FTX_LDPC_N];
let mut e_matrix: Vec<Vec<f32>> = vec![vec![0.0f32; FTX_LDPC_N]; FTX_LDPC_M]; let mut e_matrix = [0.0f32; FTX_LDPC_M * FTX_LDPC_N];
// Initialize m[][] with the channel LLRs. // Initialize m[][] with the channel LLRs.
for j in 0..FTX_LDPC_M { for j in 0..FTX_LDPC_M {
for i in 0..FTX_LDPC_N { m_matrix[j * FTX_LDPC_N..][..FTX_LDPC_N].copy_from_slice(codeword);
m_matrix[j][i] = codeword[i];
}
} }
let mut min_errors = FTX_LDPC_M as i32; let mut min_errors = FTX_LDPC_M as i32;
@@ -81,16 +79,17 @@ pub fn ldpc_decode(
// Update e[][] from m[][] // Update e[][] from m[][]
for j in 0..FTX_LDPC_M { for j in 0..FTX_LDPC_M {
let num_rows = FTX_LDPC_NUM_ROWS[j] as usize; let num_rows = FTX_LDPC_NUM_ROWS[j] as usize;
let m_row = j * FTX_LDPC_N;
for ii1 in 0..num_rows { for ii1 in 0..num_rows {
let i1 = FTX_LDPC_NM[j][ii1] as usize - 1; let i1 = FTX_LDPC_NM[j][ii1] as usize - 1;
let mut a = 1.0f32; let mut a = 1.0f32;
for ii2 in 0..num_rows { for ii2 in 0..num_rows {
let i2 = FTX_LDPC_NM[j][ii2] as usize - 1; let i2 = FTX_LDPC_NM[j][ii2] as usize - 1;
if i2 != i1 { if i2 != i1 {
a *= fast_tanh(-m_matrix[j][i2] / 2.0f32); a *= fast_tanh(-m_matrix[m_row + i2] / 2.0f32);
} }
} }
e_matrix[j][i1] = -2.0f32 * fast_atanh(a); e_matrix[j * FTX_LDPC_N + i1] = -2.0f32 * fast_atanh(a);
} }
} }
@@ -98,7 +97,7 @@ pub fn ldpc_decode(
for i in 0..FTX_LDPC_N { for i in 0..FTX_LDPC_N {
let mut l = codeword[i]; let mut l = codeword[i];
for j in 0..3 { for j in 0..3 {
l += e_matrix[FTX_LDPC_MN[i][j] as usize - 1][i]; l += e_matrix[(FTX_LDPC_MN[i][j] as usize - 1) * FTX_LDPC_N + i];
} }
plain[i] = if l > 0.0 { 1 } else { 0 }; plain[i] = if l > 0.0 { 1 } else { 0 };
} }
@@ -119,10 +118,10 @@ pub fn ldpc_decode(
for ji2 in 0..3 { for ji2 in 0..3 {
if ji1 != ji2 { if ji1 != ji2 {
let j2 = FTX_LDPC_MN[i][ji2] as usize - 1; let j2 = FTX_LDPC_MN[i][ji2] as usize - 1;
l += e_matrix[j2][i]; l += e_matrix[j2 * FTX_LDPC_N + i];
} }
} }
m_matrix[j1][i] = l; m_matrix[j1 * FTX_LDPC_N + i] = l;
} }
} }
} }