[refactor](trx-ftx): eliminate heap allocations in LDPC and OSD decoders

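Replace Vec<Vec<f32>> with flat arrays in ldpc_decode (~114 kB, now on
the stack instead of the heap), convert 19+ Vec allocations to stack
arrays in osd174_91, eliminate the per-call temp Vec in nextpat91 via
in-place mutation, and replace norm() with norm_sqr() in the bitmetrics
hot loop (~5.4M calls/frame).

Note: norm_sqr() returns the squared magnitude, so `coherent` now holds
|sum|^2 rather than |sum|. build_generator_matrix still performs one
heap allocation (a boxed fixed-size array instead of a Vec).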

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Stan Grams <sjg@haxx.space>
This commit is contained in:
2026-03-19 19:41:34 +01:00
parent 9b49b41fb3
commit 9c9026e7ca
3 changed files with 54 additions and 62 deletions
+1 -1
View File
@@ -134,7 +134,7 @@ impl BitMetricsWorkspace {
}
_ => Complex32::new(0.0, 0.0),
};
let coherent = sum.norm();
let coherent = sum.norm_sqr();
for ib in 0..=ibmax {
if ((i >> (ibmax - ib)) & 1) != 0 {
+43 -50
View File
@@ -136,25 +136,22 @@ fn nextpat91(mi: &mut [u8], k: usize, iorder: usize, iflag: &mut i32) {
return;
}
let mut ms = vec![0u8; k];
for i in 0..ind as usize {
ms[i] = mi[i];
// Build new pattern in-place: zero out after ind, set the swap, pack remaining 1s at end
let ind_u = ind as usize;
for i in (ind_u + 1)..k {
mi[i] = 0;
}
ms[ind as usize] = 1;
ms[ind as usize + 1] = 0;
mi[ind_u] = 1;
if (ind as usize + 1) < k {
let mut nz = iorder as i32;
for i in 0..k {
nz -= ms[i] as i32;
}
if nz > 0 {
for i in (k - nz as usize)..k {
ms[i] = 1;
}
let mut nz = iorder as i32;
for i in 0..k {
nz -= mi[i] as i32;
}
if nz > 0 {
for i in (k - nz as usize)..k {
mi[i] = 1;
}
}
mi[..k].copy_from_slice(&ms[..k]);
*iflag = -1;
for i in 0..k {
@@ -276,40 +273,36 @@ pub fn osd174_91(
// unit vector e_i)
let gen = build_generator_matrix();
// Allocate working buffers
let mut genmrb = vec![0u8; k * n];
let mut g2 = vec![0u8; n * k];
let mut m0 = vec![0u8; k];
let mut me = vec![0u8; k];
let mut mi = vec![0u8; k];
let mut misub = vec![0u8; k];
let mut e2sub = vec![0u8; n - k];
let mut e2 = vec![0u8; n - k];
let mut ui = vec![0u8; n - k];
let mut r2pat = vec![0u8; n - k];
let mut hdec = vec![0u8; n];
let mut c0 = vec![0u8; n];
let mut ce = vec![0u8; n];
let mut nxor = vec![0u8; n];
let mut apmaskr = vec![0u8; n];
let mut rx = vec![0.0f32; n];
let mut absrx = vec![0.0f32; n];
let mut indices = vec![0usize; n];
// Stack-allocated working buffers (k=91, n=174, n-k=83).
let mut genmrb = [0u8; FTX_LDPC_K * FTX_LDPC_N];
let mut g2 = [0u8; FTX_LDPC_N * FTX_LDPC_K];
let mut m0 = [0u8; FTX_LDPC_K];
let mut me = [0u8; FTX_LDPC_K];
let mut mi = [0u8; FTX_LDPC_K];
let mut misub = [0u8; FTX_LDPC_K];
let mut e2sub = [0u8; FTX_LDPC_M];
let mut e2 = [0u8; FTX_LDPC_M];
let mut ui = [0u8; FTX_LDPC_M];
let mut r2pat = [0u8; FTX_LDPC_M];
let mut hdec = [0u8; FTX_LDPC_N];
let mut c0 = [0u8; FTX_LDPC_N];
let mut ce = [0u8; FTX_LDPC_N];
let mut nxor = [0u8; FTX_LDPC_N];
let mut apmaskr = [0u8; FTX_LDPC_N];
let mut rx = [0.0f32; FTX_LDPC_N];
let mut absrx = [0.0f32; FTX_LDPC_N];
let mut indices = [0usize; FTX_LDPC_N];
// Sort bits by reliability (descending)
struct RelEntry {
index: usize,
abs_llr: f32,
let mut rel_indices = [0usize; FTX_LDPC_N];
let mut rel_abs = [0.0f32; FTX_LDPC_N];
for i in 0..n {
rel_indices[i] = i;
rel_abs[i] = llr[i].abs();
}
let mut rel: Vec<RelEntry> = (0..n)
.map(|i| RelEntry {
index: i,
abs_llr: llr[i].abs(),
})
.collect();
rel.sort_by(|a, b| {
b.abs_llr
.partial_cmp(&a.abs_llr)
rel_indices[..n].sort_by(|&a, &b| {
rel_abs[b]
.partial_cmp(&rel_abs[a])
.unwrap_or(std::cmp::Ordering::Equal)
});
@@ -322,7 +315,7 @@ pub fn osd174_91(
// Reorder by reliability
for i in 0..n {
indices[i] = rel[i].index;
indices[i] = rel_indices[i];
for row in 0..k {
genmrb[row * n + i] = gen[row][indices[i]];
}
@@ -618,8 +611,8 @@ fn reorder_result(
/// Build the full per-bit generator matrix.
/// Each row `i` contains the 174-bit codeword produced by encoding
/// a unit vector with bit `i` set.
fn build_generator_matrix() -> Vec<[u8; FTX_LDPC_N]> {
let mut gen = vec![[0u8; FTX_LDPC_N]; FTX_LDPC_K];
fn build_generator_matrix() -> Box<[[u8; FTX_LDPC_N]; FTX_LDPC_K]> {
let mut gen = Box::new([[0u8; FTX_LDPC_N]; FTX_LDPC_K]);
for i in 0..FTX_LDPC_K {
let mut msg = [0u8; FTX_LDPC_K];
msg[i] = 1;
@@ -674,7 +667,7 @@ pub fn ft2_decode174_91_osd(
let nosd = if maxosd == 0 { 1 } else { maxosd };
let mut zsave = vec![[0.0f32; FTX_LDPC_N]; 3];
let mut zsave = [[0.0f32; FTX_LDPC_N]; 3];
if maxosd == 0 {
zsave[0].copy_from_slice(llr);
}
+10 -11
View File
@@ -64,15 +64,13 @@ pub fn ldpc_decode(
max_iters: usize,
plain: &mut [u8; FTX_LDPC_N],
) -> i32 {
// Allocate m[][] and e[][] on the heap (~60 kB each) to avoid stack overflow.
let mut m_matrix: Vec<Vec<f32>> = vec![vec![0.0f32; FTX_LDPC_N]; FTX_LDPC_M];
let mut e_matrix: Vec<Vec<f32>> = vec![vec![0.0f32; FTX_LDPC_N]; FTX_LDPC_M];
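// Flat row-major arrays for m[][] and e[][] (~57 kB each, ~114 kB total on stack);
// element (j, i) lives at index j * FTX_LDPC_N + i.
// NOTE(review): these were previously heap-allocated to avoid stack overflow —
// confirm every calling thread has enough stack for ~114 kB of locals.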
let mut m_matrix = [0.0f32; FTX_LDPC_M * FTX_LDPC_N];
let mut e_matrix = [0.0f32; FTX_LDPC_M * FTX_LDPC_N];
// Initialize m[][] with the channel LLRs.
for j in 0..FTX_LDPC_M {
for i in 0..FTX_LDPC_N {
m_matrix[j][i] = codeword[i];
}
m_matrix[j * FTX_LDPC_N..][..FTX_LDPC_N].copy_from_slice(codeword);
}
let mut min_errors = FTX_LDPC_M as i32;
@@ -81,16 +79,17 @@ pub fn ldpc_decode(
// Update e[][] from m[][]
for j in 0..FTX_LDPC_M {
let num_rows = FTX_LDPC_NUM_ROWS[j] as usize;
let m_row = j * FTX_LDPC_N;
for ii1 in 0..num_rows {
let i1 = FTX_LDPC_NM[j][ii1] as usize - 1;
let mut a = 1.0f32;
for ii2 in 0..num_rows {
let i2 = FTX_LDPC_NM[j][ii2] as usize - 1;
if i2 != i1 {
a *= fast_tanh(-m_matrix[j][i2] / 2.0f32);
a *= fast_tanh(-m_matrix[m_row + i2] / 2.0f32);
}
}
e_matrix[j][i1] = -2.0f32 * fast_atanh(a);
e_matrix[j * FTX_LDPC_N + i1] = -2.0f32 * fast_atanh(a);
}
}
@@ -98,7 +97,7 @@ pub fn ldpc_decode(
for i in 0..FTX_LDPC_N {
let mut l = codeword[i];
for j in 0..3 {
l += e_matrix[FTX_LDPC_MN[i][j] as usize - 1][i];
l += e_matrix[(FTX_LDPC_MN[i][j] as usize - 1) * FTX_LDPC_N + i];
}
plain[i] = if l > 0.0 { 1 } else { 0 };
}
@@ -119,10 +118,10 @@ pub fn ldpc_decode(
for ji2 in 0..3 {
if ji1 != ji2 {
let j2 = FTX_LDPC_MN[i][ji2] as usize - 1;
l += e_matrix[j2][i];
l += e_matrix[j2 * FTX_LDPC_N + i];
}
}
m_matrix[j1][i] = l;
m_matrix[j1 * FTX_LDPC_N + i] = l;
}
}
}