[refactor](trx-ftx): eliminate heap allocations in LDPC and OSD decoders
Replace Vec<Vec<f32>> with flat stack arrays in ldpc_decode (~114 KB), convert 19+ Vec allocations to stack arrays in osd174_91, eliminate the per-call temporary Vec in nextpat91 via in-place mutation, and replace norm() with norm_sqr() in the bitmetrics hot loop (~5.4M calls/frame).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Stan Grams <sjg@haxx.space>
This commit is contained in:
@@ -134,7 +134,7 @@ impl BitMetricsWorkspace {
|
|||||||
}
|
}
|
||||||
_ => Complex32::new(0.0, 0.0),
|
_ => Complex32::new(0.0, 0.0),
|
||||||
};
|
};
|
||||||
let coherent = sum.norm();
|
let coherent = sum.norm_sqr();
|
||||||
|
|
||||||
for ib in 0..=ibmax {
|
for ib in 0..=ibmax {
|
||||||
if ((i >> (ibmax - ib)) & 1) != 0 {
|
if ((i >> (ibmax - ib)) & 1) != 0 {
|
||||||
|
|||||||
@@ -136,25 +136,22 @@ fn nextpat91(mi: &mut [u8], k: usize, iorder: usize, iflag: &mut i32) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut ms = vec![0u8; k];
|
// Build new pattern in-place: zero out after ind, set the swap, pack remaining 1s at end
|
||||||
for i in 0..ind as usize {
|
let ind_u = ind as usize;
|
||||||
ms[i] = mi[i];
|
for i in (ind_u + 1)..k {
|
||||||
|
mi[i] = 0;
|
||||||
}
|
}
|
||||||
ms[ind as usize] = 1;
|
mi[ind_u] = 1;
|
||||||
ms[ind as usize + 1] = 0;
|
|
||||||
|
|
||||||
if (ind as usize + 1) < k {
|
|
||||||
let mut nz = iorder as i32;
|
let mut nz = iorder as i32;
|
||||||
for i in 0..k {
|
for i in 0..k {
|
||||||
nz -= ms[i] as i32;
|
nz -= mi[i] as i32;
|
||||||
}
|
}
|
||||||
if nz > 0 {
|
if nz > 0 {
|
||||||
for i in (k - nz as usize)..k {
|
for i in (k - nz as usize)..k {
|
||||||
ms[i] = 1;
|
mi[i] = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
mi[..k].copy_from_slice(&ms[..k]);
|
|
||||||
|
|
||||||
*iflag = -1;
|
*iflag = -1;
|
||||||
for i in 0..k {
|
for i in 0..k {
|
||||||
@@ -276,40 +273,36 @@ pub fn osd174_91(
|
|||||||
// unit vector e_i)
|
// unit vector e_i)
|
||||||
let gen = build_generator_matrix();
|
let gen = build_generator_matrix();
|
||||||
|
|
||||||
// Allocate working buffers
|
// Stack-allocated working buffers (k=91, n=174, n-k=83).
|
||||||
let mut genmrb = vec![0u8; k * n];
|
let mut genmrb = [0u8; FTX_LDPC_K * FTX_LDPC_N];
|
||||||
let mut g2 = vec![0u8; n * k];
|
let mut g2 = [0u8; FTX_LDPC_N * FTX_LDPC_K];
|
||||||
let mut m0 = vec![0u8; k];
|
let mut m0 = [0u8; FTX_LDPC_K];
|
||||||
let mut me = vec![0u8; k];
|
let mut me = [0u8; FTX_LDPC_K];
|
||||||
let mut mi = vec![0u8; k];
|
let mut mi = [0u8; FTX_LDPC_K];
|
||||||
let mut misub = vec![0u8; k];
|
let mut misub = [0u8; FTX_LDPC_K];
|
||||||
let mut e2sub = vec![0u8; n - k];
|
let mut e2sub = [0u8; FTX_LDPC_M];
|
||||||
let mut e2 = vec![0u8; n - k];
|
let mut e2 = [0u8; FTX_LDPC_M];
|
||||||
let mut ui = vec![0u8; n - k];
|
let mut ui = [0u8; FTX_LDPC_M];
|
||||||
let mut r2pat = vec![0u8; n - k];
|
let mut r2pat = [0u8; FTX_LDPC_M];
|
||||||
let mut hdec = vec![0u8; n];
|
let mut hdec = [0u8; FTX_LDPC_N];
|
||||||
let mut c0 = vec![0u8; n];
|
let mut c0 = [0u8; FTX_LDPC_N];
|
||||||
let mut ce = vec![0u8; n];
|
let mut ce = [0u8; FTX_LDPC_N];
|
||||||
let mut nxor = vec![0u8; n];
|
let mut nxor = [0u8; FTX_LDPC_N];
|
||||||
let mut apmaskr = vec![0u8; n];
|
let mut apmaskr = [0u8; FTX_LDPC_N];
|
||||||
let mut rx = vec![0.0f32; n];
|
let mut rx = [0.0f32; FTX_LDPC_N];
|
||||||
let mut absrx = vec![0.0f32; n];
|
let mut absrx = [0.0f32; FTX_LDPC_N];
|
||||||
let mut indices = vec![0usize; n];
|
let mut indices = [0usize; FTX_LDPC_N];
|
||||||
|
|
||||||
// Sort bits by reliability (descending)
|
// Sort bits by reliability (descending)
|
||||||
struct RelEntry {
|
let mut rel_indices = [0usize; FTX_LDPC_N];
|
||||||
index: usize,
|
let mut rel_abs = [0.0f32; FTX_LDPC_N];
|
||||||
abs_llr: f32,
|
for i in 0..n {
|
||||||
|
rel_indices[i] = i;
|
||||||
|
rel_abs[i] = llr[i].abs();
|
||||||
}
|
}
|
||||||
let mut rel: Vec<RelEntry> = (0..n)
|
rel_indices[..n].sort_by(|&a, &b| {
|
||||||
.map(|i| RelEntry {
|
rel_abs[b]
|
||||||
index: i,
|
.partial_cmp(&rel_abs[a])
|
||||||
abs_llr: llr[i].abs(),
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
rel.sort_by(|a, b| {
|
|
||||||
b.abs_llr
|
|
||||||
.partial_cmp(&a.abs_llr)
|
|
||||||
.unwrap_or(std::cmp::Ordering::Equal)
|
.unwrap_or(std::cmp::Ordering::Equal)
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -322,7 +315,7 @@ pub fn osd174_91(
|
|||||||
|
|
||||||
// Reorder by reliability
|
// Reorder by reliability
|
||||||
for i in 0..n {
|
for i in 0..n {
|
||||||
indices[i] = rel[i].index;
|
indices[i] = rel_indices[i];
|
||||||
for row in 0..k {
|
for row in 0..k {
|
||||||
genmrb[row * n + i] = gen[row][indices[i]];
|
genmrb[row * n + i] = gen[row][indices[i]];
|
||||||
}
|
}
|
||||||
@@ -618,8 +611,8 @@ fn reorder_result(
|
|||||||
/// Build the full per-bit generator matrix.
|
/// Build the full per-bit generator matrix.
|
||||||
/// Each row `i` contains the 174-bit codeword produced by encoding
|
/// Each row `i` contains the 174-bit codeword produced by encoding
|
||||||
/// a unit vector with bit `i` set.
|
/// a unit vector with bit `i` set.
|
||||||
fn build_generator_matrix() -> Vec<[u8; FTX_LDPC_N]> {
|
fn build_generator_matrix() -> Box<[[u8; FTX_LDPC_N]; FTX_LDPC_K]> {
|
||||||
let mut gen = vec![[0u8; FTX_LDPC_N]; FTX_LDPC_K];
|
let mut gen = Box::new([[0u8; FTX_LDPC_N]; FTX_LDPC_K]);
|
||||||
for i in 0..FTX_LDPC_K {
|
for i in 0..FTX_LDPC_K {
|
||||||
let mut msg = [0u8; FTX_LDPC_K];
|
let mut msg = [0u8; FTX_LDPC_K];
|
||||||
msg[i] = 1;
|
msg[i] = 1;
|
||||||
@@ -674,7 +667,7 @@ pub fn ft2_decode174_91_osd(
|
|||||||
|
|
||||||
let nosd = if maxosd == 0 { 1 } else { maxosd };
|
let nosd = if maxosd == 0 { 1 } else { maxosd };
|
||||||
|
|
||||||
let mut zsave = vec![[0.0f32; FTX_LDPC_N]; 3];
|
let mut zsave = [[0.0f32; FTX_LDPC_N]; 3];
|
||||||
if maxosd == 0 {
|
if maxosd == 0 {
|
||||||
zsave[0].copy_from_slice(llr);
|
zsave[0].copy_from_slice(llr);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -64,15 +64,13 @@ pub fn ldpc_decode(
|
|||||||
max_iters: usize,
|
max_iters: usize,
|
||||||
plain: &mut [u8; FTX_LDPC_N],
|
plain: &mut [u8; FTX_LDPC_N],
|
||||||
) -> i32 {
|
) -> i32 {
|
||||||
// Allocate m[][] and e[][] on the heap (~60 kB each) to avoid stack overflow.
|
// Flat arrays for m[][] and e[][] (~57 kB each, ~114 kB total on stack).
|
||||||
let mut m_matrix: Vec<Vec<f32>> = vec![vec![0.0f32; FTX_LDPC_N]; FTX_LDPC_M];
|
let mut m_matrix = [0.0f32; FTX_LDPC_M * FTX_LDPC_N];
|
||||||
let mut e_matrix: Vec<Vec<f32>> = vec![vec![0.0f32; FTX_LDPC_N]; FTX_LDPC_M];
|
let mut e_matrix = [0.0f32; FTX_LDPC_M * FTX_LDPC_N];
|
||||||
|
|
||||||
// Initialize m[][] with the channel LLRs.
|
// Initialize m[][] with the channel LLRs.
|
||||||
for j in 0..FTX_LDPC_M {
|
for j in 0..FTX_LDPC_M {
|
||||||
for i in 0..FTX_LDPC_N {
|
m_matrix[j * FTX_LDPC_N..][..FTX_LDPC_N].copy_from_slice(codeword);
|
||||||
m_matrix[j][i] = codeword[i];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut min_errors = FTX_LDPC_M as i32;
|
let mut min_errors = FTX_LDPC_M as i32;
|
||||||
@@ -81,16 +79,17 @@ pub fn ldpc_decode(
|
|||||||
// Update e[][] from m[][]
|
// Update e[][] from m[][]
|
||||||
for j in 0..FTX_LDPC_M {
|
for j in 0..FTX_LDPC_M {
|
||||||
let num_rows = FTX_LDPC_NUM_ROWS[j] as usize;
|
let num_rows = FTX_LDPC_NUM_ROWS[j] as usize;
|
||||||
|
let m_row = j * FTX_LDPC_N;
|
||||||
for ii1 in 0..num_rows {
|
for ii1 in 0..num_rows {
|
||||||
let i1 = FTX_LDPC_NM[j][ii1] as usize - 1;
|
let i1 = FTX_LDPC_NM[j][ii1] as usize - 1;
|
||||||
let mut a = 1.0f32;
|
let mut a = 1.0f32;
|
||||||
for ii2 in 0..num_rows {
|
for ii2 in 0..num_rows {
|
||||||
let i2 = FTX_LDPC_NM[j][ii2] as usize - 1;
|
let i2 = FTX_LDPC_NM[j][ii2] as usize - 1;
|
||||||
if i2 != i1 {
|
if i2 != i1 {
|
||||||
a *= fast_tanh(-m_matrix[j][i2] / 2.0f32);
|
a *= fast_tanh(-m_matrix[m_row + i2] / 2.0f32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
e_matrix[j][i1] = -2.0f32 * fast_atanh(a);
|
e_matrix[j * FTX_LDPC_N + i1] = -2.0f32 * fast_atanh(a);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -98,7 +97,7 @@ pub fn ldpc_decode(
|
|||||||
for i in 0..FTX_LDPC_N {
|
for i in 0..FTX_LDPC_N {
|
||||||
let mut l = codeword[i];
|
let mut l = codeword[i];
|
||||||
for j in 0..3 {
|
for j in 0..3 {
|
||||||
l += e_matrix[FTX_LDPC_MN[i][j] as usize - 1][i];
|
l += e_matrix[(FTX_LDPC_MN[i][j] as usize - 1) * FTX_LDPC_N + i];
|
||||||
}
|
}
|
||||||
plain[i] = if l > 0.0 { 1 } else { 0 };
|
plain[i] = if l > 0.0 { 1 } else { 0 };
|
||||||
}
|
}
|
||||||
@@ -119,10 +118,10 @@ pub fn ldpc_decode(
|
|||||||
for ji2 in 0..3 {
|
for ji2 in 0..3 {
|
||||||
if ji1 != ji2 {
|
if ji1 != ji2 {
|
||||||
let j2 = FTX_LDPC_MN[i][ji2] as usize - 1;
|
let j2 = FTX_LDPC_MN[i][ji2] as usize - 1;
|
||||||
l += e_matrix[j2][i];
|
l += e_matrix[j2 * FTX_LDPC_N + i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m_matrix[j1][i] = l;
|
m_matrix[j1 * FTX_LDPC_N + i] = l;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user