Skip to content

Commit

Permalink
update occupancy calculations
Browse files Browse the repository at this point in the history
  • Loading branch information
peter6866 committed Nov 17, 2024
1 parent 93e5104 commit 84a0272
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tf-binding-rs"
version = "0.1.3"
version = "0.1.4"
edition = "2021"
description = "Fast transcription factor binding site prediction and FASTA manipulation in Rust"
license = "MIT"
Expand Down
9 changes: 5 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
use tf_binding_rs::occupancy;

fn main() {
let df = occupancy::read_pwm_to_ewm("tests/data/tdmMotifs.meme").unwrap();

let pwm1 = df.get("NRL_HUMAN.MA0842.1").unwrap();
println!("{:?}", pwm1);
// let df = occupancy::
let seq = "GAGCCGGGTCATGAAAAAGGGGATCTTGTGTGTCTGTCCACGATAAGCACTATCACAAGGACTTTCTATAAACTCACAAGAAATTTCTGCCCACCCAGCACACAGTTTGTCCAGCTCATCCTGTAGGTGTCTCTATAATAGGACCTATCATAAAAAATTCCTCAAGACTGCAGCATTTCAGATAAGCCACCCTCACAAGA";
let ewms = occupancy::read_pwm_to_ewm("tests/data/tdmMotifs.meme").unwrap();
let landscape = occupancy::total_landscape(seq, &ewms, 8.0).unwrap();
println!("{:?}", landscape);
}
167 changes: 167 additions & 0 deletions src/occupancy.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use crate::error::MotifError;
use crate::fasta::reverse_complement;
use crate::types::*;
use polars::lazy::dsl::*;
use polars::prelude::*;
use std::collections::HashMap;
use std::fmt::format;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::iter::Peekable;
Expand Down Expand Up @@ -227,3 +229,168 @@ pub fn read_pwm_to_ewm(filename: &str) -> Result<EWMCollection, MotifError> {

Ok(ewms)
}

/// Scans both strands of a sequence with an energy matrix to compute binding energies
///
/// This function calculates the energy score for each possible k-mer in the sequence on both
/// forward and reverse strands. For each position, it extracts the k-mer subsequence and
/// calculates the total energy score by summing individual nucleotide contributions.
///
/// # Arguments
/// * `seq` - The DNA sequence to scan
/// * `ewm` - Energy Weight Matrix as a DataFrame where columns represent A,C,G,T and rows are positions
///
/// # Returns
/// * `Result<(Vec<f64>, Vec<f64>), MotifError>` - A tuple containing forward and reverse strand scores
///
/// # Errors
/// * `MotifError::DataError` - If there are issues extracting values from the EWM DataFrame
///
/// # Example
/// ```ignore
/// use tf_binding_rs::occupancy::energy_landscape;
///
/// let seq = "ATCGATCG";
/// let (fwd_scores, rev_scores) = energy_landscape(&seq, &ewm).unwrap();
/// println!("Forward strand scores: {:?}", fwd_scores);
/// println!("Reverse strand scores: {:?}", rev_scores);
/// ```
pub fn energy_landscape(seq: &str, ewm: &EWM) -> Result<(Vec<f64>, Vec<f64>), MotifError> {
let motif_len = ewm.height();
let n_scores = seq.len() - motif_len + 1;
let r_seq = reverse_complement(seq)?;

let mut fscores = vec![0.0; n_scores];
let mut rscores = vec![0.0; n_scores];

for (pos, (fscore, rscore)) in fscores.iter_mut().zip(rscores.iter_mut()).enumerate() {
let f_kmer = &seq[pos..pos + motif_len];
let r_kmer = &r_seq[pos..pos + motif_len];

*fscore = (0..motif_len)
.map(|i| {
ewm.column(&f_kmer[i..i + 1])
.unwrap()
.get(i)
.unwrap()
.try_extract::<f64>()
.map_err(|e| MotifError::DataError(e.to_string()))
})
.sum::<Result<f64, MotifError>>()?;

*rscore = (0..motif_len)
.map(|i| {
ewm.column(&r_kmer[i..i + 1])
.unwrap()
.get(i)
.unwrap()
.try_extract::<f64>()
.map_err(|e| MotifError::DataError(e.to_string()))
})
.sum::<Result<f64, MotifError>>()?;
}

rscores.reverse();
Ok((fscores, rscores))
}

/// Computes the occupancy landscape by scanning sequence with the energy matrix
///
/// This function calculates the probability of TF binding at each position by:
/// 1. Computing energy scores using `energy_landscape()`
/// 2. Converting energy scores to occupancy probabilities using the formula:
/// occupancy = 1 / (1 + exp(energy - mu))
/// where mu is the chemical potential of the transcription factor.
///
/// # Arguments
/// * `seq` - The DNA sequence to scan
/// * `ewm` - Energy Weight Matrix as a DataFrame
/// * `mu` - Chemical potential of the transcription factor
///
/// # Returns
/// * `Result<(Vec<f64>, Vec<f64>), MotifError>` - A tuple containing forward and reverse strand occupancies
///
/// # Errors
/// * `MotifError::DataError` - If there are issues calculating energy scores
///
/// # Example
/// ```ignore
/// use tf_binding_rs::occupancy::occupancy_landscape;
///
/// let seq = "ATCGATCG";
/// let mu = -3.0;
/// let (fwd_occ, rev_occ) = occupancy_landscape(&seq, &ewm, mu).unwrap();
/// println!("Forward strand occupancy: {:?}", fwd_occ);
/// ```
pub fn occupancy_landscape(
seq: &str,
ewm: &EWM,
mu: f64,
) -> Result<(Vec<f64>, Vec<f64>), MotifError> {
let (fscores, rscores) = energy_landscape(seq, ewm)?;

let foccupancies: Vec<f64> = fscores
.into_iter()
.map(|s| 1.0 / (1.0 + (s - mu).exp()))
.collect();

let roccupancies: Vec<f64> = rscores
.into_iter()
.map(|s| 1.0 / (1.0 + (s - mu).exp()))
.collect();

Ok((foccupancies, roccupancies))
}

/// Computes the occupancy landscape for multiple transcription factors
///
/// This function calculates binding probabilities for each TF in the collection and combines
/// them into a single DataFrame. The results include both forward and reverse strand occupancies
/// for each TF, with values padded to match the sequence length.
///
/// # Arguments
/// * `seq` - The DNA sequence to scan
/// * `ewms` - Collection of Energy Weight Matrices, where keys are TF names
/// * `mu` - Chemical potential of the transcription factors
///
/// # Returns
/// * `Result<DataFrame, MotifError>` - DataFrame containing occupancy predictions where:
/// - Rows represent positions in the sequence
/// - Columns are named "{TF_NAME}_F" and "{TF_NAME}_R" for forward/reverse orientations
/// - Values indicate predicted occupancy (0-1) at each position
///
/// # Errors
/// * `MotifError::DataError` - If there are issues creating the DataFrame or calculating occupancies
///
/// # Example
/// ```ignore
/// use tf_binding_rs::occupancy::total_landscape;
///
/// let seq = "ATCGATCG";
/// let mu = -3.0;
/// let landscape = total_landscape(&seq, &ewm_collection, mu).unwrap();
/// println!("Combined occupancy landscape:\n{}", landscape);
/// ```
pub fn total_landscape(seq: &str, ewms: &EWMCollection, mu: f64) -> Result<DataFrame, MotifError> {
let seq_len = seq.len();
let mut columns: Vec<Column> = Vec::new();
let mut names: Vec<String> = Vec::new();

for (name, ewm) in ewms {
let (fscores, rscores) = occupancy_landscape(seq, ewm, mu)?;

// pad scores to sequence length
let amount_to_add = seq_len - fscores.len();
let mut fscores_padded = fscores.clone();
let mut rscores_padded = rscores.clone();
fscores_padded.extend(vec![0.0; amount_to_add]);
rscores_padded.extend(vec![0.0; amount_to_add]);

// create series for forward and reverse scores
columns.push(Column::new(format!("{}_F", name).into(), fscores_padded));
columns.push(Column::new(format!("{}_R", name).into(), rscores_padded));
names.push(name.to_string());
}

DataFrame::new(columns).map_err(|e| MotifError::DataError(e.to_string()))
}

0 comments on commit 84a0272

Please sign in to comment.