Skip to content

Commit

Permalink
v1.5.4
Browse files Browse the repository at this point in the history
stats per ieb
  • Loading branch information
jlanga authored Apr 12, 2019
2 parents e300b50 + 59fa874 commit 615c0e6
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 8 deletions.
20 changes: 15 additions & 5 deletions bin/compare_to_gff3
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ a GFA1 prediction from exfi
import logging
import sys

import pandas as pd

from exfi.arguments import compare_to_gff_args
from exfi.logger import set_up_logger
from exfi.io.bed import BED3_COLS
Expand All @@ -15,7 +17,8 @@ from exfi.io.gff3_to_bed import gff3_to_bed3
from exfi.io.read_bed import read_bed3
from exfi.io.fasta_to_dict import fasta_to_dict
from exfi.compare import \
classify, compute_stats_per_base, compute_stats_per_exon
classify, \
compute_stats_per_base, compute_stats_per_exon, compute_stats_per_ieb


if __name__ == '__main__':
Expand Down Expand Up @@ -56,12 +59,19 @@ if __name__ == '__main__':

EXON_STATS = compute_stats_per_exon(CLASSIFICATION)
BASE_STATS = compute_stats_per_base(CLASSIFICATION)
IEB_STATS = compute_stats_per_ieb(
BED3_TRUE, BED3_PRED,
max_distance=ARGS['max_distance']
)

EXON_STATS.insert(loc=0, column='type', value='per_exon')
EXON_STATS.insert(loc=1, column='fraction', value=ARGS['fraction'])
EXON_STATS.insert(loc=1, column='fraction/dist', value=ARGS['fraction'])

BASE_STATS.insert(loc=0, column='type', value='per_base')
BASE_STATS.insert(loc=1, column='fraction', value=ARGS['fraction'])
BASE_STATS.insert(loc=1, column='fraction/dist', value=ARGS['fraction'])

IEB_STATS.insert(loc=0, column='type', value='per_ieb')
IEB_STATS.insert(loc=1, column='fraction/dist', value=ARGS['max_distance'])

EXON_STATS.to_csv(path_or_buf=sys.stdout, sep='\t', index=False)
BASE_STATS.to_csv(path_or_buf=sys.stdout, sep='\t', index=False)
pd.concat([EXON_STATS, BASE_STATS, IEB_STATS])\
.to_csv(path_or_buf=sys.stdout, sep='\t', index=False)
2 changes: 1 addition & 1 deletion exfi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
filters.
"""

__version__ = '1.5.3'
__version__ = '1.5.4'
12 changes: 12 additions & 0 deletions exfi/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,17 @@ def add_input_fraction(parser):
dest='fraction'
)

def add_max_distance(parser):
"""Add the maximum distance to IEB"""
parser.add_argument(
'--max-distance-to-ieb', '-b',
type=int,
help='Maximum distance to an IEB to consider as same',
default=10,
metavar='INT',
dest='max_distance'
)


def build_baited_bloom_filter_args():
"""Create the parser for build_baited_bloom_filter"""
Expand Down Expand Up @@ -436,5 +447,6 @@ def compare_to_gff_args():
add_input_fasta(parser)
add_gff3_type(parser)
add_input_fraction(parser)
add_max_distance(parser)

return parser
105 changes: 105 additions & 0 deletions exfi/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,20 @@
'precision': np.float, 'recall': np.float, 'f_1': np.float
}

DISTANCES_COLS = [
'pred_chrom', 'pred_chrom_start', 'pred_chrom_end',
'true_chrom', 'true_chrom_start', 'true_chrom_end',
'distance'
]

DISTANCES_DTYPES = {
'pred_chrom': object, 'pred_chrom_start': np.int, 'pred_chrom_end': np.int,
'true_chrom': object, 'true_chrom_start': np.int, 'true_chrom_end': np.int,
'distance': np.int
}



def bedtools_intersect(bed1_fn, bed2_fn, additional_flags=None):
"""Bedtools intersect wrapper
Expand Down Expand Up @@ -307,3 +321,94 @@ def compute_stats_per_base(classification):
.astype(STATS_DTYPES)

return stats



def get_starts(bed3):
"""Get the BED coordinates of the first nucleotide of each record."""
starts = bed3.copy()
starts['chrom_end'] = starts['chrom_start'] + 1
return starts.sort_values(by=['chrom', 'chrom_start'])



def get_ends(bed3):
"""Get the BED coordinates of the last nucleotide of each record."""
ends = bed3.copy()
ends['chrom_start'] = ends['chrom_end'] - 1
return ends.sort_values(by=['chrom', 'chrom_start'])



def get_distances(bed3_true, bed3_pred):
"""For every record in bed3_pred, get the closest record and distance to
bed3_true."""
bed3_true_fn = mkstemp()[1]
bed3_pred_fn = mkstemp()[1]
distances_fn = mkstemp()[1]

bed3_true.to_csv(bed3_true_fn, sep='\t', index=False, header=False)
bed3_pred.to_csv(bed3_pred_fn, sep='\t', index=False, header=False)

command = [
'bedtools', 'closest',
'-a', bed3_pred_fn, '-b', bed3_true_fn,
'-d', '-k', '1', '-t', 'first'
]

with open(distances_fn, 'w') as out:
process = Popen(command, stdout=out)
process.wait()

distances = pd.read_csv(
distances_fn,
sep='\t',
names=DISTANCES_COLS
)\
.astype(DISTANCES_DTYPES)

remove(bed3_true_fn)
remove(bed3_pred_fn)
remove(distances_fn)

return distances



def compute_stats_per_ieb(true_bed3, pred_bed3, max_distance=10):
"""Compute the classification stats per IEB"""

true_starts = get_starts(true_bed3)
true_ends = get_ends(true_bed3)

pred_starts = get_starts(pred_bed3)
pred_ends = get_ends(pred_bed3)

dist_start = get_distances(true_starts, pred_starts)
dist_end = get_distances(true_ends, pred_ends)

true_ieb = true_starts.shape[0] + true_ends.shape[0]
pred_ieb = pred_starts.shape[0] + pred_ends.shape[0]

tp_ieb = sum(
(dist_start.distance <= max_distance) & (dist_start.distance >= 0)
) + sum(
(dist_end.distance <= max_distance) & (dist_end.distance >= 0)
)

fp_ieb = pred_ieb - tp_ieb
fn_ieb = true_ieb - tp_ieb

stats = pd.DataFrame(
data=[[
true_ieb, pred_ieb,
tp_ieb, fp_ieb, fn_ieb,
compute_precision(tp_ieb, fp_ieb),
compute_recall(tp_ieb, fn_ieb),
compute_f_1(tp_ieb, fp_ieb, fn_ieb)
]],
columns=STATS_COLS
)\
.astype(STATS_DTYPES)

return stats
9 changes: 9 additions & 0 deletions tests/compare/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,12 @@
columns=STATS_COLS
)\
.astype(STATS_DTYPES)

STATS_PER_IEB = pd.DataFrame(
data=[[
30.0, 30.0, 29.0, 1.0, 1.0,
0.9666666666666667, 0.9666666666666667, 0.9666666666666667
]],
columns=STATS_COLS
)\
.astype(STATS_DTYPES)
16 changes: 14 additions & 2 deletions tests/test_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,16 @@
compute_true_positive_bases, \
compute_false_positive_bases, \
compute_false_negative_bases, \
compute_stats_per_base
compute_stats_per_base, \
compute_stats_per_ieb


from tests.compare import \
BED3_EMPTY_FN, BED3_TRUE_FN, BED3_PRED_FN, \
BED3_EMPTY, BED3_TRUE, BED3_PRED, \
TP_DF, FP_DF, FN_DF, \
CLASSIFICATION, \
STATS_PER_EXON, STATS_PER_BASE
STATS_PER_EXON, STATS_PER_BASE, STATS_PER_IEB



Expand Down Expand Up @@ -176,6 +177,17 @@ def test_complex(self):
self.assertTrue(observed.equals(STATS_PER_BASE))


class TestComputeStatsPerIEB(TestCase):
"""Tests for exfi.compare.compute_stats_per_ieb"""

def test_complex(self):
"""exfi.compare.compute_stats_per_ieb: some exons"""
observed = compute_stats_per_ieb(BED3_TRUE, BED3_PRED, 2)
print(observed.values.tolist())
print(STATS_PER_IEB.values.tolist())
self.assertTrue(observed.equals(STATS_PER_IEB))



if __name__ == '__main__':
main()

0 comments on commit 615c0e6

Please sign in to comment.