diff --git a/bin/compare_to_gff3 b/bin/compare_to_gff3 index 34f21e6..412d96c 100755 --- a/bin/compare_to_gff3 +++ b/bin/compare_to_gff3 @@ -7,6 +7,8 @@ a GFA1 prediction from exfi import logging import sys +import pandas as pd + from exfi.arguments import compare_to_gff_args from exfi.logger import set_up_logger from exfi.io.bed import BED3_COLS @@ -15,7 +17,8 @@ from exfi.io.gff3_to_bed import gff3_to_bed3 from exfi.io.read_bed import read_bed3 from exfi.io.fasta_to_dict import fasta_to_dict from exfi.compare import \ - classify, compute_stats_per_base, compute_stats_per_exon + classify, \ + compute_stats_per_base, compute_stats_per_exon, compute_stats_per_ieb if __name__ == '__main__': @@ -56,12 +59,19 @@ if __name__ == '__main__': EXON_STATS = compute_stats_per_exon(CLASSIFICATION) BASE_STATS = compute_stats_per_base(CLASSIFICATION) + IEB_STATS = compute_stats_per_ieb( + BED3_TRUE, BED3_PRED, + max_distance=ARGS['max_distance'] + ) EXON_STATS.insert(loc=0, column='type', value='per_exon') - EXON_STATS.insert(loc=1, column='fraction', value=ARGS['fraction']) + EXON_STATS.insert(loc=1, column='fraction/dist', value=ARGS['fraction']) BASE_STATS.insert(loc=0, column='type', value='per_base') - BASE_STATS.insert(loc=1, column='fraction', value=ARGS['fraction']) + BASE_STATS.insert(loc=1, column='fraction/dist', value=ARGS['fraction']) + + IEB_STATS.insert(loc=0, column='type', value='per_ieb') + IEB_STATS.insert(loc=1, column='fraction/dist', value=ARGS['max_distance']) - EXON_STATS.to_csv(path_or_buf=sys.stdout, sep='\t', index=False) - BASE_STATS.to_csv(path_or_buf=sys.stdout, sep='\t', index=False) + pd.concat([EXON_STATS, BASE_STATS, IEB_STATS])\ + .to_csv(path_or_buf=sys.stdout, sep='\t', index=False) diff --git a/exfi/__init__.py b/exfi/__init__.py index 35012d7..8025001 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.5.3' +__version__ = '1.5.4' diff --git a/exfi/arguments.py b/exfi/arguments.py index ce43028..0b9dd91 100644 --- a/exfi/arguments.py +++ b/exfi/arguments.py @@ -346,6 +346,17 @@ def add_input_fraction(parser): dest='fraction' ) +def add_max_distance(parser): + """Add the maximum distance to IEB""" + parser.add_argument( + '--max-distance-to-ieb', '-b', + type=int, + help='Maximum distance to an IEB to consider as same', + default=10, + metavar='INT', + dest='max_distance' + ) + def build_baited_bloom_filter_args(): """Create the parser for build_baited_bloom_filter""" @@ -436,5 +447,6 @@ def compare_to_gff_args(): add_input_fasta(parser) add_gff3_type(parser) add_input_fraction(parser) + add_max_distance(parser) return parser diff --git a/exfi/compare.py b/exfi/compare.py index 5e9f714..3a10977 100644 --- a/exfi/compare.py +++ b/exfi/compare.py @@ -35,6 +35,20 @@ 'precision': np.float, 'recall': np.float, 'f_1': np.float } +DISTANCES_COLS = [ + 'pred_chrom', 'pred_chrom_start', 'pred_chrom_end', + 'true_chrom', 'true_chrom_start', 'true_chrom_end', + 'distance' +] + +DISTANCES_DTYPES = { + 'pred_chrom': object, 'pred_chrom_start': np.int, 'pred_chrom_end': np.int, + 'true_chrom': object, 'true_chrom_start': np.int, 'true_chrom_end': np.int, + 'distance': np.int +} + + + def bedtools_intersect(bed1_fn, bed2_fn, additional_flags=None): """Bedtools intersect wrapper @@ -307,3 +321,94 @@ def compute_stats_per_base(classification): .astype(STATS_DTYPES) return stats + + + +def get_starts(bed3): + """Get the BED coordinates of the first nucleotide of each record.""" + starts = bed3.copy() + starts['chrom_end'] = starts['chrom_start'] + 1 + return starts.sort_values(by=['chrom', 'chrom_start']) + + + +def get_ends(bed3): + """Get the BED coordinates of the last nucleotide of each record.""" + ends = bed3.copy() + ends['chrom_start'] = ends['chrom_end'] - 1 + return ends.sort_values(by=['chrom', 'chrom_start']) + + + +def get_distances(bed3_true, bed3_pred): + """For every record in bed3_pred, get the closest record and distance to + bed3_true.""" + bed3_true_fn = mkstemp()[1] + bed3_pred_fn = mkstemp()[1] + distances_fn = mkstemp()[1] + + bed3_true.to_csv(bed3_true_fn, sep='\t', index=False, header=False) + bed3_pred.to_csv(bed3_pred_fn, sep='\t', index=False, header=False) + + command = [ + 'bedtools', 'closest', + '-a', bed3_pred_fn, '-b', bed3_true_fn, + '-d', '-k', '1', '-t', 'first' + ] + + with open(distances_fn, 'w') as out: + process = Popen(command, stdout=out) + process.wait() + + distances = pd.read_csv( + distances_fn, + sep='\t', + names=DISTANCES_COLS + )\ + .astype(DISTANCES_DTYPES) + + remove(bed3_true_fn) + remove(bed3_pred_fn) + remove(distances_fn) + + return distances + + + +def compute_stats_per_ieb(true_bed3, pred_bed3, max_distance=10): + """Compute the classification stats per IEB""" + + true_starts = get_starts(true_bed3) + true_ends = get_ends(true_bed3) + + pred_starts = get_starts(pred_bed3) + pred_ends = get_ends(pred_bed3) + + dist_start = get_distances(true_starts, pred_starts) + dist_end = get_distances(true_ends, pred_ends) + + true_ieb = true_starts.shape[0] + true_ends.shape[0] + pred_ieb = pred_starts.shape[0] + pred_ends.shape[0] + + tp_ieb = sum( + (dist_start.distance <= max_distance) & (dist_start.distance >= 0) + ) + sum( + (dist_end.distance <= max_distance) & (dist_end.distance >= 0) + ) + + fp_ieb = pred_ieb - tp_ieb + fn_ieb = true_ieb - tp_ieb + + stats = pd.DataFrame( + data=[[ + true_ieb, pred_ieb, + tp_ieb, fp_ieb, fn_ieb, + compute_precision(tp_ieb, fp_ieb), + compute_recall(tp_ieb, fn_ieb), + compute_f_1(tp_ieb, fp_ieb, fn_ieb) + ]], + columns=STATS_COLS + )\ + .astype(STATS_DTYPES) + + return stats diff --git a/tests/compare/__init__.py b/tests/compare/__init__.py index 440a57c..fb9b8c8 100644 --- a/tests/compare/__init__.py +++ b/tests/compare/__init__.py @@ -107,3 +107,12 @@ columns=STATS_COLS )\ .astype(STATS_DTYPES) + +STATS_PER_IEB = pd.DataFrame( + data=[[ + 30.0, 30.0, 29.0, 1.0, 1.0, + 0.9666666666666667, 0.9666666666666667, 0.9666666666666667 + ]], + columns=STATS_COLS +)\ +.astype(STATS_DTYPES) diff --git a/tests/test_compare.py b/tests/test_compare.py index 61be57a..9f2915c 100644 --- a/tests/test_compare.py +++ b/tests/test_compare.py @@ -14,7 +14,8 @@ compute_true_positive_bases, \ compute_false_positive_bases, \ compute_false_negative_bases, \ - compute_stats_per_base + compute_stats_per_base, \ + compute_stats_per_ieb from tests.compare import \ @@ -22,7 +23,7 @@ BED3_EMPTY, BED3_TRUE, BED3_PRED, \ TP_DF, FP_DF, FN_DF, \ CLASSIFICATION, \ - STATS_PER_EXON, STATS_PER_BASE + STATS_PER_EXON, STATS_PER_BASE, STATS_PER_IEB @@ -176,6 +177,17 @@ def test_complex(self): self.assertTrue(observed.equals(STATS_PER_BASE)) +class TestComputeStatsPerIEB(TestCase): + """Tests for exfi.compare.compute_stats_per_ieb""" + + def test_complex(self): + """exfi.compare.compute_stats_per_ieb: some exons""" + observed = compute_stats_per_ieb(BED3_TRUE, BED3_PRED, 2) + print(observed.values.tolist()) + print(STATS_PER_IEB.values.tolist()) + self.assertTrue(observed.equals(STATS_PER_IEB)) + + if __name__ == '__main__': main()