From 0f809e05527c004ea86a895481a67c92bd1ba08a Mon Sep 17 00:00:00 2001
From: David Feldman
Date: Thu, 28 Dec 2023 11:52:31 -0800
Subject: [PATCH] add

---
 .gitignore                      |   2 +
 README.md                       |  26 +-
 environment.yaml                |  19 +
 examples/.DS_Store              | Bin 0 -> 6148 bytes
 install.sh                      |   7 +
 pyproject.toml                  |  41 ++
 src/ngs_analysis/app.py         | 717 +++++++++++++++++++++++++
 src/ngs_analysis/constants.py   | 167 ++++++
 src/ngs_analysis/interactive.py |  23 +
 src/ngs_analysis/sequence.py    | 915 ++++++++++++++++++++++++++++++++
 src/ngs_analysis/timer.py       | 112 ++++
 11 files changed, 2028 insertions(+), 1 deletion(-)
 create mode 100644 environment.yaml
 create mode 100644 examples/.DS_Store
 create mode 100644 install.sh
 create mode 100644 pyproject.toml
 create mode 100644 src/ngs_analysis/app.py
 create mode 100644 src/ngs_analysis/constants.py
 create mode 100644 src/ngs_analysis/interactive.py
 create mode 100644 src/ngs_analysis/sequence.py
 create mode 100644 src/ngs_analysis/timer.py

diff --git a/.gitignore b/.gitignore
index 68bc17f..b9180f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+examples/nanopore
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/README.md b/README.md
index 760727b..3e96fbf 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,25 @@
-# ngs-analysis
\ No newline at end of file
+# ngs-analysis
+
+A package for analyzing sequencing reads that span multiple DNA or protein parts. For instance, given a library of protein variants linked to DNA barcodes, it can answer questions like:
+
+- How accurate are the variant sequences, at the DNA or protein level?
+- How frequently is the same barcode linked to two different variants?
+- Which reads contain the parts required for function (e.g., a Kozak start sequence or a fused protein tag)?
+
+This kind of analysis often involves parsing raw sequencing reads for DNA and/or protein sub-sequences (parts), then mapping the parts to a reference of anticipated part combinations. This package offers a simple workflow (see Usage below):
+
+1. Define how to parse reads into parts using plain text expressions (no code)
+2. Test the parser on simulated DNA sequences (e.g., your vector map)
+3. Parse a batch of sequencing samples
+4. Map the (combination of) parts found in each read to your reference
+
+It has been tested with Illumina paired-end reads and Oxford Nanopore long reads. Under the hood it uses [NGmerge](https://github.com/jsh58/NGmerge) to merge paired reads and [MMseqs2](https://github.com/soedinglab/MMseqs2) for sequence mapping. It is moderately performant: 1 million paired-end reads can be mapped to a reference of 100,000 variant-barcode pairs in ~1 minute.
+
+# Installation
+
+```bash
+pip install ngs-analysis
+```
+
+Tested on Linux and macOS (Apple Silicon).
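+
+# Usage
+
+A minimal sketch of the workflow above. It assumes `config.yaml`, `samples.csv`, and `possible_dna.csv` already exist in the working directory, and the sample name `sample_1` is illustrative (it must match a row in `samples.csv`). The same steps are available as python-fire subcommands when `app.py` is run as a script.
+
+```python
+from ngs_analysis import app
+
+app.setup()                       # create working directories, check inputs
+app.dna_to_designs()              # parse possible_dna.csv into designs.csv
+app.merge_read_pairs('sample_1')  # merge paired-end fastqs with NGmerge
+app.parse_reads('sample_1')       # extract DNA/protein parts from each read
+app.map_parsed_reads('sample_1')  # match parts against designs.csv (MMseqs2 + Levenshtein)
+```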
+ diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..102a76d --- /dev/null +++ b/environment.yaml @@ -0,0 +1,19 @@ +name: ngs-analysis +channels: + - conda-forge + - bioconda +dependencies: + - fire + - glob2 + - ipykernel + - natsort + - pandas + - pandera + - parse + - pytables + - python-levenshtein + - python-slugify + - pyyaml + - regex + - tqdm + - pyarrow diff --git a/examples/.DS_Store b/examples/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d0cf13a6e61dbfbe35ec1f25a40b4018f124e175 GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8-BN@c6nYGJEm&(Uh?fxS3mDOZN=-=6V9b^zwTDv3SzpK}@p+ut z-GId$Jc-yDnEhsVW_B|lWPccA+?|Jqj9H8^0gA}cs1Y2lbu~;dB3E7 zndmPL;kOHxG9Og3FW(;|G63IuFiYaR7z{pmquJWpZi}|)iaYO7F1*4o=DF)&45`M5YhmtMaONLSldz zAO?tm&0)Zt0iwG(B~zuu05R}G25^6{K@lB;rAD=NK!ew3^tTXEz{a-(qHWMISZV|h z2-m5AI+dFz2G{8@ZkssAV5w23Gp<&KdCbby=61.0"] +build-backend = "setuptools.build_meta" + +[project.urls] +"Homepage" = "https://github.com/feldman4/plasmid-design" +"Bug Tracker" = "https://github.com/feldman4/plasmid-design/issues" + +[project.scripts] +ngs_analysis = "ngs_analysis.app:main" + diff --git a/src/ngs_analysis/app.py b/src/ngs_analysis/app.py new file mode 100644 index 0000000..97114ae --- /dev/null +++ b/src/ngs_analysis/app.py @@ -0,0 +1,717 @@ +"""Analyze sequencing reads and compare DNA and protein parts to a reference. + +Pattern-matching is used to define parts (e.g., designs, +variable linkers, fixed sequences, and barcodes). Identified parts can be +mapped to inferred reference sequences (e.g., design inferred +from barcode). See README for details. +""" +import os +import re +import subprocess +import tempfile +from glob import glob +from subprocess import DEVNULL +from typing import Optional, Tuple + +import fire +import Levenshtein +import numpy as np +import pandas as pd +import pandera as pa +import yaml +from pandera.typing import Series + +# import quickdna +from .sequence import (read_fasta, read_fastq, reverse_complement, + translate_dna, write_fake_fastq, write_fasta) +from .timer import Timer + +# user-provided +config_yaml = 'config.yaml' # how to parse and compare +sample_table = 'samples.csv' # NGS samples +possible_dna_table = 'possible_dna.csv' # designed insert DNA +planned_dna_table = 'planned_dna.csv' # cloning plan + +working_directories = '0_paired_reads', '1_reads', '2_parsed', '3_mapped', '3_mapped/map' + +# generated or provided +design_table = 'designs.csv' # designed DNA or protein parts + +ngmerge = 'NGmerge' +bwa = 'bwa' +mmseqs = 'mmseqs' + +MMSEQS_KMER_AA = 6 +MMSEQS_KMER_DNA = 12 + +### TYPES +# TODO: joint uniqueness +class PossibleDNA(pa.DataFrameModel): + subpool: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) + name: Series[str] = pa.Field(nullable=False, unique=True) + vector_dna: Series[str] = pa.Field(nullable=False, unique=True) + + +class PlannedDNA(pa.DataFrameModel): + sample: Series[str] = pa.Field(nullable=False, coerce=True) + vector_dna: Series[str] = pa.Field(nullable=False) + + +class Samples(pa.DataFrameModel): + sample: Series[str] = pa.Field(nullable=False, unique=True, + coerce=True) + fastq_name: Series[str] = pa.Field(nullable=False, unique=True, + coerce=True) + + +# build this by using config.yaml to parse PossibleDNA +class Designs(pa.DataFrameModel): + subpool: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) + name: Series[str] = pa.Field(nullable=False, coerce=True) + + +class Candidates(pa.DataFrameModel): + read_index: Series[int] = 
pa.Field(nullable=False) + query: Series[str] = pa.Field(nullable=False) + reference: Series[str] = pa.Field(nullable=False) + reference_name: Series[str] = pa.Field(nullable=False) + + +### MAIN STEPS +def setup(): + print('Creating directories...') + [os.makedirs(f'{d}/simulate', exist_ok=True) for d in working_directories] + + print('Checking config...') + # TODO: actual type check for the config + load_config() + + n = len(load_possible_dna()) + print(f'Found {n} reference sequences in {possible_dna_table}') + + # TODO: check that the input files exist + n = len(load_samples()) + print(f'Found {n} samples in {sample_table}') + + + +# ok +def dna_to_designs(): + """Generate a design table by parsing the possible DNA sequences. + """ + df_possible = load_possible_dna() + config = load_config() + parsed = (parse_sequences(config, df_possible['vector_dna']) + .drop('read_index', axis=1)) + m, n = len(parsed), len(df_possible) + if m != n: + raise ValueError(f'failed to parse {n-m}/{n} sequences') + + print(f'Parsed {len(parsed):,} sequences from {possible_dna_table}') + describe_parsed(parsed) + + pd.concat([ + df_possible.drop('vector_dna', axis=1), + parsed, + ], axis=1).to_csv(design_table, index=None) + print(f'Wrote {n:,} rows to {design_table}') + +# ok +def simulate_fastqs(read_lengths: Tuple[int, int]=(300, 300)): + """Create simulated fastq files based on planned samples. + + :param read_lengths: forward and reverse read lengths + """ + df_planned = load_planned_dna() + os.makedirs('1_reads/simulate', exist_ok=True) + + for sample, df in df_planned.groupby('sample', sort=False): + r1 = df['vector_dna'].str[:read_lengths[0]] + r2 = df['vector_dna'].apply(reverse_complement).str[:read_lengths[1]] + + write_fake_fastq(f'1_reads/simulate/{sample}_R1.fastq', r1) + write_fake_fastq(f'1_reads/simulate/{sample}_R2.fastq', r2) + print(f'Wrote {len(r1):,} paired reads to ' + f'1_reads/simulate/{sample}_R[12].fastq') + +# ok +def merge_read_pairs(sample, simulate=False): + """Use NGmerge to merge read pairs. + + :param sample: sample name + :param simulate: if True, use ./*/simulate/ subdirectories + """ + filenames = get_filenames(sample, simulate=simulate) + + command = (ngmerge, + '-1', filenames['paired_r1'], '-2', filenames['paired_r2'], + '-o', filenames['reads'], + '-z', # always gzip output + ) + with Timer(verbose=f'Running ngmerge on sample {sample}...'): + subprocess.run(command, check=True) + print(f'Wrote output to {filenames["reads"]}') + +# ok +def parse_reads(sample, simulate=False): + config = load_config() + filenames = get_filenames(sample, simulate) + with Timer('', verbose='Loading reads...'): + reads = read_fastq(filenames['reads']) + with Timer('', verbose=f'Parsing {len(reads):,} reads...'): + os.makedirs(os.path.dirname(filenames['parsed']), exist_ok=True) + parse_sequences(config, reads).to_parquet(filenames['parsed']) + +# ok +def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10): + """For each read, find nearest match for the requested fields. + + Candidate matches from the design table are found via mmseqs search, + then the nearest match is determined by Levenshtein distance. For + inferred fields, the match is set to the inferred value and the + distance to the design table entry accordingly. For barcodes, only + exact matches are considered. 
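+
+    Compared fields are taken from the `compare` section of config.yaml; an
+    entry such as "barcode -> variant_aa" (names illustrative) means that
+    variant_aa is matched via the barcode hit. Each directly matched field adds
+    `<field>_match`, `<field>_distance`, and `<field>_match_equidistant`
+    columns; barcodes add only `<field>_match`, and inferred fields add
+    `<b>_from_<a>_match` and `<b>_from_<a>_distance`.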
+ + :param sample: sample name + :param simulate: if True, use ./*/simulate/ subdirectories + """ + filenames = get_filenames(sample, simulate) + + # prepare mmseqs db + mmseqs_make_design_db() + + match_to_ngs, secondary = get_comparison_fields() + arr = [] + for field in match_to_ngs: + if field == 'barcode': + # for barcode, only permit exact matches + with Timer('', verbose=f'Matching {field}...'): + arr += [map_barcode(sample, simulate, field)] + else: + with Timer('', verbose=f'Searching for {field} candidates...'): + # fastmap to get candidates + df_mapped = mmseqs_prefilter(sample, simulate, field, + mmseqs_max_seqs=mmseqs_max_seqs) + # calculate edit distances for candidates + msg = f'Calculating distances for {len(df_mapped)} {field} candidates...' + with Timer('', verbose=msg): + df_match = match_mapped(df_mapped, field) + arr += [df_match] + + df_match = arr[0] + for df in arr[1:]: + df_match = df_match.merge(df, on='read_index') + + if secondary: + df_designs = load_designs() + arr = [] + for field_a, field_b in secondary: + field = f'{field_b}_from_{field_a}' + if '_aa' in field_b: + df_designs[field_b] = [quick_translate(x) for x in df_designs[field_b[:-3]]] + name_to_field = df_designs.set_index('name')[field_b].to_dict() + # define reference for field_b using field_a + reference = (df_match + .rename(columns={f'{field_a}_match': 'reference_name'}) + .assign(reference=lambda x: x['reference_name'].map(name_to_field)) + .dropna() + ) + # calculate distance between the parsed value for field_b and + # the reference (where both exist) + df = (load_seqs_to_map(sample, simulate, field_b) + .merge(reference, on='read_index') + .pipe(Candidates) + .pipe(match_mapped, field) + .drop(f'{field}_match_equidistant', axis=1) + ) + df_match = df_match.merge(df, on='read_index') + + df_match.to_csv(filenames['mapped'], index=None) + print(f'Wrote match table ({len(df_match):,} rows) to {filenames["mapped"]}') + + +### LOAD + + +def load_config(): + with open(config_yaml, 'r') as fh: + return yaml.safe_load(fh) + + +def load_possible_dna() -> PossibleDNA: + return pd.read_csv(possible_dna_table).pipe(PossibleDNA) + + +def load_planned_dna() -> PlannedDNA: + return pd.read_csv(planned_dna_table).pipe(PlannedDNA) + + +def load_samples() -> Samples: + return pd.read_csv(sample_table).pipe(Samples) + + +def load_designs() -> Designs: + """Add translations that are needed for comparison but not already present. 
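+
+    For example, if the comparison config asks for `variant_aa` but the design
+    table only has a `variant` DNA column (names illustrative), a `variant_aa`
+    column is added here by translating `variant`.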
+ """ + df_designs = pd.read_csv(design_table) + fields, _ = get_comparison_fields() + for field in fields: + if field.endswith('_aa') and field[:-3] in df_designs: + if field in df_designs: + raise ValueError(f'Cannot provide both DNA and amino acid for {field}') + df_designs[field] = [quick_translate(x) for x in df_designs[field[:-3]]] + elif field not in df_designs: + raise ValueError(f'Cannot derive {field} from design table') + return df_designs.pipe(Designs) + + +def load_seqs_to_map(sample, simulate, field): + filenames = get_filenames(sample, simulate, field) + + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + + seqs_to_map = (pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + .rename(columns={field_no_aa: 'query'}) + .dropna() + ) + if aa: + seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] + + return seqs_to_map + + +### UTILITIES + + +def describe_parsed(df_parsed): + df_parsed = df_parsed.copy() + df_parsed[df_parsed == ''] = np.nan + dna_length = df_parsed.select_dtypes('object').map(len, na_action='ignore') + aa_length = dna_length / 3 + + null_summary = pd.concat([ + (df_parsed.isnull() * 1).sum().rename('null entries'), + (df_parsed.isnull() * 1).mean().rename('null fraction'), + ], axis=1) + stats = ['count', 'min', 'median', 'max'] + describe = lambda x: x.describe().rename({'50%': 'median'}).loc[stats].astype(int) + + to_string = lambda x: ' ' + x.to_string().replace('\n', '\n ') + print('Null entries:') + print(null_summary.pipe(to_string)) + print('DNA length:') + print(dna_length.pipe(describe).pipe(to_string)) + print('Protein length:') + print(aa_length.pipe(describe).pipe(to_string)) + + +def quick_translate(seq): + # return str(quickdna.DnaSequence(seq).translate()) + n = len(seq) - len(seq) % 3 + return translate_dna(seq[:n]) + + +def format_string_to_capture_regex(x, optional=[], **kwargs): + formatted_string = re.sub(r'{(.*?)}', r'(?P<\1>.*)', x) + for key, value in kwargs.items(): + tail = '?' if key in optional else '' + formatted_string = formatted_string.replace( + f'(?P<{key}>.*)', f'(?P<{key}>{value}){tail}') + return formatted_string + + +def get_comparison_fields(): + """Parse fields for direct matching and field pairs for inferred matching. + """ + direct_match, inferred_match = set(), set() + config = load_config() + for c in config['compare']: + if '->' in c: + a, b = [x.strip() for x in c.split('->')] + direct_match |= {a} + inferred_match |= {(a, b)} + else: + direct_match |= {c.strip()} + + return direct_match, inferred_match + + +def parse_sequences(config, sequences): + """Match DNA sequence based on pattern config. + First, the sequence between left_adapter and right_adapter is extracted. + Then, any DNA parts are matched. + Then, a protein is translated starting at the position matched by the pattern + in config.protein.start_at using `translate_to_stop`. + Finally, the protein is matched using the template string protein.pattern, + and named matches are added to the result dictionary. Only DNA is stored here. 
+ """ + re_insert = re.compile('{left_adapter}(.*){right_adapter}'.format(**config)) + import os + os.wtf = re_insert + capture = config['protein'].get('capture', {}) + optional = config['protein'].get('optional', []) + re_proteins = [re.compile(format_string_to_capture_regex( + x, optional, **capture)) for x in config['protein']['patterns']] + + arr = [] + for i, dna in enumerate(sequences): + dna = dna.upper() + entry = {} + try: + insert = re_insert.findall(dna)[0] + except IndexError: + continue + entry['read_index'] = i + entry['read_length'] = len(dna) + entry['insert_length'] = len(insert) + for name, pat in config['dna_parts'].items(): + m = re.findall(pat, insert) + if m: + entry[name] = m[0] + + protein_start = re.search(config['protein']['start_at'], insert).start() + protein = quick_translate(insert[protein_start:]) + protein = protein.split('*')[0] + entry['cds'] = insert[protein_start:][:len(protein) * 3] + + for re_protein in re_proteins: + match = re_protein.match(protein) + if match: + for i, name in enumerate(match.groupdict()): + name = name.rstrip('_') # collapse redundant names + start, end = match.span(i + 1) + entry[name] = entry['cds'][start*3:end*3] + arr += [entry] + # assert False + break + if len(arr) == 0: + raise Exception('Nothing found') + return pd.DataFrame(arr) + + +def map_barcode(sample, simulate, field): + """Only exact matches, distance and equidistant not returned. + """ + filenames = get_filenames(sample, simulate, field) + + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + seqs_to_map = pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + if aa: + seqs_to_map[field] = [quick_translate(x) for x in seqs_to_map[field_no_aa]] + seqs_to_map = seqs_to_map[['read_index', field]] + ref_seqs = (read_fasta(filenames['map mmseqs target fa'], as_df=True) + .rename(columns={'name': 'match', 'seq': 'reference'}) + ) + + match = f'{field}_match' + + return (seqs_to_map + .merge(ref_seqs, left_on=field, right_on='reference', how='left') + .rename(columns={'match': match}) + [['read_index', match]] + ) + + +def match_mapped(df_mapped: Candidates, field): + """Filter mapping table for a specific field. + + What happens if inference leads to multiple sequences? + """ + match = f'{field}_match' + distance = f'{field}_distance' + equidistant = f'{field}_match_equidistant' + + df_mapped = df_mapped.copy() + df_mapped[distance] = [Levenshtein.distance(a,b) for a,b in df_mapped[['query', 'reference']].values] + # keep only the minimum values + min_distance = df_mapped.groupby('read_index')[distance].transform('min') + df_mapped = df_mapped.loc[lambda x: x[distance] == min_distance] + + A = df_mapped.groupby('read_index').size().rename(equidistant).reset_index() + B = (df_mapped.sort_values(distance).drop_duplicates('read_index') + .rename(columns={'reference_name': match})[['read_index', match, distance]]) + return A.merge(B)[['read_index', match, distance, equidistant]] + + +### FILES + + +def get_filenames(sample, simulate=False, field=None): + """Get filenames used throughout pipeline for this sample. + + Fastq files ending with .fastq, .fq., .fastq.gz, or .fq.gz are identified + based on the sample table. In simulate mode, the fastq name is identical + to the sample name. 
+ """ + if simulate: + search = f'1_reads/simulate/{sample}_R1' + else: + sample_to_fastq = load_samples().set_index('sample')['fastq_name'].to_dict() + fastq_search = sample_to_fastq[sample] + search = f'1_reads/*{fastq_search}*' + + extensions = '.fastq', '.fq', '.fq.gz', '.fastq.gz' + def search_extensions(search): + return sum([glob(search + x) for x in extensions], []) + + r = search_extensions(search) + r1 = search_extensions('paired_' + search + '_R1') + r2 = search_extensions('paired_' + search + '_R2') + paired_reads_found = len(r1) == 1 & len(r2) == 1 + + if len(r1) > 1 or len(r2) > 1: + raise ValueError(f'Too many paired read files for sample {sample}, found: {r1 + r2}') + + if len(r) > 1: + raise ValueError(f'Too many read files for sample {sample}, found: {r}') + + if not (paired_reads_found or len(r) == 1): + raise ValueError(f'Neither reads nor paired reads provided for sample {sample}, searched with: {search}') + + add_simulate = 'simulate/' if simulate else '' + filenames = { + 'reads': f'1_reads/{add_simulate}{sample}.fastq.gz', + 'parsed': f'2_parsed/{add_simulate}{sample}.parsed.pq', + 'mapped': f'3_mapped/{add_simulate}{sample}.mapped.csv', + } + if paired_reads_found: + filenames |= {'paired_r1': r1[0], 'paired_r2': r2[0]} + + if field: + filenames['map query'] = f'3_mapped/map/{add_simulate}{sample}_{field}.fa' + filenames['map mmseqs target db'] = f'3_mapped/map/{field}.target.db' + filenames['map mmseqs target fa'] = f'3_mapped/map/{field}.fa' + filenames['map mmseqs query db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.query.db' + filenames['map mmseqs result db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.result.db' + filenames['map mmseqs prefilter db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.prefilter.db' + filenames['map mmseqs prefilter tsv'] = f'3_mapped/map/{add_simulate}{sample}_{field}.prefilter.tsv' + filenames['map mmseqs result m8'] = f'3_mapped/map/{add_simulate}{sample}_{field}.result.m8' + filenames['map result'] = f'3_mapped/map/{add_simulate}{sample}_{field}.csv' + + return filenames + + + +### MMSEQS + + +def mmseqs_make_design_db(): + df_designs = load_designs() + fields, _ = get_comparison_fields() + with Timer('', verbose=f'Creating mmseqs index for fields: {", ".join(fields)}'): + for field in fields: + aa = field.endswith('_aa') + f = f'3_mapped/map/{field}.fa' # should live in get_filenames + seqs_to_index = df_designs[['name', field]] + write_fasta(f, seqs_to_index.drop_duplicates(field)) + if field != 'barcode': + make_mmseqs_db(f, aa, is_query=False) + + +def make_mmseqs_db(fasta_file, dbtype_is_amino_acid, is_query): + """ + """ + db = fasta_file.replace('.fa', '.query.db' if is_query else '.target.db') + command = (mmseqs, 'createdb', fasta_file, db, + '--dbtype', '1' if dbtype_is_amino_acid else '2' + ) + result = subprocess.run(command, stdout=DEVNULL, stderr=DEVNULL) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs createdb') + + +def mmseqs_prefilter(sample, simulate, field, mmseqs_max_seqs=10, verbose=False): + filenames = get_filenames(sample, simulate, field) + + # make query database + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + seqs_to_map = (pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + .dropna() # remove reads that failed parsing + .rename(columns={field_no_aa: 'query'}) + ) + if aa: + seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] + kmer = 6 + else: + kmer = 12 + + write_fasta(filenames['map 
query'], seqs_to_map.dropna()) + make_mmseqs_db(filenames['map query'], aa, is_query=True) + + # search + [os.remove(x) for x in glob(filenames['map mmseqs prefilter db'] + '*')] + with tempfile.TemporaryDirectory() as tmp: + command = ('mmseqs', 'prefilter', + filenames['map mmseqs query db'], # query + filenames['map mmseqs target db'], # target + filenames['map mmseqs prefilter db'], + '-k', str(kmer), + '--max-seqs', str(mmseqs_max_seqs), + # without these flags, mmseqs will sometimes throw out exact matches + '--comp-bias-corr', '0', + '--mask', '0', + # '--exact-kmer-matching', '1', # prefilter only retains exact kmer matches + ) + if verbose: + print(' '.join(command)) + result = subprocess.run(command, stderr=DEVNULL, stdout=DEVNULL) + # result = subprocess.run(command) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs prefilter') + + # convert to tsv + command = ('mmseqs', 'createtsv', + filenames['map mmseqs query db'], # query + filenames['map mmseqs target db'], # target + filenames['map mmseqs prefilter db'], + filenames['map mmseqs prefilter tsv'], + ) + if verbose: + print(' '.join(command)) + result = subprocess.run(command, stderr=DEVNULL, stdout=DEVNULL) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs prefilter') + + ref_seqs = (read_fasta(filenames['map mmseqs target fa'], as_df=True) + .rename(columns={'name': 'reference_name', 'seq': 'reference'})) + + df_mmseqs = (read_mmseqs_prefilter_tsv(filenames['map mmseqs prefilter tsv']) + .rename(columns={'query': 'read_index', 'target': 'reference_name'}) + [['read_index', 'reference_name']] + ) + return df_mmseqs.merge(seqs_to_map).merge(ref_seqs).pipe(Candidates) + + +def read_mmseqs_prefilter_tsv(filename): + return pd.read_csv(filename, sep='\t', names=('query', 'target', 'score', 'diagonal')) + + +def read_m8(filename): + columns = ('query', 'target', 'sequence_identity', + 'alignment_length', 'num_mismatches', 'num_gaps', + 'query_start', 'query_end', 'target_start', + 'target_end', 'e_value', 'bit_score') + return pd.read_csv(filename, sep=r'\s+', header=None, names=columns) + + +def mmseqs_search(sample, field, simulate=False, min_seq_id=0.8) -> Candidates: + """Not in use. + + Use mmseqs search to map `field` in parsed reads to the design table. 
+ + :param sample: sample name + :param field: a field name, if it ends in "_aa" the translation will be mapped + :param simulate: if True, use ./*/simulate/ subdirectories + :param min_seq_id: mmseqs2 prefilter parameter + """ + filenames = get_filenames(sample, simulate, field) + + # make query database + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + seqs_to_map = (pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + .dropna() # remove reads that failed parsing + .rename(columns={field_no_aa: 'query'}) + ) + if aa: + seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] + kmer = MMSEQS_KMER_AA + else: + kmer = MMSEQS_KMER_DNA + write_fasta(filenames['map query'], seqs_to_map.dropna()) + make_mmseqs_db(filenames['map query'], aa, is_query=True) + + # search + [os.remove(x) for x in glob(filenames['map mmseqs result db'] + '*')] + with tempfile.TemporaryDirectory() as tmp: + command = ('mmseqs', 'search', + filenames['map mmseqs query db'], # query + filenames['map mmseqs target db'], # target + filenames['map mmseqs result db'], + tmp, + '--min-seq-id', str(min_seq_id), + '-k', str(kmer), # memory usage is 21**k * 8 bytes (protein) or 5**k * 8 bytes (DNA) + '-s', '1', # sensitivity, 1 is fastest, 7.5 is most sensitive + '-c', '0.9', # minimum alignment coverage on query/target + '--exact-kmer-matching', '1', # prefilter only retains exact kmer matches + # split query database based on available memory + # '--split 0 --split-mode 1', + ) + if not aa: + command += ('--search-type', '3') + print(' '.join(command)) + # result = subprocess.run(command, stderr=DEVNULL, stdout=DEVNULL) + result = subprocess.run(command) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs search') + + # convert results + command = ('mmseqs', 'convertalis', + filenames['map mmseqs query db'], # query + filenames['map mmseqs target db'], # target + filenames['map mmseqs result db'], # result + filenames['map mmseqs result m8'], # result + ) + # print(' '.join(command)) + result = subprocess.run(command, stderr=DEVNULL, stdout=DEVNULL) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs convertalis') + + ref_seqs = read_fasta(filenames['map mmseqs target fa'], as_df=True).rename(columns={'name': 'reference_name', 'seq': 'reference'}) + + df_mmseqs = (read_m8(filenames['map mmseqs result m8']) + .rename(columns={'query': 'read_index', 'target': 'reference_name'}) + [['read_index', 'reference_name']].drop_duplicates() + ) + return df_mmseqs.merge(seqs_to_map).merge(ref_seqs).pipe(Candidates) + + +def load_fastmap(filename): + """A bit slow, 10% of fastmap runtime + """ + txt = open(filename).read() + queries = txt.strip().split('//') + arr = [] + for q in queries[:-1]: + if not q: + continue + lines = q.strip().split('\n') + + name = lines[0].split()[1] + for x in set([l.split()[-1].split(':')[0] for l in lines[1:]]): + arr += [{'query': name, 'match': x}] + return pd.DataFrame(arr) + + +if __name__ == '__main__': + + # order is preserved + commands = [ + 'setup', + 'dna_to_designs', + 'simulate_fastqs', + 'merge_read_pairs', + 'parse_reads', + 'map_parsed_reads', + ] + # if the command name is different from the function name + named = { + # 'search': search_app, + } + + final = {} + for k in commands: + try: + final[k] = named[k] + except KeyError: + final[k] = eval(k) + + try: + fire.Fire(final) + except BrokenPipeError: + pass + + diff --git a/src/ngs_analysis/constants.py 
b/src/ngs_analysis/constants.py new file mode 100644 index 0000000..26fc605 --- /dev/null +++ b/src/ngs_analysis/constants.py @@ -0,0 +1,167 @@ +from collections import defaultdict +import os +from pathlib import Path +import string + +try: + HOME = Path(os.environ['HOME']) +except KeyError: + HOME = Path('/home/dfeldman') + +JOBLIB_CACHE = HOME / '.joblib' + +RESOURCES = Path(__file__).parents[0] / 'resources' +RULE_SETS = RESOURCES / 'rule_sets.csv' +VISITOR_FONT_PATH = RESOURCES / 'visitor1.ttf' + +BQ_PROJECT_ID = 'bilf-350112' + +GO_TERM = 'GO_term' +GO = 'GO ID' +GO_SYNONYM = 'DB Object Synonym (|Synonym)' +GO_SYMBOL = 'DB Object Symbol' +GO_TERM_COUNTS = 'GO_term_counts' +SEARCH_KEYWORD = 'keyword' +GO_SYNONYM = 'DB Object Synonym (|Synonym)' + +GENE_ID = 'gene_id' +GENE_SYMBOL = 'gene_symbol' + +HGNC = 'HGNC' +UNIPROTKB = 'UniProtKB' +ENSG = 'ENSG' +GENE_ALIAS = 'gene_alias' +RCSB = 'RCSB' + +biomart_columns = {'Gene stable ID': ENSG, + 'HGNC ID': HGNC, + 'NCBI gene ID': GENE_ID, + 'NCBI gene (formerly Entrezgene) ID': GENE_ID, + 'HGNC symbol': GENE_SYMBOL, + 'UniProtKB Gene Name ID': UNIPROTKB, + } + +PTHR = 'PTHR' +PTHR_SF = 'PTHR_SF' +PTHR_FAMILY = 'PTHR_family' +PTHR_SUBFAMILY = 'PTHR_subfamily' +PTHR_CLASS_LIST = 'PC_list' + +pthr_columns = { + 0: 'identifier', + 2: PTHR_SF, + 3: PTHR_FAMILY, + 4: PTHR_SUBFAMILY, + 8: PTHR_CLASS_LIST, +} + +MZ_DOUBLE_SPACING = 0.5001917279701898 + +AA_3 = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', + 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL'] +AA_1 = list('ARNDCQEGHILKMFPSTWYV') +CANONICAL_AA = AA_1 +AA_3_1 = dict(zip(AA_3, AA_1)) +AA_1_3 = dict(zip(AA_1, AA_3)) + +skyline_columns = { + 'Replicate': 'sample', + 'Replicate Name': 'sample', + 'Protein': 'short_name', + 'Peptide': 'sequence', + 'Peptide Retention Time': 'RTime', + 'Normalized Area': 'peak_area', + 'Total Area MS1': 'ms1_area', + 'Best Retention Time': 'RTime', + 'Min Start Time': 'RTime_start', + 'Max End Time': 'RTime_end', + 'Average Mass Error PPM': 'mass_error_ppm', + 'Isotope Dot Product': 'idotp', + } + + +PT02_BACKBONE_START = 'ATTCTCCTTGGAATTTGCCCTTTTTGAGTTTGGATCTTGGTTCAT' +iON_BACKBONE_START = 'GACATTGATTATTGACTAGTTATTAATAGTAATCAAT' +pET_BACKBONE_START = 'TAATACGACTCACTATAGGGGAATTGTGAGCGGATAACAATTCC' + +CODONS = { + 'TAA': '*', + 'TAG': '*', + 'TGA': '*', + 'GCA': 'A', + 'GCC': 'A', + 'GCG': 'A', + 'GCT': 'A', + 'TGC': 'C', + 'TGT': 'C', + 'GAC': 'D', + 'GAT': 'D', + 'GAA': 'E', + 'GAG': 'E', + 'TTC': 'F', + 'TTT': 'F', + 'GGA': 'G', + 'GGC': 'G', + 'GGG': 'G', + 'GGT': 'G', + 'CAC': 'H', + 'CAT': 'H', + 'ATA': 'I', + 'ATC': 'I', + 'ATT': 'I', + 'AAA': 'K', + 'AAG': 'K', + 'CTA': 'L', + 'CTC': 'L', + 'CTG': 'L', + 'CTT': 'L', + 'TTA': 'L', + 'TTG': 'L', + 'ATG': 'M', + 'AAC': 'N', + 'AAT': 'N', + 'CCA': 'P', + 'CCC': 'P', + 'CCG': 'P', + 'CCT': 'P', + 'CAA': 'Q', + 'CAG': 'Q', + 'AGA': 'R', + 'AGG': 'R', + 'CGA': 'R', + 'CGC': 'R', + 'CGG': 'R', + 'CGT': 'R', + 'AGC': 'S', + 'AGT': 'S', + 'TCA': 'S', + 'TCC': 'S', + 'TCG': 'S', + 'TCT': 'S', + 'ACA': 'T', + 'ACC': 'T', + 'ACG': 'T', + 'ACT': 'T', + 'GTA': 'V', + 'GTC': 'V', + 'GTG': 'V', + 'GTT': 'V', + 'TGG': 'W', + 'TAC': 'Y', + 'TAT': 'Y', +} + +CODONS_REVERSE = defaultdict(list) +[CODONS_REVERSE[v].append(k) for k, v in CODONS.items()] +CODONS_REVERSE = {k: '|'.join(v) for k, v in CODONS_REVERSE.items()} + +CHAIN_ALPHABET = string.ascii_uppercase + string.ascii_lowercase + +SLUGIFY_REPLACEMENTS = [ + ('<', 'lt'), + ('<=', 'lte'), + ('>', 'gt'), + ('>=', 'gte'), + 
('&', 'and'), + ('|', 'or'), +] \ No newline at end of file diff --git a/src/ngs_analysis/interactive.py b/src/ngs_analysis/interactive.py new file mode 100644 index 0000000..b3d4f5c --- /dev/null +++ b/src/ngs_analysis/interactive.py @@ -0,0 +1,23 @@ +"""Common imports for interactive work +""" +from collections import Counter +from glob2 import glob +import logging +import os +import re +import sys + +from natsort import natsorted, natsort_keygen +import numpy as np +import pandas as pd + +import warnings +from math import ceil +from functools import partial +from pandas import IndexSlice as pdx +from tqdm.auto import tqdm +import IPython +from IPython.display import display, Image + +IPython.get_ipython().run_line_magic('load_ext', 'autoreload') +IPython.get_ipython().run_line_magic('autoreload', '2') diff --git a/src/ngs_analysis/sequence.py b/src/ngs_analysis/sequence.py new file mode 100644 index 0000000..2fc85f0 --- /dev/null +++ b/src/ngs_analysis/sequence.py @@ -0,0 +1,915 @@ +import gzip +import os +import re +from collections import defaultdict +from glob import glob + +import numpy as np +import pandas as pd +from natsort import natsorted + +from .constants import RESOURCES, CODONS, CODONS_REVERSE + + +watson_crick = {'A': 'T', + 'T': 'A', + 'C': 'G', + 'G': 'C', + 'U': 'A', + 'N': 'N'} + +watson_crick.update({k.lower(): v.lower() + for k, v in watson_crick.items()}) + +iupac = {'A': ['A'], + 'C': ['C'], + 'G': ['G'], + 'T': ['T'], + 'M': ['A', 'C'], + 'R': ['A', 'G'], + 'W': ['A', 'T'], + 'S': ['C', 'G'], + 'Y': ['C', 'T'], + 'K': ['G', 'T'], + 'V': ['A', 'C', 'G'], + 'H': ['A', 'C', 'T'], + 'D': ['A', 'G', 'T'], + 'B': ['C', 'G', 'T'], + 'N': ['G', 'A', 'T', 'C']} + +codon_maps = {} + + +def read_fasta(f, as_df=False): + if f.endswith('.gz'): + fh = gzip.open(f) + txt = fh.read().decode() + else: + fh = open(f, 'r') + txt = fh.read() + fh.close() + records = parse_fasta(txt) + if as_df: + return pd.DataFrame(records, columns=('name', 'seq')) + else: + return records + + +def parse_fasta(txt): + entries = [] + txt = '\n' + txt.strip() + for raw in txt.split('\n>'): + name = raw.split('\n')[0].strip() + seq = ''.join(raw.split('\n')[1:]).replace(' ', '') + if name: + entries += [(name, seq)] + return entries + + +def write_fasta(filename, list_or_records): + if isinstance(list_or_records, pd.DataFrame) and list_or_records.shape[1] == 2: + list_or_records = list_or_records.values + list_or_records = list(list_or_records) + with open(filename, 'w') as fh: + fh.write(format_fasta(list_or_records)) + + +def write_fake_fastq(filename, list_or_records): + with open(filename, 'w') as fh: + fh.write(format_fake_fastq(list_or_records)) + + +def format_fake_fastq(list_or_records): + """Generates a fake header for each read that is sufficient to fool bwa/NGmerge. 
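+
+    The first read, for example, is written under the header
+    '@M08044:78:000000000-L568G:1:0:0:0 1:N:0:AAAAAAAA' with a constant
+    quality string of 'G' characters.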
+ """ + fake_header = '@M08044:78:000000000-L568G:1:{tile}:{x}:{y} 1:N:0:AAAAAAAA' + if isinstance(next(iter(list_or_records)), str): + records = list_to_records(list_or_records) + else: + records = list_or_records + + max_value = 1000 + lines = [] + for i, (_, seq) in enumerate(records): + tile, rem = divmod(i, max_value**2) + x, y = divmod(rem, max_value) + lines.extend([fake_header.format(tile=tile, x=x, y=y), seq.upper(), '+', 'G' * len(seq)]) + return '\n'.join(lines) + + +def write_fastq(filename, names, sequences, quality_scores): + with open(filename, 'w') as fh: + fh.write(format_fastq(names, sequences, quality_scores)) + + +def format_fastq(names, sequences, quality_scores): + lines = [] + for name, seq, q_score in zip(names, sequences, quality_scores): + lines.extend([name, seq, '+', q_score]) + return '\n'.join(lines) + + +def list_to_records(xs): + n = len(xs) + width = int(np.ceil(np.log10(n))) + fmt = '{' + f':0{width}d' + '}' + records = [] + for i, s in enumerate(xs): + records += [(fmt.format(i), s)] + return records + + +def format_fasta(list_or_records): + if len(list_or_records) == 0: + records = [] + elif isinstance(list_or_records[0], str): + records = list_to_records(list_or_records) + else: + records = list_or_records + + lines = [] + for name, seq in records: + lines.extend([f'>{name}', str(seq)]) + return '\n'.join(lines) + + +def fasta_frame(files_or_search): + """Convenience function, pass either a list of files or a + glob wildcard search term. + """ + + if isinstance(files_or_search, str): + files = natsorted(glob(files_or_search)) + else: + files = files_or_search + + cols = ['name', 'seq', 'file_ix', 'file'] + records = [] + for f in files: + for i, (name, seq) in enumerate(read_fasta(f)): + records += [{ + 'name': name, 'seq': seq, 'file_ix': i, + 'file': f, + }] + + return pd.DataFrame(records)[cols] + + +def cast_cols(df, int_cols=tuple(), float_cols=tuple(), str_cols=tuple(), + cat_cols=tuple(), uint16_cols=tuple()): + return (df + .assign(**{c: df[c].astype(int) for c in int_cols}) + .assign(**{c: df[c].astype(np.uint16) for c in uint16_cols}) + .assign(**{c: df[c].astype(float) for c in float_cols}) + .assign(**{c: df[c].astype(str) for c in str_cols}) + .assign(**{c: df[c].astype('category') for c in cat_cols}) + ) + + +def translate_dna(s): + assert len(s) % 3 == 0, 'length must be a multiple of 3' + return ''.join([CODONS[s[i*3:(i+1)*3]] for i in range(int(len(s)/3))]) + + +def load_codons(organism): + f = os.path.join(RESOURCES, 'codon_usage', 'organisms.csv') + taxids = pd.read_csv(f).set_index('organism')['taxid'].to_dict() + + if organism.lower() == 'yeast': + organism = 's_cerevisiae' + organism = organism.lower().replace('.', '').replace(' ', '_') + + try: + table = f'{organism}_{taxids[organism]}.csv' + except KeyError: + raise ValueError(f'{organism} must be one of {list(taxids.keys())}') + f = os.path.join(RESOURCES, 'codon_usage', table) + return (pd.read_csv(f) + .assign(codon_dna=lambda x: x['codon'].str.replace('U', 'T'))) + + +def make_equivalent_codons(dna_to_aa): + """Make dictionary from codon to other codons for the same amino acid + """ + aa_to_dna = defaultdict(list) + for codon, aa in dna_to_aa.items(): + aa_to_dna[aa] += [codon] + + equivalent_codons = {} + for codon in dna_to_aa: + aa = dna_to_aa[codon] + equivalent_codons[codon] = list(set(aa_to_dna[aa]) - {codon}) + + return equivalent_codons + + +equivalent_codons = make_equivalent_codons(CODONS) + + +def reverse_complement(seq): + return ''.join(watson_crick[x] for x in 
seq)[::-1] + + +def sanger_database(drive): + df_sanger = drive.get_excel('cloning/sanger') + + extra_cols = [x for x in df_sanger.columns + if x not in ('identifier', 'search')] + + arr = [] + for _, row in df_sanger.iterrows(): + files = natsorted(glob(row['search'])) + if len(files) == 0: + print(f'No files found from row {row.to_dict()}') + continue + (pd.DataFrame({'file': files}) + .assign(**{x: row[x] for x in extra_cols}) + .assign(name=lambda x: x['file'].str.extract(row['identifier'])) + .assign(seq=lambda x: x['file'].apply(read_ab1)) + .assign(seq_rc=lambda x: x['seq'].apply(reverse_complement)) + .pipe(arr.append) + ) + + cols = extra_cols + ['name', 'file', 'seq', 'seq_rc'] + return pd.concat(arr)[cols] + + +def read_ab1(f): + from Bio import SeqIO + with open(f, 'rb') as fh: + records = list(SeqIO.parse(fh, 'abi')) + assert len(records) == 1 + seq = str(records[0].seq) + return seq + + +def print_alignment(a, b, width=60, as_string=False): + """Levenshtein alignment. + """ + import edlib + alignment = edlib.align(a, b, task='path') + d = edlib.getNiceAlignment(alignment, a, b) + lines = [] + for i in range(0, max(map(len, d.values())), width): + lines += [str(i)] + for x in d.values(): + lines += [x[i:i+width]] + + txt = '\n'.join(lines) + if as_string: + return txt + else: + print(txt) + + +def reverse_translate_max(aa_seq, organism='e_coli'): + if organism not in codon_maps: + codon_maps[organism] = (load_codons(organism) + .sort_values('relative_frequency', ascending=False) + .drop_duplicates('amino_acid') + .set_index('amino_acid')['codon_dna'].to_dict() + ) + codon_map = codon_maps[organism] + return ''.join([codon_map[x] for x in aa_seq]) + + +def reverse_translate_random(aa_seq, organism='e_coli', rs='input', cutoff=0.12): + if rs == 'input': + seed = hash(aa_seq) % 10**8 + rs = np.random.RandomState(seed=seed) + if (organism, cutoff) not in codon_maps: + codon_maps[(organism, cutoff)] = (load_codons(organism) + .query('relative_frequency > @cutoff') + .groupby('amino_acid')['codon_dna'].apply(list).to_dict() + ) + codon_map = codon_maps[(organism, cutoff)] + return ''.join([rs.choice(codon_map[x]) for x in aa_seq]) + + +def get_genbank_features(f, error_on_repeat=True): + from Bio import SeqIO + records = list(SeqIO.parse(open(f,'r'), 'genbank')) + if len(records) != 1: + raise ValueError(f'found {len(records)} records in genbank {f}') + + features = {} + for f in records[0].features: + label = f.qualifiers['label'][0] + seq = f.extract(records[0].seq) + if label in features and features[label] != seq and error_on_repeat: + raise ValueError(f'repeated feature {label}') + features[label] = str(seq) + return features + + +def rolling_gc(seq, window): + from scipy.ndimage.filters import convolve + gc_window = 20 + return convolve(np.array([x in 'GC' for x in seq])*1., + np.ones(window)/window, + mode='reflect') + + +def to_codons(seq): + assert len(seq) % 3 == 0 + return [seq[i*3:(i+1)*3] for i in range(int(len(seq)/3))] + + +def codon_adaptation_index(seq, organism='e_coli'): + return (load_codons(organism) + .assign(w=lambda x: x.groupby('amino_acid')['relative_frequency'] + .transform(lambda y: y / y.max())) + .set_index('codon_dna') + .loc[to_codons(seq)]['w'] + .pipe(lambda x: np.prod(x)**(1/len(x))) + ) + + +def compare_sequences(sequences, window=25, k=6): + import matplotlib.pyplot as plt + + fig, (ax0, ax1) = plt.subplots(figsize=(12, 4), ncols=2) + for name in sequences: + cai = codon_adaptation_index(sequences[name]) + mean_gc = np.mean([x in 'GC' for x in 
sequences[name]]) + gc_trace = rolling_gc(sequences[name], window) + label = f'{name}: avg={mean_gc:.2g} std={np.std(gc_trace):.2g} cai={cai:.2g}' + ax0.plot(gc_trace, label=label) + + (pd.Series(get_kmers(sequences[name], k)) + .value_counts().value_counts().sort_index() + .pipe(lambda x: x/x.sum()) + .plot(ax=ax1, marker='.', ms=10, label=name)) + + + ax0.plot([0, len(gc_trace)], [0.5, 0.5], color='gray', lw=1, ls='--', zorder=-1) + ax0.legend() + ax0.set_title(f'average GC content over {window} nt window') + ax0.set_ylabel('GC fraction') + ax0.set_xlabel('DNA sequence position') + ax0.set_ylim([0.25, 0.85]) + ax0.set_xlim([0, len(gc_trace)]) + + ax1.set_title(f'repeated kmers with k={k}') + ax1.set_xlabel('kmer count') + ax1.set_ylabel(f'fraction of kmers') + ax1.set_xticks([1,2,3,4,5]) + ax1.legend() + + return fig + + +def get_kmers(s, k): + n = len(s) + return [s[i:i+k] for i in range(n-k+1)] + + +def read_fastq(filename, max_reads=1e12, include_quality=False, include_name=False, + include_index=False, progress=lambda x: x, full_name=False): + if max_reads is None: + max_reads = 1e12 + if filename.endswith('gz'): + fh = gzip.open(filename, 'rt') + else: + fh = open(filename, 'r') + reads, quality_scores, names, indices = [], [], [], [] + read_count = 0 + for i, line in progress(enumerate(fh)): + if i % 4 == 1: + reads.append(line.strip()) + read_count += 1 + if include_quality and i % 4 == 3: + quality_scores.append(line.strip()) + if include_name and i % 4 == 0: + if full_name: + names.append(line.strip()) + else: + names.append(':'.join(line.split()[0].split(':')[3:7])) + if include_index and i % 4 == 0: + indices.append(line.split(':')[-1].strip()) + if i % 4 == 3 and read_count >= max_reads: + break + + fh.close() + if include_quality or include_name or include_index: + return_val = (reads,) + if include_quality: + return_val += (quality_scores,) + if include_name: + return_val += (names,) + if include_index: + return_val += (indices,) + return return_val + else: + return reads + + +def quality_scores_to_array(quality_scores, baseline=ord('!')): + """Only works if all quality scores have equal length. + Expects strings not bytes. + """ + q = np.array(quality_scores) + return (np.array(q).astype(f'S{len(q[0])}') + .view('S1').view(np.uint8) + .reshape(len(q), len(q[0])) + - baseline + ) + + +def aa_to_dna_re(aa_seq): + return ''.join(f'(?:{CODONS_REVERSE[x]})' for x in aa_seq) + + +def make_kmer_dict(sequences, k): + """ + """ + kmers = defaultdict(list) + for i, seq in enumerate(sequences): + for kmer in get_kmers(seq, k): + kmers[kmer].append(i) + return kmers + + +def match_nearest(query, sequences, kmers): + from Levenshtein import distance + + k = len(next(iter(kmers.keys()))) + + candidates = [] + for kmer in get_kmers(query, k): + candidates.extend(kmers[kmer]) + candidates = set(candidates) + # guess + candidates = sorted(candidates, key=lambda i: ~ + sequences[i].startswith(query[:2])) + + matches = [] + for i in candidates: + d = distance(sequences[i], query) + matches.append((d, i)) + # exact match + if d == 0: + break + d, i = sorted(matches)[0] + return d, i + + +def match_queries(queries, sequences, window, k, progress=lambda x: x): + """Match queries to reference sequences based on Levenshtein distance between + prefixes of length `window`. Only pairs with a shared kmer of length `k` are + checked. For each query, finds the first nearest prefix and returns all sequences + that share that prefix. 
+ """ + query_lookup = {x: x[:window] for x in queries} + query_prefixes = sorted(set([x[:window] for x in queries])) + + ref_lookup = defaultdict(list) + for x in sequences: + ref_lookup[x[:window]].append(x) + ref_prefixes = sorted(set([x[:window] for x in sequences])) + + kmers = make_kmer_dict(ref_prefixes, k) + + hits = {} + for q in progress(query_prefixes): + try: + hits[q] = match_nearest(q, ref_prefixes, kmers) + except IndexError: + pass + + results = [] + for q in queries: + try: + d, i = hits[query_lookup[q]] + results.append(ref_lookup[ref_prefixes[i]]) + except KeyError: + results.append([]) + return results + + +def add_design_matches(df_reads, col, reference, window, k): + """ + `df_reads` is a dataframe containing `col` with sequences + `reference` is a list of references + """ + queries = df_reads[col].fillna('').pipe(list) + queries = [q if '*' not in q else '' for q in queries] + results = match_queries(queries, reference, window, k) + + df_reads = df_reads.copy() + design_distance, design_match, design_equidistant = zip( + *calculate_distance_matches(queries, results)) + return (df_reads + .assign(design_match=design_match, design_distance=design_distance, + design_equidistant=design_equidistant) + ) + + +def calculate_distance_matches(queries, results): + """Get columns `design_distance` and `design_match` from results of `match_queries`. + """ + from Levenshtein import distance + + arr = [] + for q, rs in zip(queries, results): + if len(rs) == 0: + arr += [(-1, '', 0)] + else: + ds = [(distance(q, r), r) for r in rs] + d, s = sorted(ds)[0] + degeneracy = sum([x[0] == d for x in ds]) + arr += [(d, s, degeneracy)] + return arr + + +def match_and_check(queries, reference, window, k, ignore_above=40, progress=lambda x: x): + """Perform fast Levenshtein distance matching of queries to reference + and check the results by brute force calculation (all pairs). Mismatches + with an edit distance greater than `ignore_above` are ignored. + """ + from Levenshtein import distance + + print(f'Matching {len(queries)} queries to {len(reference)} ' + f'reference sequences, window={window} and k={k}') + df_matched = (pd.DataFrame({'sequence': queries}) + .pipe(add_design_matches, col='sequence', + reference=reference, window=window, k=k)) + it = (df_matched + [['sequence', 'design_match']].values) + print('Checking fast matches against brute force matches...') + different = 0 + for seq, match in progress(it): + xs = sorted(reference, key=lambda x: distance(x, seq)) + a, b = distance(seq, match), distance(seq, xs[0]) + if b < a and b <= ignore_above: + different += 1 + print(f'{a},{b} (fast,exact distance); query={seq}') + print(f'Total mismatches: {different}') + return df_matched + + +def load_abi_zip(filename): + """Extract Bio.SeqIO records from sanger zip file. 
+ """ + import zipfile + from io import BytesIO + + from Bio import SeqIO + zh = zipfile.ZipFile(filename, 'r') + arr = [] + for zi in zh.filelist: + if not zi.filename.endswith('ab1'): + print(f'Skipping {zi.filename}') + continue + fh = zh.open(zi.filename, 'r') + buffer = BytesIO(fh.read()) + arr += [SeqIO.read(buffer, 'abi')] + return arr + + +def get_abi_traces(abi_record): + channels = 'DATA9', 'DATA10', 'DATA11', 'DATA12' + bases = list('GATC') + traces = np.array([abi_record.annotations['abif_raw'][c] + for c in channels]) + df = pd.DataFrame(traces).T + df.columns = bases + return df + + +def try_translate_dna(s): + try: + return translate_dna(s) + except: + return None + + +def digest_protein_fasta(filename, digest_pat='[R|K]'): + """Convert protein fasta into a table of peptides, digesting with provided regex. + """ + records = read_fasta(filename) + + arr = [] + for name, seq in records: + parts = re.split(f'({digest_pat})', seq) + parts = [x for x in parts if x] + peptides = [a+b for a,b in zip(parts[::2], parts[1::2])] + for peptide in peptides: + arr += [{'name': name, 'sequence': peptide}] + + return pd.DataFrame(arr) + + +def findone(aa, dna): + """Simple case of amino acid substring in in-frame DNA. + """ + aa_ = translate_dna(dna) + i = aa_.index(aa) + return dna[i * 3:(i + len(aa)) * 3] + + +def select_most_different(xs, n): + """Quickly select sequences with high mutual Levenshtein distance. + """ + from Levenshtein import distance + xs = list(xs) + arr = [xs.pop()] + for _ in range(n - 1): + new = sorted(xs, key=lambda x: -min(distance(x, y) for y in arr))[0] + xs.remove(new) + arr += [new] + return arr + + +def edge_primers(dna, melt_temp): + from Bio.SeqUtils.MeltingTemp import Tm_NN + dna_rev = reverse_complement(dna) + + fwd = dna[:10] + while Tm_NN(fwd) < melt_temp: + fwd = dna[:len(fwd) + 1] + + rev = dna_rev[:10] + while Tm_NN(rev) < melt_temp: + rev = dna_rev[:len(rev) + 1] + + return fwd, rev + + +def find_orfs(seq, kozak='GCCACC', return_aa=True): + """Find open reading frames starting with kozak in circular + DNA sequence. + """ + plasmid = str(seq) + pat = f'{kozak}(ATG(?:...)*?)(?:TAA|TAG|TGA)' + ref = plasmid + plasmid + orfs = re.findall(pat, ref) + re.findall(pat, reverse_complement(ref)) + if return_aa: + orfs = [translate_dna(x) for x in orfs] + return sorted(set(orfs), key=lambda x: -1 * len(x)) + + +def find_longest_orf(dna, kozak='GCCACC'): + pat = f'{kozak}(ATG(?:...)*?)(?:TAA|TGA|TAG)' + orfs = re.findall(pat, dna.upper()) + if orfs: + return sorted(orfs, key=len)[-1] + + +def add_features(record, features): + """Add features to `record` based on name=>DNA dictionary `features`. + A feature with "orf" in the name is given a special annotation that + shows up as a translation in benchling. 
+ """ + + vector = str(record.seq).upper() + + n = len(vector) + + features = dict(features) + arr = [] + for name, feature in features.items(): + feature = feature.upper() + m = len(feature) + + for strand in (-1, 1): + key = reverse_complement(feature) if strand == -1 else feature + starts = [x.start() for x in re.finditer(key, vector * 2)] + starts = [x for x in starts if x < n] + for start in starts: + end = start + m + qualifiers = dict(label=name) + feature_type = 'misc' + if 'orf' in name: + feature_type = 'CDS' + qualifiers['translation'] = translate_dna(key) + '*' + end += strand * 3 + + if end < n: + location = FeatureLocation(start, end, strand=strand) + else: + f1 = FeatureLocation(start, n, strand=strand) + f2 = FeatureLocation(0, end % n, strand=strand) + location = f1 + f2 + arr += [SeqFeature(location, type=feature_type, qualifiers=qualifiers)] + + # copies features but not annotations + new_record = record[:] + new_record.annotations = record.annotations + new_record.features += arr + return new_record + + +def translate_to_stop(x): + if not isinstance(x, str): + return + if 'N' in x: + return + y = translate_dna(x[:3 * int(len(x)/3)]) + if '*' in y: + return y.split('*')[0] + return + + +def ion_plasmid_integration(filename): + """Generate plasmid map after integration for an iON vector + :param filename: iON vector genbank + """ + left_itr = 'CCCTAGAAAGATAGTCTGCGTAAAATTGACGCATG'.upper() + right_itr = 'ccctagaaagataatcatattgtgacgtacgttaaagataatcatgcgtaaaattgacgcatg'.upper() + + record = list(SeqIO.parse(filename, 'genbank'))[0] + original_features = get_genbank_features(filename, error_on_repeat=False) + # remove annoying small features + original_features = {k: v for k,v in original_features.items() if 6 < len(v)} + + dna = str(record.seq) + start = dna * 2 + # in iON figure 1A notation + # partial, one, two, one, partial + _, one, two, _, _ = split_by_regex(f'{left_itr}|{right_itr}', start, keep='prefix') + + # deduplicate + one, two = ['AA' + x[:-2] for x in (one, two)] + result = two + reverse_complement(one) + + result_record = SeqRecord(Seq(result), name=record.name + '_integrated', + annotations={'molecule_type': 'DNA'}) + result_record = add_features(result_record, original_features) + + return result_record + + +def remove_restriction_sites(dna, sites, rs): + codon_sequence = to_codons(dna) + sites = set(sites) + for site in list(sites): + sites.add(reverse_complement(site)) + + for site in sites: + width = len(site) + dna = ''.join(codon_sequence) + if site not in dna: + continue + + for i in range(len(dna)): + # if we encounter a restriction site + if dna[i:i + len(site)] == site: + # change any of these codons + overlapped_codons = sorted( + set([int((i + offset) / 3) for offset in range(width)])) + # accept first change that removes restriction site + for j in overlapped_codons: + # change this codon + new_codon = swap_codon(codon_sequence[j], rs) + local_dna = ''.join([new_codon if k == j else codon_sequence[k] + for k in overlapped_codons]) + # if codon removes this site, keep it + if site not in local_dna: + codon_sequence = codon_sequence[:j] + \ + [new_codon] + codon_sequence[j + 1:] + break + dna = ''.join(codon_sequence) + for site in sites: + assert site not in dna + + return dna + + +def restriction_sites_in_dna(dna, sites): + sites = set(list(sites) + [reverse_complement(x) for x in sites]) + for site in sites: + if site in dna: + return True + return False + + +def to_codons(dna): + assert len(dna) % 3 == 0 + return [dna[i * 3:(i + 1) * 3] for i in 
range(int(len(dna) / 3))] + + +def swap_codon(codon, rs): + """Swap codon at random, if possible. + """ + options = equivalent_codons[codon] + if not options: + return codon + return rs.choice(options) + + +def remove_restriction_sites_dnachisel(cds, avoid, rs=None): + if not restriction_sites_in_dna(cds, avoid): + return cds + + import dnachisel as dc + + if rs is None: + np.random.seed(0) + else: + np.random.seed(int(1e9*rs.rand())) + + n = len(cds) + + constraints = [dc.AvoidPattern(x, location=0) for x in avoid] + constraints += [dc.EnforceTranslation(location=(0, n))] + + problem = dc.DnaOptimizationProblem( + sequence=cds, + constraints=constraints, + objectives=[dc.AvoidChanges()], + logger=None + ) + problem.max_iters = 50 + problem.resolve_constraints() + problem.optimize() + + assert not restriction_sites_in_dna(problem.sequence, avoid) + return problem.sequence + + +def pairwise_levenshtein(seqs): + """Calculate distance matrix for a list of sequences. + """ + from Levenshtein import distance + arr = [] + for i, a in enumerate(seqs): + for j, b in enumerate(seqs[i+1:]): + arr += [(i, i + j + 1, distance(a,b))] + n = len(seqs) + D = np.zeros((n, n), dtype=int) + i, j, d = zip(*arr) + D[i, j] = d + return D + D.T + + +def generate_adapters(priming_length=12, enzyme='BsmBI', tm_range=2, target_GC=0.5, + max_calculate=10_000, num_sample=100_000, distance_threshold=6, seed=0): + """Tm is calculated for priming site plus restriction site. A target Tm is defined + based on the average of sequences within 10% of target_GC. + """ + from Levenshtein import distance + from scipy.sparse import csr_matrix + from .utils import maxy_clique_groups + from Bio.SeqUtils import MeltingTemp as mt + + enzymes = {'BsmBI': 'CGTCTC'} + site = enzymes[enzyme] + avoid = 'AAA', 'GGG', 'CCC', 'TTT', site, reverse_complement(site) + rs = np.random.RandomState(seed) + candidates = [''.join(x) for x in rs.choice(list('ACTG'), size=(num_sample, priming_length))] + candidates = [x for x in candidates if not any(y in x for y in avoid)] + + df_adapters = pd.DataFrame({'sequence': candidates}) + + df_adapters['Tm'] = (df_adapters['sequence'] + site).apply(mt.Tm_NN) + df_adapters['GC'] = df_adapters['sequence'].str.count('G|C') / df_adapters['sequence'].str.len() + + mean_tm = df_adapters.query('abs(GC - @target_GC) < 0.1')['Tm'].mean() + df_adapters = df_adapters.query('abs(Tm - @mean_tm) < @tm_range/2') + + seqs = df_adapters['sequence'][:max_calculate] + print(f'Calculating pairwise distances for {len(seqs):,} priming sites') + D = pairwise_levenshtein(seqs) + + print(f'Selecting priming sites with pairwise distance >= {distance_threshold}') + + A = distance_threshold <= D + cm = csr_matrix(1-A, shape=A.shape) + centers = np.array(maxy_clique_groups(cm, [1] * A.shape[0])) + + return df_adapters['sequence'].iloc[centers].pipe(list) + + +def kmers_unique(s, k): + kmers = get_kmers(s, k) + return len(kmers) == len(set(kmers)) + + +def translate_to_stop(dna): + n = len(dna) % 3 + if n != 0: + dna = dna[:-(len(dna) % 3)] + dna = dna.upper() + assert len(dna) % 3 == 0 + aa = translate_dna(dna) + if '*' in aa: + return aa.split('*')[0] + return aa + # assert '*' in aa + + +def find_aa_in_dna(aa, dna): + """Simple case of amino acid substring in DNA. + """ + dna = dna[:-(len(dna) % 3)].upper() + aa_ = translate_dna(dna) + i = aa_.index(aa) + return dna[i * 3:(i + len(aa)) * 3] + + +def sequence_sap_score(x): + """Sum of SAP weights for all residues, regardless of solvent exposure. 
+ """ + from .ppi.constants import SAP_WEIGHTS + return sum([SAP_WEIGHTS[aa] for aa in x]) \ No newline at end of file diff --git a/src/ngs_analysis/timer.py b/src/ngs_analysis/timer.py new file mode 100644 index 0000000..9f31c13 --- /dev/null +++ b/src/ngs_analysis/timer.py @@ -0,0 +1,112 @@ +"""A timer that can be used in a "with" statement. The results +can be printed out or saved to a dictionary (repeat uses in the +same closure will be saved too). +""" +import time +import inspect +import uuid + +_timings = {} +invocation_ids = {} + +class Timer: + def __init__(self, name=None, verbose=False): + """A useful timer. See `example` below. + + :param name: used in timings dictionary, if None then not added to + dictionary + :param verbose: if True, print the name and time elapsed when finished; + if a string, print the string when starting, and time elapsed when + finished + """ + self.name = name + self.verbose = verbose + + def __enter__(self): + self.start = time.time() + self.closure = identify_this_function(1) + if isinstance(self.verbose, str): + print(self.verbose, end=' ', flush=True) + return self + + def __exit__(self, type, value, traceback): + end = time.time() + elapsed_time = end - self.start + if self.verbose is True: + if self.name is None: + print('Done in: {elapsed_time:.3g} seconds') + return + else: + print(f'{self.name}: {elapsed_time:.3g} seconds') + elif isinstance(self.verbose, str): + print(f'done in {elapsed_time:.3g} seconds') + + this = self.closure + + if this not in _timings: + _timings[this] = {} + + if self.name in _timings[this]: + if isinstance(_timings[this][self.name], list): + _timings[this][self.name].append(elapsed_time) + else: + # convert to list + _timings[this][self.name] = [_timings[this][self.name], elapsed_time] + else: + _timings[this][self.name] = elapsed_time + + +def get_all_timings(): + """Gets timings measured in the parent frame. + """ + return _timings.get(identify_this_function(1), {}) + + +def example(): + + def f_0(): + with Timer('task1'): + time.sleep(0.5) + with Timer('task1'): + time.sleep(0.2) + print(get_all_timings()) + + def f_1(): + with Timer('task2', verbose=True): + time.sleep(1) + print(get_all_timings()) + + def f_2(): + for _ in range(5): + with Timer('task3'): + time.sleep(0.1) + print(get_all_timings()) + + def f_3(): + with Timer('task4', verbose='Running task4...'): + time.sleep(3) + print(get_all_timings()) + + f_0() + f_1() + f_2() + f_3() + f_0() + + +def identify_this_function(go_up=0): + """Get a unique ID specific to this invocation of the function. + + :param go_up: how many extra frames to go up in the stack + """ + frame = inspect.currentframe() + caller_frame = frame.f_back + for _ in range(go_up): + caller_frame = caller_frame.f_back + + frame_id = id(caller_frame) + + if frame_id not in invocation_ids: + invocation_ids[frame_id] = str(uuid.uuid4()) + + return invocation_ids[frame_id]