diff --git a/environment.yaml b/environment.yaml index 3dfc184..549c06f 100644 --- a/environment.yaml +++ b/environment.yaml @@ -15,4 +15,5 @@ dependencies: - python-slugify - pyyaml - regex + - seaborn - tqdm diff --git a/examples/paired_reads/config.yaml b/examples/paired_reads/config.yaml index 8ed32e7..2ce7fe4 100644 --- a/examples/paired_reads/config.yaml +++ b/examples/paired_reads/config.yaml @@ -1,6 +1,6 @@ # 1. match the insert DNA left_adapter: GATATACC # before MGS -right_adapter: CTCGAGCA # after barcode +right_adapter: CTCGAGCACCA # after barcode # 2. use regular expression to capture a part from the # insert DNA (adapters are removed) diff --git a/examples/paired_reads/reference_dna.csv b/examples/paired_reads/reference_dna.csv index 8788fce..274e60e 100644 --- a/examples/paired_reads/reference_dna.csv +++ b/examples/paired_reads/reference_dna.csv @@ -1,3 +1,5 @@ -name,vector_dna -example_1,ttgtttaactttaagaaggaGATATACCATGGGCAGCAGCCATCATCACCATCACCACAGCAGTGGCAGTGGATCCGACAAAGAATGGATTTTACAAAAGATCTATGAAATCATGCGCTTACTGGACGAGTTGGGCCATGCAGAAGCAAGTATGCGTGTCTCCGACTTGATCTACGAGTTCATGAAGAAGGGGGACGAGCGCTTATTGGAAGAAGCGGAACGTTTGTTAGAAGAAGTAGAGTAGGTTACCGGTATCTCGCTTGCGGTGAGTTCTCGAGCAccaccataactcgagcacca -example_2,ttgtttaactttaagaaggaGATATACCATGGGCAGCGACAAAGAATGGATTTACCAAAAGATCTATGAAATCATGCGCTTACTGAGGGAGTTGGGCCATGCAGAAGCAAGTATGCGTGTCTCCGACTTGATCTACGAGGCAATGAAGAAGGGGGACGAGCGCTTATTGGAAGAAGCGGAACGTTTGTTAGAAGAAGTAGAGAGCAGTGGCAGTGGATCCCATCATCACCATCACCACTAGGTTACCGGTATCTCGCTTGCACCGCAGTCTCGAGCAccaccataactcgagcacca +source,name,reference_dna +pool1,example_1,ttgtttaactttaagaaggaGATATACCATGGGCAGCAGCCATCATCACCATCACCACAGCAGTGGCAGTGGATCCGACAAAGAATGGATTTTACAAAAGATCTATGAAATCATGCGCTTACTGGACGAGTTGGGCCATGCAGAAGCAAGTATGCGTGTCTCCGACTTGATCTACGAGTTCATGAAGAAGGGGGACGAGCGCTTATTGGAAGAAGCGGAACGTTTGTTAGAAGAAGTAGAGTAGGTTACCGGTATCTCGCTTGCGGTGAGTTCTCGAGCAccaccataagtcagtcagtc +pool2,example_2,ttgtttaactttaagaaggaGATATACCATGGGCAGCGACAAAGAATGGATTTACCAAAAGATCTATGAAATCATGCGCTTACTGAGGGAGTTGGGCCATGCAGAAGCAAGTATGCGTGTCTCCGACTTGATCTACGAGGCAATGAAGAAGGGGGACGAGCGCTTATTGGAAGAAGCGGAACGTTTGTTAGAAGAAGTAGAGAGCAGTGGCAGTGGATCCCATCATCACCATCACCACTAGGTTACCGGTATCTCGCTTGCACCGCAGTCTCGAGCAccaccataagtcagtcagtc +pool3,example_1,ttgtttaactttaagaaggaGATATACCATGGGCAGCAGCCATCATCACCATCACCACAGCAGTGGCAGTGGATCCGACAAAGAATGGATTTTACAAAAGATCTATGAAATCATGCGCTTACTGGACGAGTTGGGCCATGCAGAAGCAAGTATGCGTGTCTCCGACTTGATCTACGAGTTCATGAAGAAGGGGGACGAGCGCTTATTGGAAGAAGCGGAACGTTTGTTAGAAGAAGTAGAGTAGGTTACCGGTATCTCGCTTGCGGTGAGTTCTCGAGCAccaccataagtcagtcagtc +pool3,example_3,ttgtttaactttaagaaggaGATATACCATGGGCAGCGACAAAGAATGGATTTACCAAAAGATCTATGAAATCATGCGCTTACTGAGGGAGTTGGGCCATGGCGAAGCAAGTATGCGTGTCTTAGACTTGATCTACGAGGCAATGAGGAAGGGGGACGAGCGCTTATTGAGGGAACTCGAACGTTTGTTAGAAGAAGTAGAGAGCAGTGGCAGTGGATCCCATCATCACCATCACCACTAGGTTACCGGTATCTCGCTTGCGTTGTACGCTCGAGCAccaccataagtcagtcagtc \ No newline at end of file diff --git a/examples/paired_reads/samples.csv b/examples/paired_reads/samples.csv index 7034b3a..481871b 100644 --- a/examples/paired_reads/samples.csv +++ b/examples/paired_reads/samples.csv @@ -1,3 +1,5 @@ sample,fastq_name sample_A,sample_A sample_B,sample_B +sample_C,sample_C +sample_D,sample_D diff --git a/src/ngs_analysis/app.py b/src/ngs_analysis/app.py index ab37ac6..13ae81d 100644 --- a/src/ngs_analysis/app.py +++ b/src/ngs_analysis/app.py @@ -1,11 +1,10 @@ -"""Analyze sequencing reads and compare Dna and protein parts to a reference. +"""Analyze sequencing reads and compare DNA and protein parts to a reference. 
Pattern-matching is used to define parts (e.g., designs, variable linkers, fixed sequences, and barcodes). Identified parts can be mapped to inferred reference sequences (e.g., design inferred from barcode). See README for details. """ -import io import os import re import shutil @@ -13,81 +12,21 @@ import tempfile from glob import glob from subprocess import DEVNULL -from typing import Optional, Tuple +from typing import Tuple import fire import Levenshtein import numpy as np import pandas as pd -import pandera as pa -import yaml -from pandera.typing import Series # import quickdna +from .constants import * from .sequence import (read_fasta, read_fastq, reverse_complement, - translate_dna, write_fake_fastq, write_fasta) + write_fake_fastq, write_fasta) from .timer import Timer +from .types import * from .utils import assert_unique, nglob, sample_at_coverage - -# user-provided -config_yaml = 'config.yaml' # how to parse and compare -sample_table = 'samples.csv' # NGS samples -reference_dna_table = 'reference_dna.csv' # designed insert DNA -sample_plan_table = 'sample_plan.csv' # cloning plan - -working_directories = '0_paired_reads', '1_reads', '2_parsed', '3_mapped', '3_mapped/map' - -# generated or provided -design_table = 'designs.csv' # designed DNA or protein parts - -ngmerge = 'NGmerge' -bwa = 'bwa' -mmseqs = 'mmseqs' - -MMSEQS_KMER_AA = 6 -MMSEQS_KMER_DNA = 12 - -### TYPES -# TODO: joint uniqueness, DNA characters -# currently handled in load_reference_dna -class ReferenceDna(pa.DataFrameModel): - source: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) - name: Series[str] = pa.Field(nullable=False, unique=True) - reference_dna: Series[str] = pa.Field(nullable=False, unique=True) - - -# simulate based on reference sources -# could also be used to generate comparison of expected vs actual results -class SamplePlan(pa.DataFrameModel): - sample: Series[str] = pa.Field(nullable=False, coerce=True) - source: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) - coverage: Optional[Series[float]] = pa.Field(nullable=False) - - -# simulate based on input sequences -class DnaPlan(pa.DataFrameModel): - sample: Series[str] = pa.Field(nullable=False, coerce=True) - reference_dna: Series[str] = pa.Field(nullable=False, unique=True) - - -class Samples(pa.DataFrameModel): - sample: Series[str] = pa.Field(nullable=False, unique=True, - coerce=True) - fastq_name: Series[str] = pa.Field(nullable=False, unique=True, - coerce=True) - - -# build this by using config.yaml to parse ReferenceDna -class Designs(pa.DataFrameModel): - source: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) - name: Series[str] = pa.Field(nullable=False, coerce=True) - - -class Candidates(pa.DataFrameModel): - read_index: Series[int] = pa.Field(nullable=False) - query: Series[str] = pa.Field(nullable=False) - reference: Series[str] = pa.Field(nullable=False) - reference_name: Series[str] = pa.Field(nullable=False) +from .load import * ### MAIN STEPS @@ -132,7 +71,7 @@ def setup(clean=False): def dna_to_designs(): - """Generate a design table by parsing the reference Dna sequences. + """Generate a design table by parsing the reference DNA sequences. 
""" df_reference = load_reference_dna() config = load_config() @@ -178,21 +117,25 @@ def simulate_single_reads(mutations_per_read: int=0, coverage=None): print(f'Wrote {len(reads):,} single reads to {f}') +def clear_simulation(): + files = glob('*/simulate/*') + [os.remove(x) for x in files] + + def simulate_paired_reads(read_lengths: Tuple[int, int]=(300, 300), mutations_per_read: int=0, coverage=None): """Create simulated fastq files based on planned samples. :param read_lengths: forward and reverse read lengths """ - df_planned = load_sample_plan() df_reference = load_reference_dna() - df = df_planned.merge(df_reference, how='left') - if 'coverage' not in df: - df['coverage'] = coverage + df_planned = load_sample_plan().merge(df_reference, how='left') + if 'coverage' not in df_planned: + df_planned['coverage'] = coverage create_directories() - it = df.groupby(['sample', 'coverage'], sort=False, dropna=False) + it = df_planned.groupby(['sample', 'coverage'], sort=False, dropna=False) for (sample, coverage), df in it: r1 = df['reference_dna'].str[:read_lengths[0]] r2 = df['reference_dna'].apply(reverse_complement).str[:read_lengths[1]] @@ -321,68 +264,6 @@ def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10): print(f'Wrote match table ({len(df_match):,} rows) to {filenames["mapped"]}') -### LOAD - - -def load_config(): - with open(config_yaml, 'r') as fh: - return yaml.safe_load(fh) - - -def load_reference_dna() -> ReferenceDna: - return (pd.read_csv(reference_dna_table) - .pipe(ReferenceDna) - .assign(reference_dna=lambda x: x['reference_dna'].str.upper()) - .pipe(assert_unique, 'name') - .pipe(assert_unique, 'reference_dna') - ) - - -def load_sample_plan() -> SamplePlan: - df = pd.read_csv(sample_plan_table).pipe(SamplePlan) - if 'source' in df: - cols = pd.read_csv(reference_dna_table, nrows=1).columns - if 'source' not in cols: - raise ValueError('Sample plan has source column but reference does not') - # TODO: check that samples are in samples.csv - return df - - -def load_samples() -> Samples: - return pd.read_csv(sample_table).pipe(Samples) - - -def load_designs() -> Designs: - """Add translations that are needed for comparison but not already present. 
- """ - df_designs = pd.read_csv(design_table) - fields, _ = get_comparison_fields() - for field in fields: - if field.endswith('_aa') and field[:-3] in df_designs: - if field in df_designs: - raise ValueError(f'Cannot provide both Dna and amino acid for {field}') - df_designs[field] = [quick_translate(x) for x in df_designs[field[:-3]]] - elif field not in df_designs: - raise ValueError(f'Cannot derive {field} from design table') - return df_designs.pipe(Designs) - - -def load_seqs_to_map(sample, simulate, field): - filenames = get_filenames(sample, simulate, field) - - aa = field.endswith('_aa') - field_no_aa = field[:-3] if aa else field - - seqs_to_map = (pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) - .rename(columns={field_no_aa: 'query'}) - .dropna() - ) - if aa: - seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] - - return seqs_to_map - - ### UTILITIES @@ -402,18 +283,12 @@ def describe_parsed(df_parsed): to_string = lambda x: ' ' + x.to_string().replace('\n', '\n ') print('Null entries:') print(null_summary.pipe(to_string)) - print('Dna length:') + print('DNA length:') print(dna_length.pipe(describe).pipe(to_string)) print('Protein length:') print(aa_length.pipe(describe).pipe(to_string)) -def quick_translate(seq): - # return str(quickdna.DnaSequence(seq).translate()) - n = len(seq) - len(seq) % 3 - return translate_dna(seq[:n]) - - def format_string_to_capture_regex(x, optional=[], **kwargs): formatted_string = re.sub(r'{(.*?)}', r'(?P<\1>.*)', x) for key, value in kwargs.items(): @@ -423,34 +298,16 @@ def format_string_to_capture_regex(x, optional=[], **kwargs): return formatted_string -def get_comparison_fields(): - """Parse fields for direct matching and field pairs for inferred matching. - """ - direct_match, inferred_match = set(), set() - config = load_config() - for c in config['compare']: - if '->' in c: - a, b = [x.strip() for x in c.split('->')] - direct_match |= {a} - inferred_match |= {(a, b)} - else: - direct_match |= {c.strip()} - - return direct_match, inferred_match - - def parse_sequences(config, sequences): - """Match Dna sequence based on pattern config. + """Match DNA sequence based on pattern config. First, the sequence between left_adapter and right_adapter is extracted. - Then, any Dna parts are matched. + Then, any DNA parts are matched. Then, a protein is translated starting at the position matched by the pattern in config.protein.start_at using `translate_to_stop`. Finally, the protein is matched using the template string protein.pattern, - and named matches are added to the result dictionary. Only Dna is stored here. + and named matches are added to the result dictionary. Only DNA is stored here. """ re_insert = re.compile('{left_adapter}(.*){right_adapter}'.format(**config)) - import os - os.wtf = re_insert capture = config['protein'].get('capture', {}) optional = config['protein'].get('optional', []) re_proteins = [re.compile(format_string_to_capture_regex( @@ -650,7 +507,8 @@ def mmseqs_prefilter(sample, simulate, field, mmseqs_max_seqs=10, verbose=False) seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] kmer = 6 else: - kmer = 12 + # what should this be? 
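+        # the old value of 12 makes the DNA prefilter index large
+        # (5**k * 8 bytes, per the -k comment in mmseqs_search below);
+        # 6 matches MMSEQS_KMER_AA and could be imported from constants.py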
+ kmer = 6 write_fasta(filenames['map query'], seqs_to_map.dropna()) make_mmseqs_db(filenames['map query'], aa, is_query=True) @@ -667,6 +525,7 @@ def mmseqs_prefilter(sample, simulate, field, mmseqs_max_seqs=10, verbose=False) # without these flags, mmseqs will sometimes throw out exact matches '--comp-bias-corr', '0', '--mask', '0', + '--spaced-kmer-mode', '0', # '--exact-kmer-matching', '1', # prefilter only retains exact kmer matches ) if verbose: @@ -674,12 +533,13 @@ def mmseqs_prefilter(sample, simulate, field, mmseqs_max_seqs=10, verbose=False) result = subprocess.run(command, capture_output=True) # result = subprocess.run(command) if result.returncode != 0: + # if result.returncode != 0 or True: msg = '\n'.join([str(x) for x in (result.args, result.stdout.decode(), result.stderr.decode()) ]) - raise ValueError('Non-zero return code in mmseqs prefilter') + raise ValueError(f'Non-zero return code in mmseqs prefilter:\n{msg}') # convert to tsv @@ -753,7 +613,7 @@ def mmseqs_search(sample, field, simulate=False, min_seq_id=0.8) -> Candidates: filenames['map mmseqs result db'], tmp, '--min-seq-id', str(min_seq_id), - '-k', str(kmer), # memory usage is 21**k * 8 bytes (protein) or 5**k * 8 bytes (Dna) + '-k', str(kmer), # memory usage is 21**k * 8 bytes (protein) or 5**k * 8 bytes (DNA) '-s', '1', # sensitivity, 1 is fastest, 7.5 is most sensitive '-c', '0.9', # minimum alignment coverage on query/target '--exact-kmer-matching', '1', # prefilter only retains exact kmer matches diff --git a/src/ngs_analysis/constants.py b/src/ngs_analysis/constants.py index 26fc605..86dd275 100644 --- a/src/ngs_analysis/constants.py +++ b/src/ngs_analysis/constants.py @@ -1,61 +1,23 @@ from collections import defaultdict -import os -from pathlib import Path -import string -try: - HOME = Path(os.environ['HOME']) -except KeyError: - HOME = Path('/home/dfeldman') +# user-provided +config_yaml = 'config.yaml' # how to parse and compare +sample_table = 'samples.csv' # NGS samples +reference_dna_table = 'reference_dna.csv' # designed insert DNA +sample_plan_table = 'sample_plan.csv' # cloning plan -JOBLIB_CACHE = HOME / '.joblib' +working_directories = '0_paired_reads', '1_reads', '2_parsed', '3_mapped', '3_mapped/map' -RESOURCES = Path(__file__).parents[0] / 'resources' -RULE_SETS = RESOURCES / 'rule_sets.csv' -VISITOR_FONT_PATH = RESOURCES / 'visitor1.ttf' +# generated or provided +design_table = 'designs.csv' # designed DNA or protein parts -BQ_PROJECT_ID = 'bilf-350112' +ngmerge = 'NGmerge' +bwa = 'bwa' +mmseqs = 'mmseqs' -GO_TERM = 'GO_term' -GO = 'GO ID' -GO_SYNONYM = 'DB Object Synonym (|Synonym)' -GO_SYMBOL = 'DB Object Symbol' -GO_TERM_COUNTS = 'GO_term_counts' -SEARCH_KEYWORD = 'keyword' -GO_SYNONYM = 'DB Object Synonym (|Synonym)' +MMSEQS_KMER_AA = 6 +MMSEQS_KMER_DNA = 6 # TODO: problematic for short sequences? 
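+# note: a sequence shorter than k produces no k-mers at all, so it cannot
+# survive the prefilter; keep k at or below the shortest barcode/part mapped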
-GENE_ID = 'gene_id'
-GENE_SYMBOL = 'gene_symbol'
-
-HGNC = 'HGNC'
-UNIPROTKB = 'UniProtKB'
-ENSG = 'ENSG'
-GENE_ALIAS = 'gene_alias'
-RCSB = 'RCSB'
-
-biomart_columns = {'Gene stable ID': ENSG,
-                   'HGNC ID': HGNC,
-                   'NCBI gene ID': GENE_ID,
-                   'NCBI gene (formerly Entrezgene) ID': GENE_ID,
-                   'HGNC symbol': GENE_SYMBOL,
-                   'UniProtKB Gene Name ID': UNIPROTKB,
-                   }
-
-PTHR = 'PTHR'
-PTHR_SF = 'PTHR_SF'
-PTHR_FAMILY = 'PTHR_family'
-PTHR_SUBFAMILY = 'PTHR_subfamily'
-PTHR_CLASS_LIST = 'PC_list'
-
-pthr_columns = {
-    0: 'identifier',
-    2: PTHR_SF,
-    3: PTHR_FAMILY,
-    4: PTHR_SUBFAMILY,
-    8: PTHR_CLASS_LIST,
-}
-
-MZ_DOUBLE_SPACING = 0.5001917279701898
 AA_3 = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
         'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']
@@ -64,26 +26,6 @@
 AA_3_1 = dict(zip(AA_3, AA_1))
 AA_1_3 = dict(zip(AA_1, AA_3))
 
-skyline_columns = {
-    'Replicate': 'sample',
-    'Replicate Name': 'sample',
-    'Protein': 'short_name',
-    'Peptide': 'sequence',
-    'Peptide Retention Time': 'RTime',
-    'Normalized Area': 'peak_area',
-    'Total Area MS1': 'ms1_area',
-    'Best Retention Time': 'RTime',
-    'Min Start Time': 'RTime_start',
-    'Max End Time': 'RTime_end',
-    'Average Mass Error PPM': 'mass_error_ppm',
-    'Isotope Dot Product': 'idotp',
-    }
-
-
-PT02_BACKBONE_START = 'ATTCTCCTTGGAATTTGCCCTTTTTGAGTTTGGATCTTGGTTCAT'
-iON_BACKBONE_START = 'GACATTGATTATTGACTAGTTATTAATAGTAATCAAT'
-pET_BACKBONE_START = 'TAATACGACTCACTATAGGGGAATTGTGAGCGGATAACAATTCC'
-
 CODONS = {
     'TAA': '*',
     'TAG': '*',
@@ -155,8 +97,6 @@
 [CODONS_REVERSE[v].append(k) for k, v in CODONS.items()]
 CODONS_REVERSE = {k: '|'.join(v) for k, v in CODONS_REVERSE.items()}
 
-CHAIN_ALPHABET = string.ascii_uppercase + string.ascii_lowercase
-
 SLUGIFY_REPLACEMENTS = [
     ('<', 'lt'),
     ('<=', 'lte'),
diff --git a/src/ngs_analysis/load.py b/src/ngs_analysis/load.py
new file mode 100644
index 0000000..e3f8ea8
--- /dev/null
+++ b/src/ngs_analysis/load.py
@@ -0,0 +1,181 @@
+from glob import glob
+
+import pandas as pd
+import yaml
+
+from .constants import *
+from .types import *
+from .utils import *
+from .sequence import quick_translate
+
+
+def load_config():
+    with open(config_yaml, 'r') as fh:
+        return yaml.safe_load(fh)
+
+
+def load_reference_dna() -> ReferenceDna:
+    """Repeat entries are allowed (same sequence from a different
+    source), but `name` and `reference_dna` must correspond one-to-one.
+    """
+    df = (pd.read_csv(reference_dna_table)
+          .pipe(ReferenceDna)
+          .assign(reference_dna=lambda x: x['reference_dna'].str.upper())
+          )
+    # after collapsing repeated (name, sequence) pairs, a name must not map
+    # to two sequences, nor a sequence to two names
+    (df
+     .drop_duplicates(['name', 'reference_dna'])
+     .pipe(assert_unique, 'name')
+     .pipe(assert_unique, 'reference_dna')
+     )
+    return df
+
+
+def load_sample_plan() -> SamplePlan:
+    df = pd.read_csv(sample_plan_table).pipe(SamplePlan)
+    if 'source' in df:
+        df_ref = load_reference_dna()
+        if 'source' not in df_ref:
+            raise ValueError('Sample plan has source column but reference does not')
+        sample_sources = set(df['source'])
+        ref_sources = set(df_ref['source'])
+        if missing := sample_sources - ref_sources:
+            msg = ('Sample plan refers to sources not in '
+                   f'reference: {missing}')
+            raise ValueError(msg)
+
+    # TODO: check that samples are in samples.csv
+    return df
+
+
+def load_samples() -> Samples:
+    return pd.read_csv(sample_table).pipe(Samples)
+
+
+def load_designs() -> Designs:
+    """Add translations that are needed for comparison but not already present.
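+
+    For each compared field ending in `_aa`, the translation is derived
+    from the corresponding DNA column; providing both the DNA and amino
+    acid columns for the same field is an error.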
+ """ + df_designs = pd.read_csv(design_table) + fields, _ = get_comparison_fields() + for field in fields: + if field.endswith('_aa') and field[:-3] in df_designs: + if field in df_designs: + raise ValueError(f'Cannot provide both Dna and amino acid for {field}') + df_designs[field] = [quick_translate(x) for x in df_designs[field[:-3]]] + elif field not in df_designs: + raise ValueError(f'Cannot derive {field} from design table') + return df_designs.pipe(Designs) + + +def load_seqs_to_map(sample, simulate, field): + filenames = get_filenames(sample, simulate, field) + + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + + seqs_to_map = (pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + .rename(columns={field_no_aa: 'query'}) + .dropna() + ) + if aa: + seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] + + return seqs_to_map + + +def get_filenames(sample, simulate=False, field=None): + """Get filenames used throughout pipeline for this sample. + + Fastq files ending with .fastq, .fq., .fastq.gz, or .fq.gz are identified + based on the sample table. In simulate mode, the fastq name is identical + to the sample name. + """ + samples = load_samples()['sample'] + if sample not in samples.values: + raise ValueError(f'Sample {sample} not present in {sample_table}') + + if simulate: + search = f'1_reads/simulate/{sample}' + else: + sample_to_fastq = load_samples().set_index('sample')['fastq_name'].to_dict() + fastq_search = sample_to_fastq[sample] + search = f'1_reads/*{fastq_search}*' + + extensions = '.fastq', '.fq', '.fq.gz', '.fastq.gz' + def search_extensions(search): + return sum([glob(search + x) for x in extensions], []) + + r = search_extensions(search) + search_paired = search.replace('1_reads', '0_paired_reads') + r1 = search_extensions(search_paired + '_R1') + r2 = search_extensions(search_paired + '_R2') + paired_reads_found = len(r1) == 1 & len(r2) == 1 + + if len(r1) > 1 or len(r2) > 1: + raise ValueError(f'Too many paired read files for sample {sample}, found: {r1 + r2}') + + if len(r) > 1: + raise ValueError(f'Too many read files for sample {sample}, found: {r}') + + if not (paired_reads_found or len(r) == 1): + msg = [ + f'Neither paired reads nor single reads provided for ' + f'sample {sample}, searches returned:', + f'{search_paired}: {nglob(search_paired)}', + f'{search}: {nglob(search)}', + ] + raise ValueError('\n'.join(msg)) + + add_simulate = 'simulate/' if simulate else '' + filenames = { + 'reads': f'1_reads/{add_simulate}{sample}.fastq', + 'parsed': f'2_parsed/{add_simulate}{sample}.parsed.pq', + 'mapped': f'3_mapped/{add_simulate}{sample}.mapped.csv', + } + if paired_reads_found: + filenames |= {'paired_r1': r1[0], 'paired_r2': r2[0]} + + if field: + filenames['map query'] = f'3_mapped/map/{add_simulate}{sample}_{field}.fa' + filenames['map mmseqs target db'] = f'3_mapped/map/{field}.target.db' + filenames['map mmseqs target fa'] = f'3_mapped/map/{field}.fa' + filenames['map mmseqs query db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.query.db' + filenames['map mmseqs result db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.result.db' + filenames['map mmseqs prefilter db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.prefilter.db' + filenames['map mmseqs prefilter tsv'] = f'3_mapped/map/{add_simulate}{sample}_{field}.prefilter.tsv' + filenames['map mmseqs result m8'] = f'3_mapped/map/{add_simulate}{sample}_{field}.result.m8' + filenames['map result'] = 
+
+    return filenames
+
+
+def get_comparison_fields():
+    """Parse fields for direct matching and field pairs for inferred matching.
+    """
+    direct_match, inferred_match = set(), set()
+    config = load_config()
+    for c in config['compare']:
+        if '->' in c:
+            a, b = [x.strip() for x in c.split('->')]
+            direct_match |= {a}
+            inferred_match |= {(a, b)}
+        else:
+            direct_match |= {c.strip()}
+
+    return direct_match, inferred_match
+
+
+def load_data_for_plotting(samples, simulate=False):
+    samples = pd.Series(list(samples)).drop_duplicates()
+
+    arr = []
+    for sample in samples:
+        f = get_filenames(sample, simulate=simulate)['mapped']
+        arr += [pd.read_csv(f).assign(sample=sample)]
+    df_mapped = pd.concat(arr).reset_index(drop=True)
+
+    direct, indirect = get_comparison_fields()
+    fields = direct | set(y for x in indirect for y in x)
+
+    return df_mapped, fields
+
+
+def reference_contains_source():
+    return 'source' in pd.read_csv(reference_dna_table, nrows=1)
diff --git a/src/ngs_analysis/plot.py b/src/ngs_analysis/plot.py
new file mode 100644
index 0000000..190617c
--- /dev/null
+++ b/src/ngs_analysis/plot.py
@@ -0,0 +1,166 @@
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+from natsort import natsorted
+
+from .load import *
+from .timer import Timer
+
+
+def plot_all(samples, output_prefix='figures/', fuzzy_distance=15):
+    # TODO: add config parsing for plot options
+    os.makedirs(os.path.dirname(output_prefix), exist_ok=True)
+    mapped_counts = prepare_mapped_counts(samples)
+
+    df_ref = load_reference_dna()
+    # names that occur exactly once across the reference
+    unique_names = df_ref['name'].drop_duplicates(keep=False)
+
+    if reference_contains_source():
+        for field, matches in mapped_counts.items():
+            plot_crossmapping(field, matches, output_prefix, unique_names)
+            plot_abundances_by_sample(field, matches, output_prefix, df_ref)
+
+
+def divide_matches(matches, unique_names):
+    # total reads per (sample, source); higher is better
+    priority = (matches
+                .groupby(['sample', 'source'])
+                ['read_count'].sum().rename('priority')
+                .reset_index()
+                )
+
+    # keep matches to the dominant source
+    # (removes matches to duplicates across sources)
+    dominant_matches = (matches
+                        .merge(priority)
+                        .sort_values('priority', ascending=False)
+                        .drop_duplicates(['sample', 'name'])
+                        )
+
+    unique_matches = (matches
+                      .loc[lambda x: x['name'].isin(unique_names)]
+                      .assign(read_fraction=lambda x:
+                              x['read_fraction']
+                              / x.groupby('sample')['read_fraction'].transform('sum'))
+                      )
+
+    return dominant_matches, unique_matches
+
+
+def plot_crossmapping(field, matches, output_prefix, unique_names):
+    dominant_matches, unique_matches = divide_matches(
+        matches, unique_names)
+
+    msg = f'Plotting sample vs. source for {field}...'
+    with Timer(verbose=msg):
+        fig, ax, df_plot = heatmap_sample_vs_source(dominant_matches)
+        ax.set_title(f'Read fraction for\n{field}')
+        f = f'{output_prefix}heatmap_sample_vs_source_{field}'
+        fig.savefig(f, bbox_inches='tight')
+        plt.close(fig)
+        df_plot.to_csv(f + '.csv', index=None)
+
+        fig, ax, df_plot = heatmap_sample_vs_source(unique_matches)
+        ax.set_title(f'Unique read fraction for\n{field}')
+        f = f'{output_prefix}heatmap_sample_vs_source_{field}_unique'
+        fig.savefig(f, bbox_inches='tight')
+        plt.close(fig)
+        df_plot.to_csv(f + '.csv', index=None)
+
+
+def plot_abundances_by_sample(field, matches, output_prefix,
+                              df_reference):
+    msg = f'Plotting abundance by sample for {field}...'
+ with Timer(verbose=msg): + source_counts = (df_reference + .drop_duplicates(['source', 'name']) + .groupby('source').size()) + fig = facet_abundances_by_sample(matches, source_counts) + f = f'{output_prefix}abundance_by_sample_{field}' + fig.savefig(f, bbox_inches='tight') + plt.close(fig) + matches.to_csv(f + '.csv', index=None) + + +def prepare_mapped_counts(samples, minimum_distance=None): + df_mapped, fields = load_data_for_plotting(samples, simulate=True) + + if minimum_distance is None: + minimum_distance = {} + + df_reference = load_reference_dna().drop('reference_dna', axis=1) + + cols = ['sample', 'name'] + if reference_contains_source(): + cols += ['source'] + + # prepare data + mapped_counts = {} + for field in fields: + cutoff = minimum_distance.get(field, 0) + keep = df_mapped[f'{field}_distance'] <= cutoff + mapped_counts[field] = (df_mapped[keep] + .rename(columns={f'{field}_match': 'name'}) + .groupby(['sample', 'name']) + .size().rename('read_count').reset_index() + .assign(read_fraction=lambda x: + x['read_count'] / x.groupby('sample')['read_count'].transform('sum')) + .merge(df_reference) + ) + + return mapped_counts + + +def heatmap_sample_vs_source(matches): + df_plot = (matches + .pivot_table(index='sample', columns='source', + values='read_fraction', aggfunc='sum') + .fillna(0)) + + padding = np.array([1, 1.5]) + scaling = np.array([0.5, 0.5]) + figsize = padding + scaling * df_plot.shape + fig, ax = plt.subplots(figsize=figsize) + (df_plot + .pipe(sns.heatmap, annot=True, xticklabels=True, yticklabels=True, + cbar=False, ax=ax) + ) + plt.yticks(rotation=0) + plt.xticks(rotation=30) + + return fig, ax, df_plot + + +def facet_abundances_by_sample(matches, source_counts): + + def plot(data, label, color): + data = sorted(data['read_count'])[::-1] + ax = plt.gca() + ax.plot(np.arange(len(data)) + 1, data, color=color, label=label, marker='.') + x = source_counts[label] + ax.plot([x, x], [1, 100], color=color, ls=':', label='# designs') + + hue_order = natsorted(set(matches['source'])) + row_order = natsorted(set(matches['sample'])) + fg = (matches + .pipe(sns.FacetGrid, + row='sample', row_order=row_order, + hue='source', hue_order=hue_order, + height=2, aspect=2, sharex=False) + .map_dataframe(plot) + .add_legend() + ) + + fg.axes.flat[-1].set_xlabel('Rank'); + for ax in fg.axes.flat[:]: + ax.set_ylabel('Read count') + ax.set_yscale('log') + + return fg.fig \ No newline at end of file diff --git a/src/ngs_analysis/sequence.py b/src/ngs_analysis/sequence.py index 3a3f9a2..4055a55 100644 --- a/src/ngs_analysis/sequence.py +++ b/src/ngs_analysis/sequence.py @@ -1,6 +1,4 @@ import gzip -import os -import re from collections import defaultdict from glob import glob @@ -8,7 +6,7 @@ import pandas as pd from natsort import natsorted -from .constants import RESOURCES, CODONS, CODONS_REVERSE +from .constants import CODONS, CODONS_REVERSE watson_crick = {'A': 'T', @@ -175,23 +173,6 @@ def translate_dna(s): return ''.join([CODONS[s[i*3:(i+1)*3]] for i in range(int(len(s)/3))]) -def load_codons(organism): - f = os.path.join(RESOURCES, 'codon_usage', 'organisms.csv') - taxids = pd.read_csv(f).set_index('organism')['taxid'].to_dict() - - if organism.lower() == 'yeast': - organism = 's_cerevisiae' - organism = organism.lower().replace('.', '').replace(' ', '_') - - try: - table = f'{organism}_{taxids[organism]}.csv' - except KeyError: - raise ValueError(f'{organism} must be one of {list(taxids.keys())}') - f = os.path.join(RESOURCES, 'codon_usage', table) - return (pd.read_csv(f) - 
.assign(codon_dna=lambda x: x['codon'].str.replace('U', 'T'))) - - def make_equivalent_codons(dna_to_aa): """Make dictionary from codon to other codons for the same amino acid """ @@ -214,154 +195,6 @@ def reverse_complement(seq): return ''.join(watson_crick[x] for x in seq)[::-1] -def sanger_database(drive): - df_sanger = drive.get_excel('cloning/sanger') - - extra_cols = [x for x in df_sanger.columns - if x not in ('identifier', 'search')] - - arr = [] - for _, row in df_sanger.iterrows(): - files = natsorted(glob(row['search'])) - if len(files) == 0: - print(f'No files found from row {row.to_dict()}') - continue - (pd.DataFrame({'file': files}) - .assign(**{x: row[x] for x in extra_cols}) - .assign(name=lambda x: x['file'].str.extract(row['identifier'])) - .assign(seq=lambda x: x['file'].apply(read_ab1)) - .assign(seq_rc=lambda x: x['seq'].apply(reverse_complement)) - .pipe(arr.append) - ) - - cols = extra_cols + ['name', 'file', 'seq', 'seq_rc'] - return pd.concat(arr)[cols] - - -def read_ab1(f): - from Bio import SeqIO - with open(f, 'rb') as fh: - records = list(SeqIO.parse(fh, 'abi')) - assert len(records) == 1 - seq = str(records[0].seq) - return seq - - -def print_alignment(a, b, width=60, as_string=False): - """Levenshtein alignment. - """ - import edlib - alignment = edlib.align(a, b, task='path') - d = edlib.getNiceAlignment(alignment, a, b) - lines = [] - for i in range(0, max(map(len, d.values())), width): - lines += [str(i)] - for x in d.values(): - lines += [x[i:i+width]] - - txt = '\n'.join(lines) - if as_string: - return txt - else: - print(txt) - - -def reverse_translate_max(aa_seq, organism='e_coli'): - if organism not in codon_maps: - codon_maps[organism] = (load_codons(organism) - .sort_values('relative_frequency', ascending=False) - .drop_duplicates('amino_acid') - .set_index('amino_acid')['codon_dna'].to_dict() - ) - codon_map = codon_maps[organism] - return ''.join([codon_map[x] for x in aa_seq]) - - -def reverse_translate_random(aa_seq, organism='e_coli', rs='input', cutoff=0.12): - if rs == 'input': - seed = hash(aa_seq) % 10**8 - rs = np.random.RandomState(seed=seed) - if (organism, cutoff) not in codon_maps: - codon_maps[(organism, cutoff)] = (load_codons(organism) - .query('relative_frequency > @cutoff') - .groupby('amino_acid')['codon_dna'].apply(list).to_dict() - ) - codon_map = codon_maps[(organism, cutoff)] - return ''.join([rs.choice(codon_map[x]) for x in aa_seq]) - - -def get_genbank_features(f, error_on_repeat=True): - from Bio import SeqIO - records = list(SeqIO.parse(open(f,'r'), 'genbank')) - if len(records) != 1: - raise ValueError(f'found {len(records)} records in genbank {f}') - - features = {} - for f in records[0].features: - label = f.qualifiers['label'][0] - seq = f.extract(records[0].seq) - if label in features and features[label] != seq and error_on_repeat: - raise ValueError(f'repeated feature {label}') - features[label] = str(seq) - return features - - -def rolling_gc(seq, window): - from scipy.ndimage.filters import convolve - gc_window = 20 - return convolve(np.array([x in 'GC' for x in seq])*1., - np.ones(window)/window, - mode='reflect') - - -def to_codons(seq): - assert len(seq) % 3 == 0 - return [seq[i*3:(i+1)*3] for i in range(int(len(seq)/3))] - - -def codon_adaptation_index(seq, organism='e_coli'): - return (load_codons(organism) - .assign(w=lambda x: x.groupby('amino_acid')['relative_frequency'] - .transform(lambda y: y / y.max())) - .set_index('codon_dna') - .loc[to_codons(seq)]['w'] - .pipe(lambda x: np.prod(x)**(1/len(x))) - 
) - - -def compare_sequences(sequences, window=25, k=6): - import matplotlib.pyplot as plt - - fig, (ax0, ax1) = plt.subplots(figsize=(12, 4), ncols=2) - for name in sequences: - cai = codon_adaptation_index(sequences[name]) - mean_gc = np.mean([x in 'GC' for x in sequences[name]]) - gc_trace = rolling_gc(sequences[name], window) - label = f'{name}: avg={mean_gc:.2g} std={np.std(gc_trace):.2g} cai={cai:.2g}' - ax0.plot(gc_trace, label=label) - - (pd.Series(get_kmers(sequences[name], k)) - .value_counts().value_counts().sort_index() - .pipe(lambda x: x/x.sum()) - .plot(ax=ax1, marker='.', ms=10, label=name)) - - - ax0.plot([0, len(gc_trace)], [0.5, 0.5], color='gray', lw=1, ls='--', zorder=-1) - ax0.legend() - ax0.set_title(f'average GC content over {window} nt window') - ax0.set_ylabel('GC fraction') - ax0.set_xlabel('DNA sequence position') - ax0.set_ylim([0.25, 0.85]) - ax0.set_xlim([0, len(gc_trace)]) - - ax1.set_title(f'repeated kmers with k={k}') - ax1.set_xlabel('kmer count') - ax1.set_ylabel(f'fraction of kmers') - ax1.set_xticks([1,2,3,4,5]) - ax1.legend() - - return fig - def get_kmers(s, k): n = len(s) @@ -552,35 +385,6 @@ def match_and_check(queries, reference, window, k, ignore_above=40, progress=lam return df_matched -def load_abi_zip(filename): - """Extract Bio.SeqIO records from sanger zip file. - """ - import zipfile - from io import BytesIO - - from Bio import SeqIO - zh = zipfile.ZipFile(filename, 'r') - arr = [] - for zi in zh.filelist: - if not zi.filename.endswith('ab1'): - print(f'Skipping {zi.filename}') - continue - fh = zh.open(zi.filename, 'r') - buffer = BytesIO(fh.read()) - arr += [SeqIO.read(buffer, 'abi')] - return arr - - -def get_abi_traces(abi_record): - channels = 'DATA9', 'DATA10', 'DATA11', 'DATA12' - bases = list('GATC') - traces = np.array([abi_record.annotations['abif_raw'][c] - for c in channels]) - df = pd.DataFrame(traces).T - df.columns = bases - return df - - def try_translate_dna(s): try: return translate_dna(s) @@ -588,22 +392,6 @@ def try_translate_dna(s): return None -def digest_protein_fasta(filename, digest_pat='[R|K]'): - """Convert protein fasta into a table of peptides, digesting with provided regex. - """ - records = read_fasta(filename) - - arr = [] - for name, seq in records: - parts = re.split(f'({digest_pat})', seq) - parts = [x for x in parts if x] - peptides = [a+b for a,b in zip(parts[::2], parts[1::2])] - for peptide in peptides: - arr += [{'name': name, 'sequence': peptide}] - - return pd.DataFrame(arr) - - def findone(aa, dna): """Simple case of amino acid substring in in-frame DNA. """ @@ -625,85 +413,6 @@ def select_most_different(xs, n): return arr -def edge_primers(dna, melt_temp): - from Bio.SeqUtils.MeltingTemp import Tm_NN - dna_rev = reverse_complement(dna) - - fwd = dna[:10] - while Tm_NN(fwd) < melt_temp: - fwd = dna[:len(fwd) + 1] - - rev = dna_rev[:10] - while Tm_NN(rev) < melt_temp: - rev = dna_rev[:len(rev) + 1] - - return fwd, rev - - -def find_orfs(seq, kozak='GCCACC', return_aa=True): - """Find open reading frames starting with kozak in circular - DNA sequence. 
- """ - plasmid = str(seq) - pat = f'{kozak}(ATG(?:...)*?)(?:TAA|TAG|TGA)' - ref = plasmid + plasmid - orfs = re.findall(pat, ref) + re.findall(pat, reverse_complement(ref)) - if return_aa: - orfs = [translate_dna(x) for x in orfs] - return sorted(set(orfs), key=lambda x: -1 * len(x)) - - -def find_longest_orf(dna, kozak='GCCACC'): - pat = f'{kozak}(ATG(?:...)*?)(?:TAA|TGA|TAG)' - orfs = re.findall(pat, dna.upper()) - if orfs: - return sorted(orfs, key=len)[-1] - - -def add_features(record, features): - """Add features to `record` based on name=>DNA dictionary `features`. - A feature with "orf" in the name is given a special annotation that - shows up as a translation in benchling. - """ - - vector = str(record.seq).upper() - - n = len(vector) - - features = dict(features) - arr = [] - for name, feature in features.items(): - feature = feature.upper() - m = len(feature) - - for strand in (-1, 1): - key = reverse_complement(feature) if strand == -1 else feature - starts = [x.start() for x in re.finditer(key, vector * 2)] - starts = [x for x in starts if x < n] - for start in starts: - end = start + m - qualifiers = dict(label=name) - feature_type = 'misc' - if 'orf' in name: - feature_type = 'CDS' - qualifiers['translation'] = translate_dna(key) + '*' - end += strand * 3 - - if end < n: - location = FeatureLocation(start, end, strand=strand) - else: - f1 = FeatureLocation(start, n, strand=strand) - f2 = FeatureLocation(0, end % n, strand=strand) - location = f1 + f2 - arr += [SeqFeature(location, type=feature_type, qualifiers=qualifiers)] - - # copies features but not annotations - new_record = record[:] - new_record.annotations = record.annotations - new_record.features += arr - return new_record - - def translate_to_stop(x): if not isinstance(x, str): return @@ -715,123 +424,11 @@ def translate_to_stop(x): return -def ion_plasmid_integration(filename): - """Generate plasmid map after integration for an iON vector - :param filename: iON vector genbank - """ - left_itr = 'CCCTAGAAAGATAGTCTGCGTAAAATTGACGCATG'.upper() - right_itr = 'ccctagaaagataatcatattgtgacgtacgttaaagataatcatgcgtaaaattgacgcatg'.upper() - - record = list(SeqIO.parse(filename, 'genbank'))[0] - original_features = get_genbank_features(filename, error_on_repeat=False) - # remove annoying small features - original_features = {k: v for k,v in original_features.items() if 6 < len(v)} - - dna = str(record.seq) - start = dna * 2 - # in iON figure 1A notation - # partial, one, two, one, partial - _, one, two, _, _ = split_by_regex(f'{left_itr}|{right_itr}', start, keep='prefix') - - # deduplicate - one, two = ['AA' + x[:-2] for x in (one, two)] - result = two + reverse_complement(one) - - result_record = SeqRecord(Seq(result), name=record.name + '_integrated', - annotations={'molecule_type': 'DNA'}) - result_record = add_features(result_record, original_features) - - return result_record - - -def remove_restriction_sites(dna, sites, rs): - codon_sequence = to_codons(dna) - sites = set(sites) - for site in list(sites): - sites.add(reverse_complement(site)) - - for site in sites: - width = len(site) - dna = ''.join(codon_sequence) - if site not in dna: - continue - - for i in range(len(dna)): - # if we encounter a restriction site - if dna[i:i + len(site)] == site: - # change any of these codons - overlapped_codons = sorted( - set([int((i + offset) / 3) for offset in range(width)])) - # accept first change that removes restriction site - for j in overlapped_codons: - # change this codon - new_codon = swap_codon(codon_sequence[j], rs) - 
local_dna = ''.join([new_codon if k == j else codon_sequence[k] - for k in overlapped_codons]) - # if codon removes this site, keep it - if site not in local_dna: - codon_sequence = codon_sequence[:j] + \ - [new_codon] + codon_sequence[j + 1:] - break - dna = ''.join(codon_sequence) - for site in sites: - assert site not in dna - - return dna - - -def restriction_sites_in_dna(dna, sites): - sites = set(list(sites) + [reverse_complement(x) for x in sites]) - for site in sites: - if site in dna: - return True - return False - - def to_codons(dna): assert len(dna) % 3 == 0 return [dna[i * 3:(i + 1) * 3] for i in range(int(len(dna) / 3))] -def swap_codon(codon, rs): - """Swap codon at random, if possible. - """ - options = equivalent_codons[codon] - if not options: - return codon - return rs.choice(options) - - -def remove_restriction_sites_dnachisel(cds, avoid, rs=None): - if not restriction_sites_in_dna(cds, avoid): - return cds - - import dnachisel as dc - - if rs is None: - np.random.seed(0) - else: - np.random.seed(int(1e9*rs.rand())) - - n = len(cds) - - constraints = [dc.AvoidPattern(x, location=0) for x in avoid] - constraints += [dc.EnforceTranslation(location=(0, n))] - - problem = dc.DnaOptimizationProblem( - sequence=cds, - constraints=constraints, - objectives=[dc.AvoidChanges()], - logger=None - ) - problem.max_iters = 50 - problem.resolve_constraints() - problem.optimize() - - assert not restriction_sites_in_dna(problem.sequence, avoid) - return problem.sequence - - def pairwise_levenshtein(seqs): """Calculate distance matrix for a list of sequences. """ @@ -847,44 +444,6 @@ def pairwise_levenshtein(seqs): return D + D.T -def generate_adapters(priming_length=12, enzyme='BsmBI', tm_range=2, target_GC=0.5, - max_calculate=10_000, num_sample=100_000, distance_threshold=6, seed=0): - """Tm is calculated for priming site plus restriction site. A target Tm is defined - based on the average of sequences within 10% of target_GC. - """ - from Levenshtein import distance - from scipy.sparse import csr_matrix - from .utils import maxy_clique_groups - from Bio.SeqUtils import MeltingTemp as mt - - enzymes = {'BsmBI': 'CGTCTC'} - site = enzymes[enzyme] - avoid = 'AAA', 'GGG', 'CCC', 'TTT', site, reverse_complement(site) - rs = np.random.RandomState(seed) - candidates = [''.join(x) for x in rs.choice(list('ACTG'), size=(num_sample, priming_length))] - candidates = [x for x in candidates if not any(y in x for y in avoid)] - - df_adapters = pd.DataFrame({'sequence': candidates}) - - df_adapters['Tm'] = (df_adapters['sequence'] + site).apply(mt.Tm_NN) - df_adapters['GC'] = df_adapters['sequence'].str.count('G|C') / df_adapters['sequence'].str.len() - - mean_tm = df_adapters.query('abs(GC - @target_GC) < 0.1')['Tm'].mean() - df_adapters = df_adapters.query('abs(Tm - @mean_tm) < @tm_range/2') - - seqs = df_adapters['sequence'][:max_calculate] - print(f'Calculating pairwise distances for {len(seqs):,} priming sites') - D = pairwise_levenshtein(seqs) - - print(f'Selecting priming sites with pairwise distance >= {distance_threshold}') - - A = distance_threshold <= D - cm = csr_matrix(1-A, shape=A.shape) - centers = np.array(maxy_clique_groups(cm, [1] * A.shape[0])) - - return df_adapters['sequence'].iloc[centers].pipe(list) - - def kmers_unique(s, k): kmers = get_kmers(s, k) return len(kmers) == len(set(kmers)) @@ -912,8 +471,9 @@ def find_aa_in_dna(aa, dna): return dna[i * 3:(i + len(aa)) * 3] -def sequence_sap_score(x): - """Sum of SAP weights for all residues, regardless of solvent exposure. 
- """ - from .ppi.constants import SAP_WEIGHTS - return sum([SAP_WEIGHTS[aa] for aa in x]) \ No newline at end of file +def quick_translate(seq): + # return str(quickdna.DnaSequence(seq).translate()) + n = len(seq) - len(seq) % 3 + return translate_dna(seq[:n]) + + \ No newline at end of file diff --git a/src/ngs_analysis/types.py b/src/ngs_analysis/types.py new file mode 100644 index 0000000..5de8e74 --- /dev/null +++ b/src/ngs_analysis/types.py @@ -0,0 +1,46 @@ +from pandera.typing import Series +import pandera as pa +from typing import Optional + + +# TODO: joint uniqueness, DNA characters +# currently handled in load_reference_dna +class ReferenceDna(pa.DataFrameModel): + source: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) + name: Series[str] = pa.Field(nullable=False) + reference_dna: Series[str] = pa.Field(nullable=False) + + +# simulate based on reference sources +# could also be used to generate comparison of expected vs actual results +class SamplePlan(pa.DataFrameModel): + sample: Series[str] = pa.Field(nullable=False, coerce=True) + source: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) + coverage: Optional[Series[float]] = pa.Field(nullable=False, coerce=True) + + +# simulate based on input sequences +class DnaPlan(pa.DataFrameModel): + sample: Series[str] = pa.Field(nullable=False, coerce=True) + reference_dna: Series[str] = pa.Field(nullable=False, unique=True) + + +class Samples(pa.DataFrameModel): + sample: Series[str] = pa.Field(nullable=False, unique=True, + coerce=True) + fastq_name: Series[str] = pa.Field(nullable=False, unique=True, + coerce=True) + + +# build this by using config.yaml to parse ReferenceDna +class Designs(pa.DataFrameModel): + source: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) + name: Series[str] = pa.Field(nullable=False, coerce=True) + + +class Candidates(pa.DataFrameModel): + read_index: Series[int] = pa.Field(nullable=False) + query: Series[str] = pa.Field(nullable=False) + reference: Series[str] = pa.Field(nullable=False) + reference_name: Series[str] = pa.Field(nullable=False) +
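
Usage sketch for the new pandera models (a minimal example; it assumes a
samples.csv shaped like the one in examples/paired_reads). Models are applied
via DataFrame.pipe, mirroring load_samples in load.py:

    import pandas as pd
    from ngs_analysis.types import Samples

    df = pd.read_csv('samples.csv')  # columns: sample, fastq_name
    df = df.pipe(Samples)            # coerces both columns to str; raises
                                     # pandera.errors.SchemaError on duplicates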