From 0f809e05527c004ea86a895481a67c92bd1ba08a Mon Sep 17 00:00:00 2001
From: David Feldman
Date: Thu, 28 Dec 2023 11:52:31 -0800
Subject: [PATCH] add

---
 .gitignore                      |   2 +
 README.md                       |  26 +-
 environment.yaml                |  19 +
 examples/.DS_Store              | Bin 0 -> 6148 bytes
 install.sh                      |   7 +
 pyproject.toml                  |  41 ++
 src/ngs_analysis/app.py         | 717 +++++++++++++++++++++++++
 src/ngs_analysis/constants.py   | 167 ++++++
 src/ngs_analysis/interactive.py |  23 +
 src/ngs_analysis/sequence.py    | 915 ++++++++++++++++++++++++++++++++
 src/ngs_analysis/timer.py       | 112 ++++
 11 files changed, 2028 insertions(+), 1 deletion(-)
 create mode 100644 environment.yaml
 create mode 100644 examples/.DS_Store
 create mode 100644 install.sh
 create mode 100644 pyproject.toml
 create mode 100644 src/ngs_analysis/app.py
 create mode 100644 src/ngs_analysis/constants.py
 create mode 100644 src/ngs_analysis/interactive.py
 create mode 100644 src/ngs_analysis/sequence.py
 create mode 100644 src/ngs_analysis/timer.py

diff --git a/.gitignore b/.gitignore
index 68bc17f..b9180f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+examples/nanopore
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/README.md b/README.md
index 760727b..3e96fbf 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,25 @@
-# ngs-analysis
\ No newline at end of file
+# ngs-analysis
+
+A package for analyzing sequencing reads that span multiple DNA or protein parts. For instance, given a library of protein variants linked to DNA barcodes, it can answer questions like:
+
+- How accurate are the variant sequences, at the DNA or protein level?
+- How frequently is the same barcode linked to two different variants?
+- Which reads contain the parts required for function (e.g., a Kozak start sequence or a fused protein tag)?
+
+This kind of analysis often involves parsing raw sequencing reads for DNA and/or protein sub-sequences (parts), then mapping the parts to a reference of anticipated part combinations. This package offers a simple workflow (see Usage below):
+
+1. Define how to parse reads into parts using plain text expressions (no code)
+2. Test the parser on simulated DNA sequences (e.g., your vector map)
+3. Parse a batch of sequencing samples
+4. Map the (combination of) parts found in each read to your reference
+
+It has been tested with Illumina paired-end reads and Oxford Nanopore long reads. Under the hood it uses [NGmerge](https://github.com/jsh58/NGmerge) to merge paired reads and [MMseqs2](https://github.com/soedinglab/MMseqs2) for sequence mapping. It is moderately performant: 1 million paired-end reads can be mapped to a reference of 100,000 variant-barcode pairs in ~1 minute.
+
+# Installation
+
+```bash
+pip install ngs-analysis
+```
+
+Tested on Linux and macOS (Apple Silicon).
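+
+# Usage
+
+A minimal sketch of the workflow above. It assumes `config.yaml`, `samples.csv`, and `possible_dna.csv` already exist in the working directory, and the sample name `sample_1` is illustrative (it must match a row in `samples.csv`). The same steps are available as python-fire subcommands when `app.py` is run as a script.
+
+```python
+from ngs_analysis import app
+
+app.setup()                       # create working directories, check inputs
+app.dna_to_designs()              # parse possible_dna.csv into designs.csv
+app.merge_read_pairs('sample_1')  # merge paired-end fastqs with NGmerge
+app.parse_reads('sample_1')       # extract DNA/protein parts from each read
+app.map_parsed_reads('sample_1')  # match parts against designs.csv (MMseqs2 + Levenshtein)
+```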
+ diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..102a76d --- /dev/null +++ b/environment.yaml @@ -0,0 +1,19 @@ +name: ngs-analysis +channels: + - conda-forge + - bioconda +dependencies: + - fire + - glob2 + - ipykernel + - natsort + - pandas + - pandera + - parse + - pytables + - python-levenshtein + - python-slugify + - pyyaml + - regex + - tqdm + - pyarrow diff --git a/examples/.DS_Store b/examples/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d0cf13a6e61dbfbe35ec1f25a40b4018f124e175 GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8-BN@c6nYGJEm&(Uh?fxS3mDOZN=-=6V9b^zwTDv3SzpK}@p+ut z-GId$Jc-yDnEhsVW_B|lWPccA+?|Jqj9H8^0gA}cs1Y2lbu~;dB3E7 zndmPL;kOHxG9Og3FW(;|G63IuFiYaR7z{pmquJWpZi}|)iaYO7F1*4o=DF)&45`M5YhmtMaONLSldz zAO?tm&0)Zt0iwG(B~zuu05R}G25^6{K@lB;rAD=NK!ew3^tTXEz{a-(qHWMISZV|h z2-m5AI+dFz2G{8@ZkssAV5w23Gp<&KdCbby=61.0"] +build-backend = "setuptools.build_meta" + +[project.urls] +"Homepage" = "https://github.com/feldman4/plasmid-design" +"Bug Tracker" = "https://github.com/feldman4/plasmid-design/issues" + +[project.scripts] +ngs_analysis = "ngs_analysis.app:main" + diff --git a/src/ngs_analysis/app.py b/src/ngs_analysis/app.py new file mode 100644 index 0000000..97114ae --- /dev/null +++ b/src/ngs_analysis/app.py @@ -0,0 +1,717 @@ +"""Analyze sequencing reads and compare DNA and protein parts to a reference. + +Pattern-matching is used to define parts (e.g., designs, +variable linkers, fixed sequences, and barcodes). Identified parts can be +mapped to inferred reference sequences (e.g., design inferred +from barcode). See README for details. +""" +import os +import re +import subprocess +import tempfile +from glob import glob +from subprocess import DEVNULL +from typing import Optional, Tuple + +import fire +import Levenshtein +import numpy as np +import pandas as pd +import pandera as pa +import yaml +from pandera.typing import Series + +# import quickdna +from .sequence import (read_fasta, read_fastq, reverse_complement, + translate_dna, write_fake_fastq, write_fasta) +from .timer import Timer + +# user-provided +config_yaml = 'config.yaml' # how to parse and compare +sample_table = 'samples.csv' # NGS samples +possible_dna_table = 'possible_dna.csv' # designed insert DNA +planned_dna_table = 'planned_dna.csv' # cloning plan + +working_directories = '0_paired_reads', '1_reads', '2_parsed', '3_mapped', '3_mapped/map' + +# generated or provided +design_table = 'designs.csv' # designed DNA or protein parts + +ngmerge = 'NGmerge' +bwa = 'bwa' +mmseqs = 'mmseqs' + +MMSEQS_KMER_AA = 6 +MMSEQS_KMER_DNA = 12 + +### TYPES +# TODO: joint uniqueness +class PossibleDNA(pa.DataFrameModel): + subpool: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) + name: Series[str] = pa.Field(nullable=False, unique=True) + vector_dna: Series[str] = pa.Field(nullable=False, unique=True) + + +class PlannedDNA(pa.DataFrameModel): + sample: Series[str] = pa.Field(nullable=False, coerce=True) + vector_dna: Series[str] = pa.Field(nullable=False) + + +class Samples(pa.DataFrameModel): + sample: Series[str] = pa.Field(nullable=False, unique=True, + coerce=True) + fastq_name: Series[str] = pa.Field(nullable=False, unique=True, + coerce=True) + + +# build this by using config.yaml to parse PossibleDNA +class Designs(pa.DataFrameModel): + subpool: Optional[Series[str]] = pa.Field(nullable=False, coerce=True) + name: Series[str] = pa.Field(nullable=False, coerce=True) + + +class Candidates(pa.DataFrameModel): + read_index: Series[int] = 
pa.Field(nullable=False) + query: Series[str] = pa.Field(nullable=False) + reference: Series[str] = pa.Field(nullable=False) + reference_name: Series[str] = pa.Field(nullable=False) + + +### MAIN STEPS +def setup(): + print('Creating directories...') + [os.makedirs(f'{d}/simulate', exist_ok=True) for d in working_directories] + + print('Checking config...') + # TODO: actual type check for the config + load_config() + + n = len(load_possible_dna()) + print(f'Found {n} reference sequences in {possible_dna_table}') + + # TODO: check that the input files exist + n = len(load_samples()) + print(f'Found {n} samples in {sample_table}') + + + +# ok +def dna_to_designs(): + """Generate a design table by parsing the possible DNA sequences. + """ + df_possible = load_possible_dna() + config = load_config() + parsed = (parse_sequences(config, df_possible['vector_dna']) + .drop('read_index', axis=1)) + m, n = len(parsed), len(df_possible) + if m != n: + raise ValueError(f'failed to parse {n-m}/{n} sequences') + + print(f'Parsed {len(parsed):,} sequences from {possible_dna_table}') + describe_parsed(parsed) + + pd.concat([ + df_possible.drop('vector_dna', axis=1), + parsed, + ], axis=1).to_csv(design_table, index=None) + print(f'Wrote {n:,} rows to {design_table}') + +# ok +def simulate_fastqs(read_lengths: Tuple[int, int]=(300, 300)): + """Create simulated fastq files based on planned samples. + + :param read_lengths: forward and reverse read lengths + """ + df_planned = load_planned_dna() + os.makedirs('1_reads/simulate', exist_ok=True) + + for sample, df in df_planned.groupby('sample', sort=False): + r1 = df['vector_dna'].str[:read_lengths[0]] + r2 = df['vector_dna'].apply(reverse_complement).str[:read_lengths[1]] + + write_fake_fastq(f'1_reads/simulate/{sample}_R1.fastq', r1) + write_fake_fastq(f'1_reads/simulate/{sample}_R2.fastq', r2) + print(f'Wrote {len(r1):,} paired reads to ' + f'1_reads/simulate/{sample}_R[12].fastq') + +# ok +def merge_read_pairs(sample, simulate=False): + """Use NGmerge to merge read pairs. + + :param sample: sample name + :param simulate: if True, use ./*/simulate/ subdirectories + """ + filenames = get_filenames(sample, simulate=simulate) + + command = (ngmerge, + '-1', filenames['paired_r1'], '-2', filenames['paired_r2'], + '-o', filenames['reads'], + '-z', # always gzip output + ) + with Timer(verbose=f'Running ngmerge on sample {sample}...'): + subprocess.run(command, check=True) + print(f'Wrote output to {filenames["reads"]}') + +# ok +def parse_reads(sample, simulate=False): + config = load_config() + filenames = get_filenames(sample, simulate) + with Timer('', verbose='Loading reads...'): + reads = read_fastq(filenames['reads']) + with Timer('', verbose=f'Parsing {len(reads):,} reads...'): + os.makedirs(os.path.dirname(filenames['parsed']), exist_ok=True) + parse_sequences(config, reads).to_parquet(filenames['parsed']) + +# ok +def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10): + """For each read, find nearest match for the requested fields. + + Candidate matches from the design table are found via mmseqs search, + then the nearest match is determined by Levenshtein distance. For + inferred fields, the match is set to the inferred value and the + distance to the design table entry accordingly. For barcodes, only + exact matches are considered. 
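+
+    Compared fields are taken from the `compare` section of config.yaml; an
+    entry such as "barcode -> variant_aa" (names illustrative) means that
+    variant_aa is matched via the barcode hit. Each directly matched field adds
+    `<field>_match`, `<field>_distance`, and `<field>_match_equidistant`
+    columns; barcodes add only `<field>_match`, and inferred fields add
+    `<b>_from_<a>_match` and `<b>_from_<a>_distance`.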
+ + :param sample: sample name + :param simulate: if True, use ./*/simulate/ subdirectories + """ + filenames = get_filenames(sample, simulate) + + # prepare mmseqs db + mmseqs_make_design_db() + + match_to_ngs, secondary = get_comparison_fields() + arr = [] + for field in match_to_ngs: + if field == 'barcode': + # for barcode, only permit exact matches + with Timer('', verbose=f'Matching {field}...'): + arr += [map_barcode(sample, simulate, field)] + else: + with Timer('', verbose=f'Searching for {field} candidates...'): + # fastmap to get candidates + df_mapped = mmseqs_prefilter(sample, simulate, field, + mmseqs_max_seqs=mmseqs_max_seqs) + # calculate edit distances for candidates + msg = f'Calculating distances for {len(df_mapped)} {field} candidates...' + with Timer('', verbose=msg): + df_match = match_mapped(df_mapped, field) + arr += [df_match] + + df_match = arr[0] + for df in arr[1:]: + df_match = df_match.merge(df, on='read_index') + + if secondary: + df_designs = load_designs() + arr = [] + for field_a, field_b in secondary: + field = f'{field_b}_from_{field_a}' + if '_aa' in field_b: + df_designs[field_b] = [quick_translate(x) for x in df_designs[field_b[:-3]]] + name_to_field = df_designs.set_index('name')[field_b].to_dict() + # define reference for field_b using field_a + reference = (df_match + .rename(columns={f'{field_a}_match': 'reference_name'}) + .assign(reference=lambda x: x['reference_name'].map(name_to_field)) + .dropna() + ) + # calculate distance between the parsed value for field_b and + # the reference (where both exist) + df = (load_seqs_to_map(sample, simulate, field_b) + .merge(reference, on='read_index') + .pipe(Candidates) + .pipe(match_mapped, field) + .drop(f'{field}_match_equidistant', axis=1) + ) + df_match = df_match.merge(df, on='read_index') + + df_match.to_csv(filenames['mapped'], index=None) + print(f'Wrote match table ({len(df_match):,} rows) to {filenames["mapped"]}') + + +### LOAD + + +def load_config(): + with open(config_yaml, 'r') as fh: + return yaml.safe_load(fh) + + +def load_possible_dna() -> PossibleDNA: + return pd.read_csv(possible_dna_table).pipe(PossibleDNA) + + +def load_planned_dna() -> PlannedDNA: + return pd.read_csv(planned_dna_table).pipe(PlannedDNA) + + +def load_samples() -> Samples: + return pd.read_csv(sample_table).pipe(Samples) + + +def load_designs() -> Designs: + """Add translations that are needed for comparison but not already present. 
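+
+    For example, if the comparison config asks for `variant_aa` but the design
+    table only has a `variant` DNA column (names illustrative), a `variant_aa`
+    column is added here by translating `variant`.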
+ """ + df_designs = pd.read_csv(design_table) + fields, _ = get_comparison_fields() + for field in fields: + if field.endswith('_aa') and field[:-3] in df_designs: + if field in df_designs: + raise ValueError(f'Cannot provide both DNA and amino acid for {field}') + df_designs[field] = [quick_translate(x) for x in df_designs[field[:-3]]] + elif field not in df_designs: + raise ValueError(f'Cannot derive {field} from design table') + return df_designs.pipe(Designs) + + +def load_seqs_to_map(sample, simulate, field): + filenames = get_filenames(sample, simulate, field) + + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + + seqs_to_map = (pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + .rename(columns={field_no_aa: 'query'}) + .dropna() + ) + if aa: + seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] + + return seqs_to_map + + +### UTILITIES + + +def describe_parsed(df_parsed): + df_parsed = df_parsed.copy() + df_parsed[df_parsed == ''] = np.nan + dna_length = df_parsed.select_dtypes('object').map(len, na_action='ignore') + aa_length = dna_length / 3 + + null_summary = pd.concat([ + (df_parsed.isnull() * 1).sum().rename('null entries'), + (df_parsed.isnull() * 1).mean().rename('null fraction'), + ], axis=1) + stats = ['count', 'min', 'median', 'max'] + describe = lambda x: x.describe().rename({'50%': 'median'}).loc[stats].astype(int) + + to_string = lambda x: ' ' + x.to_string().replace('\n', '\n ') + print('Null entries:') + print(null_summary.pipe(to_string)) + print('DNA length:') + print(dna_length.pipe(describe).pipe(to_string)) + print('Protein length:') + print(aa_length.pipe(describe).pipe(to_string)) + + +def quick_translate(seq): + # return str(quickdna.DnaSequence(seq).translate()) + n = len(seq) - len(seq) % 3 + return translate_dna(seq[:n]) + + +def format_string_to_capture_regex(x, optional=[], **kwargs): + formatted_string = re.sub(r'{(.*?)}', r'(?P<\1>.*)', x) + for key, value in kwargs.items(): + tail = '?' if key in optional else '' + formatted_string = formatted_string.replace( + f'(?P<{key}>.*)', f'(?P<{key}>{value}){tail}') + return formatted_string + + +def get_comparison_fields(): + """Parse fields for direct matching and field pairs for inferred matching. + """ + direct_match, inferred_match = set(), set() + config = load_config() + for c in config['compare']: + if '->' in c: + a, b = [x.strip() for x in c.split('->')] + direct_match |= {a} + inferred_match |= {(a, b)} + else: + direct_match |= {c.strip()} + + return direct_match, inferred_match + + +def parse_sequences(config, sequences): + """Match DNA sequence based on pattern config. + First, the sequence between left_adapter and right_adapter is extracted. + Then, any DNA parts are matched. + Then, a protein is translated starting at the position matched by the pattern + in config.protein.start_at using `translate_to_stop`. + Finally, the protein is matched using the template string protein.pattern, + and named matches are added to the result dictionary. Only DNA is stored here. 
+ """ + re_insert = re.compile('{left_adapter}(.*){right_adapter}'.format(**config)) + import os + os.wtf = re_insert + capture = config['protein'].get('capture', {}) + optional = config['protein'].get('optional', []) + re_proteins = [re.compile(format_string_to_capture_regex( + x, optional, **capture)) for x in config['protein']['patterns']] + + arr = [] + for i, dna in enumerate(sequences): + dna = dna.upper() + entry = {} + try: + insert = re_insert.findall(dna)[0] + except IndexError: + continue + entry['read_index'] = i + entry['read_length'] = len(dna) + entry['insert_length'] = len(insert) + for name, pat in config['dna_parts'].items(): + m = re.findall(pat, insert) + if m: + entry[name] = m[0] + + protein_start = re.search(config['protein']['start_at'], insert).start() + protein = quick_translate(insert[protein_start:]) + protein = protein.split('*')[0] + entry['cds'] = insert[protein_start:][:len(protein) * 3] + + for re_protein in re_proteins: + match = re_protein.match(protein) + if match: + for i, name in enumerate(match.groupdict()): + name = name.rstrip('_') # collapse redundant names + start, end = match.span(i + 1) + entry[name] = entry['cds'][start*3:end*3] + arr += [entry] + # assert False + break + if len(arr) == 0: + raise Exception('Nothing found') + return pd.DataFrame(arr) + + +def map_barcode(sample, simulate, field): + """Only exact matches, distance and equidistant not returned. + """ + filenames = get_filenames(sample, simulate, field) + + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + seqs_to_map = pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + if aa: + seqs_to_map[field] = [quick_translate(x) for x in seqs_to_map[field_no_aa]] + seqs_to_map = seqs_to_map[['read_index', field]] + ref_seqs = (read_fasta(filenames['map mmseqs target fa'], as_df=True) + .rename(columns={'name': 'match', 'seq': 'reference'}) + ) + + match = f'{field}_match' + + return (seqs_to_map + .merge(ref_seqs, left_on=field, right_on='reference', how='left') + .rename(columns={'match': match}) + [['read_index', match]] + ) + + +def match_mapped(df_mapped: Candidates, field): + """Filter mapping table for a specific field. + + What happens if inference leads to multiple sequences? + """ + match = f'{field}_match' + distance = f'{field}_distance' + equidistant = f'{field}_match_equidistant' + + df_mapped = df_mapped.copy() + df_mapped[distance] = [Levenshtein.distance(a,b) for a,b in df_mapped[['query', 'reference']].values] + # keep only the minimum values + min_distance = df_mapped.groupby('read_index')[distance].transform('min') + df_mapped = df_mapped.loc[lambda x: x[distance] == min_distance] + + A = df_mapped.groupby('read_index').size().rename(equidistant).reset_index() + B = (df_mapped.sort_values(distance).drop_duplicates('read_index') + .rename(columns={'reference_name': match})[['read_index', match, distance]]) + return A.merge(B)[['read_index', match, distance, equidistant]] + + +### FILES + + +def get_filenames(sample, simulate=False, field=None): + """Get filenames used throughout pipeline for this sample. + + Fastq files ending with .fastq, .fq., .fastq.gz, or .fq.gz are identified + based on the sample table. In simulate mode, the fastq name is identical + to the sample name. 
+ """ + if simulate: + search = f'1_reads/simulate/{sample}_R1' + else: + sample_to_fastq = load_samples().set_index('sample')['fastq_name'].to_dict() + fastq_search = sample_to_fastq[sample] + search = f'1_reads/*{fastq_search}*' + + extensions = '.fastq', '.fq', '.fq.gz', '.fastq.gz' + def search_extensions(search): + return sum([glob(search + x) for x in extensions], []) + + r = search_extensions(search) + r1 = search_extensions('paired_' + search + '_R1') + r2 = search_extensions('paired_' + search + '_R2') + paired_reads_found = len(r1) == 1 & len(r2) == 1 + + if len(r1) > 1 or len(r2) > 1: + raise ValueError(f'Too many paired read files for sample {sample}, found: {r1 + r2}') + + if len(r) > 1: + raise ValueError(f'Too many read files for sample {sample}, found: {r}') + + if not (paired_reads_found or len(r) == 1): + raise ValueError(f'Neither reads nor paired reads provided for sample {sample}, searched with: {search}') + + add_simulate = 'simulate/' if simulate else '' + filenames = { + 'reads': f'1_reads/{add_simulate}{sample}.fastq.gz', + 'parsed': f'2_parsed/{add_simulate}{sample}.parsed.pq', + 'mapped': f'3_mapped/{add_simulate}{sample}.mapped.csv', + } + if paired_reads_found: + filenames |= {'paired_r1': r1[0], 'paired_r2': r2[0]} + + if field: + filenames['map query'] = f'3_mapped/map/{add_simulate}{sample}_{field}.fa' + filenames['map mmseqs target db'] = f'3_mapped/map/{field}.target.db' + filenames['map mmseqs target fa'] = f'3_mapped/map/{field}.fa' + filenames['map mmseqs query db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.query.db' + filenames['map mmseqs result db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.result.db' + filenames['map mmseqs prefilter db'] = f'3_mapped/map/{add_simulate}{sample}_{field}.prefilter.db' + filenames['map mmseqs prefilter tsv'] = f'3_mapped/map/{add_simulate}{sample}_{field}.prefilter.tsv' + filenames['map mmseqs result m8'] = f'3_mapped/map/{add_simulate}{sample}_{field}.result.m8' + filenames['map result'] = f'3_mapped/map/{add_simulate}{sample}_{field}.csv' + + return filenames + + + +### MMSEQS + + +def mmseqs_make_design_db(): + df_designs = load_designs() + fields, _ = get_comparison_fields() + with Timer('', verbose=f'Creating mmseqs index for fields: {", ".join(fields)}'): + for field in fields: + aa = field.endswith('_aa') + f = f'3_mapped/map/{field}.fa' # should live in get_filenames + seqs_to_index = df_designs[['name', field]] + write_fasta(f, seqs_to_index.drop_duplicates(field)) + if field != 'barcode': + make_mmseqs_db(f, aa, is_query=False) + + +def make_mmseqs_db(fasta_file, dbtype_is_amino_acid, is_query): + """ + """ + db = fasta_file.replace('.fa', '.query.db' if is_query else '.target.db') + command = (mmseqs, 'createdb', fasta_file, db, + '--dbtype', '1' if dbtype_is_amino_acid else '2' + ) + result = subprocess.run(command, stdout=DEVNULL, stderr=DEVNULL) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs createdb') + + +def mmseqs_prefilter(sample, simulate, field, mmseqs_max_seqs=10, verbose=False): + filenames = get_filenames(sample, simulate, field) + + # make query database + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + seqs_to_map = (pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + .dropna() # remove reads that failed parsing + .rename(columns={field_no_aa: 'query'}) + ) + if aa: + seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] + kmer = 6 + else: + kmer = 12 + + write_fasta(filenames['map 
query'], seqs_to_map.dropna()) + make_mmseqs_db(filenames['map query'], aa, is_query=True) + + # search + [os.remove(x) for x in glob(filenames['map mmseqs prefilter db'] + '*')] + with tempfile.TemporaryDirectory() as tmp: + command = ('mmseqs', 'prefilter', + filenames['map mmseqs query db'], # query + filenames['map mmseqs target db'], # target + filenames['map mmseqs prefilter db'], + '-k', str(kmer), + '--max-seqs', str(mmseqs_max_seqs), + # without these flags, mmseqs will sometimes throw out exact matches + '--comp-bias-corr', '0', + '--mask', '0', + # '--exact-kmer-matching', '1', # prefilter only retains exact kmer matches + ) + if verbose: + print(' '.join(command)) + result = subprocess.run(command, stderr=DEVNULL, stdout=DEVNULL) + # result = subprocess.run(command) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs prefilter') + + # convert to tsv + command = ('mmseqs', 'createtsv', + filenames['map mmseqs query db'], # query + filenames['map mmseqs target db'], # target + filenames['map mmseqs prefilter db'], + filenames['map mmseqs prefilter tsv'], + ) + if verbose: + print(' '.join(command)) + result = subprocess.run(command, stderr=DEVNULL, stdout=DEVNULL) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs prefilter') + + ref_seqs = (read_fasta(filenames['map mmseqs target fa'], as_df=True) + .rename(columns={'name': 'reference_name', 'seq': 'reference'})) + + df_mmseqs = (read_mmseqs_prefilter_tsv(filenames['map mmseqs prefilter tsv']) + .rename(columns={'query': 'read_index', 'target': 'reference_name'}) + [['read_index', 'reference_name']] + ) + return df_mmseqs.merge(seqs_to_map).merge(ref_seqs).pipe(Candidates) + + +def read_mmseqs_prefilter_tsv(filename): + return pd.read_csv(filename, sep='\t', names=('query', 'target', 'score', 'diagonal')) + + +def read_m8(filename): + columns = ('query', 'target', 'sequence_identity', + 'alignment_length', 'num_mismatches', 'num_gaps', + 'query_start', 'query_end', 'target_start', + 'target_end', 'e_value', 'bit_score') + return pd.read_csv(filename, sep=r'\s+', header=None, names=columns) + + +def mmseqs_search(sample, field, simulate=False, min_seq_id=0.8) -> Candidates: + """Not in use. + + Use mmseqs search to map `field` in parsed reads to the design table. 
+ + :param sample: sample name + :param field: a field name, if it ends in "_aa" the translation will be mapped + :param simulate: if True, use ./*/simulate/ subdirectories + :param min_seq_id: mmseqs2 prefilter parameter + """ + filenames = get_filenames(sample, simulate, field) + + # make query database + aa = field.endswith('_aa') + field_no_aa = field[:-3] if aa else field + seqs_to_map = (pd.read_parquet(filenames['parsed'], columns=['read_index', field_no_aa]) + .dropna() # remove reads that failed parsing + .rename(columns={field_no_aa: 'query'}) + ) + if aa: + seqs_to_map['query'] = [quick_translate(x) for x in seqs_to_map['query']] + kmer = MMSEQS_KMER_AA + else: + kmer = MMSEQS_KMER_DNA + write_fasta(filenames['map query'], seqs_to_map.dropna()) + make_mmseqs_db(filenames['map query'], aa, is_query=True) + + # search + [os.remove(x) for x in glob(filenames['map mmseqs result db'] + '*')] + with tempfile.TemporaryDirectory() as tmp: + command = ('mmseqs', 'search', + filenames['map mmseqs query db'], # query + filenames['map mmseqs target db'], # target + filenames['map mmseqs result db'], + tmp, + '--min-seq-id', str(min_seq_id), + '-k', str(kmer), # memory usage is 21**k * 8 bytes (protein) or 5**k * 8 bytes (DNA) + '-s', '1', # sensitivity, 1 is fastest, 7.5 is most sensitive + '-c', '0.9', # minimum alignment coverage on query/target + '--exact-kmer-matching', '1', # prefilter only retains exact kmer matches + # split query database based on available memory + # '--split 0 --split-mode 1', + ) + if not aa: + command += ('--search-type', '3') + print(' '.join(command)) + # result = subprocess.run(command, stderr=DEVNULL, stdout=DEVNULL) + result = subprocess.run(command) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs search') + + # convert results + command = ('mmseqs', 'convertalis', + filenames['map mmseqs query db'], # query + filenames['map mmseqs target db'], # target + filenames['map mmseqs result db'], # result + filenames['map mmseqs result m8'], # result + ) + # print(' '.join(command)) + result = subprocess.run(command, stderr=DEVNULL, stdout=DEVNULL) + if result.returncode != 0: + raise ValueError('Non-zero return code in mmseqs convertalis') + + ref_seqs = read_fasta(filenames['map mmseqs target fa'], as_df=True).rename(columns={'name': 'reference_name', 'seq': 'reference'}) + + df_mmseqs = (read_m8(filenames['map mmseqs result m8']) + .rename(columns={'query': 'read_index', 'target': 'reference_name'}) + [['read_index', 'reference_name']].drop_duplicates() + ) + return df_mmseqs.merge(seqs_to_map).merge(ref_seqs).pipe(Candidates) + + +def load_fastmap(filename): + """A bit slow, 10% of fastmap runtime + """ + txt = open(filename).read() + queries = txt.strip().split('//') + arr = [] + for q in queries[:-1]: + if not q: + continue + lines = q.strip().split('\n') + + name = lines[0].split()[1] + for x in set([l.split()[-1].split(':')[0] for l in lines[1:]]): + arr += [{'query': name, 'match': x}] + return pd.DataFrame(arr) + + +if __name__ == '__main__': + + # order is preserved + commands = [ + 'setup', + 'dna_to_designs', + 'simulate_fastqs', + 'merge_read_pairs', + 'parse_reads', + 'map_parsed_reads', + ] + # if the command name is different from the function name + named = { + # 'search': search_app, + } + + final = {} + for k in commands: + try: + final[k] = named[k] + except KeyError: + final[k] = eval(k) + + try: + fire.Fire(final) + except BrokenPipeError: + pass + + diff --git a/src/ngs_analysis/constants.py 
b/src/ngs_analysis/constants.py new file mode 100644 index 0000000..26fc605 --- /dev/null +++ b/src/ngs_analysis/constants.py @@ -0,0 +1,167 @@ +from collections import defaultdict +import os +from pathlib import Path +import string + +try: + HOME = Path(os.environ['HOME']) +except KeyError: + HOME = Path('/home/dfeldman') + +JOBLIB_CACHE = HOME / '.joblib' + +RESOURCES = Path(__file__).parents[0] / 'resources' +RULE_SETS = RESOURCES / 'rule_sets.csv' +VISITOR_FONT_PATH = RESOURCES / 'visitor1.ttf' + +BQ_PROJECT_ID = 'bilf-350112' + +GO_TERM = 'GO_term' +GO = 'GO ID' +GO_SYNONYM = 'DB Object Synonym (|Synonym)' +GO_SYMBOL = 'DB Object Symbol' +GO_TERM_COUNTS = 'GO_term_counts' +SEARCH_KEYWORD = 'keyword' +GO_SYNONYM = 'DB Object Synonym (|Synonym)' + +GENE_ID = 'gene_id' +GENE_SYMBOL = 'gene_symbol' + +HGNC = 'HGNC' +UNIPROTKB = 'UniProtKB' +ENSG = 'ENSG' +GENE_ALIAS = 'gene_alias' +RCSB = 'RCSB' + +biomart_columns = {'Gene stable ID': ENSG, + 'HGNC ID': HGNC, + 'NCBI gene ID': GENE_ID, + 'NCBI gene (formerly Entrezgene) ID': GENE_ID, + 'HGNC symbol': GENE_SYMBOL, + 'UniProtKB Gene Name ID': UNIPROTKB, + } + +PTHR = 'PTHR' +PTHR_SF = 'PTHR_SF' +PTHR_FAMILY = 'PTHR_family' +PTHR_SUBFAMILY = 'PTHR_subfamily' +PTHR_CLASS_LIST = 'PC_list' + +pthr_columns = { + 0: 'identifier', + 2: PTHR_SF, + 3: PTHR_FAMILY, + 4: PTHR_SUBFAMILY, + 8: PTHR_CLASS_LIST, +} + +MZ_DOUBLE_SPACING = 0.5001917279701898 + +AA_3 = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', + 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL'] +AA_1 = list('ARNDCQEGHILKMFPSTWYV') +CANONICAL_AA = AA_1 +AA_3_1 = dict(zip(AA_3, AA_1)) +AA_1_3 = dict(zip(AA_1, AA_3)) + +skyline_columns = { + 'Replicate': 'sample', + 'Replicate Name': 'sample', + 'Protein': 'short_name', + 'Peptide': 'sequence', + 'Peptide Retention Time': 'RTime', + 'Normalized Area': 'peak_area', + 'Total Area MS1': 'ms1_area', + 'Best Retention Time': 'RTime', + 'Min Start Time': 'RTime_start', + 'Max End Time': 'RTime_end', + 'Average Mass Error PPM': 'mass_error_ppm', + 'Isotope Dot Product': 'idotp', + } + + +PT02_BACKBONE_START = 'ATTCTCCTTGGAATTTGCCCTTTTTGAGTTTGGATCTTGGTTCAT' +iON_BACKBONE_START = 'GACATTGATTATTGACTAGTTATTAATAGTAATCAAT' +pET_BACKBONE_START = 'TAATACGACTCACTATAGGGGAATTGTGAGCGGATAACAATTCC' + +CODONS = { + 'TAA': '*', + 'TAG': '*', + 'TGA': '*', + 'GCA': 'A', + 'GCC': 'A', + 'GCG': 'A', + 'GCT': 'A', + 'TGC': 'C', + 'TGT': 'C', + 'GAC': 'D', + 'GAT': 'D', + 'GAA': 'E', + 'GAG': 'E', + 'TTC': 'F', + 'TTT': 'F', + 'GGA': 'G', + 'GGC': 'G', + 'GGG': 'G', + 'GGT': 'G', + 'CAC': 'H', + 'CAT': 'H', + 'ATA': 'I', + 'ATC': 'I', + 'ATT': 'I', + 'AAA': 'K', + 'AAG': 'K', + 'CTA': 'L', + 'CTC': 'L', + 'CTG': 'L', + 'CTT': 'L', + 'TTA': 'L', + 'TTG': 'L', + 'ATG': 'M', + 'AAC': 'N', + 'AAT': 'N', + 'CCA': 'P', + 'CCC': 'P', + 'CCG': 'P', + 'CCT': 'P', + 'CAA': 'Q', + 'CAG': 'Q', + 'AGA': 'R', + 'AGG': 'R', + 'CGA': 'R', + 'CGC': 'R', + 'CGG': 'R', + 'CGT': 'R', + 'AGC': 'S', + 'AGT': 'S', + 'TCA': 'S', + 'TCC': 'S', + 'TCG': 'S', + 'TCT': 'S', + 'ACA': 'T', + 'ACC': 'T', + 'ACG': 'T', + 'ACT': 'T', + 'GTA': 'V', + 'GTC': 'V', + 'GTG': 'V', + 'GTT': 'V', + 'TGG': 'W', + 'TAC': 'Y', + 'TAT': 'Y', +} + +CODONS_REVERSE = defaultdict(list) +[CODONS_REVERSE[v].append(k) for k, v in CODONS.items()] +CODONS_REVERSE = {k: '|'.join(v) for k, v in CODONS_REVERSE.items()} + +CHAIN_ALPHABET = string.ascii_uppercase + string.ascii_lowercase + +SLUGIFY_REPLACEMENTS = [ + ('<', 'lt'), + ('<=', 'lte'), + ('>', 'gt'), + ('>=', 'gte'), + 
('&', 'and'), + ('|', 'or'), +] \ No newline at end of file diff --git a/src/ngs_analysis/interactive.py b/src/ngs_analysis/interactive.py new file mode 100644 index 0000000..b3d4f5c --- /dev/null +++ b/src/ngs_analysis/interactive.py @@ -0,0 +1,23 @@ +"""Common imports for interactive work +""" +from collections import Counter +from glob2 import glob +import logging +import os +import re +import sys + +from natsort import natsorted, natsort_keygen +import numpy as np +import pandas as pd + +import warnings +from math import ceil +from functools import partial +from pandas import IndexSlice as pdx +from tqdm.auto import tqdm +import IPython +from IPython.display import display, Image + +IPython.get_ipython().run_line_magic('load_ext', 'autoreload') +IPython.get_ipython().run_line_magic('autoreload', '2') diff --git a/src/ngs_analysis/sequence.py b/src/ngs_analysis/sequence.py new file mode 100644 index 0000000..2fc85f0 --- /dev/null +++ b/src/ngs_analysis/sequence.py @@ -0,0 +1,915 @@ +import gzip +import os +import re +from collections import defaultdict +from glob import glob + +import numpy as np +import pandas as pd +from natsort import natsorted + +from .constants import RESOURCES, CODONS, CODONS_REVERSE + + +watson_crick = {'A': 'T', + 'T': 'A', + 'C': 'G', + 'G': 'C', + 'U': 'A', + 'N': 'N'} + +watson_crick.update({k.lower(): v.lower() + for k, v in watson_crick.items()}) + +iupac = {'A': ['A'], + 'C': ['C'], + 'G': ['G'], + 'T': ['T'], + 'M': ['A', 'C'], + 'R': ['A', 'G'], + 'W': ['A', 'T'], + 'S': ['C', 'G'], + 'Y': ['C', 'T'], + 'K': ['G', 'T'], + 'V': ['A', 'C', 'G'], + 'H': ['A', 'C', 'T'], + 'D': ['A', 'G', 'T'], + 'B': ['C', 'G', 'T'], + 'N': ['G', 'A', 'T', 'C']} + +codon_maps = {} + + +def read_fasta(f, as_df=False): + if f.endswith('.gz'): + fh = gzip.open(f) + txt = fh.read().decode() + else: + fh = open(f, 'r') + txt = fh.read() + fh.close() + records = parse_fasta(txt) + if as_df: + return pd.DataFrame(records, columns=('name', 'seq')) + else: + return records + + +def parse_fasta(txt): + entries = [] + txt = '\n' + txt.strip() + for raw in txt.split('\n>'): + name = raw.split('\n')[0].strip() + seq = ''.join(raw.split('\n')[1:]).replace(' ', '') + if name: + entries += [(name, seq)] + return entries + + +def write_fasta(filename, list_or_records): + if isinstance(list_or_records, pd.DataFrame) and list_or_records.shape[1] == 2: + list_or_records = list_or_records.values + list_or_records = list(list_or_records) + with open(filename, 'w') as fh: + fh.write(format_fasta(list_or_records)) + + +def write_fake_fastq(filename, list_or_records): + with open(filename, 'w') as fh: + fh.write(format_fake_fastq(list_or_records)) + + +def format_fake_fastq(list_or_records): + """Generates a fake header for each read that is sufficient to fool bwa/NGmerge. 
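+
+    The first read, for example, is written under the header
+    '@M08044:78:000000000-L568G:1:0:0:0 1:N:0:AAAAAAAA' with a constant
+    quality string of 'G' characters.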
+ """ + fake_header = '@M08044:78:000000000-L568G:1:{tile}:{x}:{y} 1:N:0:AAAAAAAA' + if isinstance(next(iter(list_or_records)), str): + records = list_to_records(list_or_records) + else: + records = list_or_records + + max_value = 1000 + lines = [] + for i, (_, seq) in enumerate(records): + tile, rem = divmod(i, max_value**2) + x, y = divmod(rem, max_value) + lines.extend([fake_header.format(tile=tile, x=x, y=y), seq.upper(), '+', 'G' * len(seq)]) + return '\n'.join(lines) + + +def write_fastq(filename, names, sequences, quality_scores): + with open(filename, 'w') as fh: + fh.write(format_fastq(names, sequences, quality_scores)) + + +def format_fastq(names, sequences, quality_scores): + lines = [] + for name, seq, q_score in zip(names, sequences, quality_scores): + lines.extend([name, seq, '+', q_score]) + return '\n'.join(lines) + + +def list_to_records(xs): + n = len(xs) + width = int(np.ceil(np.log10(n))) + fmt = '{' + f':0{width}d' + '}' + records = [] + for i, s in enumerate(xs): + records += [(fmt.format(i), s)] + return records + + +def format_fasta(list_or_records): + if len(list_or_records) == 0: + records = [] + elif isinstance(list_or_records[0], str): + records = list_to_records(list_or_records) + else: + records = list_or_records + + lines = [] + for name, seq in records: + lines.extend([f'>{name}', str(seq)]) + return '\n'.join(lines) + + +def fasta_frame(files_or_search): + """Convenience function, pass either a list of files or a + glob wildcard search term. + """ + + if isinstance(files_or_search, str): + files = natsorted(glob(files_or_search)) + else: + files = files_or_search + + cols = ['name', 'seq', 'file_ix', 'file'] + records = [] + for f in files: + for i, (name, seq) in enumerate(read_fasta(f)): + records += [{ + 'name': name, 'seq': seq, 'file_ix': i, + 'file': f, + }] + + return pd.DataFrame(records)[cols] + + +def cast_cols(df, int_cols=tuple(), float_cols=tuple(), str_cols=tuple(), + cat_cols=tuple(), uint16_cols=tuple()): + return (df + .assign(**{c: df[c].astype(int) for c in int_cols}) + .assign(**{c: df[c].astype(np.uint16) for c in uint16_cols}) + .assign(**{c: df[c].astype(float) for c in float_cols}) + .assign(**{c: df[c].astype(str) for c in str_cols}) + .assign(**{c: df[c].astype('category') for c in cat_cols}) + ) + + +def translate_dna(s): + assert len(s) % 3 == 0, 'length must be a multiple of 3' + return ''.join([CODONS[s[i*3:(i+1)*3]] for i in range(int(len(s)/3))]) + + +def load_codons(organism): + f = os.path.join(RESOURCES, 'codon_usage', 'organisms.csv') + taxids = pd.read_csv(f).set_index('organism')['taxid'].to_dict() + + if organism.lower() == 'yeast': + organism = 's_cerevisiae' + organism = organism.lower().replace('.', '').replace(' ', '_') + + try: + table = f'{organism}_{taxids[organism]}.csv' + except KeyError: + raise ValueError(f'{organism} must be one of {list(taxids.keys())}') + f = os.path.join(RESOURCES, 'codon_usage', table) + return (pd.read_csv(f) + .assign(codon_dna=lambda x: x['codon'].str.replace('U', 'T'))) + + +def make_equivalent_codons(dna_to_aa): + """Make dictionary from codon to other codons for the same amino acid + """ + aa_to_dna = defaultdict(list) + for codon, aa in dna_to_aa.items(): + aa_to_dna[aa] += [codon] + + equivalent_codons = {} + for codon in dna_to_aa: + aa = dna_to_aa[codon] + equivalent_codons[codon] = list(set(aa_to_dna[aa]) - {codon}) + + return equivalent_codons + + +equivalent_codons = make_equivalent_codons(CODONS) + + +def reverse_complement(seq): + return ''.join(watson_crick[x] for x in 
seq)[::-1] + + +def sanger_database(drive): + df_sanger = drive.get_excel('cloning/sanger') + + extra_cols = [x for x in df_sanger.columns + if x not in ('identifier', 'search')] + + arr = [] + for _, row in df_sanger.iterrows(): + files = natsorted(glob(row['search'])) + if len(files) == 0: + print(f'No files found from row {row.to_dict()}') + continue + (pd.DataFrame({'file': files}) + .assign(**{x: row[x] for x in extra_cols}) + .assign(name=lambda x: x['file'].str.extract(row['identifier'])) + .assign(seq=lambda x: x['file'].apply(read_ab1)) + .assign(seq_rc=lambda x: x['seq'].apply(reverse_complement)) + .pipe(arr.append) + ) + + cols = extra_cols + ['name', 'file', 'seq', 'seq_rc'] + return pd.concat(arr)[cols] + + +def read_ab1(f): + from Bio import SeqIO + with open(f, 'rb') as fh: + records = list(SeqIO.parse(fh, 'abi')) + assert len(records) == 1 + seq = str(records[0].seq) + return seq + + +def print_alignment(a, b, width=60, as_string=False): + """Levenshtein alignment. + """ + import edlib + alignment = edlib.align(a, b, task='path') + d = edlib.getNiceAlignment(alignment, a, b) + lines = [] + for i in range(0, max(map(len, d.values())), width): + lines += [str(i)] + for x in d.values(): + lines += [x[i:i+width]] + + txt = '\n'.join(lines) + if as_string: + return txt + else: + print(txt) + + +def reverse_translate_max(aa_seq, organism='e_coli'): + if organism not in codon_maps: + codon_maps[organism] = (load_codons(organism) + .sort_values('relative_frequency', ascending=False) + .drop_duplicates('amino_acid') + .set_index('amino_acid')['codon_dna'].to_dict() + ) + codon_map = codon_maps[organism] + return ''.join([codon_map[x] for x in aa_seq]) + + +def reverse_translate_random(aa_seq, organism='e_coli', rs='input', cutoff=0.12): + if rs == 'input': + seed = hash(aa_seq) % 10**8 + rs = np.random.RandomState(seed=seed) + if (organism, cutoff) not in codon_maps: + codon_maps[(organism, cutoff)] = (load_codons(organism) + .query('relative_frequency > @cutoff') + .groupby('amino_acid')['codon_dna'].apply(list).to_dict() + ) + codon_map = codon_maps[(organism, cutoff)] + return ''.join([rs.choice(codon_map[x]) for x in aa_seq]) + + +def get_genbank_features(f, error_on_repeat=True): + from Bio import SeqIO + records = list(SeqIO.parse(open(f,'r'), 'genbank')) + if len(records) != 1: + raise ValueError(f'found {len(records)} records in genbank {f}') + + features = {} + for f in records[0].features: + label = f.qualifiers['label'][0] + seq = f.extract(records[0].seq) + if label in features and features[label] != seq and error_on_repeat: + raise ValueError(f'repeated feature {label}') + features[label] = str(seq) + return features + + +def rolling_gc(seq, window): + from scipy.ndimage.filters import convolve + gc_window = 20 + return convolve(np.array([x in 'GC' for x in seq])*1., + np.ones(window)/window, + mode='reflect') + + +def to_codons(seq): + assert len(seq) % 3 == 0 + return [seq[i*3:(i+1)*3] for i in range(int(len(seq)/3))] + + +def codon_adaptation_index(seq, organism='e_coli'): + return (load_codons(organism) + .assign(w=lambda x: x.groupby('amino_acid')['relative_frequency'] + .transform(lambda y: y / y.max())) + .set_index('codon_dna') + .loc[to_codons(seq)]['w'] + .pipe(lambda x: np.prod(x)**(1/len(x))) + ) + + +def compare_sequences(sequences, window=25, k=6): + import matplotlib.pyplot as plt + + fig, (ax0, ax1) = plt.subplots(figsize=(12, 4), ncols=2) + for name in sequences: + cai = codon_adaptation_index(sequences[name]) + mean_gc = np.mean([x in 'GC' for x in 
sequences[name]]) + gc_trace = rolling_gc(sequences[name], window) + label = f'{name}: avg={mean_gc:.2g} std={np.std(gc_trace):.2g} cai={cai:.2g}' + ax0.plot(gc_trace, label=label) + + (pd.Series(get_kmers(sequences[name], k)) + .value_counts().value_counts().sort_index() + .pipe(lambda x: x/x.sum()) + .plot(ax=ax1, marker='.', ms=10, label=name)) + + + ax0.plot([0, len(gc_trace)], [0.5, 0.5], color='gray', lw=1, ls='--', zorder=-1) + ax0.legend() + ax0.set_title(f'average GC content over {window} nt window') + ax0.set_ylabel('GC fraction') + ax0.set_xlabel('DNA sequence position') + ax0.set_ylim([0.25, 0.85]) + ax0.set_xlim([0, len(gc_trace)]) + + ax1.set_title(f'repeated kmers with k={k}') + ax1.set_xlabel('kmer count') + ax1.set_ylabel(f'fraction of kmers') + ax1.set_xticks([1,2,3,4,5]) + ax1.legend() + + return fig + + +def get_kmers(s, k): + n = len(s) + return [s[i:i+k] for i in range(n-k+1)] + + +def read_fastq(filename, max_reads=1e12, include_quality=False, include_name=False, + include_index=False, progress=lambda x: x, full_name=False): + if max_reads is None: + max_reads = 1e12 + if filename.endswith('gz'): + fh = gzip.open(filename, 'rt') + else: + fh = open(filename, 'r') + reads, quality_scores, names, indices = [], [], [], [] + read_count = 0 + for i, line in progress(enumerate(fh)): + if i % 4 == 1: + reads.append(line.strip()) + read_count += 1 + if include_quality and i % 4 == 3: + quality_scores.append(line.strip()) + if include_name and i % 4 == 0: + if full_name: + names.append(line.strip()) + else: + names.append(':'.join(line.split()[0].split(':')[3:7])) + if include_index and i % 4 == 0: + indices.append(line.split(':')[-1].strip()) + if i % 4 == 3 and read_count >= max_reads: + break + + fh.close() + if include_quality or include_name or include_index: + return_val = (reads,) + if include_quality: + return_val += (quality_scores,) + if include_name: + return_val += (names,) + if include_index: + return_val += (indices,) + return return_val + else: + return reads + + +def quality_scores_to_array(quality_scores, baseline=ord('!')): + """Only works if all quality scores have equal length. + Expects strings not bytes. + """ + q = np.array(quality_scores) + return (np.array(q).astype(f'S{len(q[0])}') + .view('S1').view(np.uint8) + .reshape(len(q), len(q[0])) + - baseline + ) + + +def aa_to_dna_re(aa_seq): + return ''.join(f'(?:{CODONS_REVERSE[x]})' for x in aa_seq) + + +def make_kmer_dict(sequences, k): + """ + """ + kmers = defaultdict(list) + for i, seq in enumerate(sequences): + for kmer in get_kmers(seq, k): + kmers[kmer].append(i) + return kmers + + +def match_nearest(query, sequences, kmers): + from Levenshtein import distance + + k = len(next(iter(kmers.keys()))) + + candidates = [] + for kmer in get_kmers(query, k): + candidates.extend(kmers[kmer]) + candidates = set(candidates) + # guess + candidates = sorted(candidates, key=lambda i: ~ + sequences[i].startswith(query[:2])) + + matches = [] + for i in candidates: + d = distance(sequences[i], query) + matches.append((d, i)) + # exact match + if d == 0: + break + d, i = sorted(matches)[0] + return d, i + + +def match_queries(queries, sequences, window, k, progress=lambda x: x): + """Match queries to reference sequences based on Levenshtein distance between + prefixes of length `window`. Only pairs with a shared kmer of length `k` are + checked. For each query, finds the first nearest prefix and returns all sequences + that share that prefix. 
+ """ + query_lookup = {x: x[:window] for x in queries} + query_prefixes = sorted(set([x[:window] for x in queries])) + + ref_lookup = defaultdict(list) + for x in sequences: + ref_lookup[x[:window]].append(x) + ref_prefixes = sorted(set([x[:window] for x in sequences])) + + kmers = make_kmer_dict(ref_prefixes, k) + + hits = {} + for q in progress(query_prefixes): + try: + hits[q] = match_nearest(q, ref_prefixes, kmers) + except IndexError: + pass + + results = [] + for q in queries: + try: + d, i = hits[query_lookup[q]] + results.append(ref_lookup[ref_prefixes[i]]) + except KeyError: + results.append([]) + return results + + +def add_design_matches(df_reads, col, reference, window, k): + """ + `df_reads` is a dataframe containing `col` with sequences + `reference` is a list of references + """ + queries = df_reads[col].fillna('').pipe(list) + queries = [q if '*' not in q else '' for q in queries] + results = match_queries(queries, reference, window, k) + + df_reads = df_reads.copy() + design_distance, design_match, design_equidistant = zip( + *calculate_distance_matches(queries, results)) + return (df_reads + .assign(design_match=design_match, design_distance=design_distance, + design_equidistant=design_equidistant) + ) + + +def calculate_distance_matches(queries, results): + """Get columns `design_distance` and `design_match` from results of `match_queries`. + """ + from Levenshtein import distance + + arr = [] + for q, rs in zip(queries, results): + if len(rs) == 0: + arr += [(-1, '', 0)] + else: + ds = [(distance(q, r), r) for r in rs] + d, s = sorted(ds)[0] + degeneracy = sum([x[0] == d for x in ds]) + arr += [(d, s, degeneracy)] + return arr + + +def match_and_check(queries, reference, window, k, ignore_above=40, progress=lambda x: x): + """Perform fast Levenshtein distance matching of queries to reference + and check the results by brute force calculation (all pairs). Mismatches + with an edit distance greater than `ignore_above` are ignored. + """ + from Levenshtein import distance + + print(f'Matching {len(queries)} queries to {len(reference)} ' + f'reference sequences, window={window} and k={k}') + df_matched = (pd.DataFrame({'sequence': queries}) + .pipe(add_design_matches, col='sequence', + reference=reference, window=window, k=k)) + it = (df_matched + [['sequence', 'design_match']].values) + print('Checking fast matches against brute force matches...') + different = 0 + for seq, match in progress(it): + xs = sorted(reference, key=lambda x: distance(x, seq)) + a, b = distance(seq, match), distance(seq, xs[0]) + if b < a and b <= ignore_above: + different += 1 + print(f'{a},{b} (fast,exact distance); query={seq}') + print(f'Total mismatches: {different}') + return df_matched + + +def load_abi_zip(filename): + """Extract Bio.SeqIO records from sanger zip file. 
+ """ + import zipfile + from io import BytesIO + + from Bio import SeqIO + zh = zipfile.ZipFile(filename, 'r') + arr = [] + for zi in zh.filelist: + if not zi.filename.endswith('ab1'): + print(f'Skipping {zi.filename}') + continue + fh = zh.open(zi.filename, 'r') + buffer = BytesIO(fh.read()) + arr += [SeqIO.read(buffer, 'abi')] + return arr + + +def get_abi_traces(abi_record): + channels = 'DATA9', 'DATA10', 'DATA11', 'DATA12' + bases = list('GATC') + traces = np.array([abi_record.annotations['abif_raw'][c] + for c in channels]) + df = pd.DataFrame(traces).T + df.columns = bases + return df + + +def try_translate_dna(s): + try: + return translate_dna(s) + except: + return None + + +def digest_protein_fasta(filename, digest_pat='[R|K]'): + """Convert protein fasta into a table of peptides, digesting with provided regex. + """ + records = read_fasta(filename) + + arr = [] + for name, seq in records: + parts = re.split(f'({digest_pat})', seq) + parts = [x for x in parts if x] + peptides = [a+b for a,b in zip(parts[::2], parts[1::2])] + for peptide in peptides: + arr += [{'name': name, 'sequence': peptide}] + + return pd.DataFrame(arr) + + +def findone(aa, dna): + """Simple case of amino acid substring in in-frame DNA. + """ + aa_ = translate_dna(dna) + i = aa_.index(aa) + return dna[i * 3:(i + len(aa)) * 3] + + +def select_most_different(xs, n): + """Quickly select sequences with high mutual Levenshtein distance. + """ + from Levenshtein import distance + xs = list(xs) + arr = [xs.pop()] + for _ in range(n - 1): + new = sorted(xs, key=lambda x: -min(distance(x, y) for y in arr))[0] + xs.remove(new) + arr += [new] + return arr + + +def edge_primers(dna, melt_temp): + from Bio.SeqUtils.MeltingTemp import Tm_NN + dna_rev = reverse_complement(dna) + + fwd = dna[:10] + while Tm_NN(fwd) < melt_temp: + fwd = dna[:len(fwd) + 1] + + rev = dna_rev[:10] + while Tm_NN(rev) < melt_temp: + rev = dna_rev[:len(rev) + 1] + + return fwd, rev + + +def find_orfs(seq, kozak='GCCACC', return_aa=True): + """Find open reading frames starting with kozak in circular + DNA sequence. + """ + plasmid = str(seq) + pat = f'{kozak}(ATG(?:...)*?)(?:TAA|TAG|TGA)' + ref = plasmid + plasmid + orfs = re.findall(pat, ref) + re.findall(pat, reverse_complement(ref)) + if return_aa: + orfs = [translate_dna(x) for x in orfs] + return sorted(set(orfs), key=lambda x: -1 * len(x)) + + +def find_longest_orf(dna, kozak='GCCACC'): + pat = f'{kozak}(ATG(?:...)*?)(?:TAA|TGA|TAG)' + orfs = re.findall(pat, dna.upper()) + if orfs: + return sorted(orfs, key=len)[-1] + + +def add_features(record, features): + """Add features to `record` based on name=>DNA dictionary `features`. + A feature with "orf" in the name is given a special annotation that + shows up as a translation in benchling. 
+ """ + + vector = str(record.seq).upper() + + n = len(vector) + + features = dict(features) + arr = [] + for name, feature in features.items(): + feature = feature.upper() + m = len(feature) + + for strand in (-1, 1): + key = reverse_complement(feature) if strand == -1 else feature + starts = [x.start() for x in re.finditer(key, vector * 2)] + starts = [x for x in starts if x < n] + for start in starts: + end = start + m + qualifiers = dict(label=name) + feature_type = 'misc' + if 'orf' in name: + feature_type = 'CDS' + qualifiers['translation'] = translate_dna(key) + '*' + end += strand * 3 + + if end < n: + location = FeatureLocation(start, end, strand=strand) + else: + f1 = FeatureLocation(start, n, strand=strand) + f2 = FeatureLocation(0, end % n, strand=strand) + location = f1 + f2 + arr += [SeqFeature(location, type=feature_type, qualifiers=qualifiers)] + + # copies features but not annotations + new_record = record[:] + new_record.annotations = record.annotations + new_record.features += arr + return new_record + + +def translate_to_stop(x): + if not isinstance(x, str): + return + if 'N' in x: + return + y = translate_dna(x[:3 * int(len(x)/3)]) + if '*' in y: + return y.split('*')[0] + return + + +def ion_plasmid_integration(filename): + """Generate plasmid map after integration for an iON vector + :param filename: iON vector genbank + """ + left_itr = 'CCCTAGAAAGATAGTCTGCGTAAAATTGACGCATG'.upper() + right_itr = 'ccctagaaagataatcatattgtgacgtacgttaaagataatcatgcgtaaaattgacgcatg'.upper() + + record = list(SeqIO.parse(filename, 'genbank'))[0] + original_features = get_genbank_features(filename, error_on_repeat=False) + # remove annoying small features + original_features = {k: v for k,v in original_features.items() if 6 < len(v)} + + dna = str(record.seq) + start = dna * 2 + # in iON figure 1A notation + # partial, one, two, one, partial + _, one, two, _, _ = split_by_regex(f'{left_itr}|{right_itr}', start, keep='prefix') + + # deduplicate + one, two = ['AA' + x[:-2] for x in (one, two)] + result = two + reverse_complement(one) + + result_record = SeqRecord(Seq(result), name=record.name + '_integrated', + annotations={'molecule_type': 'DNA'}) + result_record = add_features(result_record, original_features) + + return result_record + + +def remove_restriction_sites(dna, sites, rs): + codon_sequence = to_codons(dna) + sites = set(sites) + for site in list(sites): + sites.add(reverse_complement(site)) + + for site in sites: + width = len(site) + dna = ''.join(codon_sequence) + if site not in dna: + continue + + for i in range(len(dna)): + # if we encounter a restriction site + if dna[i:i + len(site)] == site: + # change any of these codons + overlapped_codons = sorted( + set([int((i + offset) / 3) for offset in range(width)])) + # accept first change that removes restriction site + for j in overlapped_codons: + # change this codon + new_codon = swap_codon(codon_sequence[j], rs) + local_dna = ''.join([new_codon if k == j else codon_sequence[k] + for k in overlapped_codons]) + # if codon removes this site, keep it + if site not in local_dna: + codon_sequence = codon_sequence[:j] + \ + [new_codon] + codon_sequence[j + 1:] + break + dna = ''.join(codon_sequence) + for site in sites: + assert site not in dna + + return dna + + +def restriction_sites_in_dna(dna, sites): + sites = set(list(sites) + [reverse_complement(x) for x in sites]) + for site in sites: + if site in dna: + return True + return False + + +def to_codons(dna): + assert len(dna) % 3 == 0 + return [dna[i * 3:(i + 1) * 3] for i in 
range(int(len(dna) / 3))] + + +def swap_codon(codon, rs): + """Swap codon at random, if possible. + """ + options = equivalent_codons[codon] + if not options: + return codon + return rs.choice(options) + + +def remove_restriction_sites_dnachisel(cds, avoid, rs=None): + if not restriction_sites_in_dna(cds, avoid): + return cds + + import dnachisel as dc + + if rs is None: + np.random.seed(0) + else: + np.random.seed(int(1e9*rs.rand())) + + n = len(cds) + + constraints = [dc.AvoidPattern(x, location=0) for x in avoid] + constraints += [dc.EnforceTranslation(location=(0, n))] + + problem = dc.DnaOptimizationProblem( + sequence=cds, + constraints=constraints, + objectives=[dc.AvoidChanges()], + logger=None + ) + problem.max_iters = 50 + problem.resolve_constraints() + problem.optimize() + + assert not restriction_sites_in_dna(problem.sequence, avoid) + return problem.sequence + + +def pairwise_levenshtein(seqs): + """Calculate distance matrix for a list of sequences. + """ + from Levenshtein import distance + arr = [] + for i, a in enumerate(seqs): + for j, b in enumerate(seqs[i+1:]): + arr += [(i, i + j + 1, distance(a,b))] + n = len(seqs) + D = np.zeros((n, n), dtype=int) + i, j, d = zip(*arr) + D[i, j] = d + return D + D.T + + +def generate_adapters(priming_length=12, enzyme='BsmBI', tm_range=2, target_GC=0.5, + max_calculate=10_000, num_sample=100_000, distance_threshold=6, seed=0): + """Tm is calculated for priming site plus restriction site. A target Tm is defined + based on the average of sequences within 10% of target_GC. + """ + from Levenshtein import distance + from scipy.sparse import csr_matrix + from .utils import maxy_clique_groups + from Bio.SeqUtils import MeltingTemp as mt + + enzymes = {'BsmBI': 'CGTCTC'} + site = enzymes[enzyme] + avoid = 'AAA', 'GGG', 'CCC', 'TTT', site, reverse_complement(site) + rs = np.random.RandomState(seed) + candidates = [''.join(x) for x in rs.choice(list('ACTG'), size=(num_sample, priming_length))] + candidates = [x for x in candidates if not any(y in x for y in avoid)] + + df_adapters = pd.DataFrame({'sequence': candidates}) + + df_adapters['Tm'] = (df_adapters['sequence'] + site).apply(mt.Tm_NN) + df_adapters['GC'] = df_adapters['sequence'].str.count('G|C') / df_adapters['sequence'].str.len() + + mean_tm = df_adapters.query('abs(GC - @target_GC) < 0.1')['Tm'].mean() + df_adapters = df_adapters.query('abs(Tm - @mean_tm) < @tm_range/2') + + seqs = df_adapters['sequence'][:max_calculate] + print(f'Calculating pairwise distances for {len(seqs):,} priming sites') + D = pairwise_levenshtein(seqs) + + print(f'Selecting priming sites with pairwise distance >= {distance_threshold}') + + A = distance_threshold <= D + cm = csr_matrix(1-A, shape=A.shape) + centers = np.array(maxy_clique_groups(cm, [1] * A.shape[0])) + + return df_adapters['sequence'].iloc[centers].pipe(list) + + +def kmers_unique(s, k): + kmers = get_kmers(s, k) + return len(kmers) == len(set(kmers)) + + +def translate_to_stop(dna): + n = len(dna) % 3 + if n != 0: + dna = dna[:-(len(dna) % 3)] + dna = dna.upper() + assert len(dna) % 3 == 0 + aa = translate_dna(dna) + if '*' in aa: + return aa.split('*')[0] + return aa + # assert '*' in aa + + +def find_aa_in_dna(aa, dna): + """Simple case of amino acid substring in DNA. + """ + dna = dna[:-(len(dna) % 3)].upper() + aa_ = translate_dna(dna) + i = aa_.index(aa) + return dna[i * 3:(i + len(aa)) * 3] + + +def sequence_sap_score(x): + """Sum of SAP weights for all residues, regardless of solvent exposure. 
+ """ + from .ppi.constants import SAP_WEIGHTS + return sum([SAP_WEIGHTS[aa] for aa in x]) \ No newline at end of file diff --git a/src/ngs_analysis/timer.py b/src/ngs_analysis/timer.py new file mode 100644 index 0000000..9f31c13 --- /dev/null +++ b/src/ngs_analysis/timer.py @@ -0,0 +1,112 @@ +"""A timer that can be used in a "with" statement. The results +can be printed out or saved to a dictionary (repeat uses in the +same closure will be saved too). +""" +import time +import inspect +import uuid + +_timings = {} +invocation_ids = {} + +class Timer: + def __init__(self, name=None, verbose=False): + """A useful timer. See `example` below. + + :param name: used in timings dictionary, if None then not added to + dictionary + :param verbose: if True, print the name and time elapsed when finished; + if a string, print the string when starting, and time elapsed when + finished + """ + self.name = name + self.verbose = verbose + + def __enter__(self): + self.start = time.time() + self.closure = identify_this_function(1) + if isinstance(self.verbose, str): + print(self.verbose, end=' ', flush=True) + return self + + def __exit__(self, type, value, traceback): + end = time.time() + elapsed_time = end - self.start + if self.verbose is True: + if self.name is None: + print('Done in: {elapsed_time:.3g} seconds') + return + else: + print(f'{self.name}: {elapsed_time:.3g} seconds') + elif isinstance(self.verbose, str): + print(f'done in {elapsed_time:.3g} seconds') + + this = self.closure + + if this not in _timings: + _timings[this] = {} + + if self.name in _timings[this]: + if isinstance(_timings[this][self.name], list): + _timings[this][self.name].append(elapsed_time) + else: + # convert to list + _timings[this][self.name] = [_timings[this][self.name], elapsed_time] + else: + _timings[this][self.name] = elapsed_time + + +def get_all_timings(): + """Gets timings measured in the parent frame. + """ + return _timings.get(identify_this_function(1), {}) + + +def example(): + + def f_0(): + with Timer('task1'): + time.sleep(0.5) + with Timer('task1'): + time.sleep(0.2) + print(get_all_timings()) + + def f_1(): + with Timer('task2', verbose=True): + time.sleep(1) + print(get_all_timings()) + + def f_2(): + for _ in range(5): + with Timer('task3'): + time.sleep(0.1) + print(get_all_timings()) + + def f_3(): + with Timer('task4', verbose='Running task4...'): + time.sleep(3) + print(get_all_timings()) + + f_0() + f_1() + f_2() + f_3() + f_0() + + +def identify_this_function(go_up=0): + """Get a unique ID specific to this invocation of the function. + + :param go_up: how many extra frames to go up in the stack + """ + frame = inspect.currentframe() + caller_frame = frame.f_back + for _ in range(go_up): + caller_frame = caller_frame.f_back + + frame_id = id(caller_frame) + + if frame_id not in invocation_ids: + invocation_ids[frame_id] = str(uuid.uuid4()) + + return invocation_ids[frame_id]