add

feldman4 · Dec 28, 2023 · 0f809e0 · 0f809e0
1 parent 8c8cbfa
commit 0f809e0
Show file tree

Hide file tree

Showing 11 changed files with 2,028 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+# Example data kept out of version control
+examples/nanopore
+
+# macOS Finder metadata (already-tracked copies still need `git rm --cached`)
+.DS_Store
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/README.md b/README.md
@@ -1 +1,25 @@
-# ngs-analysis
+# ngs-analysis
+
+Intended for analysis of sequencing reads that span multiple DNA or protein parts. For instance, given a library of protein variants linked to DNA barcodes, it can answer questions like:
+
+- How accurate are the variant sequences, at the DNA or protein level?
+- How frequently is the same barcode linked to two different variants?
+- Which reads contain parts required for function (e.g., a Kozak start sequence, or a fused protein tag)?
+
+This kind of analysis often involves parsing raw sequencing reads for DNA and/or protein sub-sequences (parts), then mapping the parts to a reference of anticipated part combinations. This package offers a simple workflow: 
+
+1. Define how to parse reads into parts using plain text expressions (no code)
+2. Test the parser on simulated DNA sequences (e.g., your vector map)
+3. Parse a batch of sequencing samples
+4. Map the (combination of) parts found in each read to your reference
+
+It’s been tested with Illumina paired-end reads and Oxford Nanopore long reads. Under the hood it uses [NGmerge](https://github.com/jsh58/NGmerge) to merge paired reads and [MMseqs2](https://github.com/soedinglab/MMseqs2) for sequence mapping. It is moderately performant: 1 million paired-end reads can be mapped to a reference of 100,000 variant-barcode pairs in ~1 minute.
+
+# Installation
+
+```bash
+pip install ngs-analysis
+```
+
+Tested on Linux and macOS (Apple Silicon).
+
diff --git a/environment.yaml b/environment.yaml
@@ -0,0 +1,19 @@
+# Conda environment for local, interactive use of ngs-analysis.
+name: ngs-analysis
+channels:
+  - conda-forge
+  - bioconda
+# Packages are listed alphabetically.
+dependencies:
+  - fire
+  - glob2
+  - ipykernel
+  - natsort
+  - pandas
+  - pandera
+  - parse
+  - pyarrow
+  - pytables
+  - python-levenshtein
+  - python-slugify
+  - pyyaml
+  - regex
+  - tqdm
diff --git a/examples/.DS_Store b/examples/.DS_Store
diff --git a/install.sh b/install.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# Local install for interactive use: creates the conda environment described
+# in environment.yaml and registers it as a Jupyter kernel for the current user.
+set -euo pipefail
+
+# Resolve environment.yaml relative to this script so the install also works
+# when the script is invoked from a different working directory.
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Must match the `name:` field in environment.yaml.
+env_name="ngs-analysis"
+
+micromamba create -f "${script_dir}/environment.yaml" -y
+micromamba run -n "${env_name}" python -m ipykernel install --user --name="${env_name}"
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,41 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "ngs-analysis"
+version = "0.0.1"
+description = "Analyze deep sequencing of complex libraries"
+authors = [
+  { name = "David Feldman" },
+]
+license = { file = "LICENSE.txt" }
+readme = "README.md"
+requires-python = ">=3.7"
+
+keywords = ["NGS", "library", "variant", "barcode"]
+
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+# Runtime requirements (PEP 508 strings), sorted alphabetically.
+# NOTE(review): environment.yaml additionally lists glob2, parse, pyarrow,
+# pytables, regex and tqdm, and does not list joblib -- confirm which of these
+# the package imports at runtime and reconcile the two lists.
+dependencies = [
+  "fire",
+  "joblib",
+  "natsort",
+  "pandas",
+  "pandera",
+  "python-levenshtein",
+  "python-slugify",
+  "pyyaml",
+]
+
+[project.urls]
+Homepage = "https://github.com/feldman4/ngs-analysis"
+"Bug Tracker" = "https://github.com/feldman4/ngs-analysis/issues"
+
+[project.scripts]
+# Console command `ngs_analysis` that invokes `ngs_analysis.app:main`.
+ngs_analysis = "ngs_analysis.app:main"
+