Skip to content

Commit

Permalink
Deploying to gh-pages from @ 0256709 🚀
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Dec 28, 2024
0 parents commit 53b7e77
Show file tree
Hide file tree
Showing 446 changed files with 195,931 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .buildinfo
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 15f4af516f07d5ffed2074e8a10805cb
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added .doctrees/_autosummary/precellar.Assay.doctree
Binary file not shown.
Binary file not shown.
Binary file added .doctrees/_autosummary/precellar.Assay.id.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added .doctrees/_autosummary/precellar.align.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added .doctrees/api.doctree
Binary file not shown.
Binary file added .doctrees/environment.pickle
Binary file not shown.
Binary file added .doctrees/index.doctree
Binary file not shown.
Binary file added .doctrees/install.doctree
Binary file not shown.
249 changes: 249 additions & 0 deletions .doctrees/nbsphinx/tutorials/generic.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Processing barcoded Fastq files\n",
"\n",
"You will likely encounter barcoded fastq files when working with single-cell ATAC-seq data.\n",
"In the early days of single-cell ATAC-seq, cell barcodes were usually added to the read names of the fastq files.\n",
"This notebook demonstrates how to process these barcoded fastq files."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import precellar"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extracting cell barcodes from read names"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@CCAGCACAAGCCATCCTATCGT:A00953:155:HVCHLDRXX:1:1101:1036:1031 1:N:0:1\n",
"ANCTTGGATCATCAGGTTTGTCTGTAGCTGATTTATTTCTTTAAGTTTCCC\n",
"+\n",
"F#FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF\n",
"@TAACCACTACGAATGACTGACA:A00953:155:HVCHLDRXX:1:1101:1127:1031 1:N:0:1\n",
"TNCCAGGACCAGTGACCGTCACCCGCAGTAAGGATCGGGGCGGCTCCGCCA\n",
"+\n",
"F#:FFFFFFFFF:FFFFF:FF,F,FFFFFFFF,FFF:FFFF:FFFFFF,FF\n",
"@CGATATGTAGGGGACTAATTCC:A00953:155:HVCHLDRXX:1:1101:1145:1031 1:N:0:1\n",
"GNCGGATCACAAGGTCAGGAGTTCGAGACCTGGCTGGCCAACACGGTGAAA\n",
"\n",
"gzip: stdout: Broken pipe\n"
]
}
],
"source": [
"!zcat R1.fq.gz | head"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"precellar.utils.strip_barcode_from_fastq(\n",
" 'R1.fq.gz',\n",
" 'R1_processed.fq.zst',\n",
" out_barcode='I1.fq.zst',\n",
" regex=\"^([ACTG]+):\",\n",
" right_add=1,\n",
")\n",
"\n",
"precellar.utils.strip_barcode_from_fastq(\n",
" 'R2.fq.gz',\n",
" 'R2_processed.fq.zst',\n",
" regex=\"^([ACTG]+):\",\n",
" right_add=1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-10-04T06:24:33Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml is up-to-date\n"
]
}
],
"source": [
"assay = precellar.Assay(\"https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\n",
"└── atac(153-1150)\n",
" ├── atac-illumina_p5(29)\n",
" ├── atac-read1(34) [↓R1(1-98)✗]\n",
" ├── gDNA(1-1000)\n",
" ├── atac-read2(34) [↑R2(1-98)✗, ↓I1(22)✗]\n",
" ├── atac-cell_barcode(22)\n",
" └── atac-illumina_p7(24)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"assay"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-10-04T06:24:33Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m 'R1' may read through and contain sequences from: 'atac-read2'\n",
"\u001b[90m[\u001b[0m2024-10-04T06:24:33Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m 'R2' may read through and contain sequences from: 'atac-read1'\n"
]
}
],
"source": [
"assay.update_read(\"R1\", fastq=\"R1_processed.fq.zst\")\n",
"assay.update_read(\"I1\", fastq=\"I1.fq.zst\")\n",
"assay.update_read(\"R2\", fastq=\"R2_processed.fq.zst\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\n",
"└── atac(153-1150)\n",
" ├── atac-illumina_p5(29)\n",
" ├── atac-read1(34) [↓R1(51)✓]\n",
" ├── gDNA(1-1000)\n",
" ├── atac-read2(34) [↑R2(51)✓, ↓I1(22)✓]\n",
" ├── atac-cell_barcode(22)\n",
" └── atac-illumina_p7(24)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"assay"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-10-04T06:24:40Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
"\u001b[90m[\u001b[0m2024-10-04T06:24:40Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 2500 barcodes. 100.00% of them have an exact match in whitelist\n",
"\u001b[90m[\u001b[0m2024-10-04T06:24:40Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
"100%|██████████| 2500/2500 [00:00<00:00, 18920.08it/s]"
]
}
],
"source": [
"qc = precellar.align(\n",
" assay, \"/data/kzhang/GRCh38/hg38.fa.gz\",\n",
" modality=\"atac\",\n",
" output_fragment=\"atac_fragments.tsv.zst\",\n",
" num_threads=32,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'sequenced_reads': 5000.0,\n",
" 'frac_q30_bases_barcode': 1.0,\n",
" 'frac_fragment_in_nucleosome_free_region': 0.010427010923535254,\n",
" 'frac_q30_bases_read2': 0.9442745098039216,\n",
" 'frac_fragment_flanking_single_nucleosome': 0.0029791459781529296,\n",
" 'frac_valid_barcode': 1.0,\n",
" 'frac_unmapped': 0.07640000000000002,\n",
" 'frac_duplicates': 0.004940711462450593,\n",
" 'frac_nonnuclear': 0.0128,\n",
" 'sequenced_read_pairs': 2500.0,\n",
" 'frac_q30_bases_read1': 0.8179764705882353,\n",
" 'frac_confidently_mapped': 0.8524}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"qc"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 53b7e77

Please sign in to comment.