From ba66817b3e70590d824eb14721445f0c171dbce8 Mon Sep 17 00:00:00 2001 From: Postuma Date: Wed, 3 Nov 2021 09:55:03 +0100 Subject: [PATCH 1/5] Improved performance of clone_filter by using a newer version of stacks --- src/env/stacks.yaml | 276 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 271 insertions(+), 5 deletions(-) diff --git a/src/env/stacks.yaml b/src/env/stacks.yaml index 4bf3fd9..13caee7 100644 --- a/src/env/stacks.yaml +++ b/src/env/stacks.yaml @@ -1,8 +1,274 @@ +name: stacks2.6 channels: - - conda-forge - bioconda + - conda-forge + - defaults + - r dependencies: - - python=3.6.3=0 - - sqlite=3.13.0=1 - - stacks=2.0Beta8=pl5.22.0_0 - + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2021.10.26=h06a4308_2 + - cairo=1.16.0=h6cf1ce9_1008 + - curl=7.79.1=h494985f_1 + - expat=2.4.1=h9c3ff4c_0 + - fontconfig=2.13.1=hba837de_1005 + - freetype=2.10.4=h0708190_1 + - fribidi=1.0.10=h516909a_0 + - gettext=0.21.0=hf68c758_0 + - giflib=5.2.1=h516909a_2 + - graphite2=1.3.14=h23475e2_0 + - graphviz=2.42.3=h0511662_0 + - harfbuzz=2.9.1=h83ec7ef_1 + - icu=68.1=h58526e2_0 + - jpeg=9d=h516909a_0 + - krb5=1.19.2=h48eae69_2 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libcurl=7.79.1=h494985f_1 + - libdb=6.2.32=he1b5a44_0 + - libedit=3.1.20210714=h7f8727e_0 + - libev=4.33=h516909a_1 + - libffi=3.4.2=h9c3ff4c_4 + - libgcc=7.2.0=h69d50b8_2 + - libgcc-ng=11.2.0=h1d223b6_11 + - libgd=2.2.5=h8e06009_4 + - libglib=2.70.0=h174f98d_1 + - libgomp=11.2.0=h1d223b6_11 + - libiconv=1.16=h516909a_0 + - libnghttp2=1.43.0=ha19adfc_1 + - libnsl=2.0.0=h7f98852_0 + - libpng=1.6.37=hed695b0_2 + - libssh2=1.10.0=ha35d2d1_2 + - libstdcxx-ng=11.2.0=he4da1e4_11 + - libtiff=4.1.0=hc3755c2_3 + - libtool=2.4.6=h9c3ff4c_1008 + - libuuid=2.32.1=h14c3975_1000 + - libwebp=1.0.2=h56121f0_5 + - libxcb=1.14=h7b6447c_0 + - libxml2=2.9.12=h72842e0_0 + - libxslt=1.1.33=h15afd5d_2 + - 
libzlib=1.2.11=h36c2ea0_1013 + - llvm-openmp=8.0.1=hc9558a2_0 + - lz4-c=1.9.3=h9c3ff4c_1 + - ncurses=6.2=h58526e2_4 + - openmp=8.0.1=0 + - openssl=3.0.0=h7f98852_1 + - pango=1.42.4=h80147aa_5 + - pcre=8.45=h9c3ff4c_0 + - perl=5.26.2=h36c2ea0_1008 + - perl-aceperl=1.92=pl526_2 + - perl-algorithm-munkres=0.08=pl526_1 + - perl-apache-test=1.40=pl526_1 + - perl-app-cpanminus=1.7044=pl526_1 + - perl-appconfig=1.71=pl526_1 + - perl-array-compare=3.0.1=pl526_1 + - perl-autoloader=5.74=pl526_2 + - perl-base=2.23=pl526_1 + - perl-bio-phylo=0.58=pl526_1 + - perl-bioperl-core=1.007002=pl526_1 + - perl-business-isbn=3.004=pl526_0 + - perl-business-isbn-data=20140910.003=pl526_0 + - perl-cache-cache=1.08=pl526_0 + - perl-carp=1.38=pl526_3 + - perl-cgi=4.44=pl526h14c3975_1 + - perl-class-inspector=1.34=pl526_0 + - perl-class-load=0.25=pl526_0 + - perl-class-load-xs=0.10=pl526h6bb024c_2 + - perl-class-method-modifiers=2.12=pl526_0 + - perl-clone=0.42=pl526h516909a_0 + - perl-common-sense=3.74=pl526_2 + - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 + - perl-constant=1.33=pl526_1 + - perl-convert-binary-c=0.78=pl526h6bb024c_3 + - perl-convert-binhex=1.125=pl526_1 + - perl-crypt-rc4=2.02=pl526_1 + - perl-data-dumper=2.173=pl526_0 + - perl-data-optlist=0.110=pl526_2 + - perl-data-stag=0.14=pl526_1 + - perl-date-format=2.30=pl526_2 + - perl-db-file=1.855=pl526h516909a_0 + - perl-dbd-sqlite=1.64=pl526h516909a_0 + - perl-dbi=1.642=pl526_0 + - perl-devel-globaldestruction=0.14=pl526_0 + - perl-devel-overloadinfo=0.005=pl526_0 + - perl-devel-stacktrace=2.04=pl526_0 + - perl-digest-hmac=1.03=pl526_3 + - perl-digest-md5=2.55=pl526_0 + - perl-digest-perl-md5=1.9=pl526_1 + - perl-digest-sha1=2.13=pl526h6bb024c_1 + - perl-dist-checkconflicts=0.11=pl526_2 + - perl-dynaloader=1.25=pl526_1 + - perl-email-date-format=1.005=pl526_2 + - perl-encode=2.88=pl526_1 + - perl-encode-locale=1.05=pl526_6 + - perl-error=0.17027=pl526_1 + - perl-eval-closure=0.14=pl526h6bb024c_4 + - 
perl-exporter=5.72=pl526_1 + - perl-exporter-tiny=1.002001=pl526_0 + - perl-extutils-makemaker=7.36=pl526_1 + - perl-file-listing=6.04=pl526_1 + - perl-file-path=2.16=pl526_0 + - perl-file-slurp-tiny=0.004=pl526_1 + - perl-file-spec=3.48_01=pl526_1 + - perl-file-temp=0.2304=pl526_2 + - perl-file-which=1.23=pl526_0 + - perl-font-afm=1.20=pl526_2 + - perl-font-ttf=1.06=pl526_0 + - perl-gd=2.71=pl526he860b03_0 + - perl-getopt-long=2.50=pl526_1 + - perl-graph=0.9704=pl526_1 + - perl-graphviz=2.24=pl526h734ff71_0 + - perl-html-element-extended=1.18=pl526_1 + - perl-html-entities-numbered=0.04=pl526_1 + - perl-html-formatter=2.16=pl526_0 + - perl-html-parser=3.72=pl526h6bb024c_5 + - perl-html-tableextract=2.13=pl526_2 + - perl-html-tagset=3.20=pl526_3 + - perl-html-tidy=1.60=pl526_0 + - perl-html-tree=5.07=pl526_1 + - perl-html-treebuilder-xpath=0.14=pl526_1 + - perl-http-cookies=6.04=pl526_0 + - perl-http-daemon=6.01=pl526_1 + - perl-http-date=6.02=pl526_3 + - perl-http-message=6.18=pl526_0 + - perl-http-negotiate=6.01=pl526_3 + - perl-image-info=1.38=pl526_1 + - perl-image-size=3.300=pl526_2 + - perl-io-html=1.001=pl526_2 + - perl-io-sessiondata=1.03=pl526_1 + - perl-io-socket-ssl=2.066=pl526_0 + - perl-io-string=1.08=pl526_3 + - perl-io-stringy=2.111=pl526_1 + - perl-io-tty=1.12=pl526_1 + - perl-ipc-run=20180523.0=pl526_0 + - perl-ipc-sharelite=0.17=pl526h6bb024c_1 + - perl-jcode=2.07=pl526_2 + - perl-json=4.02=pl526_0 + - perl-json-xs=2.34=pl526h6bb024c_3 + - perl-lib=0.63=pl526_1 + - perl-libwww-perl=6.39=pl526_0 + - perl-libxml-perl=0.08=pl526_2 + - perl-list-moreutils=0.428=pl526_1 + - perl-list-moreutils-xs=0.428=pl526_0 + - perl-lwp-mediatypes=6.04=pl526_0 + - perl-lwp-protocol-https=6.07=pl526_4 + - perl-lwp-simple=6.15=pl526h470a237_4 + - perl-mailtools=2.21=pl526_0 + - perl-math-cdf=0.1=pl526h14c3975_5 + - perl-math-derivative=1.01=pl526_0 + - perl-math-random=0.72=pl526h14c3975_2 + - perl-math-spline=0.02=pl526_2 + - perl-mime-base64=3.15=pl526_1 + - 
perl-mime-lite=3.030=pl526_1 + - perl-mime-tools=5.508=pl526_1 + - perl-mime-types=2.17=pl526_0 + - perl-mldbm=2.05=pl526_1 + - perl-module-implementation=0.09=pl526_2 + - perl-module-runtime=0.016=pl526_1 + - perl-module-runtime-conflicts=0.003=pl526_0 + - perl-moo=2.003004=pl526_0 + - perl-moose=2.2011=pl526hf484d3e_1 + - perl-mozilla-ca=20180117=pl526_1 + - perl-mro-compat=0.13=pl526_0 + - perl-net-http=6.19=pl526_0 + - perl-net-ssleay=1.74=0 + - perl-ntlm=1.09=pl526_4 + - perl-ole-storage_lite=0.19=pl526_3 + - perl-package-deprecationmanager=0.17=pl526_0 + - perl-package-stash=0.38=pl526hf484d3e_1 + - perl-package-stash-xs=0.28=pl526hf484d3e_1 + - perl-params-util=1.07=pl526h6bb024c_4 + - perl-parent=0.236=pl526_1 + - perl-parse-recdescent=1.967015=pl526_0 + - perl-pathtools=3.75=pl526h14c3975_1 + - perl-pdf-api2=2.035=pl526_0 + - perl-pod-escapes=1.07=pl526_1 + - perl-pod-usage=1.69=pl526_1 + - perl-posix=1.38_03=pl526_1 + - perl-postscript=0.06=pl526_2 + - perl-role-tiny=2.000008=pl526_0 + - perl-scalar-list-utils=1.52=pl526h516909a_0 + - perl-set-scalar=1.29=pl526_2 + - perl-soap-lite=1.19=pl526_1 + - perl-socket=2.027=pl526_1 + - perl-sort-naturally=1.03=pl526_2 + - perl-spreadsheet-parseexcel=0.65=pl526_2 + - perl-spreadsheet-writeexcel=2.40=pl526_2 + - perl-statistics-descriptive=3.0702=pl526_0 + - perl-storable=3.15=pl526h14c3975_0 + - perl-sub-exporter=0.987=pl526_2 + - perl-sub-exporter-progressive=0.001013=pl526_0 + - perl-sub-identify=0.14=pl526h14c3975_0 + - perl-sub-install=0.928=pl526_2 + - perl-sub-name=0.21=pl526_1 + - perl-sub-quote=2.006003=pl526_1 + - perl-svg=2.84=pl526_0 + - perl-svg-graph=0.02=pl526_3 + - perl-task-weaken=1.06=pl526_0 + - perl-template-toolkit=2.26=pl526_1 + - perl-test=1.26=pl526_1 + - perl-test-harness=3.42=pl526_0 + - perl-test-leaktrace=0.16=pl526h14c3975_2 + - perl-test-requiresinternet=0.05=pl526_0 + - perl-threaded=5.26.0=0 + - perl-tie-ixhash=1.23=pl526_2 + - perl-time-hires=1.9760=pl526h14c3975_1 + - 
perl-time-local=1.28=pl526_1 + - perl-timedate=2.30=pl526_1 + - perl-tree-dag_node=1.31=pl526_0 + - perl-try-tiny=0.30=pl526_1 + - perl-type-tiny=1.004004=pl526_0 + - perl-types-serialiser=1.0=pl526_2 + - perl-unicode-map=0.112=pl526h6bb024c_3 + - perl-uri=1.76=pl526_0 + - perl-www-robotrules=6.02=pl526_3 + - perl-xml-dom=1.46=pl526_0 + - perl-xml-dom-xpath=0.14=pl526_1 + - perl-xml-filter-buffertext=1.01=pl526_2 + - perl-xml-libxml=2.0132=pl526h7ec2d77_1 + - perl-xml-libxslt=1.94=pl526_1 + - perl-xml-namespacesupport=1.12=pl526_0 + - perl-xml-parser=2.44_01=pl5262hc3e0081_1002 + - perl-xml-regexp=0.04=pl526_2 + - perl-xml-sax=1.02=pl526_0 + - perl-xml-sax-base=1.09=pl526_0 + - perl-xml-sax-expat=0.51=pl526_3 + - perl-xml-sax-writer=0.57=pl526_0 + - perl-xml-simple=2.25=pl526_1 + - perl-xml-twig=3.52=pl526_2 + - perl-xml-writer=0.625=pl526_2 + - perl-xml-xpath=1.44=pl526_0 + - perl-xml-xpathengine=0.14=pl526_2 + - perl-xsloader=0.24=pl526_0 + - perl-yaml=1.29=pl526_0 + - pip=21.3.1=pyhd8ed1ab_0 + - pixman=0.40.0=h36c2ea0_0 + - python=3.10.0=h543edf9_2_cpython + - readline=8.1=h46c0cb4_0 + - samtools=1.7=1 + - sqlite=3.36.0=h9cd32fc_2 + - stacks=2.55=h9a82719_1 + - tidyp=1.04=h779adbc_3 + - tk=8.6.11=h27826a3_1 + - tzdata=2021e=he74cb21_0 + - velvet=1.2.10=h5bf99c6_4 + - wheel=0.37.0=pyhd8ed1ab_1 + - xorg-kbproto=1.0.7=h14c3975_1002 + - xorg-libice=1.0.10=h516909a_0 + - xorg-libsm=1.2.3=hd9c2040_1000 + - xorg-libx11=1.6.12=h516909a_0 + - xorg-libxext=1.3.4=h516909a_0 + - xorg-libxpm=3.5.13=h516909a_0 + - xorg-libxrender=0.9.10=h516909a_1002 + - xorg-libxt=1.1.5=h516909a_1003 + - xorg-renderproto=0.11.1=h14c3975_1002 + - xorg-xextproto=7.3.0=h14c3975_1002 + - xorg-xproto=7.0.31=h14c3975_1007 + - xz=5.2.5=h516909a_1 + - zlib=1.2.11=h36c2ea0_1013 + - zstd=1.4.9=ha95c52a_0 + - pip: + - certifi==2021.5.30 + - setuptools==58.0.4 +prefix: /scratch/conda/envs/stacks2.6 From ffa7385721326314d92b13526f9056c1c516a6c9 Mon Sep 17 00:00:00 2001 From: Postuma Date: Wed, 19 Jan 
2022 11:42:16 +0100 Subject: [PATCH 2/5] Added polyG problem and clone_filter memory issues to the error section of the readme --- README.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e3163c4..c7495d9 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,20 @@ Recommendation: Run fastq-screen in bisulphite mode on raw data to determine sou ## Fix errors -### Clone percentage +### Clone removal + +#### Problem: +The clone_filter process does not procede, due insufficient memory available in the server + +#### Fix: + +- Run clone_filter outside of the pipeline using the following command: `clone_filter -1 R1.fq.gz -2 R2.fq.gz -o ./ --inline_inline -igzfastq --oligo_len_1 3 --oligo_len_2 3 ` +- Change the values of Wobble_R1/Wobble_R2 the barcode file to 0 and set the input/ reads to the output of the command above. +- If the previous solution does not work NGSReadsTreatment can be used instead. (paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6690869/) (download: https://sourceforge.net/projects/ngsreadstreatment/) +- The following command filters the clones: `java -Xmx32g -jar NgsReadsTreatment_v1.3.jar prefix_R1.fastq prefix_R2.fastq 32` +- After which the wobble bases should be removed for this you can use fastp https://github.com/OpenGene/fastp `fastp --trim_front1 3 --trim_front2 3 --disable_adapter_trimming --disable_trim_poly_g --disable_quality_filtering --in1 prefix_R1_1_trated.fastq --in2 prefix_R2_2_trated.fastq --out1 prefix_R1.deRepNoWobble.fq.gz --out2 prefix_R2.deRepNoWobble.fq.gz` +- The `prefix_R1.deRepNoWobble.fq.gz`/ `prefix_R2.deRepNoWobble.fq.gz` files can be used as input for the pipeline, after setting the Wobble_R1/Wobble_R2 at 0 in the barcode file. 
+ #### Problem: I have a very high percentage of clone reads @@ -165,6 +178,13 @@ I have a very high percentage of clone reads ### Demultiplexing +#### Problem: +There are many reads that are lost due to polyG (GGGGG) stretches at the beginning of the reads + +#### Fix: +- These are filtered in the process_radtags step as they can not be assigned to any individual. +- We are not sure yet what causes these polyGs. + #### Problem: One or more samples have small amounts of recovered reads or read numbers differ a lot between different samples. From 80b2b964f9cdbc9912000e7bafe3dacd918357ad Mon Sep 17 00:00:00 2001 From: Postuma Date: Thu, 20 Jan 2022 17:23:05 +0100 Subject: [PATCH 3/5] Update README.md --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c7495d9..7bf1e60 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,16 @@ Recommendation: Run fastq-screen in bisulphite mode on raw data to determine sou ## Fix errors +### Installation + +#### Problem: +`conda create -n snake snakemake=6.1.1` did not work + +#### Fix + +- install mamba in your base conda `conda install mamba` +- run `mamba create -n snake snakemake=6.1.1` + ### Clone removal #### Problem: @@ -184,7 +194,8 @@ There are many reads that are lost due to polyG (GGGGG) stretches at the beginni #### Fix: - These are filtered in the process_radtags step as they can not be assigned to any individual. - We are not sure yet what causes these polyGs. 
- +- If the poly G's are still present in the trimmed data change line 17 in src/rules/trimming.rules into +`cutadapt -a AGATCGGAAGAGC -A AGATCGGAAGAGC -u 1 -U 1 -m 20 -a G{{10}} -A G{{10}} -N --info-file test.log --untrimmed-output testR1Untrimmed.fq.gz --untrimmed-paired-output testR2Untrimmed.fq.gz -o testR1Trimmed.fq.gz -p testR2Trimmed.fq.gz output/output_demultiplex/clone-stacks/150-Crick.1.fq.gz output/output_demultiplex/clone-stacks/150-Crick.2.fq.gz 2>&1 | tee test2.log` #### Problem: One or more samples have small amounts of recovered reads or read numbers differ a lot between different samples. From 3948a2a2699c1d1aeb69371c20b672848bf68d91 Mon Sep 17 00:00:00 2001 From: Postuma Date: Thu, 20 Jan 2022 17:37:35 +0100 Subject: [PATCH 4/5] Final touches to readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7bf1e60..93d1f61 100644 --- a/README.md +++ b/README.md @@ -156,7 +156,7 @@ Recommendation: Run fastq-screen in bisulphite mode on raw data to determine sou ### Installation #### Problem: -`conda create -n snake snakemake=6.1.1` did not work +`conda create -n snake snakemake=6.1.1` did not work / environment takes a long time to solve #### Fix From 9ccb4b1bb3fee9c48a95ae3bf37837785ff2e551 Mon Sep 17 00:00:00 2001 From: FleurGaBru Date: Tue, 25 Jan 2022 20:55:05 +0100 Subject: [PATCH 5/5] added some minor changes - edited some typo's - added creation of the snake env with mamba to the preparation section - added the zenodo link --- README.md | 60 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 93d1f61..c5b65d0 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,11 @@ - [Explanation of files in the output directory](#explanation-of-files-in-the-output-directory) - [When not to run the pipeline?](#when-not-to-run-the-pipeline) - [Quality control or "How to discover 
errors?"](#quality-control-or-how-to-discover-errors) -- [Example Config Files](#example-config-files) +- [Fix errors](#fix-errors) +- [Example Data and Config Files](#example-data-and-config-files) - [List of used software and references](#list-of-used-software-and-references) -## Prerequisites of bioinformatics skills and infrastructure and wetlab preparations for running the pipeline +## Prerequisites for running the pipeline - A basic knowledge of Linux: - Knowledge, about how to work with files and directories (cd, ls, nano) @@ -36,8 +37,10 @@ ## Preparation to run the pipeline - Make a conda environment for snakemake if snakemake is not installed globally on the server. You do not need administrator rights to do this but conda has to be installed (see [Prerequisites for running the pipeline](#prerequisites-for-running-the-pipeline)). - - `conda create -n snake snakemake=6.1.1` + - `conda create -n snake` - `conda activate snake` + - `conda install -c conda-forge mamba` + - `mamba install -c bioconda snakemake=6.1.1` - Make a copy of the pipeline - `git clone https://github.com/nioo-knaw/epiGBS2.git` - Enter the created directory: @@ -153,29 +156,22 @@ Recommendation: Run fastq-screen in bisulphite mode on raw data to determine sou ## Fix errors -### Installation - -#### Problem: -`conda create -n snake snakemake=6.1.1` did not work / environment takes a long time to solve - -#### Fix - -- install mamba in your base conda `conda install mamba` -- run `mamba create -n snake snakemake=6.1.1` - ### Clone removal #### Problem: -The clone_filter process does not procede, due insufficient memory available in the server +The clone_filter process does not proceed due to insufficient memory availability on the server #### Fix: -- Run clone_filter outside of the pipeline using the following command: `clone_filter -1 R1.fq.gz -2 R2.fq.gz -o ./ --inline_inline -igzfastq --oligo_len_1 3 --oligo_len_2 3 ` -- Change the values of Wobble_R1/Wobble_R2 the barcode file to 0 and 
set the input/ reads to the output of the command above. -- If the previous solution does not work NGSReadsTreatment can be used instead. (paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6690869/) (download: https://sourceforge.net/projects/ngsreadstreatment/) -- The following command filters the clones: `java -Xmx32g -jar NgsReadsTreatment_v1.3.jar prefix_R1.fastq prefix_R2.fastq 32` -- After which the wobble bases should be removed for this you can use fastp https://github.com/OpenGene/fastp `fastp --trim_front1 3 --trim_front2 3 --disable_adapter_trimming --disable_trim_poly_g --disable_quality_filtering --in1 prefix_R1_1_trated.fastq --in2 prefix_R2_2_trated.fastq --out1 prefix_R1.deRepNoWobble.fq.gz --out2 prefix_R2.deRepNoWobble.fq.gz` -- The `prefix_R1.deRepNoWobble.fq.gz`/ `prefix_R2.deRepNoWobble.fq.gz` files can be used as input for the pipeline, after setting the Wobble_R1/Wobble_R2 at 0 in the barcode file. +- Run clone_filter outside of the pipeline using the following command: + - `clone_filter -1 R1.fq.gz -2 R2.fq.gz -o ./ --inline_inline -igzfastq --oligo_len_1 3 --oligo_len_2 3 ` +- Change the values of Wobble_R1/Wobble_R2 in the barcode file to 0 and in the config.yaml file set the input_dir to the output directory of the command above. +- If the previous solution does not work, NGSReadsTreatment can be used instead. (paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6690869/) (download: https://sourceforge.net/projects/ngsreadstreatment/) +- The following command filters the clone reads: + - `java -Xmx32g -jar NgsReadsTreatment_v1.3.jar prefix_R1.fastq prefix_R2.fastq 32` +- Then the wobble sequence has to be removed manually. 
You can use [fastp](https://github.com/OpenGene/fastp) + - `fastp --trim_front1 3 --trim_front2 3 --disable_adapter_trimming --disable_trim_poly_g --disable_quality_filtering --in1 prefix_R1_1_trated.fastq --in2 prefix_R2_2_trated.fastq --out1 prefix_R1.deRepNoWobble.fq.gz --out2 prefix_R2.deRepNoWobble.fq.gz` +- After changing the values for Wobble_R1/Wobble_R2 to 0 in the barcode file, the `prefix_R1.deRepNoWobble.fq.gz`/ `prefix_R2.deRepNoWobble.fq.gz` files can be used as input for the pipeline. #### Problem: @@ -189,13 +185,15 @@ I have a very high percentage of clone reads ### Demultiplexing #### Problem: -There are many reads that are lost due to polyG (GGGGG) stretches at the beginning of the reads +There are many reads that are lost due to polyG (GGGGG) stretches at the beginning or end of the reads #### Fix: -- These are filtered in the process_radtags step as they can not be assigned to any individual. +- Reads containing polyG stretches at the start are removed in the process_radtags step as they cannot be assigned to any individual. - We are not sure yet what causes these polyGs. -- In the multiQC report you can check for an overrepresentation of G's in the Per Base Sequence Content plot. 
+- If the poly G's are present in the trimmed data, change line 17 in src/rules/trimming.rules to add additional polyG trimming: + - `cutadapt -a AGATCGGAAGAGC -A AGATCGGAAGAGC -u 1 -U 1 -m 20 -a G{{10}} -A G{{10}} -N --info-file test.log --untrimmed-output testR1Untrimmed.fq.gz --untrimmed-paired-output testR2Untrimmed.fq.gz -o testR1Trimmed.fq.gz -p testR2Trimmed.fq.gz output/output_demultiplex/clone-stacks/150-Crick.1.fq.gz output/output_demultiplex/clone-stacks/150-Crick.2.fq.gz 2>&1 | tee test2.log` + #### Problem: One or more samples have small amounts of recovered reads or read numbers differ a lot between different samples. @@ -223,9 +221,13 @@ The mapping percentage in de novo mode is low. #### Fix: - check and optimize the parameters of the de novo reference creation -## Example Config Files +## Example Data and Config Files + +### Example Data + +An example data set and barcode file are available at [Zenodo](https://zenodo.org/record/5878925). -### De novo +### Barcode file using the de novo branch ``` # path to output directory @@ -245,7 +247,7 @@ cycles : 150 barcodes: "barcodes.tsv" # the pipeline produces some temporary files. Please indicate the tmp location on your server (in most cases /tmp) -tmpdir : "/tmp/" +tmpdir : "/tmp" # mode of running pipeline (set denovo, reference or legacy. PLEASE NOTE: legacy is not supported) mode: "denovo" @@ -271,7 +273,7 @@ param_denovo: ``` -### Reference +### Barcode file using the Reference branch ``` # path to output directory @@ -291,7 +293,7 @@ cycles : 150 barcodes: "barcodes.tsv" # the pipeline produces some temporary files. Please indicate the tmp location on your server (in most cases /tmp) -tmpdir : "/tmp/" +tmpdir : "/tmp" # mode of running pipeline (set denovo, reference or legacy. PLEASE NOTE: legacy is not supported) mode: "reference"