From 0db4ab620722791b6f5f847c81962de49401b036 Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Sat, 19 Oct 2019 21:25:12 +0800 Subject: [PATCH 01/11] some bug fixes. Tidy up after repeatmodeler which leaves lots of files --- main.nf | 77 ++++++++++++++++++++++++++++++++++++++++++++----- nextflow.config | 3 ++ 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 6c262ec..6de1430 100644 --- a/main.nf +++ b/main.nf @@ -31,7 +31,7 @@ def helpMessage() { --genomes "genomes/*.fasta" \ --repbase "downloads/RepBaseRepeatMaskerEdition-20181026.tar.gz" \ --rm_meta "downloads/RepeatMaskerMetaData-20181026.tar.gz" \ - --species "fungi" + --rm_species "fungi" ``` ## Parameters @@ -351,6 +351,7 @@ if ( params.dfam_hmm ) { label "download" label "small_task" + time "4h" publishDir "${params.outdir}/downloads" @@ -382,6 +383,7 @@ if ( params.dfam_embl ) { label "download" label "small_task" + time "1h" publishDir "${params.outdir}/downloads" @@ -413,6 +415,7 @@ if ( params.rm_repeatpeps ) { label "repeatmasker" label "small_task" + time "20m" publishDir "${params.outdir}/downloads" @@ -459,6 +462,7 @@ if ( params.rfam && params.rfam_clanin ) { label "download" label "small_task" + time "4h" publishDir "${params.outdir}/downloads" @@ -489,6 +493,7 @@ if ( params.rfam_gomapping ) { label "download" label "small_task" + time "1h" publishDir "${params.outdir}/downloads" @@ -519,7 +524,8 @@ if ( params.pfam ) { process getPfam { label "download" - label "small_task" + label "medium_task" + time "4h" publishDir "${params.outdir}/downloads" @@ -558,7 +564,6 @@ if ( params.pfam ) { if ( params.gypsydb ) { Channel .fromPath( params.gypsydb, checkIfExists: true, type: "file") - .collectFile(newLine: true) .set { gypsydb } } else { @@ -576,11 +581,12 @@ if ( params.gypsydb ) { label "download" label "small_task" + time "4h" publishDir "${params.outdir}/downloads" output: - file "gypsydb/*" into gypsydb + file "gypsydb/*" into gypsydb mode flatten script: """ @@ -597,9 +603,10 @@ process processGydb { label "posix" label "small_task" + time "20m" input: - file "stk/*" from gypsydb + file "stk/*" from gypsydb.collect() output: file "gypsydb.stk" into gypsyDBMSAs @@ -639,6 +646,7 @@ if ( params.mitefinder_profiles ) { label "mitefinder" label "small_task" + time "20m" output: file "pattern_scoring.txt" into miteFinderProfiles @@ -688,6 +696,7 @@ process runTRNAScan { label "trnascan" label "medium_task" + time "5h" publishDir "${params.outdir}/noncoding/${name}" @@ -733,6 +742,7 @@ process getTRNAScanGFF { label "gffpal" label "small_task" + time "1h" tag "${name}" @@ -760,6 +770,7 @@ process pressRfam { label "infernal" label "small_task" + time "2h" when: run_infernal @@ -787,6 +798,7 @@ process chunkifyGenomes { label "python3" label "small_task" + time "1h" tag "${name}" @@ -821,6 +833,7 @@ process runInfernal { label "infernal" label "small_task" + time "5h" tag "${name}" @@ -879,6 +892,7 @@ process tidyInfernalMatches { label "gffpal" label "small_task" + time "1h" tag "${name}" @@ -913,6 +927,7 @@ process combineInfernal { label "genometools" label "small_task" + time "1h" tag "${name}" @@ -954,6 +969,7 @@ process runRNAmmer { label "rnammer" label "small_task" + time "5h" publishDir "${params.outdir}/noncoding/${name}", saveAs: exclude_unclean @@ -987,7 +1003,7 @@ process getRNAmmerGFF { label "gffpal" label "small_task" - + time "1h" tag "${name}" @@ -1016,7 +1032,9 @@ process runOcculterCut { label "occultercut" label "small_task" + time "4h" tag "${name}" + publishDir "${params.outdir}/noncoding/${name}" input: @@ -1064,6 +1082,7 @@ process getOcculterCutRegionFrequencies { label "gffpal" label "small_task" + time "2h" tag "${name}" @@ -1094,6 +1113,7 @@ process getOcculterCutGroupedRegionFrequencies { label "gffpal" label "small_task" + time "2h" tag "${name}" @@ -1130,6 +1150,7 @@ process tidyOcculterCutGFFs { label "genometools" label "small_task" + time "1h" tag "${name} - ${suffix}" @@ -1168,6 +1189,7 @@ process prepRepeatMaskerDB { label "repeatmasker" label "small_task" + time "3h" input: file "repbase.tar.gz" from repbase @@ -1238,6 +1260,7 @@ process runRepeatModeler { label "repeatmasker" label "big_task" + time "1d" tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -1278,7 +1301,7 @@ process runRepeatModeler { mv "${name}-families.stk" "${name}_repeatmodeler.stk" fi - rm -rf -- rmlib_tmp + rm -rf -- rmlib_tmp RM_* """ } @@ -1297,6 +1320,8 @@ process getRepeatModelerFasta { label "python3" label "small_task" + time "1h" + tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -1322,6 +1347,8 @@ process getRepeatModelerGFF { label "gffpal" label "small_task" + time "1h" + tag "${name}" input: @@ -1352,6 +1379,7 @@ process getMMSeqsGenomes { label "mmseqs" label "small_task" + time "2h" tag "${name}" @@ -1391,6 +1419,7 @@ process getMSAAttributes { label "gffpal" label "small_task" + time "1h" tag "${db}" @@ -1414,6 +1443,7 @@ process getMMSeqsProfiles { label "mmseqs" label "medium_task" + time "4h" tag "${db}" @@ -1456,6 +1486,7 @@ process searchProfilesVsGenomes { label "mmseqs" label "medium_task" + time "4h" tag "${name} - ${db}" publishDir "${params.outdir}/tes/${name}" @@ -1515,6 +1546,7 @@ process getMMSeqsGenomeGFFs { label "gffpal" label "small_task" + time "1h" tag "${name} - ${db}" @@ -1548,6 +1580,7 @@ process getMMSeqsGenomeFastas { label "genometools" label "small_task" + time "1h" tag "${name} - ${db}" @@ -1600,6 +1633,7 @@ process getGtSuffixArrays { label "genometools" label "small_task" + time "2h" tag "${name}" @@ -1646,6 +1680,7 @@ process runLtrHarvest { label "genometools" label "small_task" + time "4h" tag "${name}" @@ -1691,6 +1726,8 @@ process runLtrDigest { label "genometools" label "small_task" + time "4h" + tag "${name}" publishDir "${params.outdir}/tes/${name}", saveAs: exclude_unclean @@ -1765,6 +1802,8 @@ process runEAHelitron { label "eahelitron" label "small_task" + time "6h" + tag "${name}" publishDir "${params.outdir}/tes/${name}", saveAs: exclude_unclean @@ -1809,6 +1848,7 @@ process filterEAHelitronGFF { label "gffpal" label "small_task" + time "1h" tag "${name}" @@ -1840,6 +1880,8 @@ process filterEAHelitronGFF { process runMiteFinder { label "mitefinder" label "small_task" + time "6h" + tag "${name}" input: @@ -1867,6 +1909,8 @@ process getMiteFInderGFFs { label "gffpal" label "small_task" + time "1h" + tag "${name}" input: @@ -1900,6 +1944,8 @@ process getMiteFinderFastas { label "genometools" label "small_task" + time "1h" + tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -1945,6 +1991,7 @@ process combineTEFastas { label "posix" label "small_task" + time "2h" publishDir "${params.outdir}/pantes" @@ -1985,6 +2032,7 @@ process clusterTEFastas { label "vsearch" label "medium_task" + time "1d" publishDir "${params.outdir}/pantes" @@ -2018,6 +2066,7 @@ process filterTEClusters { label "python3" label "small_task" + time "4h" publishDir "${params.outdir}/pantes" @@ -2058,6 +2107,7 @@ process getClusterMSAs { label "decipher" label "big_task" + time "1d" publishDir "${params.outdir}/pantes" @@ -2098,6 +2148,7 @@ process getClusterMSAConsensus { label "decipher" label "medium_task" + time "6h" publishDir "${params.outdir}/pantes" @@ -2131,6 +2182,7 @@ process getClusterMSAStockholm { label "python3" label "small_task" + time "2h" publishDir "${params.outdir}/pantes" @@ -2159,6 +2211,7 @@ process runRepeatClassifier { label "repeatmasker" label "big_task" + time "12h" publishDir "${params.outdir}/pantes" @@ -2200,6 +2253,8 @@ process runRepeatMaskerSpecies { label "repeatmasker" label "medium_task" + time "1d" + tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -2251,6 +2306,8 @@ process runRepeatMasker { label "repeatmasker" label "medium_task" + time "1d" + tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -2295,6 +2352,7 @@ process getRepeatMaskerGFF { label "gffpal" label "small_task" + time "2h" tag "${name} - ${analysis}" @@ -2304,7 +2362,7 @@ process getRepeatMaskerGFF { file("rm.out") from repeatMaskerResults .map { n, o -> [n, "repeatmasker", o] } .mix( repeatMaskerSpeciesResults - .map {n, o -> [n, "repeatmasker_species" ]} ) + .map {n, o -> [n, "repeatmasker_species", o ]} ) output: set val(name), @@ -2324,6 +2382,7 @@ process tidyGFFs { label "genometools" label "small_task" + time "1h" publishDir "${params.outdir}/${folder}/${name}" @@ -2374,6 +2433,7 @@ process combineGFFs { label "genometools" label "small_task" + time "2h" publishDir "${params.outdir}/final" tag "${name}" @@ -2404,6 +2464,7 @@ process getSoftmaskedGenomes { label "bedtools" label "small_task" + time "1h" publishDir "${params.outdir}/final" tag "${name}" diff --git a/nextflow.config b/nextflow.config index 83dec84..eaf7cd3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,9 @@ profiles { docker { includeConfig "conf/docker.config" } + pawsey_zeus { + includeConfig "conf/pawsey_zeus.config" + } docker_plus { includeConfig "conf/docker_plus.config" } From e76243d21196712d8f2047e78d41cb7cdcd9b949 Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Wed, 23 Oct 2019 15:26:17 +0800 Subject: [PATCH 02/11] Filter by scaffold length before running repeatmodeler. Handle repeatmodeler output better, it sometimes has the same region twice. --- README.md | 1 + bin/stk2fasta.py | 38 +++++++++++++++++++++++++------------- conf/pawsey_zeus.config | 4 ++-- main.nf | 36 ++++++++++++++++++++++++++++++++++-- 4 files changed, 62 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 03dd0f8..8b156ee 100644 --- a/README.md +++ b/README.md @@ -226,6 +226,7 @@ You can also raise an issue on the github repository and I'll try to help. | `--mmseqs_max_evalue` | 0.001 | The maximum e-value to use to consider `mmseqs` profile matches against the genomes significant. | | `--min_intra_frequency` | 4 | The minimum number of copies a clustered repeat family must have within a genome for it to be considered "present". | | `--min_inter_proportion` | 0.2 | The minimum proportion of genomes that the clustered repeat family must be present in (after `--min_intra_frequency`) to be considered a geniune family. | +| `--repeatmodeler_min_len` | 10000 | The minimum scaffold length to allow for predicting repeats in repeatmodeler. Scaffolds smaller than this will be removed to avoid sampling bias. | | `--eahelitron_three_prime_fuzzy_level` | 3 | Passed on to the EAHelitron parameter `-r`. | | `--eahelitron_upstream_length` | 3000 | Passed on to the EAHElitron parameter `-u`. | | `--eahelitron_downstream_length` | 50 | Passed on to the EAHelitron parameter `d`. | diff --git a/bin/stk2fasta.py b/bin/stk2fasta.py index 480d1a7..36bb501 100755 --- a/bin/stk2fasta.py +++ b/bin/stk2fasta.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 +import re import sys import argparse from Bio import SeqIO -from Bio import AlignIO - -from Bio.Alphabet import Gapped, SingleLetterAlphabet +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord def cli(prog, args): @@ -34,21 +34,33 @@ def cli(prog, args): return parser.parse_args(args) +def line_to_seq(line, regex): + sline = regex.split(line.strip(), maxsplit=1) + seq = SeqRecord( + id=sline[0], + name=sline[0], + description=sline[0], + seq=Seq(sline[1].replace("-", "").replace(".", "")), + ) + + return seq + + def main(): args = cli(sys.argv[0], sys.argv[1:]) + regex = re.compile(r"\s+") + + seen = set() seqs = [] - alignments = AlignIO.parse( - args.infile, - format="stockholm", - alphabet=Gapped(SingleLetterAlphabet(), "-") - ) + for line in args.infile: + if line.startswith("#") or line.startswith("/"): + continue - i = 1 - for alignment in alignments: - for seq in alignment: - seq.seq = seq.seq.ungap() + seq = line_to_seq(line, regex) + if seq.id not in seen: seqs.append(seq) - i += 1 + + seen.add(seq.id) SeqIO.write(seqs, args.outfile, "fasta") return diff --git a/conf/pawsey_zeus.config b/conf/pawsey_zeus.config index e44eeb8..91613a4 100644 --- a/conf/pawsey_zeus.config +++ b/conf/pawsey_zeus.config @@ -34,12 +34,12 @@ process { withLabel: biggish_task { cpus = 8 - memory = 16.GB + memory = 31.GB } withLabel: big_task { cpus = 16 - memory = 32.GB + memory = 63.GB } } diff --git a/main.nf b/main.nf index 6de1430..26fdb93 100644 --- a/main.nf +++ b/main.nf @@ -177,6 +177,11 @@ def helpMessage() { family must be present in (after `--min_intra_frequency`) to be considered a geniune family. + --repeatmodeler_min_len + 10000 + The minimum scaffold length to allow for predicting repeats in repeatmodeler. + Scaffolds smaller than this will be removed to avoid sampling bias. + --eahelitron_three_prime_fuzzy_level 3 Passed on to the EAHelitron parameter `-r`. @@ -276,6 +281,8 @@ params.gypsydb_url = "http://gydb.org/gydbModules/collection/collection/db/GyDB_ params.pfam_ids = "$baseDir/data/pfam_ids.txt" params.protein_families = "$baseDir/data/proteins/families.stk" +params.repeatmodeler_min_len = 10000 + params.infernal_max_evalue = 0.00001 params.mmseqs_max_evalue = 0.001 @@ -1173,7 +1180,6 @@ process tidyOcculterCutGFFs { in.gff \ > "${name}_${suffix}.gff3" """ - } @@ -1249,6 +1255,32 @@ process prepRepeatMaskerDB { } +/* + */ +process filterScaffoldLength { + + label "posix" + label "small_task" + time "1h" + + tag "${name}" + + input: + set val(name), file("in.fasta") from genomes4RunRepeatModeler + + output: + set val(name), file("filtered.fasta") into filteredGenomes4RunRepeatModeler + + script: + """ + fasta_to_tsv.sh < in.fasta \ + | awk -v size="${params.repeatmodeler_min_len}" 'length(\$2) >= size' \ + | tsv_to_fasta.sh \ + > filtered.fasta + """ +} + + /* * RepeatModeler * url: http://www.repeatmasker.org/RepeatModeler/ @@ -1266,7 +1298,7 @@ process runRepeatModeler { publishDir "${params.outdir}/tes/${name}" input: - set val(name), file(fasta) from genomes4RunRepeatModeler + set val(name), file(fasta) from filteredGenomes4RunRepeatModeler file "rmlib" from rmlib output: From f0db864a5a2956afdc795da20f6bd67a8f77f9ce Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Wed, 23 Oct 2019 17:54:20 +0800 Subject: [PATCH 03/11] filter duplicates from repeatmodeler gff too --- bin/repeatmodeler2gff.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bin/repeatmodeler2gff.py b/bin/repeatmodeler2gff.py index ee584aa..d9b2db9 100755 --- a/bin/repeatmodeler2gff.py +++ b/bin/repeatmodeler2gff.py @@ -73,8 +73,15 @@ def parse_block(handle, source, type_): elif not line.startswith("#"): ids.append(line.split(maxsplit=1)[0]) + seen = set() + out = [] for id_ in ids: + if id_ in seen: + continue + else: + seen.add(id) + seqid, start, end, strand = id_to_loc(id_) if len(species) == 0: From bd6650c44830dec2ebd85bdb1a72df8848025263 Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Thu, 24 Oct 2019 22:23:23 +0800 Subject: [PATCH 04/11] Changed default contig size filter for repeatmodeler. Removed helitron etc type from rmout2gff script --- bin/rmout2gff3.py | 2 -- main.nf | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/rmout2gff3.py b/bin/rmout2gff3.py index 12b0d22..df7fee9 100755 --- a/bin/rmout2gff3.py +++ b/bin/rmout2gff3.py @@ -233,12 +233,10 @@ def as_gff3(self, source="RepeatMasker"): ontology_terms.extend(["SO:0000206", "SO:SINE_element"]) if "helitron" in self.kind.lower(): - type_ = "helitron" ontology_terms.extend(["SO:0000544", "SO:helitron"]) if ("maverick" in self.kind.lower() or "polinton" in self.kind.lower()): - type_ = "polinton" ontology_terms.extend(["SO:0001170", "SO:polinton"]) if "mite" in self.kind.lower(): diff --git a/main.nf b/main.nf index 26fdb93..49a970c 100644 --- a/main.nf +++ b/main.nf @@ -281,7 +281,7 @@ params.gypsydb_url = "http://gydb.org/gydbModules/collection/collection/db/GyDB_ params.pfam_ids = "$baseDir/data/pfam_ids.txt" params.protein_families = "$baseDir/data/proteins/families.stk" -params.repeatmodeler_min_len = 10000 +params.repeatmodeler_min_len = 1000 params.infernal_max_evalue = 0.00001 params.mmseqs_max_evalue = 0.001 From ce6a080e7b19f143a418641b162f18563486dadc Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Fri, 25 Oct 2019 12:05:44 +0800 Subject: [PATCH 05/11] RepeatModeler output is sometimes listed outside of the actual contig. Modified masking step to prevent this raising an index error. --- main.nf | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 6de1430..5af4805 100644 --- a/main.nf +++ b/main.nf @@ -2485,10 +2485,23 @@ process getSoftmaskedGenomes { script: """ + fasta_to_tsv.sh \ + < genome.fasta \ + | awk 'BEGIN {OFS="\t"} {print $1, 0, length($2)}' \ + > genome.bed + + bedtools intersect \ + -a repeats.gff3 \ + -b genome.bed \ + > bounded.gff3 + + bedtools maskfasta \ -fi genome.fasta \ - -bed repeats.gff3 \ + -bed bounded.gff3 \ -fo "${name}_softmasked.fasta" \ -soft + + rm -f bounded.gff3 genome.bed """ } From b64ca3a200722e3bb305f5bd442638f40cad1559 Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Fri, 25 Oct 2019 12:19:53 +0800 Subject: [PATCH 06/11] escaped dollars in awk command --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 24d1f9f..5f51851 100644 --- a/main.nf +++ b/main.nf @@ -2519,7 +2519,7 @@ process getSoftmaskedGenomes { """ fasta_to_tsv.sh \ < genome.fasta \ - | awk 'BEGIN {OFS="\t"} {print $1, 0, length($2)}' \ + | awk 'BEGIN {OFS="\\t"} {print \$1, 0, length(\$2)}' \ > genome.bed bedtools intersect \ From 67200df6bc175cd87ade11acf7759c02c201ed46 Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Fri, 25 Oct 2019 22:21:32 +0800 Subject: [PATCH 07/11] Filter the combined sequences by overlap before clustering. --- bin/exclude_contained_subseqs.py | 132 +++++++++++++++++++++++++++++++ main.nf | 18 ++--- 2 files changed, 141 insertions(+), 9 deletions(-) create mode 100755 bin/exclude_contained_subseqs.py diff --git a/bin/exclude_contained_subseqs.py b/bin/exclude_contained_subseqs.py new file mode 100755 index 0000000..0a612b3 --- /dev/null +++ b/bin/exclude_contained_subseqs.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 + +import sys +import argparse +import re + +from collections import defaultdict + +from Bio import SeqIO + +from intervaltree import Interval, IntervalTree + + +def cli(prog, args): + parser = argparse.ArgumentParser( + prog=prog, + description=""" Extracts all sequences from a fasta msa file into a + stockholm alignment file. + """ + ) + + parser.add_argument( + "infile", + type=argparse.FileType('r'), + help="Input fasta file.", + ) + + parser.add_argument( + "-o", "--outfile", + default=sys.stdout, + type=argparse.FileType('w'), + help=("Output filtered fastas."), + ) + + parser.add_argument( + "-c", "--coverage", + default=0.95, + type=float, + help="The coverage of the smaller sequence required to exclude." + ) + + return parser.parse_args(args) + + +def parse_id_as_interval(id_string, regex): + """ The fasta ids contain the locus information. """ + + match = regex.match(id_string) + genome = match.group("genome") + seqid = match.group("seqid") + start_tmp = int(match.group("start")) + end_tmp = int(match.group("end")) + + start = min([start_tmp, end_tmp]) + end = max([start_tmp, end_tmp]) + del start_tmp + del end_tmp + + return (genome, seqid, start, end) + + +def coverage(left, right): + intersection_begin = max([left.begin, right.begin]) + intersection_end = min([left.end, right.end]) + size = intersection_end - intersection_begin + return size / min([left.length(), right.length()]) + + +def main(): + args = cli(sys.argv[0], sys.argv[1:]) + regex = re.compile(r"^>?(?P[^:\s]+):(?P[^:\s]+)" + r":(?P\d+)-(?P\d+)") + + itree = defaultdict(IntervalTree) + seqs = SeqIO.parse(args.infile, format="fasta") + + for seq in seqs: + genome, seqid, start, end = parse_id_as_interval(seq.id, regex) + interval = Interval(start, end, data=seq) + + if interval in itree[(genome, seqid)]: + continue + + envelops = itree[(genome, seqid)].envelop(interval) + overlaps = itree[(genome, seqid)].overlap(interval) + + # If the interval completely overlaps one already in the tree + # replace it. This would be covered by overlaps, by should be faster. + if len(envelops) > 0: + itree[(genome, seqid)].remove_overlap(start, end) + itree[(genome, seqid)].add(interval) + + # If the interval partially overlaps one, or is completely contained + # by an interval in the tree, we interrogate further. + elif len(overlaps) > 0: + to_remove = [] + add_to_tree = True + + for i in overlaps: + cov_match = coverage(i, interval) > args.coverage + + # If the coverage of the shorter interval is above a threshold + # and the interval already in the tree is the shorter one, + # we flag it for replacement. + if cov_match and i.length() < interval.length(): + to_remove.append(i) + + # If the new interval was the shorter of the intervals. + elif cov_match: + add_to_tree = False + break + + # We reached all the way through without discarding the new + # interval. + if add_to_tree: + for i in to_remove: + itree[(genome, seqid)].remove(i) + + itree[(genome, seqid)].add(interval) + + # If it doesn't overlap the sequence at all. + else: + itree[(genome, seqid)].add(interval) + + for (genome, seqid), subitree in itree.items(): + SeqIO.write((i.data for i in subitree), args.outfile, format="fasta") + + return + + +if __name__ == "__main__": + main() diff --git a/main.nf b/main.nf index 5f51851..f10a0a9 100644 --- a/main.nf +++ b/main.nf @@ -358,7 +358,7 @@ if ( params.dfam_hmm ) { label "download" label "small_task" - time "4h" + time "4h" publishDir "${params.outdir}/downloads" @@ -2021,7 +2021,7 @@ process getMiteFinderFastas { */ process combineTEFastas { - label "posix" + label "python3" label "small_task" time "2h" @@ -2043,10 +2043,10 @@ process combineTEFastas { script: """ cat in/*.fasta \ - | fasta_to_tsv.sh \ - | sort -u -k1,1 \ - | tsv_to_fasta.sh \ - > combined_tes.fasta + | exclude_contained_subseqs.py \ + --coverage 0.95 \ + -o combined_tes.fasta \ + - """ } @@ -2080,10 +2080,10 @@ process clusterTEFastas { vsearch \ --threads "${task.cpus}" \ --cluster_fast "combined.fasta" \ - --id 0.8 \ - --weak_id 0.6 \ + --id 0.90 \ + --weak_id 0.7 \ --iddef 0 \ - --qmask none \ + --qmask dust \ --uc clusters.tsv \ --strand "both" \ --clusters "clusters/fam" From 0ab1f3c3264d79b9a0238293fe14086158d131b6 Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Fri, 25 Oct 2019 22:42:34 +0800 Subject: [PATCH 08/11] fixing exclude seqs script --- bin/exclude_contained_subseqs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/exclude_contained_subseqs.py b/bin/exclude_contained_subseqs.py index 0a612b3..d69512d 100755 --- a/bin/exclude_contained_subseqs.py +++ b/bin/exclude_contained_subseqs.py @@ -72,11 +72,11 @@ def main(): r":(?P\d+)-(?P\d+)") itree = defaultdict(IntervalTree) - seqs = SeqIO.parse(args.infile, format="fasta") + seqs = SeqIO.to_dict(SeqIO.parse(args.infile, format="fasta")) - for seq in seqs: - genome, seqid, start, end = parse_id_as_interval(seq.id, regex) - interval = Interval(start, end, data=seq) + for id_ in seqs.keys(): + genome, seqid, start, end = parse_id_as_interval(id_, regex) + interval = Interval(start, end, data=id_) if interval in itree[(genome, seqid)]: continue @@ -123,7 +123,7 @@ def main(): itree[(genome, seqid)].add(interval) for (genome, seqid), subitree in itree.items(): - SeqIO.write((i.data for i in subitree), args.outfile, format="fasta") + SeqIO.write((keep[i.data] for i in subitree), args.outfile, format="fasta") return From 4b7b6400bb302582f7f50bf69b1f4e22de7ec250 Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Fri, 25 Oct 2019 22:45:46 +0800 Subject: [PATCH 09/11] minor bugfix in script --- bin/exclude_contained_subseqs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/exclude_contained_subseqs.py b/bin/exclude_contained_subseqs.py index d69512d..b2def44 100755 --- a/bin/exclude_contained_subseqs.py +++ b/bin/exclude_contained_subseqs.py @@ -123,7 +123,7 @@ def main(): itree[(genome, seqid)].add(interval) for (genome, seqid), subitree in itree.items(): - SeqIO.write((keep[i.data] for i in subitree), args.outfile, format="fasta") + SeqIO.write((seqs[i.data] for i in subitree), args.outfile, format="fasta") return From 90a5a25fb760d8e1d4d404b6c56e5ddf8cdcafa1 Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Fri, 25 Oct 2019 22:58:18 +0800 Subject: [PATCH 10/11] fixed duplicate id issue in exclude_subseqs script --- bin/exclude_contained_subseqs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/exclude_contained_subseqs.py b/bin/exclude_contained_subseqs.py index b2def44..9a36643 100755 --- a/bin/exclude_contained_subseqs.py +++ b/bin/exclude_contained_subseqs.py @@ -72,7 +72,7 @@ def main(): r":(?P\d+)-(?P\d+)") itree = defaultdict(IntervalTree) - seqs = SeqIO.to_dict(SeqIO.parse(args.infile, format="fasta")) + seqs = {s.id: s for s in SeqIO.parse(args.infile, format="fasta")} for id_ in seqs.keys(): genome, seqid, start, end = parse_id_as_interval(id_, regex) @@ -123,7 +123,11 @@ def main(): itree[(genome, seqid)].add(interval) for (genome, seqid), subitree in itree.items(): - SeqIO.write((seqs[i.data] for i in subitree), args.outfile, format="fasta") + SeqIO.write( + (seqs[i.data] for i in subitree), + args.outfile, + format="fasta" + ) return From d661c0b63a458da4106148005309bed7a4b11aaf Mon Sep 17 00:00:00 2001 From: Darcy Jones Date: Mon, 28 Oct 2019 14:20:56 +0800 Subject: [PATCH 11/11] Made retry strategy for repeatmodeler retry, up to 3. Handles cases of random sampling hitting edge bugs in RM --- main.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/main.nf b/main.nf index f10a0a9..e5ca03f 100644 --- a/main.nf +++ b/main.nf @@ -1294,6 +1294,9 @@ process runRepeatModeler { label "big_task" time "1d" + errorStrategy 'retry' + maxRetries 3 + tag "${name}" publishDir "${params.outdir}/tes/${name}"