diff --git a/README.md b/README.md index 03dd0f8..8b156ee 100644 --- a/README.md +++ b/README.md @@ -226,6 +226,7 @@ You can also raise an issue on the github repository and I'll try to help. | `--mmseqs_max_evalue` | 0.001 | The maximum e-value to use to consider `mmseqs` profile matches against the genomes significant. | | `--min_intra_frequency` | 4 | The minimum number of copies a clustered repeat family must have within a genome for it to be considered "present". | | `--min_inter_proportion` | 0.2 | The minimum proportion of genomes that the clustered repeat family must be present in (after `--min_intra_frequency`) to be considered a geniune family. | +| `--repeatmodeler_min_len` | 10000 | The minimum scaffold length to allow for predicting repeats in repeatmodeler. Scaffolds smaller than this will be removed to avoid sampling bias. | | `--eahelitron_three_prime_fuzzy_level` | 3 | Passed on to the EAHelitron parameter `-r`. | | `--eahelitron_upstream_length` | 3000 | Passed on to the EAHElitron parameter `-u`. | | `--eahelitron_downstream_length` | 50 | Passed on to the EAHelitron parameter `d`. | diff --git a/bin/exclude_contained_subseqs.py b/bin/exclude_contained_subseqs.py new file mode 100755 index 0000000..9a36643 --- /dev/null +++ b/bin/exclude_contained_subseqs.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 + +import sys +import argparse +import re + +from collections import defaultdict + +from Bio import SeqIO + +from intervaltree import Interval, IntervalTree + + +def cli(prog, args): + parser = argparse.ArgumentParser( + prog=prog, + description=""" Extracts all sequences from a fasta msa file into a + stockholm alignment file. + """ + ) + + parser.add_argument( + "infile", + type=argparse.FileType('r'), + help="Input fasta file.", + ) + + parser.add_argument( + "-o", "--outfile", + default=sys.stdout, + type=argparse.FileType('w'), + help=("Output filtered fastas."), + ) + + parser.add_argument( + "-c", "--coverage", + default=0.95, + type=float, + help="The coverage of the smaller sequence required to exclude." + ) + + return parser.parse_args(args) + + +def parse_id_as_interval(id_string, regex): + """ The fasta ids contain the locus information. """ + + match = regex.match(id_string) + genome = match.group("genome") + seqid = match.group("seqid") + start_tmp = int(match.group("start")) + end_tmp = int(match.group("end")) + + start = min([start_tmp, end_tmp]) + end = max([start_tmp, end_tmp]) + del start_tmp + del end_tmp + + return (genome, seqid, start, end) + + +def coverage(left, right): + intersection_begin = max([left.begin, right.begin]) + intersection_end = min([left.end, right.end]) + size = intersection_end - intersection_begin + return size / min([left.length(), right.length()]) + + +def main(): + args = cli(sys.argv[0], sys.argv[1:]) + regex = re.compile(r"^>?(?P[^:\s]+):(?P[^:\s]+)" + r":(?P\d+)-(?P\d+)") + + itree = defaultdict(IntervalTree) + seqs = {s.id: s for s in SeqIO.parse(args.infile, format="fasta")} + + for id_ in seqs.keys(): + genome, seqid, start, end = parse_id_as_interval(id_, regex) + interval = Interval(start, end, data=id_) + + if interval in itree[(genome, seqid)]: + continue + + envelops = itree[(genome, seqid)].envelop(interval) + overlaps = itree[(genome, seqid)].overlap(interval) + + # If the interval completely overlaps one already in the tree + # replace it. This would be covered by overlaps, by should be faster. + if len(envelops) > 0: + itree[(genome, seqid)].remove_overlap(start, end) + itree[(genome, seqid)].add(interval) + + # If the interval partially overlaps one, or is completely contained + # by an interval in the tree, we interrogate further. + elif len(overlaps) > 0: + to_remove = [] + add_to_tree = True + + for i in overlaps: + cov_match = coverage(i, interval) > args.coverage + + # If the coverage of the shorter interval is above a threshold + # and the interval already in the tree is the shorter one, + # we flag it for replacement. + if cov_match and i.length() < interval.length(): + to_remove.append(i) + + # If the new interval was the shorter of the intervals. + elif cov_match: + add_to_tree = False + break + + # We reached all the way through without discarding the new + # interval. + if add_to_tree: + for i in to_remove: + itree[(genome, seqid)].remove(i) + + itree[(genome, seqid)].add(interval) + + # If it doesn't overlap the sequence at all. + else: + itree[(genome, seqid)].add(interval) + + for (genome, seqid), subitree in itree.items(): + SeqIO.write( + (seqs[i.data] for i in subitree), + args.outfile, + format="fasta" + ) + + return + + +if __name__ == "__main__": + main() diff --git a/bin/repeatmodeler2gff.py b/bin/repeatmodeler2gff.py index ee584aa..d9b2db9 100755 --- a/bin/repeatmodeler2gff.py +++ b/bin/repeatmodeler2gff.py @@ -73,8 +73,15 @@ def parse_block(handle, source, type_): elif not line.startswith("#"): ids.append(line.split(maxsplit=1)[0]) + seen = set() + out = [] for id_ in ids: + if id_ in seen: + continue + else: + seen.add(id) + seqid, start, end, strand = id_to_loc(id_) if len(species) == 0: diff --git a/bin/rmout2gff3.py b/bin/rmout2gff3.py index 12b0d22..df7fee9 100755 --- a/bin/rmout2gff3.py +++ b/bin/rmout2gff3.py @@ -233,12 +233,10 @@ def as_gff3(self, source="RepeatMasker"): ontology_terms.extend(["SO:0000206", "SO:SINE_element"]) if "helitron" in self.kind.lower(): - type_ = "helitron" ontology_terms.extend(["SO:0000544", "SO:helitron"]) if ("maverick" in self.kind.lower() or "polinton" in self.kind.lower()): - type_ = "polinton" ontology_terms.extend(["SO:0001170", "SO:polinton"]) if "mite" in self.kind.lower(): diff --git a/bin/stk2fasta.py b/bin/stk2fasta.py index 480d1a7..36bb501 100755 --- a/bin/stk2fasta.py +++ b/bin/stk2fasta.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 +import re import sys import argparse from Bio import SeqIO -from Bio import AlignIO - -from Bio.Alphabet import Gapped, SingleLetterAlphabet +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord def cli(prog, args): @@ -34,21 +34,33 @@ def cli(prog, args): return parser.parse_args(args) +def line_to_seq(line, regex): + sline = regex.split(line.strip(), maxsplit=1) + seq = SeqRecord( + id=sline[0], + name=sline[0], + description=sline[0], + seq=Seq(sline[1].replace("-", "").replace(".", "")), + ) + + return seq + + def main(): args = cli(sys.argv[0], sys.argv[1:]) + regex = re.compile(r"\s+") + + seen = set() seqs = [] - alignments = AlignIO.parse( - args.infile, - format="stockholm", - alphabet=Gapped(SingleLetterAlphabet(), "-") - ) + for line in args.infile: + if line.startswith("#") or line.startswith("/"): + continue - i = 1 - for alignment in alignments: - for seq in alignment: - seq.seq = seq.seq.ungap() + seq = line_to_seq(line, regex) + if seq.id not in seen: seqs.append(seq) - i += 1 + + seen.add(seq.id) SeqIO.write(seqs, args.outfile, "fasta") return diff --git a/conf/pawsey_zeus.config b/conf/pawsey_zeus.config index e44eeb8..91613a4 100644 --- a/conf/pawsey_zeus.config +++ b/conf/pawsey_zeus.config @@ -34,12 +34,12 @@ process { withLabel: biggish_task { cpus = 8 - memory = 16.GB + memory = 31.GB } withLabel: big_task { cpus = 16 - memory = 32.GB + memory = 63.GB } } diff --git a/main.nf b/main.nf index 6c262ec..e5ca03f 100644 --- a/main.nf +++ b/main.nf @@ -31,7 +31,7 @@ def helpMessage() { --genomes "genomes/*.fasta" \ --repbase "downloads/RepBaseRepeatMaskerEdition-20181026.tar.gz" \ --rm_meta "downloads/RepeatMaskerMetaData-20181026.tar.gz" \ - --species "fungi" + --rm_species "fungi" ``` ## Parameters @@ -177,6 +177,11 @@ def helpMessage() { family must be present in (after `--min_intra_frequency`) to be considered a geniune family. + --repeatmodeler_min_len + 10000 + The minimum scaffold length to allow for predicting repeats in repeatmodeler. + Scaffolds smaller than this will be removed to avoid sampling bias. + --eahelitron_three_prime_fuzzy_level 3 Passed on to the EAHelitron parameter `-r`. @@ -276,6 +281,8 @@ params.gypsydb_url = "http://gydb.org/gydbModules/collection/collection/db/GyDB_ params.pfam_ids = "$baseDir/data/pfam_ids.txt" params.protein_families = "$baseDir/data/proteins/families.stk" +params.repeatmodeler_min_len = 1000 + params.infernal_max_evalue = 0.00001 params.mmseqs_max_evalue = 0.001 @@ -351,6 +358,7 @@ if ( params.dfam_hmm ) { label "download" label "small_task" + time "4h" publishDir "${params.outdir}/downloads" @@ -382,6 +390,7 @@ if ( params.dfam_embl ) { label "download" label "small_task" + time "1h" publishDir "${params.outdir}/downloads" @@ -413,6 +422,7 @@ if ( params.rm_repeatpeps ) { label "repeatmasker" label "small_task" + time "20m" publishDir "${params.outdir}/downloads" @@ -459,6 +469,7 @@ if ( params.rfam && params.rfam_clanin ) { label "download" label "small_task" + time "4h" publishDir "${params.outdir}/downloads" @@ -489,6 +500,7 @@ if ( params.rfam_gomapping ) { label "download" label "small_task" + time "1h" publishDir "${params.outdir}/downloads" @@ -519,7 +531,8 @@ if ( params.pfam ) { process getPfam { label "download" - label "small_task" + label "medium_task" + time "4h" publishDir "${params.outdir}/downloads" @@ -558,7 +571,6 @@ if ( params.pfam ) { if ( params.gypsydb ) { Channel .fromPath( params.gypsydb, checkIfExists: true, type: "file") - .collectFile(newLine: true) .set { gypsydb } } else { @@ -576,11 +588,12 @@ if ( params.gypsydb ) { label "download" label "small_task" + time "4h" publishDir "${params.outdir}/downloads" output: - file "gypsydb/*" into gypsydb + file "gypsydb/*" into gypsydb mode flatten script: """ @@ -597,9 +610,10 @@ process processGydb { label "posix" label "small_task" + time "20m" input: - file "stk/*" from gypsydb + file "stk/*" from gypsydb.collect() output: file "gypsydb.stk" into gypsyDBMSAs @@ -639,6 +653,7 @@ if ( params.mitefinder_profiles ) { label "mitefinder" label "small_task" + time "20m" output: file "pattern_scoring.txt" into miteFinderProfiles @@ -688,6 +703,7 @@ process runTRNAScan { label "trnascan" label "medium_task" + time "5h" publishDir "${params.outdir}/noncoding/${name}" @@ -733,6 +749,7 @@ process getTRNAScanGFF { label "gffpal" label "small_task" + time "1h" tag "${name}" @@ -760,6 +777,7 @@ process pressRfam { label "infernal" label "small_task" + time "2h" when: run_infernal @@ -787,6 +805,7 @@ process chunkifyGenomes { label "python3" label "small_task" + time "1h" tag "${name}" @@ -821,6 +840,7 @@ process runInfernal { label "infernal" label "small_task" + time "5h" tag "${name}" @@ -879,6 +899,7 @@ process tidyInfernalMatches { label "gffpal" label "small_task" + time "1h" tag "${name}" @@ -913,6 +934,7 @@ process combineInfernal { label "genometools" label "small_task" + time "1h" tag "${name}" @@ -954,6 +976,7 @@ process runRNAmmer { label "rnammer" label "small_task" + time "5h" publishDir "${params.outdir}/noncoding/${name}", saveAs: exclude_unclean @@ -987,7 +1010,7 @@ process getRNAmmerGFF { label "gffpal" label "small_task" - + time "1h" tag "${name}" @@ -1016,7 +1039,9 @@ process runOcculterCut { label "occultercut" label "small_task" + time "4h" tag "${name}" + publishDir "${params.outdir}/noncoding/${name}" input: @@ -1064,6 +1089,7 @@ process getOcculterCutRegionFrequencies { label "gffpal" label "small_task" + time "2h" tag "${name}" @@ -1094,6 +1120,7 @@ process getOcculterCutGroupedRegionFrequencies { label "gffpal" label "small_task" + time "2h" tag "${name}" @@ -1130,6 +1157,7 @@ process tidyOcculterCutGFFs { label "genometools" label "small_task" + time "1h" tag "${name} - ${suffix}" @@ -1152,7 +1180,6 @@ process tidyOcculterCutGFFs { in.gff \ > "${name}_${suffix}.gff3" """ - } @@ -1168,6 +1195,7 @@ process prepRepeatMaskerDB { label "repeatmasker" label "small_task" + time "3h" input: file "repbase.tar.gz" from repbase @@ -1227,6 +1255,32 @@ process prepRepeatMaskerDB { } +/* + */ +process filterScaffoldLength { + + label "posix" + label "small_task" + time "1h" + + tag "${name}" + + input: + set val(name), file("in.fasta") from genomes4RunRepeatModeler + + output: + set val(name), file("filtered.fasta") into filteredGenomes4RunRepeatModeler + + script: + """ + fasta_to_tsv.sh < in.fasta \ + | awk -v size="${params.repeatmodeler_min_len}" 'length(\$2) >= size' \ + | tsv_to_fasta.sh \ + > filtered.fasta + """ +} + + /* * RepeatModeler * url: http://www.repeatmasker.org/RepeatModeler/ @@ -1238,12 +1292,16 @@ process runRepeatModeler { label "repeatmasker" label "big_task" + time "1d" + + errorStrategy 'retry' + maxRetries 3 tag "${name}" publishDir "${params.outdir}/tes/${name}" input: - set val(name), file(fasta) from genomes4RunRepeatModeler + set val(name), file(fasta) from filteredGenomes4RunRepeatModeler file "rmlib" from rmlib output: @@ -1278,7 +1336,7 @@ process runRepeatModeler { mv "${name}-families.stk" "${name}_repeatmodeler.stk" fi - rm -rf -- rmlib_tmp + rm -rf -- rmlib_tmp RM_* """ } @@ -1297,6 +1355,8 @@ process getRepeatModelerFasta { label "python3" label "small_task" + time "1h" + tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -1322,6 +1382,8 @@ process getRepeatModelerGFF { label "gffpal" label "small_task" + time "1h" + tag "${name}" input: @@ -1352,6 +1414,7 @@ process getMMSeqsGenomes { label "mmseqs" label "small_task" + time "2h" tag "${name}" @@ -1391,6 +1454,7 @@ process getMSAAttributes { label "gffpal" label "small_task" + time "1h" tag "${db}" @@ -1414,6 +1478,7 @@ process getMMSeqsProfiles { label "mmseqs" label "medium_task" + time "4h" tag "${db}" @@ -1456,6 +1521,7 @@ process searchProfilesVsGenomes { label "mmseqs" label "medium_task" + time "4h" tag "${name} - ${db}" publishDir "${params.outdir}/tes/${name}" @@ -1515,6 +1581,7 @@ process getMMSeqsGenomeGFFs { label "gffpal" label "small_task" + time "1h" tag "${name} - ${db}" @@ -1548,6 +1615,7 @@ process getMMSeqsGenomeFastas { label "genometools" label "small_task" + time "1h" tag "${name} - ${db}" @@ -1600,6 +1668,7 @@ process getGtSuffixArrays { label "genometools" label "small_task" + time "2h" tag "${name}" @@ -1646,6 +1715,7 @@ process runLtrHarvest { label "genometools" label "small_task" + time "4h" tag "${name}" @@ -1691,6 +1761,8 @@ process runLtrDigest { label "genometools" label "small_task" + time "4h" + tag "${name}" publishDir "${params.outdir}/tes/${name}", saveAs: exclude_unclean @@ -1765,6 +1837,8 @@ process runEAHelitron { label "eahelitron" label "small_task" + time "6h" + tag "${name}" publishDir "${params.outdir}/tes/${name}", saveAs: exclude_unclean @@ -1809,6 +1883,7 @@ process filterEAHelitronGFF { label "gffpal" label "small_task" + time "1h" tag "${name}" @@ -1840,6 +1915,8 @@ process filterEAHelitronGFF { process runMiteFinder { label "mitefinder" label "small_task" + time "6h" + tag "${name}" input: @@ -1867,6 +1944,8 @@ process getMiteFInderGFFs { label "gffpal" label "small_task" + time "1h" + tag "${name}" input: @@ -1900,6 +1979,8 @@ process getMiteFinderFastas { label "genometools" label "small_task" + time "1h" + tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -1943,8 +2024,9 @@ process getMiteFinderFastas { */ process combineTEFastas { - label "posix" + label "python3" label "small_task" + time "2h" publishDir "${params.outdir}/pantes" @@ -1964,10 +2046,10 @@ process combineTEFastas { script: """ cat in/*.fasta \ - | fasta_to_tsv.sh \ - | sort -u -k1,1 \ - | tsv_to_fasta.sh \ - > combined_tes.fasta + | exclude_contained_subseqs.py \ + --coverage 0.95 \ + -o combined_tes.fasta \ + - """ } @@ -1985,6 +2067,7 @@ process clusterTEFastas { label "vsearch" label "medium_task" + time "1d" publishDir "${params.outdir}/pantes" @@ -2000,10 +2083,10 @@ process clusterTEFastas { vsearch \ --threads "${task.cpus}" \ --cluster_fast "combined.fasta" \ - --id 0.8 \ - --weak_id 0.6 \ + --id 0.90 \ + --weak_id 0.7 \ --iddef 0 \ - --qmask none \ + --qmask dust \ --uc clusters.tsv \ --strand "both" \ --clusters "clusters/fam" @@ -2018,6 +2101,7 @@ process filterTEClusters { label "python3" label "small_task" + time "4h" publishDir "${params.outdir}/pantes" @@ -2058,6 +2142,7 @@ process getClusterMSAs { label "decipher" label "big_task" + time "1d" publishDir "${params.outdir}/pantes" @@ -2098,6 +2183,7 @@ process getClusterMSAConsensus { label "decipher" label "medium_task" + time "6h" publishDir "${params.outdir}/pantes" @@ -2131,6 +2217,7 @@ process getClusterMSAStockholm { label "python3" label "small_task" + time "2h" publishDir "${params.outdir}/pantes" @@ -2159,6 +2246,7 @@ process runRepeatClassifier { label "repeatmasker" label "big_task" + time "12h" publishDir "${params.outdir}/pantes" @@ -2200,6 +2288,8 @@ process runRepeatMaskerSpecies { label "repeatmasker" label "medium_task" + time "1d" + tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -2251,6 +2341,8 @@ process runRepeatMasker { label "repeatmasker" label "medium_task" + time "1d" + tag "${name}" publishDir "${params.outdir}/tes/${name}" @@ -2295,6 +2387,7 @@ process getRepeatMaskerGFF { label "gffpal" label "small_task" + time "2h" tag "${name} - ${analysis}" @@ -2304,7 +2397,7 @@ process getRepeatMaskerGFF { file("rm.out") from repeatMaskerResults .map { n, o -> [n, "repeatmasker", o] } .mix( repeatMaskerSpeciesResults - .map {n, o -> [n, "repeatmasker_species" ]} ) + .map {n, o -> [n, "repeatmasker_species", o ]} ) output: set val(name), @@ -2324,6 +2417,7 @@ process tidyGFFs { label "genometools" label "small_task" + time "1h" publishDir "${params.outdir}/${folder}/${name}" @@ -2374,6 +2468,7 @@ process combineGFFs { label "genometools" label "small_task" + time "2h" publishDir "${params.outdir}/final" tag "${name}" @@ -2404,6 +2499,7 @@ process getSoftmaskedGenomes { label "bedtools" label "small_task" + time "1h" publishDir "${params.outdir}/final" tag "${name}" @@ -2424,10 +2520,23 @@ process getSoftmaskedGenomes { script: """ + fasta_to_tsv.sh \ + < genome.fasta \ + | awk 'BEGIN {OFS="\\t"} {print \$1, 0, length(\$2)}' \ + > genome.bed + + bedtools intersect \ + -a repeats.gff3 \ + -b genome.bed \ + > bounded.gff3 + + bedtools maskfasta \ -fi genome.fasta \ - -bed repeats.gff3 \ + -bed bounded.gff3 \ -fo "${name}_softmasked.fasta" \ -soft + + rm -f bounded.gff3 genome.bed """ } diff --git a/nextflow.config b/nextflow.config index 83dec84..eaf7cd3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,9 @@ profiles { docker { includeConfig "conf/docker.config" } + pawsey_zeus { + includeConfig "conf/pawsey_zeus.config" + } docker_plus { includeConfig "conf/docker_plus.config" }