From 03cf438b4e182643967850c7db6da8584803534c Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 12 May 2020 13:43:12 +0800 Subject: [PATCH] update README and change KMER to k-mer --- README.md | 32 ++++++++++++++++++++------------ src/main.cpp | 10 +++++----- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index d45728f..c60d057 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ # UniqueKMER -Generate unique KMERs for every contig in a FASTA file. +Generate unique k-mers for every contig in a FASTA file. -Unique KMER is consisted of KMER keys (i.e. ATCGATCCTTAAGG) that are only presented in one contig, but not presented in any other contigs (for both forward and reverse strands). +A unique k-mer set consists of k-mer keys (e.g. ATCGATCCTTAAGG) that are present in only one contig and not present in any other contig (for both forward and reverse strands). -This tool accepts the input of a FASTA file consisting of many contigs, and extract unique KMERs for each contig. +This tool accepts the input of a FASTA file consisting of many contigs, and extracts unique k-mers for each contig. -The output unique KMER file and Genome file can be used for fastv: https://github.com/OpenGene/fastv, which is an ultra-fast tool to identify and visualize microbial sequences from sequencing data. +The output unique k-mer file and Genome file can be used for fastv: https://github.com/OpenGene/fastv, which is an ultra-fast tool to identify and visualize microbial sequences from sequencing data. # what does UniqueKMER output? -This tool outputs a folder (folder name can be specified by `-o/--outdir`), which contains a `index.html` and a subfolder `genomes_kmers`. The subfolder `genomes_kmers` contains a KMER file and a Genome file for each contig, both in FASTA format. You can open the `index.html` with any browser, then click on the contig names to find its KMER file and Genome file. 
+This tool outputs a folder (folder name can be specified by `-o/--outdir`), which contains an `index.html` and a subfolder `genomes_kmers`. The subfolder `genomes_kmers` contains a k-mer file and a Genome file for each contig, both in FASTA format. You can open the `index.html` with any browser, then click on a contig name to find its k-mer file and Genome file. * a small example: http://opengene.org/uniquekmer/test/index.html. This is generated by a small FASTA (http://opengene.org/test.fasta) * a big example: http://opengene.org/uniquekmer/virus/index.html. This is generated by a big FASTA (http://opengene.org/viral.genomic.fasta) containing all NCBI complete RefSeq release of viral sequences, which can be found from https://ftp.ncbi.nlm.nih.gov/refseq/release/viral/ @@ -27,17 +27,17 @@ uniquekmer -f test.fasta You can get the test.fasta from: http://opengene.org/test.fasta # more examples -### set the KMER key length +### set the k-mer key length ```shell # 16-mer (i.e. ATCGATCGATCGATCG...) uniquekmer -f test.fasta -k 16 ``` -### filter the KMER keys that can be mapped to a reference genome (i.e. human genome) +### filter the k-mer keys that can be mapped to a reference genome (e.g. human genome) ```shell -# KMER sequences that can be mapped to hg38 with `edit distance <=2` will be removed +# k-mer sequences that can be mapped to hg38 with `edit distance <=2` will be removed uniquekmer -f test.fasta -r hg38.fasta -e 2 ``` -### set the spacing to avoid many continuous KMER keys +### set the spacing to avoid many consecutive k-mer keys ```shell # the spacing will be 2, which means if `key(pos)` is stored, then `key(pos+1)` and `key(pos+2)` will be skipped uniquekmer -f test.fasta -s 2 @@ -47,10 +47,18 @@ options: ```shel -f, --fasta FASTA input file name (string) -o, --outdir Directory for output. Default is unique_kmers in the current directory. 
(string [=unique_kmers]) - -k, --kmer The length k of KMER (10~32), default 25 (int [=25]) + -k, --kmer The length k of k-mer (10~32), default 25 (int [=25]) -s, --spacing If a key with POS is recorded, then skip [POS+1...POS+spacing] to avoid too compact result (0~100). default 0 means no skipping. (int [=0]) -g, --genome_limit Process up to genome_limit genomes in the FASTA input file. Default 0 means no limit. This option is for DEBUG. (int [=0]) - -r, --ref Reference genome FASTA file name. Specify this only when you want to filter out the unique KMER that can be mapped to reference genome. (string [=]) - -e, --edit_distance KMER mapped to reference genome with edit distance <= edit_distance will be removed (0~16). 3 for default. (int [=3]) + -r, --ref Reference genome FASTA file name. Specify this only when you want to filter out the unique k-mer that can be mapped to reference genome. (string [=]) + -e, --edit_distance k-mer mapped to reference genome with edit distance <= edit_distance will be removed (0~16). 3 for default. (int [=3]) -?, --help print this message ``` + +## get the pre-built k-mer file, genomes file or k-mer collection file for viruses +* You can download `k-mer` files and `genomes` files of viruses from http://opengene.org/uniquekmer/viral/index.html. This is generated by extracting unique k-mers for all genomes in a big FASTA (http://opengene.org/viral.genomic.fasta), which contains all NCBI complete RefSeq release of viral sequences that can be found from https://ftp.ncbi.nlm.nih.gov/refseq/release/viral/. The k-mers that can be mapped to human reference genome (GRCh38) with `edit_distance <= 3` have already been filtered out. 
+* You can download the `k-mer collection` file for viral genomes from: http://opengene.org/viral.kc.fasta.gz + +## get the pre-built k-mer file, genomes file or k-mer collection file for viruses and human microorganisms +* You can download `k-mer` files and `genomes` files of viruses and human microorganisms from http://opengene.org/uniquekmer/microbial/index.html. This is generated by extracting unique k-mers for all genomes in a big FASTA (http://opengene.org/microbial.genomic.fasta), which contains genomes for the viruses above and common human microorganisms. The k-mers that can be mapped to human reference genome (GRCh38) with `edit_distance <= 3` have already been filtered out. +* You can download the `k-mer collection` file for microbial genomes from: http://opengene.org/microbial.kc.fasta.gz diff --git a/src/main.cpp b/src/main.cpp index 349c8f7..b5a6647 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ mutex logmtx; int main(int argc, char* argv[]){ // display version info if no argument is given if(argc == 1) { - cerr << "UniqueKMER: Generate unique KMERs for every contig in a FASTA file." << endl << "version " << UNIQUEKMER_VER << endl; + cerr << "UniqueKMER: Generate unique k-mers for every contig in a FASTA file." << endl << "version " << UNIQUEKMER_VER << endl; } if (argc == 2 && strcmp(argv[1], "test")==0){ UnitTest tester; @@ -30,11 +30,11 @@ int main(int argc, char* argv[]){ cmdline::parser cmd; cmd.add("fasta", 'f', "FASTA input file name", true, ""); cmd.add("outdir", 'o', "Directory for output. Default is unique_kmers in the current directory.", false, "unique_kmers"); - cmd.add("kmer", 'k', "The length k of KMER (10~32), default 25", false, 25); + cmd.add("kmer", 'k', "The length k of k-mer (10~32), default 25", false, 25); cmd.add("spacing", 's', "If a key with POS is recorded, then skip [POS+1...POS+spacing] to avoid too compact result (0~100). 
default 0 means no skipping.", false, 0); cmd.add("genome_limit", 'g', "Process up to genome_limit genomes in the FASTA input file. Default 0 means no limit. This option is for DEBUG.", false, 0); - cmd.add("ref", 'r', "Reference genome FASTA file name. Specify this only when you want to filter out the unique KMER that can be mapped to reference genome.", false, ""); - cmd.add("edit_distance", 'e', "KMER mapped to reference genome with edit distance <= edit_distance will be removed (0~16). 3 for default.", false, 3); + cmd.add("ref", 'r', "Reference genome FASTA file name. Specify this only when you want to filter out the unique k-mer that can be mapped to reference genome.", false, ""); + cmd.add("edit_distance", 'e', "k-mer mapped to reference genome with edit distance <= edit_distance will be removed (0~16). 3 for default.", false, 3); cmd.parse_check(argc, argv); @@ -69,7 +69,7 @@ int main(int argc, char* argv[]){ time_t t2 = time(NULL); - cerr << endl << "Please find results (index.html, KMER/Genome files) in folder: " << opt.outdir << endl; + cerr << endl << "Please find results (index.html, k-mer/Genome files) in folder: " << opt.outdir << endl; cerr << endl << command << endl; cerr << "uniquekmer v" << UNIQUEKMER_VER << ", time used: " << (t2)-t1 << " seconds" << endl;