diff --git a/ericscript.pl b/ericscript.pl new file mode 100755 index 0000000..6e75e07 --- /dev/null +++ b/ericscript.pl @@ -0,0 +1,557 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use Pod::Usage; +use Getopt::Long; +use File::Spec; +use Cwd 'abs_path'; + +our ($verbose, $help, $man); +our ($samplename, $reads_1, $reads_2, $outputfolder, $minreads, $removetemp, $nthreads, $MAPQ, $checkdb, $demo, $refid, $printdb, $downdb, $dbfolder, $recalc, $bwa_aln, $genomeref); +our ($simulator, $readlength, $ntrim, $insize, $sd_insize, $ngenefusion, $min_cov, $max_cov, $nsims, $be, $ie, $background_1, $background_2, $nreads_background, $ensversion); +our($calcstats, $resultsfolder, $datafolder, $algoname, $dataset, $normroc); +my @command_line = @ARGV; +GetOptions('verbose|v'=>\$verbose, +'help|h'=>\$help, +'man|m'=>\$man, +'samplename|name=s'=>\$samplename, +'outputfolder|o=s'=>\$outputfolder, +'dbfolder|db=s'=>\$dbfolder, +'background_1=s'=>\$background_1, +'background_2=s'=>\$background_2, +'refid=s'=>\$refid, +'minreads|minr=i'=>\$minreads, +'remove'=>\$removetemp, +'nthreads|p=i'=>\$nthreads, +'readlength|rl=i'=>\$readlength, +'ntrim=i'=>\$ntrim, +'insize=f'=>\$insize, +'sd_insize=f'=>\$sd_insize, +'ngenefusion=i'=>\$ngenefusion, +'min_cov=i'=>\$min_cov, +'max_cov=i'=>\$max_cov, +'nsims=i'=>\$nsims, +'nreads_background=i'=>\$nreads_background, +'checkdb'=>\$checkdb, +'ie'=>\$ie, +'be'=>\$be, +'demo'=>\$demo, +'simulator'=>\$simulator, +'recalc'=>\$recalc, +'printdb'=>\$printdb, +'checkdb'=>\$checkdb, +'downdb'=>\$downdb, +'bwa_aln'=>\$bwa_aln, +'calcstats'=>\$calcstats, +'resultsfolder=s'=>\$resultsfolder, +'datafolder=s'=>\$datafolder, +'algoname=s'=>\$algoname, +'dataset=s'=>\$dataset, +'normroc=i'=>\$normroc, +'ensversion=i'=>\$ensversion, +'MAPQ=f'=>\$MAPQ) or pod2usage (); + +$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT); +$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT); + +my $sysmsg; +my $sysflag=0; +$sysmsg = qx/samtools 2>&1/ || ''; +if ($sysmsg !~ m/bamshuf/) { + print STDERR "[EricScript] Error: SAMtools >= 0.1.19 not found! Please install and add it to your PATH.\n"; + $sysflag=1; +} +if ($sysmsg =~ m/htslib/) { + print STDERR "[EricScript] Error: SAMtools >= 1.0 detected! EricScript is not yet compatible with it. Please use samtools 0.1.19 to run EricScript.\n"; + $sysflag=1; +} +$sysmsg = qx/bwa 2>&1/ || ''; +if ($sysmsg !~ m/sampe/) { + print STDERR "[EricScript] Error: BWA not found! Please install and add it to your PATH.\n"; + $sysflag=1; +} else { + $sysmsg = qx/bwa mem 2>&1/ || ''; + if ($sysmsg !~ m/-Y/) { + print STDERR "[EricScript] Error: BWA >= 0.7.12 not found! Please install and add it to your PATH.\n"; + $sysflag=1; + } +} +#} elsif ($sysmsg !~ m/mem/) { +# print STDERR "[EricScript] Error: BWA >= 0.7.4 not found! Please install and add it to your PATH.\n"; +# $sysflag=1; +#} +$sysmsg = qx/blat 2>&1/ || ''; +if ($sysmsg !~ "tileSize") { + print STDERR "[EricScript] Error: BLAT not found! Please install and add it to your PATH.\n"; + $sysflag=1; +} +$sysmsg = qx/R --help 2>&1/ || ''; +if ($sysmsg !~ "Rdiff") { + print STDERR "[EricScript] Error: R not found! Please install and add it to your PATH.\n"; + $sysflag=1; +} +$sysmsg = qx/seqtk 2>&1/ || ''; +if ($sysmsg !~ "subseq") { + print STDERR "[EricScript] Error: Seqtk not found! Please install and add it to your PATH.\n"; + $sysflag=1; +} +$sysmsg = qx/bedtools 2>&1/ || ''; +if ($sysmsg !~ "sample") { + print STDERR "[EricScript] Error: bedtools (>=2.18) not found! Please install and add it to your PATH.\n"; + $sysflag=1; +} + +if ($sysflag == 1) { + exit(100); +} + +my $myscript = abs_path($0); +my $myscriptname = $myscript; $myscriptname =~ s%.*/%%g; +my $ericscriptfolder=substr $myscript, 0, (length($myscript) - length($myscriptname) - 1) ; + +$calcstats ||= 0; +$simulator ||= 0; +$checkdb ||= 0; +$demo ||= 0; +$printdb ||=0; +$downdb ||=0; +$recalc ||=0; +$bwa_aln ||= 0; +$ensversion ||=0; +#if ($bwa_aln != 0) { +# $bwa_aln = 1; +#} +$dbfolder ||= "$ericscriptfolder/lib"; + +if ( ($ensversion > 0) && ($ensversion < 70) ) { + pod2usage ("[EricScript] Error in selecting Ensembl release. Minimum supported version is 70.\n"); +} + +if ($downdb != 0) { + $refid or pod2usage ("[EricScript] Syntax error. \"refid\" parameter needs to be specified. Run ericscript.pl --printdb to see the list of available refid.\n"); +} else { + $refid ||= "homo_sapiens"; +} + +if ($printdb != 0) { + system("R --slave --args $ericscriptfolder,$dbfolder,$ensversion,$printdb < $ericscriptfolder/lib/R/RetrieveRefId.R"); + exit(100); +} + +if ($recalc != 0) { + system("R --slave --args $ericscriptfolder,$refid,$dbfolder < $ericscriptfolder/lib/R/CheckDB.R"); + my $flagdb; + open FILE, "< $ericscriptfolder/lib/data/_resources/.flag.dbexists"; + $flagdb = ; + if ($flagdb == 0) { + pod2usage (); + } + + ## check inputs !! ONLY FOR DEBUG purposes + my $file3 = File::Spec->catfile ($genomeref); + -f $file3 or pod2usage ("[EricScript] Error: please specify a genome reference file.\n"); + my $file4 = File::Spec->catfile ("$genomeref.pac"); + -f $file4 or pod2usage ("[EricScript] Error: BWA indexes for $genomeref not found. Create BWA indexes then run EricScript.\n"); + $outputfolder or pod2usage ("[EricScript] Error: Please specify where past analysis is stored by using --outputfolder"); +# $outputfolder = abs_path($outputfolder); + if (-d $outputfolder) { + my $abs_outputfolder = abs_path($outputfolder); + system("R --slave --args $ericscriptfolder,$abs_outputfolder,$dbfolder,$refid,$genomeref < $ericscriptfolder/lib/R/CalcBreakpointPositions.R"); + } else { + die "[EricScript] Error: output folder $outputfolder does not exist.\n"; + } + +} + +if ($checkdb == 0 && $demo == 0 && $simulator == 0 && $calcstats == 0 && $downdb == 0 && $recalc == 0) { + + @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT); + @ARGV == 2 or pod2usage ("[EricScript] Syntax error.\n"); + ($reads_1, $reads_2) = @ARGV; + + ## check db existence + system("R --slave --args $ericscriptfolder,$refid,$dbfolder < $ericscriptfolder/lib/R/CheckDB.R"); + my $flagdb; + open FILE, "< $ericscriptfolder/lib/data/_resources/.flag.dbexists"; + $flagdb = ; + if ($flagdb == 0) { + exit(100); + } + + ## check inputs + my $file1 = File::Spec->catfile ($reads_1); + -f $file1 or pod2usage ("[EricScript] Error: the required file $reads_1 does not exist.\n"); + my $file2 = File::Spec->catfile ($reads_2); + -f $file2 or pod2usage ("[EricScript] Error: the required file $reads_2 does not exist.\n"); +# my $file3 = File::Spec->catfile ($genomeref); +# -f $file3 or pod2usage ("[EricScript] Error: please specify a valid genome reference file.\n"); +# my $file4 = File::Spec->catfile ("$genomeref.pac"); +# -f $file4 or pod2usage ("[EricScript] Error: BWA indexes for $genomeref not found. Create BWA indexes then run EricScript.\n"); + + + my $userhome = $ENV{HOME}; + $samplename ||= 'MyEric'; + $outputfolder ||= "$userhome/$samplename"; +# $outputfolder = abs_path($outputfolder); +# if (-d $outputfolder) { +# die "[EricScript] Error: output folder $outputfolder already exists.\n"; +# } + if (! -d $outputfolder) { mkdir($outputfolder) || die "[EricScript] Error: the directory $outputfolder is not writable by the current user. \n"; } + if (! -d "$outputfolder/aln") { mkdir("$outputfolder/aln"); } + if (! -d "$outputfolder/out") { mkdir("$outputfolder/out"); } + $verbose ||= 0; + $minreads ||= 3; + $ntrim ||= -1; + $removetemp ||= 0; + $MAPQ ||= 20; + $nthreads ||= 4; + my $myref="$dbfolder/data/$refid/EnsemblGene.Reference.fa"; + my $mynewref="$outputfolder/out/$samplename.EricScript.junctions.fa"; + my $mynewref_recal="$outputfolder/out/$samplename.EricScript.fa"; + my $flagbin= 1; + if (-T $reads_1) { + $flagbin = 0; + } + ## write vars + my $abs_outputfolder = abs_path($outputfolder); + my $range = 10000; + my $rnum = int(rand($range)); + my $varfile = "$outputfolder/ericscript.vars"; + open(FILE, ">", "$varfile") or die "Couldn't open: $!"; + print FILE "samplename=\"$samplename\"\n"; + print FILE "outputfolder=\"$abs_outputfolder\"\n"; + print FILE "dbfolder=\"$dbfolder\"\n"; + print FILE "reads_1=\"$reads_1\"\n"; + print FILE "reads_2=\"$reads_2\"\n"; + print FILE "minreads=$minreads\n"; + print FILE "ntrim=$ntrim\n"; + print FILE "MAPQ=$MAPQ\n"; + print FILE "removetemp=$removetemp\n"; + print FILE "flagbin=$flagbin\n"; + print FILE "nthreads=$nthreads\n"; + print FILE "ericscriptfolder=\"$ericscriptfolder\"\n"; + print FILE "myref=\"$myref\"\n"; + print FILE "mynewref=\"$mynewref\"\n"; + print FILE "mynewref_recal=\"$mynewref_recal\"\n"; + print FILE "refid=\"$refid\"\n"; + print FILE "verbose=$verbose\n"; + print FILE "bwa_aln=$bwa_aln\n"; + close (FILE); + + system("cp", "$varfile", "$outputfolder/out/.ericscript.vars"); + system("bash", "$ericscriptfolder/lib/bash/RunEric.sh", "$outputfolder"); +} elsif ($checkdb != 0) { + if (-d $dbfolder) { + print STDERR "[EricScript] Checking installed Database.\n"; + system("R --slave --args $ericscriptfolder,$dbfolder,$ensversion,$printdb < $ericscriptfolder/lib/R/RetrieveRefId.R"); + system("R --slave --args $ericscriptfolder,$dbfolder < $ericscriptfolder/lib/R/UpdateDB.R"); + exit(100); + } else { + die "[EricScript] Error: the directory $dbfolder does not exist.\n"; + } +} elsif ($downdb != 0) { + if ( -d $dbfolder ) { + if ( !-d "$dbfolder/data" ) { + mkdir ("$dbfolder/data"); + } + system("R --slave --args $ericscriptfolder,$dbfolder,$ensversion,$printdb < $ericscriptfolder/lib/R/RetrieveRefId.R"); + system("bash $ericscriptfolder/lib/bash/BuildSeq.sh $ericscriptfolder $refid $dbfolder $ensversion"); + } else { + die "[EricScript] Error: the directory $dbfolder does not exist.\n"; + } + +} elsif ($demo != 0) { + ## check db +# my $file3 = File::Spec->catfile ($genomeref); +# -f $file3 or die ("[EricScript] Error: please specify a genome reference file.\n"); + system("R --slave --args $ericscriptfolder,$refid,$dbfolder < $ericscriptfolder/lib/R/CheckDB.R"); + my $flagdb; + open FILE, "< $ericscriptfolder/lib/data/_resources/.flag.dbexists"; + $flagdb = ; + if ($flagdb == 0) { + pod2usage (); + } + my $userhome = $ENV{HOME}; + $samplename ='demo'; + $outputfolder ||= "$userhome/ericscript_demo"; +# $outputfolder = abs_path($outputfolder); +# if (-d $outputfolder) { +# die "[EricScript] Error: output folder $outputfolder already exists.\n"; +# } + if (! -d $outputfolder) { mkdir($outputfolder) || die "[EricScript] Error: the directory $outputfolder is not writable by the current user. \n"; } + mkdir("$outputfolder/aln"); + mkdir("$outputfolder/out"); + $verbose=0; + $minreads ||= 3; + $ntrim ||= -1; + $removetemp ||= 0; + $MAPQ ||= 20; + $nthreads ||= 4; + $refid ||= "homo_sapiens"; + my $myref="$dbfolder/data/$refid/EnsemblGene.Reference.fa"; + my $mynewref="$outputfolder/out/$samplename.EricScript.junctions.fa"; + my $mynewref_recal="$outputfolder/out/$samplename.EricScript.fa"; + my $reads_1="$ericscriptfolder/lib/demo/myreads_1.fq.gz"; + my $reads_2="$ericscriptfolder/lib/demo/myreads_2.fq.gz"; + my $flagbin= 1; + if (-T $reads_1) { + $flagbin = 0; + } + my $abs_outputfolder = abs_path($outputfolder); + my $range = 10000; + my $rnum = int(rand($range)); + my $varfile = "$outputfolder/ericscript.vars"; + open(FILE, ">", "$varfile") or die "Couldn't open: $!"; + print FILE "samplename=\"$samplename\"\n"; + print FILE "outputfolder=\"$abs_outputfolder\"\n"; + print FILE "dbfolder=\"$dbfolder\"\n"; + print FILE "reads_1=\"$reads_1\"\n"; + print FILE "reads_2=\"$reads_2\"\n"; + print FILE "flagbin=$flagbin\n"; + print FILE "minreads=$minreads\n"; + print FILE "ntrim=$ntrim\n"; + print FILE "MAPQ=$MAPQ\n"; + print FILE "removetemp=$removetemp\n"; + print FILE "nthreads=$nthreads\n"; + print FILE "ericscriptfolder=\"$ericscriptfolder\"\n"; + print FILE "myref=\"$myref\"\n"; + print FILE "mynewref=\"$mynewref\"\n"; + print FILE "mynewref_recal=\"$mynewref_recal\"\n"; + print FILE "refid=\"$refid\"\n"; + print FILE "verbose=$verbose\n"; + print FILE "bwa_aln=$bwa_aln\n"; + close (FILE); + + system("cp", "$varfile", "$outputfolder/out/.ericscript.vars"); + system("bash", "$ericscriptfolder/lib/bash/RunEric.sh", "$outputfolder"); + +} +elsif ($simulator != 0) { + $sysmsg = qx/wgsim --help 2>&1/ || ''; + my $userhome = $ENV{HOME}; + if ($sysmsg !~ "outer") { + print STDERR "[EricScript] Error: wgsim not found! Please install and add it to your PATH.\n"; + $sysflag=1; + } + if ($sysflag == 1) { + exit(100); + } + $outputfolder ||= "$userhome/ericscript_simulator"; +# $outputfolder = abs_path($outputfolder); +# if (-d $outputfolder) { +# die "[EricScript] Error: output folder $outputfolder already exists.\n"; +# } +print "line 345;"; + if (! -d $outputfolder ) { mkdir($outputfolder) || die "[EricScript] Error: the directory $outputfolder is not writable by the current user. \n"; } +print "line 347 ($outputfolder);"; + $verbose ||= 0; + $readlength ||= 75; + $insize ||= 200; + $sd_insize ||= 50; + $ngenefusion ||= 50; + $min_cov ||= 1; + $max_cov ||= 50; + $nsims ||= 10; + $be ||= 0; + $ie ||= 0; + $refid ||= "homo_sapiens"; + if ($be == 0 && $ie == 0) { + $ie=1; + } + $background_1 ||= 0; + $background_2 ||= 0; + $nreads_background ||= 200000; + my $abs_outputfolder = abs_path($outputfolder); + my $simcommand="R --slave --args $readlength,$abs_outputfolder,$ericscriptfolder,$verbose,$insize,$sd_insize,$ngenefusion,$min_cov,$max_cov,$nsims,$be,$ie,$background_1,$background_2,$nreads_background,$dbfolder,$refid < $ericscriptfolder/lib/R/SimulateFusions.R"; + system($simcommand); + +} +elsif ($calcstats != 0) { + + my $userhome = $ENV{HOME}; + $outputfolder ||= "$userhome/ericscript_stats"; +# $outputfolder = abs_path($outputfolder); + if (-d $outputfolder) { +# die "[EricScript] Error: output folder $outputfolder already exists.\n"; + } else { + mkdir($outputfolder) || die "[EricScript] Error: the directory $outputfolder is not writable by the current user. \n"; + } + $resultsfolder || pod2usage ("[EricScript] Error: Argument resultsfolder is not specified! \n"); + $datafolder || pod2usage ("[EricScript] Error: Argument datafolder is not specified! \n"); + $algoname || pod2usage ("[EricScript] Error: Argument algoname is not specified! \n"); + $dataset || pod2usage ("[EricScript] Error: Argument dataset is not specified! \n"); + $readlength || pod2usage ("[EricScript] Error: Argument readlength is not specified! \n"); + $normroc ||= 1; + my $abs_outputfolder = abs_path($outputfolder); + -e $resultsfolder || die "[EricScript] Error: the folder $resultsfolder does not exist. \n"; + -e $datafolder || die "[EricScript] Error: the folder $datafolder does not exist. \n"; + my $datafolder1="$datafolder/$dataset"; + -e $datafolder1 || die "[EricScript] Error: no $dataset synthetic data have been found in $datafolder. \n"; + + my $calcstatscommand="R --slave --args $resultsfolder,$abs_outputfolder,$datafolder,$algoname,$dataset,$readlength,$normroc,$ericscriptfolder < $ericscriptfolder/lib/R/CalcStats.R"; + system($calcstatscommand); + +} + + +=head1 SYNOPSIS + + ericscript.pl [arguments] + + Optional arguments: + -h, --help print help message + -m, --man print complete documentation + -v, --verbose use verbose output + -name, --samplename what's the name of your sample? + -o, --outputfolder where the results will be stored + -db, --dbfolder where database is stored. Default is ERICSCRIPT_FOLDER/lib/ + -minr, --minreads minimum reads to consider discordant alignments [3] + -p, --nthreads number of threads for the bwa aln process [4] + -ntrim trim PE reads from 1st base to $ntrim. Default is no trimming. Set ntrim=0 to don't trim reads. + --MAPQ minimum value of mapping quality to consider discordant reads. For MAPQ 0 use a negative value [20] + --remove remove all temporary files. + --demo Run a demonstration of EricScript on simulated reads. + --refid Genome reference identification. Run ericscript.pl --printdb to see available refid [homo_sapiens]. + --bwa_aln Use BWA ALN instead of BWA MEM to search for discordant reads. + + Subcommands: + --checkdb Check if your database is up-to-date, based on the latest Ensembl release. + --downdb Download, build database. refid parameter need to be specified. + --simulator Generate synthetic gene fusions with the same recipe of the ericscript's paper + --calcstats Calculate the statistics that we used in our paper to evaluate the performance of the algorithms. + + -------- + arguments for databases subcommands (downdb, checkdb): + + -db, --dbfolder where database is stored. Default is ERICSCRIPT_FOLDER/lib/ + --refid Genome reference identification. Run ericscript.pl --printdb to see available refid [homo_sapiens]. + --printdb Print a list of available genomes and exit. + --ensversion Download data of a specific Ensembl version (>= 70). Default is the latest one. + + ------- + arguments for simulator: + -o, --outputfolder where synthetic datasets will be stored [HOME/ericscript_simulator] + -rl, --readlength length of synthetic reads [75] + --refid Genome reference identification. Run ericscript.pl --printdb to see available refid [homo_sapiens]. + -v, --verbose use verbose output + --insize parameter of wgsym. Outer distance between the two ends [200] + --sd_insize parameter of wgsym. Standard deviation [50] + --ngenefusion The number of synthetic gene fusions per dataset? [50] + --min_cov Minimum coverage to simulate [1] + --max_cov Maximum coverage to simulate [50] + --nsims The number of synthetic datasets to simulate [10] + --be Use --be to generate Broken Exons (BE) data [no] + --ie Use --ie to generate Intact Exons (IE) data [yes] + -db, --dbfolder where database is stored. Default is ERICSCRIPT_FOLDER/lib/ + --background_1 Fastq file (forward) for generating background reads. + --background_2 Fastq file (reverse) for generating background reads. + --nreads_background The number of reads to extract from background data [200e3]. + + ------- + arguments for calcstats: + -o, --outputfolder where statistics file will be stored [HOME/ericscript_calcstats] + --resultsfolder path to folder containing algorithm results. + --datafolder path to folder containing synthetic data generated by ericscript simulator. + --algoname name of the algorithm that generated results. + --dataset type of synthetic data to considered for calculating statistics. IE or BE? + -rl, --readlength length of synthetic reads + --normroc factor to normalize the score given by the algorithm. + + ericscript.pl automatically runs a pipeline to detect chimeric transcripts in + paired-end RNA-seq samples. It is also able to generate datasets with synthetic gene fusions. + More information about running EricScript Simulator and EricScript CalcStats can be + found at http://ericscript.sourceforge.net + + Version: 0.5.5b + +=head1 OPTIONS + +=over 8 + +=item B<--help> + + print a brief usage message and detailed explanation of options. + +=item B<--man> + + print the complete manual of the program. + +=item B<--verbose> + + use verbose output. + + +=item B<--samplename> + + Choose a name for your sample. Default is "MyEric" + +=item B<--outputfolder> + + Folder that will contain all the results of the analysis. Default is YOUR_HOME/SAMPLENAME + +=item B<--dbfolder> + + Folder that contains transcriptome sequences and information of the downloaded species. Default is + ERICSCRIPT_FOLDER/lib + +=item B<--minreads> + + Minimum reads to consider discordant alignments. Default is 3 reads with minimum MAPQ. + +=item B<-ntrim> + + trim PE reads from 1st base to $ntrim. Trimmed reads will be used only for the first alignment (identification + of discordant reads). Setting ntrim to values lower than orginal read length allows EricScript to + increase its sensitivity, especially when the length of reads is 75nt or 100 nt. + Default is no trimming. Set ntrim=0 to don't trim reads. + +=item B<--nthreads > + + Number of threads for the bwa aln process. + +=item B<--MAPQ > + + minimum value of mapping quality to consider discordant reads. For MAPQ 0 use a negative value. Default is 20. + +=item B<--remove> + + remove all temporary files. By default, all temporary files will be kept for + user inspection, but this will easily clutter the directory. + +=item B<--checkdb> + + Check if your database is up-to-date, based on the latest Ensembl release. + +=item B<--downdb> + + Download, build database. refid parameter need to be specified. + +=item B<--refid> + + Genome reference identification. Run ericscript.pl --printdb to see available refid.[homo_sapiens] + +=item B<--ensversion> + + Download data of a specific version of Ensembl. Default is downloading the latest version of Ensembl. + Minimum supported version is 70. + +=item B<--printdb> + + Print a list of available genomes and exit. + +=item B<--demo> + + Run a demonstration of EricScript on simulated reads. + +=back + +=head1 DESCRIPTION + + EricScript (chimEric tranScript Detection Algorithm) is a computational framework for the discovery of + gene fusion products in paired end RNA-seq data. + EricScript is freely available to the community for non-commercial use under GPLv3 license. For + questions or comments, please contact matteo.benelli@gmail.com. + + +=cut diff --git a/lib/R/BuildExonUnionModel.R b/lib/R/BuildExonUnionModel.R new file mode 100644 index 0000000..72ca1bf --- /dev/null +++ b/lib/R/BuildExonUnionModel.R @@ -0,0 +1,93 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +ericscriptfolder <- split.vars [1] +refid <- split.vars[2] +dbfolder <- split.vars [3] +tmpfolder <- split.vars [4] + + +formatfasta <- function(myfasta, step = 50) { + totalchar <- nchar(myfasta) + if (totalchar > step) { + steps <- seq(1, totalchar, by = step) + newfasta <- rep("", (length(steps) - 1)) + for (j in 1: (length(steps) - 1)) { + aa <- substr(myfasta, steps[j], (steps[j] + (step - 1))) + newfasta[j] <- aa + } + if ((totalchar - tail(steps, n = 1)) > 0) { + newfasta <- c(newfasta, substr(myfasta, steps[j+1], totalchar)) + } + } else + { + newfasta <- substr(myfasta, 1, totalchar) + } + return(newfasta) +} + +convertToComplement <- function(x) { + + bases <- c("A", "C", "G", "T") + #xx <- unlist(strsplit(toupper(x), NULL)) + xx <- rev(unlist(strsplit(toupper(x), NULL))) + paste(unlist(lapply(xx, function(bbb) { + if (bbb=="A") compString <- "T" + if (bbb=="C") compString <- "G" + if (bbb=="G") compString <- "C" + if (bbb=="T") compString <- "A" + if (!bbb %in% bases) compString <- "N" + return(compString) + })), collapse="") + +} + +refid.folder <- file.path(dbfolder, "data", refid) +if (file.exists(refid.folder) == F) { + dir.create(refid.folder) +} +x <- scan(file.path(tmpfolder, "subseq.fa"), what = "", quiet = T) +x.bed <- read.delim(file.path(tmpfolder, "exonstartend.mrg.txt"), sep = "\t", header = F) +refid.bed <- paste(as.character(x.bed[[1]]), paste((as.numeric(as.character(x.bed[[2]])) + 1), as.character(x.bed[[3]]), sep = "-"), sep = ":") +tmp <- grep(">", x) +genomicreg <- substr(x[tmp], 2, nchar(x[tmp])) +sequences.tmp <- rep("", length(tmp)) +for (i in 1: (length(tmp) - 1)) { + sequences.tmp[i] <- gsub(", ", "", toString(x[(tmp[i] + 1):(tmp[i+1] - 1)])) +} +sequences.tmp[length(tmp)] <- gsub(", ", "", toString(x[(tmp[length(tmp)] + 1): length(x)])) +genenames.tmp <- as.character(x.bed[[4]]) +unique.genenames <- unique(genenames.tmp) +strand.tmp <- read.delim(file.path(tmpfolder, "strand.txt"), sep = "\t", header = F) +strand <- strand.tmp[[2]] +sequences <- rep("", length(unique.genenames)) +for (i in 1: length(unique.genenames)) { + genenames1 <- paste(">", unique.genenames[i], sep = "") + ix.gene <- which(genenames.tmp == unique.genenames[i]) + ix.refid <- which(genomicreg %in% refid.bed[ix.gene]) + if (strand[i] == "-1") { + seqtmp0 <- gsub(", ", "", toString(sequences.tmp[ix.refid])) + sequences[i] <- convertToComplement(seqtmp0) + } else { + sequences[i] <- gsub(", ", "", toString(sequences.tmp[ix.refid])) + } + if (nchar(sequences[i]) == 0) { + sequences[i] <- "NNNNNNN" + } + + if (i == 1) { + cat(genenames1, file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = F, sep = "\n") + } else { + cat(genenames1, file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = T, sep = "\n") + } + cat(formatfasta(sequences[i]), file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = T, sep = "\n") +} +ix.emptyseq <- which(nchar(sequences) == 0) +GeneNames <- unique.genenames +if (length(ix.emptyseq) > 0) { +GeneNames <- GeneNames[-ix.emptyseq] +sequences <- sequences[-ix.emptyseq] +} +save(GeneNames, file = file.path(refid.folder, "EnsemblGene.GeneNames.RData")) +save(sequences, file = file.path(refid.folder, "EnsemblGene.Sequences.RData")) + diff --git a/lib/R/BuildFasta.R b/lib/R/BuildFasta.R new file mode 100644 index 0000000..451571c --- /dev/null +++ b/lib/R/BuildFasta.R @@ -0,0 +1,101 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +samplename <- split.vars [1] +outputfolder <- split.vars[2] +ericscriptfolder <- split.vars[3] +readlength <- max(as.numeric(split.vars[4])) +refid <- as.character(split.vars[5]) +dbfolder <- as.character(split.vars[6]) + + +formatfasta <- function(myfasta, step = 50) { + + totalchar <- nchar(myfasta) + if (totalchar > step) { + steps <- seq(1, totalchar, by = step) + newfasta <- rep("", (length(steps) - 1)) + for (j in 1: (length(steps) - 1)) { + aa <- substr(myfasta, steps[j], (steps[j] + (step - 1))) + newfasta[j] <- aa + } + if ((totalchar - tail(steps, n = 1)) > 0) { + newfasta <- c(newfasta, substr(myfasta, steps[j+1], totalchar)) + } + } else + { + newfasta <- substr(myfasta, 1, totalchar) + } + return(newfasta) +} + + +load(file.path(outputfolder,"out", paste(samplename,".chimeric.RData", sep = ""))) +load(file.path(dbfolder, "data", refid, "EnsemblGene.GeneNames.RData")) +load(file.path(dbfolder, "data", refid, "EnsemblGene.Sequences.RData")) +load(file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData")) +load(file.path(outputfolder, "out", paste(samplename,".chimeric.RData", sep = ""))) +id1 <- MyGF$id1 +id2 <- MyGF$id2 +junctions <- rep(NA, length(id1)) +ids_fasta <- rep("", length(id1)) +sequences.fasta <- rep("", length(id1)) +fasta.file <- c() +maxgap <- 300 +for (i in 1: length(id1)) { + ix.genetable1 <- which(EnsemblGene.Structures$EnsemblGene == id1[i]) + ix.genetable2 <- which(EnsemblGene.Structures$EnsemblGene == id2[i]) + ix.gene1 <- which(GeneNames == id1[i]) + ix.gene2 <- which(GeneNames == id2[i]) + min.pos1 <- min(MyGF$pos1[[i]]) - 2*readlength + max.pos1 <- max(MyGF$pos1[[i]]) + readlength - 1 + if (min.pos1 < 1) {min.pos1 <- 1} + min.pos2 <- min(MyGF$pos2[[i]]) + max.pos2 <- max(MyGF$pos2[[i]]) + 2*readlength + a <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.genetable1]), ","))) + b <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.genetable1]), ","))) + strand1 <- as.character(EnsemblGene.Structures$Strand[ix.genetable1]) + if (strand1 == "+") { + tmp.sum1 <- cumsum((b - a )) + } else { + tmp.sum1 <- cumsum(rev(b - a)) + } + exonenumber1 <- which(tmp.sum1 >= max.pos1)[1] + if (is.na(exonenumber1)) {exonenumber1 <- length(tmp.sum1)} + a2 <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.genetable2]), ","))) + b2 <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.genetable2]), ","))) + strand2 <- as.character(EnsemblGene.Structures$Strand[ix.genetable2]) + if (strand2 == "+") { + tmp.sum2 <- cumsum((b2 - a2)) + } else { + tmp.sum2 <- cumsum(rev(b2 - a2)) + } + exonenumber2 <- which(tmp.sum2 >= min.pos2)[1] + if (is.na(exonenumber2)) {exonenumber2 <- length(tmp.sum2)} + id.gf1 <- paste(id1[i], exonenumber1, sep = "_") + fasta.gf1.tmp0 <- sequences[ix.gene1] + start.end.exons <- c(0,tmp.sum1) + fasta.gf1 <- substr(fasta.gf1.tmp0, min.pos1, (max.pos1 + maxgap - 1)) + id.gf2 <- paste(id2[i], exonenumber2, sep = "_") + fasta.gf2.tmp0 <- sequences[ix.gene2] + if (max.pos2 > nchar(fasta.gf2.tmp0)) {max.pos2 <- nchar(fasta.gf2.tmp0)} + start.end.exons <- c(0,tmp.sum2) + fasta.gf2 <- substr(fasta.gf2.tmp0, (min.pos2 - maxgap), max.pos2) + id.fastaGF <- paste(">",id.gf1,"----",id.gf2," junction@",nchar(fasta.gf1),sep = "") + sequences.fasta[i] <- paste(fasta.gf1, fasta.gf2, sep = "") + fasta.gf12 <- formatfasta(sequences.fasta[i]) + ids_fasta[i] <- paste(id.gf1,id.gf2, sep = "----") + junctions[i] <- nchar(fasta.gf1) + fastaGF <- c(id.fastaGF, fasta.gf12) + fasta.file <- c(fasta.file, fastaGF) +} +save(junctions, file = file.path(outputfolder, "out", paste(samplename,".junctions.RData", sep = ""))) +save(sequences.fasta, file = file.path(outputfolder, "out", paste(samplename,".sequences_fasta.RData", sep = ""))) +save(ids_fasta, file = file.path(outputfolder, "out", paste(samplename, ".ids_fasta.RData", sep = ""))) +cat(fasta.file, file = file.path(outputfolder,"out", paste(samplename,".EricScript.junctions.fa",sep = "")), sep = "\n") + + + + + + diff --git a/lib/R/BuildNeighbourhoodSequences.R b/lib/R/BuildNeighbourhoodSequences.R new file mode 100644 index 0000000..028ad01 --- /dev/null +++ b/lib/R/BuildNeighbourhoodSequences.R @@ -0,0 +1,37 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +samplename <- split.vars [1] +outputfolder <- split.vars[2] +z <- read.delim(file.path(outputfolder,"out",paste(samplename,".intervals.pileup", sep = "")), sep = "\t", header = F) +load(file.path(outputfolder,"out",paste(samplename,".ids_filtered.RData", sep = ""))) +load(file.path(outputfolder,"out",paste(samplename,".junctions.recalibrated.RData", sep = ""))) +load(file.path(outputfolder, "out", paste(samplename, ".ids_fasta.RData", sep = ""))) +id.pileup <- as.character(z[,1]) +pos.pileup <- as.numeric(as.character(z[,2])) +sequence.pileup <- as.character(z[,3]) +unique.ids.pileup <- unique(id.pileup) +width <- 100 +fasta.file <- c() +for (i in 1:length(id.filtered)) { + ix.id <- which(id.pileup == id.filtered[i]) + ix.ref <- which(ids_fasta == id.filtered[i]) + junction <- junctions.recalibrated[ix.ref] + ix.id.pileup <- which(id.pileup == id.filtered[i]) + ix.junction1 <- which(pos.pileup[ix.id.pileup] == junction) + ix.junction2 <- which(pos.pileup[ix.id.pileup] == (junction + 1)) + seq.vec <- rep("N", width) + pos.seq <- seq.int((junction-(width/2-1)), (junction + (width/2))) + ix.pos.tmp <- which(pos.seq %in% pos.pileup[ix.id.pileup]) + seq.vec[ix.pos.tmp] <- sequence.pileup[ix.id.pileup] + if ((length(ix.junction1)!=0) & (length(ix.junction2)!=0)) { + query.sequence <- character(length = 1) + for (ii in 1:length(seq.vec)) { + query.sequence <- paste(query.sequence, seq.vec[ii], sep = "") + } + ids_fasta_query <- paste(">", id.filtered[i],sep = "") + fasta.file <- c(fasta.file, c(ids_fasta_query, query.sequence)) + } +} +cat(fasta.file, sep = "\n", file = file.path(outputfolder, "out",paste(samplename,".checkselfhomology.fa", sep = ""))) +cat(file.path(outputfolder, "out", paste(samplename,".checkselfhomology.fa", sep = "")), file = file.path(outputfolder, "out", ".link")) diff --git a/lib/R/CalcBreakpointPositions.R b/lib/R/CalcBreakpointPositions.R new file mode 100644 index 0000000..cc01280 --- /dev/null +++ b/lib/R/CalcBreakpointPositions.R @@ -0,0 +1,348 @@ +## re-calculate breakpoints positions for samples analysed with ericscript < 0.4.0 +## and re-estimation of ericscore and blacklist + +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +ericscriptfolder <- as.character(split.vars[1]) +outputfolder <- split.vars [2] +dbfolder <- split.vars [3] +refid <- as.character(split.vars[4]) +genomeref <- as.character(split.vars[5]) + +flag.ada <- require(ada, quietly = T) +if (flag.ada == F) { + require(kernlab, quietly = T) +} +load(file.path(ericscriptfolder, "lib","data", "_resources", "BlackList.RData")) +load(file.path(ericscriptfolder, "lib","data", "_resources", "DataModel.RData")) +load(file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData")) + +myls <- list.files(outputfolder, pattern = "Summary.RData") +myls.tsv.total <- list.files(outputfolder, pattern = ".results.total.tsv") +myls.tsv.filt <- list.files(outputfolder, pattern = ".results.filtered.tsv") + +if (length(myls) == 1 & length(myls.tsv.total) == 1 & length(myls.tsv.filt)) { + samplename <- gsub(".results.total.tsv", "", myls.tsv.total) + load(file.path(outputfolder, myls)) + cat(paste("[EricScript] Re-estimating EricScore for sample ", samplename, "... ", sep = "")) + + ensgenename1 <- as.character(SummaryMat$EnsemblGene1) + ensgenename2 <- as.character(SummaryMat$EnsemblGene2) + genename1 <- as.character(SummaryMat$GeneName1) + genename2 <- as.character(SummaryMat$GeneName2) + gjs.score <- as.numeric(as.character(SummaryMat$GJS)) + edge.score <- as.numeric(as.character(SummaryMat$ES)) + nreads.score <- as.numeric(as.character(SummaryMat$US)) + cov.score <- as.numeric(as.character(SummaryMat$GeneExpr_Fused)) + myscores <- cbind(gjs.score, edge.score, nreads.score, cov.score) + colnames(myscores) <- c("probs.gjs", "probs.es", "probs.us", "cov") + myscores <- data.frame(myscores) + if (flag.ada) { + myada <- ada(control~., data = DataScores,loss="exponential", nu = 0.1) + ericscore <- as.numeric(predict(myada, myscores, type = "probs")[,2]) + } else { + sig <- sigest(control~., data = DataScores, frac = 1, na.action = na.omit, scaled = TRUE)[2] + model <- ksvm(control~., data = DataScores, type = "C-svc", kernel = "rbfdot", kpar = list(sigma = sig), C = 1, prob.model = TRUE) + ericscore <- predict(model, myscores, type = "probabilities")[,2] + } + + myblacklist <- rep("", length(genename1)) + ix.bl <- which((genename1 %in% gene.bl1 & genename2 %in% gene.bl2) | (genename1 %in% gene.bl2 & genename2 %in% gene.bl1)) + if (length(ix.bl) > 0) { + for (bli in 1: length(ix.bl)) { + ix.bli <- which((gene.bl1 == genename1[ix.bl[bli]] & gene.bl2 == genename2[ix.bl[bli]]) | (gene.bl2 == genename1[ix.bl[bli]] & gene.bl1 == genename2[ix.bl[bli]])) + myblacklist[ix.bl[bli]] <- paste("Frequency:", sum(freq.bl[ix.bli])) + } + } + + cat("done. \n") + cat(paste("[EricScript] Re-calculating breakpoint positions for sample ", samplename, "... ", sep = "")) + + myseq <- as.character(SummaryMat$JunctionSequence) + left_junction <- substr(myseq, 1, 50) + right_junction <- substr(myseq, 51, 100) + + chr1 <- rep("", length(ensgenename1)) + chr2 <- rep("", length(ensgenename1)) + genestart1 <- rep("", length(ensgenename1)) + genestart2 <- rep("", length(ensgenename1)) + geneend1 <- rep("", length(ensgenename1)) + geneend2 <- rep("", length(ensgenename1)) + strand1 <- rep("", length(ensgenename1)) + strand2 <- rep("", length(ensgenename1)) + + generef <- as.character(EnsemblGene.Structures$EnsemblGene) + chrref <- as.character(EnsemblGene.Structures$Chromosome) + genestartref <- as.character(EnsemblGene.Structures$geneStart) + geneendref <- as.character(EnsemblGene.Structures$geneEnd) + strandref <- as.character(EnsemblGene.Structures$Strand) + for (i in 1: length(ensgenename1)) { + + ix.ref <- which(generef == ensgenename1[i]) + chr1[i] <- chrref[ix.ref] + genestart1[i] <- genestartref[ix.ref] + geneend1[i] <- geneendref[ix.ref] + strand1[i] <- strandref[ix.ref] + + ix.ref <- which(generef == ensgenename2[i]) + chr2[i] <- chrref[ix.ref] + genestart2[i] <- genestartref[ix.ref] + geneend2[i] <- geneendref[ix.ref] + strand2[i] <- strandref[ix.ref] + } + + + # NEW Find GenomicPosition (50nt) + for (i in 1: length(ensgenename1)) { + if (i == 1) { + cat(paste("@", i, "_", 1, "\n", left_junction[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction[i])))), "\n", "@", i, "_", 2, "\n", right_junction[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) + } else { + cat(paste("@", i, "_", 1, "\n", left_junction[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction[i])))), "\n", "@", i, "_", 2, "\n", right_junction[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) + } + } + system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) + system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) + system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) + xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) + genpos_1 <- rep(0,length(ensgenename1)) + genpos_2 <- rep(0,length(ensgenename1)) + id.pos <- as.character(xx.pos[[1]]) + flag.pos <- as.character(xx.pos[[2]]) + chr.pos <- as.character(xx.pos[[3]]) + if (length(grep("chr", chr.pos)) > 0) { + chr.pos <- gsub("chr", "", chr.pos) + } + pos.pos <- as.numeric(as.character(xx.pos[[4]])) + mapq.pos <- as.numeric(as.character(xx.pos[[5]])) + + for (i in 1: length(ensgenename1)) { + + ## for 5' gene + ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_1[i] <- pos.pos.ix[ix.okpos] + } else { + genpos_1[i] <- pos.pos.ix[ix.okpos] + 49 + } + } + ## for 3' gene + ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_2[i] <- pos.pos.ix[ix.okpos] + 49 + } else { + genpos_2[i] <- pos.pos.ix[ix.okpos] + } + } + } + + # NEW Find GenomicPosition (25nt_1) + ix.na.pos_1 <- which(genpos_1 == 0) + ix.na.pos_2 <- which(genpos_2 == 0) + if (length(ix.na.pos_1 ) > 0 | length(ix.na.pos_2 ) > 0) { + left_junction.trim <- substr(left_junction, 26, 50) + right_junction.trim <- substr(right_junction, 1, 25) + for (i in 1: length(ensgenename1)) { + if (i == 1) { + cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) + } else { + cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) + } + } + system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) + system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) + system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) + xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) + id.pos <- as.character(xx.pos[[1]]) + flag.pos <- as.character(xx.pos[[2]]) + chr.pos <- as.character(xx.pos[[3]]) + if (length(grep("chr", chr.pos)) > 0) { + chr.pos <- gsub("chr", "", chr.pos) + } + pos.pos <- as.numeric(as.character(xx.pos[[4]])) + mapq.pos <- as.numeric(as.character(xx.pos[[5]])) + for (i in 1: length(ensgenename1)) { + if (i %in% ix.na.pos_1) { + ## for 5' gene + ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_1[i] <- pos.pos.ix[ix.okpos] + } else { + genpos_1[i] <- pos.pos.ix[ix.okpos] + 24 + } + } + } + ## for 3' gene + if (i %in% ix.na.pos_2) { + ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_2[i] <- pos.pos.ix[ix.okpos] + 24 + } else { + genpos_2[i] <- pos.pos.ix[ix.okpos] + } + } + } + } + # NEW Find GenomicPosition (25nt_2) + ix.na.pos_1 <- which(genpos_1 == 0) + ix.na.pos_2 <- which(genpos_2 == 0) + left_junction.trim <- substr(left_junction, 1, 25) + right_junction.trim <- substr(right_junction, 26, 50) + for (i in 1: length(ensgenename1)) { + if (i == 1) { + cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) + } else { + cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) + } + } + system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) + system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) + system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) + + + xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) + id.pos <- as.character(xx.pos[[1]]) + flag.pos <- as.character(xx.pos[[2]]) + chr.pos <- as.character(xx.pos[[3]]) + if (length(grep("chr", chr.pos)) > 0) { + chr.pos <- gsub("chr", "", chr.pos) + } + pos.pos <- as.numeric(as.character(xx.pos[[4]])) + mapq.pos <- as.numeric(as.character(xx.pos[[5]])) + for (i in 1: length(ensgenename1)) { + if (i %in% ix.na.pos_1) { + ## for 5' gene + ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_1[i] <- pos.pos.ix[ix.okpos] + } else { + genpos_1[i] <- pos.pos.ix[ix.okpos] + 24 + } + } + } + ## for 3' gene + if (i %in% ix.na.pos_2) { + ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_2[i] <- pos.pos.ix[ix.okpos] + 24 + } else { + genpos_2[i] <- pos.pos.ix[ix.okpos] + } + } + } + } + } + +# # refine genomic coordinates +# genpos_1.recal <- genpos_1 +# genpos_2.recal <- genpos_2 +# for (i in 1: length(ensgenename1)) { +# ix.ref <- which(generef == ensgenename1[i]) +# if (strand1[i] == "+") { +# exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.ref]), ","))) +# } else { +# exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.ref]), ","))) +# } +# ix.exon <- which.min(abs(genpos_1[i] - exonpos)) +# mydiff <- abs(genpos_1[i] - exonpos[ix.exon]) +# if (mydiff <= 3) { +# genpos_1.recal[i] <- exonpos[ix.exon] +# } +# +# ix.ref <- which(generef == ensgenename1[i]) +# if (strand2[i] == "+") { +# exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.ref]), ","))) +# } else { +# exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.ref]), ","))) +# } +# ix.exon <- which.min(abs(genpos_2[i] - exonpos)) +# mydiff <- abs(genpos_2[i] - exonpos[ix.exon]) +# if (mydiff <= 3) { +# genpos_2.recal[i] <- exonpos[ix.exon] +# } +# } +# mynames <- names(SummaryMat) +# SummaryMat$Breakpoint1 <- genpos_1.recal +# SummaryMat$Breakpoint2 <- genpos_2.recal + + SummaryMat$Breakpoint1 <- genpos_1 + SummaryMat$Breakpoint2 <- genpos_2 + SummaryMat$EricScore <- ericscore + SummaryMat$Blacklist <- myblacklist + save(SummaryMat, file=file.path(outputfolder, paste(samplename, ".Summary.recalc.RData", sep = ""))) + n.spanning <- as.numeric(as.character(SummaryMat$spanningreads)) + n.crossing <- as.numeric(as.character(SummaryMat$crossingreads)) + oddity.spanningreads <- rep(0, length(genpos_1)) + oddity.spanningreads[which(n.spanning == 1 & n.crossing >= 10)] <- 1 + + if (dim(SummaryMat)[1] > 0) { + write.table(SummaryMat, file = file.path(outputfolder,paste(samplename,".results.recalc.total.tsv", sep = "")), sep = "\t", row.names = F, quote = F) + ix.sorting.score <- sort(ericscore, decreasing = T, index.return = T)$ix + ericscore.sorted <- ericscore[ix.sorting.score] + myblacklist.sorted <- myblacklist[ix.sorting.score] + oddity.spanningreads.sorted <- oddity.spanningreads[ix.sorting.score] + SummaryMat.sorted <- SummaryMat[ix.sorting.score, ] + SummaryMat.Filtered <- SummaryMat.sorted[which(ericscore.sorted > 0.5 & myblacklist.sorted == "" & oddity.spanningreads.sorted == 0), ] + write.table(SummaryMat.Filtered[, -16], file = file.path(outputfolder,paste(samplename,".results.recalc.filtered.tsv", sep = "")), sep = "\t", row.names = F, quote = F) + } + cat("done. \n") + cat(paste("[EricScript] Breakpoint position corrected files are in ", outputfolder, ".\n", sep = "")) + +} else { + cat("[EricScript] No files of results found in ", outputfolder, ". Nothing to be done, Exit!\n", sep =" ") +} + diff --git a/lib/R/CalcStats.R b/lib/R/CalcStats.R new file mode 100644 index 0000000..52138ef --- /dev/null +++ b/lib/R/CalcStats.R @@ -0,0 +1,223 @@ +### calculate statistics + +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +resultsfolder <- split.vars[1] +outputfolder <- split.vars[2] +datafolder <- split.vars[3] +algoname <- split.vars[4] +dataset <- split.vars[5] +readlength <- as.numeric(split.vars[6]) +normroc <- as.numeric(split.vars[7]) +ericscriptfolder <- as.character(split.vars[8]) + +source(file.path(ericscriptfolder, "lib", "R", "ImportResults.R")) + +trapezint <- function (x, y, a, b) { +## function of the ROC package (http://bioconductor.org) + if (length(x) != length(y)) + stop("length x must equal length y") + y <- y[x >= a & x <= b] + x <- x[x >= a & x <= b] + if (length(unique(x)) < 2) + return(NA) + ya <- approx(x, y, a, ties = max, rule = 2)$y + yb <- approx(x, y, b, ties = max, rule = 2)$y + x <- c(a, x, b) + y <- c(ya, y, yb) + h <- diff(x) + lx <- length(x) + 0.5 * sum(h * (y[-1] + y[-lx])) +} + +algonamelist <- c("ericscript", "chimerascan", "defuse", "fusionmap", "shortfuse") +algoname <- tolower(algoname) + +if (any(algonamelist %in% algoname) == F) { + algoid <- "unknown" +} else { + algoid <- algoname +} + +xx <- list.files(resultsfolder, pattern = "sim_") +nsims <- length(xx) +cat("[EricScript calcstats] Found ", nsims, " synthetic data analysis for algorithm ", algoname,". \n", sep = "") + +if (nsims > 0) { + tpr <- rep(NA, nsims) + fpr <- rep(NA, nsims) + tpr.5 <- rep(NA, nsims) + fpr.5 <- rep(NA, nsims) + tpr.seq <- rep(NA, nsims) + refpath <- file.path(datafolder, dataset, "data") + rocs.tpr <- rep(0, 1000) + rocs.fpr <- rep(0, 1000) + nosims <- 0 + + refpath <- file.path(datafolder, dataset, "data") + + for (i in 1: nsims) { + if (i < 10) { + dataresults <- get(paste("Import", algoid, sep = "_"))(file.path(resultsfolder, paste("sim_", "0000", i, sep = ""))) + if (is.list(dataresults)) { + load(file.path(refpath, paste("sim_", "0000", i, sep = ""), "GeneFusions.RData")) + cat ("[EricScript calcstats] Analysing ", paste("sim_", "0000", i, sep = "")," ... ") + } else if (dataresults == 0) { + cat("[EricScript calcstats] No results file found for ", paste("sim_", "0000", i, sep = ""), ".\n", sep = "") + nosims <- nosims + 1 + next + } else if (dataresults > 1) { + cat("[EricScript calcstats] Error: ", dataresults, " results file found for ", paste("sim_", "0000", i, sep = ""), ". Only 1 file of results is required. \n", sep = "") + nosims <- nosims + 1 + next + } + } else if (i >= 10 & i < 100) { + dataresults <- get(paste("Import", algoid, sep = "_"))(file.path(resultsfolder, paste("sim_", "000", i, sep = ""))) + if (is.list(dataresults)) { + load(file.path(refpath, paste("sim_", "000", i, sep = ""), "GeneFusions.RData")) + cat ("[EricScript calcstats] Analysing ", paste("sim_", "000", i, sep = "")," ... ") + } else if (dataresults == 0) { + cat("[EricScript calcstats] No results file found for ", paste("sim_", "000", i, sep = ""), ".\n", sep = "") + nosims <- nosims + 1 + next + } else if (dataresults > 1) { + cat("[EricScript calcstats] Error: ", dataresults, " results file found for ", paste("sim_", "000", i, sep = ""), ". Only 1 file of results is required. \n", sep = "") + nosims <- nosims + 1 + next + } + } else if (i >= 100 & i < 1000) { + dataresults <- get(paste("Import", algoid, sep = "_"))(file.path(resultsfolder, paste("sim_", "00", i, sep = ""))) + if (is.list(dataresults)) { + load(file.path(refpath, paste("sim_", "00", i, sep = ""), "GeneFusions.RData")) + cat ("[EricScript calcstats] Analysing ", paste("sim_", "00", i, sep = "")," ... ") + } else if (dataresults == 0) { + cat("[EricScript calcstats] No results file found for ", paste("sim_", "00", i, sep = ""), ".\n", sep = "") + nosims <- nosims + 1 + next + } else if (dataresults > 1) { + cat("[EricScript calcstats] Error: ", dataresults, " results file found for ", paste("sim_", "00", i, sep = ""), ". Only 1 file of results is required. \n", sep = "") + nosims <- nosims + 1 + next + } + } else if (i >= 1000 & i < 10000) { + dataresults <- get(paste("Import", algoid, sep = "_"))(file.path(resultsfolder, paste("sim_", "0", i, sep = ""))) + if (is.list(dataresults)) { + load(file.path(refpath, paste("sim_", "0", i, sep = ""), "GeneFusions.RData")) + cat ("[EricScript calcstats] Analysing ", paste("sim_", "0000", i, sep = "")," ... ") + } else if (dataresults == 0) { + cat("[EricScript calcstats] No results file found for ", paste("sim_", "0", i, sep = ""), ".\n", sep = "") + nosims <- nosims + 1 + next + } else if (dataresults > 1) { + cat("[EricScript calcstats] Error: ", dataresults, " results file found for ", paste("sim_", "0", i, sep = ""), ". Only 1 file of results is required. \n", sep = "") + nosims <- nosims + 1 + next + } + } else if (i >= 10000 & i < 100000) { + dataresults <- get(paste("Import", algoid, sep = "_"))(file.path(resultsfolder, paste("sim_", i, sep = ""))) + if (is.list(dataresults)) { + load(file.path(refpath, paste("sim_", i, sep = ""), "GeneFusions.RData")) + cat ("[EricScript calcstats] Analysing ", paste("sim_", i, sep = "")," ... ") + } else if (dataresults == 0) { + cat("[EricScript calcstats] No results file found for ", paste("sim_", i, sep = ""), ".\n", sep = "") + nosims <- nosims + 1 + next + } else if (dataresults > 1) { + cat("[EricScript calcstats] Error: ", dataresults, " results file found for ", paste("sim_", i, sep = ""), ". Only 1 file of results is required. \n", sep = "") + nosims <- nosims + 1 + next + } + } + + id1.simul <- GeneFusions[[1]] + id2.simul <- GeneFusions[[2]] + seq.simul <- GeneFusions[[5]] + ngenefusions <- length(id1.simul) + if (!exists("cov.tpr")) { + cov.tpr <- rep(0, ngenefusions) + } + gene1ens <- as.character(dataresults$gene5) + gene2ens <- as.character(dataresults$gene3) + nreads <- as.numeric(as.character(dataresults$nreads)) + score <- as.numeric(as.character(dataresults$score)) + if (normroc > 1) { + score <- score/normroc + } + seq <- as.character(dataresults$seq) + + tpr[i] <- length(which(gene1ens %in% id1.simul & gene2ens %in% id2.simul))/ngenefusions + ix.tpr <- which(id1.simul %in% gene1ens & id2.simul %in%gene2ens) + ix.fpr <- which((gene1ens %in% id1.simul & gene2ens %in% id2.simul) == F) + cov.tpr[ix.tpr] <- cov.tpr[ix.tpr] + 1 + + tpr.5[i] <- length(which(gene1ens %in% id1.simul & gene2ens %in% id2.simul & nreads > 5))/ngenefusions + fpr.5[i] <- length(which((gene1ens %in% id1.simul & gene2ens %in% id2.simul) == F & nreads > 5))/length(gene1ens) + + fpr[i] <- (length(gene1ens) - tpr[i]*ngenefusions)/length(gene1ens) + + + rocs.tpr <- colSums(rbind(rocs.tpr, tabulate(score[which(gene1ens %in% id1.simul & gene2ens %in% id2.simul)]*1000, nbins = 1000))) + rocs.fpr <- colSums(rbind(rocs.fpr, tabulate(score[which((gene1ens %in% id1.simul & gene2ens %in% id2.simul) == F)]*1000, nbins = 1000))) + + ix.correctseq <- c() + for (ii in 1: length(ix.tpr)) { + ix.tmp <- which(gene1ens == id1.simul[ix.tpr[ii]] & gene2ens == id2.simul[ix.tpr[ii]]) + if (length(ix.tmp) > 1) { + ix.tmp <- ix.tmp[1] + } + if (length(agrep(toupper(seq[ix.tmp]), seq.simul[ix.tpr], max = 5)) > 0 & is.na(seq[ix.tmp]) == F) { + ix.correctseq <- c(ix.correctseq, ix.tpr[ii]) + } + } + tpr.seq[i] <- length(ix.correctseq)/length(ix.tpr) + + + cat ("done.\n") + + } + + + roc.total <- rocs.tpr + rocs.fpr + ntot <- sum(roc.total) + ntot.tpr <- sum(rocs.tpr) + ntot.fpr <- sum(rocs.fpr) + sens <- rep(0, 1000) + spec <- rep(0, 1000) + for (i in 1: 1000) { + if (i == 1) { + sens[i] <- sum(rocs.tpr)/ntot.tpr + spec[i] <- 1 - sum(rocs.fpr)/ntot.fpr + } else { + sens[i] <- sum(rocs.tpr[-c(1:i)])/ntot.tpr + spec[i] <- 1 - sum(rocs.fpr[-c(1:i)])/ntot.fpr + } + } + + stats <- list() + stats$algorithm <- algoname + stats$dataset <- dataset + stats$readlength <- readlength + stats$totalsims <- nsims + stats$nsims <- nsims - nosims + stats$meantpr <- mean(tpr, na.rm = T) + stats$meanfpr <- mean(fpr, na.rm = T) + stats$meantpsr <- mean(tpr.seq, na.rm = T) + stats$auc <- trapezint(sens, 1 - spec, 0, 1) + stats$meantpr5 <- mean(tpr.5, na.rm = T) + stats$meanfpr5 <- mean(fpr.5, na.rm = T) + stats$tpr <- tpr + stats$fpr <- fpr + stats$tpsr <- tpr.seq + stats$tpr5 <- tpr.5 + stats$fpr5 <- fpr.5 + stats$scoring_sensitivity <- sens + stats$scoring_specificity <- spec + stats$covtpr <- cov.tpr/(nsims-nosims) + + save(stats, file = file.path(outputfolder, paste(algoname, dataset, readlength, "stats","RData", sep = "."))) +} else { + cat ("[EricScript calcstats] Error: no directories containing results on synthetic data have been found in ", resultsfolder, ". Exit.\n", sep = "") + +} + diff --git a/lib/R/CheckDB.R b/lib/R/CheckDB.R new file mode 100644 index 0000000..faca01b --- /dev/null +++ b/lib/R/CheckDB.R @@ -0,0 +1,73 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +ericscriptfolder <- split.vars[1] +refid <- split.vars[2] +dbfolder <- as.character(split.vars[3]) + +flag.dbexists <- 1 + +mydbdata.homo <- c("EnsemblGene.Reference.fa", "EnsemblGene.Sequences.RData", "EnsemblGene.GenePosition.RData", "EnsemblGene.Structures.RData", "EnsemblGene.GeneInfo.RData", "EnsemblGene.Paralogs.RData", "EnsemblGene.GeneNames.RData") +mydbdata <- c("EnsemblGene.Reference.fa", "EnsemblGene.Sequences.RData", "EnsemblGene.GenePosition.RData", "EnsemblGene.Structures.RData", "EnsemblGene.GeneInfo.RData","EnsemblGene.GeneNames.RData") + +xx <- file.exists(file.path(dbfolder, "data", refid)) + if (xx) { + xx.files <- list.files(file.path(dbfolder, "data", refid)) + if (refid == "homo_sapiens") { + xx1 <- all( mydbdata.homo %in% xx.files ) + } else { + xx1 <- all( mydbdata %in% xx.files ) + } + if (!xx1) { + flag.dbexists <- 0 + cat("[EricScript] Some required db files were not found for", refid, "genome. Please run \"ericscript.pl --downdb --refid", refid, "\" to solve this.\n") + } + } else { + flag.dbexists <- 0 + cat("[EricScript] DB data for", refid, "genome do not exist. Set correct -db option or run \" ericscript.pl --downdb --refid", refid, "\" to solve this.\n") + } + +## check bwa version +yy <- file.exists(file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version")) +if (yy) { + prev.version.bwa <- scan(file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version"), what = "", quiet = T, sep = "\n") + system(paste("bwa", "2>&1", "|", "grep ersion", ">", file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version.tmp"))) + curr.version.bwa <- scan(file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version.tmp"), what = "", quiet = T, sep = "\n") + if (curr.version.bwa != prev.version.bwa) { + cat("[EricScript] Updating BWA indexes for", refid,"... ") + system(paste("bwa index", file.path(file.path(dbfolder, "data", refid, "EnsemblGene.Reference.fa")), "1>>", file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version.tmp"), "2>>", file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version.tmp"))) + cat("done.\n") + cat(curr.version.bwa, file = file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version")) + } +} else { + system(paste("bwa", "2>&1", "|", "grep ersion", ">", file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version.tmp"))) + curr.version.bwa <- scan(file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version.tmp"), what = "", quiet = T, sep = "\n") + version.a <- gsub("Version: ", "", strsplit(curr.version.bwa, ".", fixed = T)[[1]][1]) + version.b <- strsplit(curr.version.bwa, ".", fixed = T)[[1]][2] + version.c <- gsub("[a-z]", "", strsplit(strsplit(curr.version.bwa, ".", fixed = T)[[1]][3], "-")[[1]][1]) + version.tot <- as.numeric(paste(version.a, version.b, version.c, sep = "")) + if (version.tot >= 74) { + cat("[EricScript] Updating BWA indexes for", refid, "...") + system(paste("bwa index", file.path(file.path(dbfolder, "data", refid, "EnsemblGene.Reference.fa")), "1>>", file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version.tmp"), "2>>", file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version.tmp"))) + cat("done.\n") + system(paste("bwa", "2>&1", "|", "grep ersion", ">", file.path(ericscriptfolder, "lib", "data", "_resources", ".bwa.version"))) + } else { + flag.dbexists <- 0 + cat("[EricScript] BWA version >= 0.7.4 is required. Exit.\n") + } +} + +mydbdata.bwa <- c("EnsemblGene.Reference.fa.bwt", "EnsemblGene.Reference.fa.pac", "EnsemblGene.Reference.fa.ann", "EnsemblGene.Reference.fa.amb", "EnsemblGene.Reference.fa.sa") + +if (xx) { + xx.files.bwa <- list.files(file.path(dbfolder, "data", refid)) + xx1 <- all( mydbdata.bwa %in% xx.files.bwa ) + if (!xx1) { + flag.dbexists <- 0 + cat("[EricScript] Some required files (bwa indexes) were not found for", refid, "genome. Please run \"ericscript.pl --downdb --refid", refid, "\" to solve this.\n") + } +} + +cat(flag.dbexists, file = file.path(ericscriptfolder, "lib", "data", "_resources", ".flag.dbexists")) + + diff --git a/lib/R/CheckSelfHomology.R b/lib/R/CheckSelfHomology.R new file mode 100644 index 0000000..f225774 --- /dev/null +++ b/lib/R/CheckSelfHomology.R @@ -0,0 +1,110 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +samplename <- split.vars [1] +outputfolder <- split.vars[2] +x <- read.delim(file.path(outputfolder,"out",paste(samplename, ".checkselfhomology.blat", sep = "")), sep = "\t", header = F) +query.fa <- readLines(file.path(outputfolder,"out",paste(samplename, ".checkselfhomology.fa", sep = ""))) +id.fa <- substr(query.fa[seq(1, length(query.fa), by = 2)], 2, nchar(query.fa[seq(1, length(query.fa), by = 2)])) +id.query.nolabel <- as.character(x[,1]) +unique.ids.nolabel <- unique(id.query.nolabel) +ix.fa <- which(id.fa %in% unique.ids.nolabel) +seq.fa <- query.fa[seq(2, length(query.fa), by = 2)][ix.fa] +id.target.f <- as.character(x[,2]) +id.match.f <- as.numeric(as.character(x[,4])) +start.match <- as.numeric(as.character(x[,7])) +end.match <- as.numeric(as.character(x[,8])) +diff.match <- end.match - start.match + 1 +ix.junct.nohomology <- rep(0, length(unique.ids.nolabel)) +ix.junct.homology <- rep(0, length(unique.ids.nolabel)) +flag.dup.a <- rep(0, length(unique.ids.nolabel)) +flag.dup.b <- rep(0, length(unique.ids.nolabel)) +homology.list <- vector("list", length(unique.ids.nolabel)) +for (i in 1:length(unique.ids.nolabel)) { + ix.id <- which(id.query.nolabel == unique.ids.nolabel[i]) + query <- unique.ids.nolabel[i] + target <- id.target.f[ix.id] + match <- as.numeric(id.match.f[ix.id]) + query.tmp <- unlist(strsplit(query, "----", fixed = T)) + query_a <- unlist(strsplit(query.tmp[1], "_"))[1] + query_b <- unlist(strsplit(query.tmp[2], "_"))[1] + diff.match.tmp <- diff.match[ix.id] + start.match.tmp <- start.match[ix.id] + end.match.tmp <- end.match[ix.id] + width <- 100 - length(grep("N", unlist(strsplit(seq.fa[i], "")))) + ix.c <- seq.int(1,length(target)) + ix.a <- which(target %in% query_a) + ix.b <- which(target %in% query_b) + ix.ab <- c(ix.a, ix.b) + if (((length(ix.a) > 0) & (length(ix.b) > 0)) | (length(ix.a) > 1 & length(ix.b) == 1) | (length(ix.a) == 1 & length(ix.b) > 1)) { + if ((length(ix.a) > 1 & length(ix.b) == 1)) { + myflag <- length(which(start.match.tmp[ix.a] %in% c((start.match.tmp[ix.b]-3):(start.match.tmp[ix.b]+3)))) + length(which(end.match.tmp[ix.a] %in% c((end.match.tmp[ix.b]-3):(end.match.tmp[ix.b]+3)))) + } else if ((length(ix.a) == 1 & length(ix.b) > 1)) { + myflag <- length(which(start.match.tmp[ix.b] %in% c((start.match.tmp[ix.a]-3):(start.match.tmp[ix.a]+3)))) + length(which(end.match.tmp[ix.b] %in% c((end.match.tmp[ix.a]-3):(end.match.tmp[ix.a]+3)))) + } else { + myflag <- 0 + } + if (myflag == 0) { + if (max(diff.match.tmp[ix.ab]) < round(0.8*width) & ((length(ix.ab) > 2) & any(diff.match.tmp[ix.ab] < 30) | (length(ix.ab) == 2))) { + if (length(ix.ab) != 0) { + ix.c <- ix.c[-ix.ab] + } + + if(length(ix.c) != 0) { + unique.ids.homology <- unique(target[ix.c]) + homology.list[[i]] <- vector("list", length(unique.ids.homology)) + for (j in 1:length(unique.ids.homology)) { + ix.id.homology <- which(target[ix.c] == unique.ids.homology[j]) + max.match <- max(match[ix.c][ix.id.homology]) + homology.list[[i]][[j]] <- cbind(unique.ids.homology[j], max.match) + } + ix.junct.homology[i] <- 1 + } + if(length(ix.c) == 0) { + ix.junct.nohomology[i] <- 1 + + } + if (length(ix.a) > 1) { + flag.dup.a[i] <- 1 + } + if (length(ix.b) > 1) { + flag.dup.b[i] <- 1 + } + + } + + } + } +} +ix.junct <- sort(c(which(ix.junct.nohomology == 1), which(ix.junct.homology == 1))) +if (length(ix.junct) == 0) { + myflag <- 0 + cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) + stop("No putative gene fusions pass the self-homology filter. Exit!") +} + +info.homology <- rep("", length(ix.junct)) +for (i in 1: length(ix.junct)) { + list.tmp <- homology.list[[ix.junct[i]]] + if (is.null(list.tmp) == F) { + info.homology.tmp <- c() + n.homo <- length(list.tmp) + if (n.homo > 30) { + n.homo <- 30 + info.homology.tmp <- "More than 30 homologies found: " + } + for (j in 1: n.homo) { + info.homology.tmp <- paste(info.homology.tmp, paste(list.tmp[[j]][1,1]," (",list.tmp[[j]][1,2],"%)", sep = "" ), sep = "") + if (n.homo > 1 & j < n.homo) { + info.homology.tmp <- paste(info.homology.tmp, ", ", sep = "") + } else + { + info.homology[i] <- info.homology.tmp + } + } + + } +} + +info.id.and.homology <- cbind(unique.ids.nolabel[ix.junct], info.homology, flag.dup.a[ix.junct], flag.dup.b[ix.junct]) +save(info.id.and.homology, file = file.path(outputfolder,"out",paste(samplename,".ids_homology.RData", sep = ""))) diff --git a/lib/R/ConvertTxt2R.R b/lib/R/ConvertTxt2R.R new file mode 100644 index 0000000..c71b617 --- /dev/null +++ b/lib/R/ConvertTxt2R.R @@ -0,0 +1,78 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +ericscriptfolder <- split.vars [1] +refid <- split.vars[2] +dbfolder <- split.vars[3] +tmpfolder <- split.vars[4] + +xx <- read.delim(file.path(tmpfolder, "genepos.txt"), sep = "\t", header = F) +chrs <- as.character(xx[[2]]) +unique.chrs <- unique(chrs) +geneid <- as.character(xx[[1]]) +genepos <- as.numeric(as.character(xx[[3]])) +xx.strand <- read.delim(file.path(tmpfolder, "strand.txt"), sep = "\t", header = F) +strand <- as.character(xx.strand[[2]]) +## sorting genes by genomic pos +ix.srt <- rep(NA, dim(xx)[1]) +count <- 0 +for ( i in 1: length(unique.chrs)) { + ix.chr <- which(chrs == unique.chrs[i]) + tmp <- sort(genepos[ix.chr], index.return = T) + ix.srt[(count + 1):(count + length(ix.chr))] <- ix.chr[tmp$ix] + count <- count + length(ix.chr) +} + +geneid.srt <- geneid[ix.srt] +genepos.str <- genepos[ix.srt] +strand.srt <- strand[ix.srt] + +EnsemblGene.GenePosition <- xx[ix.srt, ] +names(EnsemblGene.GenePosition) <- c("EnsemblGene", "Chromosome", "Position") +save(EnsemblGene.GenePosition, file = file.path(dbfolder, "data", refid, "EnsemblGene.GenePosition.RData")) + +xx <- read.delim(file.path(tmpfolder, "exonstartend.mrg.txt"), sep = "\t", header = F) + +exgeneid <- as.character(xx[[4]]) +exchr <- as.character(xx[[1]]) +exstart.tmp <- as.numeric(as.character(xx[[2]])) + 1 +exend.tmp <- as.character(xx[[3]]) +ix.dup <- which(!duplicated(exgeneid)) +exstart <- rep("", length(ix.dup)) +exend <- rep("", length(ix.dup)) +excount <- rep(NA, length(ix.dup)) +start <- rep("", length(ix.dup)) +end <- rep("", length(ix.dup)) +strand1 <- rep("", length(ix.dup)) +for (i in 1: (length(ix.dup) - 1)) { + if (strand[i] == "-1") { + strand1[i] <- "-" + } else { + strand1[i] <- "+" + } + ix.tmp <- ix.dup[i]:(ix.dup[i+1] - 1) + start[i] <- exstart.tmp[ix.tmp][1] + end[i] <- exend.tmp[ix.tmp][length(ix.tmp)] + exstart[i] <- toString(exstart.tmp[ix.tmp]) + exend[i] <- toString(exend.tmp[ix.tmp]) + excount[i] <- length(ix.tmp) +} + + + +EnsemblGene.Structures <- cbind(exgeneid[ix.dup], exchr[ix.dup], strand1, start, end, excount, exstart, exend)[ix.srt, ] +EnsemblGene.Structures <- data.frame(EnsemblGene.Structures) +names(EnsemblGene.Structures) <- c("EnsemblGene", "Chromosome", "Strand", "geneStart", "geneEnd", "exonCount", "exonStart", "exonEnd") +save(EnsemblGene.Structures, file = file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData")) + +xx <- read.delim(file.path(tmpfolder, "geneinfo.txt"), sep = "\t", header = F) +EnsemblGene.GeneInfo <- xx[ix.srt, ] +names(EnsemblGene.GeneInfo) <- c("EnsemblGene", "GeneName", "Description") +save(EnsemblGene.GeneInfo, file = file.path(dbfolder, "data", refid, "EnsemblGene.GeneInfo.RData")) + +if (refid == "homo_sapiens") { +xx <- read.delim(file.path(tmpfolder, "paralogs.txt"), sep = "\t", header = F) +EnsemblGene.Paralogs <- xx +names(EnsemblGene.Paralogs) <- c("EnsemblGene", "Paralogs") +save(EnsemblGene.Paralogs, file = file.path(dbfolder, "data", refid, "EnsemblGene.Paralogs.RData")) +} diff --git a/lib/R/CreateDataEricTheSimulator.R b/lib/R/CreateDataEricTheSimulator.R new file mode 100644 index 0000000..f2461ef --- /dev/null +++ b/lib/R/CreateDataEricTheSimulator.R @@ -0,0 +1,53 @@ +options(stringsAsFactors=F) +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +refid <- split.vars[1] +dbfolder <- split.vars[2] +tmpfolder <- split.vars[3] + +## read transcript genomic info +xx <- read.delim(file.path(tmpfolder, "transcripts.txt"), sep = "\t", header = F) +geneid <- xx[[1]] +transcriptid <- xx[[2]] +exonstart <- xx[[3]] +exonend <- xx[[4]] +chr <- xx[[5]] +strandtmp <- xx[[6]] +strand <- rep("+", length(strandtmp)) +strand[strandtmp == "-1"] <- "-" +## read transcript cdna +xxseq <- scan(file.path(tmpfolder, "transcripts.fa"), what = list(seq="", id=""), sep = "\t", quiet = T) +seqtmp <- xxseq[[1]] +transcriptid.seqtmp <- xxseq[[2]] +rm (xx, xxseq) +unique.transcriptid <- unique(transcriptid) +EnsemblGene.Structures <- c() +GeneNames <- rep("", length(unique.transcriptid)) +sequences <- rep("", length(unique.transcriptid)) +for (i in 1: length(unique.transcriptid)) { + ix <- which(transcriptid == unique.transcriptid[i]) + ixseq <- which(transcriptid.seqtmp == unique.transcriptid[i]) + if (length(ixseq) > 0) { + sequences[i] <- seqtmp[ixseq] + } else { + sequences[i] <- "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" + } + ixsrt <- sort(exonstart[ix], index.return = T)$ix + genestart <- min(c(exonstart[ix], exonend[ix])) + geneend <- max(c(exonstart[ix], exonend[ix])) + exonStart <- toString(exonstart[ix[ixsrt]]) + exonEnd <- toString(exonend[ix[ixsrt]]) + exoncount <- length(ix) + mychr <- unique(chr[ix]) + mystrand <- unique(strand[ix]) + GeneNames[i] <- unique(geneid[ix]) + EnsemblGene.Structures <- rbind(EnsemblGene.Structures, c(unique.transcriptid[i], mychr, mystrand, genestart, geneend, exoncount, exonStart, exonEnd)) +} +colnames(EnsemblGene.Structures) <- c("EnsemblGene", "Chromosome", "Strand", "geneStart", "geneEnd", "exonCount", "exonStart", "exonEnd") +EnsemblGene.Structures <- data.frame(EnsemblGene.Structures) +save(EnsemblGene.Structures, GeneNames, sequences, file = file.path(dbfolder, "data", refid, "EnsemblGene.Transcripts.RData")) + + + + diff --git a/lib/R/DownloadDB.R b/lib/R/DownloadDB.R new file mode 100644 index 0000000..48e3757 --- /dev/null +++ b/lib/R/DownloadDB.R @@ -0,0 +1,207 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +ericscriptfolder <- split.vars [1] +dbfolder <- split.vars [2] +user.refid <- split.vars [3] +tmpfolder <- split.vars [4] +ensversion <- as.numeric(split.vars [5]) +#### + +load(file.path(dbfolder, "data", "_resources", "RefID.RData")) +ix.refid <- which(refid == user.refid) +if (length(ix.refid) == 0) { + cat("\n[EricScript] Error: No data available for genome ", user.refid, ". Run ericscript.pl --printdb to view the available genomes.\n", sep = "") + cat(0, file = file.path(tmpfolder, ".refid.flag")) + quit( save = "no") +} +cat(1, file = file.path(tmpfolder, ".refid.flag")) +myrefid <- refid[ix.refid] +myrefid.path <- refid.path[ix.refid] + +if (ensversion == 0) { + ensversion <- version +} +## create xml queries + +tmp0 <- unlist(strsplit(myrefid, "_")) +myrefid.xml <- paste(gsub(", ", "", toString(c(substr(tmp0[1], 1, 1), tmp0[2]))), "gene", "ensembl", sep = "_") + +## genepos file +fileout <- file.path(tmpfolder, "genepos.xml") +cat("", file = fileout, sep = "\n") +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat(paste("", sep = ""), file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +if (myrefid == "homo_sapiens") { + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) +} else { + cat("", file = fileout, sep = "\n", append = T) +} +#### start attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +#### end attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) + +## geneinfo file +fileout <- file.path(tmpfolder, "geneinfo.xml") +cat("", file = fileout, sep = "\n") +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat(paste("", sep = ""), file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +if (myrefid == "homo_sapiens") { + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) +} else { + cat("", file = fileout, sep = "\n", append = T) +} +#### start attributes +cat("", file = fileout, sep = "\n", append = T) +if (ensversion > 0 & ensversion <= 75) { + cat("", file = fileout, sep = "\n", append = T) +} else { + cat("", file = fileout, sep = "\n", append = T) +} +cat("", file = fileout, sep = "\n", append = T) +#### end attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) + +## exonstartend file +fileout <- file.path(tmpfolder, "exonstartend.xml") +cat("", file = fileout, sep = "\n") +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat(paste("", sep = ""), file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +if (myrefid == "homo_sapiens") { + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) +} else { + cat("", file = fileout, sep = "\n", append = T) +} +#### start attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +#### end attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) + +## strand file +fileout <- file.path(tmpfolder, "strand.xml") +cat("", file = fileout, sep = "\n") +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat(paste("", sep = ""), file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +if (myrefid == "homo_sapiens") { + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) +} else { + cat("", file = fileout, sep = "\n", append = T) +} +#### start attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +#### end attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) + + +## paralogs file (if it exists) +if (myrefid == "homo_sapiens") { + fileout <- file.path(tmpfolder, "paralogs.xml") + cat("", file = fileout, sep = "\n") + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) + cat(paste("", sep = ""), file = fileout, sep = "\n", append = T) +# cat("", file = fileout, sep = "\n", append = T) +# cat("", file = fileout, sep = "\n", append = T) + if (myrefid == "homo_sapiens") { + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) + } else { + cat("", file = fileout, sep = "\n", append = T) + } + #### start attributes + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) + #### end attributes + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) +} + +## transcripts (eric the simulator) +fileout <- file.path(tmpfolder, "transcripts.xml") +cat("", file = fileout, sep = "\n") +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat(paste("", sep = ""), file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +if (myrefid == "homo_sapiens") { + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) +} else { + cat("", file = fileout, sep = "\n", append = T) +} +#### start attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +#### end attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) + +fileout <- file.path(tmpfolder, "transcripts_cdna.xml") +cat("", file = fileout, sep = "\n") +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +cat(paste("", sep = ""), file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +#cat("", file = fileout, sep = "\n", append = T) +if (myrefid == "homo_sapiens") { + cat("", file = fileout, sep = "\n", append = T) + cat("", file = fileout, sep = "\n", append = T) +} else { + cat("", file = fileout, sep = "\n", append = T) +} +#### start attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) +#### end attributes +cat("", file = fileout, sep = "\n", append = T) +cat("", file = fileout, sep = "\n", append = T) + +## download gene data +system(paste("perl", file.path(ericscriptfolder, "lib", "perl", "retrievefrombiomart.pl"), file.path(tmpfolder, "genepos.xml"), ensversion, "| sort -u - >", file.path(tmpfolder, "genepos.txt"))) +system(paste("perl", file.path(ericscriptfolder, "lib", "perl", "retrievefrombiomart.pl"), file.path(tmpfolder, "geneinfo.xml"), ensversion, "| sort -u - >", file.path(tmpfolder, "geneinfo.txt"))) +system(paste("perl", file.path(ericscriptfolder, "lib", "perl", "retrievefrombiomart.pl"), file.path(tmpfolder, "exonstartend.xml"), ensversion, "| sort -u - >", file.path(tmpfolder, "exonstartend.txt"))) +system(paste("perl", file.path(ericscriptfolder, "lib", "perl", "retrievefrombiomart.pl"), file.path(tmpfolder, "strand.xml"), ensversion, "| sort -u - >", file.path(tmpfolder, "strand.txt"))) +system(paste("perl", file.path(ericscriptfolder, "lib", "perl", "retrievefrombiomart.pl"), file.path(tmpfolder, "transcripts.xml"), ensversion, "| sort -u - >", file.path(tmpfolder, "transcripts.txt"))) +system(paste("perl", file.path(ericscriptfolder, "lib", "perl", "retrievefrombiomart.pl"), file.path(tmpfolder, "transcripts_cdna.xml"), ensversion, ">", file.path(tmpfolder, "transcripts.fa"))) +if (myrefid == "homo_sapiens") { + system(paste("perl", file.path(ericscriptfolder, "lib", "perl", "retrievefrombiomart.pl"), file.path(tmpfolder, "paralogs.xml"), ensversion, "| sort -u - >", file.path(tmpfolder, "paralogs.txt"))) + acc.chrs <- c(1:22, "X", "Y") + cat (acc.chrs, file = file.path(tmpfolder, "chrlist"), sep = "\n") +} +## download seq data +download.file(file.path("ftp://ftp.ensembl.org/pub", paste("release-", ensversion, sep = ""), "fasta", myrefid, "dna", myrefid.path), destfile = file.path(tmpfolder, "seq.fa.gz"), quiet = T) + + + diff --git a/lib/R/EstimateSpanningReads.R b/lib/R/EstimateSpanningReads.R new file mode 100644 index 0000000..ba41eb7 --- /dev/null +++ b/lib/R/EstimateSpanningReads.R @@ -0,0 +1,124 @@ +## EstimateSpanningReads v2: exclude soft clipped reads +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +samplename <- split.vars [1] +outputfolder <- split.vars[2] +readlength <- max(as.numeric(split.vars[3])) +load(file.path(outputfolder,"out",paste(samplename,".junctions.recalibrated.RData", sep = ""))) +load(file.path(outputfolder, "out", paste(samplename, ".ids_fasta.RData", sep = ""))) +load(file.path(outputfolder, "out", "isize.RData")) +mysigma <- readlength/4 +myref <- sort(dnorm(seq(1, readlength, by = 1), readlength/2, mysigma), decreasing = T) +DataMatrix <- matrix(NA, nrow = length(ids_fasta), ncol = 9) +for (i in 1: length(ids_fasta)) { + junction.tmp <- junctions.recalibrated[i] + x <- system(paste("samtools view ", file.path(outputfolder, "aln", paste(samplename,".remap.recal.sorted.rmdup.bam", sep = ""))," ", ids_fasta[i], ":", junction.tmp, "-", junction.tmp + 1, " | awk '($5>0)' | cut -f 1,4,6 > ", file.path(outputfolder,"out",".spanningreads.sam"),sep = "")) + x <- system(paste("samtools view ", file.path(outputfolder, "aln", paste(samplename,".remap.recal.sorted.rmdup.bam", sep = ""))," ", ids_fasta[i]," | awk '($9>0)' | cut -f 9 - > ", file.path(outputfolder,"out",".insertsize.sam"),sep = "")) + x <- system(paste("samtools view ", file.path(outputfolder, "aln", paste(samplename,".remap.recal.sorted.rmdup.bam", sep = ""))," ", ids_fasta[i]," | awk '($4<", junction.tmp, ") && ($8>",junction.tmp+1,")' | cut -f 1,4 - > ", file.path(outputfolder,"out",".crossingreads.sam"),sep = "")) + + spanningreads.tmp <- scan(file.path(outputfolder,"out",".spanningreads.sam"), sep = "\t", what = list("", 1, ""), quiet = T) + ix.nosc <- sort(intersect(grep("^[0-9]*S", spanningreads.tmp[[3]], perl = T, invert = T), grep("[0-9]*S$", spanningreads.tmp[[3]], perl = T, invert = T))) + spanningreads <- vector("list", length = 3) + if (length(ix.nosc) > 0) { + spanningreads[[1]] <- spanningreads.tmp[[1]][ix.nosc] + spanningreads[[2]] <- spanningreads.tmp[[2]][ix.nosc] + spanningreads[[3]] <- spanningreads.tmp[[3]][ix.nosc] + } else { + spanningreads <- spanningreads.tmp + } + crossingreads <- scan(file.path(outputfolder,"out",".crossingreads.sam"), sep = "\t", what = list("", 1), quiet = T) + insert.size <- mean(abs(as.numeric(readLines(file.path(outputfolder,"out",".insertsize.sam"))))) - readlength + id.spanningreads <- spanningreads[[1]] + pos.spanningreads <- spanningreads[[2]] + id.crossingreads <- crossingreads[[1]] + pos.crossingreads <- crossingreads[[2]] + spanning.score <- 0 + edge.score <- 0 + range.pos.crossingreads <- 0 + if (length(pos.crossingreads) > 0) { + range.pos.crossingreads <- junction.tmp - min(pos.crossingreads) + } + n.crossingreads <- length(which(id.crossingreads %in% id.spanningreads == F)) + n.spanningreads <- 0 + gjs <- 0 + unique.score <- 0 + us.prob <- 0 + insertsize.score <- 0 + if (length(pos.spanningreads) > 0) { + pos <- pos.spanningreads - junction.tmp + readlength + us.pos <- tabulate(pos, nbins = readlength) + + if (sum(us.pos) > 0) { + us.mult <- floor(sum(us.pos)/readlength) + us.residuals <- sum(us.pos)/readlength - floor(sum(us.pos)/readlength) + us.refdistr <- rep(us.mult, readlength) + if (us.residuals > 0) { + for (kk in 1: (sum(us.pos) - us.mult*readlength)) { + us.refdistr[kk] <- us.refdistr[kk] + 1 + } + } else { + us.refdistr[1] <- us.refdistr[1] + 1 + us.refdistr[2] <- us.refdistr[2] - 1 + } + + ff <- which(sort(us.pos!=0)) + us.prob <- 1- sum(abs(sort(us.pos)[ff]-sort(us.refdistr)[ff]))/sum(us.pos) + } + + mynorm <- sum(myref[1:length(unique(pos))]) + prob <- sum(dnorm(unique(pos), readlength/2, mysigma)) + gjs <- prob/mynorm + + + left.spanningreads <- pos.spanningreads[(pos.spanningreads <= (junction.tmp - round(readlength/3)))] + right.spanningreads <- pos.spanningreads[(pos.spanningreads > (junction.tmp - round(readlength/3)))] + n.left.spanningreads <- length(left.spanningreads ) + n.right.spanningreads <- length(right.spanningreads) + spanning.score <- 1- abs(n.left.spanningreads - n.right.spanningreads)/(n.left.spanningreads + n.right.spanningreads) + + if (length(left.spanningreads) > 0) { + left.score <- mean((junction.tmp - readlength)-left.spanningreads) + } else { + left.score <- 0 + } + if (length(right.spanningreads) > 0) { + right.score <- mean(right.spanningreads - junction.tmp) + } else { + right.score <- 0 + } + edge.score <- 1 - 1.1^(mean(c(left.score,right.score))) + insertsize.score <- dnorm(insert.size, isize.mean, isize.sd)/dnorm(isize.mean, isize.mean, isize.sd) + n.spanningreads <- length(pos.spanningreads) + + } + + DataMatrix[i,] <- c(ids_fasta[i], n.crossingreads , insert.size, n.spanningreads, range.pos.crossingreads, edge.score, gjs, us.prob, insertsize.score) + +} +colnames(DataMatrix) <- c("id", "nreads","mean_ins_size","nreads_junc", "rangepos", "edgescore", "gjs", "uniformity.score", "isize.score") +save(DataMatrix, file = file.path(outputfolder,"out",paste(samplename, ".DataMatrix.RData", sep = ""))) + +filecon <- file(file.path(outputfolder,"out", paste(samplename, ".intervals", sep = "")), open = "w") +ix.filter <- sort(unique(intersect(which(DataMatrix[,4] > 0), which(DataMatrix[,3] != "NaN")))) +if (length(ix.filter) > 0) { + width <- 100 + id.filtered <- ids_fasta[ix.filter] + save(id.filtered, file = file.path(outputfolder, "out",paste(samplename,".ids_filtered.RData", sep = ""))) + for (i in 1:length(ix.filter)) { + ix.ref <- ix.filter[i] + junction <- junctions.recalibrated[ix.ref] + pileup.interval <- seq.int((junction - (width/2 - 1)), (junction + (width/2))) + pileup.interval[which(pileup.interval < 1)] <- 1 + pileup.interval <- unique(pileup.interval) + cat(paste(rep(id.filtered[i], length(pileup.interval)), pileup.interval, sep = " "), file = filecon, sep = "\n", append = T) + } + close(filecon) + myflag <- 1 + cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) +} else { + myflag <- 0 + cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) + stop("No chimeric transcripts found. Exit!") +} + diff --git a/lib/R/ExtractInsertSize.R b/lib/R/ExtractInsertSize.R new file mode 100644 index 0000000..46831ca --- /dev/null +++ b/lib/R/ExtractInsertSize.R @@ -0,0 +1,33 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +outputfolder <- split.vars[1] +bwa_aln <- as.numeric(as.character(split.vars[2])) +if (bwa_aln == 0) { + xx <- readLines(file.path(outputfolder, "out", ".ericscript.log")) + ix.isize <- grep("mean and std.dev:", xx) + if (length(ix.isize) > 0) { + isize.tmp <- strsplit(tail(xx[ix.isize], n = 1), ": ")[[1]][2] + isize.tmp1 <- unlist(strsplit(isize.tmp, ",", fixed = T)) + isize.mean <- as.numeric(substr(isize.tmp1[1], 2, nchar(isize.tmp1[1]))) + isize.sd <- as.numeric(substr(isize.tmp1[2], 1, nchar(isize.tmp1[2])-1)) + } else { + isize.mean <- 200 + isize.sd <- 40 + } + save(isize.mean, isize.sd, file = file.path(outputfolder, "out", "isize.RData")) +} else { + xx <- readLines(file.path(outputfolder, "out", ".ericscript.log")) + ix.isize <- grep("inferred external isize from", xx) + if (length(ix.isize) > 0) { + isize.tmp <- strsplit(tail(xx[ix.isize], n = 1), ": ")[[1]][2] + isize.tmp1 <- unlist(strsplit(isize.tmp, "+/-", fixed = T)) + isize.mean <- as.numeric(isize.tmp1[1]) + isize.sd <- as.numeric(isize.tmp1[2]) + } else { + isize.mean <- 200 + isize.sd <- 40 + } + save(isize.mean, isize.sd, file = file.path(outputfolder, "out", "isize.RData")) + +} diff --git a/lib/R/ImportResults.R b/lib/R/ImportResults.R new file mode 100644 index 0000000..860666a --- /dev/null +++ b/lib/R/ImportResults.R @@ -0,0 +1,409 @@ +### ensgene converter + +toens <- function(ericscriptfolder, genename) { + + load(file.path(ericscriptfolder, "lib", "data", "EnsemblGene.GeneInfo.RData")) + ensgene <- as.character(EnsemblGene.GeneInfo$EnsemblGene)[which(as.character(EnsemblGene.GeneInfo$GeneName) == genename)] + if (length(ensgene) == 0) {ensgene <- NA} + return(ensgene) + +} + + +convertToComplement<-function(x) { + + bases=c("A","C","G","T") + xx<-unlist(strsplit(toupper(x), NULL)) + paste(unlist(lapply(xx, function(bbb) { + if(bbb=="A") compString <- "T" + if(bbb=="C") compString <- "G" + if(bbb=="G") compString <- "C" + if(bbb=="T") compString <- "A" + if(!bbb %in% bases) compString <- "N" + return(compString) + })),collapse="") + +} + +### import results from algorithm''s output + +Import_ericscript <- function(outputpath) { + + filename <- grep(".results.total.tsv", list.files(outputpath), value = T) + + if (length(filename) == 1) { + xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) + gene5 <- as.character(xx$EnsemblGene1) + gene3 <- as.character(xx$EnsemblGene2) + nreads <- as.numeric(as.character(xx$spanningreads)) + score <- as.numeric(as.character(xx$EricScore)) + seq <- as.character(xx$JunctionSequence) + + algout <- list() + algout$gene5 <- gene5 + algout$gene3 <- gene3 + algout$nreads <- nreads + algout$score <- score + algout$seq <- seq + + return(algout) + } else if (length(filename) == 0) { + + return(0) + + } else if (length(filename) > 1) { + + return(length(filename)) + + } + + +} + + + +Import_defuse <- function(outputpath) { + + filename <- grep(".classify.tsv", list.files(outputpath), value = T) + + if (length(filename) == 1) { + + xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) + gene5 <- as.character(xx$gene1) + gene3 <- as.character(xx$gene2) + nreads <- as.numeric(as.character(xx$splitr_count)) + score <- as.numeric(as.character(xx$probability)) + seq <- rep("", dim(xx)[1]) + for (seqd in 1: dim(xx)[1]) { + tmp <- unlist(strsplit(as.character(xx$splitr_sequence[seqd]), "|", fixed = T)) + seq[seqd] <- paste(substr(tmp[1], (nchar(tmp[1])-29), nchar(tmp[1])), substr(tmp[2], 1, 30), sep = "") + } + + + algout <- list() + algout$gene5 <- gene5 + algout$gene3 <- gene3 + algout$nreads <- nreads + algout$score <- score + algout$seq <- seq + + return(algout) + } else if (length(filename) == 0) { + + return(0) + + } else if (length(filename) > 1) { + + return(length(filename)) + + } + +} + + +Import_chimerascan <- function(outputpath) { + + filename <- grep("chimeras.bedpe", list.files(outputpath), value = T) + + if (length(filename) == 1) { + + xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) + gene1tmp <- as.character(xx$genes5p) + gene2tmp <- as.character(xx$genes3p) + nreadstmp <- as.numeric(as.character(xx$total_frags)) + scoretmp <- as.numeric(as.character(xx$score)) + + gene1 <- c() + gene2 <- c() + nreads <- c() + score <- c() + + for (i in 1: length(gene1tmp)) { + + if((length(grep(",", gene1tmp[i])) > 0) & (length(grep(",", gene2tmp[i])) == 0)) { + + gene1 <- c(gene1, unlist(strsplit(gene1tmp[i], ","))) + myrep <- length(unlist(strsplit(gene1tmp[i], ","))) + gene2 <- c(gene2, rep(gene2tmp[i], myrep)) + nreads <- c(nreads, rep(nreadstmp[i], myrep)) + score <- c(score, rep(scoretmp[i], myrep)) + + } else if ((length(grep(",", gene1tmp[i])) == 0) & (length(grep(",", gene2tmp[i])) > 0)) { + + gene2 <- c(gene2, unlist(strsplit(gene2tmp[i], ","))) + myrep <- length(unlist(strsplit(gene2tmp[i], ","))) + gene1 <- c(gene1, rep(gene1tmp[i], myrep)) + nreads <- c(nreads, rep(nreadstmp[i], myrep)) + score <- c(score, rep(scoretmp[i], myrep)) + + } else if ((length(grep(",", gene1tmp[i])) > 0) & (length(grep(",", gene2tmp[i])) > 0)) { + + gene1tmp1 <- unlist(strsplit(gene1tmp[i], ",")) + gene2tmp1 <- unlist(strsplit(gene2tmp[i], ",")) + myrep1 <- length(unlist(strsplit(gene1tmp[i], ","))) + myrep2 <- length(unlist(strsplit(gene2tmp[i], ","))) + + for (j in 1: myrep1) { + + gene1 <- c(gene1, rep(gene1tmp1[j], myrep2)) + gene2 <- c(gene2, gene2tmp1) + nreads <- c(nreads, rep(nreadstmp[i], myrep2)) + score <- c(score, rep(scoretmp[i], myrep2)) + + } + + } else { + + gene1 <- c(gene1, gene1tmp[i]) + gene2 <- c(gene2, gene2tmp[i]) + nreads <- c(nreads, nreadstmp[i]) + score <- c(score, scoretmp[i]) + + } + + } + + seq <- rep(NA, length(gene1)) + gene5 <- rep(NA, length(gene1)) + gene3 <- rep(NA, length(gene1)) + + for (i in 1: length(gene1)) { + + gene5[i] <- toens(ericscriptfolder, gene1[i]) + gene3[i] <- toens(ericscriptfolder, gene2[i]) + + } + + algout <- list() + algout$gene5 <- gene5 + algout$gene3 <- gene3 + algout$nreads <- nreads + algout$score <- score + algout$seq <- seq + + return(algout) + } else if (length(filename) == 0) { + + return(0) + + } else if (length(filename) > 1) { + + return(length(filename)) + + } + +} + + + +Import_shortfuse <- function(outputpath) { + + filename <- grep("fusion_counts.bedpe", list.files(outputpath), value = T) + + if (length(filename) == 1) { + + xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = F) + gene1tmp <- as.character(xx[, 11]) + gene2tmp <- as.character(xx[, 12]) + nreadstmp <- as.numeric(as.character(xx[, 8])) + scoretmp <- as.numeric(as.character(xx[, 8])) + + gene12 <- paste(gene1tmp, gene2tmp) + ixNOdupgene <- which(duplicated(gene12) == F) + + gene5 <- rep(NA, length(ixNOdupgene)) + gene3 <- rep(NA, length(ixNOdupgene)) + + for (i in 1: length(ixNOdupgene)) { + + gene5[i] <- toens(ericscriptfolder, gene1tmp[ixNOdupgene[i]]) + gene3[i] <- toens(ericscriptfolder, gene2tmp[ixNOdupgene[i]]) + + } + + nreads <- nreadstmp[ixNOdupgene] + score <- scoretmp[ixNOdupgene] + seq <- rep(NA, length(nreads)) + + algout <- list() + algout$gene5 <- gene5 + algout$gene3 <- gene3 + algout$nreads <- nreads + algout$score <- score + algout$seq <- seq + + return(algout) + } else if (length(filename) == 0) { + + return(0) + + } else if (length(filename) > 1) { + + return(length(filename)) + + } + +} + + + + +Import_fusionmap <- function(outputpath) { + + filename <- grep("FusionReport.txt", list.files(outputpath), value = T) + + if (length(filename) == 1) { + + xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) + + transcriptversetmp <- as.character(xx$FusionGene) + genelisttmp <- unlist(strsplit(transcriptversetmp, "->")) + gene1tmp <- genelisttmp[seq(1, length(genelisttmp), by = 2)] + gene2tmp <- genelisttmp[seq(2, length(genelisttmp), by = 2)] + gene1tmp1 <- as.character(xx$KnownGene1) + gene2tmp1 <- as.character(xx$KnownGene2) + seqtmp <- as.character(xx$FusionJunctionSequence) + ix.reverse <- which(gene1tmp %in% gene1tmp1 == F) + for (ii in 1: length(ix.reverse)) { + seqtmp[ix.reverse[ii]] <- convertToComplement(reverse(seqtmp[ix.reverse[ii]])) + } + nreadstmp <- as.numeric(as.character(xx[, 2])) + scoretmp <- as.numeric(as.character(xx[, 2])) + + gene1 <- c() + gene2 <- c() + nreads <- c() + score <- c() + seq <- c() + + for (i in 1: length(gene1tmp)) { + + if((length(grep(",", gene1tmp[i])) > 0) & (length(grep(",", gene2tmp[i])) == 0)) { + + gene1 <- c(gene1, unlist(strsplit(gene1tmp[i], ","))) + myrep <- length(unlist(strsplit(gene1tmp[i], ","))) + gene2 <- c(gene2, rep(gene2tmp[i], myrep)) + nreads <- c(nreads, rep(nreadstmp[i], myrep)) + score <- c(score, rep(scoretmp[i], myrep)) + seq <- c(seq, rep(seqtmp[i], myrep)) + + } else if ((length(grep(",", gene1tmp[i])) == 0) & (length(grep(",", gene2tmp[i])) > 0)) { + + gene2 <- c(gene2, unlist(strsplit(gene2tmp[i], ","))) + myrep <- length(unlist(strsplit(gene2tmp[i], ","))) + gene1 <- c(gene1, rep(gene1tmp[i], myrep)) + nreads <- c(nreads, rep(nreadstmp[i], myrep)) + score <- c(score, rep(scoretmp[i], myrep)) + seq <- c(seq, rep(seqtmp[i], myrep)) + + + } else if ((length(grep(",", gene1tmp[i])) > 0) & (length(grep(",", gene2tmp[i])) > 0)) { + + gene1tmp1 <- unlist(strsplit(gene1tmp[i], ",")) + gene2tmp1 <- unlist(strsplit(gene2tmp[i], ",")) + myrep1 <- length(unlist(strsplit(gene1tmp[i], ","))) + myrep2 <- length(unlist(strsplit(gene2tmp[i], ","))) + + for (j in 1: myrep1) { + + gene1 <- c(gene1, rep(gene1tmp1[j], myrep2)) + gene2 <- c(gene2, gene2tmp1) + nreads <- c(nreads, rep(nreadstmp[i], myrep2)) + score <- c(score, rep(scoretmp[i], myrep2)) + seq <- c(seq, rep(seqtmp[i], myrep2)) + + } + + } else { + + gene1 <- c(gene1, gene1tmp[i]) + gene2 <- c(gene2, gene2tmp[i]) + nreads <- c(nreads, nreadstmp[i]) + score <- c(score, scoretmp[i]) + seq <- c(seq, seqtmp[i]) + + } + + } + + gene5 <- rep(NA, length(gene1)) + gene3 <- rep(NA, length(gene1)) + + for (i in 1: length(gene1)) { + + gene5[i] <- toens(ericscriptfolder, gene1[i]) + gene3[i] <- toens(ericscriptfolder, gene2[i]) + + } + + algout <- list() + algout$gene5 <- gene5 + algout$gene3 <- gene3 + algout$nreads <- nreads + algout$score <- score + algout$seq <- seq + + return(algout) + } else if (length(filename) == 0) { + + return(0) + + } else if (length(filename) > 1){ + + return(length(filename)) + + } + +} + + + + +Import_unknown <- function(outputpath) { + + filename <- grep("ericsim", list.files(outputpath), value = T) + + if (length(filename) == 1) { + + xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) + + gene5 <- as.character(xx$gene5) + gene3 <- as.character(xx$gene3) + nreads <- as.numeric(as.character(xx$nread)) + score <- as.numeric(as.character(xx$score)) + seq <- as.character(x$seq) + + algout <- list() + algout$gene5 <- gene5 + algout$gene3 <- gene3 + algout$nreads <- nreads + algout$score <- score + algout$seq <- seq + + return(algout) + + } else if (length(filename) == 0) { + + return(0) + + } else if (length(filename) > 1) { + + return(length(filename)) + + } + + + +} + + + + + + + + + + + + diff --git a/lib/R/MakeAdjacencyMatrix.R b/lib/R/MakeAdjacencyMatrix.R new file mode 100644 index 0000000..0860369 --- /dev/null +++ b/lib/R/MakeAdjacencyMatrix.R @@ -0,0 +1,119 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +samplename <- split.vars [1] +outputfolder <- split.vars[2] +ericscriptfolder <- split.vars[3] +minreads <- as.numeric(split.vars[4]) +MAPQ <- as.numeric(split.vars[5]) +refid <- as.character(split.vars[6]) +dbfolder <- as.character(split.vars[7]) + +filein <- file.path(outputfolder, "out", paste(samplename, ".filtered.out", sep = "")) +xx <- readLines(filein, n = 1) +if (length(xx) == 0) { + myflag <- 0 + cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) + stop("No lines available in ",filein,". No discordant reads found with MAPQ set to ", MAPQ, ". Try to decrease MAPQ parameter and run again EricScript. Exit!") +} else { + myflag <- 1 + cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) +} +x <- read.delim(filein, sep = "\t", header = F) +load(file.path(dbfolder,"data", refid, "EnsemblGene.GenePosition.RData")) +flag <- x[,1] +#ix.flag <- which((flag > 63 & flag < 70) | (flag > 95 & flag < 118) | flag == 161 | flag == 181) +ix.flag <- which((flag > 63 & flag < 70) | (flag > 95 & flag < 112) | flag == 161) +id_1 <- as.character(x[ix.flag,2]) +id_2 <- as.character(x[ix.flag,5]) +pos_1 <- as.numeric(as.character(x[ix.flag,3])) +pos_2 <- as.numeric(as.character(x[ix.flag,6])) +rm(x) +genename <- as.character(EnsemblGene.GenePosition$EnsemblGene) +id1 <- c() +id2 <- c() +nreads <- c() +diffpos <- c() +generef <- unique(id_1) +for (i in 1: length(generef)) { + ix.generef <- which(genename == generef[i]) + ix.gene <- which(id_1 == generef[i]) + tmp <- sort(summary(as.factor(id_2[ix.gene]), maxsum = length(unique(id_2[ix.gene]))), decreasing = T) + tmp.genename <- names(tmp) + tmp.weight <- as.numeric(tmp) + if ((max(tmp.weight) >= minreads) & (length(which(tmp.weight >= minreads)) <= 10)) { + ix.maxnodes <- which(tmp.weight >= minreads) + tmp.genename <- tmp.genename[ix.maxnodes] + tmp.weight <- tmp.weight[ix.maxnodes] + for (j in 1:length(tmp.weight)) { + ix.genelink <- which(genename == tmp.genename[j]) + if (length(ix.genelink)!=0) { + id1 <- c(id1, generef[i]) + id2 <- c(id2, tmp.genename[j]) + nreads <- c(nreads, tmp.weight[j]) + diffpos <- c(diffpos, abs(ix.generef-ix.genelink)) + } + } + } + +} +## filter paralogs if paralogs exist + +if (file.exists(file.path(dbfolder,"data", refid, "EnsemblGene.Paralogs.RData"))) { + + load(file.path(dbfolder, "data", refid, "EnsemblGene.Paralogs.RData")) + paralogs.flag <- rep(0, length(id1)) + + if (length(id1) == 0) { + myflag <- 0 + cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) + stop("No discordant reads found with minimum reads set to ", minreads, ". Exit!") + } + for (i in 1: length(id1)) { + ix.paralogs <- which(EnsemblGene.Paralogs$EnsemblGene == id1[i]) + paralogs <- as.character(EnsemblGene.Paralogs$Paralogs[ix.paralogs]) + if (length(grep(id2[i], paralogs)) > 0) { + paralogs.flag[i] <- 1 + } + } + ## + paralogs.filter <- which(paralogs.flag == 0) + id1f <- id1[paralogs.filter] + id2f <- id2[paralogs.filter] + nreadsf <- nreads[paralogs.filter] + diffposf <- diffpos[paralogs.filter] + if (length(id1) == 0) { + myflag <- 0 + cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) + stop("No discordant reads found with minimum reads set to ",minreads,". Exit!") + } + nfus <- length(id1f) + MyGF <- vector("list", 6) + names(MyGF) <- c("id1", "id2", "nreads", "pos1", "pos2", "diffpos") + MyGF$id1 <- id1f + MyGF$id2 <- id2f + MyGF$nreads <- nreadsf + MyGF$diffpos <- diffposf + MyGF$pos1 <- vector("list", nfus) + MyGF$pos2 <- vector("list", nfus) + for (i in 1: (nfus)) { + ix.pos <- which((id_1 == id1f[i]) & (id_2 ==id2f[i])) + MyGF$pos1[[i]] <- pos_1[ix.pos] + MyGF$pos2[[i]] <- pos_2[ix.pos] + } +} else { + + nfus <- length(id1) + MyGF <- vector("list", 6) + names(MyGF) <- c("id1", "id2", "nreads", "pos1", "pos2", "diffpos") + MyGF$id1 <- id1 + MyGF$id2 <- id2 + MyGF$nreads <- nreads + MyGF$diffpos <- diffpos + MyGF$pos1 <- pos_1 + MyGF$pos2 <- pos_2 + +} + +save(MyGF, file = file.path(outputfolder, "out", paste(samplename, ".chimeric.RData", sep = ""))) + diff --git a/lib/R/MakeEmptyResults.R b/lib/R/MakeEmptyResults.R new file mode 100644 index 0000000..00781a7 --- /dev/null +++ b/lib/R/MakeEmptyResults.R @@ -0,0 +1,27 @@ +## MakeResults.R v0.2 +## different read count-based method for gene expression level estimation +## added machine-learning based algorithm as summarization score +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +samplename <- split.vars [1] +outputfolder <- split.vars[2] + +Results <- "No Chimeric Transcript found!" +write.table(Results, file = file.path(outputfolder,paste(samplename,".results.total.tsv", sep = "")), sep = "\t", row.names = F, col.names = F, quote = F) + + + + + + + + + + + + + + + + diff --git a/lib/R/MakeResults.R b/lib/R/MakeResults.R new file mode 100644 index 0000000..7cb5155 --- /dev/null +++ b/lib/R/MakeResults.R @@ -0,0 +1,485 @@ +## MakeResults.R v0.5 +## different read count-based method for gene expression level estimation +## added machine-learning based algorithm as summarization score +## new method to retrieve genomic coordinates +## genome reference from db data +## edited gene fusions separator + +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +samplename <- split.vars [1] +outputfolder <- split.vars[2] +ericscriptfolder <- split.vars[3] +readlength <- as.numeric(split.vars[4]) +verbose <- as.numeric(split.vars[5]) +refid <- as.character(split.vars [6]) +dbfolder <- as.character(split.vars[7]) +#genomeref <- as.character(split.vars[8]) + +flag.ada <- require(ada, quietly = T) +if (flag.ada == F) { + require(kernlab, quietly = T) +} +load(file.path(outputfolder,"out",paste(samplename,".chimeric.RData", sep = ""))) +load(file.path(outputfolder,"out",paste(samplename,".DataMatrix.RData", sep = ""))) +load(file.path(outputfolder,"out", paste(samplename,".ids_homology.RData", sep = ""))) +load(file.path(dbfolder, "data", refid, "EnsemblGene.GeneInfo.RData")) +load(file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData")) +load(file.path(dbfolder, "data", refid, "EnsemblGene.GeneNames.RData")) +load(file.path(dbfolder, "data", refid, "EnsemblGene.Sequences.RData")) +load(file.path(ericscriptfolder, "lib","data", "_resources", "BlackList.RData")) +load(file.path(ericscriptfolder, "lib","data", "_resources", "DataModel.RData")) +genomeref <- file.path(dbfolder, "data", refid, "allseq.fa") + +checkselfhomology.fa <- scan(file = file.path(outputfolder,"out", paste(samplename, ".checkselfhomology.fa", sep = "")), sep= "\n", what = "", quiet = T) +seq.length <- nchar(checkselfhomology.fa)[2] +ix.id.fa <- which(checkselfhomology.fa %in% paste(">",info.id.and.homology[,1], sep = "")) +ix.fa <- ix.id.fa + 1 +left_junction <- substr(checkselfhomology.fa[ix.fa], 1, (seq.length/2)) +right_junction <- substr(checkselfhomology.fa[ix.fa], (seq.length/2 + 1), seq.length) +tmp_id <- unlist(strsplit(checkselfhomology.fa[ix.id.fa], "----", fixed = T)) +left_id <- tmp_id[seq(1, length(ix.id.fa)*2, by = 2)] +right_id <- paste(">",tmp_id[seq(2, length(ix.id.fa)*2, by = 2)], sep = "") +exone.tmp <- unlist(strsplit(as.character(DataMatrix[,1]), "----", fixed = T)) +exone.tmp1 <- exone.tmp[seq(1, dim(DataMatrix)[1]*2, by = 2)] +exone.tmp2 <- exone.tmp[seq(2, dim(DataMatrix)[1]*2, by = 2)] +ensgenename1 <- unlist(strsplit(exone.tmp1, "_"))[seq(1, dim(DataMatrix)[1]*2, by = 2)] +ensgenename2 <- unlist(strsplit(exone.tmp2, "_"))[seq(1, dim(DataMatrix)[1]*2, by = 2)] +ix.idd <- which(DataMatrix[,1] %in% info.id.and.homology[,1]) +DataMatrixF <- DataMatrix[ix.idd ,] +homology <- info.id.and.homology[,2] +flag.dup.a <- info.id.and.homology[,3] +flag.dup.b <- info.id.and.homology[,4] +geneinfo.id <- as.character(EnsemblGene.Structures$EnsemblGene) +geneinfo.chr <- as.character(EnsemblGene.Structures$Chromosome) +geneinfo.start <- as.character(EnsemblGene.Structures$geneStart) +geneinfo.end <- as.character(EnsemblGene.Structures$geneEnd) +geneinfo.description <- as.character(EnsemblGene.GeneInfo$Description) +geneinfo.genename <- as.character(EnsemblGene.GeneInfo$GeneName) +geneinfo.strand <- as.character(EnsemblGene.Structures$Strand) +ensgenename12 <- as.character(DataMatrixF[,1]) +exone.tmp <- unlist(strsplit(as.character(DataMatrixF[,1]), "----", fixed = T)) +exone.tmp1 <- exone.tmp[seq(1, dim(DataMatrixF)[1]*2, by = 2)] +exone.tmp2 <- exone.tmp[seq(2, dim(DataMatrixF)[1]*2, by = 2)] +ensgenename1 <- unlist(strsplit(exone.tmp1, "_"))[seq(1, dim(DataMatrixF)[1]*2, by = 2)] +ensgenename2 <- unlist(strsplit(exone.tmp2, "_"))[seq(1, dim(DataMatrixF)[1]*2, by = 2)] +geneid1 <- rep("",length(ensgenename1)) +geneid2 <- rep("",length(ensgenename1)) +description1 <- rep("",length(ensgenename1)) +description2 <- rep("",length(ensgenename1)) +status1 <- rep("",length(ensgenename1)) +status2 <- rep("",length(ensgenename1)) +chr1 <- rep("", length(ensgenename1)) +chr2 <- rep("", length(ensgenename1)) +genestart1 <- rep("", length(ensgenename1)) +genestart2 <- rep("", length(ensgenename1)) +geneend1 <- rep("", length(ensgenename1)) +geneend2 <- rep("", length(ensgenename1)) +biotype1 <- rep("", length(ensgenename1)) +biotype2 <- rep("", length(ensgenename1)) +strand1 <- rep("", length(ensgenename1)) +strand2 <- rep("", length(ensgenename1)) + +for (i in 1:length(ensgenename1)) { + ix.gene1 <- which(geneinfo.id == ensgenename1[i]) + ix.gene1.info <- which(EnsemblGene.GeneInfo$EnsemblGene == ensgenename1[i]) + if (length(ix.gene1)!=0) { + geneid1[i] <- geneinfo.genename[ix.gene1.info] + description1[i] <- geneinfo.description[ix.gene1.info] + chr1[i] <- geneinfo.chr[ix.gene1] + strand1[i] <- geneinfo.strand[ix.gene1] + genestart1[i] <- geneinfo.start[ix.gene1] + geneend1[i] <- geneinfo.end[ix.gene1] + } + ix.gene2 <- which(geneinfo.id == ensgenename2[i]) + ix.gene2.info <- which(EnsemblGene.GeneInfo$EnsemblGene == ensgenename2[i]) + if (length(ix.gene2)!=0) { + geneid2[i] <- geneinfo.genename[ix.gene2.info] + description2[i] <- geneinfo.description[ix.gene2.info] + chr2[i] <- geneinfo.chr[ix.gene2] + strand2[i] <- geneinfo.strand[ix.gene2] + genestart2[i] <- geneinfo.start[ix.gene2] + geneend2[i] <- geneinfo.end[ix.gene2] + + } +} + +exone1 <- unlist(strsplit(exone.tmp1, "_"))[seq(2, dim(DataMatrixF)[1]*2, by = 2)] +exone2 <- unlist(strsplit(exone.tmp2, "_"))[seq(2, dim(DataMatrixF)[1]*2, by = 2)] +genename1 <- geneid1 +genename2 <- geneid2 +diff.adj.tot <- rep(NA, length(ensgenename1)) +n.crossing <- rep(NA, length(ensgenename1)) +for (hh in 1: length(ensgenename1)) { + hhix <- which(MyGF$id1 == ensgenename1[hh] & MyGF$id2 == ensgenename2[hh]) + diff.adj.tot[hh] <- MyGF$diffpos[hhix] + n.crossing[hh] <- MyGF$nreads[hhix] +} +n.spanning <- as.numeric(as.character(DataMatrixF[,4])) +edge.score <- round(as.numeric(as.character(DataMatrixF[,6])), digits = 4) +gjs.score <- round(as.numeric(as.character(DataMatrixF[,7])), digits = 4) +unique.score <- round(as.numeric(as.character(DataMatrixF[,8])), digits = 4) +isize.score <- round(as.numeric(as.character(DataMatrixF[,9])), digits = 4) +ins.size <- round(as.numeric(as.character(DataMatrixF[,3])), digits = 2) +adj <- rep("Not Adjacent", length(ensgenename1)) +adj[which(diff.adj.tot == 1)] <- "Adjacent" +if (refid == "homo_sapiens") { + + acceptable.chrs <- c(seq.int(1,22), "X","Y") + ix.chr1 <- which(chr1 %in% acceptable.chrs) + ix.chr2 <- which(chr2 %in% acceptable.chrs) + ix.chr.tmp1 <- intersect(ix.chr1, ix.chr2) + ix.chr.tmp2 <- intersect(grep("NNN", right_junction, invert = T), grep("NNN", left_junction, invert = T)) + ix.chr <- intersect(ix.chr.tmp1, ix.chr.tmp2) + + + if (length(ix.chr) > 0) { + genename1 <- genename1[ix.chr] + genename2 <- genename2[ix.chr] + chr1 <- chr1[ix.chr] + chr2 <- chr2[ix.chr] + genestart1 <- genestart1[ix.chr] + genestart2 <- genestart2[ix.chr] + geneend1 <- geneend1[ix.chr] + geneend2 <- geneend2[ix.chr] + exone1 <- exone1[ix.chr] + exone2 <- exone2[ix.chr] + ensgenename1 <- ensgenename1[ix.chr] + ensgenename12 <- ensgenename12[ix.chr] + ensgenename2 <- ensgenename2[ix.chr] + DataMatrixF <- rbind(DataMatrixF[ix.chr,]) + homology <- homology[ix.chr] + flag.dup.a <- flag.dup.a[ix.chr] + flag.dup.b <- flag.dup.b[ix.chr] + adj <- adj[ix.chr] + n.spanning <- n.spanning[ix.chr] + n.crossing <- n.crossing[ix.chr] + edge.score <- edge.score[ix.chr] + unique.score <- unique.score[ix.chr] + gjs.score <- gjs.score[ix.chr] + isize.score <- isize.score[ix.chr] + ins.size <- ins.size[ix.chr] + biotype1 <- biotype1[ix.chr] + biotype2 <- biotype2[ix.chr] + status1 <- status1[ix.chr] + status2 <- status2[ix.chr] + description1 <- description1[ix.chr] + description2 <- description2[ix.chr] + left_id <- left_id[ix.chr] + left_junction <- left_junction[ix.chr] + right_id <- right_id[ix.chr] + right_junction <- right_junction[ix.chr] + strand1 <- strand1[ix.chr] + strand2 <- strand2[ix.chr] + } +} +stats <- read.delim(file.path(outputfolder, "out", paste(samplename, ".stats", sep = "")), sep = "\t", header = F) +gene.stats <- as.character(stats[,1]) +length.stats <- as.numeric(stats[,2]) +nreads.stats <- as.numeric(stats[,3]) +gene1.exp <- rep(0, length(ensgenename1)) +gene2.exp <- rep(0, length(ensgenename1)) +gene12.exp <- round(((as.numeric(DataMatrixF[,2]) + as.numeric(DataMatrixF[,4]))*readlength/as.numeric(DataMatrixF[,5])), digits = 2) +#gene12.exp <- rep(0, length(ensgenename1)) +for (iexpr in 1: length(ensgenename1)) { + ix1.stats <- which(gene.stats == ensgenename1[iexpr]) + ix2.stats <- which(gene.stats == ensgenename2[iexpr]) + # ix12.stats <- which(gene.stats == ensgenename12[iexpr]) + gene1.exp[iexpr] <- round(nreads.stats[ix1.stats]*readlength/length.stats[ix1.stats], digits = 2) + gene2.exp[iexpr] <- round(nreads.stats[ix2.stats]*readlength/length.stats[ix2.stats], digits = 2) + # gene12.exp[iexpr] <- round(nreads.stats[ix12.stats]*readlength/length.stats[ix12.stats], digits = 2) +} + +# NEW Find GenomicPosition (50nt) +for (i in 1: length(ensgenename1)) { + if (i == 1) { + cat(paste("@", i, "_", 1, "\n", left_junction[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction[i])))), "\n", "@", i, "_", 2, "\n", right_junction[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) + } else { + cat(paste("@", i, "_", 1, "\n", left_junction[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction[i])))), "\n", "@", i, "_", 2, "\n", right_junction[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) + } +} +system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) +system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) +system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) +xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) +genpos_1 <- rep(0,length(ensgenename1)) +genpos_2 <- rep(0,length(ensgenename1)) +id.pos <- as.character(xx.pos[[1]]) +flag.pos <- as.character(xx.pos[[2]]) +chr.pos <- as.character(xx.pos[[3]]) +if (length(grep("chr", chr.pos)) > 0) { + chr.pos <- gsub("chr", "", chr.pos) +} +pos.pos <- as.numeric(as.character(xx.pos[[4]])) +mapq.pos <- as.numeric(as.character(xx.pos[[5]])) + +for (i in 1: length(ensgenename1)) { + + ## for 5' gene + ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_1[i] <- pos.pos.ix[ix.okpos] + } else { + genpos_1[i] <- pos.pos.ix[ix.okpos] + 49 + } + } + ## for 3' gene + ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_2[i] <- pos.pos.ix[ix.okpos] + 49 + } else { + genpos_2[i] <- pos.pos.ix[ix.okpos] + } + } +} + +# NEW Find GenomicPosition (25nt_1) +ix.na.pos_1 <- which(genpos_1 == 0) +ix.na.pos_2 <- which(genpos_2 == 0) +if (length(ix.na.pos_1 ) > 0 | length(ix.na.pos_2 ) > 0) { + +left_junction.trim <- substr(left_junction, 26, 50) +right_junction.trim <- substr(right_junction, 1, 25) +for (i in 1: length(ensgenename1)) { + if (i == 1) { + cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) + } else { + cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) + } +} +system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) +system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) +system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) +xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) +id.pos <- as.character(xx.pos[[1]]) +flag.pos <- as.character(xx.pos[[2]]) +chr.pos <- as.character(xx.pos[[3]]) +if (length(grep("chr", chr.pos)) > 0) { + chr.pos <- gsub("chr", "", chr.pos) +} +pos.pos <- as.numeric(as.character(xx.pos[[4]])) +mapq.pos <- as.numeric(as.character(xx.pos[[5]])) +for (i in 1: length(ensgenename1)) { + if (i %in% ix.na.pos_1) { + ## for 5' gene + ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_1[i] <- pos.pos.ix[ix.okpos] + } else { + genpos_1[i] <- pos.pos.ix[ix.okpos] + 24 + } + } + } + ## for 3' gene + if (i %in% ix.na.pos_2) { + ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_2[i] <- pos.pos.ix[ix.okpos] + 24 + } else { + genpos_2[i] <- pos.pos.ix[ix.okpos] + } + } + } +} +# NEW Find GenomicPosition (25nt_2) +ix.na.pos_1 <- which(genpos_1 == 0) +ix.na.pos_2 <- which(genpos_2 == 0) +left_junction.trim <- substr(left_junction, 1, 25) +right_junction.trim <- substr(right_junction, 26, 50) +for (i in 1: length(ensgenename1)) { + if (i == 1) { + cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) + } else { + cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) + } +} +system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) +system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) +system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) + + +xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) +id.pos <- as.character(xx.pos[[1]]) +flag.pos <- as.character(xx.pos[[2]]) +chr.pos <- as.character(xx.pos[[3]]) +if (length(grep("chr", chr.pos)) > 0) { + chr.pos <- gsub("chr", "", chr.pos) +} +pos.pos <- as.numeric(as.character(xx.pos[[4]])) +mapq.pos <- as.numeric(as.character(xx.pos[[5]])) +for (i in 1: length(ensgenename1)) { + if (i %in% ix.na.pos_1) { + ## for 5' gene + ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_1[i] <- pos.pos.ix[ix.okpos] - 1 + } else { + genpos_1[i] <- pos.pos.ix[ix.okpos] + 24 + } + } + } + ## for 3' gene + if (i %in% ix.na.pos_2) { + ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) + chr.pos.ix <- chr.pos[ix.mypos] + flag.pos.ix <- flag.pos[ix.mypos] + pos.pos.ix <- pos.pos[ix.mypos] + mapq.pos.ix <- mapq.pos[ix.mypos] + ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) + if (length(ix.okpos) > 1) { + ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] + } + if (length(ix.okpos) > 0) { + if (flag.pos.ix[ix.okpos] == 16) { + genpos_2[i] <- pos.pos.ix[ix.okpos] + 24 + } else { + genpos_2[i] <- pos.pos.ix[ix.okpos] - 1 + } + } + } +} +} + +genpos_1.recal <- genpos_1 +genpos_2.recal <- genpos_2 + +# # refine genomic coordinates +# for (i in 1: length(ensgenename1)) { +# ix.ref <- grep(ensgenename1[i], GeneNames) +# ix.ref.table <- which(GeneNames[ix.ref] == geneinfo.id) +# if (strand1[i] == "+") { +# exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.ref.table]), ","))) +# } else { +# exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.ref.table]), ","))) +# } +# ix.exon <- which.min(abs(genpos_1[i] - exonpos)) +# mydiff <- abs(genpos_1[i] - exonpos[ix.exon]) +# if (mydiff <= 3) { +# genpos_1.recal[i] <- exonpos[ix.exon] +# } +# +# ix.ref <- grep(ensgenename2[i], GeneNames) +# ix.ref.table <- which(GeneNames[ix.ref] == geneinfo.id) +# if (strand2[i] == "+") { +# exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.ref.table]), ","))) +# } else { +# exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.ref.table]), ","))) +# } +# ix.exon <- which.min(abs(genpos_2[i] - exonpos)) +# mydiff <- abs(genpos_2[i] - exonpos[ix.exon]) +# if (mydiff <= 3) { +# genpos_2.recal[i] <- exonpos[ix.exon] +# } +# } + + +nreads.score <- pmin( n.crossing, n.spanning)/pmax(n.crossing, n.spanning) +myscores <- cbind(gjs.score, edge.score, nreads.score, gene12.exp) +colnames(myscores) <- c("probs.gjs", "probs.es", "probs.us", "cov") +myscores <- data.frame(myscores) +if (flag.ada) { + myada <- ada(control~., data = DataScores,loss="exponential", nu = 0.1) + ericscore <- as.numeric(predict(myada, myscores, type = "probs")[,2]) +} else { + sig <- sigest(control~., data = DataScores, frac = 1, na.action = na.omit, scaled = TRUE)[2] + model <- ksvm(control~., data = DataScores, type = "C-svc", kernel = "rbfdot", kpar = list(sigma = sig), C = 1, prob.model = TRUE) + ericscore <- predict(model, myscores, type = "probabilities")[,2] +} +ix.repeated <- unique(c(which(is.na(genpos_1)), which(is.na(genpos_2)))) +ix.norepeated <- intersect(which(is.na(genpos_1) == F), which(is.na(genpos_2) == F)) +genpos_1.recal[which(genpos_1.recal == 0)] <- "Unable to predict breakpoint position" +genpos_2.recal[which(genpos_2.recal == 0)] <- "Unable to predict breakpoint position" +myblacklist <- rep("", length(genename1)) +ix.bl <- which((genename1 %in% gene.bl1 & genename2 %in% gene.bl2) | (genename1 %in% gene.bl2 & genename2 %in% gene.bl1)) +if (length(ix.bl) > 0) { + for (bli in 1: length(ix.bl)) { + ix.bli <- which((gene.bl1 == genename1[ix.bl[bli]] & gene.bl2 == genename2[ix.bl[bli]]) | (gene.bl2 == genename1[ix.bl[bli]] & gene.bl1 == genename2[ix.bl[bli]])) + myblacklist[ix.bl[bli]] <- paste("Frequency:", sum(freq.bl[ix.bli])) + } +} +oddity.spanningreads <- rep(0, length(genpos_1)) +oddity.spanningreads[which(n.spanning == 1 & n.crossing >= 10)] <- 1 +fusion.type <- rep("inter-chromosomal",length(genpos_1)) +fusion.type[which(chr1==chr2)] <- "intra-chromosomal" +fusion.type[intersect(intersect(which(chr1==chr2) , which(adj == "Adjacent")), which(strand1 == strand2))] <- "Read-Through" +fusion.type[intersect(intersect(which(chr1==chr2) , which(adj == "Adjacent")), which(strand1 != strand2))] <- "Cis" +junctionsequence <- paste(tolower(left_junction), right_junction, sep = "") +SummaryMat <- cbind(genename1, genename2, chr1, genpos_1.recal, strand1, chr2, genpos_2.recal , strand2, ensgenename1, ensgenename2, n.crossing, n.spanning,ins.size, homology, fusion.type, myblacklist,description1, description2, junctionsequence, gene1.exp, gene2.exp, gene12.exp, edge.score, gjs.score, nreads.score, ericscore) +colnames(SummaryMat) <- c("GeneName1", "GeneName2", "chr1","Breakpoint1", "strand1", "chr2" ,"Breakpoint2", "strand2","EnsemblGene1", "EnsemblGene2", "crossingreads","spanningreads","mean insertsize","homology","fusiontype", "Blacklist","InfoGene1", "InfoGene2", "JunctionSequence", "GeneExpr1", "GeneExpr2", "GeneExpr_Fused", "ES", "GJS", "US","EricScore") +SummaryMat <- data.frame(SummaryMat) +save(SummaryMat, file=file.path(outputfolder, paste(samplename, ".Summary.RData", sep = ""))) +if (dim(SummaryMat)[1] > 0) { + write.table(SummaryMat, file = file.path(outputfolder,paste(samplename,".results.total.tsv", sep = "")), sep = "\t", row.names = F, quote = F) + ix.sorting.score <- sort(ericscore, decreasing = T, index.return = T)$ix + ericscore.sorted <- ericscore[ix.sorting.score] + myblacklist.sorted <- myblacklist[ix.sorting.score] + oddity.spanningreads.sorted <- oddity.spanningreads[ix.sorting.score] + SummaryMat.sorted <- SummaryMat[ix.sorting.score, ] + SummaryMat.Filtered <- SummaryMat.sorted[which(ericscore.sorted > 0.5 & myblacklist.sorted == "" & oddity.spanningreads.sorted == 0), ] + write.table(SummaryMat.Filtered[, -16], file = file.path(outputfolder,paste(samplename,".results.filtered.tsv", sep = "")), sep = "\t", row.names = F, quote = F) +} else +{ + Results <- "No Chimeric Transcript found!" + write.table(Results, file = file.path(outputfolder,paste(samplename,".results.total.tsv", sep = "")), sep = "\t", row.names = F, col.names = F, quote = F) + +} + + + + + + + + + + + + + + diff --git a/lib/R/RecalibrateJunctions.R b/lib/R/RecalibrateJunctions.R new file mode 100644 index 0000000..cdf2896 --- /dev/null +++ b/lib/R/RecalibrateJunctions.R @@ -0,0 +1,141 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +samplename <- split.vars [1] +outputfolder <- split.vars[2] +readlength <- as.numeric(split.vars[3]) +verbose <- as.numeric(split.vars[4]) +grep.readlength <- c("grep") +for (i in 1: length(readlength)) { + grep.readlength <- paste(grep.readlength, " -v -e MD:Z:", readlength[i], sep = "") +} + +formatfasta <- function(myfasta, step = 50) { + totalchar <- nchar(myfasta) + if (totalchar > step) { + steps <- seq(1, totalchar, by = step) + newfasta <- rep("", (length(steps) - 1)) + for (j in 1: (length(steps) - 1)) { + aa <- substr(myfasta, steps[j], (steps[j] + (step - 1))) + newfasta[j] <- aa + } + if ((totalchar - tail(steps, n = 1)) > 0) { + newfasta <- c(newfasta, substr(myfasta, steps[j+1], totalchar)) + } + } else + { + newfasta <- substr(myfasta, 1, totalchar) + } + return(newfasta) +} + +TryRecalibration <- function(outputfolder, verbose) { + + if (verbose == 0) { + x <- system(paste("blat -tileSize=8 -fine", file.path(outputfolder,"out",".tmp.ref.fa"), file.path(outputfolder, "out", ".link"), file.path(outputfolder, "out",".recalibrated.junctions.blat"), " 1>> ", file.path(outputfolder, "out",".ericscript.log"))) + } else { + x <- system(paste("blat -tileSize=8 -fine", file.path(outputfolder,"out",".tmp.ref.fa"), file.path(outputfolder, "out", ".link"), file.path(outputfolder, "out",".recalibrated.junctions.blat"))) + } + yy <- readLines(file.path(outputfolder, "out", ".recalibrated.junctions.blat"), n = 6) + if (length(yy) > 5) { + xx <- read.delim(file.path(outputfolder, "out", ".recalibrated.junctions.blat"), sep = "\t", skip = 5, header = F) + gapsize <- xx[,8] + } + if (all(gapsize <= 3) | (length(yy) <= 5)) { + if (verbose == 0) { + x <- system(paste("blat -tileSize=8", file.path(outputfolder,"out",".tmp.ref.fa"), file.path(outputfolder, "out", ".link"), file.path(outputfolder, "out",".recalibrated.junctions.blat"), " 1>> ", file.path(outputfolder, "out",".ericscript.log"))) + } else { + x <- system(paste("blat -tileSize=8", file.path(outputfolder,"out",".tmp.ref.fa"), file.path(outputfolder, "out", ".link"), file.path(outputfolder, "out",".recalibrated.junctions.blat"))) + } + + } + +} + +load(file.path(outputfolder,"out",paste(samplename,".junctions.RData", sep = ""))) +load(file.path(outputfolder,"out",paste(samplename,".ids_fasta.RData", sep = ""))) +load(file.path(outputfolder,"out",paste(samplename,".sequences_fasta.RData", sep = ""))) +cat(file.path(outputfolder,"out",".tmp.query.fa"), file = file.path(outputfolder,"out",".link")) +sequences <- sequences.fasta +recal.left <- rep(0, length(ids_fasta)) +recal.right <- rep(0, length(ids_fasta)) +count.recal <- rep(0, length(ids_fasta)) +count.total <- rep(0, length(ids_fasta)) +sequences.recal <- sequences +junctions.recalibrated <- junctions +for (i in 1:length(ids_fasta)) { + junction.tmp <- junctions[i] + x <- system(paste("samtools view ", file.path(outputfolder, "aln", paste(samplename,".remap.sorted.bam", sep = ""))," ", ids_fasta[i], " | ", grep.readlength, " | awk '((($5==0) && ($6==\"*\")) || ($5>=0))' | awk '{ print \">\" $1\"_\"$2,\"\\n\"$10}' > ", file.path(outputfolder,"out",".tmp.query.fa"),sep = "")) + xx <- readLines(file.path(outputfolder,"out",".tmp.query.fa"), n = 1) + if (length(xx) != 0) { + x <- cat(paste(">",ids_fasta[i],sep=""), sequences[i],sep = "\n", file = file.path(outputfolder,"out",".tmp.ref.fa")) + try.recal <- TryRecalibration(outputfolder, verbose) + rm(xx) + yy <- readLines(file.path(outputfolder, "out", ".recalibrated.junctions.blat"), n = 6) + if (length(yy) > 5) { + xx <- read.delim(file.path(outputfolder, "out", ".recalibrated.junctions.blat"), sep = "\t", skip = 5, header = F) + gapsize <- xx[,8] + gapstarts <- strsplit(as.character(xx[,21]), ",") + blocksize <- strsplit(as.character(xx[,19]), ",") + if (any(gapsize > 3)) { + ix.0 <- which(gapsize > 3) + if (length(ix.0) > 0) { + gapstarts1 <- gapstarts[ix.0] + gapstarts.tmp1 <- c() + gapstarts.tmp2 <- c() + for (jgap in 1:length(gapstarts1)) { + ccc <- as.numeric(gapstarts1[[jgap]]) + gapstarts.tmp1 <- c(gapstarts.tmp1, ccc[1]) + gapstarts.tmp2 <- c(gapstarts.tmp2, ccc[2]) + } + ix.gap.in.junct <- ix.0[which((gapstarts.tmp1 <= junction.tmp) & (gapstarts.tmp2 >= (junction.tmp - 10 + 1)))] + gaps <- gapsize[ix.gap.in.junct] + unique.gaps <- unique(gaps) + rr <- tabulate(gaps) + max.rr <- max(rr) + if( max.rr >= 1) { + gap.length <- which.max(rr) + ix.gaps <- which(gapsize == gap.length) + a <- rep(0, length(ix.gaps)) + b <- rep(0, length(ix.gaps)) + aa <- rep(0, length(ix.gaps)) + for (jj in 1:length(ix.gaps)) { + gapstarts.tmp <- as.numeric(gapstarts[[ix.gaps[jj]]]) + blocksize.tmp <- as.numeric(blocksize[[ix.gaps[jj]]]) + a[jj] <- gapstarts.tmp[1] + blocksize.tmp[1] - 1 + b[jj] <- gapstarts.tmp[2] + aa[jj] <- gapstarts.tmp[1] + } + max.a <- max(tabulate(a)) + max.b <- max(tabulate(b)) + my.a <- which.max(tabulate(a)) + my.b <- which.max(tabulate(b)) + if ((abs(max.a-max.b)/max.a) < 0.31) { + count.total[i] <- length(gaps) + count.recal[i] <- max.rr + recal.left[i] <- my.a + 1 + recal.right[i] <- my.b + 1 + sequences.recal[i] <- paste(substr(sequences[i], 1, recal.left[i]), substr(sequences[i], recal.right[i], nchar(sequences[i])), sep = "") + junctions.recalibrated[i] <- recal.left[i] + } + } + + } + } + } + + } +} +ids_fasta.recalibrated <- paste(">", ids_fasta, " junction@", junctions.recalibrated, sep = "") +ref.recalibrated <- c() +for (i in 1:length(ids_fasta.recalibrated)) { + ref.recalibrated <- c(ref.recalibrated, c(ids_fasta.recalibrated[i], formatfasta(sequences.recal[i]))) +} +write(ref.recalibrated, file = file.path(outputfolder,"out",paste(samplename,".EricScript.junctions.recalibrated.fa", sep = "")), ncolumns = 1, sep = "") +Recalibrated.Data <- cbind(recal.left, recal.right, junctions, count.recal, count.total) +colnames(Recalibrated.Data) <- c("Left_Junction", "Right_Junction", "Junction", "Recal_Count", "Total_Count") +save(sequences.recal, file = file.path(outputfolder,"out",paste(samplename,".sequences.recalibrated.RData", sep = ""))) +save(Recalibrated.Data, file = file.path(outputfolder,"out",paste(samplename,".Recalibrated.Data.RData", sep = ""))) +save(junctions.recalibrated, file = file.path(outputfolder,"out",paste(samplename,".junctions.recalibrated.RData", sep = ""))) + + diff --git a/lib/R/RetrieveRefId.R b/lib/R/RetrieveRefId.R new file mode 100644 index 0000000..996ac8c --- /dev/null +++ b/lib/R/RetrieveRefId.R @@ -0,0 +1,86 @@ +### retrieverefid ver 3: changed ensembl ftp filesystem ver 2 +### added user selectable ensembl version +vars.tmp <- commandArgs() +split.vars <- unlist(strsplit(vars.tmp[length(vars.tmp)], ",")); +ericscriptfolder <- split.vars[1]; +dbfolder <- split.vars[2]; +ensversion <- as.integer(split.vars[3]); +flagprint <- split.vars[4]; +if(ensversion < 0) ensversion <- 0; + +resdn <- file.path(dbfolder,"data","_resources"); +rfn <- file.path(resdn,"RefID.RData"); +if(!file.exists(resdn)) dir.create(resdn,recursive=T); +ctime <- as.integer(Sys.time()); +if(file.exists(rfn)) { + dtime <- as.integer(file.info(rfn)$mtime); +} +if(!exists("dtime") || is.na(dtime)) dtime <- 0; + +# update if necessary +if(dtime > ctime-10000 && ensversion==0) { + z <- new.env(); + load(rfn,envir=z); + ensversion <- z$version; + ensrefid <- z$refid; + ensrefid.path <- z$refid.path; + rm(z); + ensversion0 <- ensversion; +} else { +# get ensembl version and fasta lists +cat(xx0.tmp<-readLines("http://ftp.ensembl.org/pub"),file=paste(resdn,"/.ftplist0h",sep="")); +# get the most recent version +ensversion0 <- ensversion +if (ensversion == 0) { + i1 <- grep(">release-[0-9]*/",xx0.tmp); + ensversion <- max(as.integer(sub(".*>release-([0-9]*)/.*","\\1",xx0.tmp[i1]))); +} + +# get available species +cat(xx1.tmp<-readLines((bfn<-paste("http://ftp.ensembl.org/pub/",ifelse(ensversion>0,paste("release-",ensversion,"/fasta",sep=""),"/current_fasta"),sep=""))),file=paste(resdn,"/.ftplist1h",sep=""),sep="\n"); +# collect names of all available species +i1 <- grep("]*>([^<]*)/.*","\\1",xx1.tmp[i1]); + ensrefid <- xx; + ensrefid.path <- c(); + for(i in 1:length(xx)) tryCatch({ + xx2.tmp <- readLines(paste(bfn,"/",xx[i],"/dna",sep="")); + a1 <- grep("dna.toplevel",xx2.tmp,value=T); + if(length(a1)==1) ensrefid.path[i] <- sub(".*]*)\">[^<]*.*","\\1",a1); + },error=function(e){}); + i1 <- which(!is.na(ensrefid.path) & nchar(ensrefid.path)>10); + ensrefid <- ensrefid[i1]; + ensrefid.path <- ensrefid.path[i1]; + +}; # end if dtime & ctime comparison + +if(flagprint != 0) { + cat(ifelse(ensversion0!=0,"Selected","Current"),"Ensembl version:", ensversion, "\n"); + + if(file.exists(rfn) & any(file.exists(file.path(dbfolder,"data",ensrefid)))) { + load(rfn); + cat("Installed Ensembl version:", version, "\n") + } else { + cat("Installed Ensembl version:", "No database installed", "\n") + } + cat("Available reference IDs:\n", paste("\t", ensrefid, "\n")); +} +flag.updatedb <- 1; +if(file.exists(rfn)) { + load(rfn); + if(ensversion != version) { + flag.updatedb <- 0; + } +} +if(flag.updatedb>0) { + refid <- ensrefid + refid.path <- ensrefid.path + version <- ensversion +} + +save(refid, refid.path, version, file = rfn); +cat(version, file = file.path(resdn, "Ensembl.version")); +cat(flag.updatedb, file = file.path(resdn, ".flag.updatedb")); + + + diff --git a/lib/R/SimulateFusions.R b/lib/R/SimulateFusions.R new file mode 100644 index 0000000..385614e --- /dev/null +++ b/lib/R/SimulateFusions.R @@ -0,0 +1,321 @@ +### simulate data [revised]. + +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) + +readlength <- as.numeric(split.vars[1]) +outputfolder <- split.vars[2] +ericscriptfolder <- split.vars[3] +verbose <- as.numeric(split.vars[4]) +ins.size <- as.numeric(split.vars[5]) +sd.inssize <- as.numeric(split.vars[6]) +ngenefusion <- as.numeric(split.vars[7]) +min.coverage <- as.numeric(split.vars[8]) +max.coverage <- as.numeric(split.vars[9]) +nsims <- as.numeric(split.vars[10]) +BE.data <- as.numeric(split.vars[11]) +IE.data <- as.numeric(split.vars[12]) +background.data_1 <- as.character(split.vars[13]) +background.data_2 <- as.character(split.vars[14]) +nreads.background <- as.numeric(split.vars[15]) +dbfolder <- as.character(split.vars[16]) +refid <- as.character(split.vars[17]) + +mysyndata <- file.exists(file.path(dbfolder, "data", refid, "EnsemblGene.Transcripts.RData")) +if (mysyndata == T) { + cat("[EricScript simulator] Load genes data ...") + load(file.path(dbfolder, "data", refid, "EnsemblGene.Transcripts.RData")) +} else { + cat( paste("[EricScript simulator] You need to download", refid, "data before running EricScript Simulator. Exit.\n")) + system(paste("rm -r", outputfolder)) + quit() +} + +# myurl <- "http://dl.dropbox.com/u/3629305/EnsemblGene.Transcripts.RData" +# if (mysyndata == T) { +# cat("[EricScript simulator] Load genes data ...") +# load(file.path(dbfolder, "data", "EnsemblGene.Transcripts.RData")) +# } else { +# cat("[EricScript simulator] Retrieving genes data ...") +# download.file(myurl, destfile = file.path(dbfolder, "data", "EnsemblGene.Transcripts.RData"), quiet = T) +# load(file.path(dbfolder, "data", "EnsemblGene.Transcripts.RData")) +# cat(" done.\n") +# cat("[EricScript simulator] Load genes data ...") +# +# } + +flag.background <- 0 +if (nchar(background.data_1) > 2 & nchar(background.data_2) > 2) { + flag.background <- 1 +} + +dataset <- c() +if (BE.data == 1) { + dataset <- c(dataset, "BE") +} +if (IE.data == 1) { + dataset <- c(dataset, "IE") +} +formatfasta <- function(myfasta, step = 50) { + + totalchar <- nchar(myfasta) + if (totalchar > step) { + steps <- seq(1, totalchar, by = step) + newfasta <- rep("", (length(steps) - 1)) + for (j in 1: (length(steps) - 1)) { + aa <- substr(myfasta, steps[j], (steps[j] + (step - 1))) + newfasta[j] <- aa + } + if ((totalchar - tail(steps, n = 1)) > 0) { + newfasta <- c(newfasta, substr(myfasta, steps[j+1], totalchar)) + } + } else + { + newfasta <- substr(myfasta, 1, totalchar) + } + return(newfasta) +} + +## evaluate n.backgound reads + + +TranscriptNames <- as.character(EnsemblGene.Structures$EnsemblGene) +acceptable.chrs <- c(seq(1,22), "X", "Y") +mycoverage <- seq(min.coverage, max.coverage, length.out = ngenefusion) +minlength <- ins.size + 2*sd.inssize +if (refid == "homo_sapiens") { + ix.geneok <- which((EnsemblGene.Structures$Chromosome %in% acceptable.chrs)) +} else { + ix.geneok <- seq(1, length(EnsemblGene.Structures$Chromosome)) +} +genenameok <- as.character(EnsemblGene.Structures$EnsemblGene)[ix.geneok] +strandok <- as.character(EnsemblGene.Structures$Strand)[ix.geneok] +ix.goodseq <- which((nchar(sequences) > 2*minlength) & (TranscriptNames %in% genenameok) & is.na(GeneNames) == F) +sequences <- sequences[ix.goodseq] +GeneNames <- GeneNames[ix.goodseq] +TranscriptNames <- TranscriptNames[ix.goodseq] + + +formatted.count.tmp <-paste("00000", seq(1, nsims), sep = "") +formatted.count <- substr(formatted.count.tmp, nchar(formatted.count.tmp) - 4, nchar(formatted.count.tmp)) +formatted.count.tmp <-paste("00000", seq(1, ngenefusion), sep = "") +formatted.count.fusions <- substr(formatted.count.tmp, nchar(formatted.count.tmp) - 4, nchar(formatted.count.tmp)) + + +for (tt in 1: length(dataset)) { + dir.create(file.path(outputfolder, dataset[tt])) + dir.create(file.path(outputfolder, dataset[tt], "data")) + dir.create(file.path(outputfolder, dataset[tt], "reads")) + + for (jj in 1: nsims) { + + dir.create(file.path(outputfolder, dataset[tt], "data", paste("sim", formatted.count[jj], sep = "_"))) + dir.create(file.path(outputfolder, dataset[tt], "reads", paste("sim", formatted.count[jj], sep = "_"))) + } +} +cat(" done.\n") + +for (jj in 1: nsims) { + cat("[EricScript simulator] Generating synthetic dataset", formatted.count[jj], "...") + + if (flag.background == 1) { + + myrandomseed <- round(runif(1, 1, 1000)) + system(paste("seqtk sample -s", myrandomseed, " background.data_1 ", nreads.background, " > ", file.path(outputfolder, "background.reads.1.fq") , sep = "")) + system(paste("seqtk sample -s", myrandomseed, " background.data_2 ", nreads.background, " > ", file.path(outputfolder, "background.reads.2.fq") , sep = "")) + } + + ix.gene1 <- rep(0,ngenefusion) + ix.gene2 <- rep(0,ngenefusion) + strand1 <- rep(0,ngenefusion) + strand2 <- rep(0,ngenefusion) + flag <- 1 + mycount <- 0 + while (flag == 1) { + trans1 <- sample(TranscriptNames, ngenefusion) + for (ii in 1: ngenefusion) { + ix.gene1[ii] <- which(TranscriptNames == trans1[ii]) + strand1[ii] <- strandok[which(genenameok == trans1[ii])] + } + gene1 <- GeneNames[ix.gene1] + trans2 <- sample(TranscriptNames, ngenefusion) + for (ii in 1: ngenefusion) { + ix.gene2[ii] <- which(TranscriptNames == trans2[ii]) + strand2[ii] <- strandok[which(genenameok == trans2[ii])] + } + gene2 <- GeneNames[ix.gene2] + ix.gene12 <- c(ix.gene1, ix.gene2) + if (length(unique(ix.gene12)) == 2*ngenefusion & length(unique(GeneNames[ix.gene12])) == 2*ngenefusion) { + flag <- 0 + } else + {flag <- 1} + } + + sequence1 <- sequences[ix.gene1] + sequence2 <- sequences[ix.gene2] + + if ("BE" %in% dataset) { + + myref <- c() + junction1.tot <- c() + junction2.tot <- c() + id.fusions <- rep(0, length(sequence1)) + sequence.fusions <- rep(0, length(sequence1)) + sequence.fusions.50bp <- rep(0, length(sequence1)) + + for (i in 1: length(sequence1)) { + + tmp <- seq.int(100,(nchar(sequence1[i]) - 100)) + junction1 <- sample(tmp,1) + junction1.tot <- c(junction1.tot, junction1) + tmp <- seq.int(100,(nchar(sequence2[i]) - 100)) + junction2 <- sample(tmp,1) + junction2.tot <- c(junction2.tot, junction2) + sequence.fusions[i] <- paste(substr(sequence1[i], 1, junction1), substr(sequence2[i], junction2 + 1, nchar(sequence2[i])), sep = "") + sequence.fusions.50bp[i] <- paste(substr(sequence1[i], (junction1 - 49), junction1), substr(sequence2[i], junction2 + 1, (junction2 + 50)), sep = "") + id.fusions[i] <- paste(">", paste(gene1[i], gene2[i], sep = "----"), sep = "") + myref <- c(myref, c(id.fusions[i], sequence.fusions[i])) + myref.single <- c(id.fusions[i], formatfasta(sequence.fusions[i])) + cat(myref.single, file = file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), paste("myref", formatted.count.fusions[i], ".fa", sep = "")), sep = "\n") + } + + GeneFusions <- list() + GeneFusions[[1]] <- gene1 + GeneFusions[[2]] <- gene2 + GeneFusions[[3]] <- junction1.tot + GeneFusions[[4]] <- junction2.tot + GeneFusions[[5]] <- sequence.fusions.50bp + GeneFusions[[6]] <- mycoverage + GeneFusions[[7]] <- trans1 + GeneFusions[[8]] <- trans2 + names(GeneFusions) <- c("gene1", "gene2", "junction1", "junction2", "junctionseq", "coverage", "trans1", "trans2") + save(GeneFusions, file = file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "GeneFusions.RData")) + + system(paste(">", file.path(outputfolder, "BE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.1.fq"))) + system(paste(">", file.path(outputfolder, "BE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.2.fq"))) + + for (i in 1: ngenefusion) { + mynreads <- round(mycoverage[i]*nchar(sequence.fusions[i])/(2*readlength)) + if (verbose == 0) { + system(paste("wgsim -d ", ins.size, " -r 0.0001 -R 0.001 -s ", sd.inssize, " -N ", mynreads, " -1 ", readlength, " -2 ", readlength," ", file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), paste("myref", formatted.count.fusions[i], ".fa", sep = "")), " " ,file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.1.fq")," ", file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.2.fq"), " 2>> ", file.path(outputfolder, "wgsim.log"), " 1>> ", file.path(outputfolder, "wgsim.log"), sep = "")) + } else { + system(paste("wgsim -d ", ins.size, " -r 0.0001 -R 0.001 -s ", sd.inssize, " -N ", mynreads, " -1 ", readlength, " -2 ", readlength," ", file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), paste("myref", formatted.count.fusions[i], ".fa", sep = "")), " " ,file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.1.fq")," ", file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.2.fq"), sep = "")) + + } + + system(paste("cat", file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.1.fq"), ">>", file.path(outputfolder, "BE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.1.fq"))) + system(paste("cat", file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.2.fq"), ">>", file.path(outputfolder, "BE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.2.fq"))) + + } + + if (flag.background == 1) { + system(paste("cat ", file.path(outputfolder, "BE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.1.fq"), " ", file.path(outputfolder, "background.reads.1.fq"), " > ", file.path(outputfolder, "BE", "reads", paste("sim", formatted.count[jj], sep = "_"), "total.reads.1.fq"), sep = "")) + system(paste("cat ", file.path(outputfolder, "BE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.2.fq"), " ", file.path(outputfolder, "background.reads.2.fq"), " > ", file.path(outputfolder, "BE", "reads", paste("sim", formatted.count[jj], sep = "_"), "total.reads.2.fq"), sep = "")) + } + + system(paste("rm", file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.1.fq"))) + system(paste("rm", file.path(outputfolder, "BE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.2.fq"))) + + + } + + if ("IE" %in% dataset) { + + + myref <- c() + + Gene.Table <- EnsemblGene.Structures + junction1.tot <- c() + junction2.tot <- c() + id.fusions <- rep(0, length(sequence1)) + sequence.fusions <- rep(0, length(sequence1)) + sequence.fusions.50bp <- rep(0, length(sequence1)) + genename.table <- as.character(Gene.Table[,1]) + + + for (i in 1: length(sequence1)) { + ix.genename.table <- which(genename.table == trans1[i]) + start.exons <- as.numeric(unlist(strsplit(as.character(Gene.Table[ix.genename.table, 7]), ","))) + end.exons <- as.numeric(unlist(strsplit(as.character(Gene.Table[ix.genename.table, 8]), ","))) + strand <- as.character(Gene.Table[ix.genename.table, 3]) + if (strand == "+") { + tmp <- cumsum((end.exons - start.exons)) + } else { + tmp <- cumsum(rev(end.exons - start.exons)) + } + if (length(tmp) > 1) { + junction1 <- sample(tmp,1) + } else { + junction1 <- tmp + } + junction1.tot <- c(junction1.tot, junction1) + ix.genename.table <- which(genename.table == trans2[i]) + start.exons <- as.numeric(unlist(strsplit(as.character(Gene.Table[ix.genename.table, 7]), ","))) + end.exons <- as.numeric(unlist(strsplit(as.character(Gene.Table[ix.genename.table, 8]), ","))) + strand <- as.character(Gene.Table[ix.genename.table, 3]) + if (strand == "+") { + tmp <- cumsum((end.exons - start.exons)) + tmp <- tmp[-length(tmp)] + } else { + tmp <- cumsum(rev(end.exons - start.exons)) + tmp <- tmp[-length(tmp)] + } + if (length(tmp) > 1) { + junction2 <- sample(tmp,1) + } else { + junction2 <- 1 + } + junction2.tot <- c(junction2.tot, junction2) + sequence.fusions[i] <- paste(substr(sequence1[i], 1, junction1), substr(sequence2[i], junction2 + 1, nchar(sequence2[i])), sep = "") + sequence.fusions.50bp[i] <- paste(substr(sequence1[i], (junction1 - 49), junction1), substr(sequence2[i], junction2 + 1, (junction2 + 50)), sep = "") + id.fusions[i] <- paste(">", paste(gene1[i], gene2[i], sep = "----"), sep = "") + myref <- c(myref, c(id.fusions[i], sequence.fusions[i])) + myref.single <- c(id.fusions[i], formatfasta(sequence.fusions[i])) + cat(myref.single, file = file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), paste("myref", formatted.count.fusions[i], ".fa", sep = "")), sep = "\n") + + } + + GeneFusions <- list() + GeneFusions[[1]] <- gene1 + GeneFusions[[2]] <- gene2 + GeneFusions[[3]] <- junction1.tot + GeneFusions[[4]] <- junction2.tot + GeneFusions[[5]] <- sequence.fusions.50bp + GeneFusions[[6]] <- mycoverage + GeneFusions[[7]] <- trans1 + GeneFusions[[8]] <- trans2 + names(GeneFusions) <- c("gene1", "gene2", "junction1", "junction2", "junctionseq", "coverage", "trans1", "trans2") + save(GeneFusions, file = file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "GeneFusions.RData")) + + + system(paste(">", file.path(outputfolder, "IE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.1.fq"))) + system(paste(">", file.path(outputfolder, "IE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.2.fq"))) + for (i in 1: ngenefusion) { + mynreads <- round(mycoverage[i]*nchar(sequence.fusions[i])/(2*readlength)) + if (verbose == 0) { + system(paste("wgsim -d ", ins.size, " -r 0.0001 -R 0.001 -s ", sd.inssize, " -N ", mynreads, " -1 ", readlength, " -2 ", readlength," ", file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), paste("myref", formatted.count.fusions[i], ".fa", sep = "")), " " ,file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.1.fq")," ", file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.2.fq"), " 2>> ", file.path(outputfolder, "wgsim.log"), " 1>> ", file.path(outputfolder, "wgsim.log"), sep = "")) + } else { + system(paste("wgsim -d ", ins.size, " -r 0.0001 -R 0.001 -s ", sd.inssize, " -N ", mynreads, " -1 ", readlength, " -2 ", readlength," ", file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), paste("myref", formatted.count.fusions[i], ".fa", sep = "")), " " ,file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.1.fq")," ", file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.2.fq"), sep = "")) + + } + system(paste("cat", file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.1.fq"), ">>", file.path(outputfolder, "IE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.1.fq"))) + system(paste("cat", file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.2.fq"), ">>", file.path(outputfolder, "IE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.2.fq"))) + } + + if (flag.background == 1) { + + system(paste("cat ", file.path(outputfolder, "IE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.1.fq"), " ", file.path(outputfolder, "background.reads.1.fq"), " > ", file.path(outputfolder, "IE", "reads", paste("sim", formatted.count[jj], sep = "_"), "total.reads.1.fq"), sep = "")) + system(paste("cat ", file.path(outputfolder, "IE", "reads", paste("sim", formatted.count[jj], sep = "_"), "fusions.reads.2.fq"), " ", file.path(outputfolder, "background.reads.2.fq"), " > ", file.path(outputfolder, "IE", "reads", paste("sim", formatted.count[jj], sep = "_"), "total.reads.2.fq"), sep = "")) + } + system(paste("rm", file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.1.fq"))) + system(paste("rm", file.path(outputfolder, "IE", "data", paste("sim", formatted.count[jj], sep = "_"), "out.reads.2.fq"))) + } + if (flag.background == 1) { + system(paste("rm", file.path(outputfolder, "background.reads.1.fq"))) + system(paste("rm", file.path(outputfolder, "background.reads.2.fq"))) + } + system(paste("rm", file.path(outputfolder, "wgsim.log"))) + cat(" done. \n") +} + diff --git a/lib/R/UpdateDB.R b/lib/R/UpdateDB.R new file mode 100644 index 0000000..ff4ad5c --- /dev/null +++ b/lib/R/UpdateDB.R @@ -0,0 +1,24 @@ +vars.tmp <- commandArgs() +vars <- vars.tmp[length(vars.tmp)] +split.vars <- unlist(strsplit(vars, ",")) +ericscriptfolder <- split.vars [1] +dbfolder <- split.vars[2] + +mydblist.tmp <- list.files(file.path(dbfolder, "data")) +if (length(mydblist.tmp) > 1) { + mydblist <- mydblist.tmp[-which(mydblist.tmp == "_resources")] + flag <- scan(file.path(ericscriptfolder, "lib", "data", "_resources", ".flag.updatedb"), what = "numeric", quiet = T) + if (flag == 0) { + cat("[EricScript] Nothing to update. Exit.\n", sep = "") + } else + { + cat("[EricScript] Found a new release of Ensembl Gene. Updating database for ", toString(mydblist),".\n", sep = "") + for (i in 1: length(mydblist)) { + system(paste("sh", file.path(ericscriptfolder, "lib", "bash", "BuildSeq.sh"), ericscriptfolder, mydblist[i])) + } + } +} else { + cat("[EricScript] No database was found in ", file.path(dbfolder, "data"), ". Please run ericscript.pl --downdb to download your databases.\n", sep = "") +} + + diff --git a/lib/bash/BuildSeq.sh b/lib/bash/BuildSeq.sh new file mode 100644 index 0000000..f1ffdef --- /dev/null +++ b/lib/bash/BuildSeq.sh @@ -0,0 +1,35 @@ +#!/bin/bash +ericscriptfolder=$1 +refid=$2 +dbfolder=$3 +ensversion=$4 +myrandomn=$RANDOM +tmpfolder=$dbfolder/".tmp_"$myrandomn +mkdir $tmpfolder +printf "[EricScript] Downloading $refid data. This process may take from few minutes to few hours depending on the selected genome ..." +R --slave --args $ericscriptfolder,$dbfolder,$refid,$tmpfolder,$ensversion < $ericscriptfolder/lib/R/DownloadDB.R +flagrefid=`cat $tmpfolder/.refid.flag` +if [ $flagrefid -eq 1 ] +then +bedtools sort -i $tmpfolder/exonstartend.txt | bedtools merge -c 4 -o collapse -i - | cut -d ',' -f1 - | awk '{print $4"\t"($2-1)"\t"$3"\t"$1}' - > $tmpfolder/exonstartend.mrg.txt +seqtk subseq $tmpfolder/seq.fa.gz $tmpfolder/exonstartend.mrg.txt > $tmpfolder/subseq.fa +printf "done.\n" +printf "[EricScript] Creating database for $refid ..." +R --slave --args $ericscriptfolder,$refid,$dbfolder,$tmpfolder < $ericscriptfolder/lib/R/BuildExonUnionModel.R +R --slave --args $ericscriptfolder,$refid,$dbfolder,$tmpfolder < $ericscriptfolder/lib/R/ConvertTxt2R.R +R --slave --args $refid,$dbfolder,$tmpfolder < $ericscriptfolder/lib/R/CreateDataEricTheSimulator.R +if [ $refid == "homo_sapiens" ] +then +seqtk subseq -l 50 $tmpfolder/seq.fa.gz $tmpfolder/chrlist > $dbfolder/data/$refid/allseq.fa +else +gunzip -c -d $tmpfolder/seq.fa.gz > $dbfolder/data/$refid/allseq.fa +fi +printf "done.\n" +printf "[EricScript] Building reference indexes with BWA for transcriptome and genome ..." +bwa index $dbfolder/data/$refid/allseq.fa 1>> $tmpfolder/.tmp.log 2>> $tmpfolder/.tmp.log +bwa index $dbfolder/data/$refid/EnsemblGene.Reference.fa 1>> $tmpfolder/.tmp.log 2>> $tmpfolder/.tmp.log +printf "done.\n" +fi +printf "[EricScript] Removing temporary files ..." +rm -r $tmpfolder +printf "done.\n" diff --git a/lib/bash/RunEric.sh b/lib/bash/RunEric.sh new file mode 100644 index 0000000..bb28e89 --- /dev/null +++ b/lib/bash/RunEric.sh @@ -0,0 +1,276 @@ +#!/bin/bash +## bash pipeline +outputfolder=$1 +. $outputfolder/ericscript.vars +if [ $verbose -eq 1 ]; then +touch $outputfolder/out/.ericscript.log +fi +printf "\n[EricScript] Starting EricScript analysis for sample $samplename.\n" +if [ $flagbin -eq 0 ]; then +readlength=$((`head -n 2 $reads_1 | tail -n 1 | wc -m` - 1)) +else +readlength=$((`gunzip -c $reads_1 | head -n 2 | tail -n 1 | wc -m` - 1)) +fi +if [ $ntrim -lt $readlength -a $ntrim -ne -1 ]; then +if [ $ntrim -lt 36 -a $ntrim -ge 0 ]; then +printf "[EricScript] Minimum trimming value is 36. Reads will be trimmed to 36 nt.\n" +ntrim=36 +fi +if [ $ntrim -eq -1 -a $readlength -ge 70 ]; then +ntrim=50 +fi +printf "[EricScript] Trimming PE reads to $ntrim nt ..." +if [ $flagbin -eq 0 ]; then +perl $ericscriptfolder/lib/perl/trimfq.pl $reads_1 $ntrim $outputfolder/aln/$samplename.1.fq.trimmed +perl $ericscriptfolder/lib/perl/trimfq.pl $reads_2 $ntrim $outputfolder/aln/$samplename.2.fq.trimmed +printf "done. \n" +else +gunzip -c $reads_1 | perl $ericscriptfolder/lib/perl/trimfq.pl - $ntrim $outputfolder/aln/$samplename.1.fq.trimmed +gunzip -c $reads_2 | perl $ericscriptfolder/lib/perl/trimfq.pl - $ntrim $outputfolder/aln/$samplename.2.fq.trimmed +fi +if [ $verbose -eq 0 ]; then +printf "[EricScript] Aligning with bwa ..." +if [ $bwa_aln -eq 1 ]; then +bwa aln -R 5 -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed > $outputfolder/aln/"$samplename"_1.sai 2>> $outputfolder/out/.ericscript.log +bwa aln -R 5 -t $nthreads $myref $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename"_2.sai 2>> $outputfolder/out/.ericscript.log +fi +if [ $MAPQ -gt 0 ]; then +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename".sam 2>> $outputfolder/out/.ericscript.log +else +bwa mem -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename".sam 2>> $outputfolder/out/.ericscript.log +fi +else +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log +else +bwa mem -Y -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log +fi +cat $outputfolder/aln/tmp.sam | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam +fi +else +printf "[EricScript] Aligning with bwa ...\n" +if [ $bwa_aln -eq 1 ]; then +bwa aln -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed > $outputfolder/aln/"$samplename"_1.sai +bwa aln -t $nthreads $myref $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename"_2.sai +fi +if [ $MAPQ -gt 0 ]; then +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename".sam +else +bwa mem -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename".sam +fi +else +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam +else +bwa mem -Y -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam +fi +fi +fi +else +if [ $ntrim -ge $readlength ]; then +printf "[EricScript] Selected trimming value is greater equal to read length. Reads will not be trimmed.\n" +fi +if [ $verbose -eq 0 ]; then +printf "[EricScript] Aligning with bwa ..." +if [ $bwa_aln -eq 1 ]; then +bwa aln -R 5 -t $nthreads $myref $reads_1 > $outputfolder/aln/"$samplename"_1.sai 2>> $outputfolder/out/.ericscript.log +bwa aln -R 5 -t $nthreads $myref $reads_2 > $outputfolder/aln/"$samplename"_2.sai 2>> $outputfolder/out/.ericscript.log +fi +if [ $MAPQ -gt 0 ]; then +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $reads_1 $reads_2 > $outputfolder/aln/"$samplename".sam 2>> $outputfolder/out/.ericscript.log +else +bwa mem -t $nthreads $myref $reads_1 $reads_2 > $outputfolder/aln/"$samplename".sam 2>> $outputfolder/out/.ericscript.log +fi +else +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log +else +bwa mem -Y -t $nthreads $myref $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log +fi +cat $outputfolder/aln/tmp.sam | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam +fi +else +printf "[EricScript] Aligning with bwa ...\n" +if [ $bwa_aln -eq 1 ]; then +bwa aln -t $nthreads $myref $reads_1 > $outputfolder/aln/"$samplename"_1.sai +bwa aln -t $nthreads $myref $reads_2 > $outputfolder/aln/"$samplename"_2.sai +fi +if [ $MAPQ -gt 0 ]; then +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $reads_1 $reads_2 > $outputfolder/aln/"$samplename".sam +else +bwa mem -t $nthreads $myref $reads_1 $reads_2 > $outputfolder/aln/"$samplename".sam +fi +else +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam +else +bwa mem -Y -t $nthreads $myref $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam +fi +fi +fi +fi +printf "done. \n" +R --slave --args $outputfolder,$bwa_aln < $ericscriptfolder/lib/R/ExtractInsertSize.R +printf "[EricScript] Extracting discordant alignments ... " +grep -v '^@' $outputfolder/aln/"$samplename".sam | awk -v mapq="$MAPQ" '(($7!="=") && ($7!="*") && ($5>=mapq)) { print }' | cut -f2,3,4,5,7,8 > $outputfolder/out/"$samplename".filtered.out +R --slave --args $samplename,$outputfolder,$ericscriptfolder,$minreads,$MAPQ,$refid,$dbfolder < $ericscriptfolder/lib/R/MakeAdjacencyMatrix.R +myflag=`cat $outputfolder/out/.ericscript.flag` +if [ $myflag -eq 0 ]; then +printf "done. \n" +printf "[EricScript] No chimeric transcripts found! Writing results ..." +R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/MakeEmptyResults.R +printf "done. \n" +exit 1 +fi +printf "done. \n" +printf "[EricScript] Building exon junction reference ... " +R --slave --args $samplename,$outputfolder,$ericscriptfolder,$readlength,$refid,$dbfolder < $ericscriptfolder/lib/R/BuildFasta.R +printf "done. \n" +## Aligning to putative junction reference +if [ $verbose -eq 0 ]; then +printf "[EricScript] Aligning to exon junction reference ... " +bwa index $mynewref 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log +if [ $bwa_aln -eq 1 ]; then +bwa aln -t $nthreads $mynewref $reads_1 > $outputfolder/aln/"$samplename"_1.remap.sai 2>> $outputfolder/out/.ericscript.log +bwa aln -t $nthreads $mynewref $reads_2 > $outputfolder/aln/"$samplename"_2.remap.sai 2>> $outputfolder/out/.ericscript.log +fi +if [ $MAPQ -gt 0 ]; then +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P $mynewref $outputfolder/aln/"$samplename"_1.remap.sai $outputfolder/aln/"$samplename"_2.remap.sai $reads_1 $reads_2 > $outputfolder/aln/$samplename.remap.sam 2>> $outputfolder/out/.ericscript.log +else +bwa mem -t $nthreads $mynewref $reads_1 $reads_2 > $outputfolder/aln/$samplename.remap.sam 2>> $outputfolder/out/.ericscript.log +fi +else +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P $mynewref $outputfolder/aln/"$samplename"_1.remap.sai $outputfolder/aln/"$samplename"_2.remap.sai $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log +else +bwa mem -Y -t $nthreads $mynewref $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log +fi +cat $outputfolder/aln/tmp.sam | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.sam +fi +samtools view -@ $nthreads -bS -o $outputfolder/aln/$samplename.remap.bam $outputfolder/aln/$samplename.remap.sam 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log +samtools sort -@ $nthreads $outputfolder/aln/$samplename.remap.bam $outputfolder/aln/$samplename.remap.sorted 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log +samtools index $outputfolder/aln/$samplename.remap.sorted.bam 1>> $outputfolder/out/.ericscript.log +else +printf "[EricScript] Aligning to exon junction reference ... \n" +bwa index $mynewref +if [ $bwa_aln -eq 1 ]; then +bwa aln -t $nthreads $mynewref $reads_1 > $outputfolder/aln/"$samplename"_1.remap.sai +bwa aln -t $nthreads $mynewref $reads_2 > $outputfolder/aln/"$samplename"_2.remap.sai +fi +if [ $MAPQ -gt 0 ]; then +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P $mynewref $outputfolder/aln/"$samplename"_1.remap.sai $outputfolder/aln/"$samplename"_2.remap.sai $reads_1 $reads_2 > $outputfolder/aln/$samplename.remap.sam +else +bwa mem -t $nthreads $mynewref $reads_1 $reads_2 > $outputfolder/aln/$samplename.remap.sam +fi +else +if [ $bwa_aln -eq 1 ]; then +bwa sampe -P $mynewref $outputfolder/aln/"$samplename"_1.remap.sai $outputfolder/aln/"$samplename"_2.remap.sai $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.sam +else +bwa mem -Y -t $nthreads $mynewref $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.sam +fi +fi +samtools view -@ $nthreads -bS -o $outputfolder/aln/$samplename.remap.bam $outputfolder/aln/$samplename.remap.sam +samtools sort -@ $nthreads $outputfolder/aln/$samplename.remap.bam $outputfolder/aln/$samplename.remap.sorted +samtools index $outputfolder/aln/$samplename.remap.sorted.bam +fi +printf "done. \n" +## Recalibrating junctions +printf "[EricScript] Recalibrating junctions ... " +R --slave --args $samplename,$outputfolder,$readlength,$verbose < $ericscriptfolder/lib/R/RecalibrateJunctions.R +cat $outputfolder/out/$samplename.EricScript.junctions.recalibrated.fa $myref > $mynewref_recal +printf "done. \n" +## Aligning not properly mapped reads +if [ $verbose -eq 0 ]; then +printf "[EricScript] Aligning to recalibrated junction reference ... " +bwa index $mynewref_recal 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log +if [ $bwa_aln -eq 1 ]; then +bwa aln -R 5 -t $nthreads $mynewref_recal $reads_1 > $outputfolder/aln/"$samplename"_1.remap.recal.sai 2>> $outputfolder/out/.ericscript.log +bwa aln -R 5 -t $nthreads $mynewref_recal $reads_2 > $outputfolder/aln/"$samplename"_2.remap.recal.sai 2>> $outputfolder/out/.ericscript.log +bwa sampe -P $mynewref_recal $outputfolder/aln/"$samplename"_1.remap.recal.sai $outputfolder/aln/"$samplename"_2.remap.recal.sai $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log +else +bwa mem -Y -t $nthreads $mynewref_recal $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log +fi +cat $outputfolder/aln/tmp.sam | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.recal.sam +samtools view -@ $nthreads -bt $mynewref_recal -o $outputfolder/aln/$samplename.remap.recal.bam $outputfolder/aln/$samplename.remap.recal.sam 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log +samtools sort -@ $nthreads $outputfolder/aln/$samplename.remap.recal.bam $outputfolder/aln/$samplename.remap.recal.sorted 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log +samtools rmdup $outputfolder/aln/$samplename.remap.recal.sorted.bam $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log +samtools index $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam 1>> $outputfolder/out/.ericscript.log +samtools view -@ $nthreads -b -h -q 1 $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam > $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam +samtools index $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam +else +printf "[EricScript] Aligning to recalibrated junction reference ... \n" +bwa index $mynewref_recal +if [ $bwa_aln -eq 1 ]; then +bwa aln -R 5 -t $nthreads $mynewref_recal $reads_1 > $outputfolder/aln/"$samplename"_1.remap.recal.sai +bwa aln -R 5 -t $nthreads $mynewref_recal $reads_2 > $outputfolder/aln/"$samplename"_2.remap.recal.sai +bwa sampe -P $mynewref_recal $outputfolder/aln/"$samplename"_1.remap.recal.sai $outputfolder/aln/"$samplename"_2.remap.recal.sai $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.recal.sam +else +bwa mem -Y -t $nthreads $mynewref_recal $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.recal.sam +fi +samtools view -@ $nthreads -bt $mynewref_recal -o $outputfolder/aln/$samplename.remap.recal.bam $outputfolder/aln/$samplename.remap.recal.sam +samtools sort -@ $nthreads $outputfolder/aln/$samplename.remap.recal.bam $outputfolder/aln/$samplename.remap.recal.sorted +samtools rmdup $outputfolder/aln/$samplename.remap.recal.sorted.bam $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam +samtools index $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam +samtools view -@ $nthreads -b -h -q 1 $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam > $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam +samtools index $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam +fi +printf "done. \n" +samtools idxstats $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam > $outputfolder/out/$samplename.stats +rm $outputfolder/aln/*.sam +## Estimating spanning reads +printf "[EricScript] Scoring candidate fusions ..." +R --slave --args $samplename,$outputfolder,$readlength < $ericscriptfolder/lib/R/EstimateSpanningReads.R +myflag=`cat $outputfolder/out/.ericscript.flag` +if [ $myflag -eq 0 ]; then +printf "done. \n" +printf "[EricScript] No chimeric transcripts found! Writing results ..." +R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/MakeEmptyResults.R +printf "done. \n" +exit 1 +fi +printf "done. \n" +printf "[EricScript] Filtering candidate fusions ..." +if [ $verbose -eq 0 ]; then +samtools mpileup -A -f $mynewref_recal -l $outputfolder/out/$samplename.intervals $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam > $outputfolder/out/$samplename.remap.recal.sorted.rmdup.pileup 2>> $outputfolder/out/.ericscript.log +else +samtools mpileup -A -f $mynewref_recal -l $outputfolder/out/$samplename.intervals $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam > $outputfolder/out/$samplename.remap.recal.sorted.rmdup.pileup +fi +cut -f1,2,3 $outputfolder/out/$samplename.remap.recal.sorted.rmdup.pileup | grep -e '[0-9]----[a-z | A-Z]' - > $outputfolder/out/$samplename.intervals.pileup +R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/BuildNeighbourhoodSequences.R +if [ $verbose -eq 0 ]; then +blat $myref $outputfolder/out/.link $outputfolder/out/$samplename.checkselfhomology.blat -out=blast8 1>> $outputfolder/out/.ericscript.log +else +blat $myref $outputfolder/out/.link $outputfolder/out/$samplename.checkselfhomology.blat -out=blast8 +fi +R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/CheckSelfHomology.R +myflag=`cat $outputfolder/out/.ericscript.flag` +if [ $myflag -eq 0 ]; then +printf "done. \n" +printf "[EricScript] No chimeric transcripts found! Writing results ..." +R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/MakeEmptyResults.R +printf "done. \n" +exit 1 +fi +printf "done. \n" +## Writing results +if [ $verbose -eq 0 ]; then +printf "[EricScript] Writing results ... " +else +printf "[EricScript] Writing results ... \n" +fi +R --slave --args $samplename,$outputfolder,$ericscriptfolder,$readlength,$verbose,$refid,$dbfolder < $ericscriptfolder/lib/R/MakeResults.R +printf "done. \n" +rm $outputfolder/out/*fai +if [ $removetemp -eq 1 ]; then +printf "[EricScript] Removing temporary files ... " +rm -r $outputfolder/aln +rm -r $outputfolder/out +printf "done. \n" +fi +printf "[EricScript] Open $outputfolder/$samplename.results* to view the results of EricScript analysis.\n" diff --git a/lib/demo/myreads_1.fq.gz b/lib/demo/myreads_1.fq.gz new file mode 100644 index 0000000..74520e3 Binary files /dev/null and b/lib/demo/myreads_1.fq.gz differ diff --git a/lib/demo/myreads_2.fq.gz b/lib/demo/myreads_2.fq.gz new file mode 100644 index 0000000..4a7e168 Binary files /dev/null and b/lib/demo/myreads_2.fq.gz differ diff --git a/lib/perl/retrievefrombiomart.pl b/lib/perl/retrievefrombiomart.pl new file mode 100755 index 0000000..919a1dd --- /dev/null +++ b/lib/perl/retrievefrombiomart.pl @@ -0,0 +1,75 @@ +# script by BioMart +# feb16: mod with selection of genome release + +use strict; +use LWP::UserAgent; + +my $ensversion = $ARGV[1]; + +open (FH,$ARGV[0]) || die ("\nUsage: perl webExample.pl Query.xml\n\n"); + +my $xml; +while (){ + $xml .= $_; +} +close(FH); + +my $path = ""; + +if ($ensversion == 70) { + $path = "http://jan2013.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 71) { + $path = "http://apr2013.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 72) { + $path = "http://jun2013.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 73) { + $path = "http://sep2013.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 74) { + $path = "http://dec2013.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 75) { + $path = "http://feb2014.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 76) { + $path = "http://aug2014.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 77) { + $path = "http://oct2014.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 78) { + $path = "http://dec2014.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 79) { + $path = "http://mar2015.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 80) { + $path = "http://may2015.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 81) { + $path = "http://jul2015.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion == 82) { + $path = "http://sep2015.archive.ensembl.org/biomart/martservice?"; +} +elsif ($ensversion > 82) { + $path = "http://www.ensembl.org/biomart/martservice?"; +} +my $request = HTTP::Request->new("POST",$path,HTTP::Headers->new(),'query='.$xml."\n"); +my $ua = LWP::UserAgent->new; +my $response; + +$ua->request($request, + sub{ + my($data, $response) = @_; + if ($response->is_success) { + print "$data"; + } + else { + warn ("Problems with the web server: ".$response->status_line); + } + },1000); + diff --git a/lib/perl/trimfq.pl b/lib/perl/trimfq.pl new file mode 100755 index 0000000..27287c6 --- /dev/null +++ b/lib/perl/trimfq.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl +use warnings; +use strict; + +my($listin) = $ARGV[0]; +my($ntrim) = $ARGV[1]; +my($outfile) = $ARGV[2]; + +open LIST, "${listin}" or die $!; +open OUT, ">$outfile"; +my $a; +my $count = 1; +while () + + { + if ($count++ % 2 == 0) { + $a = substr($_, 0, $ntrim); + print OUT "$a\n"; + } else + { + print OUT "$_"; + } + + } + +close LIST; +close OUT; diff --git a/lib/perl/xa2multi.pl b/lib/perl/xa2multi.pl new file mode 100755 index 0000000..4e813be --- /dev/null +++ b/lib/perl/xa2multi.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +while (<>) { + if (/\tXA:Z:(\S+)/) { + my $l = $1; + print; + my @t = split("\t"); + while ($l =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) { + my $mchr = ($t[6] eq $1)? '=' : $t[6]; # FIXME: TLEN/ISIZE is not calculated! + my $seq = $t[9]; + my $phred = $t[10]; + # if alternative alignment has other orientation than primary, + # then print the reverse (complement) of sequence and phred string + if ((($t[1]&0x10)>0) xor ($2<0)) { + $seq = reverse $seq; + $seq =~ tr/ACGTacgt/TGCAtgca/; + $phred = reverse $phred; + } + print(join("\t", $t[0], ($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, @t[6..7], 0, $seq, $phred, "NM:i:$4"), "\n"); + } + } else { print; } +} +