From 88749ff195a1afcc27de7894df3ffc5e6d21d4a7 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Fri, 9 Mar 2018 12:53:36 +0100 Subject: [PATCH 01/10] added optional call to breakMAF.pl before RNAcode --- tools/rna_tools/rnacode/rnacode.xml | 32 ++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index 0311ca4d2c..c6abdf4966 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -10,6 +10,15 @@ RNAcode --version aln && + #else + ln -s $alignment aln && + #end if + RNAcode $outputFormat @@ -37,7 +46,7 @@ --pars "$pars" #end if - $alignment + aln #if $outputFormat.value == '--tabular' --outfile $outFileDefault @@ -49,6 +58,27 @@ + + + + + + + + + + + + + + + + + + + + + From a55a819e1c04b6107f2e2485088125e3ec30cf9c Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Fri, 9 Mar 2018 12:53:36 +0100 Subject: [PATCH 02/10] added optional call to breakMAF.pl before RNAcode --- tools/rna_tools/rnacode/rnacode.xml | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index 0311ca4d2c..d3214fea8c 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -10,6 +10,15 @@ RNAcode --version aln && + #else + ln -s $alignment aln && + #end if + RNAcode $outputFormat @@ -37,7 +46,7 @@ --pars "$pars" #end if - $alignment + aln #if $outputFormat.value == '--tabular' --outfile $outFileDefault @@ -49,6 +58,17 @@ + + + + + + + + + + + From 170ec5a72bdf7064ffffe25c0ff33ccd5270306b Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Fri, 9 Mar 2018 14:01:47 +0100 Subject: [PATCH 03/10] fixes - single quotes for alignment file - version increased - stdio -> detect_errors="exit_code" --- tools/rna_tools/rnacode/rnacode.xml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index d3214fea8c..1b08b245c8 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -1,22 +1,18 @@ - + Analyze the protein coding potential in MSA. rnacode - - - - RNAcode --version - + aln && + < '$alignment' > aln && #else - ln -s $alignment aln && + ln -s '$alignment' aln && #end if RNAcode From 503a14b5b725e336e656117afad400fc70411ff9 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Thu, 22 Mar 2018 15:23:12 +0100 Subject: [PATCH 04/10] preprocessing scripts and eps output working --- tools/rna_tools/rnacode/breakMAF.pl | 312 ++++++++++++++++++++++++++ tools/rna_tools/rnacode/processMAF.sh | 148 ++++++++++++ tools/rna_tools/rnacode/rnacode.xml | 20 +- 3 files changed, 473 insertions(+), 7 deletions(-) create mode 100755 tools/rna_tools/rnacode/breakMAF.pl create mode 100755 tools/rna_tools/rnacode/processMAF.sh diff --git a/tools/rna_tools/rnacode/breakMAF.pl b/tools/rna_tools/rnacode/breakMAF.pl new file mode 100755 index 0000000000..9de1a2b6f7 --- /dev/null +++ b/tools/rna_tools/rnacode/breakMAF.pl @@ -0,0 +1,312 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; +use POSIX qw(ceil floor); + +my $maxLength = 400; +my $desiredLength = 200; +my $help = 0; + +GetOptions( + "maxLength:i" => \$maxLength, + "desiredLength:i" => \$desiredLength, + "help" => \$help, + "h" => \$help +); + +if ($help) { + print STDERR "\nbreakMAF [options] < input.maf > output.maf\n\n"; + print STDERR "Options:\n"; + print STDERR " maxLength: Break all blocks longer than that (default: 400 columns)\n"; + print STDERR " desiredLength: Try to create blocks of this size (default: 200 columns)\n"; + exit(1); +} + +$/ = 'a score='; + +while ( my $block = <> ) { + + $block = 'a score=' . $block; + + my $currAln = readMAF($block)->[0]; + + next if not defined($currAln); + + my $length = length( $currAln->[0]->{seq} ); + + if ( $length > $maxLength ) { + + my $breakN = int( $length / $desiredLength ); + my $chunkLength = ceil( $length / $breakN ); + + my $from = 0; + + while (1) { + my $to = $from + $chunkLength; + + if ( $to > $length ) { + $to = $length; + } + + my $slice = sliceAlnByColumn( $currAln, $from, $to ); + + print formatAln( $slice, 'maf' ), "\n"; + + $from = $to; + + last if $from == $length; + } + + } else { + print formatAln( $currAln, 'maf' ), "\n"; + } +} + +###################################################################### +# +# readMAF($string) +# +# Converts the MAF in string to internal alignment format. Returns +# list of usual array references. +# +###################################################################### + +sub readMAF { + + my $string = shift; + + return [] if $string eq ''; + + my @input = split( "\n", $string ); + + #open(FILE,"<$file") || die("Could not read $file ($!)"); + + my @outAlns = (); + my @aln = (); + + foreach my $i ( 0 .. $#input ) { + + $_ = $input[$i]; + + next if (/^\s?\#/); + next if (/^\s?a/); + + if (/^\s?s/) { + ( my $dummy, my $name, my $start, my $length, my $strand, my $fullLength, my $seq ) = split; + + my $end = $start + $length; + + ( my $org, my $chrom ) = split( /\./, $name ); + + push @aln, { + name => $name, + org => $org, + chrom => $chrom, + start => $start, + end => $end, + seq => $seq, + strand => $strand + }; + } + + if ( /^\s?$/ and @aln ) { + push @outAlns, [@aln]; + @aln = (); + } + if ( ( not defined $input[ $i + 1 ] ) and (@aln) ) { + push @outAlns, [@aln]; + @aln = (); + } + } + return \@outAlns; +} + +sub formatAln { + + my @aln = @{ $_[0] }; + + my $format = $_[1]; + + $format = lc($format); + + my @alnSeqs = (); + my @alnNames = (); + + my $counter = 1; + + foreach my $row (@aln) { + + my $name = "seq$counter"; + $counter++; + if ( $row->{name} ) { + $name = ( $row->{name} ); + } elsif ( $row->{org} and $row->{chrom} ) { + $name = "$row->{org}.$row->{chrom}"; + } + + my $start = $row->{start}; + my $end = $row->{end}; + my $strand = $row->{strand}; + + my $pos = ''; + + if ( defined $start + and defined $end + and defined $strand + and ( $format ne 'phylip' ) ) { + ( $start, $end ) = ( $end, $start ) if ( $strand eq '-' ); + $pos = "/$start-$end"; + } + + push @alnNames, "$name$pos"; + push @alnSeqs, $row->{seq}; + + } + + my $output = ''; + + if ( $format eq 'clustal' ) { + + $output = "CLUSTAL W(1.81) multiple sequence alignment\n\n\n"; + my $maxName = 0; + + foreach my $name (@alnNames) { + $maxName = ( $maxName < length($name) ) ? length($name) : $maxName; + } + + for my $i ( 0 .. $#alnNames ) { + my $buffer = " " x ( ( $maxName + 6 ) - length( $alnNames[$i] ) ); + $alnNames[$i] .= $buffer; + } + my $columnWidth = 60; + my $currPos = 0; + my $length = length( $alnSeqs[0] ); + + while ( $currPos < $length ) { + for my $i ( 0 .. $#alnNames ) { + $output .= $alnNames[$i]; + $output .= substr( $alnSeqs[$i], $currPos, $columnWidth ); + $output .= "\n"; + } + $output .= "\n\n"; + $currPos += $columnWidth; + } + } elsif ( $format eq 'fasta' ) { + foreach my $i ( 0 .. $#alnNames ) { + my $name = $alnNames[$i]; + my $seq = $alnSeqs[$i]; + $seq =~ s/(.{60})/$1\n/g; + $output .= ">$name\n$seq\n"; + } + } elsif ( $format eq 'phylip' ) { + my $length = length( $alnSeqs[0] ); + my $N = @alnSeqs; + $output .= " $N $length\n"; + foreach my $i ( 0 .. $#alnNames ) { + my $name = $alnNames[$i]; + my $seq = $alnSeqs[$i]; + $seq =~ s/(.{60})/$1\n/g; + $output .= "$name\n$seq\n"; + } + } elsif ( $format eq 'maf' ) { + $output .= "a score=0\n"; + foreach my $row (@aln) { + my $length = $row->{end} - $row->{start}; + $output .= "s $row->{org}.$row->{chrom} $row->{start} $length $row->{strand} 0 $row->{seq}\n"; + } + } + return $output; + +} + +###################################################################### +# +# sliceAlnByColumn(\@aln ref-to-alignment, $start int, $end int) +# +# Returns slice of alignment specified by alignment column. +# +# \@aln ... alignment in list of hash format +# $start, $end ... slice to cut +# +# Returns reference to alignment in list of hash format. This is a new +# alignment, i.e. the input is not sliced in place +# +###################################################################### + +sub sliceAlnByColumn { + + my @aln = @{ $_[0] }; + shift; + ( my $start, my $end ) = @_; + + # correct ends without warning if outside of valid range + $start = 0 if ( $start < 0 ); + $end = length( $aln[0]->{seq} ) if ( $end > length( $aln[0]->{seq} ) ); + + #my @newAln=@aln; + + # make deep copy of list of hash + my @newAln = (); + foreach (@aln) { + push @newAln, { %{$_} }; + } + + foreach my $i ( 0 .. $#newAln ) { + + my $oldStart = $newAln[$i]->{start}; + my $oldEnd = $newAln[$i]->{end}; + + $newAln[$i]->{start} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $start ); + $newAln[$i]->{end} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $end - 1 ) + 1; + $newAln[$i]->{seq} = substr( $newAln[$i]->{seq}, $start, $end - $start ); + + } + + return ( [@newAln] ); + +} + +###################################################################### +# +# alnCol2genomePos($seq string, $start int, $col int) +# +# Calculates the genomic position corresponding to a column in an +# alignment. +# +# $seq ... sequence from alignment (i.e. letters with gaps) +# $start ... Genomic position of first letter in $seq +# $col ... column in the alignment that is to be mapped +# +# Returns genomic position. No error handling, so $col must be a valid +# column of the string $seq. +# +####################################################################### + +sub alnCol2genomePos { + + ( my $seq, my $start, my $col ) = @_; + + $seq =~ s/\./-/g; #Convert all gaps to "-" + + my $newPos = $start; + + # if gap only... + return $start if ( $seq =~ /^-+$/ ); + + ( my $tmp ) = $seq =~ /(-*)[^-]/; + + my $leadingGaps = length($tmp); + + # if column is in gap before first letter, + # return position of the first letter + return $start if ( $col < $leadingGaps ); + + $newPos = $start - 1; + + for my $i ( $leadingGaps .. $col ) { + $newPos++ if ( ( my $t = substr( $seq, $i, 1 ) ) ne '-' ); + } + return $newPos; +} + diff --git a/tools/rna_tools/rnacode/processMAF.sh b/tools/rna_tools/rnacode/processMAF.sh new file mode 100755 index 0000000000..106e3bd6aa --- /dev/null +++ b/tools/rna_tools/rnacode/processMAF.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# RNAcode sometimes fails because of bugs. Since the manual suggests +# to call RNAcode on splitted alignments it is feasible to run +# RNAcode separately on the parts. This is implemented here. Command +# line parameters just passed on to RNAcode. +# +# - the script ensures that the region ids are continuous (otherwise +# the results for each block would start with 0) +# - also eps file names are corrected accordingly +# if RNAcode fails for one part it just outputs the part (for bug reporting) +# and continues + +# for splitting the alignment you can use breakMAF.pl from the RNAcode +# github (it seems to be absent from the 0.3 release) and feed the output +# with the desired RNAcode parameters into this shell script, e.g.: +# +# breakMAF.pl < chrM.maf | ./processMAF.sh --tabular --eps --eps-dir eps2/ + +# parse the command line options +args=$@ +while [[ $# -gt 0 ]] +do +key="$1" + case $key in + -d|--eps-dir) + epsdir=$2 + shift # past argument + shift # past value + ;; + -e|--eps) + eps=1 + shift # past argument + ;; + -t|--tabular) + tabular=1 + shift # past argument + ;; + -o|--outfile) + outfile=$2 + shift # past argument + shift # past value + ;; + -g|--gtf|-b|--best-only|-r|--best-region|-s|--stop-early|-n|--num-samples|-p|--cutoff|-c|--pars|-e|--eps|-i|--eps-cutoff|-h|--help|-v|--version) + shift # past argument + ;; + *) # unknown option + file=$1 + shift # past argument + ;; + esac +done + +# fix output (renumber blocks) +# and move eps files (if present) to tmpdir +function fix_output { + if [[ -z "$last" ]]; then + echo "reset LAST" + last=0 + fi + while read line + do + i=`echo "$line" | awk '{print $1}'` + j=`echo "$i+$last" | bc` + echo $line | awk -v n=$j '{printf("%d\t", n); for(i=2; i<=NF; i++){printf("%s", $(i)); if(i==NF){printf("\n")}else{printf("\t")}}}' + if [[ ! -z "$eps" && -f ${epsdir:-eps}/hss-$i.eps ]]; then + mv ${epsdir:-eps}/hss-$i.eps $tmpd/hss-$j.eps + fi + done + if [[ ! -z "$j" ]]; then + last=`echo "$j+1" | bc` + unset j + fi +} + +# run RNAcode for $tempfile if >= 3 sequences +function run_rnacode { + >&2 echo -e "processing" `cat ${tmpif} | grep $ref | cut -d" " -f1-6` + nl=`cat ${tmpif} | grep "^s" | wc -l` + if [[ "$nl" -ge "3" ]]; then + stdout=`RNAcode $@ |& egrep -v "^ HSS #|^====|^$"` + if [[ "$?" != "0" ]]; then + ef=$(mktemp -u -p '.') + cat ${tmpif} > ${ef}.maf + >&2 echo "RNAcode failed for the alignmentblock \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\" (${ef}.maf)" + fi + else + >&2 echo "less than 3 sequences in the alignment block \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\"" + fi + # save content of outfile (otherwise it might be overwritten) + if [[ ! -z "$outfile" ]]; then + fix_output < "$outfile" >> $tmpof + rm "$outfile" + else + # - filter stdout for lines containing the ref and redirect everything to stderr + # https://github.com/wash/rnacode/issues/9 + # - we can not pipe stdout | ... | fix_output since then $last can not be used as global variable + echo "$stdout" | grep -v $ref 1>&2 + tmpf=$(mktemp -p '.') + echo "$stdout" | grep $ref > $tmpf + fix_output < $tmpf + rm $tmpf + fi +} + +ref="" +last=0 + +if [[ ! -z "$tabular" ]]; then + if [[ ! -z "$outfile" ]]; then + echo -a "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" > "$outfile" + else + echo "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" + fi +fi + +tmpif=$(mktemp -p '.') +tmpof=$(mktemp -p '.') +tmpd=$(mktemp -d -p '.') +while read line +do + if [[ "$line" =~ ^# ]]; then + echo > ${tmpif} + elif [[ "$line" =~ ^$ ]]; then + run_rnacode $args ${tmpif} + echo > ${tmpif} + else + if [[ -z $ref && "$line" =~ ^s ]]; then + ref=`echo $line | sed 's/\./ /g' | cut -d" " -f 2` + fi + echo $line >> ${tmpif} + fi +done < /dev/stdin +run_rnacode $args ${tmpif} + +if [[ ! -z "$outfile" ]]; then + cat $tmpof > "$outfile" +fi + +if [[ ! -z "$eps" ]]; then + if [[ -z "$epsdir" ]]; then + mkdir eps + fi + mv ${tmpd}/*eps ${epsdir:-eps}/ +fi + +rm ${tmpif} +rm ${tmpof} +rmdir ${tmpd} diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index 1b08b245c8..79759d0ae0 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -10,13 +10,17 @@ breakMAF.pl --maxlength $cond_breakmaf.maxlength --desiredLength $cond_breakmaf.desiredLength - < '$alignment' > aln && + < '$alignment' > aln.maf #else - ln -s '$alignment' aln && + ln -s '$alignment' aln.maf + #end if + + #if $cond_breakmaf.select_breakmaf == 'breakmaf' and $cond_breakmaf.processseparately == 'yes' + ./processMAF.sh + #else + RNAcode #end if - RNAcode - $outputFormat #if $cutoff and $cutoff is not None @@ -33,6 +37,7 @@ #if $cond_generateEPS.generateEPS == 'create' --eps + --eps-dir eps/ #if $cond_generateEPS.eps_cutoff and $cond_generateEPS.eps_cutoff is not None --eps-cutoff $cond_generateEPS.eps_cutoff #end if @@ -42,7 +47,7 @@ --pars "$pars" #end if - aln + aln.maf #if $outputFormat.value == '--tabular' --outfile $outFileDefault @@ -55,13 +60,14 @@ - + + @@ -105,7 +111,7 @@ cond_generateEPS['generateEPS'] == "create" - + From a10370508d9d256b167c65f72fcae398cc406344 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Fri, 23 Mar 2018 13:58:45 +0100 Subject: [PATCH 05/10] added processMAF for separate processing of parts and fixes --- tools/rna_tools/rnacode/processMAF.sh | 66 ++++++++++++++++----------- tools/rna_tools/rnacode/rnacode.xml | 35 +++++++++++--- 2 files changed, 67 insertions(+), 34 deletions(-) diff --git a/tools/rna_tools/rnacode/processMAF.sh b/tools/rna_tools/rnacode/processMAF.sh index 106e3bd6aa..c78b24dce0 100755 --- a/tools/rna_tools/rnacode/processMAF.sh +++ b/tools/rna_tools/rnacode/processMAF.sh @@ -17,22 +17,27 @@ # breakMAF.pl < chrM.maf | ./processMAF.sh --tabular --eps --eps-dir eps2/ # parse the command line options -args=$@ +# - removes --outfile, --help, --version, and the input file from the arguments +# - outfile and infile are stored in variables +declare -a args=() while [[ $# -gt 0 ]] do key="$1" case $key in -d|--eps-dir) epsdir=$2 + args+=($1 $2) shift # past argument shift # past value ;; -e|--eps) eps=1 + args+=($1) shift # past argument ;; -t|--tabular) tabular=1 + args+=($1) shift # past argument ;; -o|--outfile) @@ -40,7 +45,15 @@ key="$1" shift # past argument shift # past value ;; - -g|--gtf|-b|--best-only|-r|--best-region|-s|--stop-early|-n|--num-samples|-p|--cutoff|-c|--pars|-e|--eps|-i|--eps-cutoff|-h|--help|-v|--version) + -g|--gtf|-b|--best-only|-r|--best-region|-s|--stop-early) + args+=($1) + shift # past argument + ;; + -n|--num-samples|-p|--cutoff|-c|--pars|-i|--eps-cutoff) + args+=($1 $2) + shift # past argument + ;; + -h|--help|-v|--version) shift # past argument ;; *) # unknown option @@ -54,7 +67,6 @@ done # and move eps files (if present) to tmpdir function fix_output { if [[ -z "$last" ]]; then - echo "reset LAST" last=0 fi while read line @@ -77,28 +89,22 @@ function run_rnacode { >&2 echo -e "processing" `cat ${tmpif} | grep $ref | cut -d" " -f1-6` nl=`cat ${tmpif} | grep "^s" | wc -l` if [[ "$nl" -ge "3" ]]; then - stdout=`RNAcode $@ |& egrep -v "^ HSS #|^====|^$"` + streams=$(mktemp -p '.') + RNAcode $@ &> $streams if [[ "$?" != "0" ]]; then ef=$(mktemp -u -p '.') cat ${tmpif} > ${ef}.maf >&2 echo "RNAcode failed for the alignmentblock \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\" (${ef}.maf)" fi - else - >&2 echo "less than 3 sequences in the alignment block \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\"" - fi - # save content of outfile (otherwise it might be overwritten) - if [[ ! -z "$outfile" ]]; then - fix_output < "$outfile" >> $tmpof - rm "$outfile" - else # - filter stdout for lines containing the ref and redirect everything to stderr # https://github.com/wash/rnacode/issues/9 # - we can not pipe stdout | ... | fix_output since then $last can not be used as global variable - echo "$stdout" | grep -v $ref 1>&2 - tmpf=$(mktemp -p '.') - echo "$stdout" | grep $ref > $tmpf - fix_output < $tmpf - rm $tmpf + cat $streams | grep -v $ref | egrep -v "^ HSS #|^====|^$" 1>&2 + sed -i -n "/$ref/p" $streams + fix_output < $streams + rm $streams + else + >&2 echo "less than 3 sequences in the alignment block \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\"" fi } @@ -107,39 +113,45 @@ last=0 if [[ ! -z "$tabular" ]]; then if [[ ! -z "$outfile" ]]; then - echo -a "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" > "$outfile" + echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" > "$outfile" else - echo "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" + echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" fi fi tmpif=$(mktemp -p '.') tmpof=$(mktemp -p '.') tmpd=$(mktemp -d -p '.') + +# process lines of the alignment +# - save lines to tmpif +# - empty lines: process tmpif (ie. last alignment block) with RNAcode, clear tmpif +# - on the go the name of the reference species is determined from the 1st line +# of the alignment this is used then for filtering the RNAcode output while read line do if [[ "$line" =~ ^# ]]; then - echo > ${tmpif} + echo -n > ${tmpif} elif [[ "$line" =~ ^$ ]]; then - run_rnacode $args ${tmpif} - echo > ${tmpif} + run_rnacode ${args[@]} ${tmpif} + echo -n > ${tmpif} else if [[ -z $ref && "$line" =~ ^s ]]; then ref=`echo $line | sed 's/\./ /g' | cut -d" " -f 2` fi echo $line >> ${tmpif} fi -done < /dev/stdin -run_rnacode $args ${tmpif} +done < ${file:-/dev/stdin} +# if there is something left -> process it +if [[ "`cat ${tmpif} | wc -l`" -gt "0" ]]; then + run_rnacode ${args[@]} ${tmpif} +fi if [[ ! -z "$outfile" ]]; then cat $tmpof > "$outfile" fi if [[ ! -z "$eps" ]]; then - if [[ -z "$epsdir" ]]; then - mkdir eps - fi mv ${tmpd}/*eps ${epsdir:-eps}/ fi diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index 79759d0ae0..6a4282d9c0 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -7,16 +7,16 @@ aln.maf + --desiredLength $cond_breakmaf.desiredlength + < '$alignment' > aln.maf && #else - ln -s '$alignment' aln.maf + ln -s '$alignment' aln.maf && #end if #if $cond_breakmaf.select_breakmaf == 'breakmaf' and $cond_breakmaf.processseparately == 'yes' - ./processMAF.sh + $__tool_directory__/processMAF.sh #else RNAcode #end if @@ -61,8 +61,8 @@ - - + + @@ -122,8 +122,29 @@ + + + + + + + + + + + + + + + + + + + + + From 78beeaf90531c24ffb03412df05fd201cef15316 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Fri, 23 Mar 2018 18:37:35 +0100 Subject: [PATCH 06/10] bugfixes for gtf output --- tools/rna_tools/rnacode/processMAF.sh | 70 +++++++++++++++++---------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/tools/rna_tools/rnacode/processMAF.sh b/tools/rna_tools/rnacode/processMAF.sh index c78b24dce0..3cbf7138fa 100755 --- a/tools/rna_tools/rnacode/processMAF.sh +++ b/tools/rna_tools/rnacode/processMAF.sh @@ -35,6 +35,11 @@ key="$1" args+=($1) shift # past argument ;; + -g|--gtf) + gtf=1 + args+=($1) + shift # past argument + ;; -t|--tabular) tabular=1 args+=($1) @@ -45,13 +50,14 @@ key="$1" shift # past argument shift # past value ;; - -g|--gtf|-b|--best-only|-r|--best-region|-s|--stop-early) + -b|--best-only|-r|--best-region|-s|--stop-early) args+=($1) shift # past argument ;; -n|--num-samples|-p|--cutoff|-c|--pars|-i|--eps-cutoff) args+=($1 $2) shift # past argument + shift # past value ;; -h|--help|-v|--version) shift # past argument @@ -71,9 +77,18 @@ function fix_output { fi while read line do - i=`echo "$line" | awk '{print $1}'` + if [[ -z "$gtf" ]]; then + i=`echo "$line" | sed 's/^\([[:digit:]]\+\)[[:space:]].*/\1/'` + else + i=`echo $line | sed 's/.*Gene\([[:digit:]]\+\).*/\1/'` + fi j=`echo "$i+$last" | bc` - echo $line | awk -v n=$j '{printf("%d\t", n); for(i=2; i<=NF; i++){printf("%s", $(i)); if(i==NF){printf("\n")}else{printf("\t")}}}' + if [[ -z "$gtf" ]]; then + echo "$line" | sed "s/^\([[:digit:]]\+\)\([[:space:]].*\)/$j\2/" + else + echo "$line" | sed "s/^\(.*\)Gene[0-9]\+\(\".*\)$/\1Gene$j\2/" + fi + #echo $line | awk -v n=$j '{printf("%d\t", n); for(i=2; i<=NF; i++){printf("%s", $(i)); if(i==NF){printf("\n")}else{printf("\t")}}}' if [[ ! -z "$eps" && -f ${epsdir:-eps}/hss-$i.eps ]]; then mv ${epsdir:-eps}/hss-$i.eps $tmpd/hss-$j.eps fi @@ -83,26 +98,32 @@ function fix_output { unset j fi } - + # run RNAcode for $tempfile if >= 3 sequences function run_rnacode { - >&2 echo -e "processing" `cat ${tmpif} | grep $ref | cut -d" " -f1-6` - nl=`cat ${tmpif} | grep "^s" | wc -l` + >&2 echo -e "processing " `cat ${tmpif} | grep ^s | head -n 1 | cut -d" " -f1-6` +# >&2 echo "with RNAcode" $@ + nl=`cat ${tmpif} | grep "^s" | wc -l` if [[ "$nl" -ge "3" ]]; then - streams=$(mktemp -p '.') - RNAcode $@ &> $streams + # - filter the outfile for lines containing the ref and redirect everything to stderr + # https://github.com/wash/rnacode/issues/9 + # - we can not pipe stdout | ... | fix_output since then $last can not be used as global variable + if [[ ! -z "$gtf" ]]; then + field=1 + elif [[ ! -z "$tabular" ]]; then + field=7 + else + field=6 + fi + RNAcode $@ | awk -v ref=$ref -v field=$field '{if($(field)==ref){print $0}else{$0 > "/dev/stderr"}}' > ${tmpof} + if [[ "$?" != "0" ]]; then ef=$(mktemp -u -p '.') cat ${tmpif} > ${ef}.maf >&2 echo "RNAcode failed for the alignmentblock \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\" (${ef}.maf)" fi - # - filter stdout for lines containing the ref and redirect everything to stderr - # https://github.com/wash/rnacode/issues/9 - # - we can not pipe stdout | ... | fix_output since then $last can not be used as global variable - cat $streams | grep -v $ref | egrep -v "^ HSS #|^====|^$" 1>&2 - sed -i -n "/$ref/p" $streams - fix_output < $streams - rm $streams + fix_output < $tmpof + echo -n > ${tmpof} else >&2 echo "less than 3 sequences in the alignment block \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\"" fi @@ -112,11 +133,7 @@ ref="" last=0 if [[ ! -z "$tabular" ]]; then - if [[ ! -z "$outfile" ]]; then - echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" > "$outfile" - else - echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" - fi + echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" >> ${outfile:-/dev/stdout} fi tmpif=$(mktemp -p '.') @@ -128,16 +145,22 @@ tmpd=$(mktemp -d -p '.') # - empty lines: process tmpif (ie. last alignment block) with RNAcode, clear tmpif # - on the go the name of the reference species is determined from the 1st line # of the alignment this is used then for filtering the RNAcode output +# in case of gtf output only the chromosome is printed, ie only chr1 instead of dm6.chr1 while read line do if [[ "$line" =~ ^# ]]; then echo -n > ${tmpif} elif [[ "$line" =~ ^$ ]]; then run_rnacode ${args[@]} ${tmpif} + # >> ${outfile:-/dev/stdout} echo -n > ${tmpif} else if [[ -z $ref && "$line" =~ ^s ]]; then - ref=`echo $line | sed 's/\./ /g' | cut -d" " -f 2` + if [[ -z "$gtf" ]]; then + ref=`echo $line | cut -d" " -f 2` + else + ref=`echo $line | sed 's/\./ /g' | cut -d" " -f 3` + fi fi echo $line >> ${tmpif} fi @@ -145,10 +168,7 @@ done < ${file:-/dev/stdin} # if there is something left -> process it if [[ "`cat ${tmpif} | wc -l`" -gt "0" ]]; then run_rnacode ${args[@]} ${tmpif} -fi - -if [[ ! -z "$outfile" ]]; then - cat $tmpof > "$outfile" + # >> ${outfile:-/dev/stdout} fi if [[ ! -z "$eps" ]]; then From e495ea999eb6b65fb8b616c68dc0990983b77ab5 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Mon, 26 Mar 2018 08:20:32 +0200 Subject: [PATCH 07/10] added missing quotes --- tools/rna_tools/rnacode/rnacode.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index 6a4282d9c0..28ac111fe2 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -6,7 +6,7 @@ RNAcode --version Date: Tue, 27 Mar 2018 18:46:17 +0200 Subject: [PATCH 08/10] removed quotes again --- tools/rna_tools/rnacode/rnacode.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index 28ac111fe2..6a4282d9c0 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -6,7 +6,7 @@ RNAcode --version Date: Wed, 28 Mar 2018 08:46:57 +0200 Subject: [PATCH 09/10] applied correct quoting --- tools/rna_tools/rnacode/rnacode.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index 6a4282d9c0..36c18bb0bf 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -7,7 +7,7 @@ aln.maf && @@ -16,7 +16,7 @@ #end if #if $cond_breakmaf.select_breakmaf == 'breakmaf' and $cond_breakmaf.processseparately == 'yes' - $__tool_directory__/processMAF.sh + '$__tool_directory__/processMAF.sh' #else RNAcode #end if From 16cc0a9a956857179b32c9ae2845da7f20bb5d54 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Fri, 13 Apr 2018 11:02:29 +0200 Subject: [PATCH 10/10] switched to pipes --- tools/rna_tools/rnacode/rnacode.xml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/rna_tools/rnacode/rnacode.xml b/tools/rna_tools/rnacode/rnacode.xml index 36c18bb0bf..53e7fc12b5 100644 --- a/tools/rna_tools/rnacode/rnacode.xml +++ b/tools/rna_tools/rnacode/rnacode.xml @@ -10,9 +10,9 @@ '$__tool_directory__/breakMAF.pl' --maxlength $cond_breakmaf.maxlength --desiredLength $cond_breakmaf.desiredlength - < '$alignment' > aln.maf && + < '$alignment' | #else - ln -s '$alignment' aln.maf && + cat '$alignment' | #end if #if $cond_breakmaf.select_breakmaf == 'breakmaf' and $cond_breakmaf.processseparately == 'yes' @@ -47,8 +47,6 @@ --pars "$pars" #end if - aln.maf - #if $outputFormat.value == '--tabular' --outfile $outFileDefault #elif $outputFormat.value == '--gtf'