Skip to content

Commit

Permalink
updated file test to open file rather than use operators.
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Keane committed May 30, 2016
1 parent a1fb472 commit a33b726
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 13 deletions.
15 changes: 11 additions & 4 deletions RetroSeq/Utilities.pm
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ my $BAMFLAGS =
'mate_reverse' => 0x0020,
'1st_in_pair' => 0x0040,
'2nd_in_pair' => 0x0080,
'secondary' => 0x0100,
'secondary' => 0x0100,
'failed_qc' => 0x0200,
'duplicate' => 0x0400,
};
Expand Down Expand Up @@ -211,6 +211,14 @@ sub testBreakPoint
}
}

# read is mapped mate is mapped paired correctly
if( ! ($s[ 1 ] & $$BAMFLAGS{'unmapped'} ) && !($s[ 1 ] & $$BAMFLAGS{'mate_unmapped'} ) && ( $s[ 1 ] & $$BAMFLAGS{'read_paired'} ) )
{
#look for soft clipped bases and include them in the count of soft clipped reads
if( $s[ 5 ] =~ /[0-9]{2}+S$/ ){$lhsSoft++;}
if( $s[ 5 ] =~ /^[0-9]{2}+S/ ){$rhsSoft++;}
}

# read is mapped mate is mapped not paired correctly ins size < 50k both mates are mapped to same strand
if( ! ($s[ 1 ] & $$BAMFLAGS{'unmapped'} ) && !($s[ 1 ] & $$BAMFLAGS{'mate_unmapped'} ) && ! ( $s[ 1 ] & $$BAMFLAGS{'read_paired'} ) && abs($s[ 8 ]) < 50000 && ( $s[ 1 ] & $$BAMFLAGS{'reverse_strand'} ) == ( $s[ 1 ] & $$BAMFLAGS{'mate_reverse'} ) && ( $s[ 2 ] eq $s[ 6 ] || $s[ 6 ] eq '=') ) #and both mates mapped to the same chr
{
Expand Down Expand Up @@ -859,7 +867,6 @@ sub calculateCigarBreakpoint
=pod
return 1 - unmapped mate
return 2 - mate mapped but incorrectly
return 3 - soft clipped read supporting the breakpoint
=cut
sub isSupportingClusterRead
{
Expand All @@ -876,8 +883,8 @@ sub isSupportingClusterRead

return 0 if( !( $flag & $$BAMFLAGS{'paired_tech'} ) );

# read is not a duplicate map quality is >= minimum
if( ! ( $flag & $$BAMFLAGS{'duplicate'} ) && $mapQ >= $minQual )
# read is not a duplicate map quality is >= minimum no soft clipping(full length) is not a secondary match
if( ! ( $flag & $$BAMFLAGS{'duplicate'} ) && $mapQ >= $minQual && $cigar !~ /[0-9]+S/ && $cigar !~ /[0-9]+H/ && ! ( $flag & $$BAMFLAGS{'secondary'} ) )
{
# read is mapped mate is unmapped
if( ! ( $flag & $$BAMFLAGS{'unmapped'} ) && ( $flag & $$BAMFLAGS{'mate_unmapped'} ) )
Expand Down
25 changes: 16 additions & 9 deletions bin/retroseq.pl
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
my $MAX_READ_GAP_IN_REGION = 120;
my $DEFAULT_MIN_CLUSTER_READS = 2;
my $DEFAULT_MAX_CLUSTER_DIST = 4000;
my $DEFAULT_FEATURE_OVERLAP = 0.8;

my $HEADER = qq[#retroseq v:].$RetroSeq::Utilities::VERSION;
my $FOOTER = qq[#END_CANDIDATES];
Expand All @@ -54,7 +55,7 @@
'duplicate' => 0x0400,
};

my ($discover, $call, $bam, $bams, $ref, $eRefFofn, $length, $id, $output, $anchorQ, $region, $input, $reads, $depth, $noclean, $tmpdir, $readgroups, $filterFile, $orientate, $ignoreRGsFofn, $callNovel, $refTEs, $excludeRegionsDis, $doAlign, $incsoftclips, $help);
my ($discover, $call, $bam, $bams, $ref, $eRefFofn, $length, $id, $output, $anchorQ, $region, $input, $reads, $depth, $noclean, $tmpdir, $readgroups, $filterFile, $orientate, $ignoreRGsFofn, $callNovel, $refTEs, $excludeRegionsDis, $doAlign, $incsoftclips, $fractionOverlap, $help);

GetOptions
(
Expand Down Expand Up @@ -86,6 +87,7 @@
'exd=s' => \$excludeRegionsDis,
'align' => \$doAlign,
'soft' => \$incsoftclips,
'fetOverlap' => \$fractionOverlap,
'h|help' => \$help,
);

Expand Down Expand Up @@ -121,13 +123,14 @@
-output Output file to store candidate supporting reads (required for calling step)
[-refTEs Tab file with TE type and BED file of reference elements. These will be used to quickly assign discordant reads the TE types and avoid alignment. Using this will speed up discovery dramatically.]
[-noclean Do not remove intermediate output files. Default is to cleanup.]
[-q Minimum mapping quality for a read mate that anchors the insertion call. Default is 30. Parameter is optional.]
[-id Minimum percent ID for a match of a read to the transposon references. Default is 90.]
[-q Minimum mapping quality for a read mate that anchors the insertion call. Default is $DEFAULT_ANCHORQ. Parameter is optional.]
[-id Minimum percent ID for a match of a read to the transposon references. Default is $DEFAULT_ID.]
[-rgs Comma separated list of readgroups to operate on. Default is all.]
[-exd Fofn of BED files of regions where discordant mates falling into will be excluded e.g. simple repeats, centromeres, telomeres]
[-align Do the computational expensive exonerate PE discordant mate alignment step]
[-eref Tab file with list of transposon types and the corresponding fasta file of reference sequences (e.g. SINE /home/me/refs/SINE.fasta). Required when the -align option is used.]
[-len Minimum length of a hit to the transposon references when using the -align option. Default is 36bp.]
[-len Minimum length of a hit to the transposon references when using the -align option. Default is $DEFAULT_LENGTH.]
[-overlap Fraction of read alignment that must overlap with ref TE elements to be considered. Default is $DEFAULT_FEATURE_OVERLAP.]
USAGE

Expand All @@ -142,7 +145,8 @@
foreach my $type ( keys( %{$erefs} ) )
{
my $file = $$erefs{$type};
croak qq[Cant find transposon reference file: $file\n] if( ! -f $$erefs{$type} );
#Test if can open the file
open(my $iifh,$file)or croak qq[Cant find $type transposon reference file: $file\nError $!\n];close($iifh);
}
if( ! $doAlign ){print qq[Forcing -align option to be switched on as transposon sequence files were provided\n];$doAlign = 1;}
}
Expand All @@ -155,15 +159,17 @@
foreach my $type ( keys( %{$refTEsF} ) )
{
my $file = $$refTEsF{ $type };
croak qq[Cant find $type reference TEs file: $file\n] if( ! -f $$refTEsF{$type} );
#Test if can open the file
open(my $iifh,$file)or croak qq[Cant find $type reference TEs file: $file\nError $!\n];close($iifh);
}
}

$anchorQ = defined( $anchorQ ) && $anchorQ > -1 ? $anchorQ : $DEFAULT_ANCHORQ;
$id = defined( $id ) && $id < 101 && $id > 0 ? $id : $DEFAULT_ID;
$length = defined( $length ) && $length > 25 ? $length : $DEFAULT_LENGTH;
my $clean = defined( $noclean ) ? 0 : 1;

$fractionOverlap = defined( $fractionOverlap ) && $fractionOverlap <= 1.0 && $fractionOverlap > 0 ? $fractionOverlap : $DEFAULT_FEATURE_OVERLAP;

if( $readgroups && length( $readgroups ) > 0 )
{
my @s = split( /,/, $readgroups );foreach my $rg ( @s ){if( $rg !~ /[A-Za-z0-9]|\.|-|_+/ ){croak qq[Invalid readgroup: $rg\n];}}
Expand All @@ -177,7 +183,7 @@
RetroSeq::Utilities::checkBinary( q[exonerate], qq[2.2.0] ) if( $doAlign );
RetroSeq::Utilities::checkBinary( q[bedtools] );

_findCandidates( $bam, $erefs, $id, $length, $anchorQ, $output, $readgroups, $refTEsF, $excludeRegionsDis, $doAlign, $clean );
_findCandidates( $bam, $erefs, $id, $length, $anchorQ, $output, $readgroups, $refTEsF, $excludeRegionsDis, $doAlign, $fractionOverlap, $clean );
}
elsif( $call )
{
Expand Down Expand Up @@ -274,6 +280,7 @@ sub _findCandidates
my $refTEsF = shift;
my $excludeRegionsDis = shift;
my $doAlign = shift;
my $fractionOverlap = shift;
my $clean = shift;

my $readgroupsFile = qq[$$.readgroups];
Expand Down Expand Up @@ -392,7 +399,7 @@ sub _findCandidates
{
my $file = $$refTEsF{ $type };
print qq[Screening for hits to: $type\n];
system( qq[bedtools intersect -a $discordantMatesBed -b $file -u | awk -F"\t" '{print \$4,\$5}' > $$.$type.mates.bed] ) == 0 or die qq[Failed to run bedtools intersect];
system( qq[bedtools intersect -a $discordantMatesBed -b $file -u -f $fractionOverlap | awk -F"\t" '{print \$4,\$5}' > $$.$type.mates.bed] ) == 0 or die qq[Failed to run bedtools intersect];

#print the mates (i.e. the anchors) of these reads into the discovery output file
#first load up the readnames
Expand Down

0 comments on commit a33b726

Please sign in to comment.