-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_motif_xl.pl
executable file
·132 lines (86 loc) · 2.61 KB
/
get_motif_xl.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env perl
=head1 NAME
=head1 AUTHOR
Chaolin Zhang ([email protected])
Created on Nov 22, 2017
=cut
use strict;
use warnings;
use Getopt::Long;
use Carp;
use File::Basename;
use Data::Dumper;
use MyConfig;
use Common 1.02;
use Bed;
use Motif;
my $prog = basename ($0);
my $ext = 10;
my $word = "";
my $padding = 0;
my $mismatch = 0;
my $cache = MyConfig::getDefaultCache ($prog);
my $verbose = 0;
GetOptions ("l=i"=>\$ext,
"w=s"=>\$word,
"p:i"=>\$padding,
"m:i"=>\$mismatch,
"c:s"=>\$cache,
"v"=>\$verbose);
if (@ARGV != 2)
{
print "Get crosslink frequency in motif ...\n";
print "Usage: $prog [options] <seq_file> <out_file>\n";
print " -l [int] : sequence extension around crosslink site ($ext)\n";
print " -w [string]: word to search for\n";
print " -p [int] : number of nucleotide to be padded on both sides ($padding)\n";
print " -m [int] : number of mismatches allowed in the core motif ($mismatch)\n";
print " -c [string]: cache dir ($cache)\n";
print " -v : verbose\n";
exit (1);
}
my ($inFastaFile, $outFile) = @ARGV;
print "loading top words ...\n" if $verbose;
my %crosslinkedBase;
system ("mkdir $cache");
print "search motifs ...\n" if $verbose;
my $motifLen = length ($word);
my $tmpSiteFile = "$cache/site.bed";
my $cmd = "PatternMatch -c $word -m $mismatch $inFastaFile > $tmpSiteFile";
print "$cmd\n" if $verbose;
my $ret = system ($cmd);
Carp::croak "CMD=$cmd failed:$?\n" if $ret != 0;
my %crosslinkCount;
for (my $i = 0; $i < $motifLen + $padding * 2; $i++)
{
$crosslinkCount{$i - $padding} = 0;
}
print "get crosslink profile ...\n" if $verbose;
my $fin;
open ($fin, "<$tmpSiteFile") || Carp::croak "cannot open file $tmpSiteFile to read\n";
my $n = 0;
while (my $line = <$fin>)
{
chomp $line;
next if $line=~/^\s*$/;
my $s = lineToBed ($line);
my $start = $s->{'chromStart'};
my $crosslinkPos = $ext - $start; #crosslink position relative to motif site
next if $crosslinkPos < -$padding || $crosslinkPos >= $motifLen + $padding;
$crosslinkCount{$crosslinkPos}++;
$n++;
}
close ($fin);
print "write to output...\n" if $verbose;
my @pos = sort {$a <=> $b} keys %crosslinkCount;
my $fout;
open ($fout, ">$outFile") || Carp::croak "cannot open file $outFile to write\n";
print $fout join ("\t", "#pos", "base", "count", "freq"), "\n";
foreach my $p (@pos)
{
my $base = $p < 0 || $p >= $motifLen ? 'N' : substr($word, $p, 1);
my $freq = $n == 0 ? 0 : $crosslinkCount{$p} / $n;
print $fout join ("\t", sprintf("%02d", $p), $base, $crosslinkCount{$p}, $freq), "\n";
}
close ($fout);
system ("rm -rf $cache");