-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalignbypattern.pl
executable file
·170 lines (146 loc) · 4.44 KB
/
alignbypattern.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env perl
use strict;
use lib "/Users/alexrd/perl";
use mj qw ( %names %symb );
my @residue = split(/:/,shift(@ARGV));
my @index = split(/:/,shift(@ARGV));
sub help {
print("
alignbypattern.pl
Alex Dickson
University of Michigan
This script takes a folder full of pdbs, and aligns them in sequence according to a pattern of conserved residues.
If the pattern is found more than once it will align to each instance, and then print a warning.
The output pdbs are sorted by which chain was matched to the given sequence.
The first argument is a colon-separated list of the residues that are conserved using one-letter codes.
Multiple residues can be specified to allow for flexibility:
e.g. Y:N:ILVM:DN
The second argument is a colon-separated list of the corresponding residue indices.
The output pdbs are aligned in sequence such that the matching residues have these indices.
e.g. 412:428:425:421
If during the alignment process, negative indices would be obtained, these residues are put on a new chain, X, and
their residue numbers are left unchanged. To avoid this, consider shifting these values upward, by a constant.
");
exit(0);
}
if ((scalar @residue != scalar @index) || (scalar @residue == 0)) {
print("Usage: res1:res2:res3:.. ind1:ind2:ind3:..\n");
&help;
}
my @files = split(/\n/,`ls *.pdb`);
if (scalar @files == 0) {
print("No pdbs in this directory!\n");
&help;
}
my $shiftupnexttime = 0;
for my $file (@files) {
my %resi;
open(MYIN,"$file");
while (my $line = <MYIN>) {
if (substr($line,0,4) eq "ATOM" || substr($line,0,6) eq "HETATM") {
my $chain = substr($line,21,1);
my $resnum = substr($line,22,4);
$resnum =~ s/^\s+//;
my $resname = substr($line,17,3);
$resi{$chain}{$resnum} = lc($resname);
}
if (substr($line,0,6) eq "ENDMDL") {
last;
}
}
close(MYIN);
my @matchchain = ();
my @matchnum = ();
for my $c ( keys %resi ) {
for my $n ( keys %{$resi{$c}} ) {
if (exists $symb{$resi{$c}{$n}}) {
my $aa = $symb{$resi{$c}{$n}};
if ($residue[0] =~ m/$aa/) { # the first one matches, check the rest
# print("following up on chain $c num $n : resid = $aa\n");
my $match = 1;
for my $i (1..@residue-1) {
if ($match) {
my $offset = $index[$i] - $index[0];
my $ni = $n + $offset;
if (exists $resi{$c}{$ni}) {
my $aai = $symb{$resi{$c}{$ni}};
if (!($residue[$i] =~ m/$aai/)) {
# print("does not match because of resid $ni : $resi{$c}{$ni} : ($aai instead of $residue[$i])\n");
$match = 0;
}
} else {
# print("resi $resi{$c}{$ni} does not exist in the table\n");
$match = 0;
}
}
}
if ($match) {
push(@matchchain,$c);
push(@matchnum,$n);
}
}
}
}
}
if (scalar @matchchain == 0) {
print(STDERR "no sequence matches found for $file\n");
} else {
my $senterr = 0;
my %chains;
for my $i (0..@matchchain-1) {
if (!exists $chains{$matchchain[$i]}) {
$chains{$matchchain[$i]} = 1;
} else {
$chains{$matchchain[$i]}++;
if (!$senterr) {
print(STDERR "multiple sequence matches found for $file ");
for my $j (0..@matchchain-1) {
print(STDERR "($matchchain[$j])$matchnum[$j] ");
}
print(STDERR "\n");
$senterr = 1;
}
}
my $outfile = $file;
$outfile =~ s/\.pdb/_align_${matchchain[$i]}$chains{$matchchain[$i]}\.pdb/;
open(MYOUT,">$outfile");
open(MYIN,"$file");
while (my $line = <MYIN>) {
if ((substr($line,0,4) eq "ATOM" || substr($line,0,4) eq "TER ") || substr($line,0,6) eq "HETATM") {
my $chain = substr($line,21,1);
my $resnum = substr($line,22,4);
if ($chain eq $matchchain[$i]) {
my $newnum = $resnum - $matchnum[$i] + $index[0];
if ($newnum < 0) {
if (-$newnum > $shiftupnexttime) {
$shiftupnexttime = -$newnum;
}
$newnum = $resnum;
substr($line,21,1,"X");
} else {
my $strnum = sprintf('%4s',$newnum);
substr($line,22,4,$strnum);
}
print(MYOUT $line);
} else {
print(MYOUT $line);
}
} else {
print(MYOUT $line);
}
}
close(MYIN);
close(MYOUT);
}
}
}
if ($shiftupnexttime > 0) {
print(STDERR "Warning: negative residue indices were shifted to chain X.
To avoid this next time, shift indices upward by $shiftupnexttime like so:
");
for (@index) {
$_ += $shiftupnexttime;
}
my $tmp = join(':',@index);
print(STDERR "$tmp\n");
}