forked from alexrd/SynthPDB
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgroupalign.pl
executable file
·301 lines (253 loc) · 8.4 KB
/
groupalign.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/usr/bin/env perl
use strict;
use lib "/Users/alexrd/perl";
use mj qw ( %names %symb );
my $alignfile = shift(@ARGV);
my $alignres = shift(@ARGV);
sub help {
print("
groupalign.pl
Usage: groupalign.pl alignfile alignres
Alex Dickson
University of Michigan
This script uses a multiple-sequence alignment, and maps the indices of amino acids in PDBs such that they agree with
this alignment. It can handle alignments of domains (such as bromodomains, zinc fingers, etc.) where multiple
domains can occur on the same protein. The output pdbs are labeled by which chain was matched to the given sequence.
The first argument is the name of the file where the alignment is stored. The format of this file is assumed to
be the same as Clustal Omega output, which is as follows:
Q6IRX1_68 -----------------------LPDYY------LTIKKPMDMEKIRSHMMA---NKYQDIDSMVEDFVMMFNNACTYNEP--------
P25440_317 ----KHAAYAWPFYKPVDASALGLHDYH------DIIKHPMDLSTVKRKMEN---RDYRDAQEFAADVRLMFSNCYKYNPPDHDVV---
Q58F21_287 ----KHFSYAWPFYNPVDVNALGLHNYY------DVVKNPMDLGTIKEKMDN---QEYKDAYKFAADVRLMFMNCYKYNPPDHEVV---
B4E3L4_723 -----------PFRQPVDPQLLGIPDYF------DIVKNPMDLSTIKRKLDT---GQYQEPWQYVDDVWLMFNNAWLYNR---------
P51532_1461 -----------------------LPEYY------ELIRKPVDFKKIKERIRN---HKY-------------------------------
etc..
Where the first word is a unique identifier (here, the uniprot ID followed by the starting position)
and the second word is the alignment for each sequence. Multiple 'paragraphs' can be used for longer sequences, and
they should be separated by two blank lines. The first word identifier in each line of the second paragraph should
be the same as was given in the first paragraph: the start residue does not need to be updated.
The second argument is the index of the first amino acid in the alignment. This will be shared among all the
aligned PDB files. If during the alignment process, negative indices result, these residues are put on
a new chain, X, and their residue numbers are left unchanged, and a warning is printed at the end.
To avoid this, consider using a higher index value.
");
exit(0);
}
if (!-f $alignfile) {
print("Error: alignfile does not exist!\n");
&help;
}
my @files = split(/\n/,`ls *.pdb`);
if (scalar @files == 0) {
print("No pdbs in this directory!\n");
&help;
}
if (!-d "seqalign") {
mkdir("seqalign");
}
# read alignment file
my %alignseq; # stores the sequence
my %ndashbef; # stores the number of dashes before each amino acid
open(MYIN,$alignfile);
while (my $line = <MYIN>) {
chomp($line);
my @arr = split(/\s+/,$line);
if (defined $arr[0] && $arr[0] ne "CLUSTAL") {
$alignseq{$arr[0]} .= "$arr[1]";
}
}
close(MYIN);
my %seq = %alignseq;
for my $id ( keys %seq ) {
$seq{$id} =~ s/-//g; # get sequences with no dashes
my @tmp;
my $n = length($alignseq{$id});
my $nres = 0;
my $nd = 0; # running number of dashes
my $naa = 0; # running number of amino acids
for my $i (0..$n-1) {
if (substr($alignseq{$id},$i,1) eq "-") {
$nd++;
} else {
$tmp[$naa] = $nd;
$naa++;
}
}
$tmp[$naa] = $nd;
$ndashbef{$id} = \@tmp;
}
my $shiftupnexttime = 0;
my %flagsent;
for my $file (@files) {
my %resi;
my %lowres; # lowest residue index on each chain
my %highres; # highest residue index on each chain
open(MYIN,"$file");
while (my $line = <MYIN>) {
if (substr($line,0,4) eq "ATOM" || substr($line,0,6) eq "HETATM") {
my $chain = substr($line,21,1);
my $resnum = substr($line,22,4);
$resnum =~ s/\s+//;
if (!defined $lowres{$chain} || $resnum < $lowres{$chain}) {
$lowres{$chain} = $resnum;
}
if (!defined $highres{$chain} || $resnum > $highres{$chain}) {
$highres{$chain} = $resnum;
}
my $resname = substr($line,17,3);
$resi{$chain}{$resnum} = lc($resname);
}
if (substr($line,0,6) eq "ENDMDL") {
last;
}
}
close(MYIN);
# try to match each chain with sequence fragments
my @match = ();
for my $c ( keys %resi ) {
# make a sequence "word" with spaces for missing residues
$resi{$c}{"seq"} = "";
for my $res ($lowres{$c}..$highres{$c}) {
my $resname = $resi{$c}{$res};
if (defined $resname) {
my $aa = $symb{lc($resname)};
if (!defined $aa) {
$aa = "X";
if (!exists $flagsent{$resname}) {
# print("No residue entry for residue $resname\n");
}
$flagsent{$resname} = 1;
}
$resi{$c}{"seq"} .= $aa;
} else {
$resi{$c}{"seq"} .= " ";
}
}
# look through the list of sequence fragments for a match
for my $s ( keys %seq ) {
my $mref = &getmatch($resi{$c}{"seq"},$seq{$s});
my @tmatch = @{$mref};
if (scalar @tmatch > 0) {
my $nm = scalar @tmatch;
foreach (@tmatch) {
$_->{"ID"} = $s;
$_->{"chain"} = $c;
$_->{"startres"} += $lowres{$c};
push(@match,$_); # collect matches in @match array
}
}
}
}
if (scalar @match == 0) {
print(STDERR "no sequence matches found for $file\n");
} else {
my $senterr = 0;
for my $c (keys %resi) {
my $longest = undef; # find the longest match in each chain
my $long_index = undef;
for my $i (0..@match-1) {
if ($c eq $match[$i]{"chain"}) {
if (!defined $longest || length($seq{$match[$i]{'ID'}}) > $longest) {
$longest = length($seq{$match[$i]{'ID'}});
$long_index = $i;
}
}
}
if (defined $long_index) {
my $i = $long_index;
my $matchchain = $match[$i]{"chain"};
my $outfile = $file;
$outfile =~ s/\.pdb/_align_$match[$i]{'ID'}_${matchchain}\.pdb/;
open(MYOUT,">seqalign/$outfile");
open(MYIN,"$file");
while (my $line = <MYIN>) {
if ((substr($line,0,4) eq "ATOM" || substr($line,0,4) eq "TER ") || substr($line,0,6) eq "HETATM") {
my $chain = substr($line,21,1);
my $resnum = substr($line,22,4);
$resnum =~ s/\s+//;
# preprocessing : for visualization and alignment, look for multiple resolutions in crystal
# structures, and only keep 'A'
my $reso = substr($line,16,1);
if (($reso eq " ") || ($reso eq "A")) { # discard all resolutions that are not "A"
if ($chain eq $matchchain) { # write out modified residue number
my $newnum;
if ($resnum < $match[$i]{"startres"}) {
$newnum = $resnum - $match[$i]{"startres"} + $alignres;
} else { # shift up, taking dashes in sequence file into account
my $n = scalar @{$ndashbef{$match[$i]{'ID'}}};
my $tmp = $resnum-$match[$i]{"startres"};
if ($tmp >= $n) {
$tmp = $n -1;
}
$newnum = $resnum - $match[$i]{"startres"} + $alignres + $ndashbef{$match[$i]{'ID'}}[$tmp];
}
if ($newnum < 0) {
if (-$newnum > $shiftupnexttime) {
$shiftupnexttime = -$newnum;
}
$newnum = $resnum;
substr($line,21,1,"X"); # put residue instead on chain X, write a warning at the end
} else {
my $strnum = sprintf('%4s',$newnum);
substr($line,22,4,$strnum);
}
print(MYOUT $line);
} else {
print(MYOUT $line);
}
}
} else {
print(MYOUT $line);
}
}
close(MYIN);
close(MYOUT);
}
}
}
}
if ($shiftupnexttime > 0) {
my $tmp = $shiftupnexttime + $alignres;
print(STDERR "Warning: negative residue indices were shifted to chain X.
To avoid this next time, use alignres = $tmp\n");
}
sub getmatch {
# looks for a match between the two sequences
# returns an array of two integer arrays,integer reporting on the match strength, and the starting residue
#
# strength 1 = matches, but with X residues
# strength 3 = matches
my $seqpdb = shift;
my $seqalign = shift;
my @matches = ();
if (length($seqpdb) >= length($seqalign)) {
for my $startres (0..length($seqpdb)-length($seqalign)) {
my $match = 3;
my $nresmatch = 0;
my $i = 0;
my $test = substr($seqpdb,$startres,length($seqalign));
if ($test =~ m/\s/) {
$match = 0;
} else {
for my $res ($startres..$startres+length($seqalign)-1) {
my $a = substr($seqpdb,$res,1);
my $b = substr($seqalign,$i,1);
if ($a eq "X") {
$match = 1;
} elsif ($a ne $b) {
$match = 0;
last;
} else {
$nresmatch++;
}
$i++;
}
my $fracmatch = $nresmatch/(length($seqalign));
if ($match > 0 && $fracmatch > 0.5) {
my %tmp = ("strength" => $match, "startres" => $startres);
push(@matches,\%tmp);
}
}
}
}
return \@matches;
}