forked from LUMC/ribosome-profiling-analysis-framework
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpack_sam_files.php
executable file
·113 lines (100 loc) · 3.93 KB
/
pack_sam_files.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/php
<?php
/*******************************************************************************
*
* PackSamFiles converts sam files into packed sam files, saving space and
* grouping the reads together into transcript, position, coverage.
*
* Created : 2013-08-13
* Modified : 2013-08-15
* Version : 0.1
*
* Copyright : 2013-2015 Leiden University Medical Center; http://www.LUMC.nl/
* Programmer : Ing. Ivo F.A.C. Fokkema <[email protected]>
*
*
* This work is licensed under the Creative Commons
* Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
* copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/
* or send a letter to:
* Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
*
*************/
$_SETT =
array(
'version' => '0.1',
'suffix' => '.packed',
'terminal_width' => 100,
);
echo 'PackSamFiles v.' . $_SETT['version'] . "\n" .
'PLEASE DO NOT USE THIS SCRIPT ON A NETWORK DRIVE; IT CAN BE INCREDIBLY SLOW THERE.' . "\n\n";
$aFiles = $_SERVER['argv'];
$sScriptName = array_shift($aFiles);
if (count($aFiles) < 1) {
die('Usage: ' . $sScriptName . ' SAM_FILE1 [SAM_FILE2 [SAM_FILE3 [...]]]' . "\n\n");
}
// Check if all files can be read.
foreach ($aFiles as $sFile) {
if (!is_readable($sFile)) {
die('Unable to open ' . $sFile . '.' . "\n");
}
}
foreach ($aFiles as $sFile) {
$nLine = 0;
// SAM files are BIG. Very, very, very, BIG. Really. HUGE, actually. GBs. We need to read this line by line, to prevent a memory error.
$fIn = @fopen($sFile, 'r');
if (!$fIn) {
die('Unable to open ' . $sFile . '.' . "\n\n");
}
$nFileSize = filesize($sFile);
$nBytesRead = 0;
$sFileOut = $sFile . $_SETT['suffix'];
$fOut = @fopen($sFileOut, 'w');
if (!$fOut) {
die('Unable to open file for writing: ' . $sFileOut . '.' . "\n\n");
}
$aData = array(); // Will contain transcripts as keys, with an array (position => coverage) as value.
while ($sLine = fgets($fIn)) {
$nLine ++;
$nBytesRead += strlen($sLine);
$sLine = rtrim($sLine);
list(, $sReference, $nPosition) = explode("\t", $sLine);
list(, , , $sTranscript,) = explode('|', $sReference);
if ($sTranscript && $nPosition) {
if (!isset($aData[$sTranscript])) {
$aData[$sTranscript] = array();
}
if (!isset($aData[$sTranscript][$nPosition])) {
$aData[$sTranscript][$nPosition] = 0;
}
$aData[$sTranscript][$nPosition] ++;
} else {
die('Can\'t parse line ' . $nLine . ' in file ' . $sFile . '.' . "\n\n");
}
if (!($nLine % 50000)) {
$nPercentageRead = round($nBytesRead/$nFileSize, 2);
$nAvailableWidth = $_SETT['terminal_width'] - 8 - strlen($nLine);
$lDone = round($nPercentageRead*$nAvailableWidth);
print(str_repeat(chr(8), $_SETT['terminal_width']) .
'[' . str_repeat('=', $lDone) . str_repeat(' ', $nAvailableWidth - $lDone) . '] ' . $nLine . ' ' . str_pad(round($nPercentageRead*100), 3, ' ', STR_PAD_LEFT) . '%');
}
}
$nAvailableWidth = $_SETT['terminal_width'] - 8 - strlen($nLine);
print(str_repeat(chr(8), $_SETT['terminal_width']) .
'[' . str_repeat('=', $nAvailableWidth) . '] ' . $nLine . ' 100%');
fclose($fIn);
print("\n" .
'Done reading ' . $nLine . ' lines, writing output... ');
// Sorting the results would be nice, but perhaps really unnecessary.
ksort($aData);
foreach ($aData as $sTranscript => $aTranscript) {
ksort($aTranscript);
foreach ($aTranscript as $nPosition => $nCoverage) {
fputs($fOut, $sTranscript . "\t" . $nPosition . "\t" . $nCoverage . "\n");
}
}
fclose($fOut);
print('Done, wrote ' . count($aData) . ' lines.' . "\n");
}
die('All files done.' . "\n");
?>