-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathlzh8.py
executable file
·266 lines (213 loc) · 8.14 KB
/
lzh8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env python
# Author: Bryan Cain
# Original software (lzh8_dec 0.8) written in C by hcs
# Date: January 17, 2011
# Description: Decompresses Nintendo's N64 romc compression, type 2 (LZ77+Huffman)
'''
LZH8 decompressor
An implementation of LZSS, with symbols stored via two Huffman codes:
- one for backreference lengths and literal bytes (8 bits each)
- one for backreference displacement lengths (bits - 1)
Layout of the compression:
0x00: 0x40 (LZH8 identifier)
0x01-0x03: uncompressed size (little endian)
0x04-0x07: optional 32-bit size if 0x01-0x03 is 0
followed by:
9-bit prefix coding tree table (for literal bytes and backreference lengths)
0x00-0x01: Tree table size in 32-bit words, -1
0x02-: Bit packed 9-bit inner nodes and leaves, stored as in Huff8
Total size: 2 ^ (leaf count + 1)
5-bit prefix coding tree table (for backreference displacement lengths)
0x00: Tree table size in 32-bit words, -1
0x01-: Bit packed 5-bit inner nodes and leaves, stored as in Huff8
Total size: 2 ^ (leaf count + 1)
Followed by compressed data bitstream:
1) Get a symbol from the 9-bit tree, if < 0x100 is a literal byte, repeat 1.
2) If 1 wasn't a literal byte, symbol - 0x100 + 3 is the backreference length
3) Get a symbol from the 5-bit tree, this is the length of the backreference
displacement.
3a) If displacement length is zero, displacement is zero
3b) If displacement length is one, displacement is one
3c) If displacement length is > 1, displacement is next displen-1 bits,
with an extra 1 on the front (normalized).
Reverse engineered by hcs.
'''
import sys, os, struct
from array import array
VERSION = "0.8"
# debug output options
SHOW_SYMBOLS = 0
SHOW_FREQUENCIES = 0
SHOW_TREE = 0
SHOW_TABLE = 0
# constants
LENBITS = 9
DISPBITS = 5
LENCNT = (1 << LENBITS)
DISPCNT = (1 << DISPBITS)
# globals
input_offset = 0
bit_pool = 0 # uint8_t
bits_left = 0
# read MSB->LSB order
'''static inline uint16_t get_next_bits(
FILE *infile,
long * const offset_p,
uint8_t * const bit_pool_p,
int * const bits_left_p,
const int bit_count)'''
def get_next_bits(infile, bit_count):
global input_offset, bit_pool, bits_left
offset_p = input_offset
bit_pool_p = bit_pool
bits_left_p = bits_left
out_bits = 0
num_bits_produced = 0
while num_bits_produced < bit_count:
if bits_left_p == 0:
infile.seek(offset_p)
bit_pool_p = struct.unpack("<B", infile.read(1))[0]
bits_left_p = 8
offset_p += 1
bits_this_round = 0
if bits_left_p > (bit_count - num_bits_produced):
bits_this_round = bit_count - num_bits_produced
else:
bits_this_round = bits_left_p
out_bits <<= bits_this_round
out_bits |= (bit_pool_p >> (bits_left_p - bits_this_round)) & ((1 << bits_this_round) - 1)
bits_left_p -= bits_this_round
num_bits_produced += bits_this_round
input_offset = offset_p
bit_pool = bit_pool_p
bits_left = bits_left_p
return out_bits
# void analyze_LZH8(FILE *infile, FILE *outfile, long file_length)
def decompress(infile):
global input_offset, bit_pool, bits_left
input_offset = 0
bit_pool = 0
bits_left = 0
# determine input file size
infile.seek(0, os.SEEK_END)
file_length = infile.tell()
# read header
infile.seek(input_offset)
header = struct.unpack("<I", infile.read(4))[0]
if (header & 0xFF) != 0x40: raise ValueError("not LZH8")
uncompressed_length = header >> 8
if uncompressed_length == 0:
uncompressed_length = struct.unpack("<I", f.read(4))[0]
# allocate output buffer
outbuf = array('B', '\0' * uncompressed_length) # uint8_t*
# allocate backreference length decode table
length_table_bytes = (struct.unpack("<H", infile.read(2))[0] + 1) * 4 # const uint32_t
length_decode_table_size = LENCNT * 2 # const long
length_decode_table = array('H', '\0' * length_decode_table_size * 2) # uint16_t* const
input_offset = infile.tell()
# read backreference length decode table
#if SHOW_TABLE: print "backreference length table"
start_input_offset = input_offset-2
i = 1
bits_left = 0
while (input_offset - start_input_offset) < length_table_bytes:
if i >= length_decode_table_size:
break
length_decode_table[i] = get_next_bits(infile, LENBITS)
i += 1
#if SHOW_TABLE: print "%ld: %d" % (i-1, length_decode_table[i-1])
input_offset = start_input_offset + length_table_bytes
bits_left = 0
#if SHOW_TABLE: print "done at 0x%lx" % input_offset
# allocate backreference displacement length decode table
infile.seek(input_offset)
displen_table_bytes = (struct.unpack("<B", infile.read(1))[0] + 1) * 4 # const uint32_t
input_offset += 1
displen_decode_table = array('B', '\0' * (DISPCNT * 2)) # uint8_t* const
# read backreference displacement length decode table
#if SHOW_TABLE: print "backreference displacement length table"
start_input_offset = input_offset-1
i = 1
bits_left = 0
while (input_offset - start_input_offset < displen_table_bytes):
if i >= length_decode_table_size:
break
displen_decode_table[i] = get_next_bits(infile, DISPBITS)
i += 1
#if SHOW_TABLE: print "%ld: %d" % (i-1, displen_decode_table[bit_pool = 0 # uint8_ti-1])
input_offset = start_input_offset + displen_table_bytes
bits_left = 0
#if SHOW_TABLE: print "done at 0x%lx" % input_offset
bytes_decoded = 0
# main decode loop
while bytes_decoded < uncompressed_length:
length_table_offset = 1
# get next backreference length or literal byte
while True:
next_length_child = get_next_bits(infile, 1)
length_node_payload = length_decode_table[length_table_offset] & 0x7F
next_length_table_offset = (length_table_offset / 2 * 2) + (length_node_payload + 1) * 2 + bool(next_length_child)
next_length_child_isleaf = length_decode_table[length_table_offset] & (0x100 >> next_length_child)
if next_length_child_isleaf:
length = length_decode_table[next_length_table_offset]
if 0x100 > length:
# literal byte
outbuf[bytes_decoded] = length
bytes_decoded += 1
else:
# backreference
length = (length & 0xFF) + 3
displen_table_offset = 1
# get backreference displacement length
while True:
next_displen_child = get_next_bits(infile, 1)
displen_node_payload = displen_decode_table[displen_table_offset] & 0x7
next_displen_table_offset = (displen_table_offset / 2 * 2) + (displen_node_payload + 1) * 2 + bool(next_displen_child)
next_displen_child_isleaf = displen_decode_table[displen_table_offset] & (0x10 >> next_displen_child)
if next_displen_child_isleaf:
displen = displen_decode_table[next_displen_table_offset]
displacement = 0
if displen != 0:
displacement = 1 # normalized
# collect the bits
#for (uint16_t i = displen-1; i > 0; i--)
for i in range(displen-1, 0, -1):
displacement *= 2
next_bit = get_next_bits(infile, 1)
displacement |= next_bit
# apply backreference
#for (long i = 0; i < length && bytes_decoded < uncompressed_length; bytes_decoded ++, i ++)
for i in range(length):
outbuf[bytes_decoded] = outbuf[bytes_decoded - displacement - 1]
bytes_decoded += 1
if bytes_decoded >= uncompressed_length: break
break # break out of displen tree traversal loop
else:
assert next_displen_table_offset != displen_table_offset # stuck in a loop somehow
displen_table_offset = next_displen_table_offset
# end of displen tree traversal loop
# end of if backreference !(0x100 > length)*/
break # break out of length tree traversal loop
else:
assert next_length_table_offset != length_table_offset # "stuck in a loop somehow"
length_table_offset = next_length_table_offset
# end of length tree traversal
# end of main decode loop
return outbuf.tostring()
if __name__ == "__main__":
if len(sys.argv) != 3:
print "lzh8_dec %s\n" % VERSION
print "Usage: %s infile outfile" % sys.argv[0]
sys.exit(1)
# open file
infile = open(sys.argv[1], "rb")
outfile = open(sys.argv[2], "wb")
# decompress
print "Decompressing"
infile.seek(0, os.SEEK_SET)
output = decompress(infile)
print "Writing to file"
outfile.write(output)
outfile.close()
infile.close()
sys.exit(0)