'''S3_Stream.py

Get the checksum of an S3 object, and optionally download it or stream it
to stdout, verifying the result against the object's S3 etag.

Acknowledgements:
Used portions of https://github.com/mumrah/s3-multipart to build this script.
'''
import os
import sys
import time
import hashlib
import binascii
import re
import urlparse
import boto
import argparse
from progressbar import ProgressBar
from boto.s3.connection import OrdinaryCallingFormat
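# Background: for a multipart upload, S3 sets the object's etag to the md5 of
# the concatenated *binary* md5 digests of the parts, suffixed with "-<number
# of parts>" (e.g. "9b2cf535f27731c974343645a3985328-17"). For a single-part
# upload the etag is simply the md5 of the object. This script re-downloads
# the object part by part and recomputes both hashes to verify integrity.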
def set_args():
    parser = argparse.ArgumentParser(
        description="Get checksum of S3 object, and optionally download or stream it to stdout.",
        prog="s3-download")
    parser.add_argument("url", help="The S3 url to access")
    parser.add_argument("-o", "--output-file", help="Download the file",
                        default=False, action="store_true")
    parser.add_argument("-p", "--part-size",
                        help="Size of file parts in MB; must match the part size used "
                             "at upload for the etag check to succeed. Defaults to 15 MB.",
                        type=int, default=15)
    parser.add_argument("-max", "--max-tries", help="Number of times to retry a failed part.",
                        type=int, default=5)
    parser.add_argument("-nb", "--no-progress", help="Suppress progress bar.",
                        default=False, action="store_true")
    parser.add_argument("-", "--stdout", help="Output to stdout",
                        default=False, action="store_true")
    return parser.parse_args()
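# Example usage (bucket and key names are hypothetical):
#   python S3_Stream.py s3://my-bucket/data/file.bin -o              # download and verify
#   python S3_Stream.py s3://my-bucket/data/file.bin --stdout > out  # stream to stdout
#   python S3_Stream.py s3://my-bucket/data/file.bin -o -p 8         # 8 MB upload parts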
def hashstream(streamObj, hasher):
    hasher.update(streamObj)
    return hasher
def get_stream_part(min_byte, max_byte, received, current_tries=0):
    # Fetch one byte range of the object with an HTTP ranged GET, retrying
    # with a linearly growing backoff. Uses the module-level s3_conn, bucket,
    # key and args set in __main__.
    max_tries = args.max_tries
    try:
        resp = s3_conn.make_request('GET', bucket=bucket.name, key=key.name,
                                    headers={'Range': "bytes=%d-%d" % (min_byte, max_byte)})
        file_stream = resp.read()
        if args.output_file or args.stdout:
            write_to_file(min_byte, max_byte, file_stream)
        return file_stream, received
    except Exception:
        if current_tries > max_tries:
            sys.stderr.write('ERROR: Exceeded max number of retries on %d-%d: Part %d\n'
                             % (min_byte, max_byte, received))
            raise
        current_tries += 1
        sys.stderr.write("WARNING: Waiting %d seconds before retry.\n" % (current_tries * 3))
        time.sleep(current_tries * 3)
        sys.stderr.write("WARNING: Retrying part %d range %d-%d\n" % (received, min_byte, max_byte))
        return get_stream_part(min_byte, max_byte, received, current_tries)
def update_hasher(hasher, file_stream, part):
    # Feed this part into the running whole-file md5, and record the part's
    # own md5 so the multipart etag can be reconstructed later.
    hasher['full_hash'].update(file_stream)
    hasher['parts'][part - 1] = hashstream(file_stream, hashlib.md5()).hexdigest()
def compose_etag(hasher):
    # S3's multipart etag is the md5 of the concatenated *binary* part
    # digests, so unhexlify each part hash before hashing again.
    hash_as_bytes = ""
    for part in hasher['parts']:
        hash_as_bytes += binascii.unhexlify(part)
    etag_hash = hashlib.md5()
    etag_hash.update(hash_as_bytes)
    return etag_hash.hexdigest()
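# Illustration (part hashes A and B are made-up 32-hex-char strings, not real
# digests): for a two-part object, the recomputed multipart etag is
#   hashlib.md5(binascii.unhexlify(A) + binascii.unhexlify(B)).hexdigest() + "-2"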
def write_to_file(min_byte, max_byte, stream):
    if args.stdout:
        sys.stdout.write(stream)
    if args.output_file:
        # Seek to the part's offset so each part lands at the right place in
        # the output file regardless of arrival order.
        fd = os.open(os.path.basename(args.url), os.O_WRONLY)
        os.lseek(fd, min_byte, os.SEEK_SET)
        os.write(fd, stream)
        os.close(fd)
def get_contents(keyObj, chunksize):
    if args.output_file:
        # Pre-create the output file so write_to_file can open it O_WRONLY.
        fd = os.open(os.path.basename(args.url), os.O_WRONLY | os.O_CREAT)
        os.close(fd)
    chunksize = chunksize * 1024 * 1024
    size = keyObj.size
    if not args.stdout:
        print('Size of %d bytes' % size)
    etag = keyObj.etag
    etagsum = re.findall(r"([a-fA-F\d]{32})", etag)[0]
    if not args.stdout:
        print('Etag sum=%s' % etagsum)
    # A multipart etag carries a "-<parts>" suffix; a plain md5 etag means the
    # object was uploaded in a single part.
    match = re.search(r"-(\d+)", etag)
    parts = int(match.group(1)) if match else 1
    etag = '%s-%d' % (etagsum, parts)
    min_byte = 0
    max_byte = chunksize - 1
    hasher = {'full_hash': hashlib.md5(), 'parts': [None] * parts}
    if not args.stdout:
        print('Retrieving file in %d parts of %d MB' % (parts, chunksize / (1024 * 1024)))
    # Note: the recomputed etag can only match if chunksize equals the part
    # size that was used when the object was uploaded.
    part_range = range(1, parts)
    if not args.no_progress:
        part_range = ProgressBar()(part_range)
    for received in part_range:
        file_stream, received = get_stream_part(min_byte, max_byte, received)
        update_hasher(hasher, file_stream, received)
        min_byte = max_byte + 1
        max_byte = max_byte + chunksize
    # The final part runs to the end of the object.
    max_byte = size - 1
    file_stream, received = get_stream_part(min_byte, max_byte, parts)
    update_hasher(hasher, file_stream, parts)
    if parts > 1:
        computed_etag = "%s-%d" % (compose_etag(hasher), parts)
    else:
        computed_etag = "%s-%d" % (hasher['full_hash'].hexdigest(), parts)
    if computed_etag != etag:
        sys.stderr.write('ERROR: s3 etag: %s does not match computed etag: %s\n' % (etag, computed_etag))
    else:
        full_md5 = hasher['full_hash'].hexdigest()
        if not args.stdout:
            print('Final md5sum: %s' % full_md5)
            print('Recalculated etag: %s' % computed_etag)
        md5out = open(os.path.basename(args.url) + '.md5', 'w')
        md5out.write('%s  %s' % (full_md5, os.path.basename(args.url)))
        md5out.close()
        etagout = open(os.path.basename(args.url) + '.etag', 'w')
        etagout.write('%s  %s' % (computed_etag, os.path.basename(args.url)))
        etagout.close()
def get_connection(s3_path):
    # Split the url into bucket and key.
    split_rs = urlparse.urlsplit(s3_path)
    if split_rs.scheme != "s3":
        raise ValueError("'%s' is not an S3 url" % s3_path)
    s3 = boto.connect_s3(calling_format=OrdinaryCallingFormat())
    bucket = s3.lookup(split_rs.netloc, validate=False)
    if bucket is None:
        raise ValueError("'%s' is not a valid bucket" % split_rs.netloc)
    # urlsplit leaves a leading "/" on the path, which is not part of the key name.
    key = bucket.get_key(split_rs.path.lstrip('/'))
    if key is None:
        raise ValueError("'%s' does not exist." % split_rs.path)
    return (s3, bucket, key)
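# For reference, urlsplit on an s3 url (hypothetical example):
#   urlparse.urlsplit('s3://my-bucket/data/file.bin')
#   -> scheme='s3', netloc='my-bucket', path='/data/file.bin'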
if __name__ == "__main__":
    args = set_args()
    try:
        s3_conn, bucket, key = get_connection(args.url)
    except Exception as err:
        sys.stderr.write('ERROR: Connection Error! %s\n' % err)
        sys.exit(1)
    get_contents(key, args.part_size)