This repository has been archived by the owner on Mar 17, 2022. It is now read-only.
forked from emretetik96/manuscript-pages
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathserver.py
240 lines (201 loc) · 10.2 KB
/
server.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
from __future__ import print_function
import httplib2
import os
import shutil
import io
import json
import urllib
from lxml import etree
from apiclient import discovery
from apiclient import http
from apiclient.http import MediaFileUpload
from apiclient import errors
import oauth2client
from oauth2client import client
from oauth2client import tools
import re
# Parse oauth2client's standard command-line flags (e.g. --noauth_local_webserver).
# argparse is missing on very old Pythons; in that case `flags` stays None and
# get_credentials() falls back to the legacy tools.run() path.
try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None
"""If modifying these scopes, delete your previously saved credentials
at ~/.credentials/drive-python-quickstart.json
"""
# OAuth scopes: read-only Drive access plus per-file write access for the upload.
SCOPES = 'https://www.googleapis.com/auth/drive.readonly https://www.googleapis.com/auth/drive.file'
# OAuth client secret downloaded from the Google API console for this project.
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
# Output report: one row per checked file with well-formedness / schema results.
CSV = "well_formedness_and_schema_validation_errors.csv"
def get_credentials():
    """Return valid OAuth2 user credentials, running the auth flow if needed.

    Reuses a credential file stored under ./.credentials/ when present and
    still valid; otherwise launches the OAuth2 flow configured by
    CLIENT_SECRET_FILE / SCOPES and stores the result for next time.

    Returns:
        Credentials, the obtained credential.
    """
    # NOTE(review): expanduser('./') expands nothing, so credentials live in
    # ./.credentials of the current working directory — not the user's home,
    # despite the "~/.credentials" note at module level. Confirm intent.
    credential_dir = os.path.join(os.path.expanduser('./'), '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, 'drive-quickstart.json')
    store = oauth2client.file.Storage(credential_path)
    credentials = store.get()
    if credentials and not credentials.invalid:
        return credentials
    # No usable stored credential: run the interactive OAuth2 flow.
    flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    flow.user_agent = APPLICATION_NAME
    if flags:
        print(flow)
        print(store)
        print(flags)
        credentials = tools.run_flow(flow, store, flags)
    else:
        # Legacy path for oauth2client versions without run_flow (Python 2.6 era).
        credentials = tools.run(flow, store)
    print('Storing credentials to ' + credential_path)
    return credentials
def download_file_by_url(url, path):
    """Download the file located at `url` to local location `path` (best effort).

    Download failures are deliberately swallowed: some Drive files have no
    usable export link, and callers treat a failed download as "skip this file".

    Args:
        url: source URL to fetch.
        path: local filesystem path to save the downloaded content to.
    """
    try:
        # Python 2 API; urllib.request.urlretrieve is the Python 3 equivalent.
        opener = urllib.URLopener()
        opener.retrieve(url, path)
    except IOError:
        # Narrowed from a bare `except:` so programming errors (NameError,
        # TypeError, ...) still surface instead of being silently eaten.
        return
def get_new_file_title(old_title):
    """Return the file's new title, per the naming convention:
    tc_p057r ==> 057r_tc_preTEI.xml

    The page token is digits followed by the recto/verso side ('r'/'v');
    the file type is the first run of [tcnl] characters (tc, tcn, or tl).

    Args:
        old_title: original Drive title, e.g. "tc_p057r".

    Returns:
        The renamed title, e.g. "057r_tc_preTEI.xml".

    Raises:
        ValueError: if the title contains no page number or no type code
            (previously this surfaced as an opaque AttributeError on None).
    """
    page_match = re.search(r'\d+[rv]', old_title)
    type_match = re.search(r'[tcnl]+', old_title)  # tc, tcn, or tl
    if page_match is None or type_match is None:
        raise ValueError(
            "title %r does not follow the tc/tcn/tl_pNNN[rv] naming convention"
            % (old_title,))
    return page_match.group(0) + "_" + type_match.group(0) + "_preTEI.xml"
def add_root_tags(path):
    """Wrap the file at `path` in <root> ... </root> tags, in place.

    The exported transcription pages are XML fragments; wrapping them in a
    single root element makes them parseable as well-formed XML documents.

    Args:
        path: location of the file to modify.
    """
    # Single read + full rewrite, replacing the original append-then-prepend
    # dance that opened the file twice and relied on r+/seek overwrite.
    with open(path, "r") as f:
        body = f.read()
    with open(path, "w") as f:
        f.write("<root>" + body + "</root>")
    return
def clear_directory(path):
    """Remove every file and subdirectory inside the directory at `path`.

    The directory itself is kept. Failures to remove an individual entry are
    printed and skipped rather than aborting the whole sweep.

    Args:
        path: directory whose contents should be deleted.
    """
    for entry_path in (os.path.join(path, name) for name in os.listdir(path)):
        try:
            if os.path.isfile(entry_path):
                os.unlink(entry_path)
            elif os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
        except Exception as err:
            print(err)
    return
def upload_csv_as_spreadsheet(service, path, file_title, file_parents=""):
    """Uploads a csv file to user's Google Drive as a Google spreadsheet.

    Args:
        service: the service object with which you are accessing the Drive API
        path: the path to the file to be uploaded
        file_title: the title to be given to the uploaded file
        file_parents: the IDs of the folders that this file should be uploaded to
            e.g. if you wish the uploaded file to be placed within a directory
            with ID 0BwJi-u8sfkVDZ05XNy1tMUdQM1E, then pass
            [{'id' : '0BwJi-u8sfkVDZ05XNy1tMUdQM1E'}] as the file_parents argument.
            If no value is passed for file_parents, then it is placed in the
            root folder of the user's Drive.
    """
    # Build the metadata once and attach 'parents' only when supplied,
    # instead of duplicating the whole dict in two branches.
    file_metadata = {
        'title' : file_title,
        'mimeType' : "application/vnd.google-apps.spreadsheet"
    }
    if file_parents != "":
        file_metadata['parents'] = file_parents
    media = MediaFileUpload(path, mimetype='text/csv', resumable=True)
    # Drive v2 files.insert; the returned file resource is not used by callers.
    service.files().insert(body=file_metadata, media_body=media, fields='id').execute()
    return
def main():
    """Downloads every file in __Manuscript Pages and saves them to the correct subdirectory of manuscript_downloads.
    Adds root tags to each file, checks if the files are well-formed XML and if they are valid against the schema at http://52.87.169.35:8080/exist/rest/db/ms-bn-fr-640/lib/preTEI.rng.
    Writes the results of this check to well_formedness_and_schema_validation_errors.csv
    Uploads the csv as a spreadsheet to 2016 Files for Paleographers.
    """
    # Authorize and build the Drive v2 service client.
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v2', http=http)
    clear_directory("./manuscript_downloads/")  # Clear manuscript_downloads directory
    for x in range(1,171):  # Repopulate manuscript_downloads with appropriate subdirectories
        # One subdirectory per page side: 001r..170r and 001v..170v.
        os.makedirs("./manuscript_downloads/" + str(x).zfill(3) + "r")
        os.makedirs("./manuscript_downloads/" + str(x).zfill(3) + "v")
    # NOTE(review): this handle is never written through or closed; all result
    # rows below reopen CSV in append mode. Truncating the old report is the
    # only effect of this line.
    csv = open(CSV, "wb")  # Create csv file
    """Get each folder in manuscript pages.
    maxResults is set to 400 so that every folder in __Manuscript Pages can be processed.
    If you would like to test the code for some functionality, set maxResults to a smaller number.
    """
    folders = service.files().list(q="'0B42QaQPHLJloNnZhakpiVk9GRmM' in parents", maxResults="400").execute()
    folders_hash = folders["items"]
    for folder in folders_hash:
        try: # Get the folder's id
            folder_id = folder["id"]
            print(folder_id)
            print(folder["title"])
        except:
            # NOTE(review): if folder["id"] itself raised, folder_id below is
            # stale (previous iteration) or undefined — the bare except hides it.
            print("no title")
        files_within_folder = service.files().list(q="'" + folder_id + "' in parents").execute() # Use the folder's id to get all files within the folder
        files = files_within_folder.get('items', [])
        if not files:
            print('No files found.')
        else:
            print('Files:')
        for f in files: # Process every file with an exportLink
            try:
                ftitle = f["title"] # Get the file's title
                m = re.search('\d+[rv]', ftitle)
                page_number = m.group(0) # Get the page number of the file to put it in the correct folder
                new_file_title = "manuscript_downloads/" + page_number + "/" + get_new_file_title(ftitle) # Generate the file's new name
                print(new_file_title)
                flink = f["exportLinks"]["text/plain"]
                download_file_by_url(flink, new_file_title) # Using the exportLink, download and save the file with its new title
                os.system("perl remove_BOM.pl " + new_file_title) # Run perl script to remove BOM character, which Google automatically adds to the start of the file when downloading
                add_root_tags(new_file_title) # Modify the file to add root tags at the beginning and end
                url = f["alternateLink"] # Get a clickable to url to add into the spreadsheet for corrector's ease
                # NOTE(review): rstrip strips a trailing run of *any* of these
                # characters (?, u, s, p, =, d, r, i, v, k), not the literal
                # suffix "?usp=drivesdk" — it can eat legitimate URL characters.
                clickable_url = url.rstrip("?usp=drivesdk")
                m = re.search('[tcnl]+', ftitle)
                file_type = m.group(0)
                # Row prefix shared by every result line written for this file.
                base_string = page_number + "," + file_type + "," + clickable_url
                try: # Check if the file is well-formed XML, write results to the csv
                    with open(new_file_title, "r") as myfile:
                        xml = myfile.read()
                    # Raises on malformed XML -> outer except writes "not well-formed".
                    doc = etree.fromstring(xml)
                    base_string = base_string + ", well-formed, , , " # Create base_string to write to csv
                    download_file_by_url("http://52.87.169.35:8080/exist/rest/db/ms-bn-fr-640/lib/preTEI.rng", "preTEI.rng") # Download the schema
                    # NOTE(review): the schema was just saved as "preTEI.rng", but a
                    # different local file, "ms-transcription.rng", is parsed here —
                    # confirm which RELAX NG schema is actually intended.
                    relaxng_doc = etree.parse("ms-transcription.rng")
                    relaxng = etree.RelaxNG(relaxng_doc)
                    doc = etree.parse(new_file_title)
                    try: # Validate the file against the schema, write results to the csv
                        relaxng.assertValid(doc)
                        with open(CSV, "a") as myfile:
                            myfile.write(base_string + ", schema-valid\n")
                    except Exception as e:
                        for err in relaxng.error_log: # Get each validation error and write it to its own row
                            error_string = str(err)
                            m = re.search('.*?:.*?:.*?:.*?:.*?:.*?:', error_string) # Format the error message to be cleaner
                            # NOTE(review): lstrip strips a leading run of characters
                            # drawn from the matched text, not the prefix string
                            # itself — this can over-trim the error message.
                            clean_error_message = error_string.lstrip(m.group(0))
                            with open(CSV, "a") as myfile:
                                myfile.write(base_string + ", not schema-valid, " + clean_error_message + "\n")
                except Exception as e:
                    with open(CSV, "a") as myfile:
                        myfile.write(base_string + ", not well-formed, " + str(e) + "\n")
            except:
                # Files without an exportLink (or with titles that don't match the
                # naming convention) land here and are skipped.
                print("No exportLink for this file")
    print(str(len(folders_hash)) + " folders processed.")
    upload_csv_as_spreadsheet(service, CSV, # Upload the csv file as a spreadsheet
                              "XML_well-formedness_and_schema_validation_errors_list",
                              [{'id' : '0BwJi-u8sfkVDZ05XNy1tMUdQM1E'}])
# Script entry point: run the full download / validate / upload pipeline.
if __name__ == '__main__':
    main()