# parse_xml_new_and_update.py
"""**MongoDB**:
.. warning:: MongoDb must be running. Otherwise it will give you an error.
.. note:: MongoDB is initialized just by calling the module *parse_xml_new_and_update*.
+----------------+-------------------------------------------+
| Data base | Collection |
+================+===========================================+
| bvc | training_collection_All |
+ +-------------------------------------------+
| | training_collection_None_Indexed_t1 |
+ +-------------------------------------------+
| | training_collection_None_Indexed_t2 |
| +-------------------------------------------+
| | training_collection_Update_info |
| +-------------------------------------------+
| | errors_training |
+----------------+-------------------------------------------+
"""
from urllib.request import urlopen
from datetime import datetime

from pymongo import MongoClient
from bs4 import BeautifulSoup
import glob
import re
import sys
import time

import Crawl_Records
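
# MongoDB setup (database ``bvs``). See the module docstring for the full list of
# collections: *_t1 holds the previous snapshot of non-indexed records, *_t2 the
# freshly crawled one, and document_compare() diffs the two.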
client = MongoClient('localhost:27017')
db = client.bvs
collection_all = db.training_collection_All
collection_None_Indexed_t1 = db['training_collection_None_Indexed_t1']
collection_None_Indexed_t2 = db['training_collection_None_Indexed_t2']
collection_Update_info = db.training_collection_Update_info
errors = db.errors_training


def save_to_mongo_updated_info(id, type, db):
    """Saves the data (_id, type, db and date) in the MongoDB *database bvs*, collection *training_collection_Update_info*.

    :param id: Article document's id.
    :type id: string
    :param type: Either ``new`` or ``update``, depending on whether the article is new or just being updated.
    :type type: string (new or update)
    :param db: The name of the article's database (LILACS or IBECS).
    :type db: string
    :returns: Nothing to return.

    .. note:: The date is saved automatically as the current date obtained by ``datetime.utcnow()``.
    """
    date = datetime.utcnow()
    dictionary = dict({'_id': id, 'type': type, 'db': db, 'parsing_date': date})
    collection_Update_info.insert_one(dictionary)
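
# Usage sketch (values are examples from the docstrings):
#   save_to_mongo_updated_info('biblio-986217', 'new', 'LILACS')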


def download_document(id):
    """Downloads a single *article document in **xml** format* by the **id of the article**.

    :param id: Article's id (Ex: biblio-986217).
    :type id: string
    :returns: url, document (the article document downloaded by id; an HTTP response whose body is XML).
    :rtype: string, http.client.HTTPResponse
    """
    base_url = 'http://pesquisa.bvsalud.org/portal/?output=xml&lang=en&from=&sort=&format=&count=&fb=&page=1&q=id%3A'
    url = base_url + id
    document = urlopen(url)
    time.sleep(10)  # Be polite to the server between requests.
    return url, document
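
# Usage sketch (requires network access; the id is an example from the docstring):
#   url, response = download_document('biblio-986217')
#   bsObj = BeautifulSoup(response, features='lxml')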


def find_id_by_alternate_id(alternate_id):
    """Obtains an article's **id** from its **alternate id** by looking the document up on the portal.
    If a normal id is passed, the same id is returned.

    :param alternate_id: Article's alternate id (Ex: biblio-986217).
    :type alternate_id: string
    :returns: Article's id.
    :rtype: string (Ex: biblio-1001042)
    """
    base_url = 'http://pesquisa.bvsalud.org/portal/resource/en/'
    url = base_url + alternate_id
    content = urlopen(url)
    bsObj = BeautifulSoup(content, 'html.parser')
    # Get the text of the element whose class is "data", which contains the id.
    data_string = (bsObj.find(attrs={'class': 'data'})).text
    # Extract the id that follows the "ID:" label.
    found_object = re.search(r"ID:\s*(\S+)", data_string)
    doc_id = found_object.group(1).strip()
    time.sleep(10)  # Be polite to the server between requests.
    return doc_id
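
# Usage sketch (requires network access; ids are examples from the docstring):
#   find_id_by_alternate_id('biblio-986217')   # -> the article's id, Ex: 'biblio-1001042'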


def xml_to_dictionary(document_xml):
    """Converts an **xml document** into **dictionary (json) format**. The method is only for BVSalud **LILACS** or **IBECS** articles.

    :param document_xml: A **single article document** in **xml** format.
    :type document_xml: xml
    :returns: A **single article document** in **dictionary (json)** format.
    :rtype: dictionary/json
    """
    document_dict = dict()
    document_values = ['id', 'type', 'ur', 'au', 'afiliacao_autor', 'ti_es', 'ti_pt',
                       'ti_en', 'ti', 'fo', 'ta', 'is', 'la', 'cp', 'da', 'ab_pt', 'ab_en', 'ab_es', 'ab',
                       'entry_date', '_version_', 'ct', 'mh', 'sh', 'cc', 'mark_ab_es', 'mark_ab_pt', 'mark_ab_en',
                       'db', 'alternate_id', 'update_date']
    for code in document_values:
        try:
            value = document_xml.find(attrs={'name': code})  # Find the value by code. If it doesn't exist, this returns None.
            if not value:  # Check whether the value is None.
                document_dict[code] = value  # Save the value to the dictionary with code as key. In this case it must be None.
            elif code == 'da':
                try:
                    document_dict[code] = datetime.strptime(value.text[:6], '%Y%m')  # 'da' is usually a YYYYMM string.
                except (ValueError, TypeError):
                    try:
                        document_dict[code] = datetime.strptime(value.text[:4], '%Y')
                    except (ValueError, TypeError):
                        document_dict[code] = value.text
            elif code in ['entry_date', 'update_date']:
                try:
                    document_dict[code] = datetime.strptime(value.text, '%Y%m%d')
                except (ValueError, TypeError):
                    try:
                        document_dict[code] = datetime.strptime(value.text[:6], '%Y%m')
                    except (ValueError, TypeError):
                        try:
                            document_dict[code] = datetime.strptime(value.text[:4], '%Y')
                        except (TypeError, AttributeError, ValueError) as e:
                            errors.insert_one(dict(date_time=datetime.utcnow(),
                                                   doc_id=document_xml.find(attrs={"name": "id"}).text,
                                                   type_error='Extract and converting date format single doc',
                                                   detail_error=code,
                                                   exception_str=str(e)))
                            document_dict[code] = value.text
            else:
                children = value.findChildren()  # Find whether it has children; returns an empty list if not.
                if len(children) > 0:  # If children exist, extract the value(s) from them.
                    if code in ['type', 'ti_es', 'ti_pt', 'fo', 'cp', 'ab_pt', 'ab_en', 'ab_es', 'cc', 'ti_en', 'db', 'alternate_id']:
                        document_dict[code] = children[0].text  # Single-valued fields: take the first child.
                    elif code in ['ur', 'au', 'afiliacao_autor', 'ti', 'ta', 'is', 'la', 'ab', 'ct', 'mh', 'sh']:
                        strings_list = []  # Multi-valued fields: collect all children.
                        for child in children:
                            strings_list.append(child.text)
                        document_dict[code] = strings_list
                    elif code in ['mark_ab_es', 'mark_ab_pt', 'mark_ab_en']:
                        document_dict[code] = children[0].text
                else:
                    document_dict[code] = value.text
        except (TypeError, AttributeError) as e:
            errors.insert_one(dict(date_time=datetime.utcnow(),
                                   doc_id=document_xml.find(attrs={"name": "id"}).text,
                                   type_error='Extract field information from single doc',
                                   detail_error=code,
                                   exception_str=str(e)))
    document_dict['_id'] = document_dict.pop('id')
    document_dict['parsing_entry_date'] = datetime.utcnow()
    return document_dict
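
# Minimal usage sketch (assumes ``page.xml`` is a crawled BVSalud result page):
#   bsObj = BeautifulSoup(open('page.xml').read(), features='lxml')
#   record = xml_to_dictionary(bsObj.find('doc'))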


def parse_file(path_to_file, mode=None):
    """Parses a file and extracts all documents one by one,
    converting **each document** by calling the function **xml_to_dictionary**.
    Afterwards **all the documents** are saved one by one in the **MongoDB database**, as are all **errors**.

    :param path_to_file: The path of the file to be parsed.
    :type path_to_file: string (Ex: ./crawled/IBECS_LILACS_17072019_pg_1.xml)
    :param mode: If it receives **"compare"**, documents are saved into the time-2 collection;
        otherwise into the normal (time-1) collections. By default it is None.
    :type mode: string
    :returns: Nothing to return.
    """
    try:
        file = open(path_to_file)
        xml_content = file.read()
        bsObj = BeautifulSoup(xml_content, features='lxml')
        documents = bsObj.findAll("doc")
        try:
            for i, document_xml in enumerate(documents):
                document_dict = xml_to_dictionary(document_xml)
                document_dict['file'] = path_to_file
                if mode == "compare":
                    collection_None_Indexed_t2.insert_one(document_dict)
                else:
                    collection_all.insert_one(document_dict)
                    if document_dict['mh'] is None:  # Non-indexed documents also go into the t1 collection.
                        collection_None_Indexed_t1.insert_one(document_dict)
        except Exception as e:
            errors.insert_one(dict(date_time=datetime.utcnow(),
                                   doc_id=document_xml.find(attrs={"name": "id"}).text,
                                   type_error='Extract one <doc> from single XML file',
                                   detail_error=document_xml.find(attrs={"name": "id"}).text,
                                   exception_str=str(e)))
    except Exception as e:
        errors.insert_one(dict(date_time=datetime.utcnow(),
                               doc_id=path_to_file,
                               type_error='Extract multiple <doc> from single XML file',
                               detail_error=path_to_file,
                               exception_str=str(e)))
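
# Usage sketch (the first path is the docstring example; the second is hypothetical):
#   parse_file('./crawled/IBECS_LILACS_17072019_pg_1.xml')             # time-1 run
#   parse_file('./crawled_no_indexed/some_file.xml', mode='compare')   # time-2 run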


def document_compare():
    """Compares all non-indexed documents, *time1 against time2 and time2 against time1*.
    New documents are inserted into the main database and the others are updated by **id, mh, sh, alternate_id**,
    unless the time2 documents have **mh** as **None**.

    .. note:: It receives no parameters and returns nothing. It just compares the two non-indexed collections of time1 and time2.
    """
    all_ids_cursor_t2 = collection_None_Indexed_t2.find({}, {"_id": 1})
    all_ids_t2 = []
    for item in all_ids_cursor_t2:
        all_ids_t2.append(item['_id'])
    all_ids_cursor_t1 = collection_None_Indexed_t1.find({}, {"_id": 1})
    all_ids_t1 = []
    for item in all_ids_cursor_t1:
        all_ids_t1.append(item['_id'])
    i = 1
    # Pass 1: ids present in t2 but not in t1 are brand-new documents.
    for id_t2 in all_ids_t2:
        try:
            print("t2", i)
            i += 1
            document_t2 = collection_None_Indexed_t2.find_one({"_id": id_t2})
            document_t1 = collection_None_Indexed_t1.find_one({'_id': id_t2})
            if document_t1 is None:
                try:
                    print("Saving new Document", document_t2['_id'])
                    collection_all.insert_one(document_t2)
                    save_to_mongo_updated_info(id_t2, 'new', document_t2['db'])
                except (TypeError, AttributeError) as e:
                    errors.insert_one(dict(date_time=datetime.utcnow(),
                                           doc_id=id_t2,
                                           type_error='Save new none indexed document into mongo',
                                           detail_error=id_t2,
                                           exception_str=str(e)))
        except (TypeError, AttributeError) as e:
            errors.insert_one(dict(date_time=datetime.utcnow(),
                                   doc_id=id_t2,
                                   type_error='Save new none indexed document into mongo',
                                   detail_error=id_t2,
                                   exception_str=str(e)))
    i = 1
    # Pass 2: ids present in t1 but not in t2 have been indexed in the meantime; re-download and update them.
    for id_t1 in all_ids_t1:
        print("t1", i)
        i += 1
        document_t1 = collection_None_Indexed_t1.find_one({'_id': id_t1})
        document_t2 = collection_None_Indexed_t2.find_one({'_id': id_t1})
        if document_t2 is None:
            if document_t1['db'] == 'IBECS':
                doc_id_t1 = find_id_by_alternate_id(id_t1)
            else:
                doc_id_t1 = id_t1
            url, xml = download_document(doc_id_t1)
            bsObj = BeautifulSoup(xml, features='lxml')
            document_xml = bsObj.find('doc')
            if document_xml is not None:
                try:
                    document_dict = xml_to_dictionary(document_xml)
                    print("Updating a document: ", id_t1)
                    # Note: MongoDB treats '_id' as immutable, so this $set may raise;
                    # any exception is caught and logged in ``errors``.
                    collection_all.update_one({'_id': id_t1},
                                              {'$set':
                                               {'_id': document_dict['_id'],
                                                'alternate_id': document_dict['alternate_id'],
                                                'mh': document_dict['mh'],
                                                'sh': document_dict['sh'],
                                                'parsing_update_date': datetime.utcnow()
                                                }
                                               })
                    save_to_mongo_updated_info(document_dict['_id'], 'update', document_dict['db'])
                except Exception as e:
                    errors.insert_one(dict(date_time=datetime.utcnow(),
                                           doc_id=id_t1,
                                           type_error='Update information from single <doc>',
                                           detail_error=url,
                                           exception_str=str(e)))


def change_collections_name_mongo(old_name, new_name):
    """Changes the name of a collection; if the target name already exists, that collection is dropped.
    (Ex: bvs.training_collection_old -> bvs.training_collection_new)

    :param old_name: The collection's name, which will be replaced by the new one.
    :type: string
    :param new_name: A new name for the collection.
    :returns: Nothing to return.

    .. warning:: Do not pass a new_name equal to old_name; they must be different.
    """
    try:
        client.admin.command("renameCollection", old_name, to=new_name, dropTarget=True)
    except Exception:
        # The source collection may not exist yet (e.g. on the first run); ignore.
        pass
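
# Usage sketch (these exact names are used in main()):
#   change_collections_name_mongo("bvs.training_collection_All",
#                                 "bvs.old_training_collection_All")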


def process_dir_t2(path_to_dir):
    """Gets all files from a folder. The files are passed one by one to the method **parse_file** with the mode **"compare"**.

    :param path_to_dir: The path of the directory where all files in **xml** format are saved.
    :type: string (Ex: ./crawled_no_indexed/)
    :returns: Nothing to return.

    .. seealso:: Take a look at the method **parse_file** with **mode "compare"**;
        it will help you handle this method better.
    """
    files = glob.glob(path_to_dir + '*.xml')
    for file in files:
        print("file", file)
        parse_file(file, "compare")


def process_dir_t1(path_to_dir):
    """Gets all files from a folder. The files are passed one by one to the method **parse_file** without any mode (None).

    :param path_to_dir: The path of the directory where all files in **xml** format are saved.
    :type: string (Ex: ./crawled/)
    :returns: Nothing to return.

    .. seealso:: Take a look at the method **parse_file** with **mode "compare"**;
        it will help you handle this method better.
    """
    files = glob.glob(path_to_dir + '*.xml')
    for file in files:
        print("file", file)
        parse_file(file)


def find_new_documents():
    """Finds records that exist on the web portal but are missing from MongoDB, and inserts them.

    It compares the record count on the portal with the count in **collection_all**, then walks
    the result pages one record at a time until all missing records have been found and inserted.
    """
    print("Finding all missing records:")
    num_records_in_mongo = collection_all.count_documents({})
    num_records_in_web = Crawl_Records.get_records("all", "count")
    data_name, base_url, folder_to_save = Crawl_Records.make_base_url("all")
    folder_to_save = './crawled_lost_found/'
    date = datetime.utcnow().strftime('%d%m%Y')
    num_missing_records = num_records_in_web - num_records_in_mongo
    print("Total records: ", num_records_in_web, "\nRecords in MongoDB: ", num_records_in_mongo)
    print("Missing records: ", num_missing_records)
    num_found_records = 0
    # num_missing_records = 10  # Uncomment to cap a test run at 10 records.
    i = 1
    while num_found_records < num_missing_records:
        print(i)
        url = Crawl_Records.make_url(base_url, i, 1, i)
        xml_content = Crawl_Records.urlopen(url)
        bsObj = BeautifulSoup(xml_content, features='lxml')
        document_xml = bsObj.find("doc")
        doc_id = document_xml.find(attrs={'name': 'id'}).text
        exist = collection_all.find_one({'_id': doc_id})
        if not exist:
            document_dict = xml_to_dictionary(document_xml)
            num_found_records += 1
            print("Found: ", num_found_records, "id:", document_dict['_id'])
            try:
                collection_all.insert_one(document_dict)
                save_to_mongo_updated_info(document_dict['_id'], 'new_indexed', document_dict['db'])
            except Exception as e:
                errors.insert_one(dict(date_time=datetime.utcnow(),
                                       doc_id=document_dict['_id'],
                                       type_error='Insert new while finding lost documents',
                                       detail_error=url,
                                       exception_str=str(e)))
        i += 1


def main(arguments):
    """The main method just calls all the other methods. It receives an argument, but it is not required.
    If it receives the argument **"first_time"**, it will download all documents and parse them to save in MongoDB.
    Otherwise it will just download the non-indexed documents to be compared with the ones already existing.

    :param arguments: The command-line argument list; arguments[1] tells whether the program is being executed for the first time.
    :type: list
    :returns: Nothing to return.

    .. note:: If the program is being executed for the first time, you must pass the argument **first_time**. Otherwise no argument is needed.
        First time: *python parse_xml_new_and_update.py first_time*
        Otherwise: *python parse_xml_new_and_update.py*
    """
    if len(arguments) == 2:
        if arguments[1] in ['first_time', "all"]:
            Crawl_Records.get_records("all")
            change_collections_name_mongo("bvs.training_collection_None_Indexed_t2", "bvs.old_training_collection_None_Indexed_t2")
            change_collections_name_mongo("bvs.training_collection_All", "bvs.old_training_collection_All")
            change_collections_name_mongo("bvs.training_collection_None_Indexed_t1", "bvs.old_training_collection_None_Indexed_t1")
            process_dir_t1("./crawled/")
        else:
            print("\n\tError:\tWrong argument.")
            print("\n\tfirst_time:\tIf you are running it for the first time.")
            print("\n\t\t\tOtherwise you don't have to pass any argument.")
            print("\n\tall:\tIf you want to download everything again.")
    elif len(arguments) == 1:
        Crawl_Records.get_records("none_indexed_ibecs")
        Crawl_Records.get_records("none_indexed_lilacs")
        change_collections_name_mongo("bvs.training_collection_None_Indexed_t2", "bvs.training_collection_None_Indexed_t1")
        process_dir_t2('./crawled_no_indexed/')
        document_compare()
    else:
        print("\nError: Wrong argument.\n")
        print("\tfirst_time: If you are running it for the first time.\n\t\tOtherwise do not pass any argument.")


if __name__ == '__main__':
    main(sys.argv)