-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparser_geopeitus.py
64 lines (52 loc) · 2.27 KB
/
parser_geopeitus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Parsing of the Geopeitus "Tartu" RSS feed input
"""
import parsers_common
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the list of geocaches whose table row mentions "Tartu".

    Reads the rows of the first table inside ``div#t-content`` of
    ``pageTree`` and keeps only the rows whose stringified content contains
    the text "Tartu".

    Args:
        pageTree: parsed page object exposing ``xpath()`` (presumably an
            lxml tree -- confirm against the caller).
        domain: handed to ``parsers_common.domainUrls`` together with the
            scraped hrefs.
        maxPageURLstoVisit: not used by this parser; kept in the signature
            so existing callers keep working.

    Returns:
        dict with the keys "articleDescriptions", "articleIds",
        "articleImages", "articlePubDates", "articleTitles" and
        "articleUrls". Every value is a list with one entry per kept row,
        except "articleImages", which is always an empty list because this
        parser collects no images.
    """
    rowsXpath = '//div[@id="t-content"]/table[1]/tr'

    rawPubDates = pageTree.xpath(rowsXpath + '/td[1]/text()')
    titles = pageTree.xpath(rowsXpath + '/td[@class="left"]/b/a/text()')
    urls = pageTree.xpath(rowsXpath + '/td[@class="left"]/b/a/@href')
    urls = parsers_common.domainUrls(domain, urls)
    rows = pageTree.xpath(rowsXpath)  # whole rows, used as description source

    retArticleDescriptions = []
    retArticleIds = []
    retArticleImages = []  # intentionally left empty: no images are scraped
    retArticlePubDates = []
    retArticleTitles = []
    retArticleUrls = []

    # The four XPath results are independent queries, so their lengths are
    # not guaranteed to match. zip() stops at the shortest one instead of
    # raising IndexError on a ragged page.
    for row, url, title, rawPubDate in zip(rows, urls, titles, rawPubDates):
        description = parsers_common.stringify_children(row)

        # drop rows that do not mention the "Tartu" location
        if 'Tartu' not in description:
            continue

        retArticleDescriptions.append(description)
        # the last path segment of the URL serves as the unique id
        retArticleIds.append(url.split('/')[-1])
        # convert a date such as "12.12.2017" via the "%d.%m.%Y" format;
        # done only for kept rows, discarded rows need no conversion
        retArticlePubDates.append(
            parsers_common.rawToDatetime(rawPubDate, "%d.%m.%Y"))
        retArticleTitles.append(title)
        retArticleUrls.append(url)

    return {
        "articleDescriptions": retArticleDescriptions,
        "articleIds": retArticleIds,
        "articleImages": retArticleImages,
        "articlePubDates": retArticlePubDates,
        "articleTitles": retArticleTitles,
        "articleUrls": retArticleUrls,
    }