-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathApache_dataset_crawler.py
198 lines (165 loc) · 9.48 KB
/
Apache_dataset_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# Imports requests library, is used to read the contents of a webpage
import requests
# Imports os library to read/write files from the OS
import os
# Import lxml library, used to make sense of (HTML structured) webpage contents
from lxml import html
# TODO: The total number of files is correct, but the system still fails; maybe repair it (or do not include it in the final public repository).
# Root page of the Apache Lucene java-user mailing list archive
_ARCHIVE_URL = "http://mail-archives.apache.org/mod_mbox/lucene-java-user/"
# Months of 2011 that are left out of the dataset (everything after July 2011).
# The dataset should correlate with the dataset as provided on AWS:
# https://aws.amazon.com/datasets/apache-software-foundation-public-mail-archives/
# The Amazon dataset does not regard files between July 12 and August 1st,
# but for simplicity's sake these files are included here.
_EXCLUDED_MONTHS = frozenset(
    {"Aug 2011", "Sep 2011", "Oct 2011", "Nov 2011", "Dec 2011"}
)


def _fetch_tree(address, lenient=False):
    """Download ``address`` and return the root element of the parsed HTML.

    When ``lenient`` is true, bytes that are not valid UTF-8 are dropped
    before the page is handed to the HTML parser.
    """
    body = requests.get(address).content
    if lenient:
        body = body.decode("utf-8", "ignore").encode("utf-8")
    return html.fromstring(body)


def _first_text(node, query, default=""):
    """Return the first result of ``query`` on ``node``, or ``default`` if there is none."""
    matches = node.xpath(query)
    return matches[0] if matches else default


def _page_suffixes(month_root):
    """Return the query-string suffixes of all message sections of a month page.

    The messages are featured in sections of 50 emails per page. One suffix
    ("?0", "?1", ...) is produced per page link found in the message list
    header; a month without page links has a single section, addressed by
    the empty suffix.
    """
    page_links = month_root.xpath(
        '//table[@id="msglist"]/thead/tr/th[@class="pages"]/a/text()'
    )
    return ["?" + str(number) for number in range(len(page_links))] or [""]


def _download_email(email_url, file_path):
    """Fetch a single email page and write it to ``file_path`` as plain text.

    The file consists of a FROM, SUBJECT and DATE line followed by the body.
    """
    email_table = _fetch_tree(email_url, lenient=True).xpath(
        '//table[@id="msgview"]'
    )[0]
    date = email_table.xpath(
        'tbody/tr[@class="date"]/td[@class="right"]/text()'
    )[0].split(" ")
    # Day and year go through int() so that e.g. a leading zero is dropped
    date_formatted = str(int(date[1])) + " " + date[2] + " " + str(int(date[3]))
    # A header cell or body without any text yields no xpath result; fall back
    # to an empty string instead of aborting the whole crawl with an IndexError
    sender = _first_text(
        email_table, 'tbody/tr[@class="from"]/td[@class="right"]/text()'
    )
    subject = _first_text(
        email_table, 'tbody/tr[@class="subject"]/td[@class="right"]/text()'
    )
    contents = _first_text(
        email_table, 'tbody/tr[@class="contents"]/td/pre/text()'
    )
    # Explicit encoding: emails may contain non-ASCII characters, which the
    # platform default encoding is not guaranteed to support.
    # The context manager closes the file even if writing fails.
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(
            [
                "FROM: " + str(sender) + "\n",
                "SUBJECT: " + str(subject) + "\n",
                "DATE: " + date_formatted + "\n",
                contents,
            ]
        )


def _store_email(email_row, month_link, path_to_directory):
    """Save the email listed in ``email_row`` under its author's mbox folder.

    The crawled dataset is subdivided by the authors of the emails (such as
    the ENRON dataset): each email ends up in
    ``<path_to_directory>/<author>/mbox/<email name>.txt``.
    Rows without a link and emails that were saved before are skipped.
    """
    email_path = email_row.xpath('td[@class="subject"]/a/@href')
    # Some emails are not archived, so only the name of the email is mentioned
    # without a link to the actual email; these emails are disregarded
    if not email_path:
        return
    email_path = email_path[0]
    # "/" would be interpreted as a path separator in the folder name
    author = str(email_row.xpath('td[@class="author"]/text()')[0]).replace(
        "/", "-"
    )
    mbox_folder = os.path.join(path_to_directory, author, "mbox")
    try:
        # Creates both the author folder and its mbox folder when missing
        os.makedirs(mbox_folder, exist_ok=True)
    except OSError as e:
        # Without the folder the email cannot be stored; report and move on
        print(e)
        return
    # Some characters are escaped in the link, these correspond with the
    # URL-encoded characters "<" and ">" and are removed from the file name
    email_name = (
        email_path.replace("/", "-").replace("%3c", "").replace("%3e", "")
    )
    file_path = os.path.join(mbox_folder, email_name + ".txt")
    # Compare against the name the file is actually stored under (including
    # the ".txt" extension), so already crawled emails are not fetched again
    if os.path.exists(file_path):
        return
    _download_email(_ARCHIVE_URL + month_link + "/" + email_path, file_path)


def main():
    """Crawl the Apache Lucene java-user mailing list archive (2001 - July 2011).

    Every archived email is written to
    ``<cwd>/Datasets/ApacheLucene-java-user/<author>/mbox/<email name>.txt``.
    The name of each month is printed when its crawl starts, and "Finished"
    is printed at the end.
    """
    path_to_directory = os.path.join(
        os.getcwd(), "Datasets", "ApacheLucene-java-user"
    )
    # The dataset folder may not exist yet on a first run
    os.makedirs(path_to_directory, exist_ok=True)

    root = _fetch_tree(_ARCHIVE_URL)
    # The tables and rows are visited in reversed page order
    for year_table in reversed(root.xpath('//table[@class="year"]')):
        year = int(year_table.xpath("thead/tr/th/text()")[0].split(" ")[1])
        # Only the years 2001-2011 are part of the dataset
        if not 2001 <= year <= 2011:
            continue
        # 2001 does not have all months, and not all months of 2011 are used
        for month in reversed(year_table.xpath("tbody/tr")):
            name = month.xpath('td[@class="date"]/text()')[0]
            # Stops processing this year at the first excluded month
            if name in _EXCLUDED_MONTHS:
                break
            # Prints the name of the month to show the user the progress
            print(name)
            # Link to the page listing all emails of this specific month
            link = month.xpath('td[@class="links"]/span/a/@href')[0]
            month_link = link.split("/")[0]
            thread_url = _ARCHIVE_URL + link
            # Loops through all 50-email sections of the month
            for suffix in _page_suffixes(_fetch_tree(thread_url)):
                section_root = _fetch_tree(thread_url + suffix, lenient=True)
                for email in section_root.xpath(
                    '//table[@id="msglist"]/tbody/tr'
                ):
                    _store_email(email, month_link, path_to_directory)
    print("Finished")
# Starts the crawl. NOTE: this call is not guarded by `if __name__ == "__main__":`,
# so the crawl also starts when this file is imported as a module.
main()