-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbring_data_from_url.py
79 lines (76 loc) · 4.13 KB
/
bring_data_from_url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# We use the requests module to connect to the internet and fetch each article URL.
# We use BeautifulSoup to parse the HTML that comes back from each request.
# We use pandas to organize the information and save it to a CSV file.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from clear_data_fun import clear_data
def _fetch_article_text(url, timeout=5):
    """Download one page and return the lower-cased text of its main content.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The text found inside ``<div id="bodyContent">``, lower-cased.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``<div id="bodyContent">`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of scraping an error page
    # as if it were a real article.
    response.raise_for_status()
    # Keep only the <div> whose id is 'bodyContent'.
    body = BeautifulSoup(response.text, "html5lib").find("div", {"id": "bodyContent"})
    if body is None:
        # find() returns None when nothing matches; report which URL failed
        # rather than crashing later with an opaque AttributeError.
        raise ValueError("no <div id='bodyContent'> found in {}".format(url))
    return body.get_text().lower()


def bring_data(output_path='DATA.csv', timeout=5):
    """Scrape the configured articles and save them as a labelled CSV file.

    Every URL listed below is downloaded, its body text is extracted and
    passed through ``clear_data`` (imported from ``clear_data_fun``; per the
    original author's note it prepares the text for classification), and
    the result is stored together with its category name.

    The CSV has the columns ``the_article``, ``Class`` and ``label`` (plus
    the DataFrame index written by ``to_csv``). ``label`` is the position of
    the category in the list below: 0 for 'Art', 1 for 'Science'.

    Args:
        output_path: Where to write the CSV file. Defaults to 'DATA.csv'.
        timeout: Per-request timeout in seconds. Defaults to 5.

    Raises:
        requests.RequestException: If a page cannot be downloaded.
        ValueError: If a downloaded page lacks the expected content element.
    """
    # (category name, article URLs) pairs. Any number of URLs can be added to
    # a category; the order of the pairs determines the numerical labels.
    sources = [
        ('Art', [
            'https://en.wikipedia.org/wiki/Art',
            'https://en.wikipedia.org/wiki/The_arts',
            'https://en.wikipedia.org/wiki/Drawing',
            'https://en.wikipedia.org/wiki/Graphic_design',
            'https://en.wikipedia.org/wiki/Paint',
            'https://en.wikipedia.org/wiki/Song',
            'https://en.wikipedia.org/wiki/Singing',
            'https://en.wikipedia.org/wiki/Music',
            'https://en.wikipedia.org/wiki/Rhythm',
            'https://en.wikipedia.org/wiki/Artist',
            'https://en.wikipedia.org/wiki/Disco',
            'https://en.wikipedia.org/wiki/Hip-hop_dance',
            'https://en.wikipedia.org/wiki/Art_museum',
            'https://en.wikipedia.org/wiki/Moscow_Museum_of_Modern_Art',
            'https://en.wikipedia.org/wiki/Hand-colouring_of_photographs',
            'https://en.wikipedia.org/wiki/Performance_art',
            'https://en.wikipedia.org/wiki/Magic_(illusion)',
            'https://en.wikipedia.org/wiki/Poetry_reading',
            'https://en.wikipedia.org/wiki/Casting_(performing_arts)',
            'https://en.wikipedia.org/wiki/Writing',
        ]),
        ('Science', [
            'https://en.wikipedia.org/wiki/Science',
            'https://en.wikipedia.org/wiki/Mathematics',
            'https://en.wikipedia.org/wiki/Computer_program',
            'https://en.wikipedia.org/wiki/Programmer',
            'https://en.wikipedia.org/wiki/Engineering',
            'https://en.wikipedia.org/wiki/Mechatronics',
            'https://en.wikipedia.org/wiki/Robotics',
            'https://en.wikipedia.org/wiki/Physics',
            'https://en.wikipedia.org/wiki/Algorithm',
            'https://en.wikipedia.org/wiki/Computer_science',
            'https://en.wikipedia.org/wiki/Engineer',
            'https://en.wikipedia.org/wiki/Robot',
            'https://en.wikipedia.org/wiki/Motion_planning',
            'https://en.wikipedia.org/wiki/Mechanical_engineering',
            'https://en.wikipedia.org/wiki/Statistics',
            'https://en.wikipedia.org/wiki/Data_science',
            'https://en.wikipedia.org/wiki/Machine',
            'https://en.wikipedia.org/wiki/Electricity',
            'https://en.wikipedia.org/wiki/Engine',
            'https://en.wikipedia.org/wiki/Applied_mathematics',
        ]),
    ]

    # One [cleaned_text, category] row per article, in listing order.
    rows = []
    for category, urls in sources:
        for url in urls:
            text = _fetch_article_text(url, timeout=timeout)
            rows.append([clear_data(text), category])

    # Create the pandas DataFrame.
    df = pd.DataFrame(rows, columns=['the_article', 'Class'])
    # Numerical label = position of the category in `sources`, so the mapping
    # stays correct if categories are added or renamed.
    label_of = {category: index for index, (category, _) in enumerate(sources)}
    df['label'] = df['Class'].map(label_of)
    # Save the data to a CSV file (index column included, as before).
    df.to_csv(output_path)