-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathextract_csv_data.py
121 lines (90 loc) · 2.69 KB
/
extract_csv_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 11 17:30:30 2018
@author: Sahit
"""
# Selection rule: whatsapp AND (police OR arrest OR crime OR jail OR lynch OR kill OR fire OR riots OR hate OR death OR dead OR FIR)
import pandas as pd
import os
import nltk
import re
# Directory that holds the scraped article text files.
path = '/Users/Sahit/Documents/GitHub/Article-Analysis/webscrapper/webscrapper/article_data'

# NOTE(review): `folder` is not read again in the visible part of this script;
# nltk.data.find presumably serves as an early existence check on the
# directory (it raises LookupError for a missing resource) - confirm.
folder = nltk.data.find(path)

dirListing = os.listdir(path)

# Keep only names that really end in ".txt": the processing loop further down
# extracts the article number with a regex anchored on that suffix, so names
# such as "x.txt.bak" could never be handled there.
# The '/' separator is deliberate: the rest of the script works with
# '/'-joined paths.
editFiles = [path + '/' + item for item in dirListing if item.endswith('.txt')]

# Parenthesised form prints the same value under Python 2 and Python 3.
print(len(editFiles))

# The links file is split on '--'; the pieces are later indexed by
# (article number - 1). The context manager closes the handle, which the
# previous open()/read() rebinding never did.
with open('article_links_final.txt', 'r') as links_file:
    g = links_file.read().split('--')
def get_dictionary_word_list(filepath):
    """Return the lower-cased, whitespace-separated tokens of a text file.

    filepath -- path of the file to read.

    Returns a list of strings; an empty file yields an empty list.
    """
    # Read everything inside the context manager so the handle is closed
    # before any further processing happens.
    with open(filepath) as handle:
        raw_text = handle.read()

    lowered = raw_text.lower()
    # split() without arguments splits on any run of whitespace.
    return lowered.split()
def get_keywords(text):
    """Collect the filter keywords present in ``text`` plus a match flag.

    text -- any container supporting ``in``. The caller in this file passes
            the list of lower-cased tokens of an article, so each test is an
            exact-token membership check (with a plain string it would be a
            substring check instead).

    Returns a list whose last element is the flag (1 or 0) and whose leading
    elements are the keywords found, 'whatsapp' first. The flag is 1 only
    when 'whatsapp' is present together with at least one secondary term;
    without 'whatsapp' the result is ``[0]``.
    """
    secondary_terms = ('police', 'arrest', 'crime', 'jail', 'lynch', 'kill',
                       'fire', 'riots', 'hate', 'death', 'dead', 'fir')

    # Secondary terms only count when the article mentions whatsapp at all.
    if 'whatsapp' not in text:
        return [0]

    matched = [term for term in secondary_terms if term in text]
    found_any = 1 if matched else 0
    return ['whatsapp'] + matched + [found_any]
# Build one row per article that passes the keyword filter and write the rows
# to data_nec.csv.

# Compiled once, outside the loop.
_id_pattern = re.compile(r'(\d+)\.txt$')         # trailing article number
_place_pattern = re.compile(r'([a-zA-Z]+):')     # "<place>:" in the first token
_date_pattern = re.compile(r'^(\d+)-(\d+)-(\d+)T')  # file name starts "Y-M-DT"

data = []
# Counts the files that went through the loop.
# NOTE(review): `t` is not read anywhere in the visible script - confirm it is
# still needed.
t = 0
for filepath in editFiles:
    id_match = _id_pattern.search(filepath)
    if id_match is None:
        # Without a trailing article number there is no way to look up the
        # link, so the file is skipped instead of crashing on None.group().
        continue
    # Article numbers are 1-based; the link list is 0-based.
    filepath_num = int(id_match.group(1)) - 1
    link = g[filepath_num]

    content = get_dictionary_word_list(filepath)

    # The place is taken from a "<letters>:" prefix in the first token;
    # an empty file or a missing prefix yields 'NA'.
    place_match = _place_pattern.search(content[0]) if content else None
    place = place_match.group(1) if place_match else 'NA'

    # The date is encoded at the start of the file name. Matching on the base
    # name avoids hard-coding the data directory a second time.
    date_match = _date_pattern.match(os.path.basename(filepath))
    if date_match:
        year, month, day = date_match.groups()
    else:
        year = month = day = 'NA'
    date = '/'.join((year, month, day))

    # get_keywords returns the matched keywords followed by the flag.
    keyword_result = get_keywords(content)
    flag = keyword_result[-1]
    keywords = keyword_result[:-1]

    entry = {}
    entry['link'] = link
    entry['year'] = year
    entry['month'] = month
    entry['day'] = day
    entry['date'] = date
    entry['place'] = place
    entry['id'] = filepath_num + 1
    entry['publisher'] = 'TOI'
    entry['filename'] = filepath
    entry['flag'] = flag
    entry['keywords'] = keywords

    # Only articles that satisfied the keyword filter are exported.
    if flag == 1:
        data.append(entry)
    t += 1

article_df = pd.DataFrame(data)
article_df.to_csv('data_nec.csv')