-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconcept_articles.py
141 lines (124 loc) · 5.22 KB
/
concept_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Event registry summary statistics
#imports
import numpy as np
import math
import json
from collections import Counter
import time
from eventregistry import *
from datetime import date, datetime
# Confirm on the console that every import above succeeded.
print('imports completed')

# Reference point for the elapsed-time messages printed further down.
startTime = datetime.now()

# How many event files are read; swap in the commented value for a short run.
file_counter = 876  # all files
# file_counter = 10  # realistic
total_files = 1000 * file_counter

# Running positions: index of the file being read and index of the event
# being read (the event index keeps counting across files).
files, event = 0, 0

# Accumulators filled while the event files are processed.
concept_main_list = {}
output = []
# Largest un-reduced hash value produced by hashing() so far.
max_sum = 0


def hashing(word, modulus=2000000):
    """Return a position-weighted character hash of ``word``.

    Each character's code point is multiplied by its 1-based position and the
    products are added up; the total is then reduced modulo ``modulus``.

    Side effect: the module-level ``max_sum`` is raised to the un-reduced
    total whenever that total exceeds the current ``max_sum``.

    Args:
        word: the string to hash.
        modulus: size of the hash range; defaults to 2000000.

    Returns:
        An integer in ``range(modulus)``; 0 for the empty string.
    """
    global max_sum
    # `total` (not `sum`) so the builtin stays usable inside the function.
    total = sum(ord(char) * position for position, char in enumerate(word, 1))
    if total > max_sum:
        max_sum = total
    return total % modulus
def list_check(list, index, default=0):
    """Return ``list[index]``, or ``default`` when the index is out of range.

    Only ``IndexError`` is handled; any other error raised by the lookup
    (for example ``TypeError`` for a non-integer index) propagates.

    Args:
        list: the sequence to read from. The name shadows the builtin but is
            kept so existing keyword callers continue to work.
        index: position to read; negative indexes count from the end.
        default: value returned when ``index`` is out of range (0 if omitted).

    Returns:
        The element at ``index``, or ``default``.
    """
    try:
        return list[index]
    except IndexError:
        return default
#count = 0
#while count < 2000000: #initialise the hash table
# concept_main_list.append(0)
# count = count + 1
# Load the per-word weight table used when scoring summaries below; it is
# indexed by word and each value is multiplied into that word's score
# (presumably inverse document frequencies, going by the name -- confirm
# against the script that writes concept_hash_table.json).
# The with-block closes the file by itself, so no explicit close() is needed.
with open("concept_hash_table.json", "r") as idf_file:
    idf = json.load(idf_file)
def _count_summary_words(summary_text):
    """Count the words of one event summary.

    The text is split on whitespace and every '.', ',' and '"' character is
    removed from each piece before counting, so a piece made up only of those
    characters is counted under the empty string.

    Args:
        summary_text: the summary string of one event.

    Returns:
        A Counter mapping each cleaned word to its number of occurrences.
    """
    return Counter(
        piece.replace(".", "").replace(",", "").replace("\"", "")
        for piece in summary_text.split()
    )


# Read the event files one by one. For every event that carries an English
# summary, store one score per distinct word in concept_main_list, laid out
# as {word: {event uri: score}}, where
#     score = idf[word] * occurrences of word / total words in the summary.
while files < file_counter:
    # One list object per file, attached to every event that has 'concepts';
    # nothing appends to it, so it stays empty.
    concept_list = []
    print('events-00' + '{0:03d}'.format(files) + '000.json')
    print(datetime.now() - startTime)
    print(str(round(float(files) / float(file_counter), 4) * 100) + '%')
    with open('events/events-00' + "{0:03d}".format(files) + '000.json') as data_file:
        data = json.load(data_file)

    # `event` is not reset between files: records are looked up by their
    # running index, so this file is expected to hold the keys from the
    # current `event` up to (excluding) `count`. A missing key raises KeyError.
    count = len(data) + (files * 1000)
    while event < count:
        event_dict = {}
        print(event)
        the_event = data[str(event)]  # the_event refers to one event
        if 'info' in the_event:  # if the event isn't a merge with another event
            the_event = the_event['info']
            if 'uri' in the_event:
                event_dict['ID'] = the_event['uri']
            # Story fields and individual concept labels are not extracted.
            if 'concepts' in the_event:
                event_dict['concepts'] = concept_list
            if 'eventDate' in the_event and the_event['eventDate'] != "":
                # The parsed date itself is not used afterwards, but the
                # parse still raises ValueError on a malformed date string.
                event_date = datetime.strptime(the_event['eventDate'], "%Y-%m-%d").date()
                event_dict['event_date'] = the_event['eventDate']
            if 'multiLingInfo' in the_event and 'eng' in the_event['multiLingInfo']:
                word_list = _count_summary_words(the_event['multiLingInfo']['eng']['summary'])
                # Term-frequency denominator: the number of words in this
                # summary (previously derived from the event counter by
                # mistake, which skewed every score).
                word_count = sum(word_list.values())
                for word in word_list:
                    print(word)
                    print(word_list[word])
                    # idf[word] raises KeyError for a word absent from the
                    # table; event_dict['ID'] does so for an event without uri.
                    concept_main_list.setdefault(word, {})[event_dict['ID']] = (
                        idf[word] * float(word_list[word]) / float(word_count)
                    )
            # NOTE(review): the original nesting of this append could not be
            # recovered; it is assumed that merged events are not recorded.
            output.append(event_dict)
        event = event + 1
    files = files + 1  # increment the file counter
# Restart the timer for the save / reload / lookup phase.
# NOTE(review): because of this reset the closing 'This took' message does
# not include the event-processing loop -- confirm that this is intended.
startTime = datetime.now()

# Persist the word -> {event uri: score} table. json.dump streams straight
# into the file instead of building the whole document as one string first;
# the with-block closes the file, so no explicit close() is needed.
with open("concept_articles.json", "w") as myfile:
    json.dump(concept_main_list, myfile, indent=0)

# Read the table back from disk so the lookup below uses the saved copy.
with open("concept_articles.json", "r") as myfile:
    reading = json.load(myfile)

# Interactive lookup of a single word. A word that is not in the table is
# reported instead of raising KeyError, so the summary lines below still print.
word = raw_input('What is the word? ')
if word in reading:
    print(reading[word])
else:
    print('No articles found for ' + repr(word))

print('There are ' + str(event) + ' events.')
#print json.dumps(output, indent=4)
print('This took ' + str(datetime.now() - startTime) + ' to run')