-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot_bigrams.py
98 lines (83 loc) · 2.78 KB
/
plot_bigrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
import csv
import pickle
import nltk
import collections
import random
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
# Lemmatizer instance shared by every word_plot() call.
wnlemma = WordNetLemmatizer()

# Extra tokens to drop in addition to the stop words. They look like the
# fragments left behind when possessives/contractions ("it's", "don't")
# are split by the tokenizers used in word_plot -- presumably that is why
# they are listed here.
other_ignore = [
    "'s",
    "s",
    "t",
    "'t",
]

# NLTK's English stop-word list, stored as a set for fast membership tests.
common_words = set(stopwords.words('english'))
def word_plot(text, num_tweets, with_punc=False, bigram=False, graph=False):
    """Print (and optionally plot) the most frequent tokens in *text*.

    The text is tokenized, optionally turned into bigrams, lemmatized with
    the module-level ``wnlemma`` and counted. Tokens found in the
    module-level ``common_words`` or ``other_ignore`` are skipped. The 30
    most frequent tokens are printed as ``token: count / num_tweets``.

    Args:
        text: The whole corpus as a single string.
        num_tweets: Divisor used to normalise each count (the callers pass
            the number of tweets that make up *text*).
        with_punc: If true, tokenize with ``nltk.tokenize.word_tokenize``;
            otherwise split on runs of non-word characters.
        bigram: If true, count adjacent token pairs joined as ``"a-b"``
            instead of single tokens. A pair is dropped when either member
            is an ignored token.
        graph: If true, also show a bar chart of the (up to) 20 most
            frequent tokens.

    Returns:
        None. Results are written to stdout / shown with matplotlib.

    Raises:
        ZeroDivisionError: If *num_tweets* is 0 while at least one token
            was counted.
    """
    if with_punc:
        split_words = word_tokenize(text)
    else:
        # Raw string avoids the invalid "\W" escape warning. re.split yields
        # '' when the text starts or ends with a non-word character; those
        # empty strings are not words, so they are discarded.
        split_words = [tok for tok in re.split(r'\W+', text) if tok]

    if bigram:
        lowered = [tok.lower() for tok in split_words]
        # Filter on the raw (un-lemmatized) members, as the pair is only
        # lemmatized as a whole in the counting loop below.
        split_words = [
            a + "-" + b
            for a, b in nltk.bigrams(lowered)
            if a not in common_words and a not in other_ignore
            and b not in common_words and b not in other_ignore
        ]

    word_dict = collections.Counter()
    for word in split_words:
        word = wnlemma.lemmatize(word.lower())
        if word not in common_words and word not in other_ignore:
            word_dict[word] += 1

    # Highest count first; ties keep first-seen order.
    most_common = word_dict.most_common(30)
    for word, freq in most_common:
        ratio = freq / num_tweets
        print(word + ": " + str(ratio))

    top = most_common[:20]
    # Skip plotting when nothing was counted: there would be no bars to draw.
    if graph and top:
        labels = [word for word, _ in top]
        freq_vals = [freq / num_tweets for _, freq in top]
        # Size the x axis from the data so fewer than 20 tokens still plot.
        x = np.arange(len(top))
        plt.subplots()
        plt.bar(x, freq_vals)
        plt.xticks(x, labels)
        plt.show()
# Load every row of both datasets. Each row is a dict keyed by the CSV
# header; only the 'text' column is used below.
with open('russian-troll-tweets/tweets.csv', newline='') as csvfile:
    all_bot_tweets = list(csv.DictReader(csvfile))
num_bot_tweets = len(all_bot_tweets)

with open('organic_tweets.csv', newline='') as csvfile:
    all_ppl_tweets = list(csv.DictReader(csvfile))
num_ppl_tweets = len(all_ppl_tweets)

# Build one corpus string per dataset. The tweets are joined with a space
# so the last word of one tweet cannot fuse with the first word of the next
# (plain concatenation turned "...world" + "foo..." into "worldfoo").
all_bot_tweet_text = " ".join(b_tweet['text'] for b_tweet in all_bot_tweets)
all_ppl_tweet_text = " ".join(p_tweet['text'] for p_tweet in all_ppl_tweets)

# Report the most frequent tokens for each dataset under three settings:
# regex word split, NLTK tokenization, and bigrams over the regex split.
print("\nBots (no punctuation)")
word_plot(all_bot_tweet_text, num_bot_tweets, with_punc=False, bigram=False, graph=False)
print("\nUsers (no punctuation)")
word_plot(all_ppl_tweet_text, num_ppl_tweets, with_punc=False, bigram=False, graph=False)

print("\nBots (with punctuation)")
word_plot(all_bot_tweet_text, num_bot_tweets, with_punc=True, bigram=False, graph=False)
print("\nUsers (with punctuation)")
word_plot(all_ppl_tweet_text, num_ppl_tweets, with_punc=True, bigram=False, graph=False)

print("\nBots (bigrams)")
word_plot(all_bot_tweet_text, num_bot_tweets, with_punc=False, bigram=True, graph=False)
print("\nUsers (bigrams)")
word_plot(all_ppl_tweet_text, num_ppl_tweets, with_punc=False, bigram=True, graph=False)