-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilterMergedData.py
82 lines (62 loc) · 2.39 KB
/
filterMergedData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Filter the merged data to reduce the number of calls to the Bluemix API to under 1000
import pandas
import ipdb
import json
import time
import bluemixhelper
import ConfigParser
configFile = "./keysconfig.txt"
outFile = "data/raw_bluemix_analysis.csv"
filename = "data/mergedData.csv" # the filtered one
def get_from_config(section, config_tags):
    """Read the requested option(s) of one section from the config file.

    The file named by the module-level ``configFile`` is parsed with
    ``ConfigParser`` on every call.

    Args:
        section: Name of the section to read from.
        config_tags: Either a list of option names, or a single option name.
            Only a ``list`` is treated as "several options"; any other value
            is looked up as one option name.

    Returns:
        A dict mapping each requested option name to the value stored for it
        in ``section``.
    """
    parser = ConfigParser.ConfigParser()
    parser.read(configFile)

    # Normalise to a list so a single code path builds the result.
    tags = config_tags if isinstance(config_tags, list) else [config_tags]
    return {tag: parser.get(section, tag) for tag in tags}
def filter_merged_data(df):
    """Return the rows of ``df`` that are commented Ruby pull requests.

    A row is kept only when all of the following hold:

    * ``gh_is_pr`` compares equal to ``True`` (the original author noted
      49803 rows passing this first condition on their data set),
    * ``gh_issue_comments`` is not null, i.e. the pull request has comments,
    * ``gh_lang`` equals ``'ruby'``.

    Args:
        df: DataFrame holding at least the three columns named above.

    Returns:
        A new DataFrame with the matching rows, keeping their original index
        labels, order and columns.
    """
    # Element-wise ``== True`` is intentional: it yields a boolean mask
    # whatever the dtype of the column is.
    is_pull_request = df["gh_is_pr"] == True  # noqa: E712
    has_comments = df["gh_issue_comments"].notnull()
    is_ruby = df["gh_lang"] == 'ruby'

    return df[is_pull_request & has_comments & is_ruby]
def unique_filtered_data(data_in):
    """Return the distinct comment threads found in ``data_in``.

    Several builds can carry the same ``gh_issue_comments`` value; keeping
    one row per distinct value means each comment thread is analysed only
    once.

    Args:
        data_in: DataFrame with a ``gh_issue_comments`` column.

    Returns:
        A single-column DataFrame (``gh_issue_comments``) without duplicate
        rows; each surviving row keeps the index label of its first
        occurrence.
    """
    comments_only = data_in.loc[:, ["gh_issue_comments"]]
    return comments_only.drop_duplicates()
###
## The reason this removes so many results is that some pull requests had many updates, so the same comment thread appears many times in the data. This demonstrates it:
## sub = sub.groupby(["gh_issue_comments"]).agg(['count'])
###
if __name__ == "__main__":
    # Load the merged data and reduce it to one row per distinct comment
    # thread, so that every thread costs a single tone-analysis request.
    df = pandas.read_csv(filename, sep=";")
    data = filter_merged_data(df)
    data2 = unique_filtered_data(data)

    creds = get_from_config("bluemix_client", ["username", "password"])
    bmc = bluemixhelper.BluemixClient(credentials=creds, verbose=True)

    # One row per analysed comment, stored under the comment's index label
    # from data2.
    tone_analysis = pandas.DataFrame(columns=('issue_comment', 'tone_analysis'))
    comments = data2["gh_issue_comments"]
    total = len(comments)  # loop-invariant, computed once

    # Manual breakpoint kept from the original script; presumably a chance to
    # inspect the filtered data before any request is sent.
    # NOTE(review): remove it if the script must run unattended.
    ipdb.set_trace()
    try:
        for progress, (i, comment) in enumerate(comments.iteritems()):
            print("%s / %s " % (progress, total))
            try:
                analysis = bmc.get_tone_analysis(comment)
                tone_analysis.loc[i] = [comment, json.loads(analysis.content)]
            except Exception as err:
                # Best effort: a failed request or an unparsable response is
                # recorded as an empty analysis and the run continues.
                # Catching Exception (not a bare except) lets Ctrl-C and
                # SystemExit stop the run instead of being logged as failures.
                print("Caught error: %r" % (err,))
                tone_analysis.loc[i] = [comment, ""]
            # Pause between requests (presumably to stay within the
            # service's rate limit).
            time.sleep(0.5)
    except Exception:
        # Unexpected failure outside the per-comment handler: open the
        # debugger so the state can be inspected.
        ipdb.set_trace()
    finally:
        # Written once rather than on every iteration; the finally clause
        # still saves whatever was collected if the run fails or is
        # interrupted.
        tone_analysis.to_csv(outFile, sep=';', encoding='utf-8')
    print("complete!")