-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathcompetition_utilities.py
83 lines (68 loc) · 3.25 KB
/
competition_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from __future__ import division
from collections import Counter
import csv
import dateutil
import numpy as np
import os
import pandas as pd
# Directory the competition CSV files are read from (samples are written here too).
data_path = None
# Directory submission files are written to.
submissions_path = None

# Refuse to load the module until both locations above have been filled in.
if not (data_path and submissions_path):
    raise Exception("Set the data and submission paths in competition_utilities.py!")
def parse_date_maybe_null(date):
    """Parse a date value with dateutil, passing empty values through as None.

    Any falsy ``date`` (for example ``None`` or ``""``) yields ``None``;
    anything else is handed to ``dateutil.parser.parse`` and its result is
    returned.
    """
    return dateutil.parser.parse(date) if date else None
# Per-column converters passed to pandas when a data file is loaded into a
# DataFrame: both creation-date columns are parsed with dateutil.
# A "PostClosedDate" converter (parse_date_maybe_null) is currently disabled.
df_converters = dict.fromkeys(
    ("PostCreationDate", "OwnerCreationDate"),
    dateutil.parser.parse,
)
def get_reader(file_name="train-sample.csv"):
    """Return a ``csv.reader`` over a data file, positioned after the header.

    Args:
        file_name: Name of the CSV file, resolved relative to ``data_path``.

    Returns:
        A ``csv.reader`` whose next row is the first row after the header.
        An empty file produces a reader that yields nothing.

    Note:
        The file is left open on purpose: the reader pulls rows from it
        lazily, so it can only be released once the reader is discarded.
    """
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    # The builtin next() works on Python 2.6+ and 3, unlike reader.next();
    # the default stops an empty file from raising StopIteration here.
    next(reader, None)
    return reader
def get_header(file_name="train-sample.csv"):
    """Return the first row of a data file as a list of strings.

    Args:
        file_name: Name of the CSV file, resolved relative to ``data_path``.

    Returns:
        The fields of the file's first (header) row.

    Raises:
        StopIteration: If the file contains no rows at all.
    """
    # Only one row is needed, so the file can be closed before returning.
    with open(os.path.join(data_path, file_name)) as csv_file:
        # Builtin next() instead of reader.next(), which is Python 2 only.
        return next(csv.reader(csv_file))
def get_closed_count(file_name):
    """Count the rows that ``iter_closed_questions`` yields for a data file."""
    closed_total = 0
    for _ in iter_closed_questions(file_name):
        closed_total += 1
    return closed_total
def iter_closed_questions(file_name):
    """Lazily yield every row whose ``OpenStatus`` is not ``"open"``.

    The file is looked up under ``data_path`` and read with pandas in chunks
    of 1000 rows; each yielded item is the row as a pandas Series.
    """
    # Open the file right away so a bad path fails at call time, not later.
    chunks = pd.read_csv(
        os.path.join(data_path, file_name), iterator=True, chunksize=1000
    )

    def closed_rows():
        for chunk in chunks:
            not_open = chunk[chunk["OpenStatus"] != "open"]
            for _, row in not_open.iterrows():
                yield row

    return closed_rows()
def iter_open_questions(file_name):
    """Lazily yield every row whose ``OpenStatus`` equals ``"open"``.

    The file is looked up under ``data_path`` and read with pandas in chunks
    of 1000 rows; each yielded item is the row as a pandas Series.
    """
    # Open the file right away so a bad path fails at call time, not later.
    chunks = pd.read_csv(
        os.path.join(data_path, file_name), iterator=True, chunksize=1000
    )

    def open_rows():
        for chunk in chunks:
            still_open = chunk[chunk["OpenStatus"] == "open"]
            for _, row in still_open.iterrows():
                yield row

    return open_rows()
def get_dataframe(file_name="train-sample.csv"):
    """Load a data file from ``data_path`` into a pandas DataFrame.

    The module-level ``df_converters`` mapping is passed to pandas as the
    ``converters`` argument.
    """
    csv_path = os.path.join(data_path, file_name)
    return pd.read_csv(csv_path, converters=df_converters)
def get_priors(file_name):
    """Return the relative frequency of each distinct value in column 14.

    Rows come from ``get_reader``. The result is a list of fractions, one per
    distinct value, ordered by the sorted values themselves.
    """
    # NOTE(review): index 14 is presumably the close-reason column -- confirm
    # against the data file's header.
    reason_counts = Counter(row[14] for row in get_reader(file_name))
    row_total = sum(reason_counts.values())
    return [reason_counts[reason] / row_total for reason in sorted(reason_counts)]
def write_sample(file_name, header, sample):
    """Write a header row followed by sample rows to a CSV file.

    Args:
        file_name: Name of the output file, created under ``data_path``.
        header: Sequence of field values written as the first row.
        sample: Iterable of rows written after the header.
    """
    # The context manager guarantees the rows are flushed and the handle is
    # closed, even if writing fails part-way through.
    with open(os.path.join(data_path, file_name), "w") as out_file:
        writer = csv.writer(out_file, lineterminator="\n")
        writer.writerow(header)
        writer.writerows(sample)
def update_prior(old_prior, old_posterior, new_prior):
    """Re-express a posterior computed under one prior in terms of another.

    Args:
        old_prior: Prior probability the posterior was computed with.
        old_posterior: Posterior probability obtained under ``old_prior``.
        new_prior: Prior probability to switch to.

    Returns:
        The posterior probability under ``new_prior``.
    """
    against = old_prior * (1 - old_posterior)
    in_favour = old_posterior * (1 - old_prior)
    ratio = against / in_favour
    return new_prior / (new_prior + (1 - new_prior) * ratio)
def cap_and_update_priors(old_priors, old_posteriors, new_priors, epsilon):
    """Re-weight a matrix of posteriors from one set of priors to another.

    ``old_posteriors`` is first passed through ``cap_predictions``. Both prior
    vectors are then repeated once per posterior row, the same formula as
    ``update_prior`` is applied element-wise, and the result is passed
    through ``cap_predictions`` again before being returned.
    """
    capped = cap_predictions(old_posteriors, epsilon)

    # A column of ones, one entry per posterior row: its Kronecker product
    # with a prior vector stacks one copy of that vector per row.
    one_per_row = np.ones((np.size(capped, 0), 1))
    old_grid = np.kron(one_per_row, old_priors)
    new_grid = np.kron(one_per_row, new_priors)

    ratio = (old_grid * (1 - capped)) / (capped * (1 - old_grid))
    updated = new_grid / (new_grid + (1 - new_grid) * ratio)
    return cap_predictions(updated, epsilon)
def cap_predictions(probs, epsilon):
    """Clamp predictions into [epsilon, 1 - epsilon] and renormalize each row.

    Args:
        probs: 2-D array-like of predictions, one row per sample. It is
            never modified.
        epsilon: Smallest value allowed; ``1 - epsilon`` is the largest.

    Returns:
        A new floating-point array of the same shape in which every value
        has been clamped and each row then divided by its own sum.
    """
    values = np.asarray(probs)
    if not np.issubdtype(values.dtype, np.floating):
        # Integer input cannot hold the fractional bounds, so work in floats.
        values = values.astype(float)
    # Upper bound first, then lower bound. minimum/maximum build new arrays,
    # which leaves the caller's data untouched.
    values = np.maximum(np.minimum(values, 1 - epsilon), epsilon)
    row_sums = values.sum(axis=1)
    return values / row_sums[:, np.newaxis]
def write_submission(file_name, predictions):
    """Write prediction rows to a CSV file in the submissions directory.

    Args:
        file_name: Name of the output file, created under
            ``submissions_path``.
        predictions: Iterable of rows; each row is written as one CSV line.
    """
    # The context manager guarantees the rows are flushed and the handle is
    # closed, even if writing fails part-way through.
    with open(os.path.join(submissions_path, file_name), "w") as out_file:
        csv.writer(out_file, lineterminator="\n").writerows(predictions)