From 8cdba09e81c32b68cb9df2041269221b22aac450 Mon Sep 17 00:00:00 2001
From: Jennifer Hu <jhu890@gmail.com>
Date: Thu, 3 Oct 2019 17:16:33 -0400
Subject: [PATCH] clean accuracy code

---
 README.md                                     |  38 +++-
 analysis/accuracy.py                          | 164 ------------------
 analysis/compute_accuracy.py                  | 120 +++++++++++++
 analysis/get_accuracy                         |  21 +++
 analysis/{plot_all => get_figures}            |   2 +-
 data/accuracy/all_exp_accuracy_grnn.csv       |  16 --
 data/accuracy/all_exp_accuracy_tiny_rnng.csv  |  19 --
 data/accuracy/all_exp_accuracy_tinywiki.csv   |  10 --
 data/accuracy/cc_accuracy_all.csv             |  28 ---
 data/accuracy/exp1a-ml-rc.csv                 |  16 ++
 data/accuracy/exp1b-ml-comp.csv               |  16 ++
 data/accuracy/exp2-rc.csv                     |  22 +++
 data/accuracy/exp3-comp.csv                   |  22 +++
 data/accuracy/exp4-pp.csv                     |  22 +++
 ...rell_rc_subrc_accuracy_grnn_jrnn_trans.csv |   7 -
 ...c_subrc_accuracy_grnn_jrnn_trans_5gram.csv |   9 -
 data/accuracy/loc_accuracy_all.csv            |  28 ---
 data/accuracy/ml_accuracy_grnn_jrnn_trans.csv |  10 --
 .../ml_accuracy_grnn_jrnn_trans_5gram.csv     |  13 --
 ...ml_accuracy_grnn_jrnn_trans_5gram_bert.csv |  16 --
 ..._grnn_multi_grnn_jrnn_trans_5gram_bert.csv |  19 --
 .../ml_rc_accuracy_grnn_jrnn_trans.csv        |  10 --
 .../ml_rc_accuracy_grnn_jrnn_trans_5gram.csv  |  13 --
 ...rc_accuracy_grnn_jrnn_trans_5gram_bert.csv |  16 --
 ..._grnn_multi_grnn_jrnn_trans_5gram_bert.csv |  19 --
 data/accuracy/rc_accuracy_all.csv             |  28 ---
 .../rc_accuracy_grnn_jrnn_trans_tiny_rnng.csv |  16 --
 27 files changed, 270 insertions(+), 450 deletions(-)
 delete mode 100644 analysis/accuracy.py
 create mode 100755 analysis/compute_accuracy.py
 create mode 100755 analysis/get_accuracy
 rename analysis/{plot_all => get_figures} (88%)
 delete mode 100644 data/accuracy/all_exp_accuracy_grnn.csv
 delete mode 100644 data/accuracy/all_exp_accuracy_tiny_rnng.csv
 delete mode 100644 data/accuracy/all_exp_accuracy_tinywiki.csv
 delete mode 100644 data/accuracy/cc_accuracy_all.csv
 create mode 100644 data/accuracy/exp1a-ml-rc.csv
 create mode 100644 data/accuracy/exp1b-ml-comp.csv
 create mode 100644 data/accuracy/exp2-rc.csv
 create mode 100644 data/accuracy/exp3-comp.csv
 create mode 100644 data/accuracy/exp4-pp.csv
 delete mode 100644 data/accuracy/futrell_rc_subrc_accuracy_grnn_jrnn_trans.csv
 delete mode 100644 data/accuracy/futrell_rc_subrc_accuracy_grnn_jrnn_trans_5gram.csv
 delete mode 100644 data/accuracy/loc_accuracy_all.csv
 delete mode 100644 data/accuracy/ml_accuracy_grnn_jrnn_trans.csv
 delete mode 100644 data/accuracy/ml_accuracy_grnn_jrnn_trans_5gram.csv
 delete mode 100644 data/accuracy/ml_accuracy_grnn_jrnn_trans_5gram_bert.csv
 delete mode 100644 data/accuracy/ml_accuracy_grnn_multi_grnn_jrnn_trans_5gram_bert.csv
 delete mode 100644 data/accuracy/ml_rc_accuracy_grnn_jrnn_trans.csv
 delete mode 100644 data/accuracy/ml_rc_accuracy_grnn_jrnn_trans_5gram.csv
 delete mode 100644 data/accuracy/ml_rc_accuracy_grnn_jrnn_trans_5gram_bert.csv
 delete mode 100644 data/accuracy/ml_rc_accuracy_grnn_multi_grnn_jrnn_trans_5gram_bert.csv
 delete mode 100644 data/accuracy/rc_accuracy_all.csv
 delete mode 100644 data/accuracy/rc_accuracy_grnn_jrnn_trans_tiny_rnng.csv

diff --git a/README.md b/README.md
index b1ab937..fba97f7 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,7 @@ training corpus. See the paper for more details on how we
 constructed our materials.
 
 ### Vocabulary issues
-In all of our novel materials (**TODO: list the experiment names**), the
+In our novel materials (used in experiments `exp2-rc`, `exp3-comp`, and `exp4-pp`), the
 lexical items are designed to be in-vocabulary for models trained on the
 Penn Treebank. This is not the case for the materials used in Experiment 1, the 
 [Marvin & Linzen (2018)](https://arxiv.org/abs/1808.09031) replication.
@@ -73,11 +73,16 @@ Penn Treebank. This is not the case for the materials used in Experiment 1, the
 The per-token surprisal values for each model can be found in the [data](data)
 folder, following this naming convention:
 ```
-data/<MODEL>/<EXPERIMENT>/<PRONOUN>_<MODEL>.txt
+data/surprisal/<MODEL>/<EXPERIMENT>/<PRONOUN>_<MODEL>.txt
 ```
 The BERT data is in a slightly different `.csv` format, but otherwise
 follows the same naming convention.
 
+The accuracy results can be found at
+```
+data/accuracy/<EXPERIMENT>.csv
+```
+
 ## Dependencies
 Our analysis code requires a basic scientific installation of Python
 (`numpy`, `pandas`, `matplotlib`, `seaborn`, etc.). 
@@ -100,14 +105,18 @@ We can make the training script for our n-gram model available upon request.
 ## Reproducing our results
 
 ### Figures
-To generate the plots for a given experiment and model, run the following:
+To generate the plots for a given experiment and list of models, run the following:
 
 ```bash
 cd analysis
-mkdir figures
-python generate_lot.py -o figures -model <MODELS> -exp <EXPERIMENT> -vs
+mkdir -p figures
+python generate_plot.py -o figures -model <MODELS> -exp <EXPERIMENT> -vs
 ```
-This will save a plot to `analysis/figures/<EXPERIMENT>_<MODEL>.png`.
+This will save a plot to `analysis/figures/<EXPERIMENT>-<MODELS>.png`.
+Note that `<MODELS>` can be a list of model names (e.g. `-model rnng bert jrnn`),
+`'big'` for large-vocabulary models, or `'all'` for all models. The
+large-vocabulary models are **BERT, Transformer-XL, JRNN, GRNN, 5-gram**.
+
 The `-vs` flag specifies to plot the negative log probability **differential**.
 You can omit the flag to plot the raw negative log probabilities.
 
@@ -117,9 +126,22 @@ if it does not exist):
 
 ```bash
 cd analysis
-./plot_all figures
+./get_figures figures
 ```
 
 ### Accuracy
 
-**TODO**
+Similarly, to compute the accuracy for a given experiment and list of models,
+run:
+```bash
+cd analysis
+mkdir -p accuracy
+python compute_accuracy.py -o accuracy -model <MODELS> -exp <EXPERIMENT>
+```
+This will save a file to `analysis/accuracy/<EXPERIMENT>-<MODELS>.csv`.
+
+To compute the accuracy for all our experiments, run the following:
+```bash
+cd analysis
+./get_accuracy accuracy
+```
\ No newline at end of file
diff --git a/analysis/accuracy.py b/analysis/accuracy.py
deleted file mode 100644
index 15bef0d..0000000
--- a/analysis/accuracy.py
+++ /dev/null
@@ -1,164 +0,0 @@
-"""
-    accuracy.py
-    Get accuracy results.
-"""
-import argparse
-from numpy import mean
-import random
-import pandas as pd
-
-import utils
-
-#################################################################################
-# Global variables
-#################################################################################
-
-# MODELS = ['grnn_multi', 'grnn', 'jrnn', 'trans', 'rnng', 'tiny', 'tinywiki', '5gram', 'bert']
-
-#################################################################################
-# Helper functions
-#################################################################################
-
-# def _prob_ratio(df1, df2):
-#     prob_ratios = []
-#     for row in df1.itertuples():
-#         surprisal1 = row.surprisal
-#         surprisal2 = df2.loc[row.Index].surprisal
-#         prob_ratio = 2**(surprisal2 - surprisal1)
-#         prob_ratios.append(prob_ratio)
-#     return mean(prob_ratios)
-
-
-# def _get_data_df(data, surp, exp, nonrefl, multi=False):
-#     # read surprisals and data
-#     if not multi:
-#         surp_df = pd.read_csv(surp, delim_whitespace=True,
-#                               names=['token', 'surprisal'])
-#     else:
-#         surp_df = pd.read_csv(surp, sep=' ',
-#                               names=['token', 'sentid', 'sentpos', 'wlen', 'surprisal', 'entropy'],
-#                               skiprows=2, skipfooter=3)
-#         print(surp_df.head())
-#     data_df = pd.read_csv(data)
-
-#     agree, pl = 'agree' in exp, 'pl' in exp
-#     # only keep surprisal at specified pronoun or verb
-#     if agree:
-#         verb = 'were' if pl else 'was'
-#         surp_df = surp_df.loc[surp_df.token == verb]
-#     else:
-#         if nonrefl:
-#             pn = 'them' if pl else exp.split('_')[-1][:3]
-#         else:
-#             pn = 'themselves' if pl else exp.split('_')[-1]
-#         surp_df = surp_df.loc[surp_df.token == pn]
-
-#         # data_df = data_df.loc[data_df.pronoun == pn]
-
-#     # insert surprisal into data_df
-#     data_df['surprisal'] = surp_df.surprisal.values
-
-#     return data_df
-
-
-# def _subtract_baseline(df, exp):
-#     item_list = df.item.unique()
-#     for item in item_list:
-#         item_rows = df.loc[df.item == item]
-#         base_rows = item_rows.loc[item_rows.mismatch_position == 'none']
-#         baseline = base_rows.surprisal.mean()
-#         # subtract baseline from surprisal of all rows
-#         item_rows.surprisal -= baseline
-#         df.loc[df.item == item] = item_rows
-#     return df
-
-def get_accuracy(df, mismatch_position):
-    item_list = df.item.unique()
-    n_items = len(item_list)
-    num_correct_vs_baseline = 0
-    num_correct_vs_distractor = 0
-    num_correct = 0
-
-    for item in item_list:
-        item_rows = df[df.item == item]
-        ungrammatical_rows = item_rows[item_rows.grammatical == 0]
-        baseline_rows = item_rows[item_rows.mismatch_position == 'none']
-        distractor_rows = item_rows[item_rows.mismatch_position == mismatch_position]
-
-        vs_baseline = ungrammatical_rows.surprisal.mean() - baseline_rows.surprisal.mean()
-        vs_distractor = ungrammatical_rows.surprisal.mean() - distractor_rows.surprisal.mean()
-
-        if vs_baseline > 0:
-            num_correct_vs_baseline += 1
-
-        if vs_distractor > 0:
-            num_correct_vs_distractor += 1
-
-        if vs_baseline > 0 and vs_distractor > 0:
-            num_correct += 1
-
-        elif vs_baseline == 0 and vs_distractor == 0:
-            choice = random.choice(['baseline', 'distractor', 'ungrammatical'])
-            if choice == 'ungrammatical':
-                num_correct += 1
-
-    vs_baseline_acc = num_correct_vs_baseline / float(n_items)
-    vs_distractor_acc = num_correct_vs_distractor / float(n_items)
-    total_acc = num_correct / float(n_items)
-
-    return total_acc, vs_baseline_acc, vs_distractor_acc
-
-
-#################################################################################
-# Main function
-#################################################################################
-
-def main(out_prefix, model, exp):
-    out_path = '%s/%s_accuracy_%s.csv' % (out_prefix, exp, '_'.join(model))
-    suffixes = ['_himself', '_herself', '_pl']
-    model_list = MODELS if model == ['all'] else model
-    
-    acc_dict = {'model':[], 'full_exp':[], 'total_acc':[], 'vs_baseline_acc':[], 'vs_distractor_acc':[]}
-    for m in model_list:
-        print(m)
-        dfs = []
-        for s in suffixes:
-            full_exp = exp + s
-            print(full_exp)
-            data_path = '../materials/%s.csv' % full_exp
-            surp = '../surprisal_data/%s/%s_surprisal_%s.txt' % (m, full_exp, m)
-            if m == 'bert':
-                df = pd.read_csv('../surprisal_data/bert/%s_surprisal_bert.csv' % full_exp)
-            else:
-                multi = m == 'grnn_multi'
-                df = _get_data_df(data_path, surp, full_exp, nonrefl=nonrefl, multi=multi)
-
-            if 'rc' in exp:
-                mismatch_position = 'rc_subj'
-            elif 'loc' in exp or 'ml' in exp:
-                mismatch_position = 'nonlocal_subj'
-            elif 'cc' in exp:
-                mismatch_position = 'distractor'
-
-            total_acc, vs_baseline_acc, vs_distractor_acc = _get_accuracy(df, mismatch_position)
-            acc_dict['model'].append(m)
-            acc_dict['total_acc'].append(total_acc)
-            acc_dict['full_exp'].append(full_exp)
-            acc_dict['vs_baseline_acc'].append(vs_baseline_acc)
-            acc_dict['vs_distractor_acc'].append(vs_distractor_acc)
-    acc_df = pd.DataFrame(acc_dict)
-    acc_df.to_csv(out_path, index=False)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Compute accuracy for models.')
-    parser.add_argument('--out_prefix', '-out_prefix', '--O', '-O',
-                        default='accuracy',
-                        help='prefix to save final file (file will '
-                             'be named according to experiment name)')
-    parser.add_argument('--model', '-model', '--M', '-M', nargs='+',
-                        help='names of models, or all to plot all at once')
-    parser.add_argument('--exp', '-exp',
-                        help='name of experiment')
-    args = parser.parse_args()
-    main(args)
diff --git a/analysis/compute_accuracy.py b/analysis/compute_accuracy.py
new file mode 100755
index 0000000..118f865
--- /dev/null
+++ b/analysis/compute_accuracy.py
@@ -0,0 +1,120 @@
+"""
+    compute_accuracy.py
+    Get accuracy results.
+"""
+import argparse
+from pathlib import Path
+from numpy import mean
+import random
+import pandas as pd
+
+import utils
+
+def get_accuracy(df, distractor_pos):
+    """Score how often the ungrammatical condition has the highest surprisal.
+
+    For each unique `df.item`, the mean surprisal of the ungrammatical rows
+    (`grammatical == 0`) is compared against the baseline rows
+    (`mismatch_position == 'none'`) and the rows at `distractor_pos`. When both
+    differences are exactly zero the item is counted correct with chance 1/3.
+
+    Returns `(total_acc, vs_baseline_acc, vs_distractor_acc)`.
+    Raises ValueError if `df` contains no items.
+    """
+    item_list = df.item.unique()
+    n_items = len(item_list)
+    if n_items == 0:
+        raise ValueError('Cannot compute accuracy: data contains no items.')
+    num_correct = num_correct_vs_baseline = num_correct_vs_distractor = 0
+
+    for item in item_list:
+        item_rows = df[df.item == item]
+        positions = item_rows.mismatch_position
+        baseline = item_rows[positions == 'none'].surprisal.mean()
+        distractor = item_rows[positions == distractor_pos].surprisal.mean()
+        ungrammatical = item_rows[item_rows.grammatical == 0].surprisal.mean()
+        vs_baseline = ungrammatical - baseline
+        vs_distractor = ungrammatical - distractor
+
+        if vs_baseline > 0:
+            num_correct_vs_baseline += 1
+        if vs_distractor > 0:
+            num_correct_vs_distractor += 1
+        if vs_baseline > 0 and vs_distractor > 0:
+            num_correct += 1
+        # Exact three-way tie: guess uniformly among the three conditions.
+        elif vs_baseline == 0 and vs_distractor == 0:
+            choice = random.choice(['baseline', 'distractor', 'ungrammatical'])
+            if choice == 'ungrammatical':
+                num_correct += 1
+
+    return (num_correct / n_items, num_correct_vs_baseline / n_items,
+            num_correct_vs_distractor / n_items)
+
+#################################################################################
+# Main function -- partially shared with generate_plot.py
+#################################################################################
+
+def main(args):
+    """Compute accuracy for each model and pronoun and save the results as .csv.
+
+    Uses `args.model`, `args.exp` and `args.out_prefix`. Raises ValueError if a
+    model outside `utils.BIG_MODELS` is requested for an 'ml' experiment.
+    """
+    # Get list of model names.
+    if args.model == ['all']:
+        model_list = utils.MODELS
+    elif args.model == ['big']:
+        model_list = utils.BIG_MODELS
+    else:
+        model_list = args.model
+
+    # Ensure only large-vocabulary models are specified for M&L replication.
+    if 'ml' in args.exp and any(m not in utils.BIG_MODELS for m in model_list):
+        raise ValueError(
+            'Only large-vocabulary models are compatible with '
+            'Marvin & Linzen\'s (2018) materials. '
+            'Please use "--model big" to compute accuracy for that experiment.'
+        )
+
+    # Assign file name based on name of experiment and specified models.
+    out_path = Path(f'{args.out_prefix}/{args.exp}-{"_".join(args.model)}.csv')
+    out_path.parent.mkdir(parents=True, exist_ok=True)  # do not fail at save time
+
+    # The distractor's mismatch position depends only on the experiment name.
+    if 'rc' in args.exp:
+        distractor_pos = 'rc_subj'
+    elif 'comp' in args.exp or 'ml' in args.exp:
+        distractor_pos = 'nonlocal_subj'
+    else:
+        distractor_pos = 'distractor'
+    # Explicit column order, matching the files in data/accuracy.
+    columns = ['exp', 'model', 'pronoun',
+               'total_acc', 'vs_baseline_acc', 'vs_distractor_acc']
+    rows = []
+    for model in model_list:
+        surp_ext = 'csv' if model == 'bert' else 'txt'
+        for pn in utils.PRONOUNS:
+            surp_path = Path(
+                f'../data/surprisal/{model}/{args.exp}/{pn}_{model}.{surp_ext}'
+            )
+            if model == 'bert':
+                pn_df = pd.read_csv(surp_path)
+            else:
+                data_path = Path(f'../stimuli/{args.exp}/{pn}.csv')
+                pn_df = utils.get_data_df(data_path, surp_path, args.exp, pn)
+            accs = get_accuracy(pn_df, distractor_pos)  # (total, vs_base, vs_dist)
+            rows.append((args.exp, model, pn, *accs))
+    pd.DataFrame(rows, columns=columns).to_csv(out_path, index=False)
+
+
+if __name__ == '__main__':
+    # All three options are required: main() reads each of them unconditionally.
+    parser = argparse.ArgumentParser(description='Compute accuracy for models.')
+    parser.add_argument('--out_prefix', '-out_prefix', '--o', '-o', required=True,
+                        help='prefix to path to save final .csv file '
+                             '(file will be named according to experiment)')
+    parser.add_argument('--model', '-model', '--m', '-m', nargs='+', required=True,
+                        help='list of model names, or "all" or "big"')
+    parser.add_argument('--exp', '-exp', required=True, help='name of experiment')
+    main(parser.parse_args())
diff --git a/analysis/get_accuracy b/analysis/get_accuracy
new file mode 100755
index 0000000..ea256c4
--- /dev/null
+++ b/analysis/get_accuracy
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Compute accuracy for all experiments, saving one .csv per experiment.
+if [ "$#" -ne 1 ]; then
+    echo "Expected usage: ./get_accuracy <output_folder>" >&2
+    exit 1
+fi
+mkdir -p "$1"
+
+ML_EXPS=("exp1a-ml-rc" "exp1b-ml-comp")
+OTHER_EXPS=("exp2-rc" "exp3-comp" "exp4-pp")
+EXPS=("${ML_EXPS[@]}" "${OTHER_EXPS[@]}")
+
+for exp in "${EXPS[@]}"; do
+    echo "== Computing accuracy for $exp =="
+    if [[ " ${ML_EXPS[*]} " == *" $exp "* ]]; then
+        model="big"  # M&L materials are only compatible with large-vocabulary models
+    else
+        model="all"
+    fi
+    python compute_accuracy.py -o "$1" -model "$model" -exp "$exp"
+done
\ No newline at end of file
diff --git a/analysis/plot_all b/analysis/get_figures
similarity index 88%
rename from analysis/plot_all
rename to analysis/get_figures
index 340681a..4887fa7 100755
--- a/analysis/plot_all
+++ b/analysis/get_figures
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 if [ "$#" -ne 1 ]; then
-    echo "Expected usage: ./plot_all <output_folder>"
+    echo "Expected usage: ./get_figures <output_folder>"
 fi
 
 mkdir -p $1
diff --git a/data/accuracy/all_exp_accuracy_grnn.csv b/data/accuracy/all_exp_accuracy_grnn.csv
deleted file mode 100644
index 23627d1..0000000
--- a/data/accuracy/all_exp_accuracy_grnn.csv
+++ /dev/null
@@ -1,16 +0,0 @@
-model,full_exp,total_acc,vs_baseline_acc,vs_distractor_acc
-grnn,loc_himself,1.0,1.0,1.0
-grnn,loc_herself,0.9733333333333334,1.0,0.9733333333333334
-grnn,loc_pl,1.0,1.0,1.0
-grnn,cc_himself,0.6533333333333333,1.0,0.6533333333333333
-grnn,cc_herself,0.8133333333333334,0.9866666666666667,0.8266666666666667
-grnn,cc_pl,0.7866666666666666,1.0,0.7866666666666666
-grnn,rc_himself,0.13333333333333333,1.0,0.13333333333333333
-grnn,rc_herself,0.37333333333333335,1.0,0.37333333333333335
-grnn,rc_pl,0.76,1.0,0.76
-grnn,ml_himself,1.0,1.0,1.0
-grnn,ml_herself,1.0,1.0,1.0
-grnn,ml_pl,1.0,1.0,1.0
-grnn,ml_rc_himself,0.5714285714285714,1.0,0.5714285714285714
-grnn,ml_rc_herself,0.8857142857142857,1.0,0.8857142857142857
-grnn,ml_rc_pl,0.6428571428571429,1.0,0.6428571428571429
\ No newline at end of file
diff --git a/data/accuracy/all_exp_accuracy_tiny_rnng.csv b/data/accuracy/all_exp_accuracy_tiny_rnng.csv
deleted file mode 100644
index d7a1a93..0000000
--- a/data/accuracy/all_exp_accuracy_tiny_rnng.csv
+++ /dev/null
@@ -1,19 +0,0 @@
-model,full_exp,total_acc,vs_baseline_acc,vs_distractor_acc
-rnng,loc_himself,1.0,1.0,1.0
-rnng,loc_herself,0.9866666666666667,1.0,0.9866666666666667
-rnng,loc_pl,0.7066666666666667,0.8666666666666667,0.7066666666666667
-tiny,loc_himself,0.9333333333333333,0.9733333333333334,0.9333333333333333
-tiny,loc_herself,0.49333333333333335,0.52,0.6933333333333334
-tiny,loc_pl,0.9866666666666667,0.9866666666666667,0.9866666666666667
-rnng,cc_himself,0.64,0.9866666666666667,0.6533333333333333
-rnng,cc_herself,0.6266666666666667,0.9733333333333334,0.6266666666666667
-rnng,cc_pl,0.6133333333333333,0.9733333333333334,0.64
-tiny,cc_himself,0.21333333333333335,0.9066666666666666,0.24
-tiny,cc_herself,0.12,0.28,0.28
-tiny,cc_pl,0.9333333333333333,1.0,0.9333333333333333
-rnng,rc_himself,0.12,0.8,0.12
-rnng,rc_herself,0.0,0.7733333333333333,0.0
-rnng,rc_pl,0.6,1.0,0.6
-tiny,rc_himself,0.14666666666666667,1.0,0.14666666666666667
-tiny,rc_herself,0.21333333333333335,0.41333333333333333,0.29333333333333333
-tiny,rc_pl,0.04,1.0,0.04
\ No newline at end of file
diff --git a/data/accuracy/all_exp_accuracy_tinywiki.csv b/data/accuracy/all_exp_accuracy_tinywiki.csv
deleted file mode 100644
index 4b335c4..0000000
--- a/data/accuracy/all_exp_accuracy_tinywiki.csv
+++ /dev/null
@@ -1,10 +0,0 @@
-model,full_exp,total_acc,vs_baseline_acc,vs_distractor_acc
-tinywiki,loc_himself,1.0,1.0,1.0
-tinywiki,loc_herself,1.0,1.0,1.0
-tinywiki,loc_pl,1.0,1.0,1.0
-tinywiki,cc_himself,0.37333333333333335,1.0,0.37333333333333335
-tinywiki,cc_herself,0.84,1.0,0.84
-tinywiki,cc_pl,1.0,1.0,1.0
-tinywiki,rc_himself,0.0,1.0,0.0
-tinywiki,rc_herself,0.21333333333333335,1.0,0.21333333333333335
-tinywiki,rc_pl,0.14666666666666667,0.5466666666666666,0.14666666666666667
\ No newline at end of file
diff --git a/data/accuracy/cc_accuracy_all.csv b/data/accuracy/cc_accuracy_all.csv
deleted file mode 100644
index 0b9e66f..0000000
--- a/data/accuracy/cc_accuracy_all.csv
+++ /dev/null
@@ -1,28 +0,0 @@
-model,full_exp,total_acc,vs_baseline_acc,vs_distractor_acc
-grnn_multi,cc_himself,0.30666666666666664,1.0,0.30666666666666664
-grnn_multi,cc_herself,0.5466666666666666,1.0,0.5466666666666666
-grnn_multi,cc_pl,0.9333333333333333,1.0,0.9333333333333333
-grnn,cc_himself,0.6533333333333333,1.0,0.6533333333333333
-grnn,cc_herself,0.8133333333333334,0.9866666666666667,0.8266666666666667
-grnn,cc_pl,0.7866666666666666,1.0,0.7866666666666666
-jrnn,cc_himself,0.92,0.9866666666666667,0.9333333333333333
-jrnn,cc_herself,0.8533333333333334,0.8533333333333334,1.0
-jrnn,cc_pl,0.30666666666666664,1.0,0.30666666666666664
-trans,cc_himself,0.6533333333333333,0.8533333333333334,0.7066666666666667
-trans,cc_herself,0.6266666666666667,0.9866666666666667,0.6266666666666667
-trans,cc_pl,0.7466666666666667,1.0,0.7466666666666667
-rnng,cc_himself,0.64,0.9866666666666667,0.6533333333333333
-rnng,cc_herself,0.6266666666666667,0.9733333333333334,0.6266666666666667
-rnng,cc_pl,0.6133333333333333,0.9733333333333334,0.64
-tiny,cc_himself,0.21333333333333335,0.9066666666666666,0.24
-tiny,cc_herself,0.12,0.28,0.28
-tiny,cc_pl,0.9333333333333333,1.0,0.9333333333333333
-tinywiki,cc_himself,0.37333333333333335,1.0,0.37333333333333335
-tinywiki,cc_herself,0.84,1.0,0.84
-tinywiki,cc_pl,1.0,1.0,1.0
-5gram,cc_himself,0.26666666666666666,0.0,0.16
-5gram,cc_herself,0.26666666666666666,0.0,0.13333333333333333
-5gram,cc_pl,0.24,0.0,0.06666666666666667
-bert,cc_himself,1.0,1.0,1.0
-bert,cc_herself,0.9866666666666667,0.9866666666666667,1.0
-bert,cc_pl,1.0,1.0,1.0
diff --git a/data/accuracy/exp1a-ml-rc.csv b/data/accuracy/exp1a-ml-rc.csv
new file mode 100644
index 0000000..d431af6
--- /dev/null
+++ b/data/accuracy/exp1a-ml-rc.csv
@@ -0,0 +1,16 @@
+exp,model,pronoun,total_acc,vs_baseline_acc,vs_distractor_acc
+exp1a-ml-rc,bert,themselves,0.7571428571428571,1.0,0.7571428571428571
+exp1a-ml-rc,bert,himself,0.7714285714285715,0.9857142857142858,0.7714285714285715
+exp1a-ml-rc,bert,herself,0.7428571428571429,0.9571428571428572,0.7428571428571429
+exp1a-ml-rc,trans,themselves,1.0,1.0,1.0
+exp1a-ml-rc,trans,himself,0.6,1.0,0.6
+exp1a-ml-rc,trans,herself,0.6142857142857143,0.9571428571428572,0.6142857142857143
+exp1a-ml-rc,jrnn,themselves,0.8,1.0,0.8
+exp1a-ml-rc,jrnn,himself,0.14285714285714285,1.0,0.14285714285714285
+exp1a-ml-rc,jrnn,herself,0.3,1.0,0.3
+exp1a-ml-rc,grnn,themselves,0.6428571428571429,1.0,0.6428571428571429
+exp1a-ml-rc,grnn,himself,0.5714285714285714,1.0,0.5714285714285714
+exp1a-ml-rc,grnn,herself,0.8857142857142857,1.0,0.8857142857142857
+exp1a-ml-rc,5gram,themselves,0.32857142857142857,0.0,0.0
+exp1a-ml-rc,5gram,himself,0.2714285714285714,0.0,0.0
+exp1a-ml-rc,5gram,herself,0.3142857142857143,0.0,0.0
diff --git a/data/accuracy/exp1b-ml-comp.csv b/data/accuracy/exp1b-ml-comp.csv
new file mode 100644
index 0000000..902b293
--- /dev/null
+++ b/data/accuracy/exp1b-ml-comp.csv
@@ -0,0 +1,16 @@
+exp,model,pronoun,total_acc,vs_baseline_acc,vs_distractor_acc
+exp1b-ml-comp,bert,themselves,0.8571428571428571,1.0,0.8571428571428571
+exp1b-ml-comp,bert,himself,1.0,1.0,1.0
+exp1b-ml-comp,bert,herself,1.0,1.0,1.0
+exp1b-ml-comp,trans,themselves,1.0,1.0,1.0
+exp1b-ml-comp,trans,himself,0.8857142857142857,1.0,0.8857142857142857
+exp1b-ml-comp,trans,herself,0.8571428571428571,0.9857142857142858,0.8571428571428571
+exp1b-ml-comp,jrnn,themselves,0.8857142857142857,1.0,0.8857142857142857
+exp1b-ml-comp,jrnn,himself,1.0,1.0,1.0
+exp1b-ml-comp,jrnn,herself,1.0,1.0,1.0
+exp1b-ml-comp,grnn,themselves,1.0,1.0,1.0
+exp1b-ml-comp,grnn,himself,1.0,1.0,1.0
+exp1b-ml-comp,grnn,herself,1.0,1.0,1.0
+exp1b-ml-comp,5gram,themselves,0.4,0.0,0.0
+exp1b-ml-comp,5gram,himself,0.37142857142857144,0.0,0.0
+exp1b-ml-comp,5gram,herself,0.2571428571428571,0.0,0.0
diff --git a/data/accuracy/exp2-rc.csv b/data/accuracy/exp2-rc.csv
new file mode 100644
index 0000000..a0ba46f
--- /dev/null
+++ b/data/accuracy/exp2-rc.csv
@@ -0,0 +1,22 @@
+exp,model,pronoun,total_acc,vs_baseline_acc,vs_distractor_acc
+exp2-rc,bert,themselves,0.9333333333333333,1.0,0.9333333333333333
+exp2-rc,bert,himself,0.6533333333333333,0.8933333333333333,0.6533333333333333
+exp2-rc,bert,herself,0.52,0.76,0.5466666666666666
+exp2-rc,trans,themselves,1.0,1.0,1.0
+exp2-rc,trans,himself,0.30666666666666664,0.5733333333333334,0.3333333333333333
+exp2-rc,trans,herself,0.7866666666666666,1.0,0.7866666666666666
+exp2-rc,jrnn,themselves,0.8666666666666667,1.0,0.8666666666666667
+exp2-rc,jrnn,himself,0.8,1.0,0.8
+exp2-rc,jrnn,herself,0.3333333333333333,1.0,0.3333333333333333
+exp2-rc,grnn,themselves,0.76,1.0,0.76
+exp2-rc,grnn,himself,0.13333333333333333,1.0,0.13333333333333333
+exp2-rc,grnn,herself,0.37333333333333335,1.0,0.37333333333333335
+exp2-rc,5gram,themselves,0.37333333333333335,0.0,0.0
+exp2-rc,5gram,himself,0.30666666666666664,0.0,0.0
+exp2-rc,5gram,herself,0.3466666666666667,0.0,0.0
+exp2-rc,tiny,themselves,0.04,1.0,0.04
+exp2-rc,tiny,himself,0.14666666666666667,1.0,0.14666666666666667
+exp2-rc,tiny,herself,0.21333333333333335,0.41333333333333333,0.29333333333333333
+exp2-rc,rnng,themselves,0.6,1.0,0.6
+exp2-rc,rnng,himself,0.12,0.8,0.12
+exp2-rc,rnng,herself,0.0,0.7733333333333333,0.0
diff --git a/data/accuracy/exp3-comp.csv b/data/accuracy/exp3-comp.csv
new file mode 100644
index 0000000..03c94e7
--- /dev/null
+++ b/data/accuracy/exp3-comp.csv
@@ -0,0 +1,22 @@
+exp,model,pronoun,total_acc,vs_baseline_acc,vs_distractor_acc
+exp3-comp,bert,themselves,1.0,1.0,1.0
+exp3-comp,bert,himself,1.0,1.0,1.0
+exp3-comp,bert,herself,0.9333333333333333,0.9333333333333333,0.9466666666666667
+exp3-comp,trans,themselves,1.0,1.0,1.0
+exp3-comp,trans,himself,0.9866666666666667,0.9866666666666667,0.9866666666666667
+exp3-comp,trans,herself,0.8266666666666667,1.0,0.8266666666666667
+exp3-comp,jrnn,themselves,0.9333333333333333,1.0,0.9333333333333333
+exp3-comp,jrnn,himself,1.0,1.0,1.0
+exp3-comp,jrnn,herself,0.9733333333333334,0.9866666666666667,0.9866666666666667
+exp3-comp,grnn,themselves,1.0,1.0,1.0
+exp3-comp,grnn,himself,1.0,1.0,1.0
+exp3-comp,grnn,herself,0.9733333333333334,1.0,0.9733333333333334
+exp3-comp,5gram,themselves,0.4266666666666667,0.21333333333333335,0.21333333333333335
+exp3-comp,5gram,himself,0.36,0.08,0.08
+exp3-comp,5gram,herself,0.21333333333333335,0.05333333333333334,0.05333333333333334
+exp3-comp,tiny,themselves,0.9866666666666667,0.9866666666666667,0.9866666666666667
+exp3-comp,tiny,himself,0.9333333333333333,0.9733333333333334,0.9333333333333333
+exp3-comp,tiny,herself,0.49333333333333335,0.52,0.6933333333333334
+exp3-comp,rnng,themselves,0.7066666666666667,0.8666666666666667,0.7066666666666667
+exp3-comp,rnng,himself,1.0,1.0,1.0
+exp3-comp,rnng,herself,0.9866666666666667,1.0,0.9866666666666667
diff --git a/data/accuracy/exp4-pp.csv b/data/accuracy/exp4-pp.csv
new file mode 100644
index 0000000..ed6a07c
--- /dev/null
+++ b/data/accuracy/exp4-pp.csv
@@ -0,0 +1,22 @@
+exp,model,pronoun,total_acc,vs_baseline_acc,vs_distractor_acc
+exp4-pp,bert,themselves,1.0,1.0,1.0
+exp4-pp,bert,himself,1.0,1.0,1.0
+exp4-pp,bert,herself,0.9866666666666667,0.9866666666666667,1.0
+exp4-pp,trans,themselves,0.7466666666666667,1.0,0.7466666666666667
+exp4-pp,trans,himself,0.6533333333333333,0.8533333333333334,0.7066666666666667
+exp4-pp,trans,herself,0.6266666666666667,0.9866666666666667,0.6266666666666667
+exp4-pp,jrnn,themselves,0.30666666666666664,1.0,0.30666666666666664
+exp4-pp,jrnn,himself,0.92,0.9866666666666667,0.9333333333333333
+exp4-pp,jrnn,herself,0.8533333333333334,0.8533333333333334,1.0
+exp4-pp,grnn,themselves,0.7866666666666666,1.0,0.7866666666666666
+exp4-pp,grnn,himself,0.6533333333333333,1.0,0.6533333333333333
+exp4-pp,grnn,herself,0.8133333333333334,0.9866666666666667,0.8266666666666667
+exp4-pp,5gram,themselves,0.26666666666666666,0.0,0.06666666666666667
+exp4-pp,5gram,himself,0.29333333333333333,0.0,0.16
+exp4-pp,5gram,herself,0.28,0.0,0.13333333333333333
+exp4-pp,tiny,themselves,0.9333333333333333,1.0,0.9333333333333333
+exp4-pp,tiny,himself,0.21333333333333335,0.9066666666666666,0.24
+exp4-pp,tiny,herself,0.12,0.28,0.28
+exp4-pp,rnng,themselves,0.6133333333333333,0.9733333333333334,0.64
+exp4-pp,rnng,himself,0.64,0.9866666666666667,0.6533333333333333
+exp4-pp,rnng,herself,0.6266666666666667,0.9733333333333334,0.6266666666666667
diff --git a/data/accuracy/futrell_rc_subrc_accuracy_grnn_jrnn_trans.csv b/data/accuracy/futrell_rc_subrc_accuracy_grnn_jrnn_trans.csv
deleted file mode 100644
index bf134a9..0000000
--- a/data/accuracy/futrell_rc_subrc_accuracy_grnn_jrnn_trans.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-model,full_exp,vs_baseline_acc,vs_distractor_acc
-grnn,futrell_rc_subrc_himself,0.7111111111111111,0.8333333333333334
-grnn,futrell_rc_subrc_herself,1.0,0.9333333333333333
-jrnn,futrell_rc_subrc_himself,0.9888888888888889,0.5777777777777777
-jrnn,futrell_rc_subrc_herself,1.0,1.0
-trans,futrell_rc_subrc_himself,1.0,0.9444444444444444
-trans,futrell_rc_subrc_herself,0.9222222222222223,0.9555555555555556
diff --git a/data/accuracy/futrell_rc_subrc_accuracy_grnn_jrnn_trans_5gram.csv b/data/accuracy/futrell_rc_subrc_accuracy_grnn_jrnn_trans_5gram.csv
deleted file mode 100644
index 45bb203..0000000
--- a/data/accuracy/futrell_rc_subrc_accuracy_grnn_jrnn_trans_5gram.csv
+++ /dev/null
@@ -1,9 +0,0 @@
-full_exp,model,total_acc,vs_baseline_acc,vs_distractor_acc
-futrell_rc_subrc_himself,grnn,0.6222222222222222,0.7111111111111111,0.8333333333333334
-futrell_rc_subrc_herself,grnn,0.9333333333333333,1.0,0.9333333333333333
-futrell_rc_subrc_himself,jrnn,0.5666666666666667,0.9888888888888889,0.5777777777777777
-futrell_rc_subrc_herself,jrnn,1.0,1.0,1.0
-futrell_rc_subrc_himself,trans,0.9444444444444444,1.0,0.9444444444444444
-futrell_rc_subrc_herself,trans,0.9222222222222223,0.9222222222222223,0.9555555555555556
-futrell_rc_subrc_himself,5gram,0.022222222222222223,0.022222222222222223,0.08888888888888889
-futrell_rc_subrc_herself,5gram,0.0,0.0,0.0
diff --git a/data/accuracy/loc_accuracy_all.csv b/data/accuracy/loc_accuracy_all.csv
deleted file mode 100644
index d854c60..0000000
--- a/data/accuracy/loc_accuracy_all.csv
+++ /dev/null
@@ -1,28 +0,0 @@
-model,full_exp,total_acc,vs_baseline_acc,vs_distractor_acc
-grnn_multi,loc_himself,1.0,1.0,1.0
-grnn_multi,loc_herself,1.0,1.0,1.0
-grnn_multi,loc_pl,1.0,1.0,1.0
-grnn,loc_himself,1.0,1.0,1.0
-grnn,loc_herself,0.9733333333333334,1.0,0.9733333333333334
-grnn,loc_pl,1.0,1.0,1.0
-jrnn,loc_himself,1.0,1.0,1.0
-jrnn,loc_herself,0.9733333333333334,0.9866666666666667,0.9866666666666667
-jrnn,loc_pl,0.9333333333333333,1.0,0.9333333333333333
-trans,loc_himself,0.9866666666666667,0.9866666666666667,0.9866666666666667
-trans,loc_herself,0.8266666666666667,1.0,0.8266666666666667
-trans,loc_pl,1.0,1.0,1.0
-rnng,loc_himself,1.0,1.0,1.0
-rnng,loc_herself,0.9866666666666667,1.0,0.9866666666666667
-rnng,loc_pl,0.7066666666666667,0.8666666666666667,0.7066666666666667
-tiny,loc_himself,0.9333333333333333,0.9733333333333334,0.9333333333333333
-tiny,loc_herself,0.49333333333333335,0.52,0.6933333333333334
-tiny,loc_pl,0.9866666666666667,0.9866666666666667,0.9866666666666667
-tinywiki,loc_himself,1.0,1.0,1.0
-tinywiki,loc_herself,1.0,1.0,1.0
-tinywiki,loc_pl,1.0,1.0,1.0
-5gram,loc_himself,0.36,0.08,0.08
-5gram,loc_herself,0.29333333333333333,0.05333333333333334,0.05333333333333334
-5gram,loc_pl,0.38666666666666666,0.21333333333333335,0.21333333333333335
-bert,loc_himself,1.0,1.0,1.0
-bert,loc_herself,0.9333333333333333,0.9333333333333333,0.9466666666666667
-bert,loc_pl,1.0,1.0,1.0
diff --git a/data/accuracy/ml_accuracy_grnn_jrnn_trans.csv b/data/accuracy/ml_accuracy_grnn_jrnn_trans.csv
deleted file mode 100644
index ad0f57c..0000000
--- a/data/accuracy/ml_accuracy_grnn_jrnn_trans.csv
+++ /dev/null
@@ -1,10 +0,0 @@
-model,full_exp,vs_baseline_acc,vs_distractor_acc
-grnn,ml_himself,1.0,1.0
-grnn,ml_herself,1.0,1.0
-grnn,ml_pl,1.0,1.0
-jrnn,ml_himself,1.0,1.0
-jrnn,ml_herself,1.0,1.0
-jrnn,ml_pl,1.0,0.8857142857142857
-trans,ml_himself,1.0,0.8857142857142857
-trans,ml_herself,0.9857142857142858,0.8571428571428571
-trans,ml_pl,1.0,1.0
diff --git a/data/accuracy/ml_accuracy_grnn_jrnn_trans_5gram.csv b/data/accuracy/ml_accuracy_grnn_jrnn_trans_5gram.csv
deleted file mode 100644
index e5f2138..0000000
--- a/data/accuracy/ml_accuracy_grnn_jrnn_trans_5gram.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-full_exp,model,total_acc,vs_baseline_acc,vs_distractor_acc
-ml_himself,grnn,1.0,1.0,1.0
-ml_herself,grnn,1.0,1.0,1.0
-ml_pl,grnn,1.0,1.0,1.0
-ml_himself,jrnn,1.0,1.0,1.0
-ml_herself,jrnn,1.0,1.0,1.0
-ml_pl,jrnn,0.8857142857142857,1.0,0.8857142857142857
-ml_himself,trans,0.8857142857142857,1.0,0.8857142857142857
-ml_herself,trans,0.8571428571428571,0.9857142857142858,0.8571428571428571
-ml_pl,trans,1.0,1.0,1.0
-ml_himself,5gram,0.0,0.0,0.0
-ml_herself,5gram,0.0,0.0,0.0
-ml_pl,5gram,0.0,0.0,0.0
diff --git a/data/accuracy/ml_accuracy_grnn_jrnn_trans_5gram_bert.csv b/data/accuracy/ml_accuracy_grnn_jrnn_trans_5gram_bert.csv
deleted file mode 100644
index f6b446d..0000000
--- a/data/accuracy/ml_accuracy_grnn_jrnn_trans_5gram_bert.csv
+++ /dev/null
@@ -1,16 +0,0 @@
-full_exp,model,total_acc,vs_baseline_acc,vs_distractor_acc
-ml_himself,grnn,1.0,1.0,1.0
-ml_herself,grnn,1.0,1.0,1.0
-ml_pl,grnn,1.0,1.0,1.0
-ml_himself,jrnn,1.0,1.0,1.0
-ml_herself,jrnn,1.0,1.0,1.0
-ml_pl,jrnn,0.8857142857142857,1.0,0.8857142857142857
-ml_himself,trans,0.8857142857142857,1.0,0.8857142857142857
-ml_herself,trans,0.8571428571428571,0.9857142857142858,0.8571428571428571
-ml_pl,trans,1.0,1.0,1.0
-ml_himself,5gram,0.35714285714285715,0.0,0.0
-ml_herself,5gram,0.3,0.0,0.0
-ml_pl,5gram,0.4,0.0,0.0
-ml_himself,bert,1.0,1.0,1.0
-ml_herself,bert,1.0,1.0,1.0
-ml_pl,bert,0.8571428571428571,1.0,0.8571428571428571
diff --git a/data/accuracy/ml_accuracy_grnn_multi_grnn_jrnn_trans_5gram_bert.csv b/data/accuracy/ml_accuracy_grnn_multi_grnn_jrnn_trans_5gram_bert.csv
deleted file mode 100644
index ef58ebe..0000000
--- a/data/accuracy/ml_accuracy_grnn_multi_grnn_jrnn_trans_5gram_bert.csv
+++ /dev/null
@@ -1,19 +0,0 @@
-full_exp,model,total_acc,vs_baseline_acc,vs_distractor_acc
-ml_himself,grnn_multi,1.0,1.0,1.0
-ml_herself,grnn_multi,1.0,1.0,1.0
-ml_pl,grnn_multi,1.0,1.0,1.0
-ml_himself,grnn,1.0,1.0,1.0
-ml_herself,grnn,1.0,1.0,1.0
-ml_pl,grnn,1.0,1.0,1.0
-ml_himself,jrnn,1.0,1.0,1.0
-ml_herself,jrnn,1.0,1.0,1.0
-ml_pl,jrnn,0.8857142857142857,1.0,0.8857142857142857
-ml_himself,trans,0.8857142857142857,1.0,0.8857142857142857
-ml_herself,trans,0.8571428571428571,0.9857142857142858,0.8571428571428571
-ml_pl,trans,1.0,1.0,1.0
-ml_himself,5gram,0.44285714285714284,0.0,0.0
-ml_herself,5gram,0.2857142857142857,0.0,0.0
-ml_pl,5gram,0.45714285714285713,0.0,0.0
-ml_himself,bert,1.0,1.0,1.0
-ml_herself,bert,1.0,1.0,1.0
-ml_pl,bert,0.8571428571428571,1.0,0.8571428571428571
diff --git a/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans.csv b/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans.csv
deleted file mode 100644
index 2b55208..0000000
--- a/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans.csv
+++ /dev/null
@@ -1,10 +0,0 @@
-model,full_exp,vs_baseline_acc,vs_distractor_acc
-grnn,ml_rc_himself,1.0,0.5714285714285714
-grnn,ml_rc_herself,1.0,0.8857142857142857
-grnn,ml_rc_pl,1.0,0.6428571428571429
-jrnn,ml_rc_himself,1.0,0.14285714285714285
-jrnn,ml_rc_herself,1.0,0.3
-jrnn,ml_rc_pl,1.0,0.8
-trans,ml_rc_himself,1.0,0.6
-trans,ml_rc_herself,0.9571428571428572,0.6142857142857143
-trans,ml_rc_pl,1.0,1.0
diff --git a/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans_5gram.csv b/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans_5gram.csv
deleted file mode 100644
index bc742d6..0000000
--- a/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans_5gram.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-full_exp,model,total_acc,vs_baseline_acc,vs_distractor_acc
-ml_rc_himself,grnn,0.5714285714285714,1.0,0.5714285714285714
-ml_rc_herself,grnn,0.8857142857142857,1.0,0.8857142857142857
-ml_rc_pl,grnn,0.6428571428571429,1.0,0.6428571428571429
-ml_rc_himself,jrnn,0.14285714285714285,1.0,0.14285714285714285
-ml_rc_herself,jrnn,0.3,1.0,0.3
-ml_rc_pl,jrnn,0.8,1.0,0.8
-ml_rc_himself,trans,0.6,1.0,0.6
-ml_rc_herself,trans,0.6142857142857143,0.9571428571428572,0.6142857142857143
-ml_rc_pl,trans,1.0,1.0,1.0
-ml_rc_himself,5gram,0.0,0.0,0.0
-ml_rc_herself,5gram,0.0,0.0,0.0
-ml_rc_pl,5gram,0.0,0.0,0.0
diff --git a/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans_5gram_bert.csv b/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans_5gram_bert.csv
deleted file mode 100644
index 95297d7..0000000
--- a/data/accuracy/ml_rc_accuracy_grnn_jrnn_trans_5gram_bert.csv
+++ /dev/null
@@ -1,16 +0,0 @@
-full_exp,model,total_acc,vs_baseline_acc,vs_distractor_acc
-ml_rc_himself,grnn,0.5714285714285714,1.0,0.5714285714285714
-ml_rc_herself,grnn,0.8857142857142857,1.0,0.8857142857142857
-ml_rc_pl,grnn,0.6428571428571429,1.0,0.6428571428571429
-ml_rc_himself,jrnn,0.14285714285714285,1.0,0.14285714285714285
-ml_rc_herself,jrnn,0.3,1.0,0.3
-ml_rc_pl,jrnn,0.8,1.0,0.8
-ml_rc_himself,trans,0.6,1.0,0.6
-ml_rc_herself,trans,0.6142857142857143,0.9571428571428572,0.6142857142857143
-ml_rc_pl,trans,1.0,1.0,1.0
-ml_rc_himself,5gram,0.34285714285714286,0.0,0.0
-ml_rc_herself,5gram,0.32857142857142857,0.0,0.0
-ml_rc_pl,5gram,0.38571428571428573,0.0,0.0
-ml_rc_himself,bert,0.7714285714285715,0.9857142857142858,0.7714285714285715
-ml_rc_herself,bert,0.7428571428571429,0.9571428571428572,0.7428571428571429
-ml_rc_pl,bert,0.7571428571428571,1.0,0.7571428571428571
diff --git a/data/accuracy/ml_rc_accuracy_grnn_multi_grnn_jrnn_trans_5gram_bert.csv b/data/accuracy/ml_rc_accuracy_grnn_multi_grnn_jrnn_trans_5gram_bert.csv
deleted file mode 100644
index 8d78a4a..0000000
--- a/data/accuracy/ml_rc_accuracy_grnn_multi_grnn_jrnn_trans_5gram_bert.csv
+++ /dev/null
@@ -1,19 +0,0 @@
-full_exp,model,total_acc,vs_baseline_acc,vs_distractor_acc
-ml_rc_himself,grnn_multi,0.04285714285714286,1.0,0.04285714285714286
-ml_rc_herself,grnn_multi,0.04285714285714286,0.9142857142857143,0.04285714285714286
-ml_rc_pl,grnn_multi,0.0,1.0,0.0
-ml_rc_himself,grnn,0.5714285714285714,1.0,0.5714285714285714
-ml_rc_herself,grnn,0.8857142857142857,1.0,0.8857142857142857
-ml_rc_pl,grnn,0.6428571428571429,1.0,0.6428571428571429
-ml_rc_himself,jrnn,0.14285714285714285,1.0,0.14285714285714285
-ml_rc_herself,jrnn,0.3,1.0,0.3
-ml_rc_pl,jrnn,0.8,1.0,0.8
-ml_rc_himself,trans,0.6,1.0,0.6
-ml_rc_herself,trans,0.6142857142857143,0.9571428571428572,0.6142857142857143
-ml_rc_pl,trans,1.0,1.0,1.0
-ml_rc_himself,5gram,0.21428571428571427,0.0,0.0
-ml_rc_herself,5gram,0.3142857142857143,0.0,0.0
-ml_rc_pl,5gram,0.2857142857142857,0.0,0.0
-ml_rc_himself,bert,0.7714285714285715,0.9857142857142858,0.7714285714285715
-ml_rc_herself,bert,0.7428571428571429,0.9571428571428572,0.7428571428571429
-ml_rc_pl,bert,0.7571428571428571,1.0,0.7571428571428571
diff --git a/data/accuracy/rc_accuracy_all.csv b/data/accuracy/rc_accuracy_all.csv
deleted file mode 100644
index de73145..0000000
--- a/data/accuracy/rc_accuracy_all.csv
+++ /dev/null
@@ -1,28 +0,0 @@
-model,full_exp,total_acc,vs_baseline_acc,vs_distractor_acc
-grnn_multi,rc_himself,0.013333333333333334,0.9733333333333334,0.013333333333333334
-grnn_multi,rc_herself,0.0,0.6,0.0
-grnn_multi,rc_pl,0.0,1.0,0.0
-grnn,rc_himself,0.13333333333333333,1.0,0.13333333333333333
-grnn,rc_herself,0.37333333333333335,1.0,0.37333333333333335
-grnn,rc_pl,0.76,1.0,0.76
-jrnn,rc_himself,0.8,1.0,0.8
-jrnn,rc_herself,0.3333333333333333,1.0,0.3333333333333333
-jrnn,rc_pl,0.8666666666666667,1.0,0.8666666666666667
-trans,rc_himself,0.30666666666666664,0.5733333333333334,0.3333333333333333
-trans,rc_herself,0.7866666666666666,1.0,0.7866666666666666
-trans,rc_pl,1.0,1.0,1.0
-rnng,rc_himself,0.12,0.8,0.12
-rnng,rc_herself,0.0,0.7733333333333333,0.0
-rnng,rc_pl,0.6,1.0,0.6
-tiny,rc_himself,0.14666666666666667,1.0,0.14666666666666667
-tiny,rc_herself,0.21333333333333335,0.41333333333333333,0.29333333333333333
-tiny,rc_pl,0.04,1.0,0.04
-tinywiki,rc_himself,0.0,1.0,0.0
-tinywiki,rc_herself,0.21333333333333335,1.0,0.21333333333333335
-tinywiki,rc_pl,0.14666666666666667,0.5466666666666666,0.14666666666666667
-5gram,rc_himself,0.3333333333333333,0.0,0.0
-5gram,rc_herself,0.2,0.0,0.0
-5gram,rc_pl,0.44,0.0,0.0
-bert,rc_himself,0.6533333333333333,0.8933333333333333,0.6533333333333333
-bert,rc_herself,0.52,0.76,0.5466666666666666
-bert,rc_pl,0.9333333333333333,1.0,0.9333333333333333
diff --git a/data/accuracy/rc_accuracy_grnn_jrnn_trans_tiny_rnng.csv b/data/accuracy/rc_accuracy_grnn_jrnn_trans_tiny_rnng.csv
deleted file mode 100644
index 7d8bc74..0000000
--- a/data/accuracy/rc_accuracy_grnn_jrnn_trans_tiny_rnng.csv
+++ /dev/null
@@ -1,16 +0,0 @@
-model,full_exp,vs_baseline_acc,vs_distractor_acc
-grnn,rc_himself,1.0,0.13333333333333333
-grnn,rc_herself,1.0,0.37333333333333335
-grnn,rc_pl,1.0,0.76
-jrnn,rc_himself,1.0,0.8
-jrnn,rc_herself,1.0,0.3333333333333333
-jrnn,rc_pl,1.0,0.8666666666666667
-trans,rc_himself,0.5733333333333334,0.3333333333333333
-trans,rc_herself,1.0,0.7866666666666666
-trans,rc_pl,1.0,1.0
-tiny,rc_himself,1.0,0.14666666666666667
-tiny,rc_herself,0.41333333333333333,0.29333333333333333
-tiny,rc_pl,1.0,0.04
-rnng,rc_himself,0.8,0.12
-rnng,rc_herself,0.7733333333333333,0.0
-rnng,rc_pl,1.0,0.6