fix plotting code, update README
jennhu committed Oct 3, 2019
1 parent e15a98d commit a1e98bd
Showing 314 changed files with 6,152 additions and 1,146 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,4 +1,4 @@
# Large data files, SLURM scripts, etc.
# Large data files, SLURM scripts, old data, etc.
slurm
data/rnng/old/
data/grnn/old/
@@ -14,6 +14,9 @@ stimuli/unused
stimuli/old/
stimuli/marvin-linzen-2016

analysis/freq
analysis/figures

# Temporary files
._*

53 changes: 28 additions & 25 deletions README.md
@@ -1,8 +1,8 @@
# A closer look at the performance of neural language models on reflexive anaphor licensing

This repository contains the code for the following paper:
This repository contains the materials for the following paper:

Jennifer Hu, Sherry Yong Chen, and Roger Levy (2020).
> Jennifer Hu, Sherry Yong Chen, and Roger Levy (2020).
A closer look at the performance of neural language models on reflexive anaphor licensing.
*Proceedings of the Society for Computation in Linguistics (SCiL 2020)*, Volume 3.

@@ -20,7 +20,12 @@ If you use any of our code, data, or analyses, please cite the paper using the b

## Overview

**TODO: write short overview/summary here to orient people**
Our materials are organized into three primary folders:
* [analysis](analysis) (code for reproducing the results and figures in the paper)
* [data](data) (accuracy and surprisal results from each model)
* [stimuli](stimuli) (test suites and script for extracting sentences)

Please note that we do not provide code for running each model. For more details, see the [Dependencies](#dependencies) section.

## Stimuli

@@ -29,6 +34,7 @@ For each experiment, a `.csv` file containing the stimuli can be found at

**SHERRY TODO: explain how stimuli file is structured**

### Extracting sentences
To extract the sentences from this file, use the script
`stimuli/extract_sentences.py`. You can toggle flags like `--uncased` and `--eos`
depending on the requirements of your model. **Please note that the final period
@@ -75,32 +81,29 @@ We can make the training script for our n-gram model available upon request.
5. [Tiny LSTM](https://github.com/pytorch/examples/tree/master/word_language_model)
6. n-gram

<!-- ### Transformer-XL
Note that we use the [pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT) implementation of Transformer-XL. To download the
state-of-the-art model parameters, run the script `get_model.sh`.
After doing so, you'll need to load the model and tokenizer like this:
```python
# assuming the pytorch-pretrained-BERT package is installed (import path per that library)
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel

tokenizer = TransfoXLTokenizer.from_pretrained('./model/')
model = TransfoXLModel.from_pretrained('./model/')
```
(Source: [Issue #451](https://github.com/huggingface/pytorch-pretrained-BERT/issues/451#issuecomment-481155274))
See [pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT)
for more detailed setup instructions. -->
## Reproducing our results

### Figures
To generate the plots for a given experiment and model, run the following:

## Reproducing our figures
```bash
cd analysis
mkdir figures
python plot_for_paper.py -o figures -model <MODELS> -exp <EXPERIMENT> -vs
```
This will save a plot to `analysis/figures/<EXPERIMENT>_<MODEL>.png`.
The `-vs` flag plots the negative log probability **differential** relative to the baseline condition;
omit it to plot the raw negative log probabilities.
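For example, a hypothetical invocation for the GRNN model might look like the following (the experiment name is left as a placeholder, and we assume the model identifier matches the `grnn` name used in `analysis/accuracy.py`):

```bash
# surprisal differential vs. baseline for the GRNN model
python plot_for_paper.py -o figures -model grnn -exp <EXPERIMENT> -vs

# raw negative log probabilities (omit -vs)
python plot_for_paper.py -o figures -model grnn -exp <EXPERIMENT>
```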

To generate the plots for a given experiment and model, run the following:
**TODO: FIX/CLEAN THIS**
To plot the results for all our experiments, run the following
(replacing `figures` with your desired output folder, which will be created
if it does not exist):

```bash
cd analysis
python plot_surprisals.py -exp <EXPERIMENT> -model <MODEL>
./plot_all figures
```
This will save a plot to `analysis/plots/<EXPERIMENT>_<MODEL>.png` showing
the mean surprisal at the target word across each condition.
The relevant target word (e.g. *himself*, *themselves*, *was*) will be
inferred from the name of the experiment.
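As a rough sketch, this inference mirrors the logic in `analysis/accuracy.py` further down in this commit (the helper name below is illustrative; experiment-name conventions such as a `_pl` suffix or an `agree` substring are assumed):

```python
def infer_target_word(exp, nonrefl=False):
    """Illustrative helper: guess the target word from an experiment name."""
    agree, pl = 'agree' in exp, 'pl' in exp
    if agree:
        # agreement experiments measure surprisal at the verb
        return 'were' if pl else 'was'
    if nonrefl:
        # non-reflexive pronoun, e.g. 'him' from an experiment ending in '_himself'
        return 'them' if pl else exp.split('_')[-1][:3]
    # reflexive pronoun, e.g. 'himself', 'herself', or 'themselves'
    return 'themselves' if pl else exp.split('_')[-1]
```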

### Accuracy

**TODO**
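A minimal sketch of computing accuracies with `analysis/accuracy.py`, based on the argparse flags defined in that script below (the experiment name is a placeholder; `grnn` is one of the model names that appears in the script):

```bash
cd analysis
mkdir accuracy
# output path format in accuracy.py: <out_prefix>/<EXPERIMENT>_accuracy_<models>.csv
python accuracy.py -out_prefix accuracy -model grnn -exp <EXPERIMENT>
```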
Binary file removed analysis/acc_freq.pdf
7 changes: 0 additions & 7 deletions analysis/acc_freq_data.csv

This file was deleted.

7 changes: 0 additions & 7 deletions analysis/acc_freq_data_vanillaLSTM.csv

This file was deleted.

Binary file removed analysis/acc_freq_vanillaLSTM.pdf
Binary file removed analysis/acc_grnn.pdf
139 changes: 65 additions & 74 deletions analysis/accuracy.py
@@ -1,76 +1,78 @@
'''
"""
accuracy.py
Get accuracy.
'''
Get accuracy results.
"""
import argparse
from numpy import mean
import random
import pandas as pd

import utils

#################################################################################
# Global variables
#################################################################################

MODELS = ['grnn_multi', 'grnn', 'jrnn', 'trans', 'rnng', 'tiny', 'tinywiki', '5gram', 'bert']
# MODELS = ['grnn_multi', 'grnn', 'jrnn', 'trans', 'rnng', 'tiny', 'tinywiki', '5gram', 'bert']

#################################################################################
# Helper functions
#################################################################################

def _prob_ratio(df1, df2):
prob_ratios = []
for row in df1.itertuples():
surprisal1 = row.surprisal
surprisal2 = df2.loc[row.Index].surprisal
prob_ratio = 2**(surprisal2 - surprisal1)
prob_ratios.append(prob_ratio)
return mean(prob_ratios)


def _get_data_df(data, surp, exp, nonrefl, multi=False):
# read surprisals and data
if not multi:
surp_df = pd.read_csv(surp, delim_whitespace=True,
names=['token', 'surprisal'])
else:
surp_df = pd.read_csv(surp, sep=' ',
names=['token', 'sentid', 'sentpos', 'wlen', 'surprisal', 'entropy'],
skiprows=2, skipfooter=3)
print(surp_df.head())
data_df = pd.read_csv(data)

agree, pl = 'agree' in exp, 'pl' in exp
# only keep surprisal at specified pronoun or verb
if agree:
verb = 'were' if pl else 'was'
surp_df = surp_df.loc[surp_df.token == verb]
else:
if nonrefl:
pn = 'them' if pl else exp.split('_')[-1][:3]
else:
pn = 'themselves' if pl else exp.split('_')[-1]
surp_df = surp_df.loc[surp_df.token == pn]

# data_df = data_df.loc[data_df.pronoun == pn]

# insert surprisal into data_df
data_df['surprisal'] = surp_df.surprisal.values

return data_df


def _subtract_baseline(df, exp):
item_list = df.item.unique()
for item in item_list:
item_rows = df.loc[df.item == item]
base_rows = item_rows.loc[item_rows.mismatch_position == 'none']
baseline = base_rows.surprisal.mean()
# subtract baseline from surprisal of all rows
item_rows.surprisal -= baseline
df.loc[df.item == item] = item_rows
return df

def _get_accuracy(df, mismatch_position):
# def _prob_ratio(df1, df2):
# prob_ratios = []
# for row in df1.itertuples():
# surprisal1 = row.surprisal
# surprisal2 = df2.loc[row.Index].surprisal
# prob_ratio = 2**(surprisal2 - surprisal1)
# prob_ratios.append(prob_ratio)
# return mean(prob_ratios)


# def _get_data_df(data, surp, exp, nonrefl, multi=False):
# # read surprisals and data
# if not multi:
# surp_df = pd.read_csv(surp, delim_whitespace=True,
# names=['token', 'surprisal'])
# else:
# surp_df = pd.read_csv(surp, sep=' ',
# names=['token', 'sentid', 'sentpos', 'wlen', 'surprisal', 'entropy'],
# skiprows=2, skipfooter=3)
# print(surp_df.head())
# data_df = pd.read_csv(data)

# agree, pl = 'agree' in exp, 'pl' in exp
# # only keep surprisal at specified pronoun or verb
# if agree:
# verb = 'were' if pl else 'was'
# surp_df = surp_df.loc[surp_df.token == verb]
# else:
# if nonrefl:
# pn = 'them' if pl else exp.split('_')[-1][:3]
# else:
# pn = 'themselves' if pl else exp.split('_')[-1]
# surp_df = surp_df.loc[surp_df.token == pn]

# # data_df = data_df.loc[data_df.pronoun == pn]

# # insert surprisal into data_df
# data_df['surprisal'] = surp_df.surprisal.values

# return data_df


# def _subtract_baseline(df, exp):
# item_list = df.item.unique()
# for item in item_list:
# item_rows = df.loc[df.item == item]
# base_rows = item_rows.loc[item_rows.mismatch_position == 'none']
# baseline = base_rows.surprisal.mean()
# # subtract baseline from surprisal of all rows
# item_rows.surprisal -= baseline
# df.loc[df.item == item] = item_rows
# return df

def get_accuracy(df, mismatch_position):
item_list = df.item.unique()
n_items = len(item_list)
num_correct_vs_baseline = 0
@@ -111,14 +113,9 @@ def _get_accuracy(df, mismatch_position):
# Main function
#################################################################################

def main(out_prefix, model, exp, nonrefl, vs_baseline):
def main(out_prefix, model, exp):
out_path = '%s/%s_accuracy_%s.csv' % (out_prefix, exp, '_'.join(model))
if 'futrell' in exp:
suffixes = ['_himself', '_herself']
elif 'agree' in exp:
suffixes = ['', '_pl']
else:
suffixes = ['_himself', '_herself', '_pl']
suffixes = ['_himself', '_herself', '_pl']
model_list = MODELS if model == ['all'] else model

acc_dict = {'model':[], 'full_exp':[], 'total_acc':[], 'vs_baseline_acc':[], 'vs_distractor_acc':[]}
@@ -154,20 +151,14 @@ def main(out_prefix, model, exp, nonrefl, vs_baseline):


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Plot surprisals.')
parser = argparse.ArgumentParser(description='Compute accuracy for models.')
parser.add_argument('--out_prefix', '-out_prefix', '--O', '-O',
default='accuracy',
help='prefix to path to save final plots (file will '
help='prefix to save final file (file will '
'be named according to experiment name)')
parser.add_argument('--model', '-model', '--M', '-M', nargs='+',
help='names of models, or all to plot all at once')
parser.add_argument('--exp', '-exp',
help='name of experiment')
parser.add_argument('--nonrefl', '-nonrefl', action='store_true',
help='toggle whether using nonreflexive pronoun')
parser.add_argument('--vs_baseline', '-vs_baseline', '--vs', '-vs',
default=False, action='store_true',
help='toggle plotting raw surprisal or surprisal '
'difference vs. baseline')
args = parser.parse_args()
main(**vars(args))
main(args)
Binary file removed analysis/cc_all.pdf