diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..c7c409d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,68 @@
+# Base image: official PyTorch build with the CUDA 11.1 / cuDNN 8 development toolchain.
+# NOTE: the pinned torch==1.13.0 install below supersedes the torch 1.9.1 named in this tag.
+FROM pytorch/pytorch:1.9.1-cuda11.1-cudnn8-devel
+
+# Upgrade pip (--no-cache-dir keeps pip's download cache out of the image layer)
+RUN pip3 install --no-cache-dir --upgrade pip
+
+# Install pinned Python packages (torchaudio/torchtext/torchvision pins must match torch==1.13.0)
+RUN pip install --no-cache-dir \
+    async-timeout==4.0.2 \
+    Cython==0.29.32 \
+    h5py==3.6.0 \
+    huggingface-hub==0.5.1 \
+    IProgress==0.4 \
+    ipykernel==6.13.0 \
+    ipython==7.29.0 \
+    ipython-genutils==0.2.0 \
+    ipywidgets==7.7.0 \
+    joblib==1.1.0 \
+    jupyter-client==7.3.4 \
+    jupyter-core==4.10.0 \
+    jupyter-server==1.17.0 \
+    jupyterlab==3.4.0 \
+    jupyterlab-pygments==0.2.2 \
+    jupyterlab-server==2.13.0 \
+    jupyterlab-widgets==1.1.0 \
+    matplotlib==3.5.2 \
+    matplotlib-inline==0.1.2 \
+    mosestokenizer==1.2.1 \
+    multidict==6.0.2 \
+    nltk==3.7 \
+    numba==0.56.4 \
+    numpy==1.21.2 \
+    pandas==1.3.5 \
+    pickleshare==0.7.5 \
+    Pillow==8.4.0 \
+    pytorch-lightning==1.6.3 \
+    pytorch-memlab==0.2.4 \
+    pytorch-nlp==0.5.0 \
+    requests==2.25.1 \
+    scikit-learn==1.0.2 \
+    scipy==1.7.3 \
+    seaborn==0.11.2 \
+    sentencepiece==0.1.97 \
+    six==1.16.0 \
+    smart-open==5.2.1 \
+    tensorboard==2.9.0 \
+    tensorboard-data-server==0.6.1 \
+    tensorboard-plugin-wit==1.8.1 \
+    tokenizers==0.12.1 \
+    toolwrapper==2.1.0 \
+    torch==1.13.0 \
+    torchaudio==0.13.0 \
+    torchelastic==0.2.0 \
+    torchmetrics==0.8.2 \
+    torchtext==0.14.0 \
+    torchvision==0.14.0 \
+    tqdm==4.61.2 \
+    transformers==4.18.0
+
+# Run everything from /app (WORKDIR creates the directory if it does not exist)
+WORKDIR /app
+
+# Copy the build context (the repository checkout) into the container at /app
+COPY . /app
+
+# Default to an interactive shell; the README commands are run manually from here
+CMD ["bash"]
\ No newline at end of file
diff --git a/README.md b/README.md
index d55dc99..ebd0102 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,58 @@
-# Code coming soon!
+
+## Probing for Incremental Parse States in Autoregressive Language Models
+
+Supplementary materials and demo for "Probing for Incremental Parse States in Autoregressive Language Models" (Eisape et al., 2022).
+
+## Environment
+
+Our [dockerfile](Dockerfile) contains the necessary dependencies to run the code in this repository and can be built with the following command:
+
+    docker build -t incremental_parse_probe .
+
+The rest of the walkthrough assumes you are working in a suitable environment.
+
+## Preprocessing
+
+The necessary datasets are 1) PTB-formatted constituency parses and 2) conllx-formatted dependency parses (i.e. `$SPLIT.txt`, `$SPLIT.conllx`; conllx-formatted trees can be generated with [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/)). After adding those files to `data/`, running `python3 src/preprocess.py` will generate preprocessed versions of the dataset in `data/`.
+
+## Training
+
+The following command trains a probe specified by `config.yaml` with PyTorch Lightning:
+
+    python3 src/train.py --config $CONFIG_PATH
+
+The result of training is a new directory in `./experiment_checkpoints` with model parameters and hyperparameters. We provide config files for each of the models in the paper in [configs/](configs). **NOTE**: the geometric action probe is pretrained on the regression task from Hewitt and Manning (2019). To train these probes, first train a geometric regression probe on the relevant model and layer, then point to its weights from the config file. See [configs/](configs) for an example.
+
+## Evaluation
+
+To evaluate the probes with probe-based word-synchronous beam search, run the following command with the path of a model training run:
+
+    python3 src/parse.py --experiment_path $EXPERIMENT_PATH
+
+Here `$EXPERIMENT_PATH` points to the directory with the probe that was created during training. This script uses utilities from `gpt2.py` to decode an incremental parse state up to and including the current word from GPT-2 encodings of a sentence prefix up to that word. The result is a new CSV file in `results/` with parsing statistics (e.g. UAS).
+
+In addition to these, the paper includes several more involved experiments, including behavioural and causal intervention experiments on GPT-2 processing garden path sentences. This codebase contains all of the necessary utilities to replicate these experiments, mainly in `gpt2.py`; we also include the dataset used there in [data/npz_experiment](data/npz_experiment). Please contact [eisape@mit.edu](mailto:eisape@mit.edu) ([homepage](https://eisape.github.io/)) with any difficulties or questions.
+
+## Citation
+
+```bibtex
+@inproceedings{eisape-etal-2022-probing,
+    title = "Probing for Incremental Parse States in Autoregressive Language Models",
+    author = "Eisape, Tiwalayo and Gangireddy, Vineet and Levy, Roger and Kim, Yoon",
+    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
+    address = "Abu Dhabi, United Arab Emirates",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2022.findings-emnlp.203",
+    pages = "2801--2813",
+    month = dec,
+    year = "2022",
+}
+```
+
+## Acknowledgments
+
+This project builds on code from the following repositories:
+
+- [https://github.com/john-hewitt/structural-probe](https://github.com/john-hewitt/structural-probe)
+- [https://github.com/aistairc/rnng-pytorch](https://github.com/aistairc/rnng-pytorch)
+- [https://github.com/qipeng/arc-swift](https://github.com/qipeng/arc-swift)
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_0.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_0.yaml
new file mode 100644
index 0000000..7f929a0
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_0.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_0
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 0
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_1.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_1.yaml
new file mode 100644
index 0000000..ab35826
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_1.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_1
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 1
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_10.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_10.yaml
new file mode 100644
index 0000000..cd1d14e
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_10.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_10
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 10
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_11.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_11.yaml
new file mode 100644
index 0000000..ea72f73
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_11.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_11
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 11
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_12.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_12.yaml
new file mode 100644
index 0000000..502de49
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_12.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_12
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 12
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_13.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_13.yaml
new file mode 100644
index 0000000..fa767ce
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_13.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_13
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 13
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_14.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_14.yaml
new file mode 100644
index 0000000..f216291
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_14.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_14
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 14
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_15.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_15.yaml
new file mode 100644
index 0000000..665f4ba
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_15.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_15
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 15
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_16.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_16.yaml
new file mode 100644
index 0000000..3240715
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_16.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_16
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 16
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_17.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_17.yaml
new file mode 100644
index 0000000..27b1d7a
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_17.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_17
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 17
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_18.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_18.yaml
new file mode 100644
index 0000000..80f8fec
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_18.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_18
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 18
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_19.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_19.yaml
new file mode 100644
index 0000000..2257ce4
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_19.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_19
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 19
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_2.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_2.yaml
new file mode 100644
index 0000000..be24c39
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_2.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_2
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 2
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_20.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_20.yaml
new file mode 100644
index 0000000..01c06a5
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_20.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_20
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 20
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_21.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_21.yaml
new file mode 100644
index 0000000..d165ebd
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_21.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_21
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 21
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_22.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_22.yaml
new file mode 100644
index 0000000..0a1d809
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_22.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_22
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 22
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_23.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_23.yaml
new file mode 100644
index 0000000..b94b2d9
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_23.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_23
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 23
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_24.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_24.yaml
new file mode 100644
index 0000000..c56b856
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_24.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_24
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 24
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_25.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_25.yaml
new file mode 100644
index 0000000..4cc7527
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_25.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_25
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 25
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_26.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_26.yaml
new file mode 100644
index 0000000..4102817
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_26.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_26
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 26
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_27.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_27.yaml
new file mode 100644
index 0000000..a3001f4
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_27.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_27
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 27
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_28.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_28.yaml
new file mode 100644
index 0000000..7b5fc39
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_28.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_28
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 28
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_29.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_29.yaml
new file mode 100644
index 0000000..48b3602
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_29.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_29
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 29
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_3.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_3.yaml
new file mode 100644
index 0000000..d561aaf
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_3.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_3
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 3
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_30.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_30.yaml
new file mode 100644
index 0000000..65e472e
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_30.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_30
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 30
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_31.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_31.yaml
new file mode 100644
index 0000000..4960ca9
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_31.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_31
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 31
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_32.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_32.yaml
new file mode 100644
index 0000000..5065b5f
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_32.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_32
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 32
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_33.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_33.yaml
new file mode 100644
index 0000000..67a5f20
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_33.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_33
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 33
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_34.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_34.yaml
new file mode 100644
index 0000000..d0f6b07
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_34.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_34
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 34
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_35.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_35.yaml
new file mode 100644
index 0000000..5a7b973
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_35.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_35
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 35
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_36.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_36.yaml
new file mode 100644
index 0000000..7cff1a8
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_36.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_36
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 36
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_37.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_37.yaml
new file mode 100644
index 0000000..a03ef11
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_37.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_37
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 37
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_38.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_38.yaml
new file mode 100644
index 0000000..ca73146
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_38.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_38
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 38
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_39.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_39.yaml
new file mode 100644
index 0000000..257542f
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_39.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_39
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 39
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_4.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_4.yaml
new file mode 100644
index 0000000..c3a66d0
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_4.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_4
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 4
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_40.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_40.yaml
new file mode 100644
index 0000000..dc4a1e8
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_40.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_40
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 40
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_41.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_41.yaml
new file mode 100644
index 0000000..78f3ab8
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_41.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_41
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 41
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_42.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_42.yaml
new file mode 100644
index 0000000..c5445d6
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_42.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_42
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 42
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_43.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_43.yaml
new file mode 100644
index 0000000..f6333d0
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_43.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_43
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 43
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_44.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_44.yaml
new file mode 100644
index 0000000..f55e94c
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_44.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_44
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 44
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_45.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_45.yaml
new file mode 100644
index 0000000..6041758
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_45.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_45
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 45
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_46.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_46.yaml
new file mode 100644
index 0000000..63254ab
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_46.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_46
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 46
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_47.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_47.yaml
new file mode 100644
index 0000000..e514ada
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_47.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_47
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 47
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_48.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_48.yaml
new file mode 100644
index 0000000..11fbfec
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_48.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_48
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 48
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_5.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_5.yaml
new file mode 100644
index 0000000..bc65a0e
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_5.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_5
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 5
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_6.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_6.yaml
new file mode 100644
index 0000000..f5b5289
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_6.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_6
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 6
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_7.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_7.yaml
new file mode 100644
index 0000000..bbad8f6
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_7.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_7
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 7
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_8.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_8.yaml
new file mode 100644
index 0000000..db06cb2
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_8.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_8
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 8
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/AttentiveProbe/layer_9.yaml b/configs/eval/gpt2-xl/AttentiveProbe/layer_9.yaml
new file mode 100644
index 0000000..12d7f43
--- /dev/null
+++ b/configs/eval/gpt2-xl/AttentiveProbe/layer_9.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_9
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 9
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_0.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_0.yaml
new file mode 100644
index 0000000..3925b02
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_0.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_0
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_0/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 0
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_1.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_1.yaml
new file mode 100644
index 0000000..220a81e
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_1.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_1
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_1/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 1
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_10.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_10.yaml
new file mode 100644
index 0000000..86bcea8
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_10.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_10
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_10/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 10
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_11.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_11.yaml
new file mode 100644
index 0000000..62888e5
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_11.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_11
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_11/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 11
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_12.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_12.yaml
new file mode 100644
index 0000000..62412aa
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_12.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_12
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_12/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 12
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_13.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_13.yaml
new file mode 100644
index 0000000..1c36ff9
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_13.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_13
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_13/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 13
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_14.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_14.yaml
new file mode 100644
index 0000000..9a1113d
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_14.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_14
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_14/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 14
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_15.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_15.yaml
new file mode 100644
index 0000000..b2d61eb
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_15.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_15
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_15/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 15
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_16.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_16.yaml
new file mode 100644
index 0000000..0e517a3
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_16.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_16
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_16/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 16
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_17.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_17.yaml
new file mode 100644
index 0000000..4bf2ccf
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_17.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_17
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_17/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 17
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_18.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_18.yaml
new file mode 100644
index 0000000..b2da955
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_18.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_18
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_18/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 18
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_19.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_19.yaml
new file mode 100644
index 0000000..30eb935
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_19.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_19
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_19/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 19
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_2.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_2.yaml
new file mode 100644
index 0000000..dac1aac
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_2.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_2
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_2/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 2
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_20.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_20.yaml
new file mode 100644
index 0000000..ed66035
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_20.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_20
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_20/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 20
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_21.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_21.yaml
new file mode 100644
index 0000000..d3b1429
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_21.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_21
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_21/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 21
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_22.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_22.yaml
new file mode 100644
index 0000000..445fce3
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_22.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_22
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_22/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 22
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_23.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_23.yaml
new file mode 100644
index 0000000..3a780e6
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_23.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_23
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_23/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 23
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_24.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_24.yaml
new file mode 100644
index 0000000..34abe71
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_24.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_24
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_24/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 24
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_25.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_25.yaml
new file mode 100644
index 0000000..a3d4450
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_25.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_25
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_25/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 25
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_26.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_26.yaml
new file mode 100644
index 0000000..446a28e
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_26.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_26
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_26/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 26
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_27.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_27.yaml
new file mode 100644
index 0000000..85e7fa8
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_27.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_27
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_27/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 27
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_28.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_28.yaml
new file mode 100644
index 0000000..bf22af5
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_28.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_28
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_28/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 28
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_29.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_29.yaml
new file mode 100644
index 0000000..6b2ceab
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_29.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_29
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_29/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 29
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_3.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_3.yaml
new file mode 100644
index 0000000..0917146
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_3.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_3
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_3/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 3
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_30.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_30.yaml
new file mode 100644
index 0000000..d36471c
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_30.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_30
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_30/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 30
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_31.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_31.yaml
new file mode 100644
index 0000000..0c13b6f
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_31.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_31
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_31/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 31
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_32.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_32.yaml
new file mode 100644
index 0000000..8b4f45e
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_32.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_32
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_32/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 32
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_33.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_33.yaml
new file mode 100644
index 0000000..cad515f
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_33.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_33
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_33/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 33
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_34.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_34.yaml
new file mode 100644
index 0000000..4046eed
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_34.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_34
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_34/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 34
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_35.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_35.yaml
new file mode 100644
index 0000000..9b6a361
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_35.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_35
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_35/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 35
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_36.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_36.yaml
new file mode 100644
index 0000000..f80f80d
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_36.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_36
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_36/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 36
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_37.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_37.yaml
new file mode 100644
index 0000000..aceb9cc
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_37.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_37
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_37/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 37
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_38.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_38.yaml
new file mode 100644
index 0000000..b90e64a
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_38.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_38
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_38/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 38
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_39.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_39.yaml
new file mode 100644
index 0000000..7191ac9
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_39.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_39
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_39/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 39
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_4.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_4.yaml
new file mode 100644
index 0000000..b2eb870
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_4.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_4
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_4/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 4
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_40.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_40.yaml
new file mode 100644
index 0000000..09fc133
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_40.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_40
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_40/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 40
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_41.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_41.yaml
new file mode 100644
index 0000000..6de4f68
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_41.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_41
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_41/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 41
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_42.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_42.yaml
new file mode 100644
index 0000000..4d0f5ae
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_42.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_42
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_42/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 42
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_43.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_43.yaml
new file mode 100644
index 0000000..f1d20be
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_43.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_43
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_43/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 43
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_44.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_44.yaml
new file mode 100644
index 0000000..cc4919a
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_44.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_44
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_44/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 44
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_45.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_45.yaml
new file mode 100644
index 0000000..369f821
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_45.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_45
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_45/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 45
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_46.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_46.yaml
new file mode 100644
index 0000000..5fcb989
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_46.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_46
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_46/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 46
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_47.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_47.yaml
new file mode 100644
index 0000000..9400fa4
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_47.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_47
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_47/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 47
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_48.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_48.yaml
new file mode 100644
index 0000000..f624817
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_48.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_48
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_48/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 48
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_5.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_5.yaml
new file mode 100644
index 0000000..cc7184a
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_5.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_5
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_5/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 5
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_6.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_6.yaml
new file mode 100644
index 0000000..c51079e
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_6.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_6
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_6/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 6
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_7.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_7.yaml
new file mode 100644
index 0000000..9415e51
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_7.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_7
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_7/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 7
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_8.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_8.yaml
new file mode 100644
index 0000000..e78f054
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_8.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_8
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_8/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 8
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Action/layer_9.yaml b/configs/eval/gpt2-xl/Geometric_Action/layer_9.yaml
new file mode 100644
index 0000000..0cee575
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Action/layer_9.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_9
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2-xl/Geometric_Action/layer_9/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 9
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_0.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_0.yaml
new file mode 100644
index 0000000..5c32b5a
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_0.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_0
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 0
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_1.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_1.yaml
new file mode 100644
index 0000000..eab1a43
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_1.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_1
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 1
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_10.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_10.yaml
new file mode 100644
index 0000000..d16a26b
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_10.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_10
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 10
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_11.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_11.yaml
new file mode 100644
index 0000000..fa28f7a
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_11.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_11
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 11
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_12.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_12.yaml
new file mode 100644
index 0000000..09ae0a8
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_12.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_12
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 12
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_13.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_13.yaml
new file mode 100644
index 0000000..459a454
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_13.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_13
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 13
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_14.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_14.yaml
new file mode 100644
index 0000000..2595747
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_14.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_14
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 14
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_15.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_15.yaml
new file mode 100644
index 0000000..101345f
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_15.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_15
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 15
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_16.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_16.yaml
new file mode 100644
index 0000000..3a632a7
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_16.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_16
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 16
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_17.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_17.yaml
new file mode 100644
index 0000000..4c20068
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_17.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_17
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 17
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_18.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_18.yaml
new file mode 100644
index 0000000..fffaf0e
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_18.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_18
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 18
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_19.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_19.yaml
new file mode 100644
index 0000000..7698306
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_19.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_19
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 19
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_2.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_2.yaml
new file mode 100644
index 0000000..bb82b26
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_2.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_2
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 2
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_20.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_20.yaml
new file mode 100644
index 0000000..7c2a065
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_20.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_20
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 20
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_21.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_21.yaml
new file mode 100644
index 0000000..bd7f3ca
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_21.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_21
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 21
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_22.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_22.yaml
new file mode 100644
index 0000000..cab6a20
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_22.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_22
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 22
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_23.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_23.yaml
new file mode 100644
index 0000000..11ec87a
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_23.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_23
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 23
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_24.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_24.yaml
new file mode 100644
index 0000000..b6927c7
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_24.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_24
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 24
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_25.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_25.yaml
new file mode 100644
index 0000000..e200a03
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_25.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_25
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 25
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_26.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_26.yaml
new file mode 100644
index 0000000..41da13c
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_26.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_26
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 26
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_27.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_27.yaml
new file mode 100644
index 0000000..96a6c3f
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_27.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_27
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 27
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_28.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_28.yaml
new file mode 100644
index 0000000..8b765f4
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_28.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_28
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 28
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_29.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_29.yaml
new file mode 100644
index 0000000..7e1014b
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_29.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_29
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 29
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_3.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_3.yaml
new file mode 100644
index 0000000..55af01d
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_3.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_3
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 3
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_30.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_30.yaml
new file mode 100644
index 0000000..d8d637d
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_30.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_30
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 30
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_31.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_31.yaml
new file mode 100644
index 0000000..e3748d7
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_31.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_31
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 31
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_32.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_32.yaml
new file mode 100644
index 0000000..3c32602
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_32.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_32
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 32
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_33.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_33.yaml
new file mode 100644
index 0000000..503a610
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_33.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_33
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 33
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_34.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_34.yaml
new file mode 100644
index 0000000..a592bbb
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_34.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_34
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 34
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_35.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_35.yaml
new file mode 100644
index 0000000..35de6b7
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_35.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_35
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 35
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_36.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_36.yaml
new file mode 100644
index 0000000..08e945c
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_36.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_36
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 36
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_37.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_37.yaml
new file mode 100644
index 0000000..401ecc7
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_37.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_37
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 37
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_38.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_38.yaml
new file mode 100644
index 0000000..b9196f3
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_38.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_38
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 38
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_39.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_39.yaml
new file mode 100644
index 0000000..fe78ca4
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_39.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_39
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 39
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_4.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_4.yaml
new file mode 100644
index 0000000..ba226b0
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_4.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_4
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 4
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_40.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_40.yaml
new file mode 100644
index 0000000..a1c6783
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_40.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_40
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 40
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_41.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_41.yaml
new file mode 100644
index 0000000..35d1895
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_41.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_41
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 41
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_42.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_42.yaml
new file mode 100644
index 0000000..c3e59fb
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_42.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_42
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 42
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_43.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_43.yaml
new file mode 100644
index 0000000..11be9d5
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_43.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_43
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 43
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_44.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_44.yaml
new file mode 100644
index 0000000..255bb98
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_44.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_44
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 44
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_45.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_45.yaml
new file mode 100644
index 0000000..759fbc7
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_45.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_45
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 45
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_46.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_46.yaml
new file mode 100644
index 0000000..d98b0e2
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_46.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_46
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 46
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_47.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_47.yaml
new file mode 100644
index 0000000..fa58adc
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_47.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_47
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 47
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_48.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_48.yaml
new file mode 100644
index 0000000..6ec744c
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_48.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_48
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 48
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_5.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_5.yaml
new file mode 100644
index 0000000..9080f73
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_5.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_5
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 5
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_6.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_6.yaml
new file mode 100644
index 0000000..40ba15a
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_6.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_6
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 6
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_7.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_7.yaml
new file mode 100644
index 0000000..417cf1f
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_7.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_7
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 7
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_8.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_8.yaml
new file mode 100644
index 0000000..03ad194
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_8.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_8
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 8
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/Geometric_Regression/layer_9.yaml b/configs/eval/gpt2-xl/Geometric_Regression/layer_9.yaml
new file mode 100644
index 0000000..8022900
--- /dev/null
+++ b/configs/eval/gpt2-xl/Geometric_Regression/layer_9.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_9
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 9
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_0.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_0.yaml
new file mode 100644
index 0000000..9e76a45
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_0.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_0
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 0
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_1.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_1.yaml
new file mode 100644
index 0000000..225db39
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_1.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_1
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 1
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_10.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_10.yaml
new file mode 100644
index 0000000..e2dbacf
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_10.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_10
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 10
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_11.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_11.yaml
new file mode 100644
index 0000000..26535d8
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_11.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_11
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 11
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_12.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_12.yaml
new file mode 100644
index 0000000..1a9f601
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_12.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_12
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 12
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_13.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_13.yaml
new file mode 100644
index 0000000..ff4da4e
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_13.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_13
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 13
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_14.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_14.yaml
new file mode 100644
index 0000000..cf62caa
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_14.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_14
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 14
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_15.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_15.yaml
new file mode 100644
index 0000000..8f343cf
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_15.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_15
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 15
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_16.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_16.yaml
new file mode 100644
index 0000000..8885397
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_16.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_16
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 16
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_17.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_17.yaml
new file mode 100644
index 0000000..2c997a5
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_17.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_17
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 17
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_18.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_18.yaml
new file mode 100644
index 0000000..be860c3
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_18.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_18
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 18
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_19.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_19.yaml
new file mode 100644
index 0000000..010963e
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_19.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_19
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 19
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_2.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_2.yaml
new file mode 100644
index 0000000..ddceb7e
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_2.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_2
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 2
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_20.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_20.yaml
new file mode 100644
index 0000000..7e1d5e0
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_20.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_20
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 20
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_21.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_21.yaml
new file mode 100644
index 0000000..93a7788
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_21.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_21
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 21
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_22.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_22.yaml
new file mode 100644
index 0000000..3aef5bd
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_22.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_22
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 22
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_23.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_23.yaml
new file mode 100644
index 0000000..fc6b2d2
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_23.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_23
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 23
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_24.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_24.yaml
new file mode 100644
index 0000000..d81523f
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_24.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_24
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 24
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_25.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_25.yaml
new file mode 100644
index 0000000..3b39709
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_25.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_25
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 25
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_26.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_26.yaml
new file mode 100644
index 0000000..f4b5700
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_26.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_26
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 26
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_27.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_27.yaml
new file mode 100644
index 0000000..815f992
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_27.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_27
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 27
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_28.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_28.yaml
new file mode 100644
index 0000000..9407945
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_28.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_28
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 28
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_29.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_29.yaml
new file mode 100644
index 0000000..b471109
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_29.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_29
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 29
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_3.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_3.yaml
new file mode 100644
index 0000000..3f64e85
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_3.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_3
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 3
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_30.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_30.yaml
new file mode 100644
index 0000000..7c0904d
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_30.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_30
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 30
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_31.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_31.yaml
new file mode 100644
index 0000000..6a6b4c8
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_31.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_31
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 31
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_32.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_32.yaml
new file mode 100644
index 0000000..6a8f0bb
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_32.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_32
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 32
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_33.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_33.yaml
new file mode 100644
index 0000000..078de38
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_33.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_33
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 33
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_34.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_34.yaml
new file mode 100644
index 0000000..6cff065
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_34.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_34
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 34
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_35.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_35.yaml
new file mode 100644
index 0000000..a7b347d
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_35.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_35
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 35
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_36.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_36.yaml
new file mode 100644
index 0000000..76a5da7
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_36.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_36
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 36
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_37.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_37.yaml
new file mode 100644
index 0000000..59cc796
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_37.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_37
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 37
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_38.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_38.yaml
new file mode 100644
index 0000000..bfe2cd5
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_38.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_38
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 38
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_39.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_39.yaml
new file mode 100644
index 0000000..d7c2e99
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_39.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_39
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 39
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_4.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_4.yaml
new file mode 100644
index 0000000..8c7d2df
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_4.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_4
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 4
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_40.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_40.yaml
new file mode 100644
index 0000000..9b08a65
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_40.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_40
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 40
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_41.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_41.yaml
new file mode 100644
index 0000000..f8c76df
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_41.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_41
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 41
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_42.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_42.yaml
new file mode 100644
index 0000000..fb1cbff
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_42.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_42
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 42
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_43.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_43.yaml
new file mode 100644
index 0000000..3511daa
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_43.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_43
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 43
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_44.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_44.yaml
new file mode 100644
index 0000000..8f1aa2a
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_44.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_44
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 44
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_45.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_45.yaml
new file mode 100644
index 0000000..6b71348
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_45.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_45
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 45
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_46.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_46.yaml
new file mode 100644
index 0000000..9f212e5
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_46.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_46
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 46
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_47.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_47.yaml
new file mode 100644
index 0000000..4855e42
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_47.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_47
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 47
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_48.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_48.yaml
new file mode 100644
index 0000000..2230640
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_48.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_48
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 48
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_5.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_5.yaml
new file mode 100644
index 0000000..3a4f8d8
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_5.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_5
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 5
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_6.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_6.yaml
new file mode 100644
index 0000000..befee60
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_6.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_6
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 6
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_7.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_7.yaml
new file mode 100644
index 0000000..cb52a9a
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_7.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_7
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 7
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_8.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_8.yaml
new file mode 100644
index 0000000..bbf3273
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_8.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_8
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 8
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2-xl/StackActionProbe/layer_9.yaml b/configs/eval/gpt2-xl/StackActionProbe/layer_9.yaml
new file mode 100644
index 0000000..a2a8d4a
--- /dev/null
+++ b/configs/eval/gpt2-xl/StackActionProbe/layer_9.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2-xl/
+  version: layer_9
+pretrained_model: gpt2-xl
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 9
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_0.yaml b/configs/eval/gpt2/AttentiveProbe/layer_0.yaml
new file mode 100644
index 0000000..67f41a0
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_0.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_0
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 0
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_1.yaml b/configs/eval/gpt2/AttentiveProbe/layer_1.yaml
new file mode 100644
index 0000000..1e341c0
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_1.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_1
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 1
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_10.yaml b/configs/eval/gpt2/AttentiveProbe/layer_10.yaml
new file mode 100644
index 0000000..e3e3f62
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_10.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_10
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 10
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_11.yaml b/configs/eval/gpt2/AttentiveProbe/layer_11.yaml
new file mode 100644
index 0000000..96aa6a3
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_11.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_11
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 11
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_12.yaml b/configs/eval/gpt2/AttentiveProbe/layer_12.yaml
new file mode 100644
index 0000000..c41030b
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_12.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_12
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 12
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_2.yaml b/configs/eval/gpt2/AttentiveProbe/layer_2.yaml
new file mode 100644
index 0000000..4f84463
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_2.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_2
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 2
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_3.yaml b/configs/eval/gpt2/AttentiveProbe/layer_3.yaml
new file mode 100644
index 0000000..0b71914
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_3.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_3
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 3
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_4.yaml b/configs/eval/gpt2/AttentiveProbe/layer_4.yaml
new file mode 100644
index 0000000..bf52f3e
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_4.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_4
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 4
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_5.yaml b/configs/eval/gpt2/AttentiveProbe/layer_5.yaml
new file mode 100644
index 0000000..21cf784
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_5.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_5
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 5
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_6.yaml b/configs/eval/gpt2/AttentiveProbe/layer_6.yaml
new file mode 100644
index 0000000..83b0ba0
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_6.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_6
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 6
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_7.yaml b/configs/eval/gpt2/AttentiveProbe/layer_7.yaml
new file mode 100644
index 0000000..5f1b1e1
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_7.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_7
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 7
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_8.yaml b/configs/eval/gpt2/AttentiveProbe/layer_8.yaml
new file mode 100644
index 0000000..292104c
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_8.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_8
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 8
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2/AttentiveProbe/layer_9.yaml b/configs/eval/gpt2/AttentiveProbe/layer_9.yaml
new file mode 100644
index 0000000..27a1c59
--- /dev/null
+++ b/configs/eval/gpt2/AttentiveProbe/layer_9.yaml
@@ -0,0 +1,60 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 30
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 30
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_9
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  continuous: true
+  data_sources:
+  - action_ids
+  - continuous_action_masks
+  - gold_tuples
+  emb_size: 100
+  embeddings_dropout_rate: 0
+  layer: 9
+  layer_dropout_rate: 0.2
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: AttentiveProbe
+  probe_type: AttentiveProbe
+  reverse: true
+  rnn_type: GRU
+  state_size: 100
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_0.yaml b/configs/eval/gpt2/Geometric_Action/layer_0.yaml
new file mode 100644
index 0000000..bef272f
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_0.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_0
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_0/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 0
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_1.yaml b/configs/eval/gpt2/Geometric_Action/layer_1.yaml
new file mode 100644
index 0000000..aff494a
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_1.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_1
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_1/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 1
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_10.yaml b/configs/eval/gpt2/Geometric_Action/layer_10.yaml
new file mode 100644
index 0000000..b3ad87b
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_10.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_10
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_10/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 10
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_11.yaml b/configs/eval/gpt2/Geometric_Action/layer_11.yaml
new file mode 100644
index 0000000..c2fc235
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_11.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_11
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_11/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 11
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_12.yaml b/configs/eval/gpt2/Geometric_Action/layer_12.yaml
new file mode 100644
index 0000000..7e6befe
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_12.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_12
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_12/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 12
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_2.yaml b/configs/eval/gpt2/Geometric_Action/layer_2.yaml
new file mode 100644
index 0000000..e420bc8
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_2.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_2
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_2/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 2
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_3.yaml b/configs/eval/gpt2/Geometric_Action/layer_3.yaml
new file mode 100644
index 0000000..ad8532d
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_3.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_3
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_3/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 3
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_4.yaml b/configs/eval/gpt2/Geometric_Action/layer_4.yaml
new file mode 100644
index 0000000..459531f
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_4.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_4
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_4/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 4
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_5.yaml b/configs/eval/gpt2/Geometric_Action/layer_5.yaml
new file mode 100644
index 0000000..86e5373
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_5.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_5
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_5/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 5
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_6.yaml b/configs/eval/gpt2/Geometric_Action/layer_6.yaml
new file mode 100644
index 0000000..7f93343
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_6.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_6
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_6/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 6
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_7.yaml b/configs/eval/gpt2/Geometric_Action/layer_7.yaml
new file mode 100644
index 0000000..8ca0bb7
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_7.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_7
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_7/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 7
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_8.yaml b/configs/eval/gpt2/Geometric_Action/layer_8.yaml
new file mode 100644
index 0000000..455792e
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_8.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_8
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_8/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 8
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Action/layer_9.yaml b/configs/eval/gpt2/Geometric_Action/layer_9.yaml
new file mode 100644
index 0000000..a961423
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Action/layer_9.yaml
@@ -0,0 +1,63 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 1.0e-05
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_9
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: experiment_checkpoints/eval/gpt2/Geometric_Action/layer_9/checkpoints/last.ckpt
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 9
+  layer_dropout_rate: 0.2
+  loss_types:
+  - oracle_action_nll
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Action
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_0.yaml b/configs/eval/gpt2/Geometric_Regression/layer_0.yaml
new file mode 100644
index 0000000..73ad1de
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_0.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_0
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 0
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_1.yaml b/configs/eval/gpt2/Geometric_Regression/layer_1.yaml
new file mode 100644
index 0000000..8d1daf1
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_1.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_1
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 1
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_10.yaml b/configs/eval/gpt2/Geometric_Regression/layer_10.yaml
new file mode 100644
index 0000000..fc3e287
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_10.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_10
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 10
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_11.yaml b/configs/eval/gpt2/Geometric_Regression/layer_11.yaml
new file mode 100644
index 0000000..f03c1e8
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_11.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_11
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 11
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_12.yaml b/configs/eval/gpt2/Geometric_Regression/layer_12.yaml
new file mode 100644
index 0000000..1f85125
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_12.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_12
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 12
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_2.yaml b/configs/eval/gpt2/Geometric_Regression/layer_2.yaml
new file mode 100644
index 0000000..25e3ca1
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_2.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_2
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 2
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_3.yaml b/configs/eval/gpt2/Geometric_Regression/layer_3.yaml
new file mode 100644
index 0000000..dc031d0
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_3.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_3
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 3
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_4.yaml b/configs/eval/gpt2/Geometric_Regression/layer_4.yaml
new file mode 100644
index 0000000..f497efe
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_4.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_4
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 4
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_5.yaml b/configs/eval/gpt2/Geometric_Regression/layer_5.yaml
new file mode 100644
index 0000000..4c94c92
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_5.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_5
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 5
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_6.yaml b/configs/eval/gpt2/Geometric_Regression/layer_6.yaml
new file mode 100644
index 0000000..e509aeb
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_6.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_6
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 6
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 0
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_7.yaml b/configs/eval/gpt2/Geometric_Regression/layer_7.yaml
new file mode 100644
index 0000000..b976222
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_7.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_7
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 7
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_8.yaml b/configs/eval/gpt2/Geometric_Regression/layer_8.yaml
new file mode 100644
index 0000000..3000333
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_8.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_8
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 8
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2/Geometric_Regression/layer_9.yaml b/configs/eval/gpt2/Geometric_Regression/layer_9.yaml
new file mode 100644
index 0000000..895e277
--- /dev/null
+++ b/configs/eval/gpt2/Geometric_Regression/layer_9.yaml
@@ -0,0 +1,64 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_9
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: null
+  data_sources:
+  - action_ids
+  - gold_tuples
+  - gold_distances
+  - gold_depths
+  - xpos
+  embeddings_dropout_rate: 0
+  layer: 9
+  layer_dropout_rate: 0.2
+  loss_types:
+  - distance_mse
+  - depth_mse
+  num_layers: 1
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: Geometric_Regression
+  probe_type: GeometricProbe
+  temp: 0.1
+  threshold: 1.5
+  verbose: false
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 3
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_0.yaml b/configs/eval/gpt2/StackActionProbe/layer_0.yaml
new file mode 100644
index 0000000..760e020
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_0.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_0
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 0
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_1.yaml b/configs/eval/gpt2/StackActionProbe/layer_1.yaml
new file mode 100644
index 0000000..2b8815c
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_1.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_1
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 1
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_10.yaml b/configs/eval/gpt2/StackActionProbe/layer_10.yaml
new file mode 100644
index 0000000..089d9b7
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_10.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_10
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 10
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 9
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_11.yaml b/configs/eval/gpt2/StackActionProbe/layer_11.yaml
new file mode 100644
index 0000000..6c8cc3f
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_11.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_11
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 11
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 7
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_12.yaml b/configs/eval/gpt2/StackActionProbe/layer_12.yaml
new file mode 100644
index 0000000..cc4385e
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_12.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_12
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 12
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_2.yaml b/configs/eval/gpt2/StackActionProbe/layer_2.yaml
new file mode 100644
index 0000000..b38d540
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_2.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_2
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 2
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 4
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_3.yaml b/configs/eval/gpt2/StackActionProbe/layer_3.yaml
new file mode 100644
index 0000000..8574323
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_3.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_3
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 3
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_4.yaml b/configs/eval/gpt2/StackActionProbe/layer_4.yaml
new file mode 100644
index 0000000..a1bcad5
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_4.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_4
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 4
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_5.yaml b/configs/eval/gpt2/StackActionProbe/layer_5.yaml
new file mode 100644
index 0000000..f3341a2
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_5.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_5
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 5
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 5
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_6.yaml b/configs/eval/gpt2/StackActionProbe/layer_6.yaml
new file mode 100644
index 0000000..507782d
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_6.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_6
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 6
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 1
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_7.yaml b/configs/eval/gpt2/StackActionProbe/layer_7.yaml
new file mode 100644
index 0000000..b17a8e1
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_7.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_7
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 7
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 2
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_8.yaml b/configs/eval/gpt2/StackActionProbe/layer_8.yaml
new file mode 100644
index 0000000..c43f258
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_8.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_8
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 8
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 6
+  max_epochs: 25
diff --git a/configs/eval/gpt2/StackActionProbe/layer_9.yaml b/configs/eval/gpt2/StackActionProbe/layer_9.yaml
new file mode 100644
index 0000000..9c3853d
--- /dev/null
+++ b/configs/eval/gpt2/StackActionProbe/layer_9.yaml
@@ -0,0 +1,55 @@
+cuda: true
+data_params:
+  action_ngram_pad: 40
+  action_pad: 400
+  num_workers: 4
+  pin_memory: false
+  root_dir: data
+  test:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  token_pad: 150
+  train:
+    batch_size: 15
+    dry_run: false
+    shuffle: true
+  valid:
+    batch_size: 15
+    dry_run: false
+    shuffle: false
+device: cuda
+exp_params:
+  manual_seed: 1265
+  optimizer_params:
+    lr: 0.001
+  optimizer_type: Adam
+  scheduler_params:
+    factor: 0.1
+    mode: min
+    patience: 0
+  scheduler_type: ReduceLROnPlateau
+logging_params:
+  save_dir: ./experiment_checkpoints/eval/gpt2/
+  version: layer_9
+pretrained_model: gpt2
+probe_params:
+  add_root: true
+  checkpoint_path: false
+  data_sources:
+  - action_ids
+  - gold_tuples
+  embeddings_dropout_rate: 0
+  layer: 9
+  layer_dropout_rate: 0.2
+  num_layers: 3
+  oracle_params:
+    mappings_file: data/mappings-ptb.txt
+    name: ArcStandard
+  probe_name: StackActionProbe
+  probe_type: StackActionProbe
+trainer_params:
+  accumulate_grad_batches: 1
+  gpus:
+  - 8
+  max_epochs: 25
diff --git a/data/mappings-ptb.txt b/data/mappings-ptb.txt
new file mode 100644
index 0000000..c6dd34f
--- /dev/null
+++ b/data/mappings-ptb.txt
@@ -0,0 +1,109 @@
+::rel
+acomp
+advcl
+advmod
+amod
+appos
+aux
+auxpass
+cc
+ccomp
+conj
+cop
+csubj
+csubjpass
+dep
+det
+discourse
+dobj
+expl
+iobj
+mark
+mwe
+neg
+nn
+npadvmod
+nsubj
+nsubjpass
+num
+number
+parataxis
+pcomp
+pobj
+poss
+possessive
+preconj
+predet
+prep
+prt
+punct
+quantmod
+rcmod
+root
+tmod
+vmod
+xcomp
+::pos
+ADJ
+ADP
+ADV
+AUX
+CCONJ
+DET
+INTJ
+NOUN
+NUM
+PART
+PRON
+PROPN
+PUNCT
+SCONJ
+SYM
+VERB
+X
+::fpos
+#
+$
+''
+,
+-LRB-
+-RRB-
+.
+:
+CC
+CD
+DT
+EX
+FW
+IN
+JJ
+JJR
+JJS
+LS
+MD
+NN
+NNP
+NNPS
+NNS
+PDT
+POS
+PRP
+PRP$
+RB
+RBR
+RBS
+RP
+SYM
+TO
+UH
+VB
+VBD
+VBG
+VBN
+VBP
+VBZ
+WDT
+WP
+WP$
+WRB
+``
diff --git a/data/npz_experiment/README.md b/data/npz_experiment/README.md
new file mode 100644
index 0000000..2b1e496
--- /dev/null
+++ b/data/npz_experiment/README.md
@@ -0,0 +1 @@
+<!-- read me for the npz dataset -->
\ No newline at end of file
diff --git a/data/npz_experiment/npz.csv b/data/npz_experiment/npz.csv
new file mode 100644
index 0000000..d95c917
--- /dev/null
+++ b/data/npz_experiment/npz.csv
@@ -0,0 +1,53 @@
+prefix,continuation_1,head_idxs_1,continuation_2,head_idxs_2,both,neither,transitive
+Although the band left the party,I stayed .,"[4,3,4,8,6,4,8,0]",went on .,"[4,3,4,7,6,7,0,7]",raged on for,.,True
+As the criminal shot the woman,I shouted .,"[4,3,4,8,6,4,8,0]",fell down .,"[4,3,4,7,6,7,0,7]",on the porch,.,True
+When the dog bit the doctor,I laughed .,"[4,3,4,8,6,4,8,0]",ran away .,"[4,3,4,7,6,7,0,7]",who was walking,.,True
+As the ship crossed the waters,we slept .,"[4,3,4,8,6,4,8,0]",remained calm .,"[4,3,4,7,6,7,0,7]",which were calm,.,True
+After the newcomers asked the soldiers,we marched .,"[4,3,4,8,6,4,8,0]",shared food .,"[4,3,4,7,6,7,0,7]",in the camp,.,True
+Though the athlete telephoned the coach,we practiced .,"[4,3,4,8,6,4,8,0]",rejected him .,"[4,3,4,7,6,7,0,7]",of the team,.,True
+While the crowd applauded the actor,I left .,"[4,3,4,8,6,4,8,0]",sat down .,"[4,3,4,7,6,7,0,7]",who danced on,.,True
+While the audience cheered the actor,we left .,"[4,3,4,8,6,4,8,0]",continued performing .,"[4,3,4,7,6,7,0,7]",who sang the,.,True
+While the students ate the food,I starved .,"[4,3,4,8,6,4,8,0]",became cold .,"[4,3,4,7,6,7,0,7]",from the grocery,.,True
+When the professor taught the visitors,I listened .,"[4,3,4,8,6,4,8,0]",lost interest .,"[4,3,4,7,6,7,0,7]",who were in,.,True
+Because the baby grabbed the woman,we cheered .,"[4,3,4,8,6,4,8,0]",stayed longer .,"[4,3,4,7,6,7,0,7]",who was near,.,True
+Because the manager applauded the comedian,we laughed .,"[4,3,4,8,6,4,8,0]",spoke more .,"[4,3,4,7,6,7,0,7]",who just performed,.,True
+After the patient asked the nurse,I cried .,"[4,3,4,8,6,4,8,0]",told her .,"[4,3,4,7,6,7,0,7]",in the hospital,.,True
+After the guard visited the children,we visited .,"[4,3,4,8,6,4,8,0]",played more .,"[4,3,4,7,6,7,0,7]",who were playing,.,True
+After the dog signaled the farmer,I worked .,"[4,3,4,8,6,4,8,0]",went out .,"[4,3,4,7,6,7,0,7]",who was focused,.,True
+As the students considered the teacher,I arrived .,"[4,3,4,8,6,4,8,0]",asked questions .,"[4,3,4,7,6,7,0,7]",who was speaking,.,True
+While the woman decorated the pot,we watched .,"[4,3,4,8,6,4,8,0]",sat still .,"[4,3,4,7,6,7,0,7]",which was made,.,True
+When the customer interrupted the manager,we watched .,"[4,3,4,8,6,4,8,0]",responded calmly .,"[4,3,4,7,6,7,0,7]",of the store,.,True
+As the man wiped the pipe,I watched .,"[4,3,4,8,6,4,8,0]",blew smoke .,"[4,3,4,7,6,7,0,7]",made of wood,.,True
+While the prisoners watched the guards,we escaped .,"[4,3,4,8,6,4,8,0]",threatened them .,"[4,3,4,7,6,7,0,7]",who were working,.,True
+While the king governed his subjects,I left .,"[4,3,4,8,6,4,8,0]",feared him .,"[4,3,4,7,6,7,0,7]",who were loyal,.,True
+After the host interrupted the discussion,I spoke .,"[4,3,4,8,6,4,8,0]",became interesting .,"[4,3,4,7,6,7,0,7]",about the show,.,True
+After the woman dressed her children,we departed .,"[4,3,4,8,6,4,8,0]",played games .,"[4,3,4,7,6,7,0,7]",who were waiting,.,True
+After the soldier signaled the doctor,I helped .,"[4,3,4,8,6,4,8,0]",stopped operating .,"[4,3,4,7,6,7,0,7]",in the hospital,.,True
+As the guards stopped the thieves,we relaxed .,"[4,3,4,8,6,4,8,0]",stole more .,"[4,3,4,7,6,7,0,7]",who were running,.,True
+
+Although the band performed the party,I stayed .,"[4,3,4,8,6,4,8,0]",went on .,"[4,3,4,7,6,7,0,7]",raged on for,.,False
+As the criminal fled the woman,I shouted .,"[4,3,4,8,6,4,8,0]",fell down .,"[4,3,4,7,6,7,0,7]",on the porch,.,False
+When the dog struggled the doctor,I laughed .,"[4,3,4,8,6,4,8,0]",ran away .,"[4,3,4,7,6,7,0,7]",who was walking,.,False
+As the ship drifted the waters,we slept .,"[4,3,4,8,6,4,8,0]",remained calm .,"[4,3,4,7,6,7,0,7]",which were calm,.,False
+After the newcomers negotiated the soldiers,we marched .,"[4,3,4,8,6,4,8,0]",shared food .,"[4,3,4,7,6,7,0,7]",in the camp,.,False
+Though the athlete complained the coach,we practiced .,"[4,3,4,8,6,4,8,0]",rejected him .,"[4,3,4,7,6,7,0,7]",of the team,.,False
+While the crowd yelled the actor,I left .,"[4,3,4,8,6,4,8,0]",sat down .,"[4,3,4,7,6,7,0,7]",who danced on,.,False
+While the audience arrived the actor,we left .,"[4,3,4,8,6,4,8,0]",continued performing .,"[4,3,4,7,6,7,0,7]",who sang the,.,False
+While the students talked the food,I starved .,"[4,3,4,8,6,4,8,0]",became cold .,"[4,3,4,7,6,7,0,7]",from the grocery,.,False
+When the professor talked the visitors,I listened .,"[4,3,4,8,6,4,8,0]",lost interest .,"[4,3,4,7,6,7,0,7]",who were in,.,False
+Because the baby yelled the woman,we cheered .,"[4,3,4,8,6,4,8,0]",stayed longer .,"[4,3,4,7,6,7,0,7]",who was near,.,False
+Because the manager departed the comedian,we laughed .,"[4,3,4,8,6,4,8,0]",spoke more .,"[4,3,4,7,6,7,0,7]",who just performed,.,False
+After the patient arrived the nurse,I cried .,"[4,3,4,8,6,4,8,0]",told her .,"[4,3,4,7,6,7,0,7]",in the hospital,.,False
+After the guard departed the children,we visited .,"[4,3,4,8,6,4,8,0]",played more .,"[4,3,4,7,6,7,0,7]",who were playing,.,False
+After the dog struggled the farmer,I worked .,"[4,3,4,8,6,4,8,0]",went out .,"[4,3,4,7,6,7,0,7]",who was focused,.,False
+As the students slept the teacher,I arrived .,"[4,3,4,8,6,4,8,0]",asked questions .,"[4,3,4,7,6,7,0,7]",who was speaking,.,False
+While the woman slept the pot,we watched .,"[4,3,4,8,6,4,8,0]",sat still .,"[4,3,4,7,6,7,0,7]",which was made,.,False
+When the customer complained the manager,we watched .,"[4,3,4,8,6,4,8,0]",responded calmly .,"[4,3,4,7,6,7,0,7]",of the store,.,False
+As the man slept the pipe,I watched .,"[4,3,4,8,6,4,8,0]",blew smoke .,"[4,3,4,7,6,7,0,7]",made of wood,.,False
+
+While the prisoners looked the guards,we escaped .,"[4,3,4,8,6,4,8,0]",threatened them .,"[4,3,4,7,6,7,0,7]",who were working,.,False
+While the king yelled his subjects,I left .,"[4,3,4,8,6,4,8,0]",feared him .,"[4,3,4,7,6,7,0,7]",who were loyal,.,False
+After the host yelled the discussion,I spoke .,"[4,3,4,8,6,4,8,0]",became interesting .,"[4,3,4,7,6,7,0,7]",about the show,.,False
+After the woman departed her children,we departed .,"[4,3,4,8,6,4,8,0]",played games .,"[4,3,4,7,6,7,0,7]",who were waiting,.,False
+After the soldier complained the doctor,I helped .,"[4,3,4,8,6,4,8,0]",stopped operating .,"[4,3,4,7,6,7,0,7]",in the hospital,.,False
+As the guards slept the thieves,we relaxed .,"[4,3,4,8,6,4,8,0]",stole more .,"[4,3,4,7,6,7,0,7]",who were running,.,False
\ No newline at end of file
diff --git a/data/test.json b/data/test.json
new file mode 100644
index 0000000..1d01c1d
--- /dev/null
+++ b/data/test.json
@@ -0,0 +1,20 @@
+{"orig_tokens": ["No", ",", "it", "was", "n't", "Black", "Monday", "."], "tokens": ["<cunk>", ",", "it", "was", "n't", "<cunk>", "<cunk>", "."], "token_ids": [7, 45, 71, 56, 126, 7, 7, 62], "tags": ["RB", ",", "PRP", "VBD", "RB", "NNP", "NNP", "."], "tree_str": "(S (INTJ (RB No)) (, ,) (NP (PRP it)) (VP (VBD was) (RB n't) (NP (NNP Black) (NNP Monday))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [4, 3, 2, 1, 0], [5, 4, 3, 2, 1, 0], [6, 5, 4, 3, 2, 1, 0], [7, 6, 5, 4, 3, 2, 1, 0], [7, 5, 4, 3, 2, 1, 0], [7, 4, 3, 2, 1, 0], [7, 3, 2, 1, 0], [7, 2, 1, 0], [7, 1, 0], [7, 0], [8, 7, 0], [7, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8], [2, 3, 4, 5, 6, 7, 8], [3, 4, 5, 6, 7, 8], [4, 5, 6, 7, 8], [5, 6, 7, 8], [6, 7, 8], [7, 8], [8], [8], [8], [8], [8], [8], [8], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 21], [1, 10], [1, 24], [1, 37], [1, 15], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [0, 3, 2], [0, 4, 3], [0, 5, 4], [0, 6, 5], [1, 7, 6], [1, 7, 5], [1, 7, 4], [1, 7, 3], [1, 7, 2], [1, 7, 1], [0, 7, 0], [2, 8, 7], [2, 7, 0]]}}
+{"orig_tokens": ["But", "while", "the", "New", "York", "Stock", "Exchange", "did", "n't", "fall", "apart", "Friday", "as", "the", "Dow", "Jones", "Industrial", "Average", "plunged", "190.58", "points", "--", "most", "of", "it", "in", "the", "final", "hour", "--", "it", "barely", "managed", "to", "stay", "this", "side", "of", "chaos", "."], "tokens": ["But", "<unk>", "the", "<cunk>", "<cunk>", "<cunk>", "Exchange", "did", "n't", "<unk>", "<unk>", "<cunk>", "as", "the", "<cunk>", "<cunks>", "<unk>", "<cunk>", "<unked>", "<unkn>", "<unks>", "--", "<unk>", "of", "it", "in", "the", "<unkal>", "<unk>", "--", "it", "<unkly>", "<unked>", "to", "<unk>", "this", "<unk>", "of", "<unks>", "."], "token_ids": [266, 16, 40, 7, 7, 7, 143, 255, 126, 16, 16, 7, 168, 40, 7, 5, 16, 7, 3, 10, 6, 216, 16, 26, 71, 42, 40, 20, 16, 216, 71, 2, 3, 59, 16, 185, 16, 26, 6, 62], "tags": ["CC", "IN", "DT", "NNP", "NNP", "NNP", "NNP", "VBD", "RB", "VB", "RB", "NNP", "IN", "DT", "NNP", "NNP", "NNP", "NNP", "VBD", "CD", "NNS", ":", "JJS", "IN", "PRP", "IN", "DT", "JJ", "NN", ":", "PRP", "RB", "VBD", "TO", "VB", "DT", "NN", "IN", "NN", "."], "tree_str": "(S (CC But) (SBAR (IN while) (S (NP (DT the) (NNP New) (NNP York) (NNP Stock) (NNP Exchange)) (VP (VBD did) (RB n't) (VP (VB fall) (ADVP (RB apart)) (NP (NNP Friday)) (SBAR (IN as) (S (NP (DT the) (NNP Dow) (NNP Jones) (NNP Industrial) (NNP Average)) (VP (VBD plunged) (NP (NP (CD 190.58) (NNS points)) (PRN (: --) (NP (NP (JJS most)) (PP (IN of) (NP (PRP it))) (PP (IN in) (NP (DT the) (JJ final) (NN hour)))) (: --)))))))))) (NP (PRP it)) (ADVP (RB barely)) (VP (VBD managed) (S (VP (TO to) (VP (VB stay) (NP (NP (DT this) (NN side)) (PP (IN of) (NP (NN chaos)))))))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [4, 3, 2, 1, 0], [5, 4, 3, 2, 1, 0], [6, 5, 4, 3, 2, 1, 0], [7, 6, 5, 4, 3, 2, 1, 0], [7, 5, 4, 3, 2, 1, 0], [7, 4, 3, 2, 1, 0], [7, 3, 2, 1, 0], [7, 2, 1, 0], [8, 7, 2, 1, 0], [9, 8, 7, 2, 1, 0], [10, 9, 8, 7, 2, 1, 0], [10, 8, 7, 2, 1, 0], [10, 7, 2, 1, 0], [10, 2, 1, 0], [10, 1, 0], [11, 10, 1, 0], [10, 1, 0], [12, 10, 1, 0], [10, 1, 0], [13, 10, 1, 0], [14, 13, 10, 1, 0], [15, 14, 13, 10, 1, 0], [16, 15, 14, 13, 10, 1, 0], [17, 16, 15, 14, 13, 10, 1, 0], [18, 17, 16, 15, 14, 13, 10, 1, 0], [18, 16, 15, 14, 13, 10, 1, 0], [18, 15, 14, 13, 10, 1, 0], [18, 14, 13, 10, 1, 0], [18, 13, 10, 1, 0], [19, 18, 13, 10, 1, 0], [19, 13, 10, 1, 0], [19, 10, 1, 0], [20, 19, 10, 1, 0], [21, 20, 19, 10, 1, 0], [21, 19, 10, 1, 0], [22, 21, 19, 10, 1, 0], [23, 22, 21, 19, 10, 1, 0], [23, 21, 19, 10, 1, 0], [24, 23, 21, 19, 10, 1, 0], [25, 24, 23, 21, 19, 10, 1, 0], [24, 23, 21, 19, 10, 1, 0], [23, 21, 19, 10, 1, 0], [26, 23, 21, 19, 10, 1, 0], [27, 26, 23, 21, 19, 10, 1, 0], [28, 27, 26, 23, 21, 19, 10, 1, 0], [29, 28, 27, 26, 23, 21, 19, 10, 1, 0], [29, 27, 26, 23, 21, 19, 10, 1, 0], [29, 26, 23, 21, 19, 10, 1, 0], [26, 23, 21, 19, 10, 1, 0], [23, 21, 19, 10, 1, 0], [30, 23, 21, 19, 10, 1, 0], [23, 21, 19, 10, 1, 0], [21, 19, 10, 1, 0], [19, 10, 1, 0], [10, 1, 0], [31, 10, 1, 0], [32, 31, 10, 1, 0], [33, 32, 31, 10, 1, 0], [33, 31, 10, 1, 0], [33, 10, 1, 0], [33, 1, 0], [33, 0], [34, 33, 0], [35, 34, 33, 0], [35, 33, 0], [36, 35, 33, 0], [37, 36, 35, 33, 0], [37, 35, 33, 0], [38, 37, 35, 33, 0], [39, 38, 37, 35, 33, 0], [38, 37, 35, 33, 0], [37, 35, 33, 0], [35, 33, 0], [33, 0], [40, 33, 0], [33, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 
32, 33, 34, 35, 36, 37, 38, 39, 40], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [21, 22, 23, 24, 25, 26, 27, 28, 29, 
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [32, 33, 34, 35, 36, 37, 38, 39, 40], [33, 34, 35, 36, 37, 38, 39, 40], [34, 35, 36, 37, 38, 39, 40], [34, 35, 36, 37, 38, 39, 40], [34, 35, 36, 37, 38, 39, 40], [34, 35, 36, 37, 38, 39, 40], [34, 35, 36, 37, 38, 39, 40], [35, 36, 37, 38, 39, 40], [36, 37, 38, 39, 40], [36, 37, 38, 39, 40], [37, 38, 39, 40], [38, 39, 40], [38, 39, 40], [39, 40], [40], [40], [40], [40], [40], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 22], [1, 14], [0, -1], [0, -1], [0, -1], [1, 21], [1, 5], [1, 24], [1, 19], [0, -1], [2, 2], [0, -1], [2, 41], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 22], [1, 14], [0, -1], [1, 24], [1, 19], [0, -1], [0, 
-1], [1, 26], [0, -1], [0, -1], [1, 37], [0, -1], [0, -1], [2, 30], [2, 35], [0, -1], [0, -1], [0, -1], [0, -1], [1, 3], [1, 14], [2, 30], [2, 35], [0, -1], [2, 37], [2, 13], [2, 16], [2, 1], [0, -1], [0, -1], [0, -1], [1, 2], [1, 24], [1, 1], [1, 7], [0, -1], [0, -1], [1, 5], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [2, 30], [2, 35], [2, 16], [2, 43], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [0, 3, 2], [0, 4, 3], [0, 5, 4], [0, 6, 5], [1, 7, 6], [1, 7, 5], [1, 7, 4], [1, 7, 3], [0, 7, 2], [0, 8, 7], [0, 9, 8], [1, 10, 9], [1, 10, 8], [1, 10, 7], [1, 10, 2], [0, 10, 1], [2, 11, 10], [0, 10, 1], [2, 12, 10], [0, 10, 1], [0, 13, 10], [0, 14, 13], [0, 15, 14], [0, 16, 15], [0, 17, 16], [1, 18, 17], [1, 18, 16], [1, 18, 15], [1, 18, 14], [0, 18, 13], [1, 19, 18], [1, 19, 13], [0, 19, 10], [0, 20, 19], [1, 21, 20], [0, 21, 19], [0, 22, 21], [1, 23, 22], [0, 23, 21], [0, 24, 23], [2, 25, 24], [2, 24, 23], [0, 23, 21], [0, 26, 23], [0, 27, 26], [0, 28, 27], [1, 29, 28], [1, 29, 27], [2, 29, 26], [2, 26, 23], [0, 23, 21], [2, 30, 23], [2, 23, 21], [2, 21, 19], [2, 19, 10], [0, 10, 1], [0, 31, 10], [0, 32, 31], [1, 33, 32], [1, 33, 31], [1, 33, 10], [1, 33, 1], [0, 33, 0], [0, 34, 33], [1, 35, 34], [0, 35, 33], [0, 36, 35], [1, 37, 36], [0, 37, 35], [0, 38, 37], [2, 39, 38], [2, 38, 37], [2, 37, 35], [2, 35, 33], [0, 33, 0], [2, 40, 33], [2, 33, 0]]}}
+{"orig_tokens": ["Some", "``", "circuit", "breakers", "''", "installed", "after", "the", "October", "1987", "crash", "failed", "their", "first", "test", ",", "traders", "say", ",", "unable", "to", "cool", "the", "selling", "panic", "in", "both", "stocks", "and", "futures", "."], "tokens": ["<cunk>", "``", "<unk>", "<unks>", "''", "<unked>", "after", "the", "<cunkER>", "1987", "crash", "<unked>", "their", "<unk>", "<unk>", ",", "<unks>", "<unk>", ",", "<ununk>", "to", "<unk>", "the", "<unking>", "<unk>", "in", "<unk>", "stocks", "and", "<unks>", "."], "token_ids": [7, 27, 16, 6, 30, 3, 264, 40, 19, 213, 215, 3, 147, 16, 16, 45, 6, 16, 45, 4, 59, 16, 40, 17, 16, 42, 16, 231, 92, 6, 62], "tags": ["DT", "``", "NN", "NNS", "''", "VBN", "IN", "DT", "NNP", "CD", "NN", "VBD", "PRP$", "JJ", "NN", ",", "NNS", "VBP", ",", "JJ", "TO", "VB", "DT", "NN", "NN", "IN", "DT", "NNS", "CC", "NNS", "."], "tree_str": "(S (NP (NP (DT Some) (`` ``) (NN circuit) (NNS breakers) ('' '')) (VP (VBN installed) (PP (IN after) (NP (DT the) (NNP October) (CD 1987) (NN crash))))) (VP (VBD failed) (NP (PRP$ their) (JJ first) (NN test)) (PRN (, ,) (S (NP (NNS traders)) (VP (VBP say))) (, ,)) (S (ADJP (JJ unable) (S (VP (TO to) (VP (VB cool) (NP (NP (DT the) (NN selling) (NN panic)) (PP (IN in) (NP (DT both) (NNS stocks) (CC and) (NNS futures)))))))))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [4, 3, 2, 1, 0], [4, 2, 1, 0], [4, 1, 0], [4, 0], [5, 4, 0], [4, 0], [6, 4, 0], [7, 6, 4, 0], [8, 7, 6, 4, 0], [9, 8, 7, 6, 4, 0], [10, 9, 8, 7, 6, 4, 0], [11, 10, 9, 8, 7, 6, 4, 0], [11, 9, 8, 7, 6, 4, 0], [11, 8, 7, 6, 4, 0], [11, 7, 6, 4, 0], [7, 6, 4, 0], [6, 4, 0], [4, 0], [12, 4, 0], [12, 0], [13, 12, 0], [14, 13, 12, 0], [15, 14, 13, 12, 0], [15, 13, 12, 0], [15, 12, 0], [12, 0], [16, 12, 0], [17, 16, 12, 0], [18, 17, 16, 12, 0], [18, 16, 12, 0], [18, 12, 0], [19, 18, 12, 0], [18, 12, 0], [12, 0], [20, 12, 0], [21, 20, 12, 0], [22, 21, 20, 12, 0], [22, 20, 12, 0], [23, 22, 20, 12, 0], [24, 23, 22, 20, 12, 0], [25, 24, 23, 22, 20, 12, 0], [25, 23, 22, 20, 12, 0], [25, 22, 20, 12, 0], [26, 25, 22, 20, 12, 0], [27, 26, 25, 22, 20, 12, 0], [28, 27, 26, 25, 22, 20, 12, 0], [28, 26, 25, 22, 20, 12, 0], [29, 28, 26, 25, 22, 20, 12, 0], [28, 26, 25, 22, 20, 12, 0], [30, 28, 26, 25, 22, 20, 12, 0], [28, 26, 25, 22, 20, 12, 0], [26, 25, 22, 20, 12, 0], [25, 22, 20, 12, 0], [22, 20, 12, 0], [20, 12, 0], [12, 0], [31, 12, 0], [12, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [19, 20, 21, 22, 23, 24, 
25, 26, 27, 28, 29, 30, 31], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [23, 24, 25, 26, 27, 28, 29, 30, 31], [23, 24, 25, 26, 27, 28, 29, 30, 31], [24, 25, 26, 27, 28, 29, 30, 31], [25, 26, 27, 28, 29, 30, 31], [26, 27, 28, 29, 30, 31], [26, 27, 28, 29, 30, 31], [26, 27, 28, 29, 30, 31], [27, 28, 29, 30, 31], [28, 29, 30, 31], [29, 30, 31], [29, 30, 31], [30, 31], [30, 31], [31], [31], [31], [31], [31], [31], [31], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 37], [1, 14], [0, -1], [2, 37], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 26], [1, 22], [1, 14], [2, 30], [2, 35], [2, 42], [0, -1], [1, 24], [0, -1], [0, -1], [0, -1], [1, 3], [1, 31], [2, 16], [0, -1], [0, -1], [0, -1], [1, 24], [1, 37], [0, -1], [2, 37], [2, 28], [0, -1], [0, -1], [0, -1], [1, 5], [0, -1], [0, -1], [0, -1], [1, 22], [1, 14], [0, -1], [0, -1], [0, -1], [1, 33], [0, -1], [2, 7], [0, -1], [2, 9], [2, 30], [2, 35], [2, 16], [2, 43], [2, 13], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [0, 3, 2], [1, 4, 3], [1, 4, 2], [1, 4, 1], [0, 4, 0], [2, 5, 4], [0, 4, 0], [0, 6, 4], [0, 7, 6], [0, 8, 7], [0, 9, 8], [0, 10, 9], [1, 11, 10], [1, 11, 9], [1, 11, 8], [2, 11, 7], [2, 7, 6], [2, 6, 4], [0, 4, 0], [1, 12, 4], [0, 12, 0], [0, 13, 12], [0, 14, 13], [1, 15, 14], [1, 15, 13], [2, 15, 12], [0, 12, 0], [0, 16, 12], [0, 17, 16], [1, 18, 17], [1, 18, 16], [0, 18, 12], [2, 19, 18], [2, 18, 12], [0, 12, 0], [0, 20, 12], [0, 21, 20], [1, 22, 21], [0, 22, 20], [0, 23, 22], [0, 24, 23], [1, 25, 24], [1, 25, 23], [0, 25, 22], [0, 26, 25], [0, 27, 26], [1, 28, 27], [0, 28, 26], [2, 29, 28], [0, 28, 26], [2, 30, 28], [2, 28, 26], [2, 
26, 25], [2, 25, 22], [2, 22, 20], [2, 20, 12], [0, 12, 0], [2, 31, 12], [2, 12, 0]]}}
+{"orig_tokens": ["The", "49", "stock", "specialist", "firms", "on", "the", "Big", "Board", "floor", "--", "the", "buyers", "and", "sellers", "of", "last", "resort", "who", "were", "criticized", "after", "the", "1987", "crash", "--", "once", "again", "could", "n't", "handle", "the", "selling", "pressure", "."], "tokens": ["The", "<unkn>", "stock", "<unk>", "<unks>", "on", "the", "<cunk>", "<cunk>", "<unk>", "--", "the", "<unks>", "and", "<unks>", "of", "last", "<unk>", "who", "were", "<unked>", "after", "the", "1987", "crash", "--", "<unk>", "<unk>", "<unk>", "n't", "<unk>", "the", "<unking>", "<unk>", "."], "token_ids": [28, 10, 150, 16, 6, 165, 40, 7, 7, 16, 216, 40, 6, 92, 6, 26, 85, 16, 280, 250, 3, 264, 40, 213, 215, 216, 16, 16, 16, 126, 16, 40, 17, 16, 62], "tags": ["DT", "CD", "NN", "NN", "NNS", "IN", "DT", "NNP", "NNP", "NN", ":", "DT", "NNS", "CC", "NNS", "IN", "JJ", "NN", "WP", "VBD", "VBN", "IN", "DT", "CD", "NN", ":", "RB", "RB", "MD", "RB", "VB", "DT", "NN", "NN", "."], "tree_str": "(S (NP (NP (NP (DT The) (CD 49) (NN stock) (NN specialist) (NNS firms)) (PP (IN on) (NP (DT the) (NNP Big) (NNP Board) (NN floor)))) (: --) (NP (NP (DT the) (NNS buyers) (CC and) (NNS sellers)) (PP (IN of) (NP (JJ last) (NN resort))) (SBAR (WHNP (WP who)) (S (VP (VBD were) (VP (VBN criticized) (PP (IN after) (NP (DT the) (CD 1987) (NN crash)))))))) (: --)) (ADVP (RB once) (RB again)) (VP (MD could) (RB n't) (VP (VB handle) (NP (DT the) (NN selling) (NN pressure)))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [4, 3, 2, 1, 0], [5, 4, 3, 2, 1, 0], [5, 3, 2, 1, 0], [5, 2, 1, 0], [5, 1, 0], [5, 0], [6, 5, 0], [7, 6, 5, 0], [8, 7, 6, 5, 0], [9, 8, 7, 6, 5, 0], [10, 9, 8, 7, 6, 5, 0], [10, 8, 7, 6, 5, 0], [10, 7, 6, 5, 0], [10, 6, 5, 0], [6, 5, 0], [5, 0], [11, 5, 0], [5, 0], [12, 5, 0], [13, 12, 5, 0], [13, 5, 0], [14, 13, 5, 0], [13, 5, 0], [15, 13, 5, 0], [13, 5, 0], [16, 13, 5, 0], [17, 16, 13, 5, 0], [18, 17, 16, 13, 5, 0], [18, 16, 13, 5, 0], [16, 13, 5, 0], [13, 5, 0], [19, 13, 5, 0], [20, 19, 13, 5, 0], [21, 20, 19, 13, 5, 0], [21, 19, 13, 5, 0], [21, 13, 5, 0], [22, 21, 13, 5, 0], [23, 22, 21, 13, 5, 0], [24, 23, 22, 21, 13, 5, 0], [25, 24, 23, 22, 21, 13, 5, 0], [25, 23, 22, 21, 13, 5, 0], [25, 22, 21, 13, 5, 0], [22, 21, 13, 5, 0], [21, 13, 5, 0], [13, 5, 0], [5, 0], [26, 5, 0], [5, 0], [27, 5, 0], [28, 27, 5, 0], [28, 5, 0], [29, 28, 5, 0], [30, 29, 28, 5, 0], [31, 30, 29, 28, 5, 0], [31, 29, 28, 5, 0], [31, 28, 5, 0], [31, 5, 0], [31, 0], [32, 31, 0], [33, 32, 31, 0], [34, 33, 32, 31, 0], [34, 32, 31, 0], [34, 31, 0], [31, 0], [35, 31, 0], [31, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
28, 29, 30, 31, 32, 33, 34, 35], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [27, 28, 29, 30, 31, 32, 33, 34, 35], [27, 28, 29, 30, 31, 32, 33, 34, 35], [28, 29, 30, 31, 32, 33, 34, 35], [29, 30, 31, 32, 33, 34, 35], [29, 30, 31, 32, 33, 34, 35], [30, 31, 32, 33, 34, 35], [31, 32, 33, 34, 35], [32, 33, 34, 35], [32, 33, 34, 35], [32, 33, 34, 35], [32, 33, 34, 35], [32, 33, 34, 35], [33, 34, 35], [34, 35], [35], [35], [35], [35], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 26], [1, 14], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 14], [2, 
30], [2, 35], [0, -1], [2, 37], [0, -1], [0, -1], [1, 14], [0, -1], [2, 7], [0, -1], [2, 9], [0, -1], [0, -1], [0, -1], [1, 3], [2, 30], [2, 35], [0, -1], [0, -1], [0, -1], [1, 6], [1, 25], [0, -1], [0, -1], [0, -1], [0, -1], [1, 26], [1, 14], [2, 30], [2, 35], [2, 39], [2, 13], [0, -1], [2, 37], [0, -1], [0, -1], [1, 2], [0, -1], [0, -1], [0, -1], [1, 21], [1, 5], [1, 2], [1, 24], [0, -1], [0, -1], [0, -1], [1, 22], [1, 14], [2, 16], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [0, 3, 2], [0, 4, 3], [1, 5, 4], [1, 5, 3], [1, 5, 2], [1, 5, 1], [0, 5, 0], [0, 6, 5], [0, 7, 6], [0, 8, 7], [0, 9, 8], [1, 10, 9], [1, 10, 8], [1, 10, 7], [2, 10, 6], [2, 6, 5], [0, 5, 0], [2, 11, 5], [0, 5, 0], [0, 12, 5], [1, 13, 12], [0, 13, 5], [2, 14, 13], [0, 13, 5], [2, 15, 13], [0, 13, 5], [0, 16, 13], [0, 17, 16], [1, 18, 17], [2, 18, 16], [2, 16, 13], [0, 13, 5], [0, 19, 13], [0, 20, 19], [1, 21, 20], [1, 21, 19], [0, 21, 13], [0, 22, 21], [0, 23, 22], [0, 24, 23], [1, 25, 24], [1, 25, 23], [2, 25, 22], [2, 22, 21], [2, 21, 13], [2, 13, 5], [0, 5, 0], [2, 26, 5], [0, 5, 0], [0, 27, 5], [1, 28, 27], [0, 28, 5], [0, 29, 28], [0, 30, 29], [1, 31, 30], [1, 31, 29], [1, 31, 28], [1, 31, 5], [0, 31, 0], [0, 32, 31], [0, 33, 32], [1, 34, 33], [1, 34, 32], [2, 34, 31], [0, 31, 0], [2, 35, 31], [2, 31, 0]]}}
+{"orig_tokens": ["Big", "investment", "banks", "refused", "to", "step", "up", "to", "the", "plate", "to", "support", "the", "beleaguered", "floor", "traders", "by", "buying", "big", "blocks", "of", "stock", ",", "traders", "say", "."], "tokens": ["<cunk>", "<unk>", "<unks>", "<unked>", "to", "<unk>", "<unk>", "to", "the", "<unk>", "to", "<unk>", "the", "<unked>", "<unk>", "<unks>", "by", "<unking>", "<unk>", "<unks>", "of", "stock", ",", "<unks>", "<unk>", "."], "token_ids": [7, 16, 6, 3, 59, 16, 16, 59, 40, 16, 59, 16, 40, 3, 16, 6, 53, 17, 16, 6, 26, 150, 45, 6, 16, 62], "tags": ["JJ", "NN", "NNS", "VBD", "TO", "VB", "IN", "TO", "DT", "NN", "TO", "VB", "DT", "JJ", "NN", "NNS", "IN", "VBG", "JJ", "NNS", "IN", "NN", ",", "NNS", "VBP", "."], "tree_str": "(S (S (NP (JJ Big) (NN investment) (NNS banks)) (VP (VBD refused) (S (VP (TO to) (VP (VB step) (ADVP (IN up) (PP (TO to) (NP (DT the) (NN plate)))) (S (VP (TO to) (VP (VB support) (NP (DT the) (JJ beleaguered) (NN floor) (NNS traders)) (PP (IN by) (S (VP (VBG buying) (NP (NP (JJ big) (NNS blocks)) (PP (IN of) (NP (NN stock))))))))))))))) (, ,) (NP (NNS traders)) (VP (VBP say)) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [3, 1, 0], [3, 0], [4, 3, 0], [4, 0], [5, 4, 0], [6, 5, 4, 0], [6, 4, 0], [7, 6, 4, 0], [8, 7, 6, 4, 0], [9, 8, 7, 6, 4, 0], [10, 9, 8, 7, 6, 4, 0], [10, 8, 7, 6, 4, 0], [8, 7, 6, 4, 0], [7, 6, 4, 0], [6, 4, 0], [11, 6, 4, 0], [12, 11, 6, 4, 0], [12, 6, 4, 0], [13, 12, 6, 4, 0], [14, 13, 12, 6, 4, 0], [15, 14, 13, 12, 6, 4, 0], [16, 15, 14, 13, 12, 6, 4, 0], [16, 14, 13, 12, 6, 4, 0], [16, 13, 12, 6, 4, 0], [16, 12, 6, 4, 0], [12, 6, 4, 0], [17, 12, 6, 4, 0], [18, 17, 12, 6, 4, 0], [19, 18, 17, 12, 6, 4, 0], [20, 19, 18, 17, 12, 6, 4, 0], [20, 18, 17, 12, 6, 4, 0], [21, 20, 18, 17, 12, 6, 4, 0], [22, 21, 20, 18, 17, 12, 6, 4, 0], [21, 20, 18, 17, 12, 6, 4, 0], [20, 18, 17, 12, 6, 4, 0], [18, 17, 12, 6, 4, 0], [17, 12, 6, 4, 0], [12, 6, 4, 0], [6, 4, 0], [4, 0], [23, 4, 0], [24, 23, 4, 0], [25, 24, 23, 4, 0], [25, 23, 4, 0], [25, 4, 0], [25, 0], [26, 25, 0], [25, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [8, 9, 10, 11, 12, 13, 14, 15, 16, 
17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [18, 19, 20, 21, 22, 23, 24, 25, 26], [19, 20, 21, 22, 23, 24, 25, 26], [20, 21, 22, 23, 24, 25, 26], [21, 22, 23, 24, 25, 26], [21, 22, 23, 24, 25, 26], [22, 23, 24, 25, 26], [23, 24, 25, 26], [23, 24, 25, 26], [23, 24, 25, 26], [23, 24, 25, 26], [23, 24, 25, 26], [23, 24, 25, 26], [23, 24, 25, 26], [23, 24, 25, 26], [24, 25, 26], [25, 26], [26], [26], [26], [26], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [1, 22], [1, 3], [0, -1], [1, 24], [0, -1], [0, -1], [1, 5], [0, -1], [0, -1], [0, -1], [0, -1], [1, 14], [2, 30], [2, 35], [2, 2], [0, -1], [0, -1], [1, 5], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 3], [1, 14], [2, 16], [0, -1], [0, -1], [0, -1], [0, -1], [1, 3], [0, -1], [0, -1], [2, 30], [2, 35], [2, 16], [2, 29], [2, 35], [2, 43], [2, 43], [0, -1], [0, -1], [0, -1], [1, 24], [1, 37], [1, 8], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [1, 3, 2], [1, 3, 1], [0, 3, 0], [1, 4, 
3], [0, 4, 0], [0, 5, 4], [1, 6, 5], [0, 6, 4], [0, 7, 6], [0, 8, 7], [0, 9, 8], [1, 10, 9], [2, 10, 8], [2, 8, 7], [2, 7, 6], [0, 6, 4], [0, 11, 6], [1, 12, 11], [0, 12, 6], [0, 13, 12], [0, 14, 13], [0, 15, 14], [1, 16, 15], [1, 16, 14], [1, 16, 13], [2, 16, 12], [0, 12, 6], [0, 17, 12], [0, 18, 17], [0, 19, 18], [1, 20, 19], [0, 20, 18], [0, 21, 20], [2, 22, 21], [2, 21, 20], [2, 20, 18], [2, 18, 17], [2, 17, 12], [2, 12, 6], [2, 6, 4], [0, 4, 0], [0, 23, 4], [0, 24, 23], [1, 25, 24], [1, 25, 23], [1, 25, 4], [0, 25, 0], [2, 26, 25], [2, 25, 0]]}}
+{"orig_tokens": ["Heavy", "selling", "of", "Standard", "&", "Poor", "'s", "500-stock", "index", "futures", "in", "Chicago", "relentlessly", "beat", "stocks", "downward", "."], "tokens": ["<cunk>", "<unking>", "of", "<cunk>", "&", "<cunk>", "'s", "<unk->", "<unk>", "<unks>", "in", "Chicago", "<unkly>", "<unk>", "stocks", "<unk>", "."], "token_ids": [7, 17, 26, 7, 47, 7, 33, 1, 16, 6, 42, 32, 2, 16, 231, 16, 62], "tags": ["JJ", "NN", "IN", "NNP", "CC", "NNP", "POS", "JJ", "NN", "NNS", "IN", "NNP", "RB", "VBD", "NNS", "RB", "."], "tree_str": "(S (NP (NP (JJ Heavy) (NN selling)) (PP (IN of) (NP (NP (NNP Standard) (CC &) (NNP Poor) (POS 's)) (JJ 500-stock) (NN index) (NNS futures))) (PP (IN in) (NP (NNP Chicago)))) (VP (ADVP (RB relentlessly)) (VBD beat) (NP (NNS stocks)) (ADVP (RB downward))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [2, 0], [3, 2, 0], [4, 3, 2, 0], [5, 4, 3, 2, 0], [4, 3, 2, 0], [6, 4, 3, 2, 0], [4, 3, 2, 0], [7, 4, 3, 2, 0], [4, 3, 2, 0], [8, 4, 3, 2, 0], [9, 8, 4, 3, 2, 0], [10, 9, 8, 4, 3, 2, 0], [10, 8, 4, 3, 2, 0], [10, 4, 3, 2, 0], [10, 3, 2, 0], [3, 2, 0], [2, 0], [11, 2, 0], [12, 11, 2, 0], [11, 2, 0], [2, 0], [13, 2, 0], [14, 13, 2, 0], [14, 2, 0], [14, 0], [15, 14, 0], [14, 0], [16, 14, 0], [14, 0], [17, 14, 0], [14, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [9, 10, 11, 12, 13, 14, 15, 16, 17], [10, 11, 12, 13, 14, 15, 16, 
17], [11, 12, 13, 14, 15, 16, 17], [11, 12, 13, 14, 15, 16, 17], [11, 12, 13, 14, 15, 16, 17], [11, 12, 13, 14, 15, 16, 17], [11, 12, 13, 14, 15, 16, 17], [11, 12, 13, 14, 15, 16, 17], [12, 13, 14, 15, 16, 17], [13, 14, 15, 16, 17], [13, 14, 15, 16, 17], [13, 14, 15, 16, 17], [14, 15, 16, 17], [15, 16, 17], [15, 16, 17], [15, 16, 17], [16, 17], [16, 17], [17], [17], [], [], []], "actions": [[0, -1], [0, -1], [1, 3], [0, -1], [0, -1], [0, -1], [2, 7], [0, -1], [2, 9], [0, -1], [2, 32], [0, -1], [0, -1], [0, -1], [1, 22], [1, 3], [1, 31], [2, 30], [2, 35], [0, -1], [0, -1], [2, 30], [2, 35], [0, -1], [0, -1], [1, 2], [1, 24], [0, -1], [2, 16], [0, -1], [2, 2], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [1, 2, 1], [0, 2, 0], [0, 3, 2], [0, 4, 3], [2, 5, 4], [0, 4, 3], [2, 6, 4], [0, 4, 3], [2, 7, 4], [0, 4, 3], [0, 8, 4], [0, 9, 8], [1, 10, 9], [1, 10, 8], [1, 10, 4], [2, 10, 3], [2, 3, 2], [0, 2, 0], [0, 11, 2], [2, 12, 11], [2, 11, 2], [0, 2, 0], [0, 13, 2], [1, 14, 13], [1, 14, 2], [0, 14, 0], [2, 15, 14], [0, 14, 0], [2, 16, 14], [0, 14, 0], [2, 17, 14], [2, 14, 0]]}}
+{"orig_tokens": ["Seven", "Big", "Board", "stocks", "--", "UAL", ",", "AMR", ",", "BankAmerica", ",", "Walt", "Disney", ",", "Capital", "Cities\\/ABC", ",", "Philip", "Morris", "and", "Pacific", "Telesis", "Group", "--", "stopped", "trading", "and", "never", "resumed", "."], "tokens": ["<cunk>", "<cunk>", "<cunk>", "stocks", "--", "<cunk>", ",", "<cunk>", ",", "<cunk>", ",", "<cunk>", "<cunk>", ",", "<unk>", "<cunk>", ",", "<cunk>", "<cunks>", "and", "<cunk>", "<cunks>", "<cunk>", "--", "<unked>", "<unking>", "and", "<unkER>", "<unked>", "."], "token_ids": [7, 7, 7, 231, 216, 7, 45, 7, 45, 7, 45, 7, 7, 45, 16, 7, 45, 7, 5, 92, 7, 5, 7, 216, 3, 17, 92, 13, 3, 62], "tags": ["CD", "NNP", "NNP", "NNS", ":", "NNP", ",", "NNP", ",", "NNP", ",", "NNP", "NNP", ",", "NNP", "NNP", ",", "NNP", "NNP", "CC", "NNP", "NNP", "NNP", ":", "VBD", "VBG", "CC", "RB", "VBD", "."], "tree_str": "(S (NP (NP (CD Seven) (NNP Big) (NNP Board) (NNS stocks)) (: --) (NP (NP (NNP UAL)) (, ,) (NP (NNP AMR)) (, ,) (NP (NNP BankAmerica)) (, ,) (NP (NNP Walt) (NNP Disney)) (, ,) (NP (NNP Capital) (NNP Cities\\/ABC)) (, ,) (NP (NNP Philip) (NNP Morris)) (CC and) (NP (NNP Pacific) (NNP Telesis) (NNP Group))) (: --)) (VP (VP (VBD stopped) (S (VP (VBG trading)))) (CC and) (VP (ADVP (RB never)) (VBD resumed))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [4, 3, 2, 1, 0], [4, 2, 1, 0], [4, 1, 0], [4, 0], [5, 4, 0], [4, 0], [6, 4, 0], [7, 6, 4, 0], [6, 4, 0], [8, 6, 4, 0], [6, 4, 0], [9, 6, 4, 0], [6, 4, 0], [10, 6, 4, 0], [6, 4, 0], [11, 6, 4, 0], [6, 4, 0], [12, 6, 4, 0], [13, 12, 6, 4, 0], [13, 6, 4, 0], [6, 4, 0], [14, 6, 4, 0], [6, 4, 0], [15, 6, 4, 0], [16, 15, 6, 4, 0], [16, 6, 4, 0], [6, 4, 0], [17, 6, 4, 0], [6, 4, 0], [18, 6, 4, 0], [19, 18, 6, 4, 0], [19, 6, 4, 0], [6, 4, 0], [20, 6, 4, 0], [6, 4, 0], [21, 6, 4, 0], [22, 21, 6, 4, 0], [23, 22, 21, 6, 4, 0], [23, 21, 6, 4, 0], [23, 6, 4, 0], [6, 4, 0], [4, 0], [24, 4, 0], [4, 0], [25, 4, 0], [25, 0], [26, 25, 0], [25, 0], [27, 25, 0], [25, 0], [28, 25, 0], [29, 28, 25, 0], [29, 25, 0], [25, 0], [30, 25, 0], [25, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [22, 23, 24, 25, 26, 27, 28, 29, 30], [23, 24, 25, 26, 27, 28, 29, 30], [24, 25, 26, 27, 28, 29, 30], [24, 25, 26, 27, 
28, 29, 30], [24, 25, 26, 27, 28, 29, 30], [24, 25, 26, 27, 28, 29, 30], [24, 25, 26, 27, 28, 29, 30], [25, 26, 27, 28, 29, 30], [25, 26, 27, 28, 29, 30], [26, 27, 28, 29, 30], [26, 27, 28, 29, 30], [27, 28, 29, 30], [27, 28, 29, 30], [28, 29, 30], [28, 29, 30], [29, 30], [30], [30], [30], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 26], [0, -1], [2, 37], [0, -1], [0, -1], [2, 37], [0, -1], [2, 9], [0, -1], [2, 37], [0, -1], [2, 9], [0, -1], [2, 37], [0, -1], [0, -1], [1, 22], [2, 9], [0, -1], [2, 37], [0, -1], [0, -1], [1, 22], [2, 9], [0, -1], [2, 37], [0, -1], [0, -1], [1, 22], [2, 9], [0, -1], [2, 7], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [2, 9], [2, 13], [0, -1], [2, 37], [0, -1], [1, 24], [0, -1], [2, 43], [0, -1], [2, 7], [0, -1], [0, -1], [1, 21], [2, 9], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [0, 3, 2], [1, 4, 3], [1, 4, 2], [1, 4, 1], [0, 4, 0], [2, 5, 4], [0, 4, 0], [0, 6, 4], [2, 7, 6], [0, 6, 4], [2, 8, 6], [0, 6, 4], [2, 9, 6], [0, 6, 4], [2, 10, 6], [0, 6, 4], [2, 11, 6], [0, 6, 4], [0, 12, 6], [1, 13, 12], [2, 13, 6], [0, 6, 4], [2, 14, 6], [0, 6, 4], [0, 15, 6], [1, 16, 15], [2, 16, 6], [0, 6, 4], [2, 17, 6], [0, 6, 4], [0, 18, 6], [1, 19, 18], [2, 19, 6], [0, 6, 4], [2, 20, 6], [0, 6, 4], [0, 21, 6], [0, 22, 21], [1, 23, 22], [1, 23, 21], [2, 23, 6], [2, 6, 4], [0, 4, 0], [2, 24, 4], [0, 4, 0], [1, 25, 4], [0, 25, 0], [2, 26, 25], [0, 25, 0], [2, 27, 25], [0, 25, 0], [0, 28, 25], [1, 29, 28], [2, 29, 25], [0, 25, 0], [2, 30, 25], [2, 25, 0]]}}
+{"orig_tokens": ["The", "finger-pointing", "has", "already", "begun", "."], "tokens": ["The", "<unking>", "has", "already", "<unk>", "."], "token_ids": [28, 17, 125, 217, 16, 62], "tags": ["DT", "NN", "VBZ", "RB", "VBN", "."], "tree_str": "(S (NP (DT The) (NN finger-pointing)) (VP (VBZ has) (ADVP (RB already)) (VP (VBN begun))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [2, 0], [3, 2, 0], [4, 3, 2, 0], [5, 4, 3, 2, 0], [5, 3, 2, 0], [5, 2, 0], [5, 0], [6, 5, 0], [5, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6], [3, 4, 5, 6], [3, 4, 5, 6], [4, 5, 6], [5, 6], [6], [6], [6], [6], [], [], []], "actions": [[0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [0, -1], [1, 2], [1, 5], [1, 24], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [1, 2, 1], [0, 2, 0], [0, 3, 2], [0, 4, 3], [1, 5, 4], [1, 5, 3], [1, 5, 2], [0, 5, 0], [2, 6, 5], [2, 5, 0]]}}
+{"orig_tokens": ["``", "The", "equity", "market", "was", "illiquid", "."], "tokens": ["``", "The", "<unk>", "market", "was", "<unk>", "."], "token_ids": [27, 28, 16, 214, 56, 16, 62], "tags": ["``", "DT", "NN", "NN", "VBD", "JJ", "."], "tree_str": "(S (`` ``) (NP (DT The) (NN equity) (NN market)) (VP (VBD was) (ADJP (JJ illiquid))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [4, 3, 2, 1, 0], [4, 2, 1, 0], [4, 1, 0], [5, 4, 1, 0], [6, 5, 4, 1, 0], [6, 4, 1, 0], [6, 1, 0], [6, 0], [7, 6, 0], [6, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7], [4, 5, 6, 7], [5, 6, 7], [5, 6, 7], [5, 6, 7], [6, 7], [7], [7], [7], [7], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 14], [0, -1], [0, -1], [1, 10], [1, 24], [1, 37], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [0, 3, 2], [1, 4, 3], [1, 4, 2], [0, 4, 1], [0, 5, 4], [1, 6, 5], [1, 6, 4], [1, 6, 1], [0, 6, 0], [2, 7, 6], [2, 6, 0]]}}
+{"orig_tokens": ["Once", "again", "-LCB-", "the", "specialists", "-RCB-", "were", "not", "able", "to", "handle", "the", "imbalances", "on", "the", "floor", "of", "the", "New", "York", "Stock", "Exchange", ",", "''", "said", "Christopher", "Pedersen", ",", "senior", "vice", "president", "at", "Twenty-First", "Securities", "Corp", "."], "tokens": ["<cunk>", "<unk>", "<unk->", "the", "<unks>", "<unk->", "were", "not", "<unk>", "to", "<unk>", "the", "<unks>", "on", "the", "<unk>", "of", "the", "<cunk>", "<cunk>", "<cunk>", "Exchange", ",", "''", "said", "<cunkER>", "<cunk>", ",", "<unk>", "<unk>", "president", "at", "<cunk->", "Securities", "<cunk>", "."], "token_ids": [7, 16, 1, 40, 6, 1, 250, 145, 16, 59, 16, 40, 6, 165, 40, 16, 26, 40, 7, 7, 7, 143, 45, 30, 70, 19, 7, 45, 16, 16, 91, 31, 11, 142, 7, 62], "tags": ["RB", "RB", "-LRB-", "DT", "NNS", "-RRB-", "VBD", "RB", "JJ", "TO", "VB", "DT", "NNS", "IN", "DT", "NN", "IN", "DT", "NNP", "NNP", "NNP", "NNP", ",", "''", "VBD", "NNP", "NNP", ",", "JJ", "NN", "NN", "IN", "NNP", "NNP", "NNP", "."], "tree_str": "(SINV (S (ADVP (RB Once) (RB again)) (-LRB- -LCB-) (NP (DT the) (NNS specialists)) (-RRB- -RCB-) (VP (VBD were) (RB not) (ADJP (JJ able) (S (VP (TO to) (VP (VB handle) (NP (NP (DT the) (NNS imbalances)) (PP (IN on) (NP (NP (DT the) (NN floor)) (PP (IN of) (NP (DT the) (NNP New) (NNP York) (NNP Stock) (NNP Exchange)))))))))))) (, ,) ('' '') (VP (VBD said)) (NP (NP (NNP Christopher) (NNP Pedersen)) (, ,) (NP (NP (JJ senior) (NN vice) (NN president)) (PP (IN at) (NP (NNP Twenty-First) (NNP Securities) (NNP Corp))))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [2, 0], [3, 2, 0], [4, 3, 2, 0], [5, 4, 3, 2, 0], [5, 3, 2, 0], [6, 5, 3, 2, 0], [7, 6, 5, 3, 2, 0], [8, 7, 6, 5, 3, 2, 0], [9, 8, 7, 6, 5, 3, 2, 0], [9, 7, 6, 5, 3, 2, 0], [9, 6, 5, 3, 2, 0], [9, 5, 3, 2, 0], [9, 3, 2, 0], [9, 2, 0], [9, 0], [10, 9, 0], [11, 10, 9, 0], [11, 9, 0], [12, 11, 9, 0], [13, 12, 11, 9, 0], [13, 11, 9, 0], [14, 13, 11, 9, 0], [15, 14, 13, 11, 9, 0], [16, 15, 14, 13, 11, 9, 0], [16, 14, 13, 11, 9, 0], [17, 16, 14, 13, 11, 9, 0], [18, 17, 16, 14, 13, 11, 9, 0], [19, 18, 17, 16, 14, 13, 11, 9, 0], [20, 19, 18, 17, 16, 14, 13, 11, 9, 0], [21, 20, 19, 18, 17, 16, 14, 13, 11, 9, 0], [22, 21, 20, 19, 18, 17, 16, 14, 13, 11, 9, 0], [22, 20, 19, 18, 17, 16, 14, 13, 11, 9, 0], [22, 19, 18, 17, 16, 14, 13, 11, 9, 0], [22, 18, 17, 16, 14, 13, 11, 9, 0], [22, 17, 16, 14, 13, 11, 9, 0], [17, 16, 14, 13, 11, 9, 0], [16, 14, 13, 11, 9, 0], [14, 13, 11, 9, 0], [13, 11, 9, 0], [11, 9, 0], [9, 0], [23, 9, 0], [24, 23, 9, 0], [25, 24, 23, 9, 0], [25, 23, 9, 0], [25, 9, 0], [25, 0], [26, 25, 0], [27, 26, 25, 0], [27, 25, 0], [28, 27, 25, 0], [27, 25, 0], [29, 27, 25, 0], [30, 29, 27, 25, 0], [31, 30, 29, 27, 25, 0], [31, 29, 27, 25, 0], [31, 27, 25, 0], [32, 31, 27, 25, 0], [33, 32, 31, 27, 25, 0], [34, 33, 32, 31, 27, 25, 0], [35, 34, 33, 32, 31, 27, 25, 0], [35, 33, 32, 31, 27, 25, 0], [35, 32, 31, 27, 25, 0], [32, 31, 27, 25, 0], [31, 27, 25, 0], [27, 25, 0], [25, 0], [36, 25, 0], [25, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], 
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [28, 29, 30, 31, 32, 33, 34, 35, 36], [28, 29, 30, 31, 
32, 33, 34, 35, 36], [29, 30, 31, 32, 33, 34, 35, 36], [29, 30, 31, 32, 33, 34, 35, 36], [30, 31, 32, 33, 34, 35, 36], [31, 32, 33, 34, 35, 36], [32, 33, 34, 35, 36], [32, 33, 34, 35, 36], [32, 33, 34, 35, 36], [33, 34, 35, 36], [34, 35, 36], [35, 36], [36], [36], [36], [36], [36], [36], [36], [], [], []], "actions": [[0, -1], [0, -1], [1, 2], [0, -1], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [0, -1], [0, -1], [1, 21], [1, 10], [1, 37], [1, 24], [1, 37], [1, 2], [0, -1], [0, -1], [1, 5], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 22], [1, 14], [2, 30], [2, 35], [2, 30], [2, 35], [2, 16], [2, 43], [0, -1], [0, -1], [0, -1], [1, 37], [1, 37], [1, 8], [0, -1], [0, -1], [1, 22], [0, -1], [2, 37], [0, -1], [0, -1], [0, -1], [1, 22], [1, 3], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [2, 30], [2, 35], [2, 4], [2, 24], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [1, 2, 1], [0, 2, 0], [0, 3, 2], [0, 4, 3], [1, 5, 4], [0, 5, 3], [0, 6, 5], [0, 7, 6], [0, 8, 7], [1, 9, 8], [1, 9, 7], [1, 9, 6], [1, 9, 5], [1, 9, 3], [1, 9, 2], [0, 9, 0], [0, 10, 9], [1, 11, 10], [0, 11, 9], [0, 12, 11], [1, 13, 12], [0, 13, 11], [0, 14, 13], [0, 15, 14], [1, 16, 15], [0, 16, 14], [0, 17, 16], [0, 18, 17], [0, 19, 18], [0, 20, 19], [0, 21, 20], [1, 22, 21], [1, 22, 20], [1, 22, 19], [1, 22, 18], [2, 22, 17], [2, 17, 16], [2, 16, 14], [2, 14, 13], [2, 13, 11], [2, 11, 9], [0, 9, 0], [0, 23, 9], [0, 24, 23], [1, 25, 24], [1, 25, 23], [1, 25, 9], [0, 25, 0], [0, 26, 25], [1, 27, 26], [0, 27, 25], [2, 28, 27], [0, 27, 25], [0, 29, 27], [0, 30, 29], [1, 31, 30], [1, 31, 29], [0, 31, 27], [0, 32, 31], [0, 33, 32], [0, 34, 33], [1, 35, 34], [1, 35, 33], [2, 35, 32], [2, 32, 31], [2, 31, 27], [2, 27, 25], [0, 25, 0], [2, 36, 25], [2, 25, 0]]}}
+{"orig_tokens": ["Countered", "James", "Maguire", ",", "chairman", "of", "specialists", "Henderson", "Brothers", "Inc.", ":", "``", "It", "is", "easy", "to", "say", "the", "specialist", "is", "n't", "doing", "his", "job", "."], "tokens": ["<cunked>", "<cunks>", "<cunk>", ",", "<unk>", "of", "<unks>", "<cunk>", "<cunks>", "Inc.", "<unk>", "``", "<cunk>", "<unk>", "<unk>", "to", "<unk>", "the", "<unk>", "<unk>", "n't", "<unking>", "<unk>", "<unk>", "."], "token_ids": [15, 5, 7, 45, 16, 26, 6, 7, 5, 69, 16, 27, 7, 16, 16, 59, 16, 40, 16, 16, 126, 17, 16, 16, 62], "tags": ["VBD", "NNP", "NNP", ",", "NN", "IN", "NNS", "NNP", "NNP", "NNP", ":", "``", "PRP", "VBZ", "JJ", "TO", "VB", "DT", "NN", "VBZ", "RB", "VBG", "PRP$", "NN", "."], "tree_str": "(SINV (VP (VBD Countered)) (NP (NP (NNP James) (NNP Maguire)) (, ,) (NP (NP (NN chairman)) (PP (IN of) (NP (NNS specialists) (NNP Henderson) (NNP Brothers) (NNP Inc.))))) (: :) (`` ``) (S (NP (PRP It)) (VP (VBZ is) (ADJP (JJ easy)) (S (VP (TO to) (VP (VB say) (SBAR (S (NP (DT the) (NN specialist)) (VP (VBZ is) (RB n't) (VP (VBG doing) (NP (PRP$ his) (NN job))))))))))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [3, 1, 0], [4, 3, 1, 0], [3, 1, 0], [5, 3, 1, 0], [6, 5, 3, 1, 0], [7, 6, 5, 3, 1, 0], [8, 7, 6, 5, 3, 1, 0], [9, 8, 7, 6, 5, 3, 1, 0], [10, 9, 8, 7, 6, 5, 3, 1, 0], [10, 8, 7, 6, 5, 3, 1, 0], [10, 7, 6, 5, 3, 1, 0], [10, 6, 5, 3, 1, 0], [6, 5, 3, 1, 0], [5, 3, 1, 0], [3, 1, 0], [1, 0], [11, 1, 0], [1, 0], [12, 1, 0], [1, 0], [13, 1, 0], [14, 13, 1, 0], [15, 14, 13, 1, 0], [15, 13, 1, 0], [15, 1, 0], [16, 15, 1, 0], [17, 16, 15, 1, 0], [17, 15, 1, 0], [18, 17, 15, 1, 0], [19, 18, 17, 15, 1, 0], [19, 17, 15, 1, 0], [20, 19, 17, 15, 1, 0], [21, 20, 19, 17, 15, 1, 0], [22, 21, 20, 19, 17, 15, 1, 0], [22, 20, 19, 17, 15, 1, 0], [22, 19, 17, 15, 1, 0], [22, 17, 15, 1, 0], [23, 22, 17, 15, 1, 0], [24, 23, 22, 17, 15, 1, 0], [24, 22, 17, 15, 1, 0], [22, 17, 15, 1, 0], [17, 15, 1, 0], [15, 1, 0], [1, 0], [25, 1, 0], [1, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [11, 12, 13, 14, 
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25], [17, 18, 19, 20, 21, 22, 23, 24, 25], [18, 19, 20, 21, 22, 23, 24, 25], [18, 19, 20, 21, 22, 23, 24, 25], [19, 20, 21, 22, 23, 24, 25], [20, 21, 22, 23, 24, 25], [20, 21, 22, 23, 24, 25], [21, 22, 23, 24, 25], [22, 23, 24, 25], [23, 24, 25], [23, 24, 25], [23, 24, 25], [23, 24, 25], [24, 25], [25], [25], [25], [25], [25], [25], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [1, 22], [0, -1], [2, 37], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 22], [2, 30], [2, 35], [2, 4], [2, 24], [0, -1], [2, 37], [0, -1], [2, 37], [0, -1], [0, -1], [0, -1], [1, 10], [1, 24], [0, -1], [0, -1], [1, 5], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [0, -1], [1, 21], [1, 5], [1, 24], [0, -1], [0, -1], [1, 31], [2, 16], [2, 8], [2, 43], [2, 8], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [1, 3, 2], [0, 3, 1], [2, 4, 3], [0, 3, 1], [0, 5, 3], [0, 6, 5], [0, 7, 6], [0, 8, 7], [0, 9, 8], [1, 10, 9], [1, 10, 8], [1, 10, 7], [2, 10, 6], [2, 6, 5], [2, 5, 3], [2, 3, 1], [0, 1, 0], [2, 11, 1], [0, 1, 0], [2, 12, 1], [0, 1, 0], [0, 13, 1], [0, 14, 13], [1, 15, 14], [1, 15, 
13], [0, 15, 1], [0, 16, 15], [1, 17, 16], [0, 17, 15], [0, 18, 17], [1, 19, 18], [0, 19, 17], [0, 20, 19], [0, 21, 20], [1, 22, 21], [1, 22, 20], [1, 22, 19], [0, 22, 17], [0, 23, 22], [1, 24, 23], [2, 24, 22], [2, 22, 17], [2, 17, 15], [2, 15, 1], [0, 1, 0], [2, 25, 1], [2, 1, 0]]}}
+{"orig_tokens": ["When", "the", "dollar", "is", "in", "a", "free-fall", ",", "even", "central", "banks", "ca", "n't", "stop", "it", "."], "tokens": ["<cunk>", "the", "<unk>", "<unk>", "in", "a", "<unk->", ",", "<unk>", "<unkal>", "<unks>", "<unk>", "n't", "<unk>", "it", "."], "token_ids": [7, 40, 16, 16, 42, 113, 1, 45, 16, 20, 6, 16, 126, 16, 71, 62], "tags": ["WRB", "DT", "NN", "VBZ", "IN", "DT", "NN", ",", "RB", "JJ", "NNS", "MD", "RB", "VB", "PRP", "."], "tree_str": "(S (SBAR (WHADVP (WRB When)) (S (NP (DT the) (NN dollar)) (VP (VBZ is) (PP (IN in) (NP (DT a) (NN free-fall)))))) (, ,) (NP (RB even) (JJ central) (NNS banks)) (VP (MD ca) (RB n't) (VP (VB stop) (NP (PRP it)))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [3, 1, 0], [4, 3, 1, 0], [4, 1, 0], [4, 0], [5, 4, 0], [6, 5, 4, 0], [7, 6, 5, 4, 0], [7, 5, 4, 0], [5, 4, 0], [4, 0], [8, 4, 0], [9, 8, 4, 0], [10, 9, 8, 4, 0], [11, 10, 9, 8, 4, 0], [11, 9, 8, 4, 0], [11, 8, 4, 0], [12, 11, 8, 4, 0], [13, 12, 11, 8, 4, 0], [14, 13, 12, 11, 8, 4, 0], [14, 12, 11, 8, 4, 0], [14, 11, 8, 4, 0], [14, 8, 4, 0], [14, 4, 0], [14, 0], [15, 14, 0], [14, 0], [16, 14, 0], [14, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [8, 9, 10, 11, 12, 13, 14, 15, 16], [8, 9, 10, 11, 12, 13, 14, 15, 16], [8, 9, 10, 11, 12, 13, 14, 15, 16], [8, 9, 10, 11, 12, 13, 14, 15, 16], [9, 10, 11, 12, 13, 14, 15, 16], [10, 11, 12, 13, 14, 15, 16], [11, 12, 13, 14, 15, 16], [12, 13, 14, 15, 16], [12, 13, 14, 15, 16], [12, 13, 14, 15, 
16], [13, 14, 15, 16], [14, 15, 16], [15, 16], [15, 16], [15, 16], [15, 16], [15, 16], [15, 16], [16], [16], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [1, 14], [0, -1], [1, 24], [1, 2], [0, -1], [0, -1], [0, -1], [1, 14], [2, 30], [2, 35], [0, -1], [0, -1], [0, -1], [0, -1], [1, 3], [1, 2], [0, -1], [0, -1], [0, -1], [1, 21], [1, 5], [1, 24], [1, 37], [1, 1], [0, -1], [2, 16], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [1, 3, 2], [0, 3, 1], [1, 4, 3], [1, 4, 1], [0, 4, 0], [0, 5, 4], [0, 6, 5], [1, 7, 6], [2, 7, 5], [2, 5, 4], [0, 4, 0], [0, 8, 4], [0, 9, 8], [0, 10, 9], [1, 11, 10], [1, 11, 9], [0, 11, 8], [0, 12, 11], [0, 13, 12], [1, 14, 13], [1, 14, 12], [1, 14, 11], [1, 14, 8], [1, 14, 4], [0, 14, 0], [2, 15, 14], [0, 14, 0], [2, 16, 14], [2, 14, 0]]}}
+{"orig_tokens": ["Speculators", "are", "calling", "for", "a", "degree", "of", "liquidity", "that", "is", "not", "there", "in", "the", "market", ".", "''"], "tokens": ["<cunks>", "are", "calling", "for", "a", "<unk>", "of", "<unk>", "that", "<unk>", "not", "<unk>", "in", "the", "market", ".", "''"], "token_ids": [5, 140, 281, 99, 113, 16, 26, 16, 188, 16, 145, 16, 42, 40, 214, 62, 30], "tags": ["NNS", "VBP", "VBG", "IN", "DT", "NN", "IN", "NN", "WDT", "VBZ", "RB", "RB", "IN", "DT", "NN", ".", "''"], "tree_str": "(S (NP (NNS Speculators)) (VP (VBP are) (VP (VBG calling) (PP (IN for) (NP (NP (DT a) (NN degree)) (PP (IN of) (NP (NN liquidity))) (SBAR (WHNP (WDT that)) (S (VP (VBZ is) (RB not) (ADVP (RB there)) (PP (IN in) (NP (DT the) (NN market)))))))))) (. .) ('' ''))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [3, 1, 0], [3, 0], [4, 3, 0], [5, 4, 3, 0], [6, 5, 4, 3, 0], [6, 4, 3, 0], [7, 6, 4, 3, 0], [8, 7, 6, 4, 3, 0], [7, 6, 4, 3, 0], [6, 4, 3, 0], [9, 6, 4, 3, 0], [10, 9, 6, 4, 3, 0], [10, 6, 4, 3, 0], [11, 10, 6, 4, 3, 0], [10, 6, 4, 3, 0], [12, 10, 6, 4, 3, 0], [10, 6, 4, 3, 0], [13, 10, 6, 4, 3, 0], [14, 13, 10, 6, 4, 3, 0], [15, 14, 13, 10, 6, 4, 3, 0], [15, 13, 10, 6, 4, 3, 0], [13, 10, 6, 4, 3, 0], [10, 6, 4, 3, 0], [6, 4, 3, 0], [4, 3, 0], [3, 0], [16, 3, 0], [3, 0], [17, 3, 0], [3, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [9, 10, 11, 12, 13, 14, 15, 16, 17], [9, 10, 11, 
12, 13, 14, 15, 16, 17], [9, 10, 11, 12, 13, 14, 15, 16, 17], [10, 11, 12, 13, 14, 15, 16, 17], [11, 12, 13, 14, 15, 16, 17], [11, 12, 13, 14, 15, 16, 17], [12, 13, 14, 15, 16, 17], [12, 13, 14, 15, 16, 17], [13, 14, 15, 16, 17], [13, 14, 15, 16, 17], [14, 15, 16, 17], [15, 16, 17], [16, 17], [16, 17], [16, 17], [16, 17], [16, 17], [16, 17], [16, 17], [17], [17], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [1, 5], [1, 24], [0, -1], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [2, 30], [2, 35], [0, -1], [0, -1], [1, 24], [0, -1], [2, 21], [0, -1], [2, 2], [0, -1], [0, -1], [0, -1], [1, 14], [2, 30], [2, 35], [2, 39], [2, 30], [2, 35], [0, -1], [2, 37], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [1, 3, 2], [1, 3, 1], [0, 3, 0], [0, 4, 3], [0, 5, 4], [1, 6, 5], [0, 6, 4], [0, 7, 6], [2, 8, 7], [2, 7, 6], [0, 6, 4], [0, 9, 6], [1, 10, 9], [0, 10, 6], [2, 11, 10], [0, 10, 6], [2, 12, 10], [0, 10, 6], [0, 13, 10], [0, 14, 13], [1, 15, 14], [2, 15, 13], [2, 13, 10], [2, 10, 6], [2, 6, 4], [2, 4, 3], [0, 3, 0], [2, 16, 3], [0, 3, 0], [2, 17, 3], [2, 3, 0]]}}
+{"orig_tokens": ["Many", "money", "managers", "and", "some", "traders", "had", "already", "left", "their", "offices", "early", "Friday", "afternoon", "on", "a", "warm", "autumn", "day", "--", "because", "the", "stock", "market", "was", "so", "quiet", "."], "tokens": ["Many", "money", "managers", "and", "some", "<unks>", "had", "already", "<unk>", "their", "<unks>", "<unkly>", "<cunk>", "<unk>", "on", "a", "<unk>", "<unk>", "<unk>", "--", "<unk>", "the", "stock", "market", "was", "so", "<unk>", "."], "token_ids": [207, 174, 175, 92, 159, 6, 292, 217, 16, 147, 6, 2, 7, 16, 165, 113, 16, 16, 16, 216, 16, 40, 150, 214, 56, 211, 16, 62], "tags": ["JJ", "NN", "NNS", "CC", "DT", "NNS", "VBD", "RB", "VBN", "PRP$", "NNS", "RB", "NNP", "NN", "IN", "DT", "JJ", "NN", "NN", ":", "IN", "DT", "NN", "NN", "VBD", "RB", "JJ", "."], "tree_str": "(S (NP (NP (JJ Many) (NN money) (NNS managers)) (CC and) (NP (DT some) (NNS traders))) (VP (VBD had) (ADVP (RB already)) (VP (VBN left) (NP (PRP$ their) (NNS offices)) (NP (RB early) (NNP Friday) (NN afternoon)) (PP (IN on) (NP (DT a) (JJ warm) (NN autumn) (NN day))) (: --) (SBAR (IN because) (S (NP (DT the) (NN stock) (NN market)) (VP (VBD was) (ADJP (RB so) (JJ quiet))))))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [3, 1, 0], [3, 0], [4, 3, 0], [3, 0], [5, 3, 0], [6, 5, 3, 0], [6, 3, 0], [3, 0], [7, 3, 0], [8, 7, 3, 0], [9, 8, 7, 3, 0], [9, 7, 3, 0], [9, 3, 0], [9, 0], [10, 9, 0], [11, 10, 9, 0], [11, 9, 0], [9, 0], [12, 9, 0], [13, 12, 9, 0], [14, 13, 12, 9, 0], [14, 12, 9, 0], [14, 9, 0], [9, 0], [15, 9, 0], [16, 15, 9, 0], [17, 16, 15, 9, 0], [18, 17, 16, 15, 9, 0], [19, 18, 17, 16, 15, 9, 0], [19, 17, 16, 15, 9, 0], [19, 16, 15, 9, 0], [19, 15, 9, 0], [15, 9, 0], [9, 0], [20, 9, 0], [9, 0], [21, 9, 0], [22, 21, 9, 0], [23, 22, 21, 9, 0], [24, 23, 22, 21, 9, 0], [24, 22, 21, 9, 0], [24, 21, 9, 0], [25, 24, 21, 9, 0], [26, 25, 24, 21, 9, 0], [27, 26, 25, 24, 21, 9, 0], [27, 25, 24, 21, 9, 0], [27, 24, 21, 9, 0], [27, 21, 9, 0], [27, 9, 0], [9, 0], [28, 9, 0], [9, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23, 24, 25, 26, 27, 28], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28], [21, 22, 23, 24, 25, 26, 27, 28], [21, 22, 23, 24, 25, 26, 27, 28], [22, 23, 24, 25, 26, 27, 28], [23, 24, 25, 26, 27, 28], [24, 25, 26, 27, 28], [25, 26, 27, 28], [25, 26, 27, 28], [25, 26, 27, 28], [26, 27, 28], [27, 28], [28], [28], [28], [28], [28], [28], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [1, 22], [1, 3], [0, -1], [2, 7], [0, -1], [0, -1], [1, 14], [2, 9], [0, -1], [0, -1], [0, -1], [1, 2], [1, 5], 
[1, 24], [0, -1], [0, -1], [1, 31], [2, 13], [0, -1], [0, -1], [0, -1], [1, 22], [1, 2], [2, 16], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 3], [1, 14], [2, 30], [2, 35], [0, -1], [2, 37], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 14], [0, -1], [0, -1], [0, -1], [1, 2], [1, 10], [1, 24], [1, 19], [2, 1], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [1, 3, 2], [1, 3, 1], [0, 3, 0], [2, 4, 3], [0, 3, 0], [0, 5, 3], [1, 6, 5], [2, 6, 3], [0, 3, 0], [0, 7, 3], [0, 8, 7], [1, 9, 8], [1, 9, 7], [1, 9, 3], [0, 9, 0], [0, 10, 9], [1, 11, 10], [2, 11, 9], [0, 9, 0], [0, 12, 9], [0, 13, 12], [1, 14, 13], [1, 14, 12], [2, 14, 9], [0, 9, 0], [0, 15, 9], [0, 16, 15], [0, 17, 16], [0, 18, 17], [1, 19, 18], [1, 19, 17], [1, 19, 16], [2, 19, 15], [2, 15, 9], [0, 9, 0], [2, 20, 9], [0, 9, 0], [0, 21, 9], [0, 22, 21], [0, 23, 22], [1, 24, 23], [1, 24, 22], [0, 24, 21], [0, 25, 24], [0, 26, 25], [1, 27, 26], [1, 27, 25], [1, 27, 24], [1, 27, 21], [2, 27, 9], [0, 9, 0], [2, 28, 9], [2, 9, 0]]}}
+{"orig_tokens": ["Then", "in", "a", "lightning", "plunge", ",", "the", "Dow", "Jones", "industrials", "in", "barely", "an", "hour", "surrendered", "about", "a", "third", "of", "their", "gains", "this", "year", ",", "chalking", "up", "a", "190.58-point", ",", "or", "6.9", "%", ",", "loss", "on", "the", "day", "in", "gargantuan", "trading", "volume", "."], "tokens": ["<cunk>", "in", "a", "<unking>", "<unk>", ",", "the", "<cunk>", "<cunks>", "<unks>", "in", "<unkly>", "an", "<unk>", "<unked>", "about", "a", "<unk>", "of", "their", "<unks>", "this", "year", ",", "<unking>", "<unk>", "a", "<unk>", ",", "<unk>", "<unkn>", "%", ",", "<unks>", "on", "the", "<unk>", "in", "<unk>", "<unking>", "<unk>", "."], "token_ids": [7, 42, 113, 17, 16, 45, 40, 7, 5, 6, 42, 2, 22, 16, 3, 78, 113, 16, 26, 147, 6, 185, 86, 45, 17, 16, 113, 16, 45, 16, 10, 268, 45, 6, 165, 40, 16, 42, 16, 17, 16, 62], "tags": ["RB", "IN", "DT", "NN", "NN", ",", "DT", "NNP", "NNP", "NNS", "IN", "RB", "DT", "NN", "VBD", "RB", "DT", "JJ", "IN", "PRP$", "NNS", "DT", "NN", ",", "VBG", "RP", "DT", "JJ", ",", "CC", "CD", "NN", ",", "NN", "IN", "DT", "NN", "IN", "JJ", "NN", "NN", "."], "tree_str": "(S (RB Then) (PP (IN in) (NP (DT a) (NN lightning) (NN plunge))) (, ,) (NP (DT the) (NNP Dow) (NNP Jones) (NNS industrials)) (PP (IN in) (NP (QP (RB barely) (DT an)) (NN hour))) (VP (VBD surrendered) (NP (NP (QP (RB about) (DT a)) (JJ third)) (PP (IN of) (NP (NP (PRP$ their) (NNS gains)) (NP (DT this) (NN year))))) (, ,) (S (VP (VBG chalking) (PRT (RP up)) (NP (NP (DT a) (ADJP (ADJP (JJ 190.58-point)) (, ,) (CC or) (ADJP (CD 6.9) (NN %)) (, ,)) (NN loss)) (PP (IN on) (NP (DT the) (NN day)))) (PP (IN in) (NP (JJ gargantuan) (NN trading) (NN volume)))))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [4, 3, 2, 1, 0], [5, 4, 3, 2, 1, 0], [5, 3, 2, 1, 0], [5, 2, 1, 0], [2, 1, 0], [6, 2, 1, 0], [7, 6, 2, 1, 0], [8, 7, 6, 2, 1, 0], [9, 8, 7, 6, 2, 1, 0], [10, 9, 8, 7, 6, 2, 1, 0], [10, 8, 7, 6, 2, 1, 0], [10, 7, 6, 2, 1, 0], [10, 6, 2, 1, 0], [11, 10, 6, 2, 1, 0], [12, 11, 10, 6, 2, 1, 0], [13, 12, 11, 10, 6, 2, 1, 0], [13, 11, 10, 6, 2, 1, 0], [14, 13, 11, 10, 6, 2, 1, 0], [14, 11, 10, 6, 2, 1, 0], [11, 10, 6, 2, 1, 0], [15, 11, 10, 6, 2, 1, 0], [15, 10, 6, 2, 1, 0], [15, 6, 2, 1, 0], [15, 2, 1, 0], [15, 1, 0], [15, 0], [16, 15, 0], [17, 16, 15, 0], [17, 15, 0], [18, 17, 15, 0], [18, 15, 0], [19, 18, 15, 0], [20, 19, 18, 15, 0], [21, 20, 19, 18, 15, 0], [21, 19, 18, 15, 0], [22, 21, 19, 18, 15, 0], [23, 22, 21, 19, 18, 15, 0], [23, 21, 19, 18, 15, 0], [21, 19, 18, 15, 0], [19, 18, 15, 0], [18, 15, 0], [15, 0], [24, 15, 0], [15, 0], [25, 15, 0], [26, 25, 15, 0], [25, 15, 0], [27, 25, 15, 0], [28, 27, 25, 15, 0], [29, 28, 27, 25, 15, 0], [28, 27, 25, 15, 0], [30, 28, 27, 25, 15, 0], [28, 27, 25, 15, 0], [31, 28, 27, 25, 15, 0], [32, 31, 28, 27, 25, 15, 0], [32, 28, 27, 25, 15, 0], [28, 27, 25, 15, 0], [33, 28, 27, 25, 15, 0], [28, 27, 25, 15, 0], [34, 28, 27, 25, 15, 0], [34, 27, 25, 15, 0], [34, 25, 15, 0], [35, 34, 25, 15, 0], [36, 35, 34, 25, 15, 0], [37, 36, 35, 34, 25, 15, 0], [37, 35, 34, 25, 15, 0], [35, 34, 25, 15, 0], [34, 25, 15, 0], [25, 15, 0], [38, 25, 15, 0], [39, 38, 25, 15, 0], [40, 39, 38, 25, 15, 0], [41, 40, 39, 38, 25, 15, 0], [41, 39, 38, 25, 15, 0], [41, 38, 25, 15, 0], [38, 25, 15, 0], [25, 15, 0], [15, 0], [42, 15, 0], [15, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 
36, 37, 38, 39, 40, 41, 42], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 
40, 41, 42], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 
34, 35, 36, 37, 38, 39, 40, 41, 42], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [33, 34, 35, 36, 37, 38, 39, 40, 41, 42], [34, 35, 36, 37, 38, 39, 40, 41, 42], [34, 35, 36, 37, 38, 39, 40, 41, 42], [35, 36, 37, 38, 39, 40, 41, 
42], [35, 36, 37, 38, 39, 40, 41, 42], [35, 36, 37, 38, 39, 40, 41, 42], [36, 37, 38, 39, 40, 41, 42], [37, 38, 39, 40, 41, 42], [38, 39, 40, 41, 42], [38, 39, 40, 41, 42], [38, 39, 40, 41, 42], [38, 39, 40, 41, 42], [38, 39, 40, 41, 42], [39, 40, 41, 42], [40, 41, 42], [41, 42], [42], [42], [42], [42], [42], [42], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 14], [2, 30], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 14], [0, -1], [0, -1], [0, -1], [1, 38], [0, -1], [1, 26], [2, 30], [0, -1], [1, 35], [1, 24], [1, 37], [1, 35], [1, 2], [0, -1], [0, -1], [1, 38], [0, -1], [1, 26], [0, -1], [0, -1], [0, -1], [1, 31], [0, -1], [0, -1], [1, 14], [2, 13], [2, 30], [2, 35], [2, 16], [0, -1], [2, 37], [0, -1], [0, -1], [2, 36], [0, -1], [0, -1], [0, -1], [2, 37], [0, -1], [2, 7], [0, -1], [0, -1], [1, 27], [2, 9], [0, -1], [2, 37], [0, -1], [1, 3], [1, 14], [0, -1], [0, -1], [0, -1], [1, 14], [2, 30], [2, 35], [2, 16], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 3], [2, 30], [2, 35], [2, 42], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [0, 3, 2], [0, 4, 3], [1, 5, 4], [1, 5, 3], [2, 5, 2], [0, 2, 1], [0, 6, 2], [0, 7, 6], [0, 8, 7], [0, 9, 8], [1, 10, 9], [1, 10, 8], [1, 10, 7], [0, 10, 6], [0, 11, 10], [0, 12, 11], [1, 13, 12], [0, 13, 11], [1, 14, 13], [2, 14, 11], [0, 11, 10], [1, 15, 11], [1, 15, 10], [1, 15, 6], [1, 15, 2], [1, 15, 1], [0, 15, 0], [0, 16, 15], [1, 17, 16], [0, 17, 15], [1, 18, 17], [0, 18, 15], [0, 19, 18], [0, 20, 19], [1, 21, 20], [0, 21, 19], [0, 22, 21], [1, 23, 22], [2, 23, 21], [2, 21, 19], [2, 19, 18], [2, 18, 15], [0, 15, 0], [2, 24, 15], [0, 15, 0], [0, 25, 15], [2, 26, 25], [0, 25, 15], [0, 27, 25], [0, 28, 27], [2, 29, 28], [0, 28, 27], [2, 30, 28], [0, 28, 27], [0, 31, 28], [1, 32, 31], [2, 32, 28], [0, 28, 27], [2, 33, 28], [0, 28, 27], [1, 34, 28], [1, 34, 27], [0, 34, 25], [0, 35, 34], [0, 36, 35], [1, 37, 36], [2, 37, 35], [2, 35, 
34], [2, 34, 25], [0, 25, 15], [0, 38, 25], [0, 39, 38], [0, 40, 39], [1, 41, 40], [1, 41, 39], [2, 41, 38], [2, 38, 25], [2, 25, 15], [0, 15, 0], [2, 42, 15], [2, 15, 0]]}}
+{"orig_tokens": ["Final-hour", "trading", "accelerated", "to", "108.1", "million", "shares", ",", "a", "record", "for", "the", "Big", "Board", "."], "tokens": ["<cunk->", "<unking>", "<unked>", "to", "<unkn>", "<unkion>", "shares", ",", "a", "record", "for", "the", "<cunk>", "<cunk>", "."], "token_ids": [11, 17, 3, 59, 10, 8, 198, 45, 113, 123, 99, 40, 7, 7, 62], "tags": ["JJ", "NN", "VBD", "TO", "CD", "CD", "NNS", ",", "DT", "NN", "IN", "DT", "NNP", "NNP", "."], "tree_str": "(S (NP (JJ Final-hour) (NN trading)) (VP (VBD accelerated) (PP (TO to) (NP (NP (QP (CD 108.1) (CD million)) (NNS shares)) (, ,) (NP (NP (DT a) (NN record)) (PP (IN for) (NP (DT the) (NNP Big) (NNP Board))))))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [2, 0], [3, 2, 0], [3, 0], [4, 3, 0], [5, 4, 3, 0], [6, 5, 4, 3, 0], [6, 4, 3, 0], [7, 6, 4, 3, 0], [7, 4, 3, 0], [8, 7, 4, 3, 0], [7, 4, 3, 0], [9, 7, 4, 3, 0], [10, 9, 7, 4, 3, 0], [10, 7, 4, 3, 0], [11, 10, 7, 4, 3, 0], [12, 11, 10, 7, 4, 3, 0], [13, 12, 11, 10, 7, 4, 3, 0], [14, 13, 12, 11, 10, 7, 4, 3, 0], [14, 12, 11, 10, 7, 4, 3, 0], [14, 11, 10, 7, 4, 3, 0], [11, 10, 7, 4, 3, 0], [10, 7, 4, 3, 0], [7, 4, 3, 0], [4, 3, 0], [3, 0], [15, 3, 0], [3, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [7, 8, 9, 10, 11, 12, 13, 14, 15], [7, 8, 9, 10, 11, 12, 13, 14, 15], [8, 9, 10, 11, 12, 13, 14, 15], [8, 9, 10, 11, 12, 13, 14, 15], [9, 10, 11, 12, 13, 14, 15], [9, 10, 11, 12, 13, 14, 15], [10, 11, 12, 13, 14, 15], [11, 12, 13, 14, 15], [11, 12, 13, 14, 15], [12, 13, 14, 15], [13, 14, 15], [14, 15], [15], [15], [15], [15], [15], [15], [15], [15], [], [], 
[]], "actions": [[0, -1], [0, -1], [1, 3], [0, -1], [1, 24], [0, -1], [0, -1], [0, -1], [1, 27], [0, -1], [1, 26], [0, -1], [2, 37], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 14], [2, 30], [2, 35], [2, 4], [2, 30], [2, 35], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [1, 2, 1], [0, 2, 0], [1, 3, 2], [0, 3, 0], [0, 4, 3], [0, 5, 4], [1, 6, 5], [0, 6, 4], [1, 7, 6], [0, 7, 4], [2, 8, 7], [0, 7, 4], [0, 9, 7], [1, 10, 9], [0, 10, 7], [0, 11, 10], [0, 12, 11], [0, 13, 12], [1, 14, 13], [1, 14, 12], [2, 14, 11], [2, 11, 10], [2, 10, 7], [2, 7, 4], [2, 4, 3], [0, 3, 0], [2, 15, 3], [2, 3, 0]]}}
+{"orig_tokens": ["At", "the", "end", "of", "the", "day", ",", "251.2", "million", "shares", "were", "traded", "."], "tokens": ["<cunk>", "the", "<unk>", "of", "the", "<unk>", ",", "<unkn>", "<unkion>", "shares", "were", "<unked>", "."], "token_ids": [7, 40, 16, 26, 40, 16, 45, 10, 8, 198, 250, 3, 62], "tags": ["IN", "DT", "NN", "IN", "DT", "NN", ",", "CD", "CD", "NNS", "VBD", "VBN", "."], "tree_str": "(S (PP (IN At) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (NN day))))) (, ,) (NP (QP (CD 251.2) (CD million)) (NNS shares)) (VP (VBD were) (VP (VBN traded))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [3, 1, 0], [4, 3, 1, 0], [5, 4, 3, 1, 0], [6, 5, 4, 3, 1, 0], [6, 4, 3, 1, 0], [4, 3, 1, 0], [3, 1, 0], [1, 0], [7, 1, 0], [8, 7, 1, 0], [9, 8, 7, 1, 0], [9, 7, 1, 0], [10, 9, 7, 1, 0], [10, 7, 1, 0], [11, 10, 7, 1, 0], [12, 11, 10, 7, 1, 0], [12, 10, 7, 1, 0], [12, 7, 1, 0], [12, 1, 0], [12, 0], [13, 12, 0], [12, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [5, 6, 7, 8, 9, 10, 11, 12, 13], [6, 7, 8, 9, 10, 11, 12, 13], [7, 8, 9, 10, 11, 12, 13], [7, 8, 9, 10, 11, 12, 13], [7, 8, 9, 10, 11, 12, 13], [7, 8, 9, 10, 11, 12, 13], [7, 8, 9, 10, 11, 12, 13], [8, 9, 10, 11, 12, 13], [9, 10, 11, 12, 13], [10, 11, 12, 13], [10, 11, 12, 13], [11, 12, 13], [11, 12, 13], [12, 13], [13], [13], [13], [13], [13], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [0, -1], [1, 14], [2, 30], [2, 35], [2, 30], [0, -1], [0, -1], [0, -1], [1, 27], [0, -1], [1, 26], [0, -1], [0, -1], [1, 6], [1, 25], [1, 37], [1, 35], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [1, 3, 2], [0, 3, 1], [0, 4, 3], [0, 5, 4], [1, 6, 5], [2, 6, 4], [2, 4, 3], [2, 3, 1], [0, 1, 0], [0, 7, 1], [0, 8, 
7], [1, 9, 8], [0, 9, 7], [1, 10, 9], [0, 10, 7], [0, 11, 10], [1, 12, 11], [1, 12, 10], [1, 12, 7], [1, 12, 1], [0, 12, 0], [2, 13, 12], [2, 12, 0]]}}
+{"orig_tokens": ["The", "Dow", "Jones", "industrials", "closed", "at", "2569.26", "."], "tokens": ["The", "<cunk>", "<cunks>", "<unks>", "<unked>", "at", "<unkn>", "."], "token_ids": [28, 7, 5, 6, 3, 31, 10, 62], "tags": ["DT", "NNP", "NNP", "NNS", "VBD", "IN", "CD", "."], "tree_str": "(S (NP (DT The) (NNP Dow) (NNP Jones) (NNS industrials)) (VP (VBD closed) (PP (IN at) (NP (CD 2569.26)))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [4, 3, 2, 1, 0], [4, 2, 1, 0], [4, 1, 0], [4, 0], [5, 4, 0], [5, 0], [6, 5, 0], [7, 6, 5, 0], [6, 5, 0], [5, 0], [8, 5, 0], [5, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8], [2, 3, 4, 5, 6, 7, 8], [3, 4, 5, 6, 7, 8], [4, 5, 6, 7, 8], [5, 6, 7, 8], [5, 6, 7, 8], [5, 6, 7, 8], [5, 6, 7, 8], [6, 7, 8], [6, 7, 8], [7, 8], [8], [8], [8], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], [1, 14], [0, -1], [1, 24], [0, -1], [0, -1], [2, 30], [2, 35], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [0, 3, 2], [1, 4, 3], [1, 4, 2], [1, 4, 1], [0, 4, 0], [1, 5, 4], [0, 5, 0], [0, 6, 5], [2, 7, 6], [2, 6, 5], [0, 5, 0], [2, 8, 5], [2, 5, 0]]}}
+{"orig_tokens": ["The", "Dow", "'s", "decline", "was", "second", "in", "point", "terms", "only", "to", "the", "508-point", "Black", "Monday", "crash", "that", "occurred", "Oct.", "19", ",", "1987", "."], "tokens": ["The", "<cunk>", "'s", "<unk>", "was", "<unk>", "in", "<unk>", "<unks>", "<unkly>", "to", "the", "<unk->", "<cunk>", "<cunk>", "crash", "that", "<unked>", "Oct.", "19", ",", "1987", "."], "token_ids": [28, 7, 33, 16, 56, 16, 42, 16, 6, 2, 59, 40, 1, 7, 7, 215, 188, 3, 23, 24, 45, 213, 62], "tags": ["DT", "NNP", "POS", "NN", "VBD", "JJ", "IN", "NN", "NNS", "RB", "TO", "DT", "JJ", "NNP", "NNP", "NN", "WDT", "VBD", "NNP", "CD", ",", "CD", "."], "tree_str": "(S (NP (NP (DT The) (NNP Dow) (POS 's)) (NN decline)) (VP (VBD was) (ADJP (JJ second) (PP (IN in) (NP (NN point) (NNS terms))) (PP (ADVP (RB only)) (TO to) (NP (NP (DT the) (JJ 508-point) (NNP Black) (NNP Monday) (NN crash)) (SBAR (WHNP (WDT that)) (S (VP (VBD occurred) (NP (NNP Oct.) (CD 19) (, ,) (CD 1987))))))))) (. .))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [2, 0], [3, 2, 0], [2, 0], [4, 2, 0], [4, 0], [5, 4, 0], [6, 5, 4, 0], [6, 4, 0], [6, 0], [7, 6, 0], [8, 7, 6, 0], [9, 8, 7, 6, 0], [9, 7, 6, 0], [7, 6, 0], [6, 0], [10, 6, 0], [11, 10, 6, 0], [11, 6, 0], [12, 11, 6, 0], [13, 12, 11, 6, 0], [14, 13, 12, 11, 6, 0], [15, 14, 13, 12, 11, 6, 0], [16, 15, 14, 13, 12, 11, 6, 0], [16, 14, 13, 12, 11, 6, 0], [16, 13, 12, 11, 6, 0], [16, 12, 11, 6, 0], [16, 11, 6, 0], [17, 16, 11, 6, 0], [18, 17, 16, 11, 6, 0], [18, 16, 11, 6, 0], [19, 18, 16, 11, 6, 0], [20, 19, 18, 16, 11, 6, 0], [19, 18, 16, 11, 6, 0], [21, 19, 18, 16, 11, 6, 0], [19, 18, 16, 11, 6, 0], [22, 19, 18, 16, 11, 6, 0], [19, 18, 16, 11, 6, 0], [18, 16, 11, 6, 0], [16, 11, 6, 0], [11, 6, 0], [6, 0], [23, 6, 0], [6, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [15, 16, 17, 18, 19, 20, 21, 22, 23], [16, 17, 18, 19, 20, 21, 22, 23], [17, 18, 19, 20, 21, 22, 23], [17, 18, 19, 20, 21, 22, 23], [17, 18, 19, 20, 21, 22, 23], [17, 18, 19, 20, 21, 22, 23], [17, 18, 19, 20, 21, 22, 23], [18, 19, 20, 21, 22, 23], [19, 20, 21, 22, 23], [19, 20, 21, 22, 23], [20, 21, 22, 23], [21, 22, 23], [21, 22, 23], [22, 23], [22, 23], [23], [23], [23], [23], [23], [23], [], [], []], "actions": [[0, -1], [0, -1], [1, 14], [0, -1], [2, 32], [0, -1], [1, 31], [0, -1], [0, -1], [1, 10], [1, 24], [0, -1], [0, -1], [0, -1], [1, 22], [2, 30], [2, 35], [0, -1], [0, -1], [1, 2], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 22], [1, 22], 
[1, 3], [1, 14], [0, -1], [0, -1], [1, 24], [0, -1], [0, -1], [2, 26], [0, -1], [2, 37], [0, -1], [2, 26], [2, 41], [2, 39], [2, 30], [2, 35], [0, -1], [2, 37], [2, 40]], "action_tuples": [[0, -1, -1], [0, 1, 0], [1, 2, 1], [0, 2, 0], [2, 3, 2], [0, 2, 0], [1, 4, 2], [0, 4, 0], [0, 5, 4], [1, 6, 5], [1, 6, 4], [0, 6, 0], [0, 7, 6], [0, 8, 7], [1, 9, 8], [2, 9, 7], [2, 7, 6], [0, 6, 0], [0, 10, 6], [1, 11, 10], [0, 11, 6], [0, 12, 11], [0, 13, 12], [0, 14, 13], [0, 15, 14], [1, 16, 15], [1, 16, 14], [1, 16, 13], [1, 16, 12], [0, 16, 11], [0, 17, 16], [1, 18, 17], [0, 18, 16], [0, 19, 18], [2, 20, 19], [0, 19, 18], [2, 21, 19], [0, 19, 18], [2, 22, 19], [2, 19, 18], [2, 18, 16], [2, 16, 11], [2, 11, 6], [0, 6, 0], [2, 23, 6], [2, 6, 0]]}}
+{"orig_tokens": ["In", "percentage", "terms", ",", "however", ",", "the", "Dow", "'s", "dive", "was", "the", "12th-worst", "ever", "and", "the", "sharpest", "since", "the", "market", "fell", "156.83", ",", "or", "8", "%", ",", "a", "week", "after", "Black", "Monday", "."], "tokens": ["In", "<unk>", "<unks>", ",", "<unkER>", ",", "the", "<cunk>", "'s", "<unk>", "was", "the", "<unk->", "<unkER>", "and", "the", "<unk>", "since", "the", "market", "<unk>", "<unkn>", ",", "<unk>", "<unkn>", "%", ",", "a", "<unk>", "after", "<cunk>", "<cunk>", "."], "token_ids": [21, 16, 6, 45, 13, 45, 40, 7, 33, 16, 56, 40, 1, 13, 92, 40, 16, 249, 40, 214, 16, 10, 45, 16, 10, 268, 45, 113, 16, 264, 7, 7, 62], "tags": ["IN", "NN", "NNS", ",", "RB", ",", "DT", "NNP", "POS", "NN", "VBD", "DT", "JJ", "RB", "CC", "DT", "JJS", "IN", "DT", "NN", "VBD", "CD", ",", "CC", "CD", "NN", ",", "DT", "NN", "IN", "NNP", "NNP", "."], "tree_str": "(S (PP (IN In) (NP (NN percentage) (NNS terms))) (, ,) (ADVP (RB however)) (, ,) (NP (NP (DT the) (NNP Dow) (POS 's)) (NN dive)) (VP (VBD was) (NP (NP (NP (DT the) (JJ 12th-worst)) (ADVP (RB ever))) (CC and) (NP (NP (DT the) (JJS sharpest)) (SBAR (IN since) (S (NP (DT the) (NN market)) (VP (VBD fell) (NP (NP (CD 156.83)) (, ,) (CC or) (NP (CD 8) (NN %))) (, ,) (PP (NP (DT a) (NN week)) (IN after) (NP (NNP Black) (NNP Monday))))))))) (. 
.))", "key": "sentence", "projective": true, "ASd": {"gold_stacks": [[0], [1, 0], [2, 1, 0], [3, 2, 1, 0], [3, 1, 0], [1, 0], [4, 1, 0], [5, 4, 1, 0], [6, 5, 4, 1, 0], [7, 6, 5, 4, 1, 0], [8, 7, 6, 5, 4, 1, 0], [8, 6, 5, 4, 1, 0], [9, 8, 6, 5, 4, 1, 0], [8, 6, 5, 4, 1, 0], [10, 8, 6, 5, 4, 1, 0], [10, 6, 5, 4, 1, 0], [11, 10, 6, 5, 4, 1, 0], [12, 11, 10, 6, 5, 4, 1, 0], [13, 12, 11, 10, 6, 5, 4, 1, 0], [13, 11, 10, 6, 5, 4, 1, 0], [13, 10, 6, 5, 4, 1, 0], [13, 6, 5, 4, 1, 0], [13, 5, 4, 1, 0], [13, 4, 1, 0], [13, 1, 0], [13, 0], [14, 13, 0], [13, 0], [15, 13, 0], [13, 0], [16, 13, 0], [17, 16, 13, 0], [17, 13, 0], [18, 17, 13, 0], [19, 18, 17, 13, 0], [20, 19, 18, 17, 13, 0], [20, 18, 17, 13, 0], [21, 20, 18, 17, 13, 0], [21, 18, 17, 13, 0], [22, 21, 18, 17, 13, 0], [23, 22, 21, 18, 17, 13, 0], [22, 21, 18, 17, 13, 0], [24, 22, 21, 18, 17, 13, 0], [22, 21, 18, 17, 13, 0], [25, 22, 21, 18, 17, 13, 0], [26, 25, 22, 21, 18, 17, 13, 0], [26, 22, 21, 18, 17, 13, 0], [22, 21, 18, 17, 13, 0], [21, 18, 17, 13, 0], [27, 21, 18, 17, 13, 0], [21, 18, 17, 13, 0], [28, 21, 18, 17, 13, 0], [29, 28, 21, 18, 17, 13, 0], [29, 21, 18, 17, 13, 0], [30, 29, 21, 18, 17, 13, 0], [30, 21, 18, 17, 13, 0], [31, 30, 21, 18, 17, 13, 0], [32, 31, 30, 21, 18, 17, 13, 0], [32, 30, 21, 18, 17, 13, 0], [30, 21, 18, 17, 13, 0], [21, 18, 17, 13, 0], [18, 17, 13, 0], [17, 13, 0], [13, 0], [33, 13, 0], [13, 0], [0]], "gold_buffers": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 
29, 30, 31, 32, 33], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [15, 16, 
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [25, 26, 27, 28, 29, 30, 31, 32, 33], [25, 26, 27, 28, 29, 30, 31, 32, 33], [26, 27, 28, 29, 30, 31, 32, 33], [27, 28, 29, 30, 31, 32, 33], [27, 28, 29, 30, 31, 32, 33], [27, 28, 29, 30, 31, 32, 33], [27, 28, 29, 30, 31, 32, 33], [28, 29, 30, 31, 32, 33], [28, 29, 30, 31, 32, 33], [29, 30, 31, 32, 33], [30, 31, 32, 33], [30, 31, 32, 33], [31, 32, 33], [31, 32, 33], [32, 33], [33], [33], [33], [33], [33], [33], [33], [], [], []], "actions": [[0, -1], [0, -1], [0, -1], [1, 22], [2, 30], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 14], [0, -1], [2, 32], [0, -1], [1, 31], [0, -1], [0, -1], [0, -1], [1, 14], [1, 10], [1, 24], [1, 37], [1, 2], [1, 37], [1, 35], [0, -1], [2, 2], [0, -1], [2, 7], [0, -1], [0, -1], [1, 14], [0, -1], [0, -1], [0, -1], [1, 14], [0, -1], [1, 24], [0, -1], [0, -1], [2, 37], [0, -1], [2, 7], [0, -1], [0, -1], [1, 26], [2, 9], [2, 16], [0, -1], [2, 37], [0, -1], [0, -1], [1, 14], [0, -1], [1, 23], [0, -1], [0, -1], [1, 22], [2, 30], [2, 35], [2, 29], [2, 35], [2, 9], [0, -1], [2, 37], [2, 40]], 
"action_tuples": [[0, -1, -1], [0, 1, 0], [0, 2, 1], [1, 3, 2], [2, 3, 1], [0, 1, 0], [0, 4, 1], [0, 5, 4], [0, 6, 5], [0, 7, 6], [1, 8, 7], [0, 8, 6], [2, 9, 8], [0, 8, 6], [1, 10, 8], [0, 10, 6], [0, 11, 10], [0, 12, 11], [1, 13, 12], [1, 13, 11], [1, 13, 10], [1, 13, 6], [1, 13, 5], [1, 13, 4], [1, 13, 1], [0, 13, 0], [2, 14, 13], [0, 13, 0], [2, 15, 13], [0, 13, 0], [0, 16, 13], [1, 17, 16], [0, 17, 13], [0, 18, 17], [0, 19, 18], [1, 20, 19], [0, 20, 18], [1, 21, 20], [0, 21, 18], [0, 22, 21], [2, 23, 22], [0, 22, 21], [2, 24, 22], [0, 22, 21], [0, 25, 22], [1, 26, 25], [2, 26, 22], [2, 22, 21], [0, 21, 18], [2, 27, 21], [0, 21, 18], [0, 28, 21], [1, 29, 28], [0, 29, 21], [1, 30, 29], [0, 30, 21], [0, 31, 30], [1, 32, 31], [2, 32, 30], [2, 30, 21], [2, 21, 18], [2, 18, 17], [2, 17, 13], [0, 13, 0], [2, 33, 13], [2, 13, 0]]}}
diff --git a/data/test.txt b/data/test.txt
new file mode 100644
index 0000000..acfabbc
--- /dev/null
+++ b/data/test.txt
@@ -0,0 +1,20 @@
+(S (INTJ (RB No)) (, ,) (NP (PRP it)) (VP (VBD was) (RB n't) (NP (NNP Black) (NNP Monday))) (. .))
+(S (CC But) (SBAR (IN while) (S (NP (DT the) (NNP New) (NNP York) (NNP Stock) (NNP Exchange)) (VP (VBD did) (RB n't) (VP (VB fall) (ADVP (RB apart)) (NP (NNP Friday)) (SBAR (IN as) (S (NP (DT the) (NNP Dow) (NNP Jones) (NNP Industrial) (NNP Average)) (VP (VBD plunged) (NP (NP (CD 190.58) (NNS points)) (PRN (: --) (NP (NP (JJS most)) (PP (IN of) (NP (PRP it))) (PP (IN in) (NP (DT the) (JJ final) (NN hour)))) (: --)))))))))) (NP (PRP it)) (ADVP (RB barely)) (VP (VBD managed) (S (VP (TO to) (VP (VB stay) (NP (NP (DT this) (NN side)) (PP (IN of) (NP (NN chaos)))))))) (. .))
+(S (NP (NP (DT Some) (`` ``) (NN circuit) (NNS breakers) ('' '')) (VP (VBN installed) (PP (IN after) (NP (DT the) (NNP October) (CD 1987) (NN crash))))) (VP (VBD failed) (NP (PRP$ their) (JJ first) (NN test)) (PRN (, ,) (S (NP (NNS traders)) (VP (VBP say))) (, ,)) (S (ADJP (JJ unable) (S (VP (TO to) (VP (VB cool) (NP (NP (DT the) (NN selling) (NN panic)) (PP (IN in) (NP (DT both) (NNS stocks) (CC and) (NNS futures)))))))))) (. .))
+(S (NP (NP (NP (DT The) (CD 49) (NN stock) (NN specialist) (NNS firms)) (PP (IN on) (NP (DT the) (NNP Big) (NNP Board) (NN floor)))) (: --) (NP (NP (DT the) (NNS buyers) (CC and) (NNS sellers)) (PP (IN of) (NP (JJ last) (NN resort))) (SBAR (WHNP (WP who)) (S (VP (VBD were) (VP (VBN criticized) (PP (IN after) (NP (DT the) (CD 1987) (NN crash)))))))) (: --)) (ADVP (RB once) (RB again)) (VP (MD could) (RB n't) (VP (VB handle) (NP (DT the) (NN selling) (NN pressure)))) (. .))
+(S (S (NP (JJ Big) (NN investment) (NNS banks)) (VP (VBD refused) (S (VP (TO to) (VP (VB step) (ADVP (IN up) (PP (TO to) (NP (DT the) (NN plate)))) (S (VP (TO to) (VP (VB support) (NP (DT the) (JJ beleaguered) (NN floor) (NNS traders)) (PP (IN by) (S (VP (VBG buying) (NP (NP (JJ big) (NNS blocks)) (PP (IN of) (NP (NN stock))))))))))))))) (, ,) (NP (NNS traders)) (VP (VBP say)) (. .))
+(S (NP (NP (JJ Heavy) (NN selling)) (PP (IN of) (NP (NP (NNP Standard) (CC &) (NNP Poor) (POS 's)) (JJ 500-stock) (NN index) (NNS futures))) (PP (IN in) (NP (NNP Chicago)))) (VP (ADVP (RB relentlessly)) (VBD beat) (NP (NNS stocks)) (ADVP (RB downward))) (. .))
+(S (NP (NP (CD Seven) (NNP Big) (NNP Board) (NNS stocks)) (: --) (NP (NP (NNP UAL)) (, ,) (NP (NNP AMR)) (, ,) (NP (NNP BankAmerica)) (, ,) (NP (NNP Walt) (NNP Disney)) (, ,) (NP (NNP Capital) (NNP Cities\/ABC)) (, ,) (NP (NNP Philip) (NNP Morris)) (CC and) (NP (NNP Pacific) (NNP Telesis) (NNP Group))) (: --)) (VP (VP (VBD stopped) (S (VP (VBG trading)))) (CC and) (VP (ADVP (RB never)) (VBD resumed))) (. .))
+(S (NP (DT The) (NN finger-pointing)) (VP (VBZ has) (ADVP (RB already)) (VP (VBN begun))) (. .))
+(S (`` ``) (NP (DT The) (NN equity) (NN market)) (VP (VBD was) (ADJP (JJ illiquid))) (. .))
+(SINV (S (ADVP (RB Once) (RB again)) (-LRB- -LCB-) (NP (DT the) (NNS specialists)) (-RRB- -RCB-) (VP (VBD were) (RB not) (ADJP (JJ able) (S (VP (TO to) (VP (VB handle) (NP (NP (DT the) (NNS imbalances)) (PP (IN on) (NP (NP (DT the) (NN floor)) (PP (IN of) (NP (DT the) (NNP New) (NNP York) (NNP Stock) (NNP Exchange)))))))))))) (, ,) ('' '') (VP (VBD said)) (NP (NP (NNP Christopher) (NNP Pedersen)) (, ,) (NP (NP (JJ senior) (NN vice) (NN president)) (PP (IN at) (NP (NNP Twenty-First) (NNP Securities) (NNP Corp))))) (. .))
+(SINV (VP (VBD Countered)) (NP (NP (NNP James) (NNP Maguire)) (, ,) (NP (NP (NN chairman)) (PP (IN of) (NP (NNS specialists) (NNP Henderson) (NNP Brothers) (NNP Inc.))))) (: :) (`` ``) (S (NP (PRP It)) (VP (VBZ is) (ADJP (JJ easy)) (S (VP (TO to) (VP (VB say) (SBAR (S (NP (DT the) (NN specialist)) (VP (VBZ is) (RB n't) (VP (VBG doing) (NP (PRP$ his) (NN job))))))))))) (. .))
+(S (SBAR (WHADVP (WRB When)) (S (NP (DT the) (NN dollar)) (VP (VBZ is) (PP (IN in) (NP (DT a) (NN free-fall)))))) (, ,) (NP (RB even) (JJ central) (NNS banks)) (VP (MD ca) (RB n't) (VP (VB stop) (NP (PRP it)))) (. .))
+(S (NP (NNS Speculators)) (VP (VBP are) (VP (VBG calling) (PP (IN for) (NP (NP (DT a) (NN degree)) (PP (IN of) (NP (NN liquidity))) (SBAR (WHNP (WDT that)) (S (VP (VBZ is) (RB not) (ADVP (RB there)) (PP (IN in) (NP (DT the) (NN market)))))))))) (. .) ('' ''))
+(S (NP (NP (JJ Many) (NN money) (NNS managers)) (CC and) (NP (DT some) (NNS traders))) (VP (VBD had) (ADVP (RB already)) (VP (VBN left) (NP (PRP$ their) (NNS offices)) (NP (RB early) (NNP Friday) (NN afternoon)) (PP (IN on) (NP (DT a) (JJ warm) (NN autumn) (NN day))) (: --) (SBAR (IN because) (S (NP (DT the) (NN stock) (NN market)) (VP (VBD was) (ADJP (RB so) (JJ quiet))))))) (. .))
+(S (RB Then) (PP (IN in) (NP (DT a) (NN lightning) (NN plunge))) (, ,) (NP (DT the) (NNP Dow) (NNP Jones) (NNS industrials)) (PP (IN in) (NP (QP (RB barely) (DT an)) (NN hour))) (VP (VBD surrendered) (NP (NP (QP (RB about) (DT a)) (JJ third)) (PP (IN of) (NP (NP (PRP$ their) (NNS gains)) (NP (DT this) (NN year))))) (, ,) (S (VP (VBG chalking) (PRT (RP up)) (NP (NP (DT a) (ADJP (ADJP (JJ 190.58-point)) (, ,) (CC or) (ADJP (CD 6.9) (NN %)) (, ,)) (NN loss)) (PP (IN on) (NP (DT the) (NN day)))) (PP (IN in) (NP (JJ gargantuan) (NN trading) (NN volume)))))) (. .))
+(S (NP (JJ Final-hour) (NN trading)) (VP (VBD accelerated) (PP (TO to) (NP (NP (QP (CD 108.1) (CD million)) (NNS shares)) (, ,) (NP (NP (DT a) (NN record)) (PP (IN for) (NP (DT the) (NNP Big) (NNP Board))))))) (. .))
+(S (PP (IN At) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (NN day))))) (, ,) (NP (QP (CD 251.2) (CD million)) (NNS shares)) (VP (VBD were) (VP (VBN traded))) (. .))
+(S (NP (DT The) (NNP Dow) (NNP Jones) (NNS industrials)) (VP (VBD closed) (PP (IN at) (NP (CD 2569.26)))) (. .))
+(S (NP (NP (DT The) (NNP Dow) (POS 's)) (NN decline)) (VP (VBD was) (ADJP (JJ second) (PP (IN in) (NP (NN point) (NNS terms))) (PP (ADVP (RB only)) (TO to) (NP (NP (DT the) (JJ 508-point) (NNP Black) (NNP Monday) (NN crash)) (SBAR (WHNP (WDT that)) (S (VP (VBD occurred) (NP (NNP Oct.) (CD 19) (, ,) (CD 1987))))))))) (. .))
+(S (PP (IN In) (NP (NN percentage) (NNS terms))) (, ,) (ADVP (RB however)) (, ,) (NP (NP (DT the) (NNP Dow) (POS 's)) (NN dive)) (VP (VBD was) (NP (NP (NP (DT the) (JJ 12th-worst)) (ADVP (RB ever))) (CC and) (NP (NP (DT the) (JJS sharpest)) (SBAR (IN since) (S (NP (DT the) (NN market)) (VP (VBD fell) (NP (NP (CD 156.83)) (, ,) (CC or) (NP (CD 8) (NN %))) (, ,) (PP (NP (DT a) (NN week)) (IN after) (NP (NNP Black) (NNP Monday))))))))) (. .))
diff --git a/data/train.txt b/data/train.txt
new file mode 100644
index 0000000..a794269
--- /dev/null
+++ b/data/train.txt
@@ -0,0 +1,20 @@
+(S (PP (IN In) (NP (NP (DT an) (NNP Oct.) (CD 19) (NN review)) (PP (IN of) (NP (`` ``) (NP (DT The) (NN Misanthrope)) ('' '') (PP (IN at) (NP (NP (NNP Chicago) (POS 's)) (NNP Goodman) (NNP Theatre))))) (PRN (-LRB- -LRB-) (`` ``) (S (NP (VBN Revitalized) (NNS Classics)) (VP (VBP Take) (NP (DT the) (NN Stage)) (PP (IN in) (NP (NNP Windy) (NNP City))))) (, ,) ('' '') (NP (NN Leisure) (CC &) (NNS Arts)) (-RRB- -RRB-)))) (, ,) (NP (NP (NP (DT the) (NN role)) (PP (IN of) (NP (NNP Celimene)))) (, ,) (VP (VBN played) (PP (IN by) (NP (NNP Kim) (NNP Cattrall)))) (, ,)) (VP (VBD was) (VP (ADVP (RB mistakenly)) (VBN attributed) (PP (TO to) (NP (NNP Christina) (NNP Haag))))) (. .))
+(S (NP (NNP Ms.) (NNP Haag)) (VP (VBZ plays) (NP (NNP Elianti))) (. .))
+(S (NP (NNP Rolls-Royce) (NNP Motor) (NNPS Cars) (NNP Inc.)) (VP (VBD said) (SBAR (S (NP (PRP it)) (VP (VBZ expects) (S (NP (PRP$ its) (NNP U.S.) (NNS sales)) (VP (TO to) (VP (VB remain) (ADJP (JJ steady)) (PP (IN at) (NP (QP (IN about) (CD 1,200)) (NNS cars))) (PP (IN in) (NP (CD 1990)))))))))) (. .))
+(S (NP (DT The) (NN luxury) (NN auto) (NN maker)) (NP (JJ last) (NN year)) (VP (VBD sold) (NP (CD 1,214) (NNS cars)) (PP (IN in) (NP (DT the) (NNP U.S.)))))
+(S (NP (NP (NNP Howard) (NNP Mosher)) (, ,) (NP (NP (NN president)) (CC and) (NP (JJ chief) (NN executive) (NN officer))) (, ,)) (VP (VBD said) (SBAR (S (NP (PRP he)) (VP (VBZ anticipates) (NP (NP (NN growth)) (PP (IN for) (NP (DT the) (NN luxury) (NN auto) (NN maker))) (PP (PP (IN in) (NP (NNP Britain) (CC and) (NNP Europe))) (, ,) (CC and) (PP (IN in) (NP (ADJP (JJ Far) (JJ Eastern)) (NNS markets))))))))) (. .))
+(S (NP (NNP BELL) (NNP INDUSTRIES) (NNP Inc.)) (VP (VBD increased) (NP (PRP$ its) (NN quarterly)) (PP (TO to) (NP (CD 10) (NNS cents))) (PP (IN from) (NP (NP (CD seven) (NNS cents)) (NP (DT a) (NN share))))) (. .))
+(S (NP (DT The) (JJ new) (NN rate)) (VP (MD will) (VP (VB be) (ADJP (JJ payable) (NP (NNP Feb.) (CD 15))))) (. .))
+(S (NP (DT A) (NN record) (NN date)) (VP (VBZ has) (RB n't) (VP (VBN been) (VP (VBN set)))) (. .))
+(S (NP (NP (NNP Bell)) (, ,) (VP (VBN based) (PP (IN in) (NP (NNP Los) (NNP Angeles)))) (, ,)) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (UCP (JJ electronic) (, ,) (NN computer) (CC and) (NN building)) (NNS products))) (. .))
+(S (NP (NNS Investors)) (VP (VBP are) (VP (VBG appealing) (PP (TO to) (NP (DT the) (NNPS Securities) (CC and) (NNP Exchange) (NNP Commission))) (S (RB not) (VP (TO to) (VP (VB limit) (NP (NP (PRP$ their) (NN access)) (PP (TO to) (NP (NP (NN information)) (PP (IN about) (NP (NP (NN stock) (NNS purchases) (CC and) (NNS sales)) (PP (IN by) (NP (JJ corporate) (NNS insiders))))))))))))) (. .))
+(S (S (NP (DT A) (NNP SEC) (NN proposal) (S (VP (TO to) (VP (VB ease) (NP (NP (NN reporting) (NNS requirements)) (PP (IN for) (NP (DT some) (NN company) (NNS executives)))))))) (VP (MD would) (VP (VB undermine) (NP (NP (DT the) (NN usefulness)) (PP (IN of) (NP (NP (NN information)) (PP (IN on) (NP (NN insider) (NNS trades))))) (PP (IN as) (NP (DT a) (JJ stock-picking) (NN tool))))))) (, ,) (NP (NP (JJ individual) (NNS investors)) (CC and) (NP (JJ professional) (NN money) (NNS managers))) (VP (VBP contend)) (. .))
+(S (NP (PRP They)) (VP (VBP make) (NP (DT the) (NN argument)) (PP (IN in) (NP (NP (NNS letters)) (PP (TO to) (NP (DT the) (NN agency))) (PP (IN about) (NP (NP (NN rule) (NNS changes)) (VP (VBD proposed) (NP (DT this) (JJ past) (NN summer))) (SBAR (WHNP (IN that)) (, ,) (S (PP (IN among) (NP (JJ other) (NNS things))) (, ,) (VP (MD would) (VP (VB exempt) (NP (JJ many) (JJ middle-management) (NNS executives)) (PP (IN from) (S (VP (VBG reporting) (NP (NP (NNS trades)) (PP (IN in) (NP (NP (PRP$ their) (JJ own) (NNS companies) (POS ')) (NNS shares)))))))))))))))) (. .))
+(S (NP (DT The) (VBN proposed) (NNS changes)) (ADVP (RB also)) (VP (MD would) (VP (VB allow) (S (NP (NNS executives)) (VP (TO to) (VP (VB report) (NP (NP (NNS exercises)) (PP (IN of) (NP (NNS options)))) (ADVP (ADVP (RBR later)) (CC and) (ADVP (RBR less) (RB often)))))))) (. .))
+(S (NP (NP (JJ Many)) (PP (IN of) (NP (DT the) (NNS letters)))) (VP (VBP maintain) (SBAR (IN that) (S (S (NP (NN investor) (NN confidence)) (VP (VBZ has) (VP (VBN been) (VP (ADVP (RB so)) (VBN shaken) (PP (IN by) (NP (DT the) (CD 1987) (NN stock) (NN market) (NN crash))))))) (: --) (CC and) (S (NP (DT the) (NNS markets)) (ADVP (RB already)) (VP (ADVP (RB so)) (VBN stacked) (PP (IN against) (NP (DT the) (JJ little) (NN guy))))) (: --) (SBAR (IN that) (S (NP (NP (DT any) (NN decrease)) (PP (IN in) (NP (NP (NN information)) (PP (IN on) (NP (NN insider-trading) (NNS patterns)))))) (VP (MD might) (VP (VB prompt) (S (NP (NNS individuals)) (VP (TO to) (VP (VB get) (ADVP (RB out) (PP (IN of) (NP (NNS stocks)))) (ADVP (RB altogether)))))))))))) (. .))
+(SINV (`` ``) (S (NP (DT The) (NNP SEC)) (VP (VBZ has) (ADVP (RB historically)) (VP (VBN paid) (NP (NN obeisance)) (PP (TO to) (NP (NP (DT the) (NN ideal)) (PP (IN of) (NP (DT a) (JJ level) (NN playing) (NN field)))))))) (, ,) ('' '') (VP (VBD wrote)) (NP (NP (NNP Clyde) (NNP S.) (NNP McGregor)) (PP (IN of) (NP (NP (NNP Winnetka)) (, ,) (NP (NNP Ill.)) (, ,)))) (PP (IN in) (NP (NP (CD one)) (PP (IN of) (NP (NP (DT the) (CD 92) (NNS letters)) (SBAR (S (NP (DT the) (NN agency)) (VP (VBZ has) (VP (VBN received) (SBAR (IN since) (S (NP (DT the) (NNS changes)) (VP (VBD were) (VP (VBN proposed) (NP (NNP Aug.) (CD 17)))))))))))))) (. .))
+(S (`` ``) (ADVP (RB Apparently)) (NP (DT the) (NN commission)) (VP (VBD did) (RB not) (ADVP (RB really)) (VP (VB believe) (PP (IN in) (NP (DT this) (NN ideal))))) (. .) ('' ''))
+(S (ADVP (RB Currently)) (, ,) (NP (DT the) (NNS rules)) (VP (VBP force) (S (NP (NP (NNS executives)) (, ,) (NP (NNS directors)) (CC and) (NP (JJ other) (JJ corporate) (NNS insiders))) (VP (TO to) (VP (VB report) (NP (NP (NNS purchases) (CC and) (NNS sales)) (PP (IN of) (NP (NP (PRP$ their) (NNS companies) (POS ')) (NNS shares)))) (PP (IN within) (NP (NP (QP (IN about) (DT a)) (NN month)) (PP (IN after) (NP (DT the) (NN transaction))))))))) (. .))
+(S (CC But) (NP (NP (QP (IN about) (CD 25)) (NN %)) (PP (IN of) (NP (DT the) (NNS insiders)))) (, ,) (PP (VBG according) (PP (TO to) (NP (NNP SEC) (NNS figures)))) (, ,) (VP (VBP file) (NP (PRP$ their) (NNS reports)) (ADVP (RB late))) (. .))
+(SINV (S (NP (DT The) (NNS changes)) (VP (VBD were) (VP (VBN proposed) (PP (IN in) (NP (DT an) (NN effort) (S (VP (TO to) (VP (VP (VB streamline) (NP (JJ federal) (NN bureaucracy))) (CC and) (VP (VB boost) (NP (NP (NN compliance)) (PP (IN by) (NP (NP (DT the) (NNS executives)) (`` ``) (SBAR (WHNP (WP who)) (S (VP (VBP are) (ADVP (RB really)) (VP (VBG calling) (NP (DT the) (NNS shots)))))))))))))))))) (, ,) ('' '') (VP (VBD said)) (NP (NP (NNP Brian) (NNP Lane)) (, ,) (NP (NP (JJ special) (NN counsel)) (PP (IN at) (NP (NP (NP (NP (DT the) (NNP SEC) (POS 's)) (NN office)) (PP (IN of) (NP (NN disclosure) (NN policy)))) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD proposed) (NP (DT the) (NNS changes))))))))) (. .))
+(S (S (S (NP (NP (NNS Investors)) (, ,) (NP (NN money) (NNS managers)) (CC and) (NP (JJ corporate) (NNS officials))) (VP (VBD had) (PP (IN until) (NP (NN today))) (S (VP (TO to) (VP (VB comment) (PP (IN on) (NP (DT the) (NNS proposals)))))))) (, ,) (CC and) (S (NP (DT the) (NN issue)) (VP (VBZ has) (VP (VBN produced) (NP (NP (JJR more) (NN mail)) (PP (IN than) (NP (NP (ADJP (RB almost) (DT any)) (JJ other) (NN issue)) (PP (IN in) (NP (NN memory)))))))))) (, ,) (NP (NNP Mr.) (NNP Lane)) (VP (VBD said)) (. .))
diff --git a/data/valid.txt b/data/valid.txt
new file mode 100644
index 0000000..2d07eb2
--- /dev/null
+++ b/data/valid.txt
@@ -0,0 +1,20 @@
+(S (NP (NP (DT The) (NN economy) (POS 's)) (NN temperature)) (VP (MD will) (VP (VB be) (VP (VBN taken) (PP (IN from) (NP (JJ several) (NN vantage) (NNS points))) (NP (DT this) (NN week)) (, ,) (PP (IN with) (NP (NP (NNS readings)) (PP (IN on) (NP (NP (NN trade)) (, ,) (NP (NN output)) (, ,) (NP (NN housing)) (CC and) (NP (NN inflation))))))))) (. .))
+(S (NP (DT The) (ADJP (RBS most) (JJ troublesome)) (NN report)) (VP (MD may) (VP (VB be) (NP (NP (DT the) (NNP August) (NN merchandise) (NN trade) (NN deficit)) (ADJP (JJ due) (ADVP (IN out)) (NP (NN tomorrow)))))) (. .))
+(S (NP (DT The) (NN trade) (NN gap)) (VP (VBZ is) (VP (VBN expected) (S (VP (TO to) (VP (VB widen) (PP (TO to) (NP (QP (IN about) ($ $) (CD 9) (CD billion)))) (PP (IN from) (NP (NP (NNP July) (POS 's)) (QP ($ $) (CD 7.6) (CD billion))))))) (, ,) (PP (VBG according) (PP (TO to) (NP (NP (DT a) (NN survey)) (PP (IN by) (NP (NP (NNP MMS) (NNP International)) (, ,) (NP (NP (DT a) (NN unit)) (PP (IN of) (NP (NP (NNP McGraw-Hill) (NNP Inc.)) (, ,) (NP (NNP New) (NNP York)))))))))))) (. .))
+(S (NP (NP (NP (NNP Thursday) (POS 's)) (NN report)) (PP (IN on) (NP (DT the) (NNP September) (NN consumer) (NN price) (NN index)))) (VP (VBZ is) (VP (VBN expected) (S (VP (TO to) (VP (VB rise) (, ,) (SBAR (IN although) (ADVP (ADVP (RB not) (RB as) (RB sharply)) (PP (IN as) (NP (NP (DT the) (ADJP (CD 0.9) (NN %)) (NN gain)) (VP (VBN reported) (NP (NNP Friday)) (PP (IN in) (NP (DT the) (NN producer) (NN price) (NN index))))))))))))) (. .))
+(S (NP (DT That) (NN gain)) (VP (VBD was) (VP (VBG being) (VP (VBD cited) (PP (IN as) (NP (NP (DT a) (NN reason)) (SBAR (S (NP (DT the) (NN stock) (NN market)) (VP (VBD was) (ADVP (IN down)) (ADVP (RB early) (PP (IN in) (NP (NP (NNP Friday) (POS 's)) (NN session)))) (, ,) (SBAR (IN before) (S (NP (PRP it)) (VP (VBD got) (S (VP (VBN started) (PP (IN on) (NP (PRP$ its) (JJ reckless) (JJ 190-point) (NN plunge)))))))))))))))) (. .))
+(S (NP (NNS Economists)) (VP (VBP are) (VP (VBN divided) (PP (IN as) (PP (TO to) (SBAR (WHNP (WHADVP (WRB how) (JJ much)) (VBG manufacturing) (NN strength)) (S (NP (PRP they)) (VP (VBP expect) (S (VP (TO to) (VP (VB see) (PP (IN in) (NP (NP (NP (NNP September) (NNS reports)) (PP (IN on) (NP (NP (JJ industrial) (NN production)) (CC and) (NP (NN capacity) (NN utilization))))) (, ,) (ADJP (ADVP (RB also)) (JJ due) (NP (NN tomorrow))))))))))))))) (. .))
+(S (ADVP (RB Meanwhile)) (, ,) (NP (NP (NNP September) (NN housing) (NNS starts)) (, ,) (ADJP (JJ due) (NP (NNP Wednesday))) (, ,)) (VP (VBP are) (VP (VBN thought) (S (VP (TO to) (VP (VB have) (VP (VBN inched) (ADVP (RB upward)))))))) (. .))
+(SINV (S (`` ``) (NP (EX There)) (VP (VBZ 's) (NP (NP (DT a) (NN possibility)) (PP (IN of) (NP (NP (DT a) (NN surprise)) ('' '') (PP (IN in) (NP (DT the) (NN trade) (NN report)))))))) (, ,) (VP (VBD said)) (NP (NP (NNP Michael) (NNP Englund)) (, ,) (NP (NP (NN director)) (PP (IN of) (NP (NN research))) (PP (IN at) (NP (NNP MMS))))) (. .))
+(S (S (NP (NP (DT A) (NN widening)) (PP (IN of) (NP (DT the) (NN deficit)))) (, ,) (SBAR (IN if) (S (NP (PRP it)) (VP (VBD were) (VP (VBN combined) (PP (IN with) (NP (DT a) (ADJP (RB stubbornly) (JJ strong)) (NN dollar))))))) (, ,) (VP (MD would) (VP (VB exacerbate) (NP (NN trade) (NNS problems)))) (: --)) (CC but) (S (NP (DT the) (NN dollar)) (VP (VBD weakened) (NP (NNP Friday)) (SBAR (IN as) (S (NP (NNS stocks)) (VP (VBD plummeted)))))) (. .))
+(S (PP (IN In) (NP (DT any) (NN event))) (, ,) (NP (NP (NNP Mr.) (NNP Englund)) (CC and) (NP (JJ many) (NNS others))) (VP (VBP say) (SBAR (IN that) (S (NP (NP (DT the) (JJ easy) (NNS gains)) (PP (IN in) (S (VP (VBG narrowing) (NP (DT the) (NN trade) (NN gap)))))) (VP (VBP have) (ADVP (RB already)) (VP (VBN been) (VP (VBN made))))))) (. .))
+(S (`` ``) (S (NP (NN Trade)) (VP (VBZ is) (ADVP (RB definitely)) (VP (VBG going) (S (VP (TO to) (VP (VB be) (ADJP (RBR more) (RB politically) (JJ sensitive)) (PP (IN over) (NP (DT the) (JJ next) (QP (CD six) (CC or) (CD seven)) (NNS months))) (SBAR (IN as) (S (NP (NN improvement)) (VP (VBZ begins) (S (VP (TO to) (VP (VB slow))))))))))))) (, ,) ('' '') (NP (PRP he)) (VP (VBD said)) (. .))
+(S (S (NP (NNS Exports)) (VP (VBP are) (VP (VBN thought) (S (VP (TO to) (VP (VB have) (VP (VBN risen) (ADVP (ADVP (RB strongly) (PP (IN in) (NP (NNP August)))) (, ,) (CC but) (ADVP (ADVP (RB probably)) (RB not) (RB enough) (S (VP (TO to) (VP (VB offset) (NP (NP (DT the) (NN jump)) (PP (IN in) (NP (NNS imports)))))))))))))))) (, ,) (NP (NNS economists)) (VP (VBD said)) (. .))
+(S (NP (NP (NNS Views)) (PP (IN on) (NP (VBG manufacturing) (NN strength)))) (VP (VBP are) (ADJP (VBN split) (PP (IN between) (NP (NP (NP (NNS economists)) (SBAR (WHNP (WP who)) (S (VP (VBP read) (NP (NP (NP (NNP September) (POS 's)) (JJ low) (NN level)) (PP (IN of) (NP (NN factory) (NN job) (NN growth)))) (PP (IN as) (NP (NP (DT a) (NN sign)) (PP (IN of) (NP (DT a) (NN slowdown))))))))) (CC and) (NP (NP (DT those)) (SBAR (WHNP (WP who)) (S (VP (VBP use) (NP (DT the) (ADJP (RB somewhat) (JJR more) (VBG comforting)) (JJ total) (NN employment) (NNS figures)) (PP (IN in) (NP (PRP$ their) (NNS calculations))))))))))) (. .))
+(S (S (NP (NP (DT The) (JJ wide) (NN range)) (PP (IN of) (NP (NP (NNS estimates)) (PP (IN for) (NP (DT the) (JJ industrial) (NN output) (NN number)))))) (VP (VBZ underscores) (NP (DT the) (NNS differences)))) (: :) (S (NP (DT The) (NNS forecasts)) (VP (VBD run) (PP (IN from) (NP (NP (DT a) (NN drop)) (PP (IN of) (NP (CD 0.5) (NN %))))) (PP (TO to) (NP (NP (DT an) (NN increase)) (PP (IN of) (NP (CD 0.4) (NN %))))) (, ,) (PP (VBG according) (PP (TO to) (NP (NNP MMS)))))) (. .))
+(S (NP (NP (DT A) (NN rebound)) (PP (IN in) (NP (NN energy) (NNS prices))) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD helped) (VP (VB push) (PRT (RP up)) (NP (DT the) (NN producer) (NN price) (NN index)))))) (, ,)) (VP (VBZ is) (VP (VBN expected) (S (VP (TO to) (VP (VB do) (NP (DT the) (JJ same)) (PP (IN in) (NP (DT the) (NN consumer) (NN price) (NN report)))))))) (. .))
+(S (NP (DT The) (NN consensus) (NN view)) (VP (VBZ expects) (NP (NP (DT a) (ADJP (CD 0.4) (NN %)) (NN increase)) (PP (IN in) (NP (DT the) (NNP September) (NNP CPI)))) (PP (IN after) (NP (NP (DT a) (JJ flat) (NN reading)) (PP (IN in) (NP (NNP August)))))) (. .))
+(S (NP (NP (NNP Robert) (NNP H.) (NNP Chandross)) (, ,) (NP (NP (DT an) (NN economist)) (PP (IN for) (NP (NP (NP (NNP Lloyd) (POS 's)) (NNP Bank)) (PP (IN in) (NP (NNP New) (NNP York)))))) (, ,)) (VP (VBZ is) (PP (IN among) (NP (NP (DT those)) (VP (VBG expecting) (NP (NP (DT a) (ADJP (RBR more) (JJ moderate)) (NN gain)) (PP (IN in) (NP (DT the) (NNP CPI))) (PP (IN than) (PP (IN in) (NP (NP (NNS prices)) (PP (IN at) (NP (DT the) (NN producer) (NN level))))))))))) (. .))
+(S (`` ``) (S (S (NP (NN Auto) (NNS prices)) (VP (VBD had) (NP (DT a) (JJ big) (NN effect)) (PP (IN in) (NP (DT the) (NNP PPI))))) (, ,) (CC and) (S (PP (IN at) (NP (DT the) (NNP CPI) (NN level))) (NP (PRP they)) (VP (MD wo) (RB n't)))) (, ,) ('' '') (NP (PRP he)) (VP (VBD said)) (. .))
+(SINV (S (S (NP (NN Food) (NNS prices)) (VP (VBP are) (VP (VBN expected) (S (VP (TO to) (VP (VB be) (ADJP (JJ unchanged)))))))) (, ,) (CC but) (S (NP (NN energy) (NNS costs)) (VP (VBD jumped) (NP (NP (RB as) (RB much) (IN as) (CD 4)) (NN %))))) (, ,) (VP (VBD said)) (NP (NP (NNP Gary) (NNP Ciminero)) (, ,) (NP (NP (NN economist)) (PP (IN at) (NP (NNP Fleet\/Norstar) (NNP Financial) (NNP Group))))) (. .))
+(S (NP (PRP He)) (ADVP (RB also)) (VP (VBZ says) (SBAR (S (NP (PRP he)) (VP (VBZ thinks) (SBAR (S (NP (`` ``) (NP (NN core) (NN inflation)) (, ,) ('' '') (SBAR (WHNP (WDT which)) (S (VP (VBZ excludes) (NP (DT the) (JJ volatile) (NN food) (CC and) (NN energy) (NNS prices))))) (, ,)) (VP (VBD was) (ADJP (JJ strong)) (NP (JJ last) (NN month))))))))) (. .))
diff --git a/src/architectures.py b/src/architectures.py
new file mode 100644
index 0000000..c549db6
--- /dev/null
+++ b/src/architectures.py
@@ -0,0 +1,783 @@
+from utils import *
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import transition as transition_system
+from sklearn.metrics import accuracy_score, f1_score
+import experiment
+
+
+class IncrementalProbe(nn.Module):
+    """Common base for the incremental-parsing probes defined in this module.
+
+    Reads the shared settings out of ``args``, instantiates the transition
+    oracle named there (if any), and owns the learnable root vector together
+    with the helpers that prepend a root position to embeddings and labels.
+    """
+
+    def __init__(self, args):
+        """Populate the attributes shared by every probe.
+
+        Args:
+            args: mapping of probe settings. ``args["oracle_params"]`` must
+                provide ``name`` and, when ``name`` is truthy, ``mappings_file``.
+                The other keys read here are optional and default to ``None``;
+                note that ``pretrained_model`` is then used as a key into
+                ``MODEL_DATA``.
+        """
+        super().__init__()
+        self.args = args
+
+        oracle_params = args["oracle_params"]
+        if oracle_params["name"]:
+            oracle_cls = getattr(transition_system, oracle_params["name"])
+            self.oracle = oracle_cls(oracle_params["mappings_file"])
+        else:
+            self.oracle = None
+
+        self.add_root = args.get("add_root")
+        self.embeddings_dropout_rate = args.get("embeddings_dropout_rate")
+        self.layer_dropout_rate = args.get("layer_dropout_rate")
+        self.checkpoint_path = args.get("checkpoint_path")
+        self.num_layers = args.get("num_layers")
+        self.layer = args.get("layer")
+        self.pretrained_model = args.get("pretrained_model")
+
+        # Without an oracle there is no action vocabulary to measure; leave the
+        # size unset instead of failing on ``None.a2i``.
+        self.vocab_size = len(self.oracle.a2i) if self.oracle is not None else None
+
+        self.probe_rank = self.model_dim = MODEL_DATA[self.pretrained_model][
+            "feature_count"
+        ]
+
+        self.root = nn.Parameter(data=torch.zeros(self.model_dim))
+        self.nll = nn.NLLLoss(reduction="none")
+        # The helpers below read ``self.device``; give it a value up front so
+        # they also work before a subclass refreshes it inside a batch step.
+        self.device = self.root.device
+
+    def add_root_distance_labels(self, batch):
+        """Return ``batch["gold_distances"]`` extended with a root row and column.
+
+        The result has one extra leading row and column. The original matrix
+        occupies ``[:, 1:, 1:]``; row 0 and column 0 hold the root-extended
+        depths produced by ``add_root_depth_labels``.
+        """
+        depths_w_root = self.add_root_depth_labels(batch)
+        gold_distances = batch["gold_distances"].to(self.device)
+        n_batch, n_rows, n_cols = gold_distances.shape[:3]
+        distances_w_root = torch.zeros(
+            n_batch, n_rows + 1, n_cols + 1, device=self.device
+        )
+        distances_w_root[:, 1:, 1:] += gold_distances
+        distances_w_root[:, 0, :] += depths_w_root
+        distances_w_root[:, :, 0] += depths_w_root
+        return distances_w_root
+
+    def add_root_depth_labels(self, batch):
+        """Return ``batch["gold_depths"]`` shifted by one with a root slot in front.
+
+        Every depth is increased by 1 and a zero is prepended for the root
+        position. Entries that were ``-1`` in the input stay ``-1``.
+        """
+        # Clone first: the shift below is done in place and must not leak into
+        # the caller's batch.
+        gold_depths = batch["gold_depths"].clone().to(self.device)
+        gold_depths += 1
+        # A former -1 is now 0; restore it so it is not mistaken for a depth.
+        gold_depths[gold_depths == 0] = -1
+
+        depths_w_root = torch.zeros(
+            gold_depths.shape[0], gold_depths.shape[1] + 1, device=self.device
+        )
+        depths_w_root[:, 1:] += gold_depths
+        return depths_w_root
+
+    def add_root_model_embeddings(self, batch):
+        """Return the embeddings with the learnable root vector prepended.
+
+        Takes index 0 along dimension 1 of ``batch["padded_embeddings"]``,
+        drops the first position along dimension 2, places ``self.root`` at
+        position 0 and re-adds a singleton dimension 1.
+        """
+        model_embeddings = batch["padded_embeddings"][:, 0, 1:, :].to(self.device)
+        n_batch, n_pos, n_feat = model_embeddings.shape
+        embeddings_w_root = torch.zeros(
+            n_batch, n_pos + 1, n_feat, device=self.device
+        )
+        embeddings_w_root[:, 1:, :] = model_embeddings
+        # ``+=`` (not a plain assignment) keeps the root parameter in the graph
+        # while broadcasting it over the batch.
+        embeddings_w_root[:, 0, :] += self.root
+        return embeddings_w_root.unsqueeze(1)
+
+
+class AttentionLayer(nn.Module):
+    """Dot-product attention between projected ``x`` (queries) and ``y`` (keys).
+
+    The values are the unprojected ``y`` vectors themselves.
+    """
+
+    def __init__(self, y_dim=512, x_dim=512):
+        """Create the bias-free key (``y_dim -> x_dim``) and query projections."""
+        super().__init__()
+        self.key = nn.Linear(y_dim, x_dim, bias=False)
+        self.query = nn.Linear(x_dim, x_dim, bias=False)
+        self.device = None
+
+    def forward(self, x, y, masks=None, output_attentions=False):
+        """Attend from ``x`` over ``y``.
+
+        Args:
+            x: query inputs; the result keeps only index 0 of their
+                second-to-last dimension.
+            y: key/value inputs.
+            masks: optional mask, unsqueezed at dimension 2 and broadcast
+                against the scores; positions where it is zero receive a score
+                of ``-1e10`` before the softmax. ``None`` disables masking.
+            output_attentions: accepted for interface compatibility; unused.
+
+        Returns:
+            ``(context, weights)``, both taken at index 0 of dimension 2.
+        """
+        self.device = next(self.parameters()).device
+        scores = torch.matmul(self.query(x), self.key(y).transpose(-1, -2))
+
+        if masks is not None:
+            # Build the fill value directly on the scores' device instead of
+            # allocating on CPU and copying on every call.
+            blocked = torch.tensor(-1e10, device=scores.device)
+            scores = torch.where(masks.unsqueeze(2).bool(), scores, blocked)
+
+        weights = F.softmax(scores, dim=-1)
+        return torch.matmul(weights, y)[:, :, 0, :], weights[:, :, 0, :]
+
+
+class AttentiveProbe(IncrementalProbe):
+    def __init__(self, args):
+        """Build the action embedding, recurrent encoder, attention and decoder.
+
+        Args:
+            args: probe settings. On top of what ``IncrementalProbe`` reads,
+                this requires ``reverse``, ``continuous``, ``rnn_type`` (the
+                name of a ``torch.nn`` recurrent class), ``num_layers``,
+                ``emb_size`` and ``state_size``.
+        """
+        # The base constructor must run exactly once: running it a second time
+        # re-creates the oracle and the root parameter for no benefit. It also
+        # sets ``self.vocab_size``.
+        super().__init__(args)
+        self.reverse = args["reverse"]
+        self.continuous = args["continuous"]
+        self.rnn_type = args["rnn_type"]
+        self.num_layers = args["num_layers"]
+        self.emb_size = args["emb_size"]
+        self.state_size = args["state_size"]
+
+        self.embeddings_dropout = nn.Dropout(self.embeddings_dropout_rate)
+        self.layer_dropout = nn.Dropout(self.layer_dropout_rate)
+        self.encoder = nn.Embedding(self.vocab_size, self.emb_size)
+
+        self.rnn = getattr(nn, self.rnn_type)(
+            self.emb_size, self.state_size, self.num_layers, dropout=0, batch_first=True
+        )
+
+        # Keep the nested Sequential layout (hidden block, then output layer)
+        # so parameter names in existing checkpoints still match.
+        hidden_block = nn.Sequential(
+            nn.Linear(self.state_size + self.model_dim, self.state_size), nn.ReLU()
+        )
+        output_layer = nn.Linear(self.state_size, len(self.oracle.actions_list()))
+        self.decoder = nn.Sequential(hidden_block, output_layer)
+
+        initrange = 0.1
+        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
+
+        self.attn = AttentionLayer(y_dim=self.model_dim, x_dim=self.state_size)
+
+    def forward(self, batch):
+        """Score the next action at every step of the action history.
+
+        Args:
+            batch: mapping providing ``padded_embeddings``, ``action_ids`` and
+                ``continuous_action_masks``.
+
+        Returns:
+            ``(log_probs, hidden, attentions)``: log-probabilities reshaped to
+            ``(-1, number of oracle actions)``, the final RNN state, and the
+            attention weights.
+        """
+        # Refresh the cached device here as well, so calling the module
+        # directly does not depend on a batch_step_* call having run first.
+        self.device = next(self.parameters()).device
+        model_embeddings = self.add_root_model_embeddings(batch)[:, 0, :, :].to(
+            self.device
+        )
+
+        inpt = batch["action_ids"].to(self.device)
+
+        # One copy of the sentence embeddings per action step.
+        models = model_embeddings.unsqueeze(1).repeat(1, inpt.shape[1], 1, 1)
+        models = self.embeddings_dropout(models)
+
+        masks = batch["continuous_action_masks"].to(self.device)
+
+        hidden = self.repackage_hidden(self.init_hidden(inpt.shape[0]))
+        emb = self.encoder(inpt)
+        output, hidden = self.rnn(emb, hidden)
+        context, attentions = self.attn(output.unsqueeze(2), models, masks)
+        context = self.layer_dropout(context)
+
+        output = torch.cat((output, context), dim=-1)
+        decoded = self.decoder(output)
+        decoded = decoded.view(-1, len(self.oracle.actions_list()))
+        return F.log_softmax(decoded, dim=-1), hidden, attentions
+
+    def batch_step_eval(self, batch):
+        """Evaluate next-action prediction on one batch.
+
+        The target at each step is the following entry of
+        ``batch["action_ids"]``; the last step and every ``PAD`` target are
+        excluded. Returns a dict with the mean NLL (``loss``), macro ``f1``,
+        ``accuracy`` and ``perplexity`` (``exp`` of the mean NLL).
+        """
+        self.device = next(self.parameters()).device
+        log_probs, _hidden, _attentions = self(batch)
+
+        pad_id = self.oracle.a2i["PAD"]
+        # ``roll`` returns a new tensor, so the in-place write below does not
+        # touch the batch.
+        shifted = batch["action_ids"].roll(-1, dims=-1).to(self.device)
+        shifted[:, -1] = pad_id
+        flat_targets = shifted.flatten()
+
+        # Indices of the steps whose target is not PAD.
+        keep = (flat_targets - pad_id).nonzero(as_tuple=True)
+        kept_log_probs = log_probs[keep]
+        kept_targets = flat_targets[keep]
+
+        mean_nll = self.nll(kept_log_probs, kept_targets).mean()
+
+        predicted_actions = kept_log_probs.argmax(dim=-1).detach().cpu().numpy()
+        gold_actions = kept_targets.detach().cpu().numpy()
+
+        losses = {
+            "loss": mean_nll,
+            "f1": torch.tensor(
+                f1_score(predicted_actions, gold_actions, average="macro")
+            ),
+            "accuracy": torch.tensor(accuracy_score(predicted_actions, gold_actions)),
+            "perplexity": torch.exp(mean_nll),
+        }
+        return losses
+
+    def action_dists(self, batch):
+        """Return log-probabilities over actions for the last step of each history.
+
+        Args:
+            batch: mapping providing ``padded_embeddings``, ``action_ids`` and
+                ``continuous_action_masks``.
+        """
+        self.device = next(self.parameters()).device
+        # Move the inputs to the model's device, as ``forward`` does; without
+        # this a CPU batch fails against a model that lives on another device.
+        masks = batch["continuous_action_masks"].to(self.device)
+        model_embeddings = self.add_root_model_embeddings(batch)[:, 0, :, :]
+
+        inpt = batch["action_ids"].to(self.device)
+
+        # One copy of the sentence embeddings per action step.
+        models = model_embeddings.unsqueeze(1).repeat(1, inpt.shape[1], 1, 1)
+        models = self.embeddings_dropout(models)
+
+        hidden = self.repackage_hidden(self.init_hidden(inpt.shape[0]))
+        emb = self.encoder(inpt)
+        output, hidden = self.rnn(emb, hidden)
+        context, attentions = self.attn(output.unsqueeze(2), models, masks)
+        output = torch.cat((output, context), dim=-1)
+        decoded = self.decoder(output)
+
+        # Only the distribution after the most recent action is returned.
+        return F.log_softmax(decoded[:, -1], dim=-1)
+
+    def batch_step_train(self, batch, deterministic_action_loss=False):
+        """Return ``{"loss": mean NLL}`` of next-action prediction for one batch.
+
+        The target at each step is the following entry of
+        ``batch["action_ids"]``; the last step and every ``PAD`` target are
+        excluded from the loss.
+
+        Args:
+            batch: mapping consumed by ``forward`` (must include ``action_ids``).
+            deterministic_action_loss: accepted for interface compatibility;
+                currently unused.
+        """
+        self.device = next(self.parameters()).device
+        dists, hidden, attentions = self(batch)
+
+        pad_id = self.oracle.a2i["PAD"]
+        # Targets must live on the same device as the predictions (the eval
+        # step already does this); ``roll`` returns a fresh tensor, so the
+        # in-place write does not alter the batch.
+        targets = batch["action_ids"].roll(-1, dims=-1).to(self.device)
+        targets[:, -1] = pad_id
+
+        flat_targets = targets.flatten()
+        keep = flat_targets != pad_id
+        loss = self.nll(dists[keep], flat_targets[keep])
+        return {"loss": loss.mean()}
+
+    def repackage_hidden(self, h):
+        """Detach a hidden state from its autograd history.
+
+        ``h`` is either a tensor or an iterable of (possibly nested) hidden
+        states; iterables come back as tuples.
+        """
+        if not isinstance(h, torch.Tensor):
+            return tuple(map(self.repackage_hidden, h))
+        return h.detach()
+
+    def init_hidden(self, bsz):
+        """Return an all-zero initial RNN state for a batch of size ``bsz``.
+
+        The zeros take dtype and device from the module's first parameter. For
+        ``rnn_type == "LSTM"`` a pair of state tensors is returned, otherwise
+        a single tensor.
+        """
+        reference = next(self.parameters())
+        state_shape = (self.num_layers, bsz, self.state_size)
+        if self.rnn_type != "LSTM":
+            return reference.new_zeros(state_shape)
+        return reference.new_zeros(state_shape), reference.new_zeros(state_shape)
+
+
+class StackActionProbe(IncrementalProbe):
+    def __init__(self, args):
+        """Build the feed-forward classifier over concatenated embedding pairs.
+
+        Args:
+            args: probe settings; on top of what ``IncrementalProbe`` reads,
+                ``num_layers`` is required.
+        """
+        # Initialise through the direct base class once, instead of skipping
+        # it with ``super(IncrementalProbe, self)`` and then calling it by hand.
+        super().__init__(args)
+        # Two embeddings are concatenated per example, hence twice the width.
+        self.input_size = MODEL_DATA[self.pretrained_model]["feature_count"] * 2
+        self.num_layers = args["num_layers"]
+
+        hidden_blocks = [
+            nn.Sequential(
+                nn.Linear(self.input_size, self.input_size),
+                nn.ReLU(),
+                nn.Dropout(self.layer_dropout_rate),
+            )
+            for _ in range(self.num_layers - 1)
+        ]
+        self.transform = nn.Sequential(
+            nn.Dropout(self.embeddings_dropout_rate),
+            *hidden_blocks,
+            nn.Linear(self.input_size, len(self.oracle.actions_list())),
+        )
+        self.device = next(self.parameters()).device
+
+    def forward(self, embeddings):
+        """Return log-probabilities over actions for each row of ``embeddings``."""
+        logits = self.transform(embeddings)
+        return logits.log_softmax(-1)
+
+    def batch_step_eval(self, batch):
+        """Evaluate action classification on the oracle's embedding pairs.
+
+        Returns a dict with the mean NLL (``loss``), ``accuracy``, macro ``f1``
+        and ``perplexity`` (``exp`` of the mean NLL).
+        """
+        self.device = next(self.parameters()).device
+
+        if self.add_root:
+            model_embeddings = self.add_root_model_embeddings(batch)[:, 0, :, :].to(
+                self.device
+            )
+        else:
+            # Previously this path left ``model_embeddings`` undefined. Use the
+            # same root-less slice GeometricProbe takes when add_root is off.
+            # NOTE(review): confirm the oracle indices are meant to address
+            # root-less positions in this configuration.
+            model_embeddings = batch["padded_embeddings"][:, 0, 1:, :].to(self.device)
+
+        oracle_action_idxs, targets = self.oracle.targets_idxs(batch)
+
+        # Rows 0+1 and rows 0+2 of the index array each address one embedding
+        # per example (presumably batch index plus a node position; verify
+        # against the oracle).
+        first_emb_indx, second_emb_indx = (
+            oracle_action_idxs[[0, 1], :],
+            oracle_action_idxs[[0, 2], :],
+        )
+
+        emb_pairs = torch.cat(
+            (model_embeddings[first_emb_indx], model_embeddings[second_emb_indx]), dim=1
+        )
+
+        output_distributions = self.forward(emb_pairs)
+        predicted_actions = output_distributions.argmax(dim=-1).detach().cpu().numpy()
+
+        loss = self.nll(
+            output_distributions, torch.tensor(targets, device=self.device)
+        ).mean()
+        losses = {
+            "loss": loss,
+            "accuracy": torch.tensor(accuracy_score(predicted_actions, targets)),
+            "f1": torch.tensor(f1_score(predicted_actions, targets, average="macro")),
+            "perplexity": torch.exp(loss),
+        }
+        return losses
+
+    def action_dists(self, batch):
+        """Return action log-probabilities for the pairs ``node1s`` / ``node2s``.
+
+        For example ``i`` the embeddings at positions ``batch["node1s"][i]``
+        and ``batch["node2s"][i]`` are concatenated and classified.
+        """
+        self.device = next(self.parameters()).device
+        if self.add_root:
+            model_embeddings = self.add_root_model_embeddings(batch)[:, 0, :, :].to(
+                self.device
+            )
+        else:
+            # Previously this path left ``model_embeddings`` undefined. Use the
+            # same root-less slice GeometricProbe takes when add_root is off.
+            # NOTE(review): confirm node indices are meant to address root-less
+            # positions in this configuration.
+            model_embeddings = batch["padded_embeddings"][:, 0, 1:, :].to(self.device)
+
+        # One row index per example; shared by both lookups.
+        example_ids = torch.arange(model_embeddings.shape[0]).cpu()
+
+        emb_pairs = torch.cat(
+            (
+                model_embeddings[np.array([example_ids, batch["node1s"].cpu()])],
+                model_embeddings[np.array([example_ids, batch["node2s"].cpu()])],
+            ),
+            dim=1,
+        )
+
+        return self.forward(emb_pairs)
+
+    def batch_step_train(self, batch):
+        """Return ``{"loss": mean NLL}`` of action classification for one batch."""
+        self.device = next(self.parameters()).device
+
+        if self.add_root:
+            model_embeddings = self.add_root_model_embeddings(batch)[:, 0, :, :].to(
+                self.device
+            )
+        else:
+            # Previously this path left ``model_embeddings`` undefined. Use the
+            # same root-less slice GeometricProbe takes when add_root is off.
+            # NOTE(review): confirm the oracle indices are meant to address
+            # root-less positions in this configuration.
+            model_embeddings = batch["padded_embeddings"][:, 0, 1:, :].to(self.device)
+
+        oracle_action_idxs, targets = self.oracle.targets_idxs(batch)
+
+        # Rows 0+1 and rows 0+2 of the index array each address one embedding
+        # per example (presumably batch index plus a node position; verify
+        # against the oracle).
+        first_emb_indx, second_emb_indx = (
+            oracle_action_idxs[[0, 1], :],
+            oracle_action_idxs[[0, 2], :],
+        )
+        emb_pairs = torch.cat(
+            (model_embeddings[first_emb_indx], model_embeddings[second_emb_indx]), dim=1
+        )
+
+        output_distributions = self.forward(emb_pairs)
+        return {
+            "loss": self.nll(
+                output_distributions, torch.tensor(targets, device=self.device)
+            ).mean()
+        }
+
+
+class GeometricProbe(IncrementalProbe):
+    def __init__(self, args):
+        """Build the bias-free transform, optionally seeded from a checkpoint.
+
+        Args:
+            args: probe settings; on top of what ``IncrementalProbe`` reads,
+                ``loss_types``, ``verbose``, ``threshold``, ``temp`` and
+                ``num_layers`` are required. When ``checkpoint_path`` is set,
+                the transform (and, with ``add_root``, the root vector) is
+                copied from the probe stored in that checkpoint.
+        """
+        # Initialise through the direct base class once, instead of skipping
+        # it with ``super(IncrementalProbe, self)`` and then calling it by hand.
+        super().__init__(args)
+
+        self.loss_types = args["loss_types"]
+        self.verbose = args["verbose"]
+        self.threshold = args["threshold"]
+        self.temp = args["temp"]
+        self.num_layers = args["num_layers"]
+
+        hidden_blocks = [
+            nn.Sequential(
+                nn.Linear(self.probe_rank, self.probe_rank, bias=False),
+                nn.ReLU(),
+                nn.Dropout(self.layer_dropout_rate),
+            )
+            for _ in range(self.num_layers - 1)
+        ]
+        self.transform = nn.Sequential(
+            nn.Dropout(self.embeddings_dropout_rate),
+            *hidden_blocks,
+            nn.Linear(self.probe_rank, self.probe_rank, bias=False),
+        )
+
+        if self.checkpoint_path:
+            print(f"Loading checkpoint from {self.checkpoint_path}")
+            check_probe = (
+                experiment.IncrementalParseProbeExperiment.load_from_checkpoint(
+                    self.checkpoint_path
+                ).probe
+            )
+            # Deep copies keep this probe independent of the loaded experiment.
+            self.transform = copy.deepcopy(check_probe.transform)
+            if self.add_root:
+                self.root = copy.deepcopy(check_probe.root)
+        self.device = next(self.parameters()).device
+
+    def t_sigmoid(self, x, threshold=1.5, temp=0.1):
+        """Sigmoid of ``(x - threshold) / temp``, clamped to ``[1e-7, 1 - 1e-7]``."""
+        eps = 1e-7
+        scaled = (x - threshold) / (temp)
+        return torch.sigmoid(scaled).clamp(min=eps, max=1 - eps)
+
+    def p_shift(self, model_embeddings, temp, threshold):
+        """Map pairwise squared distances to values in (0, 1) via ``t_sigmoid``.
+
+        Args:
+            model_embeddings: embeddings passed on to ``distance_matrix``.
+            temp: temperature of the sigmoid.
+            threshold: distance at which the sigmoid crosses 0.5. The argument
+                is honoured here; it used to be silently replaced by
+                ``self.threshold``.
+        """
+        return self.t_sigmoid(
+            self.distance_matrix(model_embeddings), threshold=threshold, temp=temp
+        )
+
+    def marginal_p_reduce(self, model_embeddings, temp):
+        """Map pairwise depth differences to (0, 1) via a zero-threshold ``t_sigmoid``."""
+        depth_differences = self.depth_matrix(model_embeddings)
+        return self.t_sigmoid(depth_differences, threshold=0, temp=temp)
+
+    def forward_distance(self, batch, add_root=True):
+        """Squared Euclidean distance between every pair of transformed positions.
+
+        ``batch`` is passed through ``self.transform`` and must come out
+        three-dimensional; entry ``[b, i, j]`` of the result is the squared
+        distance between positions ``i`` and ``j`` of example ``b``.
+        ``add_root`` is accepted but not used.
+        """
+        projected = self.transform(batch)
+        _n_batch, _n_pos, _rank = projected.size()
+        # Broadcasting a row view against a column view yields all pairwise
+        # differences t[b, i] - t[b, j].
+        pairwise_diffs = projected.unsqueeze(2) - projected.unsqueeze(1)
+        return torch.sum(pairwise_diffs.pow(2), -1)
+
+    def forward_depth(self, batch):
+        """Squared L2 norm of every transformed position.
+
+        ``batch`` is passed through ``self.transform`` and must come out
+        three-dimensional; the result has one value per (example, position).
+        """
+        projected = self.transform(batch)
+        n_batch, n_pos, rank = projected.size()
+        flat_count = n_batch * n_pos
+        # (1 x rank) @ (rank x 1) per position is the vector's dot product
+        # with itself.
+        as_rows = projected.view(flat_count, 1, rank)
+        as_cols = projected.view(flat_count, rank, 1)
+        return torch.bmm(as_rows, as_cols).view(n_batch, n_pos)
+
+    def distance_matrix(self, batch):
+        """Alias for ``forward_distance`` with its default arguments."""
+        squared_distances = self.forward_distance(batch)
+        return squared_distances
+
+    def depth_matrix(self, batch):
+        """Pairwise differences of predicted depths.
+
+        Entry ``[..., i, j]`` is ``depth[i] - depth[j]``, with depths taken
+        from ``forward_depth``.
+        """
+        depths = self.forward_depth(batch)
+        return depths.unsqueeze(-1) - depths.unsqueeze(-2)
+
+    def L1DistanceLoss(self, predictions, label_batch, length_batch):
+        """Computes L1 loss on distance matrices.
+
+        Ignores all entries where label_batch=-1.
+        Normalizes first within sentences (by dividing by the square of the
+        sentence length) and then across the batch. Sentences of length 0 are
+        left out of both the sum and the sentence count.
+
+        Args:
+            predictions: A pytorch batch of predicted distances
+            label_batch: A pytorch batch of true distances
+            length_batch: A pytorch batch of sentence lengths
+        Returns:
+            batch_loss: scalar tensor with the average loss in the batch
+            (0.0 when every sentence has length 0).
+        """
+        labels_1s = (label_batch != -1).float()
+        predictions_masked = predictions * labels_1s
+        labels_masked = label_batch * labels_1s
+        nonempty = length_batch != 0
+        total_sents = torch.sum(nonempty).float()
+        if total_sents > 0:
+            loss_per_sent = torch.sum(
+                torch.abs(predictions_masked - labels_masked), dim=(1, 2)
+            )
+            squared_lengths = length_batch.pow(2).float()
+            # The clamp only prevents 0/0 = NaN for empty sentences (it cannot
+            # change a non-zero squared length); those rows are zeroed next so
+            # one empty sentence cannot poison the whole batch.
+            normalized_loss_per_sent = loss_per_sent / squared_lengths.clamp(min=1)
+            normalized_loss_per_sent = torch.where(
+                nonempty,
+                normalized_loss_per_sent,
+                torch.zeros_like(normalized_loss_per_sent),
+            )
+            batch_loss = torch.sum(normalized_loss_per_sent) / total_sents
+        else:
+            batch_loss = torch.tensor(0.0, device=self.device)
+        return batch_loss
+
+    def L1DepthLoss(self, predictions, label_batch, length_batch):
+        """Computes L1 loss on depth sequences.
+
+        Ignores all entries where label_batch=-1.
+        Normalizes first within sentences (by dividing by the sentence length)
+        and then across the batch. Sentences of length 0 are left out of both
+        the sum and the sentence count.
+
+        Args:
+            predictions: A pytorch batch of predicted depths
+            label_batch: A pytorch batch of true depths
+            length_batch: A pytorch batch of sentence lengths
+        Returns:
+            batch_loss: scalar tensor with the average loss in the batch
+            (0.0 when every sentence has length 0).
+        """
+        nonempty = length_batch != 0
+        total_sents = torch.sum(nonempty).float()
+        labels_1s = (label_batch != -1).float()
+        predictions_masked = predictions * labels_1s
+        labels_masked = label_batch * labels_1s
+        if total_sents > 0:
+            loss_per_sent = torch.sum(
+                torch.abs(predictions_masked - labels_masked), dim=1
+            )
+            # The clamp only prevents 0/0 = NaN for empty sentences (it cannot
+            # change a non-zero length); those rows are zeroed next so one
+            # empty sentence cannot poison the whole batch.
+            normalized_loss_per_sent = loss_per_sent / length_batch.float().clamp(
+                min=1
+            )
+            normalized_loss_per_sent = torch.where(
+                nonempty,
+                normalized_loss_per_sent,
+                torch.zeros_like(normalized_loss_per_sent),
+            )
+            batch_loss = torch.sum(normalized_loss_per_sent) / total_sents
+        else:
+            batch_loss = torch.tensor(0.0, device=self.device)
+        return batch_loss
+
+    def dist_spearmanr(self, predictions, label_batch, length_batch):
+        """Length-averaged Spearman correlation between predicted and gold distances.
+
+        For each sentence, every row of the predicted distance matrix is
+        correlated with the matching gold row (both cropped to the sentence
+        length). Correlations are averaged per sentence length, and the result
+        is the mean of those per-length averages over lengths 5 to 50.
+        """
+        correlations_by_length = defaultdict(list)
+        predictions_np = predictions.detach().cpu().numpy()
+        for pred_matrix, gold_matrix, n_tokens in zip(
+            predictions_np, label_batch, length_batch
+        ):
+            n_tokens = int(n_tokens)
+            pred_rows = pred_matrix[:n_tokens, :n_tokens]
+            gold_rows = gold_matrix[:n_tokens, :n_tokens].cpu()
+            row_results = [
+                spearmanr(pred_row, gold_row)
+                for pred_row, gold_row in zip(pred_rows, gold_rows)
+            ]
+            correlations_by_length[n_tokens].extend(
+                result.correlation for result in row_results
+            )
+
+        mean_by_length = {
+            n_tokens: np.mean(values)
+            for n_tokens, values in correlations_by_length.items()
+        }
+        in_range = [
+            mean_by_length[n_tokens]
+            for n_tokens in range(5, 51)
+            if n_tokens in mean_by_length
+        ]
+        return np.mean(in_range)
+
+    def dep_spearmanr(self, depth_predictions, depth_label_batch, depth_length_batch):
+        """Length-averaged Spearman correlation between predicted and gold depths.
+
+        One correlation is computed per sentence (cropped to its length).
+        Correlations are averaged per sentence length, and the result is the
+        mean of those per-length averages over lengths 5 to 50.
+        """
+        correlations_by_length = defaultdict(list)
+        predictions_np = depth_predictions.detach().cpu().numpy()
+        for pred_depths, gold_depths, n_tokens in zip(
+            predictions_np, depth_label_batch, depth_length_batch
+        ):
+            n_tokens = int(n_tokens)
+            sentence_result = spearmanr(
+                pred_depths[:n_tokens], gold_depths[:n_tokens].cpu()
+            )
+            correlations_by_length[n_tokens].append(sentence_result.correlation)
+
+        mean_by_length = {
+            n_tokens: np.mean(values)
+            for n_tokens, values in correlations_by_length.items()
+        }
+        in_range = [
+            mean_by_length[n_tokens]
+            for n_tokens in range(5, 51)
+            if n_tokens in mean_by_length
+        ]
+        return np.mean(in_range)
+
+    def root_accuracy_spanning_tree(
+        self, depth_predictions, depth_label_batch, depth_length_batch, tags
+    ):
+        """Computes the root prediction accuracy for a batch.
+
+        For each sentence, the gold root is the position whose depth label
+        equals 1 and the predicted root is the position returned by
+        ``get_nopunct_argmin`` over the predicted depths and the sentence's
+        tags. Position 0 is skipped in both predictions and labels.
+
+        Args:
+            depth_predictions: A pytorch batch of predicted depths
+            depth_label_batch: A pytorch batch of true depths
+            depth_length_batch: A pytorch batch of sentence lengths
+            tags: per-sentence tag sequences passed to ``get_nopunct_argmin``
+        Returns:
+            Fraction of sentences whose root was predicted correctly.
+        Raises:
+            ValueError: if a sentence has no depth label equal to 1.
+            ZeroDivisionError: if the batch contains no sentences.
+        """
+        correct_root_predictions = 0
+        total_sents = 0
+        for poses, prediction, label, length in zip(
+            tags,
+            depth_predictions.detach().cpu().numpy(),
+            depth_label_batch,
+            depth_length_batch,
+        ):
+            length = int(length)
+            prediction = prediction[1 : length + 1]
+            label = label[1 : length + 1].cpu().numpy().tolist()
+
+            correct_root_predictions += label.index(1) == get_nopunct_argmin(
+                prediction, poses
+            )
+            total_sents += 1
+        return correct_root_predictions / float(total_sents)
+
+    def uuas_spanning_tree(self, predictions, label_batch, length_batch, tags):
+        """Computes the UUAS score for a batch.
+
+        Edges are extracted from the gold and the predicted distance matrix of
+        every sentence with ``prims_matrix_to_edges`` (position 0 is skipped).
+        The score is the number of undirected edges found in both, divided by
+        the number of gold edges, accumulated over the whole batch.
+        """
+        shared_edge_count = 0
+        gold_edge_count = 0
+        predictions_np = predictions.detach().cpu().numpy()
+        for poses, pred_matrix, gold_matrix, n_tokens in zip(
+            tags, predictions_np, label_batch, length_batch
+        ):
+            n_tokens = int(n_tokens)
+            window = slice(1, n_tokens + 1)
+            pred_matrix = pred_matrix[window, window]
+            gold_matrix = gold_matrix[window, window].cpu()
+
+            gold_edges = prims_matrix_to_edges(gold_matrix, poses)
+            pred_edges = prims_matrix_to_edges(pred_matrix, poses)
+
+            # Sort each edge's endpoints so direction does not matter.
+            gold_undirected = {tuple(sorted(edge)) for edge in gold_edges}
+            pred_undirected = {tuple(sorted(edge)) for edge in pred_edges}
+
+            shared_edge_count += len(gold_undirected & pred_undirected)
+            gold_edge_count += len(gold_edges)
+        return shared_edge_count / float(gold_edge_count)
+
+    def batch_step_eval(self, batch):
+        """Compute every evaluation metric for one batch.
+
+        When ``batch`` has a ``lengths`` entry, its padded tensors are first
+        cropped (the dict entries are replaced) to the longest sentence plus
+        four positions. The returned dict holds detached tensors: the
+        regression losses (stored under the ``*_mse`` keys although they come
+        from the L1 losses), the oracle action NLL with f1 / accuracy /
+        perplexity, UUAS, root accuracy, both Spearman scores, and ``loss``,
+        the sum of the entries named in ``self.loss_types``.
+        """
+        if "lengths" in batch:
+            crop = batch["lengths"].max() + 4
+            batch["padded_embeddings"] = batch["padded_embeddings"][:, :, :crop, :]
+            batch["gold_distances"] = batch["gold_distances"][:, :crop, :crop]
+            batch["gold_depths"] = batch["gold_depths"][:, :crop]
+
+        self.device = next(self.parameters()).device
+        device = self.device
+
+        if not self.add_root:
+            model_embeddings = batch["padded_embeddings"][:, 0, 1:, :].to(device)
+            gold_distances = batch["gold_distances"][:, :-1, :-1].to(device)
+            gold_depths = batch["gold_depths"][:, :-1].to(device)
+        else:
+            model_embeddings = self.add_root_model_embeddings(batch)[:, 0, :, :].to(
+                device
+            )
+            gold_distances = self.add_root_distance_labels(batch)[:, :-1, :-1].to(
+                device
+            )
+            gold_depths = self.add_root_depth_labels(batch)[:, :-1].to(device)
+
+        lengths = batch["lengths"].to(device)
+
+        losses = {
+            "L2": torch.linalg.norm(self.transform(model_embeddings)),
+            "temperature": torch.tensor(self.temp, device=device),
+        }
+
+        # Geometric regression targets.
+        distance_predictions = self.forward_distance(model_embeddings)
+        depth_predictions = self.forward_depth(model_embeddings)
+        losses["distance_mse"] = self.L1DistanceLoss(
+            distance_predictions, gold_distances, lengths
+        )
+        losses["depth_mse"] = self.L1DepthLoss(depth_predictions, gold_depths, lengths)
+
+        # Action distributions derived from the same geometry.
+        shift_scores = self.p_shift(
+            model_embeddings, temp=self.temp, threshold=self.threshold
+        )
+        reduce_scores = self.marginal_p_reduce(model_embeddings, temp=self.temp)
+        action_dists = self.oracle.action_dists(shift_scores, reduce_scores)
+        oracle_action_idxs, targets = self.oracle.targets_idxs(batch)
+        oracle_dists = action_dists[oracle_action_idxs]
+
+        losses["oracle_action_nll"] = self.nll(
+            oracle_dists, torch.tensor(targets, device=device)
+        ).mean()
+
+        predicted_actions = oracle_dists.argmax(dim=-1).detach().cpu().numpy()
+
+        losses["f1"] = torch.tensor(
+            f1_score(predicted_actions, targets, average="macro")
+        )
+        losses["accuracy"] = torch.tensor(accuracy_score(predicted_actions, targets))
+        losses["perplexity"] = torch.exp(losses["oracle_action_nll"].detach())
+
+        xpos = batch["xpos"]
+        losses["uuas_spanning_tree"] = torch.tensor(
+            self.uuas_spanning_tree(distance_predictions, gold_distances, lengths, xpos)
+        )
+        losses["root_accuracy_spanning_tree"] = torch.tensor(
+            self.root_accuracy_spanning_tree(
+                depth_predictions, gold_depths, lengths, xpos
+            )
+        )
+        losses["dep_spearman"] = torch.tensor(
+            self.dep_spearmanr(depth_predictions, gold_depths, lengths)
+        )
+        losses["dist_spearman"] = torch.tensor(
+            self.dist_spearmanr(distance_predictions, gold_distances, lengths)
+        )
+
+        losses["loss"] = sum(losses[loss_type] for loss_type in self.loss_types)
+
+        # Evaluation never back-propagates: hand back detached tensors only.
+        for name in losses:
+            losses[name] = losses[name].detach()
+
+        return losses
+
+    def action_dists(self, batch):
+        """Return the oracle's action distributions at ``(i, node1s[i], node2s[i])``.
+
+        The distributions are computed by ``self.oracle.action_dists`` from
+        ``p_shift`` and ``marginal_p_reduce`` of the embeddings, then indexed
+        per example with the node pair given in the batch.
+        """
+        self.device = next(self.parameters()).device
+        if self.add_root:
+            model_embeddings = self.add_root_model_embeddings(batch)[:, 0, :, :]
+        else:
+            # Previously this path left ``model_embeddings`` undefined. Use the
+            # same root-less slice the batch steps of this class take.
+            # NOTE(review): confirm node indices are meant to address root-less
+            # positions in this configuration.
+            model_embeddings = batch["padded_embeddings"][:, 0, 1:, :].to(self.device)
+
+        action_dists = self.oracle.action_dists(
+            self.p_shift(model_embeddings, temp=self.temp, threshold=self.threshold),
+            self.marginal_p_reduce(model_embeddings, temp=self.temp),
+        )
+
+        return action_dists[
+            np.array(
+                [
+                    torch.arange(model_embeddings.shape[0]).cpu(),
+                    batch["node1s"].cpu(),
+                    batch["node2s"].cpu(),
+                ]
+            )
+        ]
+
+    def batch_step_train(self, batch):
+        """Compute the training losses for one batch.
+
+        When ``batch`` has a ``lengths`` entry, its padded tensors are first
+        cropped (the dict entries are replaced) to the longest sentence plus
+        four positions. With ``args["probe_name"] == "Geometric_Regression"``
+        the L1 distance and depth losses are computed (stored under the
+        ``*_mse`` keys); otherwise the oracle action NLL is computed. ``loss``
+        is the sum of the entries named in ``self.loss_types`` and is the only
+        entry returned with its graph attached.
+        """
+        if "lengths" in batch:
+            crop = batch["lengths"].max() + 4
+            batch["padded_embeddings"] = batch["padded_embeddings"][:, :, :crop, :]
+            batch["gold_distances"] = batch["gold_distances"][:, :crop, :crop]
+            batch["gold_depths"] = batch["gold_depths"][:, :crop]
+
+        self.device = next(self.parameters()).device
+        device = self.device
+
+        if self.add_root:
+            model_embeddings = self.add_root_model_embeddings(batch)[:, 0, :, :].to(
+                device
+            )
+        else:
+            model_embeddings = batch["padded_embeddings"][:, 0, 1:, :].to(device)
+
+        regression = self.args["probe_name"] == "Geometric_Regression"
+
+        # Gold geometry is only needed (and only read) for the regression probe.
+        if regression:
+            if self.add_root:
+                gold_distances = self.add_root_distance_labels(batch)[:, :-1, :-1].to(
+                    device
+                )
+                gold_depths = self.add_root_depth_labels(batch)[:, :-1].to(device)
+            else:
+                gold_distances = batch["gold_distances"][:, :-1, :-1].to(device)
+                gold_depths = batch["gold_depths"][:, :-1].to(device)
+            lengths = batch["lengths"].to(device)
+
+        losses = {
+            "L2": torch.linalg.norm(self.transform(model_embeddings)),
+            "temperature": torch.tensor(self.temp, device=device),
+        }
+
+        if regression:
+            distance_predictions = self.forward_distance(model_embeddings)
+            depth_predictions = self.forward_depth(model_embeddings)
+            losses["distance_mse"] = self.L1DistanceLoss(
+                distance_predictions, gold_distances, lengths
+            )
+            losses["depth_mse"] = self.L1DepthLoss(
+                depth_predictions, gold_depths, lengths
+            )
+        else:
+            shift_scores = self.p_shift(
+                model_embeddings, temp=self.temp, threshold=self.threshold
+            )
+            reduce_scores = self.marginal_p_reduce(model_embeddings, temp=self.temp)
+            action_dists = self.oracle.action_dists(shift_scores, reduce_scores)
+            oracle_action_idxs, targets = self.oracle.targets_idxs(batch)
+
+            losses["oracle_action_nll"] = self.nll(
+                action_dists[oracle_action_idxs],
+                torch.tensor(targets, device=device),
+            ).mean()
+
+        losses["loss"] = sum(losses[loss_type] for loss_type in self.loss_types)
+
+        # Everything except the optimisation target is reported only.
+        for name in losses:
+            if name != "loss":
+                losses[name] = losses[name].detach()
+
+        return losses
diff --git a/src/datasets.py b/src/datasets.py
new file mode 100644
index 0000000..16fbef7
--- /dev/null
+++ b/src/datasets.py
@@ -0,0 +1,500 @@
+import torch
+from pathlib import Path
+from typing import List, Optional
+import numpy as np
+from utils import *
+from gpt2 import GPT2_extended
+
+from tqdm import tqdm
+import json
+
+from torch.utils.data import DataLoader, Dataset
+from pytorch_lightning import LightningDataModule
+from transformers import AutoTokenizer, GPT2LMHeadModel
+import task
+from collections import namedtuple
+
+
+class PTB_Split(Dataset):
+    def __init__(self, split=None, probe=None, config=None, gpt=None):
+        """Load one data split and precompute padded embeddings and gold labels.
+
+        Reads ``{root_dir}/{split}.json`` (one JSON object per line) and
+        ``{root_dir}/{split}.conllx``. Only records whose ``key`` is
+        ``"sentence"`` and whose ``projective`` field is truthy are kept. For
+        each kept sentence the method
+
+        * tokenizes the space-joined ``orig_tokens`` (wrapped in the
+          tokenizer's BOS/EOS tokens) and queues it; queued sentences are run
+          through ``gpt.model`` in batches and the word-aligned hidden states
+          are appended to ``self.embs`` (moved to CPU);
+        * builds every label array named in
+          ``config["probe_params"]["data_sources"]`` and stores a
+          ``torch.tensor([-1])`` placeholder for each one that is not named.
+
+        Args:
+            split: split name; selects the ``.json`` / ``.conllx`` files and
+                the ``config["data_params"][split]`` section (only its
+                ``dry_run`` entry is read here).
+            probe: object exposing ``.oracle``; the oracle's ``name`` and
+                ``a2i`` are read while building action labels.
+            config: nested dict. Keys read here: ``data_params`` (``root_dir``,
+                ``token_pad``, ``action_pad``, ``action_ngram_pad``, per-split
+                ``dry_run``), ``probe_params`` (``layer``, ``data_sources``)
+                and ``pretrained_model`` (only when ``layer == "all"``).
+            gpt: wrapper exposing ``.tokenizer``, ``.model`` and
+                ``.match_tokenized_to_untokenized``. The reference is dropped
+                (``self.gpt = None``) once loading finishes.
+
+        NOTE(review):
+            * ``device`` is hard-coded to ``"cuda"``.
+            * ``end`` is computed but never used; only ``hidden_states[start]``
+              is read, even when ``layer == "all"``.
+            * The batch-flush test lives inside the ``projective`` branch, so
+              a pending batch is not flushed when the last line of the file is
+              not a projective sentence.
+            * ``self.embs`` grows once per flushed batch while the label lists
+              grow once per sentence, so ``len(self.embs)`` can be smaller
+              than the label lists (also after a ``dry_run`` break).
+            * ``self.observations`` is indexed with the JSON line index; this
+              presumably assumes one JSON line per conllx sentence, in the
+              same order -- confirm.
+        """
+        with torch.no_grad():
+            self.root_dir = config["data_params"]["root_dir"]
+            self.data_path = f"{self.root_dir}/{split}.json"
+            self.config = config
+            self.oracle = probe.oracle
+            self.probe = probe
+            # self.embs is re-initialised further down together with the other
+            # per-sentence lists; self.items is not used again in this method.
+            self.items, self.embs = [], []
+            self.gpt = gpt
+
+            # Layer selection: "all" spans every layer listed for the model in
+            # MODEL_DATA, otherwise the single configured layer index.
+            if config["probe_params"]["layer"] == "all":
+                start, end = 0, MODEL_DATA[config["pretrained_model"]]["layer_count"]
+            else:
+                start, end = (
+                    config["probe_params"]["layer"],
+                    config["probe_params"]["layer"] + 1,
+                )
+
+            # Parsed conllx sentences; used below for gold distances/depths.
+            self.observations = self.load_conll_dataset(
+                f"{self.root_dir}/{split}.conllx"
+            )
+
+            device = "cuda"
+
+            # One list per field; __getitem__ returns them in this same order.
+            (
+                self.token_ids,
+                self.stacks,
+                self.bufs,
+                self.action_ids,
+                self.padded_action_ngrams,
+                self.embs,
+                self.gold_distances,
+                self.gold_depths,
+                self.lengths,
+                self.gold_tuples,
+                self.cont_mask,
+                self.xpos,
+            ) = [[] for _ in range(12)]
+
+            # First pass only counts lines (progress-bar total and
+            # last-line detection for the final batch flush).
+            with open(self.data_path) as f:
+                num_lines = len(f.readlines())
+
+            with open(self.data_path) as f:
+                batch_embs, batch_maps, batch_toks, count = [], [], [], 0
+                for idx, line in tqdm(
+                    enumerate(f), desc=f"loading {split} data", total=num_lines
+                ):
+                    o = json.loads(line)
+                    if o["key"] == "sentence":
+                        if o["projective"]:
+                            sent = " ".join(o["orig_tokens"])
+                            line = sent.strip()  # Strip leading/trailing whitespace
+                            line = (
+                                self.gpt.tokenizer.bos_token
+                                + line
+                                + self.gpt.tokenizer.eos_token
+                            )
+                            tokenized_text = self.gpt.tokenizer.tokenize(line)
+                            # Mapping from each untokenized word to its subword
+                            # positions (helper defined on the gpt wrapper).
+                            untok_tok_mapping = self.gpt.match_tokenized_to_untokenized(
+                                tokenized_text, line
+                            )
+                            batch_maps.append(untok_tok_mapping)
+                            batch_toks.append(tokenized_text)
+                            count += 1
+
+                            # Flush once more than 100 sentences are queued, or
+                            # on the last line of the file.
+                            if count > 100 or idx == num_lines - 1:
+                                lens = [len(x) for x in batch_toks]
+                                max_len = max(lens)
+
+                                # Right-pad every token list with EOS tokens to
+                                # the longest sentence in the batch.
+                                for i, tok in enumerate(batch_toks):
+                                    if len(tok) < max_len:
+                                        batch_toks[i].extend(
+                                            [self.gpt.tokenizer.eos_token]
+                                            * (max_len - len(batch_toks[i]))
+                                        )
+                                # Despite the name, batch_embs holds token-id
+                                # tensors of shape (1, max_len), one per sentence.
+                                batch_embs = [
+                                    torch.tensor(
+                                        [
+                                            self.gpt.tokenizer.convert_tokens_to_ids(
+                                                tokenized_text
+                                            )
+                                        ]
+                                    ).to(device)
+                                    for tokenized_text in batch_toks
+                                ]
+                                # Hidden states of the selected layer for the
+                                # whole batch (no attention mask is passed).
+                                with torch.no_grad():
+                                    encoded_layers = self.gpt.model(
+                                        torch.cat(batch_embs, dim=0),
+                                        output_hidden_states=True,
+                                    )["hidden_states"][start]
+
+                                for ind2, untok_tok_mapping in enumerate(batch_maps):
+                                    model_embeddings = encoded_layers[ind2].unsqueeze(0)
+                                    # One vector per word: mean over the span
+                                    # from its first to its last mapped subword
+                                    # position (inclusive).
+                                    aligned_model_embeddings = torch.cat(
+                                        [
+                                            torch.mean(
+                                                model_embeddings[
+                                                    :,
+                                                    untok_tok_mapping[i][
+                                                        0
+                                                    ] : untok_tok_mapping[i][-1]
+                                                    + 1,
+                                                    :,
+                                                ],
+                                                dim=1,
+                                            )
+                                            for i, tok in enumerate(
+                                                untok_tok_mapping.keys()
+                                            )
+                                        ]
+                                    ).unsqueeze(0)
+                                    # Prepend the position-0 embedding and append
+                                    # copies of the last position's embedding so
+                                    # the sequence axis has length token_pad.
+                                    aligned_model_embeddings = torch.cat(
+                                        (
+                                            model_embeddings[:, 0:1, :],
+                                            aligned_model_embeddings,
+                                            model_embeddings[:, -1:, :].repeat(
+                                                1,
+                                                self.config["data_params"]["token_pad"]
+                                                - aligned_model_embeddings.shape[1]
+                                                - 1,
+                                                1,
+                                            ),
+                                        ),
+                                        dim=1,
+                                    ).unsqueeze(0)
+                                    assert (
+                                        aligned_model_embeddings.shape[2]
+                                        == self.config["data_params"]["token_pad"]
+                                    )  # model_embeddings.shape[1]#len(untok_tok_mapping.keys())+2
+
+                                    # model_embeddings = align(encoded_layers[ind2].unsqueeze(0), b)
+                                    # Drop the outermost axis again and keep the
+                                    # result on CPU.
+                                    self.embs.append(
+                                        aligned_model_embeddings[:, 0, :, :].to("cpu")
+                                    )
+
+                                # Reset the queue for the next batch.
+                                batch_embs, batch_maps, batch_toks, count = (
+                                    [],
+                                    [],
+                                    [],
+                                    0,
+                                )
+
+                            # Oracle action ids: first element of each action
+                            # entry, right-padded to action_pad with the PAD id.
+                            if self.oracle:
+                                action_ids = [
+                                    i[0] for i in o[self.oracle.name]["actions"]
+                                ]
+                                action_ids = np.pad(
+                                    action_ids,
+                                    (
+                                        0,
+                                        self.config["data_params"]["action_pad"]
+                                        - len(action_ids),
+                                    ),
+                                    "constant",
+                                    constant_values=self.probe.oracle.a2i["PAD"],
+                                )
+                            else:
+                                action_ids = torch.tensor([-1])
+
+                            # Each optional field below is built only when it is
+                            # listed in data_sources; otherwise a [-1]
+                            # placeholder tensor is stored.
+                            if (
+                                "padded_action_ngrams"
+                                in config["probe_params"]["data_sources"]
+                            ):
+                                padded_action_ngrams = conv_padded_ngrams(
+                                    self.probe.oracle.a2i,
+                                    action_ids,
+                                    action_ngram_pad=self.config["data_params"][
+                                        "action_ngram_pad"
+                                    ],
+                                    token_pad=self.config["data_params"]["token_pad"],
+                                )
+                            else:
+                                padded_action_ngrams = torch.tensor([-1])
+
+                            if (
+                                "continuous_action_masks"
+                                in config["probe_params"]["data_sources"]
+                            ):
+                                mask = generate_continuous_mask(
+                                    action_ids, self.config["data_params"]["token_pad"]
+                                )
+                                # Pad only the first axis up to action_pad, with -1.
+                                cont_mask = np.pad(
+                                    mask,
+                                    (
+                                        (
+                                            0,
+                                            self.config["data_params"]["action_pad"]
+                                            - len(mask),
+                                        ),
+                                        (0, 0),
+                                    ),
+                                    "constant",
+                                    constant_values=-1,
+                                )
+                            else:
+                                cont_mask = torch.tensor([-1])
+
+                            # Gold stacks: extend the list to action_pad rows
+                            # with [0], then right-pad every row to token_pad
+                            # with 0. Note: extends the parsed JSON list in place.
+                            if "gold_stacks" in config["probe_params"]["data_sources"]:
+                                stacks = o[self.oracle.name]["gold_stacks"]
+                                stacks.extend(
+                                    [[0]]
+                                    * (
+                                        self.config["data_params"]["action_pad"]
+                                        - len(stacks)
+                                    )
+                                )
+                                stacks = np.array(
+                                    [
+                                        i
+                                        + [0]
+                                        * (
+                                            self.config["data_params"]["token_pad"]
+                                            - len(i)
+                                        )
+                                        for i in stacks
+                                    ]
+                                )
+                            else:
+                                stacks = torch.tensor([-1])
+
+                            # Gold buffers: same padding scheme as gold stacks.
+                            if "gold_buffers" in config["probe_params"]["data_sources"]:
+                                bufs = o[self.oracle.name]["gold_buffers"]
+                                bufs.extend(
+                                    [[0]]
+                                    * (
+                                        self.config["data_params"]["action_pad"]
+                                        - len(bufs)
+                                    )
+                                )
+                                bufs = np.array(
+                                    [
+                                        i
+                                        + [0]
+                                        * (
+                                            self.config["data_params"]["token_pad"]
+                                            - len(i)
+                                        )
+                                        for i in bufs
+                                    ]
+                                )
+                            else:
+                                bufs = torch.tensor([-1])
+
+                            # Gold action tuples: same scheme, padded with -1.
+                            if "gold_tuples" in config["probe_params"]["data_sources"]:
+                                gold_tuples = o[self.oracle.name]["action_tuples"]
+                                gold_tuples.extend(
+                                    [[-1]]
+                                    * (
+                                        self.config["data_params"]["action_pad"]
+                                        - len(gold_tuples)
+                                    )
+                                )
+                                gold_tuples = np.array(
+                                    [
+                                        i
+                                        + [-1]
+                                        * (
+                                            self.config["data_params"]["token_pad"]
+                                            - len(i)
+                                        )
+                                        for i in gold_tuples
+                                    ]
+                                )
+                            else:
+                                gold_tuples = torch.tensor([-1])
+
+                            # Pairwise gold distances from the conllx
+                            # observation, padded on both axes to token_pad
+                            # with -1.
+                            if (
+                                "gold_distances"
+                                in config["probe_params"]["data_sources"]
+                            ):
+                                gold_distances = task.ParseDistanceTask.labels(
+                                    self.observations[idx]
+                                )
+                                gold_distances = np.pad(
+                                    gold_distances,
+                                    (
+                                        (
+                                            0,
+                                            config["data_params"]["token_pad"]
+                                            - len(gold_distances),
+                                        ),
+                                        (
+                                            0,
+                                            config["data_params"]["token_pad"]
+                                            - len(gold_distances),
+                                        ),
+                                    ),
+                                    "constant",
+                                    constant_values=-1,
+                                )
+                            else:
+                                gold_distances = torch.tensor([-1])
+
+                            # Gold depths from the conllx observation, padded to
+                            # token_pad with -1.
+                            if "gold_depths" in config["probe_params"]["data_sources"]:
+                                gold_depths = task.ParseDepthTask.labels(
+                                    self.observations[idx]
+                                )
+                                gold_depths = np.pad(
+                                    gold_depths,
+                                    (
+                                        0,
+                                        config["data_params"]["token_pad"]
+                                        - len(gold_depths),
+                                    ),
+                                    "constant",
+                                    constant_values=-1,
+                                )
+                            else:
+                                gold_depths = torch.tensor([-1])
+
+                            # Token ids from the JSON record, padded to
+                            # token_pad with 0.
+                            if "token_ids" in config["probe_params"]["data_sources"]:
+                                token_ids = np.pad(
+                                    o["token_ids"],
+                                    (
+                                        0,
+                                        self.config["data_params"]["token_pad"]
+                                        - len(o["token_ids"]),
+                                    ),
+                                    "constant",
+                                    constant_values=0,
+                                )
+                            else:
+                                token_ids = torch.tensor([-1])
+
+                            # POS-tag ids, padded to token_pad with the id of ".".
+                            if "xpos" in config["probe_params"]["data_sources"]:
+                                xpos = np.pad(
+                                    [XPOS2IDX[t] for t in o["tags"]],
+                                    (
+                                        0,
+                                        self.config["data_params"]["token_pad"]
+                                        - len(o["tags"]),
+                                    ),
+                                    "constant",
+                                    constant_values=XPOS2IDX["."],
+                                )
+                            else:
+                                xpos = torch.tensor([-1])
+
+                            # Per-sentence labels are stored immediately, while
+                            # self.embs is only extended when a batch is flushed.
+                            self.token_ids.append(token_ids)
+                            self.stacks.append(stacks)
+                            self.bufs.append(bufs)
+                            self.action_ids.append(action_ids)
+                            self.padded_action_ngrams.append(padded_action_ngrams)
+                            self.gold_distances.append(gold_distances)
+                            self.gold_depths.append(gold_depths)
+                            self.lengths.append(len(o["orig_tokens"]))
+                            self.gold_tuples.append(gold_tuples)
+                            self.cont_mask.append(cont_mask)
+                            self.xpos.append(xpos)
+
+                            # Optional early stop: a truthy dry_run value is the
+                            # number of stored embeddings at which loading stops.
+                            if config["data_params"][split]["dry_run"]:
+                                if (
+                                    len(self.embs)
+                                    >= config["data_params"][split]["dry_run"]
+                                ):
+                                    break
+            # Drop the reference to the language-model wrapper once loading is done.
+            self.gpt = None
+
+    def generate_lines_for_sent(self, lines):
+        """Group the lines of a conllx file into per-sentence chunks.
+
+        Lines starting with ``#`` are ignored. A whitespace-only line closes
+        the current sentence; consecutive blank lines do not produce empty
+        chunks. Every kept line is stored stripped of surrounding whitespace.
+
+        Args:
+            lines: iterable of raw text lines from a conllx file.
+        Yields:
+            A list with the stripped lines of one sentence.
+        """
+        pending = []
+        for raw in lines:
+            if raw.startswith("#"):
+                continue
+            stripped = raw.strip()
+            if stripped:
+                pending.append(stripped)
+            elif pending:
+                # Blank separator: hand out the finished sentence.
+                yield pending
+                pending = []
+        # The last sentence may not be followed by a blank line.
+        if pending:
+            yield pending
+
+    def load_conll_dataset(self, filepath):
+        """Read a conllx file and build one Observation per sentence.
+
+        Each sentence's tab-separated rows are transposed so that every
+        Observation field holds a tuple with one entry per token. The final
+        ``embeddings`` field is a list of ``None`` placeholders, one per
+        token.
+
+        Args:
+            filepath: filesystem path to the conllx file.
+
+        Returns:
+            A list of Observation namedtuples, in file order.
+
+        Raises:
+            TypeError: if a sentence's rows do not have exactly ten
+                tab-separated columns (raised by the namedtuple constructor).
+        """
+        observation_class = namedtuple(
+            "Observation",
+            [
+                "index",
+                "sentence",
+                "lemma_sentence",
+                "upos_sentence",
+                "xpos_sentence",
+                "morph",
+                "head_indices",
+                "governance_relations",
+                "secondary_relations",
+                "extra_info",
+                "embeddings",
+            ],
+        )
+
+        observations = []
+        # Use a context manager so the file handle is closed deterministically;
+        # the sentence generator is fully consumed inside the block.
+        with open(filepath) as conll_file:
+            for buf in self.generate_lines_for_sent(conll_file):
+                conllx_lines = [line.strip().split("\t") for line in buf]
+                embeddings = [None] * len(conllx_lines)
+                # zip(*rows) turns per-token rows into per-column tuples.
+                observation = observation_class(*zip(*conllx_lines), embeddings)
+                observations.append(observation)
+        return observations
+
+    def __len__(self):
+        """Return the number of examples, i.e. the number of stored embeddings."""
+        stored_embeddings = self.embs
+        return len(stored_embeddings)
+
+    def __getitem__(self, idx):
+        """Return example ``idx`` as a 12-element list.
+
+        The element order is fixed: token_ids, stacks, bufs, action_ids,
+        padded_action_ngrams, embs, gold_distances, gold_depths, lengths,
+        gold_tuples, cont_mask, xpos.
+        """
+        field_order = (
+            "token_ids",
+            "stacks",
+            "bufs",
+            "action_ids",
+            "padded_action_ngrams",
+            "embs",
+            "gold_distances",
+            "gold_depths",
+            "lengths",
+            "gold_tuples",
+            "cont_mask",
+            "xpos",
+        )
+        return [getattr(self, field)[idx] for field in field_order]
+
+
+class PTB_Dataset(LightningDataModule):
+    """Lightning data module serving the train/valid/test ``PTB_Split`` datasets.
+
+    A frozen GPT-2 LM-head model and its tokenizer are loaded from local
+    files at construction time, wrapped in ``GPT2_extended`` and handed to
+    each split so it can precompute embeddings.
+    """
+
+    def __init__(self, config=None, probe=None):
+        """Store config/probe and load the frozen language model on CUDA.
+
+        Args:
+            config: nested dict; ``config["pretrained_model"]`` names the
+                checkpoint, ``config["data_params"]`` is read by the loaders.
+            probe: probe object forwarded unchanged to every ``PTB_Split``.
+        """
+        super().__init__()
+        self.config = config
+        self.probe = probe
+
+        target_device = "cuda"
+        model_name = config["pretrained_model"]
+        tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
+        language_model = GPT2LMHeadModel.from_pretrained(
+            model_name, local_files_only=True
+        )
+        language_model = language_model.to(target_device).eval()
+
+        self.gpt = GPT2_extended(model=language_model, tokenizer=tokenizer, tail=None)
+        # The language model is only used as a feature extractor.
+        for weight in self.gpt.parameters():
+            weight.requires_grad = False
+
+    def setup(self, stage: Optional[str] = None) -> None:
+        """Build all three splits; ``stage`` is accepted but not consulted."""
+        built = []
+        for split_name in ("train", "valid", "test"):
+            built.append(
+                PTB_Split(
+                    split=split_name,
+                    probe=self.probe,
+                    config=self.config,
+                    gpt=self.gpt,
+                )
+            )
+        # Assign only after every split has been built successfully.
+        self.train_dataset, self.valid_dataset, self.test_dataset = built
+
+    def produce_dataloader(self, split):
+        """Create the ``DataLoader`` for ``split`` from its config section.
+
+        Args:
+            split: one of ``"train"``, ``"valid"`` or ``"test"``; ``setup``
+                must have been called so ``{split}_dataset`` exists.
+        """
+        dataset = vars(self)[f"{split}_dataset"]
+        data_cfg = self.config["data_params"]
+        split_cfg = data_cfg[split]
+        return DataLoader(
+            dataset,
+            batch_size=split_cfg["batch_size"],
+            num_workers=data_cfg["num_workers"],
+            shuffle=split_cfg["shuffle"],
+            pin_memory=data_cfg["pin_memory"],
+        )
+
+    def train_dataloader(self) -> DataLoader:
+        """Loader over the training split."""
+        return self.produce_dataloader("train")
+
+    def val_dataloader(self) -> DataLoader:
+        """Loader over the validation split."""
+        return self.produce_dataloader("valid")
+
+    def test_dataloader(self) -> DataLoader:
+        """Loader over the test split."""
+        return self.produce_dataloader("test")
diff --git a/src/experiment.py b/src/experiment.py
new file mode 100644
index 0000000..f151759
--- /dev/null
+++ b/src/experiment.py
@@ -0,0 +1,49 @@
+import torch.optim as optim
+import pytorch_lightning as pl
+
+class IncrementalParseProbeExperiment(pl.LightningModule):
+    """Lightning module that trains and validates a probe.
+
+    The probe supplies ``batch_step_train`` / ``batch_step_eval``, each
+    returning a dict of scalar loss tensors that includes the key ``'loss'``.
+    This class unpacks batches, logs every entry of that dict and builds the
+    optimizer and scheduler named in ``params``.
+    """
+
+    def __init__(self, probe=None, params: dict = None) -> None:
+        """Store the probe and experiment parameters.
+
+        Args:
+            probe: object exposing ``batch_step_train``, ``batch_step_eval``
+                and ``parameters()``.
+            params: dict with ``optimizer_type``, ``optimizer_params``,
+                ``scheduler_type``, ``scheduler_params`` and, optionally,
+                ``retain_first_backpass``.
+        """
+        super(IncrementalParseProbeExperiment, self).__init__()
+        self.save_hyperparameters()
+        self.probe = probe
+        self.params = params
+        self.curr_device = 'cuda'
+        self.hold_graph = False
+        # 'retain_first_backpass' is optional. Only a missing key or
+        # params=None falls back to False; a bare `except` would also hide
+        # unrelated errors and KeyboardInterrupt.
+        try:
+            self.hold_graph = self.params['retain_first_backpass']
+        except (KeyError, TypeError):
+            pass
+
+    def format_batch(self, batch):
+        """Unpack a 12-field batch into a dict of tensors on ``self.curr_device``.
+
+        The field order must match what the dataset's ``__getitem__`` returns.
+        """
+        (
+            token_ids,
+            gold_stacks,
+            gold_buffers,
+            action_ids,
+            padded_action_ngrams,
+            padded_embeddings,
+            gold_distances,
+            gold_depths,
+            lengths,
+            gold_tuples,
+            cont_masks,
+            xpos,
+        ) = batch
+        # Single source of truth for the target device (set in __init__).
+        device = self.curr_device
+        return {
+            'token_ids': token_ids.to(device),  # batch_size x token_pad
+            # presumably batch_size x action_pad x token_pad as padded by the dataset -- TODO confirm
+            'gold_stacks': gold_stacks.to(device),
+            # presumably batch_size x action_pad x token_pad as padded by the dataset -- TODO confirm
+            'gold_buffers': gold_buffers.to(device),
+            'action_ids': action_ids.to(device),  # batch_size x action_pad
+            'padded_action_ngrams': padded_action_ngrams.to(device),  # batch_size x token_pad x action_ngram_pad
+            'padded_embeddings': padded_embeddings.to(device),  # batch_size x model_layers x token_pad x feature_count
+            'gold_distances': gold_distances.to(device),  # matrix of distances (batch_size x token_pad x token_pad)
+            'gold_depths': gold_depths.to(device),  # vector of depths per sentence (batch_size x token_pad)
+            'lengths': lengths.to(device),
+            'gold_tuples': gold_tuples.to(device),
+            'continuous_action_masks': cont_masks.to(device),
+            'xpos': xpos.to(device),  # xpos for evaluation
+        }
+
+    def training_step(self, batch, batch_idx, optimizer_idx = 0):
+        """Run one training step, log every loss component, return the total loss."""
+        train_loss = self.probe.batch_step_train(self.format_batch(batch))
+        self.log_dict({key: val.item() for key, val in train_loss.items()}, sync_dist=True)
+        return train_loss['loss']
+
+    def validation_step(self, batch, batch_idx, optimizer_idx = 0):
+        """Run one validation step and log every loss component under ``val_<key>``."""
+        val_loss = self.probe.batch_step_eval(self.format_batch(batch))
+        val_loss['loss'] = val_loss['loss'].detach()
+        self.log_dict({f"val_{key}": val.item() for key, val in val_loss.items()}, sync_dist=True)
+
+    def on_validation_end(self) -> None:
+        """No-op hook."""
+        return None
+
+    def configure_optimizers(self):
+        """Build the optimizer and LR scheduler named in ``self.params``.
+
+        Returns:
+            A dict with ``'optimizer'`` and ``'lr_scheduler'``; for
+            ``ReduceLROnPlateau`` it also carries ``'monitor': 'val_loss'``,
+            the key logged by ``validation_step``.
+        """
+        optimizer_cls = getattr(optim, self.params['optimizer_type'])
+        # Only parameters that still require gradients are optimised.
+        trainable = filter(lambda p: p.requires_grad, self.probe.parameters())
+        optimizer = optimizer_cls(trainable, **self.params['optimizer_params'])
+
+        scheduler_cls = getattr(optim.lr_scheduler, self.params['scheduler_type'])
+        scheduler = scheduler_cls(optimizer, **self.params['scheduler_params'])
+
+        optim_config = {'optimizer': optimizer, 'lr_scheduler': scheduler}
+        if self.params['scheduler_type'] == 'ReduceLROnPlateau':
+            optim_config['monitor'] = 'val_loss'
+        return optim_config
+
+
diff --git a/src/gpt2.py b/src/gpt2.py
new file mode 100644
index 0000000..27d245e
--- /dev/null
+++ b/src/gpt2.py
@@ -0,0 +1,637 @@
+import torch.nn as nn
+import torch
+from collections import defaultdict
+from torch import optim
+from queue import PriorityQueue
+from utils import *
+from itertools import count
+
+# Global backend switches, applied as a side effect of importing this module.
+# cuDNN is disabled because, per the original note, torch won't backpropagate
+# through time in eval mode unless it is off. TF32 is also turned off for CUDA
+# matmuls and for cuDNN.
+torch.backends.cudnn.enabled = False
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False
+class ClozeTail_gpt2(nn.Module):
+    """Upper part of a GPT-2 LM head model: hidden states at ``layer_idx`` -> logits.
+
+    The previous ``forward`` read ``self.transformer``, which ``__init__`` never
+    assigned, so every call raised ``AttributeError``; ``layer_idx`` was ignored
+    and the LM head was never applied. This version runs the remaining
+    transformer blocks, the final layer norm and the LM head.
+
+    Args:
+        cloze_model: model exposing ``transformer.h`` (sliceable sequence of
+            blocks), ``transformer.ln_f`` and ``lm_head``.
+        layer_idx: index of the first block to apply. Assumes ``x`` is the
+            ``hidden_states[layer_idx]`` entry of the model output, i.e. the
+            input of block ``layer_idx`` - TODO confirm against the probe setup.
+    """
+
+    def __init__(self, cloze_model, layer_idx):
+        super(ClozeTail_gpt2, self).__init__()
+        # Blocks that still have to run on a hidden state taken at `layer_idx`.
+        self.blocks = cloze_model.transformer.h[layer_idx:]
+        self.final_norm = cloze_model.transformer.ln_f
+        self.last_layer = cloze_model.lm_head
+
+    def forward(self, x):
+        """Return the logits obtained by pushing ``x`` through the rest of the model."""
+        for block in self.blocks:
+            # Each block returns a tuple whose first item is the new hidden state.
+            x = block(x)[0]
+        return self.last_layer(self.final_norm(x))
+
+
+class GPT2_extended(nn.Module):
+    def __init__(self, model=None, tokenizer=None, tail=None):
+        """Wrap a language model and its tokenizer.
+
+        The model is switched to eval mode and every one of its parameters is
+        frozen. ``tokenizer`` and ``tail`` are stored as given.
+        """
+        super().__init__()
+
+        self.model = model
+        self.model.eval()
+        self.tokenizer = tokenizer
+        self.tail = tail
+
+        # Freeze the wrapped model.
+        for weight in self.model.parameters():
+            weight.requires_grad_(False)
+
+    def tail_by_layer(self, layer, x):
+        """Map ``x``, taken at ``layer``, through the remainder of the model.
+
+        When ``layer`` is below ``self.model.config.n_layer`` a new
+        ``ClozeTail_gpt2`` is built, put in eval mode and applied; otherwise
+        only ``self.model.lm_head`` is applied.
+        """
+        if layer >= self.model.config.n_layer:
+            return self.model.lm_head(x)
+        upper = ClozeTail_gpt2(self.model, layer)
+        upper.eval()
+        return upper(x)
+
+    def embeddings_w_map(self, sentence, layer):
+        """Return the ``layer`` hidden states of ``sentence`` and its word -> sub-word map.
+
+        The sentence is wrapped in the tokenizer's bos/eos tokens before it is
+        tokenized and encoded. The first return value is a detached copy of
+        ``hidden_states[layer]``; the second comes from
+        ``match_tokenized_to_untokenized``.
+        """
+        wrapped = self.tokenizer.bos_token + sentence + self.tokenizer.eos_token
+        words = sentence.split()
+        subwords = self.tokenizer.tokenize(wrapped)
+        input_ids = self.tokenizer.encode(wrapped, return_tensors="pt").to(
+            self.model.device
+        )
+
+        hidden_states = self.model(input_ids, output_hidden_states=True)["hidden_states"]
+        layer_states = hidden_states[layer].detach()
+        embeddings_copy = layer_states.detach().clone().to(self.model.device)
+
+        word_to_subwords = self.match_tokenized_to_untokenized(subwords, words)
+        return embeddings_copy, word_to_subwords
+
+    def align(self, model_embeddings, untok_tok_mapping):
+        """Collapse sub-word embeddings into one vector per word.
+
+        For every word index the positions from its first to its last mapped
+        sub-word are averaged along dim 1. The first and last positions of
+        ``model_embeddings`` are kept unchanged on either side, and a leading
+        axis is added to the result.
+        """
+        word_vectors = []
+        for word_idx in range(len(untok_tok_mapping.keys())):
+            first = untok_tok_mapping[word_idx][0]
+            last = untok_tok_mapping[word_idx][-1]
+            word_vectors.append(model_embeddings[:, first : last + 1, :].mean(dim=1))
+        per_word = torch.cat(word_vectors).unsqueeze(0)
+
+        first_position = model_embeddings[:, 0:1, :]
+        last_position = model_embeddings[:, -1:, :]
+        aligned_model_embeddings = torch.cat(
+            (first_position, per_word, last_position), dim=1
+        ).unsqueeze(0)
+
+        assert aligned_model_embeddings.shape[2] == len(untok_tok_mapping.keys()) + 2
+
+        return aligned_model_embeddings
+
+    def match_tokenized_to_untokenized(self, tokenized_sent, untokenized_sent):
+        """Map each word index to the indices of its sub-word tokens.
+
+        The first token is skipped and the last one is dropped (the original
+        comments call them bos and eos). A token starting with "Ġ" opens a new
+        word; any other token continues the current word.
+        Args:
+            tokenized_sent: list of sub-word token strings, first and last
+                tokens included
+            untokenized_sent: list of word strings
+        Returns:
+            A defaultdict of type {int: list(int)} mapping each word index to
+            the indices of its tokens in ``tokenized_sent``
+        """
+        # drop the trailing token
+        body = tokenized_sent[:-1]
+        n_tokens = len(body)
+        mapping = defaultdict(list)
+        # start after the leading token
+        cursor = 1
+        for word_idx in range(len(untokenized_sent)):
+            if cursor >= n_tokens:
+                break
+            # absorb tokens until the next one opens a new word
+            while cursor + 1 < n_tokens and not body[cursor + 1].startswith("Ġ"):
+                mapping[word_idx].append(cursor)
+                cursor += 1
+            mapping[word_idx].append(cursor)
+            cursor += 1
+        return mapping
+
+    def gen_counterfactuals(
+        self,
+        probe=None,
+        sent=None,
+        label_batch=None,
+        num_steps=500000,
+        patience=10000,
+        verbose=True,
+        loss_tolerance=0.05,
+        lr=0.0001,
+        print_every=5000,
+        prefix_freebits=1,
+        lastword_freebits=1,
+        kl_weight=1,
+        scheduler_patience=100,
+        compute_kl=True,
+        output_probs=False,
+    ):
+        """Gradient-search embeddings of ``sent`` that make ``probe`` fit ``label_batch``.
+
+        The hidden states of ``sent`` at ``probe.layer`` are copied once per row
+        of ``label_batch["gold_tuples"]`` and optimised with Adam until the probe
+        loss is no longer above ``loss_tolerance``, ``num_steps`` is reached, or
+        ``patience`` steps pass without an improvement larger than 0.001.
+
+        Args:
+            probe: object exposing ``layer``, ``eval()`` and
+                ``batch_step_train(batch)`` returning a dict with a ``"loss"``
+                tensor.
+            sent: whitespace-separated sentence.
+            label_batch: dict holding ``"gold_tuples"``, ``"action_ids"`` and
+                ``"continuous_action_masks"`` tensors.
+            num_steps: hard cap on optimisation steps.
+            patience: stop after this many steps without improvement.
+            verbose: print progress and the reason for stopping.
+            loss_tolerance: probe-loss value at which the search stops.
+            lr: Adam learning rate.
+            print_every: progress is printed every this many steps.
+            prefix_freebits: value subtracted from the mean KL over all
+                positions but the last two before taking the absolute value.
+            lastword_freebits: same, for the second-to-last position.
+            kl_weight: weight of the KL penalty; 0 disables it.
+            scheduler_patience: ``patience`` of the ``ReduceLROnPlateau``
+                scheduler.
+            compute_kl: enable the KL penalty (needs ``self.tail``).
+            output_probs: accepted so callers that pass it keep working; it is
+                not used, and both logit entries of the result are ``None``.
+
+        Returns:
+            dict with ``"padded_embeddings"`` (best embeddings found),
+            ``"original_embeddings"``, ``"output_logits"`` / ``"original_logits"``
+            (always ``None``), ``"cfx_loss"`` (probe loss of the last step) and
+            ``"initial_loss"`` (probe loss of the first step, ``None`` if no step
+            ran).
+
+        Raises:
+            ValueError: the KL penalty is requested but ``self.tail`` is ``None``.
+        """
+        probe.eval()
+        untokenized_sent = sent.split()
+        wrapped_sent = self.tokenizer.bos_token + sent + self.tokenizer.eos_token
+        tokenized_sent = self.tokenizer.tokenize(wrapped_sent)
+        tokens_tensor = self.tokenizer.encode(
+            wrapped_sent,
+            return_tensors="pt",
+        ).to(self.model.device)
+        model_embeddings = self.model(tokens_tensor, output_hidden_states=True)[
+            "hidden_states"
+        ][probe.layer].detach()
+        original_embeddings = (
+            model_embeddings.detach().clone().unsqueeze(0).to(self.model.device)
+        )
+        # One copy of the sentence embeddings per requested label row.
+        model_embeddings = model_embeddings.unsqueeze(0).repeat(
+            label_batch["gold_tuples"].shape[0], 1, 1, 1
+        )
+        untok_tok_mapping = self.match_tokenized_to_untokenized(
+            tokenized_sent, untokenized_sent
+        )
+
+        def align_words(embeddings):
+            # Average the sub-word positions of every word along dim 2 and keep
+            # the first and last positions as they are. The old shortcut branch
+            # compared shape[1] (always 1 after the unsqueeze above) and named
+            # an undefined variable, so it never ran and has been dropped.
+            assert embeddings.shape[2] == len(tokenized_sent)
+            word_means = torch.cat(
+                [
+                    torch.mean(
+                        embeddings[
+                            :,
+                            :,
+                            untok_tok_mapping[i][0] : untok_tok_mapping[i][-1] + 1,
+                            :,
+                        ],
+                        dim=2,
+                    )
+                    for i, tok in enumerate(untokenized_sent)
+                ],
+                dim=1,
+            ).unsqueeze(1)
+            aligned = torch.cat(
+                (
+                    embeddings[:, :, 0:1, :],
+                    word_means,
+                    embeddings[:, :, -1:, :],
+                ),
+                dim=2,
+            )
+            assert aligned.shape[2] == len(untokenized_sent) + 2
+            return aligned
+
+        use_kl = compute_kl and kl_weight > 0
+        if use_kl:
+            if self.tail is None:
+                raise ValueError(
+                    "gen_counterfactuals needs a `tail` module for the KL penalty; "
+                    "pass compute_kl=False or kl_weight=0 to run without it"
+                )
+            # Reference logits of the unperturbed sentence. The old code read
+            # `preperturb_logits` without ever defining it.
+            # NOTE(review): assumes the reference is the tail applied to the
+            # aligned original embeddings - confirm against the experiments.
+            with torch.no_grad():
+                preperturb_logits = self.tail(align_words(original_embeddings)[0])
+
+        model_embeddings.requires_grad = True
+        optimizer = torch.optim.Adam([model_embeddings], lr=lr)
+        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
+            optimizer, mode="min", factor=0.1, patience=scheduler_patience
+        )
+
+        prediction_loss = 100  # Initialize the prediction loss as high
+        increment_idx = 0
+
+        smallest_loss = prediction_loss
+        steps_since_best = 0
+        # Defined up front so the return below works even if no step runs or
+        # no step ever improves on the initial value.
+        initial_loss = None
+        best_embeddings = model_embeddings.detach().clone()
+        while prediction_loss > loss_tolerance:
+            if increment_idx >= num_steps:
+                if verbose:
+                    print("Breaking because of increment index")
+                break
+
+            log_step = increment_idx % print_every == 0 and verbose
+            if log_step:
+                print(f"=========== step {increment_idx} ===========")
+
+            # Gradients accumulate across backward() calls unless cleared.
+            optimizer.zero_grad()
+
+            aligned_model_embeddings = align_words(model_embeddings)
+
+            batch = {
+                "padded_embeddings": aligned_model_embeddings,
+                "gold_tuples": label_batch["gold_tuples"].clone(),
+                "action_ids": label_batch["action_ids"].clone(),
+                "continuous_action_masks": label_batch[
+                    "continuous_action_masks"
+                ].clone(),
+            }
+
+            loss_dict = probe.batch_step_train(batch)
+            loss = loss_dict["loss"]
+            prediction_loss = loss.clone().detach()
+            if increment_idx == 0:
+                initial_loss = loss.clone().detach()
+
+            if use_kl:
+                if log_step:
+                    print("computing kl")
+                # Only the first batch row goes through the tail, as before.
+                postperturb_logits = self.tail(aligned_model_embeddings[0])
+
+                prefix_kl_loss = (
+                    nn.functional.kl_div(
+                        preperturb_logits[:, :-2, :].log_softmax(-1),
+                        postperturb_logits[:, :-2, :].log_softmax(-1),
+                        reduction="none",
+                        log_target=True,
+                    )
+                    .sum(-1)
+                    .squeeze()
+                )
+
+                last_word_kl_loss = (
+                    nn.functional.kl_div(
+                        preperturb_logits[:, -2:-1, :].log_softmax(-1),
+                        postperturb_logits[:, -2:-1, :].log_softmax(-1),
+                        reduction="none",
+                        log_target=True,
+                    )
+                    .sum(-1)
+                    .squeeze()
+                )
+
+                last_word_gap = torch.abs(last_word_kl_loss.mean() - lastword_freebits)
+                prefix_gap = torch.abs(prefix_kl_loss.mean() - prefix_freebits)
+                loss = loss + kl_weight * (last_word_gap + prefix_gap)
+                if log_step:
+                    print(f"abs(last_word_kl - fb): {last_word_gap.detach()}")
+                    print(f"abs(prefix_kl - fb): {prefix_gap.detach()}")
+
+            loss.backward()
+            optimizer.step()
+            scheduler.step(loss)
+
+            if log_step:
+                print(f"steps_since_best: {steps_since_best}")
+                print(f"total_loss: {loss.detach()}")
+                print("==============================")
+                print()
+
+            if (smallest_loss - prediction_loss) > 0.001:
+                # NOTE(review): this snapshot is taken after optimizer.step(),
+                # so it is one update past the embeddings that produced
+                # `smallest_loss`; kept as in the original.
+                best_embeddings = model_embeddings.detach().clone()
+                steps_since_best = 0
+                smallest_loss = prediction_loss
+
+            else:
+                steps_since_best += 1
+                # Early stopping must not depend on `verbose`; only the
+                # message does.
+                if steps_since_best == patience:
+                    if verbose:
+                        print("Breaking because of patience with loss", smallest_loss)
+                    break
+            increment_idx += 1
+        if verbose:
+            print(f"Exited grad update loop after {increment_idx} steps, ")
+
+        return {
+            "padded_embeddings": best_embeddings,
+            "original_embeddings": original_embeddings[0],
+            "output_logits": None,
+            "original_logits": None,
+            "cfx_loss": float(prediction_loss),
+            "initial_loss": None if initial_loss is None else initial_loss.item(),
+        }
+
+    def parse_beamsearch(
+        self,
+        probe=None,
+        sentence=None,
+        generative=False,
+        topk=30,
+        ncont=5,
+    ):
+        """
+        Beam-search decoding of a parse for ``sentence`` using ``probe``.
+
+        Each outer iteration pops queued states until at least ``topk``
+        continuations have been gathered. For every popped state an inner search
+        applies probe-scored actions until a SHIFT is taken or no transition is
+        left. The continuations are re-queued, at most ``topk`` states are kept,
+        and states with no transition left are collected as results when they
+        shifted every word and have at most one zero among their head values.
+        With ``generative=True`` the kept states are additionally rescored via
+        ``gen_counterfactuals`` and ``tail_by_layer``.
+
+        inputs:
+            probe - IncrementalParse Probe; the code uses ``probe.oracle``,
+                ``probe.layer`` and ``probe.action_dists``
+            sentence - whitespace-separated input string
+            generative - enable the counterfactual rescoring step
+            topk - cap on continuations gathered and states kept per iteration
+            ncont - cap on states taken from the inner queue per inner step;
+                ``ncont // 10`` is the inner end-state threshold
+        outputs: list of (score, parsestate) tuples for the accepted end states
+        """
+        probe.eval().to(self.model.device)
+        init_parserstate = probe.oracle.initial_state()
+
+        # Word-level embeddings of the sentence, attached to the initial state.
+        original_model_embeddings, untok_tok_mapping = self.embeddings_w_map(
+            sentence, probe.layer
+        )
+        original_model_embeddings = self.align(
+            original_model_embeddings, untok_tok_mapping
+        )
+        init_parserstate.model_embeddings = original_model_embeddings
+        sentence_tokens = self.tokenizer.encode(
+            self.tokenizer.bos_token + sentence + self.tokenizer.eos_token,
+            return_tensors="pt",
+        ).to(self.model.device)[0]
+        # NOTE(review): bare expression, its value is discarded.
+        self.model.device
+        endstates = []
+        # Outer queue entries are (score, insertion counter, state); the counter
+        # breaks ties between equal scores.
+        states = PriorityQueue()
+        state_count = count()
+        states.put((0, next(state_count), init_parserstate))
+
+        sentence_len = len(sentence.split())
+
+        while True:
+            next_states = []
+            while states.qsize():
+                if len(next_states) >= topk:
+                    break
+                score, _, state = states.get()
+
+                ngram_init_state = state
+                # Integer division: this is 0 whenever ncont < 10.
+                ngram_beam_width = ncont // 10
+                ngram_topk = ncont
+                probe.eval()
+                # Number of ngrams to generate
+                ngram_endstates = []
+                # NOTE(review): inner queue entries are (score, state) with no
+                # tie-breaker, unlike the outer queue; equal scores fall back to
+                # comparing the states themselves - confirm the state type
+                # supports ordering.
+                ngram_states = PriorityQueue()
+                ngram_states.put((0, ngram_init_state))
+                # from itertools import count
+
+                while True:
+                    ngram_pruned_queue = PriorityQueue()
+                    ngram_state_model_embeddings = []
+                    ngram_node1s = []
+                    ngram_node2s = []
+                    ngram_action_ids = []
+                    ngram_continuous_action_masks = []
+                    # prune to the topl
+                    for i in range(ngram_topk):
+                        if ngram_states.qsize():
+                            ngram_score, ngram_state = ngram_states.get()
+                            # check if state has reached a shift or is terminal and check if we have the desired number of states
+                            # and state batch data to meta batch
+                            # only add to the batch if there is an action to predict
+                            if len(ngram_state.stack) > 1:
+                                ngram_state_batch = ngram_state.to_batch(probe)
+                                ngram_state_model_embeddings.append(
+                                    ngram_state_batch["padded_embeddings"]
+                                )
+                                ngram_node1s.append(ngram_state.stack[0])
+                                ngram_node2s.append(ngram_state.stack[1])
+                                ngram_action_ids.append(ngram_state_batch["action_ids"])
+                                ngram_continuous_action_masks.append(
+                                    ngram_state_batch["continuous_action_masks"]
+                                )
+                            ngram_pruned_queue.put((ngram_score, ngram_state))
+
+                    if ngram_node1s:
+                        ngram_batch = {
+                            "padded_embeddings": torch.cat(
+                                ngram_state_model_embeddings, dim=0
+                            ).to(self.model.device),
+                            "node1s": torch.tensor(ngram_node1s).to(self.model.device),
+                            "node2s": torch.tensor(ngram_node2s).to(self.model.device),
+                            "action_ids": torch.cat(ngram_action_ids, dim=0).to(
+                                self.model.device
+                            ),
+                            "continuous_action_masks": torch.cat(
+                                ngram_continuous_action_masks, dim=0
+                            ).to(self.model.device),
+                        }
+                        # run once for the whole q
+                        ngram_action_dists = probe.action_dists(ngram_batch)
+                    else:
+                        ngram_action_dists = []
+                    ngram_states = ngram_pruned_queue
+                    if not ngram_states.qsize():
+                        break
+
+                    # Counts the states that contributed a row to ngram_batch, so
+                    # rows of ngram_action_dists are read in queue order.
+                    ngram_c = count()
+                    ngram_next_states = []
+
+                    while ngram_states.qsize():
+                        ngram_score, ngram_state = ngram_states.get()
+                        """get predictions from probe"""
+                        ngram_possible_actions = np.array(
+                            [i[0] for i in ngram_state.transitionset()]
+                        )
+
+                        if len(ngram_state.stack) > 1:
+                            # get the action distribution for the current state
+                            # if stack <=1 dont need to increment because it's action dist isnt in the batch
+                            ngram_inc = next(ngram_c)
+                            ngram_node1, ngram_node2 = (
+                                ngram_state.stack[0],
+                                ngram_state.stack[1],
+                            )
+                            ngram_actions_dist = ngram_action_dists[ngram_inc][:3]
+
+                        else:
+                            # No probe prediction for this state: every action gets
+                            # -1e10 except SHIFT, which gets 0.
+                            ngram_node1, ngram_node2 = -1, -1
+                            ngram_actions_dist = (
+                                torch.zeros(probe.oracle.num_actions).to(
+                                    self.model.device
+                                )
+                                - 1e10
+                            )
+                            ngram_actions_dist[probe.oracle.a2i["SHIFT"]] = 0
+
+                        # take the top k scores
+                        ngram_log_prob, ngram_indexes = torch.topk(
+                            ngram_actions_dist, probe.oracle.num_actions
+                        )
+                        ngram_possible_action_mask = torch.zeros(
+                            probe.oracle.num_actions
+                        ).to(self.model.device)
+
+                        # Keep only the actions present in the state's transition set.
+                        for ngram_pa in ngram_possible_actions:
+                            ngram_possible_action_mask += ngram_indexes == ngram_pa
+
+                        ngram_log_prob, ngram_indexes = (
+                            ngram_log_prob[ngram_possible_action_mask.bool()],
+                            ngram_indexes[ngram_possible_action_mask.bool()],
+                        )
+
+                        for ngram_new_k, _ in enumerate(ngram_possible_actions):
+                            ngram_action = ngram_indexes[ngram_new_k].item()
+                            # Skip action id 2 when node2 is 0 and 0 already appears
+                            # among the heads.
+                            if (
+                                0 in ngram_state.heads_idxs()
+                                and ngram_node2 == 0
+                                and ngram_action == 2
+                            ):
+                                continue
+                            ngram_action_log_prob = ngram_log_prob[ngram_new_k].item()
+
+                            ngram_state_clone = ngram_state.clone()
+                            # transition from int doesnt work aparently so we need to give tuple
+                            probe.oracle.advance(ngram_state_clone, (ngram_action, -1))
+                            probe.oracle._preparetransitionset(ngram_state_clone)
+
+                            ngram_state_clone.action_tuples.append(
+                                [ngram_action, ngram_node1, ngram_node2]
+                            )
+                            ngram_state_clone.log_prob += ngram_action_log_prob
+
+                            ngram_state_clone.action_log_probs.append(
+                                ngram_action_log_prob
+                            )
+                            # An end state is one whose last action was SHIFT (and
+                            # that differs from the starting state) or that has no
+                            # transition left.
+                            if (
+                                ngram_state_clone.action_tuples[-1][0]
+                                == probe.oracle.a2i["SHIFT"]
+                                and ngram_state_clone.action_tuples
+                                != ngram_init_state.action_tuples
+                            ) or len(ngram_state_clone.transitionset()) == 0:
+                                ngram_endstates.append((ngram_score, ngram_state_clone))
+                                # if we reached maximum # of sentences required
+                                if (
+                                    len(ngram_endstates) >= ngram_beam_width
+                                    or not ngram_states.qsize()
+                                ):
+                                    # Leaves only the loop over candidate actions.
+                                    break
+                                else:
+                                    continue
+
+                            ngram_next_states.append(
+                                (-ngram_state_clone.log_prob, ngram_state_clone)
+                            )
+
+                    for ngram_ss in ngram_next_states:
+                        ngram_states.put(ngram_ss)
+
+                scores_conts = sorted(
+                    ngram_endstates, key=lambda x: x[0], reverse=False
+                )
+
+                for score, cont in scores_conts:
+                    # Reset the buffer: empty once every word was shifted, else the
+                    # single index num_shifts + 1.
+                    if cont.num_shifts == sentence_len:
+                        cont.buf = []
+                    else:
+                        cont.buf = [cont.num_shifts + 1]
+                    probe.oracle._preparetransitionset(cont)
+                    # next_states.append((-cont.log_prob/len(cont.action_tuples),_, cont))
+                    next_states.append((-cont.log_prob, _, cont))
+
+            # Re-queue the continuations with fresh tie-break counters.
+            for ss in next_states:
+                states.put((ss[0], next(state_count), ss[2]))
+
+            pruned_queue = PriorityQueue()
+            # mask the logits that are not the next token
+            next_token_masks = []
+            state_action_tuples = []
+
+            # prune to the topk
+            for i in range(topk):
+                if states.qsize():
+                    score, _, state = states.get()
+                    # check if state has reached a shift or is terminal and check if we have the desired number of states
+                    if len(state.transitionset()) == 0:
+                        # Discard terminal states that did not shift every word or
+                        # that have more than one zero among their head values.
+                        if (
+                            state.num_shifts != sentence_len
+                            or (
+                                np.array([state.head[i] for i in state.head.keys()])
+                                == 0
+                            ).sum()
+                            > 1
+                        ):
+                            if states.qsize():
+                                continue
+                            else:
+                                break
+                        endstates.append((score, state))
+
+                        if len(endstates) >= topk or not states.qsize():
+                            break
+                        else:
+                            continue
+
+                    if generative:
+                        # and state batch data to meta batch
+                        state_batch = state.to_batch(probe)
+                        mask = (
+                            torch.zeros(
+                                sentence_tokens.shape[0], self.tokenizer.vocab_size
+                            )
+                            .to(self.model.device)
+                            .unsqueeze(0)
+                        )
+                        # Mark the sub-word positions of the most recently shifted
+                        # word.
+                        mask[
+                            :,
+                            untok_tok_mapping[state.num_shifts - 1][
+                                0
+                            ] : untok_tok_mapping[state.num_shifts - 1][-1]
+                            + 1,
+                            :,
+                        ] = 1
+                        next_token_masks.append(mask)
+
+                        # Pad gold_tuples along dim 1 with rows of -1 up to length 400.
+                        state_action_tuples.append(
+                            torch.cat(
+                                [
+                                    state_batch["gold_tuples"],
+                                    torch.tensor([-1, -1, -1, -1])
+                                    .unsqueeze(0)
+                                    .repeat(
+                                        400 - state_batch["gold_tuples"].shape[1], 1
+                                    )
+                                    .unsqueeze(0)
+                                    .to(self.model.device),
+                                ],
+                                dim=1,
+                            )
+                        )
+
+                    pruned_queue.put((score, _, state))
+
+            if state_action_tuples:
+                batch = {"gold_tuples": torch.cat(state_action_tuples, dim=0)}
+
+                # NOTE(review): confirm that gen_counterfactuals accepts
+                # `output_probs` and that `batch` carries every key it reads; it
+                # indexes "action_ids" and "continuous_action_masks", which are
+                # not set here.
+                counterfactuals = self.gen_counterfactuals(
+                    probe=probe,
+                    sent=sentence,
+                    label_batch=batch,
+                    output_probs=False,
+                    print_every=100,
+                    lr=0.001,
+                    patience=100,
+                    num_steps=50000,
+                    loss_tolerance=0.01,
+                    prefix_freebits=0,
+                    lastword_freebits=0,
+                    kl_weight=0,  # .0001,
+                    scheduler_patience=1000,
+                    verbose=True,
+                    compute_kl=False,
+                )
+
+                # run once for the whole q
+                counterfactual_logprobs = self.tail_by_layer(
+                    probe.layer, counterfactuals["padded_embeddings"][:, 0, :, :]
+                ).log_softmax(dim=-1)
+                batch_mask = torch.cat(next_token_masks, dim=0)
+                # Gather the masked log-probabilities at the ids of the following
+                # sentence tokens and sum them per state.
+                next_word_log_probs = (
+                    torch.gather(
+                        counterfactual_logprobs[:, :-1] * batch_mask[:, :-1],
+                        -1,
+                        sentence_tokens[1:]
+                        .unsqueeze(0)
+                        .T.unsqueeze(0)
+                        .repeat(batch["gold_tuples"].shape[0], 1, 1),
+                    )
+                    .sum(-1)
+                    .sum(-1)
+                )
+
+                new_queue = PriorityQueue()
+                inc = count()
+                while pruned_queue.qsize():
+                    score, _, state = pruned_queue.get()
+                    state.log_prob += next_word_log_probs[next(inc)].item()
+                    # TODO: if using length norm then use it here
+                    new_queue.put((-state.log_prob, _, state))
+                states = new_queue
+            else:
+                states = pruned_queue
+
+            if not states.qsize():
+                break
+
+        return endstates
diff --git a/src/parse.py b/src/parse.py
new file mode 100644
index 0000000..7535de9
--- /dev/null
+++ b/src/parse.py
@@ -0,0 +1,239 @@
+import os
+import yaml
+import argparse
+from collections import defaultdict
+from itertools import count
+from tqdm import tqdm
+import torch
+import pandas as pd
+
+from experiment import IncrementalParseProbeExperiment
+from task import ParseDepthTask
+from datasets import PTB_Dataset
+from utils import ignored_tags
+
+from transformers import AutoTokenizer, GPT2LMHeadModel
+from transition import *
+from utils import *
+from gpt2 import GPT2_extended
+import json
+
+# Beam-search parsing evaluation script: loads a trained probe and GPT-2,
+# decodes every projective test sentence and writes the attachment metrics
+# to a CSV file under ./results.
+args = argparse.ArgumentParser()
+args.add_argument(
+    "--experiment_path",
+    type=str,
+    default="experiment_checkpoints/eval/gpt2/AttentiveProbe/layer_6/",
+)
+
+args = args.parse_args()
+
+# os.path.join works whether or not --experiment_path ends with a slash.
+with open(os.path.join(args.experiment_path, "config.yaml")) as file:
+    l_args = yaml.safe_load(file)
+
+print("loading probe...")
+l_args["probe_params"]["pretrained_model"] = l_args["pretrained_model"]
+l_args["probe_params"]["checkpoint_path"] = None
+exp = IncrementalParseProbeExperiment.load_from_checkpoint(
+    os.path.join(args.experiment_path, "checkpoints/last.ckpt")
+)
+p = exp.probe.eval()
+
+print("loading gpt2...")
+# NOTE(review): `device` is never used and the model is not moved to it -
+# confirm whether `gpt2.to(device)` was intended.
+device = "cuda"
+gpt2 = GPT2LMHeadModel.from_pretrained(
+    l_args["pretrained_model"], local_files_only=True
+)
+gpt2_tokenizer = AutoTokenizer.from_pretrained(
+    l_args["pretrained_model"], local_files_only=True
+)
+
+for param in gpt2.parameters():
+    param.requires_grad = False
+
+gpt2_ext = GPT2_extended(model=gpt2, tokenizer=gpt2_tokenizer, tail=None)
+
+results = pd.DataFrame(
+    columns=[
+        "model",
+        "probe_name",
+        "layer",
+        "loss",
+        "distance_mse",
+        "depth_mse",
+        "oracle_action_nll",
+        "f1",
+        "perplexity",
+        "accuracy",
+        "uuas_beamsearch",
+        "root_accuracy_beamsearch",
+        "root_accuracy_spanning_tree",
+        "uuas_spanning_tree",
+    ]
+)
+
+l_args["data_params"]["test"]["shuffle"] = False
+l_args["data_params"]["train"]["dry_run"] = 2
+l_args["data_params"]["valid"]["dry_run"] = 2
+l_args["data_params"]["test"]["dry_run"] = False
+
+l_args["probe_params"]["data_sources"].extend(
+    ["gold_distances", "gold_depths", "xpos", "gold_tuples"]
+)
+
+distance_depth_data = PTB_Dataset(config=l_args, probe=p)
+distance_depth_data.setup()
+
+
+def _safe_ratio(numerator, denominator):
+    """Return numerator / denominator, or NaN while the denominator is still 0."""
+    return numerator / float(denominator) if denominator else float("nan")
+
+
+# Running counters, accumulated over all scored sentences.
+total_sents = correct_root_predictions = 0
+uspan_correct = uspan_total = 0
+uas_correct = uas_total = 0
+uuas_w_head_total = uuas_w_head_correct = 0
+# Defined up front so the report below still works when no sentence is scored;
+# NaN rows are removed by the dropna() further down.
+root_acc = uas = uuas = uuas_w_head = float("nan")
+
+with open(distance_depth_data.test_dataset.data_path) as f:
+    incr = count()
+    for idx, line in tqdm(enumerate(f), desc="beamsearch decoding"):
+        o = json.loads(line)
+        if not (o["key"] == "sentence" and o["projective"]):
+            continue
+        # `inc` counts projective sentence lines and indexes the test dataset.
+        inc = next(incr)
+        if len(o["tokens"]) <= 1:
+            continue
+
+        # Retry with a doubled beam until a parse is found or the beam
+        # exceeds 100.
+        topk, ncont, parses = 10, 10, []
+        while not parses:
+            if topk > 100:
+                print("max beamsize exceeded, breaking")
+                break
+            print("topk:", topk, " /ncont:", ncont)
+            parses = gpt2_ext.parse_beamsearch(
+                probe=p,
+                sentence=" ".join(o["orig_tokens"]),
+                generative=False,
+                topk=topk,
+                ncont=ncont,
+            )
+            topk, ncont = topk * 2, ncont * 2
+
+        if not parses:
+            print("no parses found")
+            continue
+
+        # The batch is formatted once; the original built it twice.
+        batch = exp.format_batch(
+            [torch.tensor(i) for i in distance_depth_data.test_dataset[inc]]
+        )
+        vparse = parses[0][1]
+
+        gold_depths = batch["gold_depths"][: batch["lengths"]]
+        correct_root_predictions += (
+            (gold_depths == 0).nonzero(as_tuple=True)[0]
+        ).item() == get_nopunct_argmin(vparse.heads_idxs(), batch["xpos"])
+
+        total_sents += 1
+
+        # NOTE(review): the observation is looked up with the raw line index
+        # `idx`, while the batch above uses the sentence counter `inc`; the
+        # two differ as soon as a line is skipped - confirm which is intended.
+        gold_heads = [
+            i
+            for i, tag in zip(
+                distance_depth_data.test_dataset.observations[idx].head_indices,
+                o["tags"],
+            )
+            if tag not in ignored_tags
+        ]
+        pred_heads = [
+            i
+            for i, tag in zip(vparse.heads_idxs(), o["tags"])
+            if tag not in ignored_tags
+        ]
+
+        # head index -> list of dependents in the predicted parse
+        invert_heads = defaultdict(list)
+        for x, y in vparse.head.items():
+            invert_heads[int(y[0])].append(int(x))
+
+        overlap = [h for i, h in enumerate(pred_heads) if gold_heads[i] == str(h)]
+        undir_overlap = [
+            h
+            for i, h in enumerate(pred_heads)
+            if gold_heads[i] == str(h) or i in invert_heads[h]
+        ]
+        undir_overlap_no_root = [h for h in undir_overlap if h != 0]
+
+        uuas_w_head_correct += len(undir_overlap)
+        uuas_w_head_total += len(gold_heads)
+        uspan_correct += len(undir_overlap_no_root)
+        uspan_total += len(gold_heads) - 1
+        uas_correct += len(overlap)
+        uas_total += len(gold_heads)
+
+        # A denominator can still be 0 here (e.g. a first sentence with one
+        # scored token), so plain division would raise ZeroDivisionError.
+        root_acc = _safe_ratio(correct_root_predictions, total_sents)
+        uuas = _safe_ratio(uspan_correct, uspan_total)
+        uas = _safe_ratio(uas_correct, uas_total)
+        uuas_w_head = _safe_ratio(uuas_w_head_correct, uuas_w_head_total)
+
+        print(
+            "root_acc:",
+            root_acc,
+            "uas:",
+            uas,
+            "uuas:",
+            uuas,
+            "uuas_w_head:",
+            uuas_w_head,
+            "inc:",
+            inc,
+        )
+
+# pd.concat replaces the deprecated DataFrame.append.
+summary_row = {
+    "model": l_args["pretrained_model"],
+    "probe_name": l_args["probe_params"]["probe_name"],
+    "layer": l_args["probe_params"]["layer"],
+    "uuas_beamsearch": uuas,
+    "uas_beamsearch": uas,
+    "uuas_beamsearch_w_head": uuas_w_head,
+    "root_accuracy_beamsearch": root_acc,
+}
+results = pd.concat([results, pd.DataFrame([summary_row])], ignore_index=True)
+
+# Long format: one row per (model, probe, layer, metric); unset metrics dropped.
+results = results.melt(
+    id_vars=["model", "probe_name", "layer"], var_name="metric", value_name="value"
+).dropna()
+
+results_path = f'./results/results_{l_args["pretrained_model"]}_layer_{str(l_args["probe_params"]["layer"])}_{l_args["probe_params"]["probe_name"]}_beamsearch.csv'
+if os.path.exists(results_path):
+    # Merge with the existing file; on duplicate keys the new rows win.
+    net_res = pd.read_csv(results_path)
+    pd.concat([results, net_res]).drop_duplicates(
+        subset=["model", "probe_name", "layer", "metric"]
+    ).to_csv(results_path, index=False)
+else:
+    os.makedirs(os.path.dirname(results_path), exist_ok=True)
+    results.to_csv(results_path, header=True, index=False)
diff --git a/src/preprocess.py b/src/preprocess.py
new file mode 100755
index 0000000..308c2ae
--- /dev/null
+++ b/src/preprocess.py
@@ -0,0 +1,731 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Create data files
+source: https://github.com/aistairc/rnng-pytorch/blob/master/preprocess.py
+"""
+
+import os
+import sys
+import argparse
+import itertools
+from collections import defaultdict
+import utils as utils
+import re
+import shutil
+import json
+from multiprocessing import Pool
+import itertools
+
+from transition import (
+    ArcSwift,
+    ArcEagerReduce,
+    ArcEagerShift,
+    ArcStandard,
+    ArcHybrid,
+    ParserState_dec,
+)
+
+import json
+
+pad = "<pad>"
+unk = "<unk>"
+
+
+class Vocabulary(object):
+    """
+    This vocabulary prohibits registering a new token during lookup.
+    Vocabulary should be constructed from a set of tokens with counts (w2c), a dictionary
+    from a word to its count in the training data. (or anything)
+    """
+
+    def __init__(
+        self, w2c_list, pad="<pad>", unkmethod="unk", unktoken="<unk>", specials=[]
+    ):
+        self.pad = pad
+        self.padding_idx = 0
+        self.specials = specials
+        self.unkmethod = unkmethod
+        self.unktoken = unktoken
+        if self.unkmethod == "unk":
+            if unktoken not in specials:
+                specials.append(unktoken)
+
+        assert isinstance(w2c_list, list)
+        self.i2w = [self.pad] + specials + [w for w, _ in w2c_list]
+        self.w2i = dict([(w, i) for i, w in enumerate(self.i2w)])
+        self.w2c = dict(w2c_list)
+        self.i2c = dict([(self.w2i[w], c) for w, c in self.w2c.items()])
+
+        if self.unkmethod == "unk":
+            self.unk_id = self.w2i[self.unktoken]
+
+    def id_to_word(self, i):
+        return self.i2w[i]
+
+    def to_unk(self, w):
+        if self.unkmethod == "unk":
+            return self.unktoken
+        elif self.unkmethod == "berkeleyrule":
+            return utils.berkeley_unk_conv(w)
+        elif self.unkmethod == "berkeleyrule2":
+            return utils.berkeley_unk_conv2(w)
+
+    def to_unk_id(self, w_id):
+        if self.unkmethod == "unk":
+            return self.unk_id
+        else:
+            if 1 <= w_id < 1 + len(self.specials):
+                return w_id
+            else:
+                return self.get_id(utils.berkeley_unk_conv(self.i2w[w_id]))
+
+    def size(self):
+        return len(self.i2w)
+
+    def get_id(self, w):
+        if w not in self.w2i:
+            w = self.to_unk(w)
+            if w not in self.w2i:
+                # Back-off to a general unk token when converted unk-token is not registered in the
+                # vocabulary (which happens when an unseen unk token is generated at test time).
+                w = self.unktoken
+        return self.w2i[w]
+
+    def get_count_from_id(self, w_id):
+        if w_id not in self.i2c:
+            return 0
+        else:
+            return self.i2c[w_id]
+
+    def get_count(self, w):
+        if w not in self.w2c:
+            return 0
+        else:
+            return self.w2c[w]
+
+    # for serialization
+    def list_w2c(self):
+        return [(w, self.get_count(w)) for w in self.i2w[1 + len(self.specials) :]]
+
+    def dump(self, fn):
+        with open(fn, "wt") as o:
+            o.write(self.pad + "\n")
+            o.write(self.unkmethod + "\n")
+            o.write(self.unktoken + "\n")
+            o.write(" ".join(self.specials) + "\n")
+            for w, c in self.list_w2c():
+                o.write("{}\t{}\n".format(w, c))
+
+    def to_json_dict(self):
+        return {
+            "pad": self.pad,
+            "unkmethod": self.unkmethod,
+            "unktoken": self.unktoken,
+            "specials": self.specials,
+            "word_count": self.list_w2c(),
+        }
+
+    @staticmethod
+    def load(self, fn):
+        with open(fn) as f:
+            lines = [line for line in f]
+        pad, unkmethod, unktoken, specials = [l.strip() for l in line[:4]]
+        specials = [w for w in specials]
+
+        def parse_line(line):
+            w, c = line[:-1].split()
+            return w, int(c)
+
+        w2c_list = [parse_line(line) for line in lines[4:]]
+        return Vocabulary(w2c_list, pad, unkmethod, unktoken, specials)
+
+    @staticmethod
+    def from_data_json(data):
+        d = data["vocab"]
+        return Vocabulary(
+            d["word_count"], d["pad"], d["unkmethod"], d["unktoken"], d["specials"]
+        )
+
+
+def is_next_open_bracket(line, start_idx):
+    for char in line[(start_idx + 1) :]:
+        if char == "(":
+            return True
+        elif char == ")":
+            return False
+    raise IndexError(
+        "Bracket possibly not balanced, open bracket not followed by closed bracket"
+    )
+
+
+def get_next_bracket_index(line, start_idx):
+    for i in range(start_idx + 1, len(line)):
+        char = line[i]
+        if char == "(" or char == ")":
+            return i
+    raise IndexError(
+        "Bracket possibly not balanced, open bracket not followed by closed bracket"
+    )
+
+
+def get_between_brackets(line, start_idx):
+    output = []
+    for char in line[(start_idx + 1) :]:
+        if char == ")":
+            break
+        assert not (char == "(")
+        output.append(char)
+    return "".join(output)
+
+
+def get_tags_tokens_lowercase(line):
+    output = []
+    line = line.rstrip()
+    for i in range(len(line)):
+        if i == 0:
+            assert line[i] == "("
+        if line[i] == "(" and not (
+            is_next_open_bracket(line, i)
+        ):  # fulfilling this condition means this is a terminal symbol
+            output.append(get_between_brackets(line, i))
+    # print 'output:',output
+    output_tags = []
+    output_tokens = []
+    output_lowercase = []
+    for terminal in output:
+        terminal_split = terminal.split()
+        # print(terminal, terminal_split)
+        assert len(terminal_split) == 2  # each terminal contains a POS tag and word
+        output_tags.append(terminal_split[0])
+        output_tokens.append(terminal_split[1])
+        output_lowercase.append(terminal_split[1].lower())
+    return [output_tags, output_tokens, output_lowercase]
+
+
+def transform_to_subword_tree(line, sp):
+    line = line.rstrip()
+    tags, tokens, _ = get_tags_tokens_lowercase(line)
+    pieces = sp.encode(" ".join(tokens), out_type=str)
+    end_idxs = [i + 1 for i, p in enumerate(pieces) if "▁" in p]
+    begin_idxs = [0] + end_idxs[:-1]
+    spans = list(
+        zip(begin_idxs, end_idxs)
+    )  # map from original token idx to piece span idxs.
+
+    def get_piece_preterms(tok_i):
+        tag = tags[tok_i]
+        b, e = spans[tok_i]
+        span_pieces = pieces[b:e]
+        return " ".join(["({} {})".format(tag, p) for p in span_pieces])
+
+    new_preterms = [get_piece_preterms(i) for i in range(len(tokens))]
+    orig_token_spans = []
+    for i in range(len(line)):
+        if line[i] == "(":
+            next_bracket_idx = get_next_bracket_index(line, i)
+            found_bracket = line[next_bracket_idx]
+            if found_bracket == "(":
+                continue  # not terminal -> skip
+            orig_token_spans.append((i, next_bracket_idx + 1))
+    assert len(new_preterms) == len(orig_token_spans)
+    ex_span_ends = [span[0] for span in orig_token_spans] + [len(line)]
+    ex_span_begins = [0] + [span[1] for span in orig_token_spans]
+    parts = []
+    for i in range(len(new_preterms)):
+        parts.append(line[ex_span_begins[i] : ex_span_ends[i]])
+        parts.append(new_preterms[i])
+    parts.append(line[ex_span_begins[i + 1] : ex_span_ends[i + 1]])
+    return "".join(parts)
+
+
+def get_nonterminal(line, start_idx):
+    assert line[start_idx] == "("  # make sure it's an open bracket
+    output = []
+    for char in line[(start_idx + 1) :]:
+        if char == " ":
+            break
+        assert not (char == "(") and not (char == ")")
+        output.append(char)
+    return "".join(output)
+
+
+def get_actions(line):
+    """Convert a bracketed tree string into a list of parser actions.
+
+    The cursor ``i`` only ever rests on a bracket character. Each opening
+    bracket of a nonterminal emits "NT(<label>)", each terminal emits "SHIFT",
+    and each closing bracket the cursor rests on emits "REDUCE".
+
+    Args:
+        line: bracketed tree string; trailing whitespace is ignored.
+
+    Returns:
+        The list of action strings in left-to-right order.
+
+    Raises:
+        AssertionError: if the cursor lands on a non-bracket character or the
+            scan does not finish on the last character.
+        IndexError: if a scan for the next bracket runs past the end of the
+            string (unbalanced input).
+    """
+    output_actions = []
+    line_strip = line.rstrip()
+    i = 0
+    max_idx = len(line_strip) - 1
+    while i <= max_idx:
+        assert line_strip[i] == "(" or line_strip[i] == ")"
+        if line_strip[i] == "(":
+            if is_next_open_bracket(line_strip, i):  # open non-terminal
+                curr_NT = get_nonterminal(line_strip, i)
+                output_actions.append("NT(" + curr_NT + ")")
+                i += 1
+                while (
+                    line_strip[i] != "("
+                ):  # get the next open bracket, which may be a terminal or another non-terminal
+                    i += 1
+            else:  # it's a terminal symbol
+                output_actions.append("SHIFT")
+                # Skip to the terminal's own closing bracket ...
+                while line_strip[i] != ")":
+                    i += 1
+                # ... step past it, then advance to the next bracket.
+                i += 1
+                while line_strip[i] != ")" and line_strip[i] != "(":
+                    i += 1
+        else:
+            output_actions.append("REDUCE")
+            # The final closing bracket ends the scan.
+            if i == max_idx:
+                break
+            i += 1
+            while line_strip[i] != ")" and line_strip[i] != "(":
+                i += 1
+    assert i == max_idx
+    return output_actions
+
+
+def find_nts_in_tree(tree):
+    tree = tree.strip()
+    return re.findall(r"(?=\(([^\s]+)\s\()", tree)
+
+
+def get_sent_info(arg):
+    tree, setting = arg
+    tree = tree.strip()
+    lowercase, replace_num, vocab, sp = setting
+    if sp is not None:
+        # use sentencepiece
+        tree = transform_to_subword_tree(tree, sp)
+    subword_tokenized = sp is not None
+    tags, tokens, tokens_lower = get_tags_tokens_lowercase(tree)
+    tags, tokens, tokens_lower = get_tags_tokens_lowercase(tree)
+    orig_tokens = tokens[:]
+    if sp is None:
+        # these are not applied with sentencepiece
+        if lowercase:
+            tokens = tokens_lower
+        if replace_num:
+            tokens = [utils.clean_number(w) for w in tokens]
+
+        token_ids = [vocab.get_id(t) for t in tokens]
+        conved_tokens = [vocab.i2w[w_i] for w_i in token_ids]
+    else:
+        token_ids = sp.piece_to_id(tokens)
+        conved_tokens = tokens
+
+    return {
+        "orig_tokens": orig_tokens,
+        "tokens": conved_tokens,
+        "token_ids": token_ids,
+        "tags": tags,
+        "tree_str": tree,
+    }
+
+
+def make_vocab(
+    textfile,
+    seqlength,
+    minseqlength,
+    lowercase,
+    replace_num,
+    vocabsize,
+    vocabminfreq,
+    unkmethod,
+    jobs,
+    apply_length_filter=True,
+):
+    w2c = defaultdict(int)
+    with open(textfile, "r") as f:
+        trees = [tree.strip() for tree in f]
+    with Pool(jobs) as pool:
+        for tags, sent, sent_lower in pool.map(get_tags_tokens_lowercase, trees):
+            assert len(tags) == len(sent)
+            if lowercase:
+                sent = sent_lower
+            if replace_num:
+                sent = [utils.clean_number(w) for w in sent]
+            if (len(sent) > seqlength and apply_length_filter) or len(
+                sent
+            ) < minseqlength:
+                continue
+
+            for word in sent:
+                w2c[word] += 1
+    if unkmethod == "berkeleyrule" or unkmethod == "berkeleyrule2":
+        conv_method = (
+            utils.berkeley_unk_conv
+            if unkmethod == "berkeleyrule"
+            else utils.berkeley_unk_conv2
+        )
+        berkeley_unks = set([conv_method(w) for w, c in w2c.items()])
+        specials = list(berkeley_unks)
+    else:
+        specials = [unk]
+    if vocabminfreq:
+        w2c = dict([(w, c) for w, c in w2c.items() if c >= vocabminfreq])
+    elif vocabsize > 0 and len(w2c) > vocabsize:
+        sorted_wc = sorted(list(w2c.items()), key=lambda x: x[1], reverse=True)
+        w2c = dict(sorted_wc[:vocabsize])
+    return Vocabulary(list(w2c.items()), pad, unkmethod, unk, specials)
+
+
+def get_data(args):
+    def get_nonterminals(textfiles, jobs=-1):
+        nts = set()
+        for fn in textfiles:
+            with open(fn, "r") as f:
+                lines = [line for line in f]
+            with Pool(jobs) as pool:
+                local_nts = pool.map(find_nts_in_tree, lines)
+                nts.update(list(itertools.chain.from_iterable(local_nts)))
+        nts = sorted(list(nts))
+        print("Found nonterminals: {}".format(nts))
+        return nts
+
+    def convert(
+        textfile,
+        lowercase,
+        replace_num,
+        seqlength,
+        minseqlength,
+        outfile,
+        vocab,
+        sp,
+        apply_length_filter=True,
+        jobs=-1,
+    ):
+        dropped = 0
+        num_sents = 0
+        conv_setting = (lowercase, replace_num, vocab, sp)
+
+        def process_block(tree_with_settings, f):
+            _dropped = 0
+            with Pool(jobs) as pool:
+                for sent_info in pool.map(get_sent_info, tree_with_settings):
+                    tokens = sent_info["tokens"]
+                    if apply_length_filter and (
+                        len(tokens) > seqlength or len(tokens) < minseqlength
+                    ):
+                        _dropped += 1
+                        continue
+                    sent_info["key"] = "sentence"
+                    f.write(json.dumps(sent_info) + "\n")
+            return _dropped
+
+        with open(outfile, "w") as f, open(textfile, "r") as in_f:
+            block_size = 100000
+            tree_with_settings = []
+            for tree in in_f:
+                tree_with_settings.append((tree, conv_setting))
+                if len(tree_with_settings) >= block_size:
+                    dropped += process_block(tree_with_settings, f)
+                    num_sents += len(tree_with_settings)
+                    tree_with_settings = []
+                    print(num_sents)
+            if len(tree_with_settings) > 0:
+                process_block(tree_with_settings, f)
+                num_sents += len(tree_with_settings)
+
+            others = {
+                "vocab": vocab.to_json_dict() if vocab is not None else None,
+                "nonterminals": nonterminals,
+                "pad_token": pad,
+                "unk_token": unk,
+                "args": args.__dict__,
+            }
+            for k, v in others.items():
+                print("Saving {} to {}".format(k, outfile + "." + k))
+                f.write(json.dumps({"key": k, "value": v}) + "\n")
+
+        print(
+            "Saved {} sentences (dropped {} due to length/unk filter)".format(
+                num_sents, dropped
+            )
+        )
+
+    print("First pass through data to get nonterminals...")
+    nonterminals = get_nonterminals(
+        [args.trainfile, args.valfile, args.testfile], args.jobs
+    )
+
+    if args.unkmethod == "subword":
+        if args.vocabfile != "":
+            print(
+                "Loading pre-trained sentencepiece model from {}".format(args.vocabfile)
+            )
+            import sentencepiece as spm
+
+            sp = spm.SentencePieceProcessor(model_file=args.vocabfile)
+            sp_model_path = "{}-spm.model".format(args.outputpath)
+            print("Copy sentencepiece model to {}".format(sp_model_path))
+            shutil.copyfile(args.vocabfile, sp_model_path)
+        else:
+            print(
+                "unkmethod subword is selected. Running sentencepiece on the training data..."
+            )
+            sp = learn_sentencepiece(
+                args.trainfile, args.outputpath + "/" + "-spm", args
+            )
+        vocab = None
+    else:
+        if args.vocabfile != "":
+            print("Loading pre-specified source vocab from " + args.vocabfile)
+            vocab = Vocabulary.load(args.vocabfile)
+        else:
+            print("Second pass through data to get vocab...")
+            vocab = make_vocab(
+                args.trainfile,
+                args.seqlength,
+                args.minseqlength,
+                args.lowercase,
+                args.replace_num,
+                args.vocabsize,
+                args.vocabminfreq,
+                args.unkmethod,
+                args.jobs,
+            )
+        vocab.dump(args.outputpath + "/" + ".vocab")
+        print("Vocab size: {}".format(len(vocab.i2w)))
+        sp = None
+
+    convert(
+        args.testfile,
+        args.lowercase,
+        args.replace_num,
+        0,
+        args.minseqlength,
+        args.outputpath + "/" + "test.json",
+        vocab,
+        sp,
+        0,
+        args.jobs,
+    )
+    convert(
+        args.valfile,
+        args.lowercase,
+        args.replace_num,
+        args.seqlength,
+        args.minseqlength,
+        args.outputpath + "/" + "valid.json",
+        vocab,
+        sp,
+        0,
+        args.jobs,
+    )
+    convert(
+        args.trainfile,
+        args.lowercase,
+        args.replace_num,
+        args.seqlength,
+        args.minseqlength,
+        args.outputpath + "/" + "train.json",
+        vocab,
+        sp,
+        1,
+        args.jobs,
+    )
+
+
+def main(arguments):
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--vocabsize", type=int, default=100000)
+    parser.add_argument("--vocabminfreq", type=int, default=1)
+    parser.add_argument(
+        "--unkmethod",
+        choices=["unk", "berkeleyrule", "berkeleyrule2", "subword"],
+        default="berkeleyrule",
+    )
+    parser.add_argument("--subword_type", choices=["bpe", "unigram"], default="bpe")
+    parser.add_argument("--keep_ptb_bracket", action="store_true")
+    parser.add_argument("--subword_user_defined_symbols", nargs="*")
+    parser.add_argument("--lowercase", help="Lower case", action="store_true")
+    parser.add_argument(
+        "--replace_num", help="Replace numbers with N", action="store_true"
+    )
+    # parser.add_argument('--trainfile', help="Path to training data.",default='/data/cl/user/eisape/docker-home/incremental_parse_probe/data_large/train.txt')
+    # parser.add_argument('--valfile', help="Path to validation data.",default='/data/cl/user/eisape/docker-home/incremental_parse_probe/data_large/valid.txt')
+    # parser.add_argument('--testfile', help="Path to test validation data.",default='/data/cl/user/eisape/docker-home/incremental_parse_probe/data_large/test.txt')
+    parser.add_argument(
+        "--seqlength",
+        help="Maximum sequence length. Sequences longer than this are dropped.",
+        type=int,
+        default=300,
+    )
+    parser.add_argument(
+        "--minseqlength",
+        help="Minimum sequence length. Sequences shorter than this are dropped.",
+        type=int,
+        default=0,
+    )
+    parser.add_argument(
+        "--data_dir", help="Prefix of the output file names. ", type=str, default="data"
+    )
+    parser.add_argument("--vocabfile", type=str, default="")
+    parser.add_argument("--jobs", type=int, default=10)
+    # for example here is the command line to run the script
+    # python3 preprocess.py --trainfile data/train.txt --valfile data/valid.txt --testfile data/test.txt --outputfile ./data/ --jobs 10 --vocabminfreq 1 --lowercase
+    # comand to copy ./*.json to /data/cl/user/eisape/drive/ptb1/
+    # cp ./*.json /data/cl/user/eisape/drive/ptb1/
+    args = parser.parse_args(arguments)
+    if args.jobs == -1:
+        args.jobs = len(os.sched_getaffinity(0))
+    # set file pats by hand
+    args.trainfile = args.data_dir + "/train.txt"
+    args.valfile = args.data_dir + "/valid.txt"
+    args.testfile = args.data_dir + "/test.txt"
+    args.outputpath = args.data_dir
+
+    # np.random.seed(3435)
+    get_data(args)
+
+    def transsys_lookup(k):
+        lookup = {
+            "ASw": ArcSwift,
+            "AER": ArcEagerReduce,
+            "AES": ArcEagerShift,
+            "ASd": ArcStandard,
+            "AH": ArcHybrid,
+        }
+        return lookup[k]
+
+    def is_projective(lines):
+        projective = True
+
+        # find decendents
+        words = ["ROOT"]
+        for line in lines:
+            words += [line[1]]
+
+        children = [[] for i in range(len(words))]
+        for i, line in enumerate(lines):
+            try:
+                parent = int(line[6])
+                relation = line[7]
+                children[parent] += [(relation, i + 1)]
+            except Exception:
+                print(line)
+
+        decendents = [
+            set([child[1] for child in children[i]]) for i in range(len(words))
+        ]
+
+        change = True
+        while change:
+            change = False
+            for i in range(len(decendents)):
+                update = []
+                for d in decendents[i]:
+                    for d1 in decendents[d]:
+                        if d1 not in decendents[i]:
+                            update += [d1]
+                if len(update) > 0:
+                    decendents[i].update(update)
+                    change = True
+
+        for i, node in enumerate(children):
+            for child in node:
+                childid = child[1]
+                for j in range(min(childid, i) + 1, max(childid, i)):
+                    if j not in decendents[i]:
+                        projective = False
+
+        return projective
+
+    def processlines(lines, transsys):
+        arcs = [dict() for i in range(len(lines) + 1)]
+
+        pos = ["" for i in range(len(lines) + 1)]
+        fpos = ["" for i in range(len(lines) + 1)]
+
+        for i, line in enumerate(lines):
+            pos[i + 1] = line[3]  # fine-grained
+            fpos[i + 1] = line[4]
+            parent = int(line[6])
+            relation = line[7]
+            arcs[parent][i + 1] = transsys.mappings["rel"][relation]
+
+        res = [
+            ParserState_dec(["<ROOT>"] + lines, transsys=transsys, goldrels=arcs),
+            pos,
+        ]
+        if fpos:
+            res += [fpos]
+        else:
+            res == [None]
+        return res
+
+    for dataset in ["valid", "train", "test"]:
+        sents = []
+        ret_sents = []
+        ds = dataset
+        # if dataset == 'valid': ds='val'
+        with open(args.outputpath + "/" + ds + ".json", "r") as f:
+            for line in f:
+                o = json.loads(line)
+                if o["key"] == "sentence":
+                    sents.append(o)
+
+        count, nonproj, lines = 0, 0, []
+
+        with open(args.outputpath + "/" + ds + ".conllx", "r") as fin:
+            line = fin.readline()
+            while line:
+                if line.startswith("#"):
+                    line = fin.readline()
+                    continue
+                line = line.strip().split()
+                if len(line) > 0 and "-" in line[0]:
+                    line = fin.readline()
+                    continue
+
+                if len(line) == 0:
+                    if is_projective(lines):
+                        sents[count]["projective"] = True
+                        for tsys in ["ASd"]:
+                            sents[count][tsys] = {}
+                            transsys = transsys_lookup(tsys)("./data/mappings-ptb.txt")
+                            stck, buf, actions, tuples = [], [], [], []
+
+                            state, pos, fpos = processlines(lines, transsys)
+                            transsys = state.transsys
+
+                            while len(state.transitionset()) > 0:
+                                t = transsys.goldtransition(state)
+                                actions.append(t)
+                                stck.append(state.stack)
+                                buf.append(state.buf)
+                                tup = transsys.goldtransition(state, return_tuple=True)
+                                tuples.append(list(tup))
+                                transsys.advance(state, t)
+
+                            stck.append(state.stack)
+                            buf.append(state.buf)
+
+                            sents[count][tsys]["gold_stacks"] = stck
+                            sents[count][tsys]["gold_buffers"] = buf
+                            sents[count][tsys]["actions"] = actions
+                            sents[count][tsys]["action_tuples"] = tuples
+                            ret_sents.append(sents[count])
+                    else:
+                        # Remove non-projective sentences from the dataset
+                        sents[count]["projective"] = False
+                        ret_sents.append(sents[count])
+                    count += 1
+                    lines = []
+                else:
+                    lines += [line]
+                line = fin.readline()
+            if len(lines) > 0:
+                None
+        with open(
+            args.outputpath + "/" + ds + ".json", "w", encoding="utf8"
+        ) as json_file:
+            print(f"Writing {ds} to {args.outputpath+'/'+ds+'.json'}", os.getcwd())
+            for s in ret_sents:
+                json_file.write(json.dumps(s) + "\n")
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/src/task.py b/src/task.py
new file mode 100644
index 0000000..fa22e23
--- /dev/null
+++ b/src/task.py
@@ -0,0 +1,149 @@
+#  """Contains classes describing linguistic tasks of interest on annotated data."""
+
+import numpy as np
+import torch
+
+class Task:
+  """Abstract class representing a linguistic task mapping texts to labels."""
+
+  @staticmethod
+  def labels(observation):
+    """Maps an observation to a matrix of labels.
+    
+    Should be overriden in implementing classes.
+    """
+    raise NotImplementedError
+
+class ParseDistanceTask(Task):
+  """Maps observations to dependency parse distances between words."""
+
+  @staticmethod
+  def labels(observation):
+    """Computes the distances between all pairs of words; returns them as a torch tensor.
+
+    Args:
+      observation: a single Observation class for a sentence:
+    Returns:
+      A torch tensor of shape (sentence_length, sentence_length) of distances
+      in the parse tree as specified by the observation annotation.
+    """
+    sentence_length = len(observation[0]) #All observation fields must be of same length
+    distances = torch.zeros((sentence_length, sentence_length))
+    for i in range(sentence_length):
+      for j in range(i,sentence_length):
+        i_j_distance = ParseDistanceTask.distance_between_pairs(observation, i, j)
+        distances[i][j] = i_j_distance
+        distances[j][i] = i_j_distance
+    return distances
+
+  @staticmethod
+  def distance_between_pairs(observation, i, j, head_indices=None, disconnected_parse=False, subtree_distance=0):
+    '''
+    sub_tree_distance is the distance between subtrees assuming we have subtrees that are artificial connected through the root
+    '''
+    '''Computes path distance between a pair of words
+    TODO: It would be (much) more efficient to compute all pairs' distances at once;
+          this pair-by-pair method is an artefact of an older design, but
+          was unit-tested for correctness... 
+    Args:
+      observation: an Observation namedtuple, with a head_indices field.
+          or None, if head_indies != None
+      i: one of the two words to compute the distance between.
+      j: one of the two words to compute the distance between.
+      head_indices: the head indices (according to a dependency parse) of all
+          words, or None, if observation != None.
+    Returns:
+      The integer distance d_path(i,j)
+    ''' 
+    if i == j:
+      return 0
+    if observation:
+      head_indices = []
+      number_of_underscores = 0
+      for elt in observation.head_indices:
+        if elt == '_':
+          head_indices.append(0)
+          number_of_underscores += 1
+        else:
+          head_indices.append(int(elt) + number_of_underscores)
+    i_path = [i+1]
+    j_path = [j+1]
+    i_head = i+1
+    j_head = j+1
+    while True:
+      if not (i_head == 0 and (i_path == [i+1] or i_path[-1] == 0)):
+        i_head = head_indices[i_head - 1]
+        i_path.append(i_head)
+      if not (j_head == 0 and (j_path == [j+1] or j_path[-1] == 0)):
+        j_head = head_indices[j_head - 1]
+        j_path.append(j_head)
+      if i_head in j_path:
+        j_path_length = j_path.index(i_head)
+        i_path_length = len(i_path) - 1
+        
+        break
+      elif j_head in i_path:
+        i_path_length = i_path.index(j_head)
+        j_path_length = len(j_path) - 1
+        break
+      elif i_head == j_head:
+        i_path_length = len(i_path) - 1
+        j_path_length = len(j_path) - 1
+        break
+    
+    total_length = j_path_length + i_path_length
+    # nodes_along_path = j_path[:j_path_length] + i_path[:i_path_length]
+    # ''' subtree_distance
+    # if 
+    return total_length
+
+class ParseDepthTask:
+  """Maps observations to a depth in the parse tree for each word"""
+
+  @staticmethod
+  def labels(observation):
+    """Computes the depth of each word; returns them as a torch tensor.
+
+    Args:
+      observation: a single Observation class for a sentence:
+    Returns:
+      A torch tensor of shape (sentence_length,) of depths
+      in the parse tree as specified by the observation annotation.
+    """
+    sentence_length = len(observation[0]) #All observation fields must be of same length
+    depths = torch.zeros(sentence_length)
+    for i in range(sentence_length):
+      depths[i] = ParseDepthTask.get_ordering_index(observation, i)
+    return depths
+
+  @staticmethod
+  def get_ordering_index(observation, i, head_indices=None):
+    '''Computes tree depth for a single word in a sentence
+
+    Args:
+      observation: an Observation namedtuple, with a head_indices field.
+          or None, if head_indies != None
+      i: the word in the sentence to compute the depth of
+      head_indices: the head indices (according to a dependency parse) of all
+          words, or None, if observation != None.
+
+    Returns:
+      The integer depth in the tree of word i
+    '''
+    if observation:
+      head_indices = []
+      number_of_underscores = 0
+      for elt in observation.head_indices:
+        if elt == '_':
+          head_indices.append(0)
+          number_of_underscores += 1
+        else:
+          head_indices.append(int(elt) + number_of_underscores)
+    length = 0
+    i_head = i+1
+    while True:
+      i_head = head_indices[i_head - 1]
+      if i_head != 0:
+        length += 1
+      else:
+        return length
\ No newline at end of file
diff --git a/src/train.py b/src/train.py
new file mode 100644
index 0000000..7e2782a
--- /dev/null
+++ b/src/train.py
@@ -0,0 +1,77 @@
+import os
+
+# os.environ["CUDA_VISIBLE_DEVICES"] = '4'
+
+import yaml
+import argparse
+import shutil
+import random
+from pathlib import Path
+
+import torch
+
+import architectures
+import datasets
+from utils import *
+
+from pytorch_lightning import Trainer
+from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.utilities.seed import seed_everything
+from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
+from pytorch_lightning.callbacks.early_stopping import EarlyStopping
+from experiment import IncrementalParseProbeExperiment
+
+# Turn off cuDNN entirely and disable TF32 for both CUDA matmuls and cuDNN.
+torch.backends.cudnn.enabled = False
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False
+# presumably read by the HuggingFace tokenizers library — confirm
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+# Command line: a YAML config path and one or more device ids.
+parser = argparse.ArgumentParser()
+parser.add_argument("--config", dest="filename", default="./configs/test.yaml")
+parser.add_argument("--device", dest="device", default=[0], nargs="+")
+
+args = parser.parse_args()
+# Values given on the command line arrive as strings; normalise to ints.
+device = [int(d) for d in args.device]
+config_path = args.filename
+# From here on `args` is the parsed YAML config, not the argparse namespace.
+with open(args.filename, "r") as file:
+    args = yaml.safe_load(file)
+
+print(f"======= Training {args['probe_params']['probe_name']} =======")
+
+# The CLI device list always replaces any `gpus` entry in trainer_params.
+args["trainer_params"]["gpus"] = device
+# NOTE(review): this overwrites any manual_seed present in the config with a
+# random integer in [1000, 2000]. The config copied into the log directory
+# below is the on-disk file, so the seed actually used is not recorded there.
+args["exp_params"]["manual_seed"] = random.randint(1000, 2000)
+
+tb_logger = TensorBoardLogger(
+    save_dir=args["logging_params"]["save_dir"],
+    name=args["probe_params"]["probe_name"],
+    version=args["logging_params"]["version"],
+)
+# Create the run directory up front so the config can be copied next to the logs.
+Path(f"{tb_logger.log_dir}").mkdir(exist_ok=True, parents=True)
+shutil.copy2(config_path, f"{tb_logger.log_dir}/config.yaml")
+# Seed before the probe is constructed so its initialisation uses this seed.
+seed_everything(args["exp_params"]["manual_seed"], True)
+
+# The probe only receives probe_params, so the top-level pretrained_model
+# entry is copied into it.
+args["probe_params"]["pretrained_model"] = args["pretrained_model"]
+
+# probe_type names a class (or callable) in the `architectures` module.
+# NOTE(review): .to("cuda") targets the default CUDA device regardless of
+# the ids passed via --device.
+probe = getattr(architectures, args["probe_params"]["probe_type"])(
+    args["probe_params"]
+).to("cuda")
+
+# Build the trainer and start fitting immediately. Early stopping and
+# checkpoint selection both monitor "val_loss"; the five best checkpoints
+# plus the last one are kept under <log_dir>/checkpoints.
+Trainer(
+    logger=tb_logger,
+    callbacks=[
+        EarlyStopping(monitor="val_loss"),
+        LearningRateMonitor(),
+        ModelCheckpoint(
+            save_top_k=5,
+            dirpath=os.path.join(tb_logger.log_dir, "checkpoints"),
+            monitor="val_loss",
+            filename="{epoch}-{val_loss:.2f}",
+            save_last=True,
+        ),
+    ],
+    strategy="ddp",
+    **args["trainer_params"],
+).fit(
+    IncrementalParseProbeExperiment(probe=probe, params=args["exp_params"]),
+    datamodule=datasets.PTB_Dataset(config=args, probe=probe),
+)
diff --git a/src/transition.py b/src/transition.py
new file mode 100644
index 0000000..f5e1b25
--- /dev/null
+++ b/src/transition.py
@@ -0,0 +1,1030 @@
+"""
+Implementation of transition systems.
+
+The TransitionSystem class is an "interface" for all of the
+subclasses that are being used, but isn't really used anywhere
+explicitly itself.
+source: https://github.com/qipeng/arc-swift/blob/master/src/transition.py
+"""
+from smart_open import smart_open
+import torch  
+import random
+import copy
+import torch.nn as nn
+from collections import defaultdict
+import numpy as np
+
+class ParserState_dec:
+    """Parser configuration (stack, buffer, heads) plus decoding bookkeeping.
+
+    Besides the stack/buffer/head triple, a state carries the action history,
+    log-probabilities, words and model embeddings. `clone` copies all of
+    these and `to_batch` packs some of them into tensors.
+    """
+    def __init__(self, sentence = [None], transsys=None, goldrels=None):
+        """Builds the initial configuration for `sentence`.
+
+        Args:
+          sentence: sequence whose length fixes the buffer size; the buffer
+              holds 1 .. len(sentence)-1, so the default one-element list
+              yields an empty buffer.
+          transsys: optional transition system; when given, its
+              `_preparetransitionset` is called on the new state.
+          goldrels: optional gold arcs, stored as-is.
+        """
+        self.history = []
+        self.action_tuples = []
+        self.model_embeddings = torch.tensor([])
+        self.log_prob = 0
+        self.num_shifts = 0
+        self.action_log_probs = []
+        self.conditional_likelihood= []
+        self.word_log_probs = []
+        self.words = []
+        self.expanded = False
+
+        # Index 0 starts on the stack; the remaining indices wait in the buffer.
+        self.stack = [0]
+        self.buf = [i+1 for i in range(len(sentence)-1)]
+        # head and relation labels
+        # Keyed by word index; heads_idxs reads element 0 of each value as the head.
+        self.head = defaultdict(list) #[[-1, -1] for _ in range(len(sentence))]
+
+        self.goldrels = goldrels
+
+        self.transsys = transsys
+        if self.transsys is not None:
+            self.transsys._preparetransitionset(self)
+
+        self.terminated = False
+    
+    def to_batch(self, probe):
+        """Packs this state's action history and embeddings into a batch of one.
+
+        Args:
+          probe: object exposing `parameters()` (used to pick the device) and
+              an `args` mapping with a 'data_sources' entry.
+
+        Returns:
+          A dict with keys 'gold_tuples', 'padded_embeddings', 'action_ids'
+          and 'continuous_action_masks'. All but 'padded_embeddings' get a
+          leading dimension of size 1 via unsqueeze(0).
+        """
+        device = next(probe.parameters()).device
+        # Each action tuple (a list) is extended with a trailing -1.
+        gold_tuples = torch.tensor([t+[-1] for t in self.action_tuples]).unsqueeze(0).to(device)
+        model_embeddings = self.model_embeddings.detach().clone().to(device)
+
+        action_ids = [t[0] for t in self.action_tuples]
+
+        if 'continuous_action_masks' in probe.args['data_sources']:
+            # NOTE(review): generate_continuous_mask is not defined in the part
+            # of the file visible here; its contract could not be checked.
+            mask =generate_continuous_mask(action_ids, model_embeddings.shape[2])#self.num_shifts+1)
+            cont_mask = mask
+            # cont_mask = np.pad(mask,
+            #                     ((0, 400 - len(mask)),(0,0)),
+            #                     'constant', constant_values=-1)
+        # Placeholder mask when continuous masks are not a requested data source.
+        else: cont_mask = torch.tensor([-1])
+
+        return {'gold_tuples':gold_tuples,
+                'padded_embeddings': model_embeddings,
+                'action_ids':torch.tensor(action_ids).unsqueeze(0).to(device),
+                'continuous_action_masks':torch.tensor(cont_mask).unsqueeze(0).to(device)} #tuples
+
+    # Head of every word that has an entry in self.head, ordered by word index.
+    def heads_idxs(self): return [self.head[i][0] for i in sorted(self.head.keys())]
+
+    def incremental_distance_matrix(self):
+        """Builds pairwise distance and relative-depth matrices.
+
+        Returns:
+          (distances, relative_depths): two square float tensors with one
+          row/column per entry of `heads_idxs()`. `distances` is symmetric;
+          `relative_depths` is antisymmetric (entry [j][i] is the negation
+          of entry [i][j]).
+        """
+        sentence_length = len(self.heads_idxs()) #All observation fields must be of same length
+        distances = torch.zeros((sentence_length, sentence_length))
+        relative_depths = torch.zeros((sentence_length, sentence_length))
+        for i in range(sentence_length):
+            for j in range(i,sentence_length):
+                # print(self.incremental_distance(i, j))
+                i_j_distance,i_j_relative_depth = self.incremental_distance(i, j)
+                distances[i][j] = i_j_distance
+                distances[j][i] = i_j_distance
+
+                relative_depths[i][j] = i_j_relative_depth
+                relative_depths[j][i] = -i_j_relative_depth
+
+        return distances, relative_depths
+
+    def incremental_distance(self, i, j,unconnected_pad = 1):
+        """Path length and relative depth between positions i and j.
+
+        Both positions are 0-based indices into `heads_idxs()`. The method
+        follows head links upward from each until the two paths meet.
+
+        Args:
+          i, j: 0-based positions.
+          unconnected_pad: see the NOTE(review) below; as written it does
+              not influence the returned values.
+
+        Returns:
+          (0, 0) when i == j; (-1, inf) when a head of -1 lies on the
+          collected path; otherwise (total_length, rel_depth) where
+          total_length is the sum of both path lengths and rel_depth is
+          j_path_length - i_path_length.
+        """
+        if i == j:
+            return 0, 0
+        # if observation:
+        # Same head normalisation as used for string annotations: '_' maps
+        # to head 0 and shifts later heads by the number of '_' seen.
+        head_indices = []
+        number_of_underscores = 0
+        for elt in self.heads_idxs():
+            # print(elt)
+            if elt == '_':
+                head_indices.append(0)
+                number_of_underscores += 1
+            else:
+                head_indices.append(int(elt) + number_of_underscores)
+        # Paths hold 1-based node ids, starting at the word itself.
+        i_path = [i+1]
+        j_path = [j+1]
+        i_head = i+1
+        j_head = j+1
+        while True:
+            # Advance each side one step unless it has already reached head 0.
+            if not (i_head == 0 and (i_path == [i+1] or i_path[-1] == 0)):
+                i_head = head_indices[i_head - 1]
+                i_path.append(i_head)
+            if not (j_head == 0 and (j_path == [j+1] or j_path[-1] == 0)):
+                j_head = head_indices[j_head - 1]
+                j_path.append(j_head)
+            # Stop as soon as one side's current node appears on the other path.
+            if i_head in j_path:
+                j_path_length = j_path.index(i_head)
+                i_path_length = len(i_path) - 1
+                
+                break
+            elif j_head in i_path:
+                i_path_length = i_path.index(j_head)
+                j_path_length = len(j_path) - 1
+                break
+            elif i_head == j_head:
+                i_path_length = len(i_path) - 1
+                j_path_length = len(j_path) - 1
+                break
+        
+        total_length = j_path_length + i_path_length
+        nodes_along_path = j_path[:j_path_length+1] + i_path[:i_path_length+1]
+
+        # NOTE(review): the adjustment below has no visible effect, because
+        # the next `if` returns (-1, inf) under the same condition.
+        if -1 in nodes_along_path:
+            if unconnected_pad: total_length += unconnected_pad
+            else: total_length = -1
+
+        # if return_rel_depth:
+        if -1 in nodes_along_path:
+            return -1, float('inf')
+
+        rel_depth = -(i_path_length - j_path_length) if not j_path_length == i_path_length else 0
+        return total_length, rel_depth
+
+    def transitionset(self):
+        """Returns the transition set last stored on this state by a transition system."""
+        return self._transitionset
+
+    def clone(self, clone_embeddings=True):
+        """Returns a copy of this state.
+
+        Containers are shallow-copied, except `conditional_likelihood` and
+        (optionally) `model_embeddings`, which are deep-copied. The
+        transition system itself is shared.
+
+        Args:
+          clone_embeddings: when False, the copy's `model_embeddings` is an
+              empty list instead of a copy of the tensor.
+        """
+        res = ParserState_dec([])
+        res.stack = copy.copy(self.stack)
+        res.buf = copy.copy(self.buf)
+        res.head = copy.copy(self.head)
+        # res.pos = copy.copy(self.pos)
+        res.goldrels = copy.copy(self.goldrels)
+        res.transsys = self.transsys
+        res.terminated = self.terminated
+        res.action_tuples = copy.copy(self.action_tuples)
+        res.log_prob = self.log_prob
+        res.num_shifts = self.num_shifts
+        res.action_log_probs = copy.copy(self.action_log_probs)
+        res.conditional_likelihood = copy.deepcopy(self.conditional_likelihood)
+        if clone_embeddings: res.model_embeddings = copy.deepcopy(self.model_embeddings)
+        else: res.model_embeddings = []
+        res.word_log_probs = copy.copy(self.word_log_probs)
+        res.words = copy.copy(self.words)
+        res.expanded = self.expanded
+        res.history = copy.copy(self.history)
+        
+        # Only present once a transition system has prepared this state.
+        if hasattr(self, '_transitionset'):
+            res._transitionset = copy.copy(self._transitionset)
+        return res
+
+class ParserState:
+    def __init__(self, sentence, transsys=None, goldrels=None):
+
+        self.stack = [0]
+        self.buf = [i+1 for i in range(len(sentence)-1)]
+        self.head = [[-1, -1] for _ in range(len(sentence))]
+        self.pos = [-1 for _ in range(len(sentence))]
+        self.goldrels = goldrels
+        self.transsys = transsys
+        if self.transsys is not None: self.transsys._preparetransitionset(self)
+
+    def transitionset(self): return self._transitionset
+
+    def clone(self):
+        res = ParserState([])
+        res.stack = copy.copy(self.stack)
+        res.buf = copy.copy(self.buf)
+        res.head = copy.copy(self.head)
+        res.pos = copy.copy(self.pos)
+        res.goldrels = copy.copy(self.goldrels)
+        res.transsys = self.transsys
+        if hasattr(self, '_transitionset'):
+            res._transitionset = copy.copy(self._transitionset)
+        return res
+
+class TransitionSystem(object):
+    def __init__(self, mappings_file):
+        self.mappings, self.invmappings = read_mappings(mappings_file, self.actions_list(), log=None)
+
+    def _preparetransitionset(self, parserstate):
+        """ Prepares the set of gold transitions given a parser state """
+        raise NotImplementedError()
+
+    def advance(self, parserstate, action):
+        """ Advances a parser state given an action """
+        raise NotImplementedError()
+
+    def goldtransition(self, parserstate, goldrels):
+        """ Returns the next gold transition given the set of gold arcs """
+        raise NotImplementedError()
+
+    def trans_to_str(self, transition, state, pos, fpos=None):
+        raise NotImplementedError()
+
+    @classmethod
+    def trans_from_line(self, line):
+        raise NotImplementedError()
+
+    @classmethod
+    def actions_list(self):
+        raise NotImplementedError()
+
+class ArcSwift(TransitionSystem):
+    def __init__(self, mappings_file):
+        self.mappings, self.invmappings = read_mappings(mappings_file, self.actions_list(), log=None)
+        self.name='ASw'
+    @classmethod
+    def actions_list(self):
+        return ['SHIFT', 'Left-Arc', 'Right-Arc']
+
+    def _preparetransitionset(self, parserstate):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        stack, buf, head = parserstate.stack, parserstate.buf, parserstate.head
+
+        t = []
+
+        if len(buf) > 1:
+            t += [(SHIFT, -1)]
+
+        left_possible = False
+        if len(buf) > 0:
+            for si in range(len(stack) - 1):
+                if head[stack[si]][0] < 0:
+                    t += [(LEFTARC, si)]
+                    left_possible = True
+                    break
+        if len(buf) > 1 or (len(buf) == 1 and not left_possible):
+            for si in range(len(stack)):
+                t += [(RIGHTARC, si)]
+                if head[stack[si]][0] < 0:
+                    break
+
+        parserstate._transitionset = t
+
+    def advance(self, parserstate, action):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        RELS = len(self.mappings['rel'])
+        cand = parserstate.transitionset()
+
+        if isinstance(action, int):
+            a, rel = self.tuple_trans_from_int(cand, action)
+        else:
+            rel = action[-1]
+            a = action[:-1]
+
+        stack = parserstate.stack
+        buf = parserstate.buf
+
+        if a[0] == SHIFT:
+            parserstate.stack = [buf[0]] + stack
+            parserstate.buf = buf[1:]
+        elif a[0] == LEFTARC:
+            si = a[1]
+            parserstate.head[stack[si]] = [buf[0], rel]
+            parserstate.stack = stack[(si+1):]
+        elif a[0] == RIGHTARC:
+            si = a[1]
+            parserstate.head[buf[0]] = [stack[si], rel]
+            parserstate.stack = [buf[0]] + stack[si:]
+            parserstate.buf = buf[1:]
+
+        self._preparetransitionset(parserstate)
+
+    def goldtransition(self, parserstate, goldrels=None, return_tuple=False):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        goldrels = goldrels or parserstate.goldrels
+        stack = parserstate.stack
+        buf = parserstate.buf
+        head = parserstate.head
+
+        j = buf[0]
+        addedArc = False
+        for n in range(len(stack)):
+            if stack[n] in goldrels[j]:
+                rel = goldrels[j][stack[n]]
+                a = (LEFTARC, n, rel)
+                addedArc = True
+                
+                break
+            elif j in goldrels[stack[n]]:
+                rel = goldrels[stack[n]][j]
+                a = (RIGHTARC, n, rel)
+                addedArc = True
+                break
+            if head[stack[n]][0] < 0: break
+
+        if not addedArc:
+            a = (SHIFT, -1, -1)
+            if return_tuple:
+            #this means we did |stack| comparisions and non of them succeeded
+            #choice point, if we want to optimize for implicit action return full stack
+                return a[0], buf[0], stack
+            
+        if return_tuple:
+            #this means we did n comparisions and only the last on succeeded
+            return a[0], buf[0], stack[:n]
+        return a
+
+    def trans_to_str(self, t, state, pos, fpos=None):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        if t[0] == SHIFT:
+            if fpos is None:
+                return "SHIFT\t%s" % (pos[state.buf[0]])
+            else:
+                return "SHIFT\t%s\t%s" % (pos[state.buf[0]], fpos[state.buf[0]])
+        elif t[0] == LEFTARC:
+            return "Left-Arc\t%d\t%s" % (t[1]+1, self.invmappings['rel'][t[2]])
+        elif t[0] == RIGHTARC:
+            if fpos is None:
+                return "Right-Arc\t%d\t%s\t%s" % (t[1]+1, self.invmappings['rel'][t[2]], pos[state.buf[0]])
+            else:
+                return "Right-Arc\t%d\t%s\t%s\t%s" % (t[1]+1, self.invmappings['rel'][t[2]], pos[state.buf[0]], fpos[state.buf[0]])
+
+    @classmethod
+    def trans_from_line(self, line):
+        if line[0] == 'Left-Arc':
+            fields = { 'action':line[0], 'n':int(line[1]), 'rel':line[2] }
+        elif line[0] == 'Right-Arc':
+            fields = { 'action':line[0], 'n':int(line[1]), 'rel':line[2], 'pos':line[3] }
+            if len(line) > 4:
+                fields['fpos'] = line[4]
+        elif line[0] == 'SHIFT':
+            fields = { 'action':line[0], 'pos':line[1] }
+            if len(line) > 2:
+                fields['fpos'] = line[2]
+        else:
+            raise ValueError(line[0])
+        return fields
+
+    def tuple_trans_to_int(self, cand, t):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        RELS = len(self.mappings['rel'])
+
+        base = 0
+        if t[0] == SHIFT:
+            return 0
+
+        if cand[0][0] == SHIFT:
+            base = 1
+
+        if t[0] == LEFTARC:
+            return base + t[2]
+
+        if len(cand) > 1 and cand[1][0] == LEFTARC:
+            base += RELS
+
+        if t[0] == RIGHTARC:
+            return base + t[1]*RELS + t[2]
+
+    def tuple_trans_from_int(self, cand, action):
+        SHIFT = self.mappings['action']['SHIFT']
+        RELS = len(self.mappings['rel'])
+        rel = -1
+
+        if cand[0][0] == SHIFT:
+            if action == 0:
+                a = cand[0]
+            else:
+                a = cand[(action - 1) / RELS + 1]
+                rel = (action - 1) % RELS
+        else:
+            a = cand[action / RELS]
+            rel = action % RELS
+
+        return a, rel
+
+class ArcEagerReduce(TransitionSystem):
+    def __init__(self, mappings_file):
+        self.mappings, self.invmappings = read_mappings(mappings_file, self.actions_list(), log=None)
+        self.name='AER'
+        
+    @classmethod
+    def actions_list(self):
+        return ['SHIFT', 'Left-Arc', 'Right-Arc', 'Reduce']
+
+    def _preparetransitionset(self, parserstate):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        REDUCE = self.mappings['action']['Reduce']
+
+        stack, buf, head = parserstate.stack, parserstate.buf, parserstate.head
+
+        t = []
+
+        if len(buf) > 1:
+            t += [(SHIFT,)]
+
+        if len(buf) > 0 and len(stack) > 1:
+            t += [(REDUCE,)]
+
+        left_possible = False
+        if len(buf) > 0 and len(stack) > 1:
+            if head[stack[0]][0] < 0:
+                t += [(LEFTARC,)]
+                left_possible = True
+
+        if len(buf) > 1 or (len(buf) == 1 and not left_possible):
+            t += [(RIGHTARC,)]
+
+        parserstate._transitionset = t
+
+    def advance(self, parserstate, action):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        REDUCE = self.mappings['action']['Reduce']
+
+        RELS = len(self.mappings['rel'])
+        cand = parserstate.transitionset()
+
+        if isinstance(action, int):
+            a, rel = self.tuple_trans_from_int(cand, action)
+        else:
+            rel = action[-1]
+            a = action[:-1]
+
+        stack = parserstate.stack
+        buf = parserstate.buf
+
+        if a[0] == SHIFT:
+            parserstate.stack = [buf[0]] + stack
+            parserstate.buf = buf[1:]
+        elif a[0] == LEFTARC:
+            parserstate.head[stack[0]] = [buf[0], rel]
+            parserstate.stack = stack[1:]
+        elif a[0] == RIGHTARC:
+            parserstate.head[buf[0]] = [stack[0], rel]
+            parserstate.stack = [buf[0]] + stack
+            parserstate.buf = buf[1:]
+        elif a[0] == REDUCE:
+            parserstate.stack = stack[1:]
+
+        self._preparetransitionset(parserstate)
+
+    def goldtransition(self, parserstate, goldrels=None, return_tuple=False):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        REDUCE = self.mappings['action']['Reduce']
+
+        goldrels = goldrels or parserstate.goldrels
+        stack = parserstate.stack
+        buf = parserstate.buf
+        head = parserstate.head
+
+        POS = len(self.mappings['pos'])
+
+        j = buf[0]
+
+        norightchildren = True
+        for x in buf:
+            if x in goldrels[stack[0]]:
+                norightchildren = False
+                break
+
+        if stack[0] in goldrels[j]:
+            rel = goldrels[j][stack[0]]
+            a = (LEFTARC, rel)
+
+            if return_tuple:
+                return a[0], buf[0], stack[0]
+
+        elif j in goldrels[stack[0]]:
+            rel = goldrels[stack[0]][j]
+            a = (RIGHTARC, rel)
+
+            if return_tuple:
+                return a[0], buf[0], stack[0]
+
+        elif head[stack[0]][0] >= 0 and norightchildren:
+            a = (REDUCE, -1)
+            if return_tuple:
+                return a[0], buf[0], stack[0]
+
+        else:
+            a = (SHIFT, -1)
+            if return_tuple:
+                return a[0], buf[0], stack[0]
+
+        return a
+
+    def trans_to_str(self, t, state, pos, fpos=None):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        REDUCE = self.mappings['action']['Reduce']
+        if t[0] == SHIFT:
+            if fpos is None:
+                return "SHIFT\t%s" % (pos[state.buf[0]])
+            else:
+                return "SHIFT\t%s\t%s" % (pos[state.buf[0]], fpos[state.buf[0]])
+        elif t[0] == LEFTARC:
+            return "Left-Arc\t%s" % (self.invmappings['rel'][t[1]])
+        elif t[0] == RIGHTARC:
+            if fpos is None:
+                return "Right-Arc\t%s\t%s" % (self.invmappings['rel'][t[1]], pos[state.buf[0]])
+            else:
+                return "Right-Arc\t%s\t%s\t%s" % (self.invmappings['rel'][t[1]], pos[state.buf[0]], fpos[state.buf[0]])
+        elif t[0] == REDUCE:
+            return "Reduce"
+
+    @classmethod
+    def trans_from_line(self, line):
+        if line[0] == 'Left-Arc':
+            fields = { 'action':line[0], 'rel':line[1] }
+        elif line[0] == 'Right-Arc':
+            fields = { 'action':line[0], 'rel':line[1], 'pos':line[2] }
+            if len(line) > 3:
+                fields['fpos'] = line[3]
+        elif line[0] == 'SHIFT':
+            fields = { 'action':line[0], 'pos':line[1] }
+            if len(line) > 2:
+                fields['fpos'] = line[2]
+        elif line[0] == 'Reduce':
+            fields = { 'action':line[0] }
+        else:
+            raise ValueError(line[0])
+        return fields
+
+    def tuple_trans_to_int(self, cand, t):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        REDUCE = self.mappings['action']['Reduce']
+
+        RELS = len(self.mappings['rel'])
+
+        base = 0
+        if t[0] == SHIFT:
+            return base
+
+        base += 1
+
+        if t[0] == REDUCE:
+            return base
+
+        base += 1
+
+        if t[0] == LEFTARC:
+            return base + t[1]
+
+        base += RELS
+
+        if t[0] == RIGHTARC:
+            return base + t[1]
+
+    def tuple_trans_from_int(self, cand, action):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        REDUCE = self.mappings['action']['Reduce']
+        RELS = len(self.mappings['rel'])
+        rel = -1
+
+        base = 0
+        if action == base:
+            a = (SHIFT,)
+        base += 1
+
+        if action == base:
+            a = (REDUCE,)
+        base += 1
+
+        if base <= action < base + RELS:
+            a = (LEFTARC,)
+            rel = action - base
+        base += RELS
+
+        if base <= action < base + RELS:
+            a = (RIGHTARC,)
+            rel = action - base
+
+        return a, rel
+
+class ArcEagerShift(ArcEagerReduce):
+    def __init__(self, mappings_file):
+        self.mappings, self.invmappings = read_mappings(mappings_file, self.actions_list(), log=None)
+        self.name='AES'
+        
+    def goldtransition(self, parserstate, goldrels=None, return_tuple=False):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        REDUCE = self.mappings['action']['Reduce']
+
+        goldrels = goldrels or parserstate.goldrels
+        stack = parserstate.stack
+        buf = parserstate.buf
+        head = parserstate.head
+
+        POS = len(self.mappings['pos'])
+
+        j = buf[0]
+
+        has_right_children = False
+        for i in buf:
+            if i in goldrels[stack[0]]:
+                has_right_children = True
+                break
+
+        must_reduce = False
+        for i in stack:
+            if i in goldrels[j] or j in goldrels[i]:
+                must_reduce = True
+                break
+            if head[i][0] < 0:
+                break
+
+        if stack[0] in goldrels[j]:
+            rel = goldrels[j][stack[0]]
+            a = (LEFTARC, rel)
+
+            if return_tuple:
+                return a[0], buf[0], stack[0]
+
+        elif j in goldrels[stack[0]]:
+            rel = goldrels[stack[0]][j]
+            a = (RIGHTARC, rel)
+
+            if return_tuple:
+                return a[0], buf[0], stack[0]
+
+        elif not must_reduce or head[stack[0]][0] < 0 or has_right_children:
+            a = (SHIFT, -1)
+            if return_tuple:
+                #you can only be here if the comparisons failed (and of course someother things failed as well)
+                return a[0], buf[0], stack[0]
+        else:
+            a = (REDUCE, -1)
+            if return_tuple:
+                #you can only be here if the comparisons failed (and of course someother things failed as well)
+                return a[0], buf[0], stack[0]
+        return a
+
+class ArcStandard(TransitionSystem):
+    """Arc-standard transition system (SHIFT, Left-Arc, Right-Arc).
+
+    Both arc actions operate on the two topmost stack items. The class also
+    carries helpers used for decoding: an action vocabulary (`i2a`/`a2i`),
+    an initial state factory and batch index extraction.
+    """
+    def __init__(self, mappings_file):
+        """Loads the mappings and builds the action vocabulary.
+
+        `i2a` is the action list followed by 'BOS', 'EOS' and 'PAD'; `a2i`
+        is its inverse.
+        """
+        self.mappings, self.invmappings = read_mappings(mappings_file, self.actions_list(), log=None)
+        self.name='ASd'
+        self.num_actions = 3
+
+        self.i2a = self.actions_list()
+        self.i2a.extend(['BOS', 'EOS', 'PAD'])
+        self.a2i = {i:self.i2a.index(i) for i in self.i2a}
+    
+
+    def action_dists(self, p_shift, marginal_p_reduce):
+        """Combines a shift probability and a two-way split of the remaining
+        mass into log-probabilities over three actions.
+
+        The returned tensor has one extra trailing dimension of size 3:
+        log(p_shift), log(1-p_shift) + log(1-marginal_p_reduce) and
+        log(1-p_shift) + log(marginal_p_reduce).
+        NOTE(review): which of the last two slots is Left-Arc and which is
+        Right-Arc is not stated here; presumably the order follows
+        actions_list() — confirm.
+        """
+        p_reduce = (1-p_shift).unsqueeze(-1).log()+torch.concat((1-marginal_p_reduce.unsqueeze(-1), marginal_p_reduce.unsqueeze(-1)), -1).log()
+        dists = torch.cat(((p_shift).unsqueeze(-1).log(), p_reduce), -1)
+        return dists
+
+    def initial_state(self):
+        '''Returns the initial state for beam search parsing: a blank parser
+        state after one SHIFT, with the next index placed in the buffer.
+        '''
+        init_parserstate = ParserState_dec()
+
+        # Put index 1 in the buffer and shift it onto the stack.
+        init_parserstate.buf = [init_parserstate.num_shifts+1]
+        self._preparetransitionset(init_parserstate)
+        self.advance(init_parserstate, self.a2i['SHIFT'])
+        init_parserstate.action_log_probs.append(0)
+        init_parserstate.action_tuples = [[self.a2i['SHIFT'], -1, -1]]
+
+        # advance() incremented num_shifts, so this queues the following index.
+        init_parserstate.buf = [init_parserstate.num_shifts+1]
+        self._preparetransitionset(init_parserstate)
+        return init_parserstate
+    
+    def targets_idxs(self, batch):
+        '''Extracts index arrays for the gold actions in a batch.
+
+        `batch['gold_tuples']` is cloned and its last dimension is rolled by
+        one, so the former last column moves to column 0, where it is
+        overwritten with the batch index. Rows whose column 3 is -1 are
+        dropped.
+        (assumes each gold tuple is (action, idx1, idx2, -1), as built by
+        ParserState_dec.to_batch — confirm)
+
+        Returns:
+          Two numpy arrays: a 3 x K array of
+              [[index in batch],
+               [index of first embedding],
+               [index of second embedding]]
+          and a length-K array with the index of the target action.
+        '''
+
+        tuples = batch['gold_tuples'].clone()
+
+        tuples = tuples.roll(1, -1)
+        tuples[:,:,0] = torch.arange(tuples.shape[0]).unsqueeze(1).repeat(1,tuples.shape[1])
+
+        # Keep only the rows whose column 3 is set.
+        vector_comparisons = tuples[:,:,3] != -1
+        
+        # Reorder the columns so the former column 1 ends up last, then
+        # transpose to get one row per field.
+        oracle_action_idxs = tuples[vector_comparisons][:,[0,2,3,1]].transpose(1,0).cpu().numpy()
+
+        return oracle_action_idxs[:-1], oracle_action_idxs[-1]
+
+    @classmethod
+    def actions_list(self):
+        # Returns a fresh list on every call; __init__ extends the result in place.
+        return ['SHIFT', 'Left-Arc', 'Right-Arc']
+
+    def _preparetransitionset(self, parserstate):
+        """Stores the candidate transitions (1-tuples of action ids) on the state.
+
+        SHIFT needs a non-empty buffer, Left-Arc more than two stack items
+        and Right-Arc more than one.
+        """
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        stack, buf, head = parserstate.stack, parserstate.buf, parserstate.head
+
+        t = []
+
+        if len(buf) > 0:
+            t += [(SHIFT,)]
+
+        if len(stack) > 2:
+            t += [(LEFTARC,)]
+
+        if len(stack) > 1:
+            t += [(RIGHTARC,)]
+
+        parserstate._transitionset = t
+
+    def advance(self, parserstate, action):
+        """Applies `action` to `parserstate` in place, then refreshes its
+        transition set.
+
+        `action` is either an int (decoded by `tuple_trans_from_int`) or a
+        sequence (action_id, relation).
+        """
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        RELS = len(self.mappings['rel'])
+        cand = parserstate.transitionset()
+
+        if isinstance(action, int):
+            a, rel = self.tuple_trans_from_int(cand, action)
+        else:
+            rel = action[-1]
+            a = action[:-1]
+
+        stack = parserstate.stack
+        buf = parserstate.buf
+
+        if a[0] == SHIFT:
+            parserstate.stack = [buf[0]] + stack
+            #new
+            # A shifted word starts headless and is counted in num_shifts
+            # (the state must provide that attribute).
+            parserstate.head[buf[0]] = [-1, -1]
+            parserstate.num_shifts += 1
+            #
+            parserstate.buf = buf[1:]
+        elif a[0] == LEFTARC:
+            # Second stack item becomes a dependent of the top and is removed.
+            parserstate.head[stack[1]] = [stack[0], rel]
+            parserstate.stack = [stack[0]] + stack[2:]
+        elif a[0] == RIGHTARC:
+            # Top stack item becomes a dependent of the second and is popped.
+            parserstate.head[stack[0]] = [stack[1], rel]
+            parserstate.stack = stack[1:]
+
+        self._preparetransitionset(parserstate)
+
+    def goldtransition(self, parserstate, goldrels=None, return_tuple=False):
+        """Returns the gold transition for the current state.
+
+        Args:
+          parserstate: current state.
+          goldrels: mapping head -> {dependent: relation}; falls back to
+              `parserstate.goldrels` when falsy.
+          return_tuple: when True, return (action_id, stack[0], stack[1])
+              instead of (action_id, relation). For a SHIFT taken with
+              fewer than two stack items the triple is (action_id, -1, -1).
+        """
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        goldrels = goldrels or parserstate.goldrels
+        stack = parserstate.stack
+        buf = parserstate.buf
+        head = parserstate.head
+
+        # Not used below; the lookup still requires a 'pos' entry in the mappings.
+        POS = len(self.mappings['pos'])
+
+        #this is a double check to make sure we dont reduce node that still have children in the future
+        #hopefully this is just a hack and we dont need it    
+        stack0_done = True
+        for x in buf:
+            if x in goldrels[stack[0]]:
+                stack0_done = False
+                break
+
+        # Left-Arc: the second stack item is a gold dependent of the top.
+        if len(stack) > 2 and stack[1] in goldrels[stack[0]]:
+            rel = goldrels[stack[0]][stack[1]]
+            a = (LEFTARC, rel)
+
+            if return_tuple:
+                return a[0], stack[0], stack[1]
+
+        # Right-Arc: the top is a gold dependent of the second item and has
+        # no gold dependents left in the buffer.
+        elif len(stack) > 1 and stack[0] in goldrels[stack[1]] and stack0_done:
+            rel = goldrels[stack[1]][stack[0]]
+            a = (RIGHTARC, rel)
+
+            if return_tuple:
+                return a[0], stack[0], stack[1]
+                # return a[0], stack[1], stack[0]
+        else:
+            a = (SHIFT, -1)
+
+            if return_tuple:
+                #look at the non distance comparison triggers if neither ('or' statement) triggered it means its the distance comparisons fault
+                if not len(stack) > 1:
+                    return a[0], -1, -1
+
+                else: 
+                    return a[0], stack[0], stack[1]
+
+                    # if random.randint(0, 1):
+                    #     return a[0], stack[0], stack[1]
+                    # else:
+                    #     return a[0], stack[1], stack[0]
+                #means we didnt actually compare anything (one of the disqualifies triggered), doesnt matter what the stack looks like 
+
+        return a
+
+    def trans_to_str(self, t, state, pos, fpos=None):
+        """Formats transition `t` as a tab-separated line.
+
+        SHIFT emits the POS (and fine POS, if `fpos` is given) of the buffer
+        front. Returns None for an unknown action id.
+        """
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        if t[0] == SHIFT:
+            if fpos is None:
+                return "SHIFT\t%s" % (pos[state.buf[0]])
+            else:
+                # NOTE(review): the bare except swallows every error raised
+                # while formatting, so the method silently returns None.
+                try: 
+                    return "SHIFT\t%s\t%s" % (pos[state.buf[0]], fpos[state.buf[0]])
+                except: 
+                    None
+
+        elif t[0] == LEFTARC:
+            return "Left-Arc\t%s" % (self.invmappings['rel'][t[1]])
+        elif t[0] == RIGHTARC:
+            return "Right-Arc\t%s" % (self.invmappings['rel'][t[1]])
+            
+
+    @classmethod
+    def trans_from_line(self, line):
+        """Parses an already-split transition line into a dict of fields.
+
+        Raises:
+          ValueError: if the first field is not a known action name.
+        """
+        if line[0] == 'Left-Arc':
+            fields = { 'action':line[0], 'rel':line[1] }
+        elif line[0] == 'Right-Arc':
+            fields = { 'action':line[0], 'rel':line[1] }
+        elif line[0] == 'SHIFT':
+            fields = { 'action':line[0], 'pos':line[1] }
+            if len(line) > 2:
+                fields['fpos'] = line[2]
+        else:
+            raise ValueError(line[0])
+        return fields
+
+    def tuple_trans_to_int(self, cand, t):
+        """Encodes transition `t` as an int.
+
+        Layout: 0 = SHIFT, then RELS ids for Left-Arc, then RELS ids for
+        Right-Arc. `cand` is not used. Returns None for an unknown action id.
+        """
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        RELS = len(self.mappings['rel'])
+
+        base = 0
+        if t[0] == SHIFT:
+            return base
+
+        base += 1
+
+        if t[0] == LEFTARC:
+            return base + t[1]
+
+        base += RELS
+
+        if t[0] == RIGHTARC:
+            return base + t[1]
+
+    def tuple_trans_from_int(self, cand, action):
+        """Decodes an int produced by `tuple_trans_to_int`.
+
+        Returns:
+          ((action_id,), relation_id); the relation is -1 for SHIFT.
+
+        `a` is only assigned inside the range checks, so an `action`
+        outside 0 .. 2*RELS reaches the return with `a` unbound.
+        """
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+        RELS = len(self.mappings['rel'])
+        rel = -1
+
+        base = 0
+        if action == base:
+            a = (SHIFT,)
+        base += 1
+
+        if base <= action < base + RELS:
+            a = (LEFTARC,)
+            rel = action - base
+        base += RELS
+
+        if base <= action < base + RELS:
+            a = (RIGHTARC,)
+            rel = action - base
+
+        return a, rel
+
+class ArcHybrid(ArcStandard):
+    def __init__(self, mappings_file):
+        self.mappings, self.invmappings = read_mappings(mappings_file, self.actions_list(), log=None)
+        self.name='AH'
+        
+    def _preparetransitionset(self, parserstate):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        stack, buf, head = parserstate.stack, parserstate.buf, parserstate.head
+
+        t = []
+
+        if len(buf) > 0:
+            t += [(SHIFT,)]
+
+        if len(buf) > 0 and len(stack) > 1 and head[stack[0]][0] < 0:
+            t += [(LEFTARC,)]
+
+        if len(stack) > 1 and head[stack[0]][0] < 0:
+            t += [(RIGHTARC,)]
+
+        parserstate._transitionset = t
+
+    def advance(self, parserstate, action):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        RELS = len(self.mappings['rel'])
+        cand = parserstate.transitionset()
+
+        if isinstance(action, int):
+            a, rel = self.tuple_trans_from_int(cand, action)
+        else:
+            rel = action[-1]
+            a = action[:-1]
+
+        stack = parserstate.stack
+        buf = parserstate.buf
+
+        if a[0] == SHIFT:
+            parserstate.stack = [buf[0]] + stack
+            parserstate.buf = buf[1:]
+        elif a[0] == LEFTARC:
+            parserstate.head[stack[0]] = [buf[0], rel]
+            parserstate.stack = stack[1:]
+        elif a[0] == RIGHTARC:
+            parserstate.head[stack[0]] = [stack[1], rel]
+            parserstate.stack = stack[1:]
+
+        self._preparetransitionset(parserstate)
+
+    def goldtransition(self, parserstate, goldrels=None, return_tuple=False):
+        SHIFT = self.mappings['action']['SHIFT']
+        LEFTARC = self.mappings['action']['Left-Arc']
+        RIGHTARC = self.mappings['action']['Right-Arc']
+
+        goldrels = goldrels or parserstate.goldrels
+        stack = parserstate.stack
+        buf = parserstate.buf
+        head = parserstate.head
+
+        POS = len(self.mappings['pos'])
+
+        stack0_done = True
+        for x in buf:
+            if x in goldrels[stack[0]]:
+                stack0_done = False
+                break
+
+        if len(buf) > 0 and stack[0] in goldrels[buf[0]]:
+            rel = goldrels[buf[0]][stack[0]]
+            a = (LEFTARC, rel)
+
+            if return_tuple:
+                #for LEFTARC, only buf[0] and stack[0] are used
+                return a[0], buf[0], stack[0]
+
+        elif len(stack) > 1 and stack[0] in goldrels[stack[1]] and stack0_done:
+            rel = goldrels[stack[1]][stack[0]]
+            a = (RIGHTARC, rel)
+
+            if return_tuple:
+                #for RIGHTARC, only stack[0] and stack[1] are used
+                return a[0], stack[0], stack[1] 
+        else:
+            a = (SHIFT, -1)
+            if return_tuple:
+                if not (not (len(stack) > 1) or not (stack0_done)):
+                    #for SHIFT all three are used (implictly)
+                    return a[0], buf[0], stack[0], stack[1] 
+                else: 
+                #means we didnt actually compare anything (one of the disqualifies triggered), doesnt matter what the stack looks like
+                    return a[0], -1, -1, -1
+
+        return a
+
+def read_mappings(mappings_file, actions_list, log=None):
+    i = 0
+    res = dict()
+    res2 = dict()
+    with smart_open(mappings_file, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith("::"):
+                currentkey = line[2:]
+                res[currentkey] = dict()
+                res2[currentkey] = []
+                i = 0
+            else:
+                res[currentkey][line] = i
+                res2[currentkey] += [line]
+                i += 1
+
+    res['action'] = {k: i for i, k in enumerate(actions_list)}
+    res2['action'] = actions_list
+
+    return res, res2
+    
+def generate_continuous_mask(action_ids, token_pad):
+    mask = []
+    #i think we we missing the last embedding before
+    #shouuldnt be word indec should be number of words
+    wrd_indx = 1
+    for indx,i in enumerate(action_ids):
+        if i == 0: wrd_indx+=1
+        mask.append([1]*wrd_indx + [0]*(token_pad-wrd_indx))
+    return mask
\ No newline at end of file
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..1d1edab
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,533 @@
+from smart_open import smart_open
+import numpy as np
+import torch
+import random
+import torch.nn.functional as F
+from nltk import Tree
+from collections import defaultdict
+import h5py
+from tqdm import tqdm
+import json
+import torch.nn as nn
+from abc import ABC
+import os
+from queue import PriorityQueue
+import yaml
+import copy
+import shutil
+from itertools import count
+from transition import ArcSwift, ArcEagerReduce, ArcEagerShift, ArcStandard, ArcHybrid
+import transition
+import yaml
+import architectures
+from scipy.stats import spearmanr, pearsonr
+
+ignored_tags = ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"]  # same tag list the MST helpers in this module treat as punctuation
+# NOTE(review): MODEL_DATA is assigned a second time further down in this module; keep the two tables in sync.
+MODEL_DATA = {'gpt2': {'layer_count': 13, 'feature_count': 768},
+              'gpt2-medium': {'layer_count': 25, 'feature_count': 1024},
+              'gpt2-large': {'layer_count': 37, 'feature_count': 1280},
+              'gpt2-xl': {'layer_count': 49, 'feature_count': 1600},
+              'bert-base-cased': {'layer_count': 13, 'feature_count': 768},
+              'bert-large-cased': {'layer_count': 25, 'feature_count': 1024}}
+
+
+def generate_continuous_mask(action_ids, token_pad):
+    mask = []
+    #i think we we missing the last embedding before
+    #shouuldnt be word indec should be number of words
+    wrd_indx = 1
+    for indx,i in enumerate(action_ids):
+        if i == 0: wrd_indx+=1
+        mask.append([1]*wrd_indx + [0]*(token_pad-wrd_indx))
+    return mask
+
+
+def generate_lines_for_sent(lines):
+    '''Yields batches of lines describing a sentence in conllx.
+    Args:
+        lines: Each line of a conllx file.
+    Yields:
+        a list of lines describing a single sentence in conllx.
+    '''
+    buf = []
+    for line in lines:
+        if line.startswith('#'):
+            continue
+        if not line.strip():
+            if buf:
+                yield buf
+                buf = []
+            else:
+                continue
+        else:
+            buf.append(line.strip())
+    if buf:
+        yield buf
+
+def clean_number(w):
+    new_w = re.sub('[0-9]{1,}([,.]?[0-9]*)*', 'N', w)
+    return new_w
+
+def conv_padded_ngrams(probe_vocab, 
+                       action_ids, 
+                       action_ngram_pad=30, 
+                       token_pad=30, 
+                       pad_token = -1):
+    '''
+    Converts an unpadded array of action ids into a padded array of padded action ngrams.
+    input:
+      probe_vocab (dict): action -> id; 'SHIFT', 'PAD', 'BOS' and 'EOS' are read (presumably probe.a2i)
+      action_ids (array, cpu tensor): (len(action_ids),)
+      action_ngram_pad (int): width each ngram row is padded to, BOS and EOS included
+      token_pad (int): number of rows the result is padded to
+      pad_token (int): not read by the code below; all padding uses probe_vocab['PAD']
+    returns:
+      padded_ngrams (nparray): (token_pad x action_ngram_pad)
+    '''
+    #convert to numpy array so the boolean mask and np.split below can be used
+    arr_action_ids = np.array(action_ids)
+
+    #boolean array: is this action a shift?
+    shift_bin = (arr_action_ids == probe_vocab['SHIFT'])
+
+    #positions of the shifts, plus one extra boundary at len(arr_action_ids)
+    shift_ids = np.concatenate((np.nonzero(shift_bin)[0], [len(arr_action_ids)]))
+
+    #action ngrams: cut right after every shift (the shift stays in its ngram); the piece after the extra boundary is dropped
+    split_actions = np.split(arr_action_ids,shift_ids+1,0)[:-1]
+
+    #remove every PAD id from the last ngram
+    split_actions[-1] = split_actions[-1][np.where(split_actions[-1] != probe_vocab['PAD'])]
+
+    #wrap each ngram in BOS/EOS and right-pad it with PAD to action_ngram_pad (assumes len(ngram)+2 <= action_ngram_pad)
+    padded_ngrams = np.array([np.concatenate(([probe_vocab['BOS']], i,[probe_vocab['EOS']] ,[probe_vocab['PAD']]*(action_ngram_pad-len(i)-2))) for i in split_actions])
+
+    #append all-PAD rows until there are token_pad rows (assumes at most token_pad ngrams)
+    padded_ngrams = np.concatenate((padded_ngrams, np.zeros((token_pad-len(padded_ngrams), action_ngram_pad)) + probe_vocab['PAD']),0)
+    return padded_ngrams
+
+
+def update_log(s):
+    with open(args.logpath, 'a') as f:
+        f.write(s + '\n')
+
+def flatten_list(lst): return [j for sub in lst for j in sub]  # one flat list holding the items of every sub-iterable of lst, in order
+
+def head_indxs_to_states(head_indxs,oracle):
+    goldrels = [dict() for i in range(len(head_indxs)+1)]
+    for tok, head in enumerate(head_indxs): goldrels[head][tok+1] = -1
+
+    state = transition.ParserState_dec(["<ROOT>"] + head_indxs, transsys=oracle, goldrels=goldrels)
+    full_states = []#[state.clone()]
+    while len(state.transitionset()) > 0:
+
+        goldtransition =oracle.goldtransition(state)
+        state.action_tuples.append(list(oracle.goldtransition(state, return_tuple=True)))
+        oracle.advance(state, goldtransition)
+        full_states.append(state.clone())
+        
+    return full_states
+    
+def prune_queue(queue, k):
+    pruned_queue = PriorityQueue()
+    for i in range(k):
+        if queue.qsize():
+            g = queue.get() #g is a tuple (score, node)
+            pruned_queue.put(g) 
+    return pruned_queue
+
+def clean_dir(dir_path):
+    if os.path.exists(dir_path):
+        shutil.rmtree(dir_path, ignore_errors=True)
+
+def mkdir_ex(dir_path):
+    if not os.path.exists(dir_path):
+        os.mkdir(dir_path)
+
+def mkdir_p(dir):
+    '''make a directory (dir) if it doesn't exist'''
+    if not os.path.exists(dir):
+        os.mkdir(dir)
+        
+MODEL_DATA = {'gpt2': {'layer_count': 13, 'feature_count': 768},
+              'gpt2-medium': {'layer_count': 25, 'feature_count': 1024},
+              'gpt2-large': {'layer_count': 37, 'feature_count': 1280},
+              'gpt2-xl': {'layer_count': 49, 'feature_count': 1600},
+              'bert-base': {'layer_count': 13, 'feature_count': 768},
+              'bert-large': {'layer_count': 25, 'feature_count': 1024}}
+
+def oracle_lookup(k):
+    lookup = {"ASw": ArcSwift,
+              "AER": ArcEagerReduce,
+              "AES": ArcEagerShift,
+              "ASd": ArcStandard,
+              "AH" : ArcHybrid,}
+    return lookup[k]
+
+class obs(object):
+    def __init__(self, head_indices): self.head_indices = head_indices  # the only state this wrapper holds
+    def __getitem__(self,index): return self.head_indices  # NOTE(review): `index` is ignored; every subscript returns the full head_indices
+
+class UnionFind:
+  '''
+  Naive UnionFind implementation for (slow) Prim's MST algorithm
+  Used to compute minimum spanning trees for distance matrices
+  '''
+  def __init__(self, n):
+    self.parents = list(range(n))
+  def union(self, i,j):
+    if self.find(i) != self.find(j):
+      i_parent = self.find(i)
+      self.parents[i_parent] = j
+  def find(self, i):
+    i_parent = i
+    while True:
+      if i_parent != self.parents[i_parent]:
+        i_parent = self.parents[i_parent]
+      else:
+        break
+    return i_parent
+
+def prims_matrix_to_edges(matrix, poses):
+  '''
+  Constructs a minimum spanning tree from the pairwise weights in matrix;
+  returns the edges.
+  Never lets punctuation-tagged words be part of the tree.
+  '''
+  pairs_to_distances = {}
+  uf = UnionFind(len(matrix))
+  for i_index, line in enumerate(matrix):
+    for j_index, dist in enumerate(line):
+      if IDX2XPOS[poses[i_index].item()] in ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"]:
+        continue
+      if IDX2XPOS[poses[j_index].item()] in ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"]:
+        continue
+      pairs_to_distances[(i_index, j_index)] = dist
+  edges = []
+  for (i_index, j_index), distance in sorted(pairs_to_distances.items(), key = lambda x: x[1]):
+    if uf.find(i_index) != uf.find(j_index):
+      uf.union(i_index, j_index)
+      edges.append((i_index, j_index))
+  return edges
+
+def get_nopunct_argmin(prediction, poses):
+  '''
+  Gets the argmin of predictions, but filters out all punctuation-POS-tagged words
+  '''
+  puncts = ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"]
+  original_argmin = np.argmin(prediction)
+  for i in range(len(poses)):
+    argmin = np.argmin(prediction)
+    if IDX2XPOS[poses[argmin].item()] not in puncts:
+      return argmin
+    else:
+      prediction[argmin] = np.inf
+  return original_argmin
+
+def heads_to_displacy(sentence, heads):
+    displacy_format = {
+        "words": [
+            {"text": token, "tag": ' '} for token in sentence.split()
+        ],
+        "arcs": [
+            {"start": dep, "end": head[0], "label": ' ', "dir": "right"} if dep < head[0] else {"start": head[0], "end": dep, "label": ' ', "dir": "left"} for dep, head in heads.items() if head[0] != -1
+        ]
+    }
+    displacy_format["words"].insert(0, {"text": 'ROOT', "tag": ' '})
+    return displacy_format
+
+def load_lit_checkpoint(purpose,mod,probe_name,l):  # returns (config dict, probe), or (None, None) when no checkpoint file exists
+    if os.path.isfile(f"./experiment_checkpoints/{purpose}/{mod}/{probe_name}/layer_{str(l)}/checkpoints/last.ckpt"):  # only proceed when this layer has a last.ckpt
+        with open(f"./experiment_checkpoints/{purpose}/{mod}/{probe_name}/layer_{str(l)}/config.yaml", 'r') as file: l_args = yaml.safe_load(file)
+        l_args['probe_params']['pretrained_model'] = l_args['pretrained_model']  # copy the top-level setting into the probe parameters
+        p_ckpt = experiment.IncrementalParseProbeExperiment.load_from_checkpoint(f"./experiment_checkpoints/{purpose}/{mod}/{probe_name}/layer_{str(l)}/checkpoints/last.ckpt").probe  # NOTE(review): `experiment` is not imported anywhere in this file - confirm where the name comes from
+        p = getattr(architectures, l_args['probe_params']['probe_type'])(l_args['probe_params']).to('cuda')  # fresh probe of the configured type, moved to CUDA
+        p.load_state_dict(p_ckpt.state_dict())  # copy the checkpointed weights into the fresh probe
+        p.eval()
+        p.oracle = transition.ArcStandard(l_args['probe_params']['oracle_params']['mappings_file'])  # the oracle is always ArcStandard here, whatever the config names
+        return l_args, p
+    else: return None, None
+
+def berkeley_unk_conv(ws):
+  """This is a simplified version of unknown token conversion in BerkeleyParser.
+
+  The full version is berkely_unk_conv2.
+  """
+  uk = "unk"
+  sz = len(ws) - 1
+  if ws[0].isupper():
+    uk = "c" + uk
+  if ws[0].isdigit() and ws[sz].isdigit():
+    uk = uk + "n"
+  elif sz <= 2:
+    pass
+  elif ws[sz-2:sz+1] == "ing":
+    uk = uk + "ing"
+  elif ws[sz-1:sz+1] == "ed":
+    uk = uk + "ed"
+  elif ws[sz-1:sz+1] == "ly":
+    uk = uk + "ly"
+  elif ws[sz] == "s":
+    uk = uk + "s"
+  elif ws[sz-2:sz+1] == "est":
+    uk = uk + "est"
+  elif ws[sz-1:sz+1] == "er":
+    uk = uk + 'ER'
+  elif ws[sz-2:sz+1] == "ion":
+    uk = uk + "ion"
+  elif ws[sz-2:sz+1] == "ory":
+    uk = uk + "ory"
+  elif ws[0:2] == "un":
+    uk = "un" + uk
+  elif ws[sz-1:sz+1] == "al":
+    uk = uk + "al"
+  else:
+    for i in range(sz):
+      if ws[i] == '-':
+        uk = uk + "-"
+        break
+      elif ws[i] == '.':
+        uk = uk + "."
+        break
+  return "<" + uk + ">"
+
+def berkeley_unk_conv2(token):
+  numCaps = 0
+  hasDigit = False
+  hasDash = False
+  hasLower = False
+  for char in token:
+    if char.isdigit():
+      hasDigit = True
+    elif char == '-':
+      hasDash = True
+    elif char.isalpha():
+      if char.islower():
+        hasLower = True
+      elif char.isupper():
+        numCaps += 1
+  result = 'UNK'
+  lower = token.rstrip().lower()
+  ch0 = token.rstrip()[0]
+  if ch0.isupper():
+    if numCaps == 1:
+      result = result + '-INITC'
+      # Remove this because it relies on a vocabulary, not given to this funciton (HN).
+      # if lower in words_dict:
+      #   result = result + '-KNOWNLC'
+    else:
+      result = result + '-CAPS'
+  elif not(ch0.isalpha()) and numCaps > 0:
+    result = result + '-CAPS'
+  elif hasLower:
+    result = result + '-LC'
+  if hasDigit:
+    result = result + '-NUM'
+  if hasDash:
+    result = result + '-DASH'
+  if lower[-1] == 's' and len(lower) >= 3:
+    ch2 = lower[-2]
+    if not(ch2 == 's') and not(ch2 == 'i') and not(ch2 == 'u'):
+      result = result + '-s'
+  elif len(lower) >= 5 and not(hasDash) and not(hasDigit and numCaps > 0):
+    if lower[-2:] == 'ed':
+      result = result + '-ed'
+    elif lower[-3:] == 'ing':
+      result = result + '-ing'
+    elif lower[-3:] == 'ion':
+      result = result + '-ion'
+    elif lower[-2:] == 'er':
+      result = result + '-er'
+    elif lower[-3:] == 'est':
+      result = result + '-est'
+    elif lower[-2:] == 'ly':
+      result = result + '-ly'
+    elif lower[-3:] == 'ity':
+      result = result + '-ity'
+    elif lower[-1] == 'y':
+      result = result + '-y'
+    elif lower[-2:] == 'al':
+      result = result + '-al'
+  return result
+
+import logging
+import os.path as op
+from smart_open import smart_open
+# import cPickle as pickle
+import pickle
+from transition import ArcSwift, ArcEagerReduce, ArcEagerShift, ArcStandard, ArcHybrid
+import numpy as np
+
+from copy import copy
+
+class ParserState:
+    def __init__(self, sentence, transsys=None, goldrels=None):
+#         print(sentence)
+        self.stack = [0]
+        # sentences should already have a <ROOT> symbol as the first token
+#         print([i+1 for i in range(len(sentence)-1)])
+        self.buf = [i+1 for i in range(len(sentence)-1)]
+        # head and relation labels
+        self.head = [[-1, -1] for _ in range(len(sentence))]
+
+        self.pos = [-1 for _ in range(len(sentence))]
+
+        self.goldrels = goldrels
+
+        self.transsys = transsys
+        if self.transsys is not None:
+            self.transsys._preparetransitionset(self)
+
+    def transitionset(self):
+        return self._transitionset
+
+    def clone(self):
+        res = ParserState([])
+        res.stack = copy(self.stack)
+        res.buf = copy(self.buf)
+        res.head = copy(self.head)
+        res.pos = copy(self.pos)
+        res.goldrels = copy(self.goldrels)
+        res.transsys = self.transsys
+        if hasattr(self, '_transitionset'):
+            res._transitionset = copy(self._transitionset)
+        return res
+
+
+transition_dims = ['action', 'n', 'rel', 'pos', 'fpos']  # field order of the transition vectors built in process_example
+transition_pos = {v:i for i, v in enumerate(transition_dims)}  # field name -> its position in transition_dims
+floatX = np.float32  # NOTE(review): not referenced anywhere else in this file
+
+def transsys_lookup(k):
+    lookup = {"ASw": ArcSwift,
+              "AER": ArcEagerReduce,
+              "AES": ArcEagerShift,
+              "ASd": ArcStandard,
+              "AH" : ArcHybrid,}
+    return lookup[k]
+
+def process_example(conll_lines, seq_lines, vocab, mappings, transsys, fpos=False, log=None):
+    if fpos:
+        res = [[] for _ in range(4)]
+    else:
+        res = [[] for _ in range(3)]
+    res[0] = [vocab[u'<ROOT>']] + [vocab[u'<UNK>'] if line.split()[1] not in vocab else vocab[line.split()[1]] for line in conll_lines]
+    for line in seq_lines:
+        line = line.split()
+        try:
+            fields = transsys.trans_from_line(line)
+        except ValueError as e:
+            log.error('Encountered unknown transition type "%s" in sequences file, ignoring...' % (str(e)))
+            return None
+
+        vector_form = []
+        for k in transition_dims:
+            if k in fields:
+                if k in mappings:
+                    fields[k] = mappings[k][fields[k]]
+                vector_form += [fields[k]]
+            else:
+                vector_form += [-1] # this should never be used
+
+        res[1] += [vector_form]
+
+    # gold POS
+    res[2] = [len(mappings['pos'])] + [mappings['pos'][line.split()[3]] for line in conll_lines]
+    if fpos:
+        # fine-grained POS
+        res[3] = [len(mappings['fpos'])] + [mappings['fpos'][line.split()[4]] for line in conll_lines]
+
+    return tuple(res)
+
+def read_mappings(mappings_file, transsys, log=None):
+    i = 0
+    res = dict()
+    res2 = dict()
+    with smart_open(mappings_file, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith("::"):
+                currentkey = line[2:]
+                res[currentkey] = dict()
+                res2[currentkey] = []
+                i = 0
+            else:
+                res[currentkey][line] = i
+                res2[currentkey] += [line]
+                i += 1
+
+    res['action'] = {k: i for i, k in enumerate(transsys.actions_list())}
+    res2['action'] = transsys.actions_list()
+
+    return res, res2
+
+def read_gold_parserstates(fin, transsys, fpos=False):
+    def processlines(lines):
+        arcs = [dict() for i in range(len(lines)+1)]
+
+        pos = ["" for i in range(len(lines)+1)]
+        fpos = ["" for i in range(len(lines)+1)]
+
+        for i, line in enumerate(lines):
+            pos[i+1] = line[3] # fine-grained
+            fpos[i+1] = line[4]
+            parent = int(line[6])
+            relation = line[7]
+            arcs[parent][i+1] = transsys.mappings['rel'][relation]
+#         print(ParserState(["<ROOT>"] + lines))
+
+        res = [ParserState(["<ROOT>"] + lines, transsys=transsys, goldrels=arcs), pos]
+        if fpos:
+            res += [fpos]
+        else:
+            res == [None]
+        return res
+    res = []
+
+    lines = []
+    line = fin.readline()#.decode('utf-8')
+    while line:
+        line = line.strip().split()
+
+        if len(line) == 0:
+            res += [processlines(lines)]
+            
+            lines = []
+        else:
+            lines += [line]
+
+        line = fin.readline()#.decode('utf-8')
+
+    if len(lines) > 0:
+        res += [processlines(lines)]
+#         print(res[0][0].buf)
+
+    return res
+
+def write_gold_trans(tpl, fout):
+    state, pos, fpos = tpl
+    transsys = state.transsys
+    while len(state.transitionset()) > 0:
+        t = transsys.goldtransition(state)
+
+        fout.write("%s\n" % transsys.trans_to_str(t, state, pos, fpos))
+
+        transsys.advance(state, t)
+
+    fout.write("\n")
+
+def multi_argmin(lst):
+    minval = 1e10
+    res = []
+    for i, v in enumerate(lst):
+        if v < minval:
+            minval = v
+            res = [i]
+        elif v == minval:
+            res += [i]
+
+    return res
+
+XPOS2IDX = {'$': 0,'PRP$': 1,'VBZ': 2,'CD': 3,'JJS': 4,'VBG': 5,'IN': 6,'VB': 7,',': 8,'RB': 9,'JJ': 10,'LS': 11,'TO': 12,'UH': 13,'EX': 14,'``': 15,'SYM': 16,'NNP': 17,'WP': 18,'.': 19,"''": 20,'VBP': 21,'WP$': 22,'-RRB-': 23,'-LRB-': 24,'PDT': 25,'PRP': 26,'NNS': 27,':': 28,'WDT': 29,'POS': 30,'MD': 31,'RBS': 32,'RP': 33,'VBN': 34,'CC': 35,'NNPS': 36,'JJR': 37,'RBR': 38,'DT': 39,'WRB': 40,'NN': 41,'FW': 42,'VBD': 43,'#': 44}  # POS tag string -> integer id
+IDX2XPOS = {v: k for k, v in XPOS2IDX.items()}  # inverse table: integer id -> POS tag string
\ No newline at end of file