
update maze-dataset dep and poetry lockfile #213

Merged
merged 12 commits on May 14, 2024
2 changes: 1 addition & 1 deletion maze_transformer/evaluation/baseline_models.py
@@ -15,8 +15,8 @@
     get_origin_tokens,
     get_path_tokens,
     get_target_tokens,
-    strings_to_coords,
 )
+from maze_dataset.tokenization.util import strings_to_coords
 from transformer_lens import HookedTransformer
 
 from maze_transformer.training.config import ConfigHolder
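The next several files make the same one-line change: maze-dataset 0.5.x exposes the string/coordinate helpers from maze_dataset.tokenization.util rather than tokenization.token_utils. A minimal sketch of the relocated import, assuming the when_noncoord keyword of strings_to_coords behaves as in the released 0.5.x package:

    # Sketch only: illustrates the relocated import used throughout this PR.
    # The when_noncoord keyword is assumed from maze-dataset 0.5.x.
    from maze_dataset.tokenization.util import strings_to_coords

    tokens = ["<PATH_START>", "(0,0)", "(0,1)", "<PATH_END>"]
    # Special (non-coordinate) tokens are skipped rather than raising an error.
    coords = strings_to_coords(tokens, when_noncoord="skip")
    print(coords)  # expected: coordinate pairs for (0,0) and (0,1)
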
6 changes: 3 additions & 3 deletions maze_transformer/evaluation/eval_model.py
@@ -16,12 +16,12 @@
 )
 from maze_dataset.tokenization import MazeTokenizer
 from maze_dataset.tokenization.token_utils import (
-    WhenMissing,
     get_context_tokens,
     get_path_tokens,
     remove_padding_from_token_str,
-    strings_to_coords,
 )
+from maze_dataset.tokenization.util import strings_to_coords
+from maze_dataset.utils import WhenMissing
 
 # muutils
 from muutils.mlutils import chunks
@@ -143,7 +143,7 @@ def predict_maze_paths(
         smart_max_new_tokens
     ), "if max_new_tokens is None, smart_max_new_tokens must be True"
 
-    maze_tokenizer: MazeTokenizer = model.config.maze_tokenizer
+    maze_tokenizer: MazeTokenizer = model.tokenizer._maze_tokenizer
 
     contexts_lists: list[list[str]] = [
         get_context_tokens(tokens) for tokens in tokens_batch
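The second hunk above also changes where the MazeTokenizer is read from: off the model's tokenizer wrapper rather than the ConfigHolder. A hedged sketch of that access pattern with a fallback for older checkpoints; the get_maze_tokenizer helper is hypothetical, and the private _maze_tokenizer attribute is taken from the diff itself:

    # Sketch only: not part of the PR. `model` is assumed to be a trained
    # maze-transformer model (e.g. a ZanjHookedTransformer) whose tokenizer
    # wraps the underlying MazeTokenizer.
    from maze_dataset.tokenization import MazeTokenizer

    def get_maze_tokenizer(model) -> MazeTokenizer:
        # New access path used in this PR: the tokenizer wrapper holds the
        # MazeTokenizer in a private attribute.
        maze_tok = getattr(model.tokenizer, "_maze_tokenizer", None)
        if maze_tok is None:
            # Older access path that this PR replaces.
            maze_tok = model.config.maze_tokenizer
        return maze_tok
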
2 changes: 1 addition & 1 deletion maze_transformer/evaluation/plotting.py
@@ -32,7 +32,7 @@ def plot_predicted_paths(
     if n_mazes is None:
         n_mazes = len(dataset)
 
-    dataset_tokens = dataset.as_tokens(model.config.maze_tokenizer)[:n_mazes]
+    dataset_tokens = dataset.as_tokens(model.tokenizer._maze_tokenizer)[:n_mazes]
 
     # predict
     predictions: list[list[str | tuple[int, int]]] = predict_maze_paths(
2 changes: 1 addition & 1 deletion maze_transformer/mechinterp/plot_attention.py
@@ -19,7 +19,7 @@
 from maze_dataset.plotting.plot_tokens import plot_colored_text
 from maze_dataset.plotting.print_tokens import color_tokens_cmap
 from maze_dataset.tokenization import MazeTokenizer
-from maze_dataset.tokenization.token_utils import coord_str_to_tuple_noneable
+from maze_dataset.tokenization.util import coord_str_to_tuple_noneable
 
 # Utilities
 from muutils.json_serialize import SerializableDataclass, serializable_dataclass
2 changes: 1 addition & 1 deletion maze_transformer/mechinterp/residual_stream_structure.py
@@ -12,7 +12,7 @@
 # maze_dataset
 from maze_dataset.constants import _SPECIAL_TOKENS_ABBREVIATIONS
 from maze_dataset.tokenization import MazeTokenizer
-from maze_dataset.tokenization.token_utils import strings_to_coords
+from maze_dataset.tokenization.util import strings_to_coords
 
 # scipy
 from scipy.spatial.distance import pdist, squareform
26 changes: 23 additions & 3 deletions maze_transformer/training/train_model.py
@@ -1,5 +1,6 @@
 import json
 import typing
+import warnings
 from pathlib import Path
 from typing import Union
 
@@ -122,9 +123,28 @@ def train_model(
f"passed dataset has different config than cfg.dataset_cfg, but allow_dataset_override is True, so using passed dataset"
)
else:
raise ValueError(
f"dataset has different config than cfg.dataset_cfg, and allow_dataset_override is False"
)
datasets_cfg_diff: dict = dataset.cfg.diff(cfg.dataset_cfg)
if datasets_cfg_diff == {
"applied_filters": {
"self": [
{
"name": "collect_generation_meta",
"args": (),
"kwargs": {},
}
],
"other": [],
}
}:
warnings.warn(
f"dataset has different config than cfg.dataset_cfg, but the only difference is in applied_filters, so using passed dataset. This is due to fast dataset loading collecting generation metadata for performance reasons"
)

else:
raise ValueError(
f"dataset has different config than cfg.dataset_cfg, and allow_dataset_override is False",
f"{datasets_cfg_diff = }",
)

logger.progress(f"finished getting training dataset with {len(dataset)} samples")
# validation dataset, if applicable
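The new branch tolerates exactly one benign difference between the passed dataset's config and cfg.dataset_cfg: the collect_generation_meta filter that fast dataset loading records. A small sketch of the same decision pulled into a helper, assuming the config's .diff() returns a nested dict shaped like the comparison literal in the hunk above (the check_dataset_cfg name is hypothetical):

    import warnings

    # The only difference that fast dataset loading is expected to introduce,
    # copied from the comparison literal in the hunk above.
    _BENIGN_DIFF = {
        "applied_filters": {
            "self": [{"name": "collect_generation_meta", "args": (), "kwargs": {}}],
            "other": [],
        }
    }

    def check_dataset_cfg(dataset_cfg, expected_cfg) -> None:
        """Warn on the known filter-only diff, raise on any other mismatch."""
        cfg_diff: dict = dataset_cfg.diff(expected_cfg)
        if not cfg_diff:
            return  # configs match exactly
        if cfg_diff == _BENIGN_DIFF:
            warnings.warn(
                "dataset config differs only in applied_filters "
                "(collect_generation_meta); using the passed dataset"
            )
        else:
            raise ValueError(f"dataset config mismatch: {cfg_diff = }")
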
16 changes: 8 additions & 8 deletions notebooks/residual_stream_decoding.ipynb

Large diffs are not rendered by default.

197 changes: 87 additions & 110 deletions notebooks/train_model.ipynb

Large diffs are not rendered by default.

2,724 changes: 1,367 additions & 1,357 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions pyproject.toml
@@ -10,15 +10,16 @@ repository = "https://github.com/understanding-search/maze-transformer"
 [tool.poetry.dependencies]
 python = ">=3.10,<3.13"
 # dataset
-maze-dataset = "^0.4.5"
+maze-dataset = "^0.5.2"
 # transformers
 torch = ">=1.13.1"
-transformer-lens = "1.14.0"
+transformer-lens = "^1.14.0"
 transformers = ">=4.34" # Dependency in transformer-lens 1.14.0
 # utils
 muutils = "^0.5.5"
 zanj = "^0.2.0"
-wandb = "^0.13.5" # note: TransformerLens forces us to use 0.13.5
+# wandb = "^0.13.5" # note: TransformerLens forces us to use 0.13.5
+wandb = "^0.17.0"
 fire = "^0.5.0"
 typing-extensions = "^4.8.0"
 # plotting
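Since maze-dataset is still a 0.x release, Poetry's caret constraint pins the minor version: ^0.5.2 resolves to >=0.5.2,<0.6.0. A quick sanity check (not part of the PR) that an environment satisfies the new floor, using only the standard library and assuming a plain MAJOR.MINOR.PATCH version string:

    # Sketch only: verify the installed maze-dataset against ^0.5.2
    # (>=0.5.2,<0.6.0 under Poetry's caret rules for 0.x versions).
    from importlib.metadata import version

    installed = version("maze-dataset")
    major, minor, patch, *_ = (int(part) for part in installed.split("."))
    assert (major, minor) == (0, 5) and patch >= 2, (
        f"maze-dataset {installed} does not satisfy ^0.5.2"
    )
    print(f"maze-dataset {installed} satisfies ^0.5.2")
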