diff --git a/configs/train_decoder_config.test.json b/configs/train_decoder_config.test.json
index 101846e0..dfc09735 100644
--- a/configs/train_decoder_config.test.json
+++ b/configs/train_decoder_config.test.json
@@ -9,7 +9,7 @@
                 "dim_mults": [1, 2, 4, 8],
                 "attn_dim_head": 16,
                 "attn_heads": 4,
-                "self_attn": [false, true, true, true]
+                "self_attn": [false, true, true, true]
             }
         ],
         "clip": {
diff --git a/dalle2_pytorch/train_configs.py b/dalle2_pytorch/train_configs.py
index cecd8c7e..0dcb50cc 100644
--- a/dalle2_pytorch/train_configs.py
+++ b/dalle2_pytorch/train_configs.py
@@ -233,7 +233,7 @@ class UnetConfig(BaseModel):
     cond_on_text_encodings: Optional[bool] = None
     cond_dim: Optional[int] = None
     channels: int = 3
-    self_attn: ListOrTuple[int]
+    self_attn: ListOrTuple[bool]
     attn_dim_head: int = 32
     attn_heads: int = 16
     init_cross_embed: bool = True
@@ -245,7 +245,7 @@ class DecoderConfig(BaseModel):
     unets: ListOrTuple[UnetConfig]
     image_size: Optional[int] = None
     image_sizes: ListOrTuple[int] = None
-    clip: Optional[AdapterConfig] # The clip model to use if embeddings are not provided
+    clip: Optional[AdapterConfig] = None # The clip model to use if embeddings are not provided
     channels: int = 3
     timesteps: int = 1000
     sample_timesteps: Optional[SingularOrIterable[Optional[int]]] = None
diff --git a/dalle2_pytorch/version.py b/dalle2_pytorch/version.py
index 87046e54..23b53d6f 100644
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '1.15.4'
+__version__ = '1.15.5'
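
For reference, a minimal sketch (not part of the patch) of why the `self_attn` annotation is tightened from `ListOrTuple[int]` to `ListOrTuple[bool]`: the test config supplies JSON booleans, and the boolean annotation lets them round-trip unchanged. `UnetConfigSketch` is a hypothetical stand-in for the real `UnetConfig`, reduced to one field, with `ListOrTuple` expanded to a plain `Union` so the snippet is self-contained; it assumes a standard pydantic install.

    # Sketch only -- illustrates the annotation change, not the repo's actual class.
    from typing import List, Tuple, Union
    from pydantic import BaseModel

    class UnetConfigSketch(BaseModel):
        # With List[int], pydantic's default (lax) coercion would typically turn the
        # JSON booleans into 0/1; with List[bool] the values are kept as booleans.
        self_attn: Union[List[bool], Tuple[bool, ...]]

    cfg = UnetConfigSketch(self_attn=[False, True, True, True])
    print(cfg.self_attn)  # [False, True, True, True]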