Fix how the denominator is computed in fast autoregressive transformer
maciejwolczyk committed Jul 4, 2024
1 parent d3ed90f commit a70ceeb
Showing 1 changed file with 1 addition and 1 deletion.
popgym/baselines/models/linear_attention.py (1 addition, 1 deletion)

@@ -90,7 +90,7 @@ def forward(
  # numerator = Q^T S
  numerator = torch.einsum("bti, btil -> btl", Q, S)
  # denominator = Q^T Z
- denominator = torch.einsum("bti, btl -> bt", Q, Z).reshape(B, T, 1) + 1e-5
+ denominator = torch.einsum("bti, bti -> bt", Q, Z).reshape(B, T, 1) + 1e-5
  # output = (Q^T S) / (Q^T Z)
  output = numerator / denominator

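Why the one-character index change matters: in the old spec "bti, btl -> bt", i and l are independent summation indices, so torch.einsum sums over each one separately and the result factorizes into (sum_i Q_i) * (sum_l Z_l) per batch and timestep. The intended denominator Q^T Z is the dot product sum_i Q_i Z_i, which the shared index in "bti, bti -> bt" produces. A minimal sketch of the difference (the shapes and random tensors below are illustrative assumptions, not code from the repository):

import torch

B, T, I = 2, 5, 8  # hypothetical batch, time, and feature sizes
Q = torch.randn(B, T, I)
Z = torch.randn(B, T, I)

# Old spec: "i" and "l" are separate dummy indices, so the einsum sums
# over both independently; the result factorizes into a product of sums.
buggy = torch.einsum("bti, btl -> bt", Q, Z)
assert torch.allclose(buggy, Q.sum(dim=-1) * Z.sum(dim=-1), atol=1e-5)

# New spec: the shared index "i" yields the intended dot product Q^T Z.
fixed = torch.einsum("bti, bti -> bt", Q, Z)
assert torch.allclose(fixed, (Q * Z).sum(dim=-1), atol=1e-5)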
