diff --git a/README.md b/README.md
index 73fc4c2..19a0a6c 100644
--- a/README.md
+++ b/README.md
@@ -697,4 +697,24 @@ Once built, images will be saved to the same directory the command is invoked
 }
 ```
 
+```bibtex
+@article{Arar2021LearnedQF,
+    title   = {Learned Queries for Efficient Local Attention},
+    author  = {Moab Arar and Ariel Shamir and Amit H. Bermano},
+    journal = {ArXiv},
+    year    = {2021},
+    volume  = {abs/2112.11435}
+}
+```
+
+```bibtex
+@article{Yu2021VectorquantizedIM,
+    title   = {Vector-quantized Image Modeling with Improved VQGAN},
+    author  = {Jiahui Yu and Xin Li and Jing Yu Koh and Han Zhang and Ruoming Pang and James Qin and Alexander Ku and Yuanzhong Xu and Jason Baldridge and Yonghui Wu},
+    journal = {ArXiv},
+    year    = {2021},
+    volume  = {abs/2110.04627}
+}
+```
+
 *Creating noise from data is easy; creating data from noise is generative modeling.* - Yang Song's paper
diff --git a/dalle2_pytorch/attention.py b/dalle2_pytorch/attention.py
index d01d294..7b6145d 100644
--- a/dalle2_pytorch/attention.py
+++ b/dalle2_pytorch/attention.py
@@ -44,7 +44,12 @@ class QueryAndAttend(nn.Module):
         self.queries = nn.Parameter(torch.randn(heads, num_queries, dim_head))
         self.to_kv = nn.Conv2d(dim, dim_head * 2, 1, bias = False)
-        self.to_out = nn.Conv2d(inner_dim, dim, 1, bias = False)
+
+        self.to_out = nn.Sequential(
+            nn.Conv2d(inner_dim, dim * 2, 1, bias = False),
+            nn.Tanh(),
+            nn.Conv2d(dim * 2, dim, 1, bias = False)
+        )
 
     def forward(self, x):
         """
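
Below is a minimal, self-contained sketch (not the repository's actual implementation) of how a `QueryAndAttend`-style module with the new two-layer `to_out` projection might fit together: learned per-head queries attend over keys/values produced by a pointwise convolution, and the attention output is mapped back to the input dimension through the added Conv2d → Tanh → Conv2d head. Only `queries`, `to_kv`, and `to_out` follow the diff; the class name, the default hyperparameters, and the mean-and-broadcast readout at the end are assumptions for illustration.

```python
import torch
from torch import nn
from einops import rearrange

class QueryAndAttendSketch(nn.Module):
    # Hypothetical sketch of a learned-query attention block; only the three
    # parameterized submodules below mirror the diff, the rest is assumed.
    def __init__(self, dim, num_queries = 16, heads = 8, dim_head = 64):
        super().__init__()
        inner_dim = heads * dim_head
        self.scale = dim_head ** -0.5

        # learned queries, shared across the batch and spatial positions
        self.queries = nn.Parameter(torch.randn(heads, num_queries, dim_head))
        # keys and values come from a pointwise conv over the feature map
        self.to_kv = nn.Conv2d(dim, dim_head * 2, 1, bias = False)

        # the change introduced by the diff: a small two-layer projection
        # with a Tanh nonlinearity instead of a single 1x1 conv
        self.to_out = nn.Sequential(
            nn.Conv2d(inner_dim, dim * 2, 1, bias = False),
            nn.Tanh(),
            nn.Conv2d(dim * 2, dim, 1, bias = False)
        )

    def forward(self, x):
        b, _, h, w = x.shape

        # project the feature map to keys and values, flatten spatial dims
        k, v = self.to_kv(x).chunk(2, dim = 1)
        k = rearrange(k, 'b d h w -> b (h w) d')
        v = rearrange(v, 'b d h w -> b (h w) d')

        # similarity between the learned queries and every spatial position
        sim = torch.einsum('h n d, b m d -> b h n m', self.queries, k) * self.scale
        attn = sim.softmax(dim = -1)

        # aggregate values per query: (batch, heads, num_queries, dim_head)
        out = torch.einsum('b h n m, b m d -> b h n d', attn, v)

        # hypothetical readout: average over the queries and broadcast back
        # to the spatial grid so the 1x1 conv projection can be applied
        out = out.mean(dim = 2)
        out = rearrange(out, 'b h d -> b (h d) 1 1').expand(-1, -1, h, w)
        return self.to_out(out)
```

Under these assumptions, `QueryAndAttendSketch(dim = 128)(torch.randn(1, 128, 32, 32))` returns a `(1, 128, 32, 32)` tensor, i.e. the block is shape-preserving regardless of `num_queries`, while the new `to_out` adds an expansion-then-projection step (`dim * 2` hidden channels) around the Tanh.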