diff --git a/dalle2_pytorch/dalle2_pytorch.py b/dalle2_pytorch/dalle2_pytorch.py
index fbfeb88..ac23d4c 100644
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -1064,7 +1064,7 @@ class Unet(nn.Module):
         dim_mults=(1, 2, 4, 8),
         channels = 3,
         attn_dim_head = 32,
-        attn_heads = 8,
+        attn_heads = 16,
         lowres_cond = False, # for cascading diffusion - https://cascaded-diffusion.github.io/
         sparse_attn = False,
         sparse_attn_window = 8,  # window size for sparse attention
diff --git a/setup.py b/setup.py
index 40fd70e..e7f460f 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
       'dream = dalle2_pytorch.cli:dream'
     ],
   },
-  version = '0.0.86',
+  version = '0.0.87',
   license='MIT',
   description = 'DALL-E 2',
   author = 'Phil Wang',