diff --git a/dalle2_pytorch/dalle2_pytorch.py b/dalle2_pytorch/dalle2_pytorch.py
index 61aa44e..2152f80 100644
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -834,7 +834,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
         self.image_embed_dim = default(image_embed_dim, lambda: clip.dim_latent)
         self.channels = default(image_channels, lambda: clip.image_channels)

-        self.cond_drop_prob = cond_drop_prob if not predict_x_start else 0.
+        self.cond_drop_prob = cond_drop_prob
         self.condition_on_text_encodings = condition_on_text_encodings

         # in paper, they do not predict the noise, but predict x0 directly for image embedding, claiming empirically better results. I'll just offer both.
diff --git a/setup.py b/setup.py
index 72087b4..b0908cc 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
       'dream = dalle2_pytorch.cli:dream'
     ],
   },
-  version = '0.1.9',
+  version = '0.1.10',
   license='MIT',
   description = 'DALL-E 2',
   author = 'Phil Wang',