diff --git a/dalle2_pytorch/dalle2_pytorch.py b/dalle2_pytorch/dalle2_pytorch.py
index 8e6e45a..61aa44e 100644
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -800,7 +800,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
         image_size = None,
         image_channels = 3,
         timesteps = 1000,
-        cond_drop_prob = 0.2,
+        cond_drop_prob = 0.,
         loss_type = "l1",
         predict_x_start = True,
         beta_schedule = "cosine",
@@ -834,7 +834,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
         self.image_embed_dim = default(image_embed_dim, lambda: clip.dim_latent)
         self.channels = default(image_channels, lambda: clip.image_channels)
 
-        self.cond_drop_prob = cond_drop_prob
+        self.cond_drop_prob = cond_drop_prob if not predict_x_start else 0.
         self.condition_on_text_encodings = condition_on_text_encodings
 
         # in paper, they do not predict the noise, but predict x0 directly for image embedding, claiming empirically better results. I'll just offer both.
diff --git a/setup.py b/setup.py
index d39797f..72087b4 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
       'dream = dalle2_pytorch.cli:dream'
     ],
   },
-  version = '0.1.8',
+  version = '0.1.9',
   license='MIT',
   description = 'DALL-E 2',
   author = 'Phil Wang',