use @clip-anytorch, thanks to @rom1504

fix a bug with classifier free guidance, thanks to @xiankgx again!
2026-02-12 11:34:29 +01:00 · 2022-04-30 06:40:54 -07:00 · 2022-04-30 06:34:57 -07:00
3 changed files with 12 additions and 18 deletions
--- a/README.md
+++ b/README.md
@@ -499,9 +499,7 @@ loss.backward()

 Although there is the possibility they are using an unreleased, more powerful CLIP, you can use one of the released ones, if you do not wish to train your own CLIP from scratch. This will also allow the community to more quickly validate the conclusions of the paper.

-First you'll need to install <a href="https://github.com/openai/CLIP#usage">the prerequisites</a>
-
-Then to use a pretrained OpenAI CLIP, simply import `OpenAIClipAdapter` and pass it into the `DiffusionPrior` or `Decoder` like so
+To use a pretrained OpenAI CLIP, simply import `OpenAIClipAdapter` and pass it into the `DiffusionPrior` or `Decoder` like so

 ```python
 import torch
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -172,11 +172,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
        self,
        name = 'ViT-B/32'
    ):
-        try:
-            import clip
-        except ImportError:
-            print('you must install openai clip in order to use this adapter - `pip install git+https://github.com/openai/CLIP.git` - more instructions at https://github.com/openai/CLIP#usage')
-
+        import clip
        openai_clip, _ = clip.load(name)
        super().__init__(openai_clip)

@@ -688,14 +684,14 @@ class DiffusionPriorNetwork(nn.Module):

        # classifier free guidance

-        cond_prob_mask = prob_mask_like((batch,), cond_drop_prob, device = device)
-        cond_prob_mask = rearrange(cond_prob_mask, 'b -> b 1')
+        keep_mask = prob_mask_like((batch,), 1 - cond_drop_prob, device = device)
+        keep_mask = rearrange(keep_mask, 'b -> b 1')

-        mask &= cond_prob_mask
+        mask &= keep_mask

        # whether text embedding is masked or not depends on the classifier free guidance conditional masking

-        mask = torch.cat((mask, cond_prob_mask), dim = 1)
+        mask = torch.cat((mask, keep_mask), dim = 1)

        # whether text embedding is used for conditioning depends on whether text encodings are available for attention (for classifier free guidance, even though it seems from the paper it was not used in the prior ddpm, as the objective is different)
        # but let's just do it right
@@ -1208,8 +1204,8 @@ class Unet(nn.Module):

        # conditional dropout

-        cond_prob_mask = prob_mask_like((batch_size,), cond_drop_prob, device = device)
-        cond_prob_mask = rearrange(cond_prob_mask, 'b -> b 1 1')
+        keep_mask = prob_mask_like((batch_size,), 1 - cond_drop_prob, device = device)
+        keep_mask = rearrange(keep_mask, 'b -> b 1 1')

        # mask out image embedding depending on condition dropout
        # for classifier free guidance
@@ -1220,7 +1216,7 @@ class Unet(nn.Module):
            image_tokens = self.image_to_cond(image_embed)

            image_tokens = torch.where(
-                cond_prob_mask,
+                keep_mask,
                image_tokens,
                self.null_image_embed
            )
@@ -1232,7 +1228,7 @@ class Unet(nn.Module):
        if exists(text_encodings) and self.cond_on_text_encodings:
            text_tokens = self.text_to_cond(text_encodings)
            text_tokens = torch.where(
-                cond_prob_mask,
+                keep_mask,
                text_tokens,
                self.null_text_embed[:, :text_tokens.shape[1]]
            )
@@ -1636,4 +1632,3 @@ class DALLE2(nn.Module):
            return images[0]

        return images
-
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.71',
+  version = '0.0.73',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
@@ -23,6 +23,7 @@ setup(
  ],
  install_requires=[
    'click',
+    'clip-anytorch',
    'einops>=0.4',
    'einops-exts>=0.0.3',
    'kornia>=0.5.4',
Author	SHA1	Message	Date
Phil Wang	e2f9615afa	use @clip-anytorch , thanks to @rom1504	2022-04-30 06:40:54 -07:00
Phil Wang	0d1c07c803	fix a bug with classifier free guidance, thanks to @xiankgx again!	2022-04-30 06:34:57 -07:00