allow for training the Prior network with precomputed CLIP embeddings (or text encodings)

refactor so that the causal transformer in the diffusion prior network can be conditioned without text encodings (for LAION's parallel efforts, although the paper seems to suggest that text encodings are needed)
2026-02-12 19:44:26 +01:00 · 2022-04-26 09:29:51 -07:00 · 2022-04-26 09:00:11 -07:00
3 changed files with 133 additions and 23 deletions
--- a/README.md
+++ b/README.md
@@ -376,6 +376,75 @@ You can also train the decoder on images of greater than the size (say 512x512)

 For the layperson, no worries, training will all be automated into a CLI tool, at least for small scale training.

+## Training on Preprocessed CLIP Embeddings
+
+It is likely, when scaling up, that you would first preprocess your images and text into their corresponding CLIP embeddings before training the prior network. You can do so by passing in `image_embed` and `text_embed` (and optionally `text_encodings` and `text_mask`) in place of the raw `image` and `text`.
+
+A working example is shown below.
+
+```python
+import torch
+from dalle2_pytorch import DiffusionPriorNetwork, DiffusionPrior, CLIP
+
+# get trained CLIP from step one
+
+clip = CLIP(
+    dim_text = 512,
+    dim_image = 512,
+    dim_latent = 512,
+    num_text_tokens = 49408,
+    text_enc_depth = 6,
+    text_seq_len = 256,
+    text_heads = 8,
+    visual_enc_depth = 6,
+    visual_image_size = 256,
+    visual_patch_size = 32,
+    visual_heads = 8,
+).cuda()
+
+# setup prior network, which contains an autoregressive transformer
+
+prior_network = DiffusionPriorNetwork(
+    dim = 512,
+    depth = 6,
+    dim_head = 64,
+    heads = 8
+).cuda()
+
+# diffusion prior network, which contains the CLIP and network (with transformer) above
+
+diffusion_prior = DiffusionPrior(
+    net = prior_network,
+    clip = clip,
+    timesteps = 100,
+    cond_drop_prob = 0.2,
+    condition_on_text_encodings = False  # the paper suggests this should be True, but it is turned off here just to get LAION started
+).cuda()
+
+# mock data
+
+text = torch.randint(0, 49408, (4, 256)).cuda()
+images = torch.randn(4, 3, 256, 256).cuda()
+
+# precompute the text and image embeddings
+# here using the diffusion prior class, but could be done with CLIP alone
+
+clip_image_embeds = diffusion_prior.get_image_embed(images)
+clip_text_embeds = diffusion_prior.get_text_cond(text).get('text_embed')
+
+# feed the precomputed text and image embeddings into the diffusion prior network
+
+loss = diffusion_prior(
+    text_embed = clip_text_embeds,
+    image_embed = clip_image_embeds
+)
+
+loss.backward()
+
+# do the above for many many many steps
+# now the diffusion prior can generate image embeddings from the text embeddings
+```
+
 ## Experimental

 ### DALL-E2 with Latent Diffusion
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -421,25 +421,41 @@ class DiffusionPriorNetwork(nn.Module):
        image_embed,
        diffusion_timesteps,
        *,
-        text_encodings,
        text_embed,
+        text_encodings = None,
        mask = None,
        cond_drop_prob = 0.2
    ):
-        batch, text_enc_len, device = image_embed.shape[0], text_encodings.shape[-2], image_embed.device
+        batch, dim, device, dtype = *image_embed.shape, image_embed.device, image_embed.dtype

        # in section 2.2, last paragraph
        # "... consisting of encoded text, CLIP text embedding, diffusion timestep embedding, noised CLIP image embedding, final embedding for prediction"

        text_embed, image_embed = rearrange_many((text_embed, image_embed), 'b d -> b 1 d')

+        # make text encodings optional
+        # although the paper seems to suggest it is present <--
+
+        if not exists(text_encodings):
+            text_encodings = torch.empty((batch, 0, dim), device = device, dtype = dtype)
+
+        if not exists(mask):
+            mask = torch.ones((batch, text_encodings.shape[-2]), device = device, dtype = torch.bool)
+
+        # classifier free guidance
+
+        cond_prob_mask = prob_mask_like((batch,), cond_drop_prob, device = device)
+        cond_prob_mask = rearrange(cond_prob_mask, 'b -> b 1')
+
+        mask &= cond_prob_mask
+
+        # whether text embedding is masked or not depends on the classifier free guidance conditional masking
+
+        mask = torch.cat((mask, cond_prob_mask), dim = 1)
+
        # whether text embedding is used for conditioning depends on whether text encodings are available for attention (for classifier free guidance, even though it seems from the paper it was not used in the prior ddpm, as the objective is different)
        # but let's just do it right

-        if exists(mask):
-            not_all_masked_out = mask.any(dim = -1)
-            mask = torch.cat((mask, rearrange(not_all_masked_out, 'b -> b 1')), dim = 1)
-
        if exists(mask):
            mask = F.pad(mask, (0, 2), value = True) # extend mask for text embedding, noised image embedding, time step embedding, and learned query

@@ -455,16 +471,6 @@ class DiffusionPriorNetwork(nn.Module):
            learned_queries
        ), dim = -2)

-        # mask if it doesn't exist
-
-        if not exists(mask):
-            mask = torch.ones((batch, text_enc_len), device = device, dtype = torch.bool)
-
-        # classifier free guidance
-
-        cond_prob_mask = prob_mask_like((batch,), cond_drop_prob, device = device)
-        mask &= rearrange(cond_prob_mask, 'b -> b 1')
-
        # attend

        tokens = self.causal_transformer(tokens, mask = mask)
@@ -486,6 +492,7 @@ class DiffusionPrior(nn.Module):
        loss_type = "l1",
        predict_x_start = True,
        beta_schedule = "cosine",
+        condition_on_text_encodings = True, # the paper suggests this is needed, but you can turn it off for your CLIP preprocessed text embed -> image embed training
    ):
        super().__init__()
        assert isinstance(clip, CLIP)
@@ -496,7 +503,9 @@ class DiffusionPrior(nn.Module):
        self.image_embed_dim = clip.dim_latent
        self.channels = clip.image_channels
        self.image_size = clip.image_size
+
        self.cond_drop_prob = cond_drop_prob
+        self.condition_on_text_encodings = condition_on_text_encodings

        self.predict_x_start = predict_x_start
        # in paper, they do not predict the noise, but predict x0 directly for image embedding, claiming empirically better results. I'll just offer both.
@@ -561,6 +570,10 @@ class DiffusionPrior(nn.Module):
        text_cls, text_encodings = text_encodings[:, 0], text_encodings[:, 1:]
        text_embed = self.clip.to_text_latent(text_cls)
        text_embed = l2norm(text_embed)
+
+        if not self.condition_on_text_encodings:            
+            return dict(text_embed = text_embed)
+
        return dict(text_encodings = text_encodings, text_embed = text_embed, mask = text != 0)

    def q_mean_variance(self, x_start, t):
@@ -679,13 +692,41 @@ class DiffusionPrior(nn.Module):
        top_image_embeds = image_embeds.gather(1, top_sim_indices)
        return rearrange(top_image_embeds, 'b 1 d -> b d')

-    def forward(self, text, image, *args, **kwargs):
-        b, device, img_size, = image.shape[0], image.device, self.image_size
-        check_shape(image, 'b c h w', h = img_size, w = img_size, c = self.channels)
+    def forward(
+        self,
+        text = None,
+        image = None,
+        text_embed = None,      # allow for training on preprocessed CLIP text and image embeddings
+        image_embed = None,
+        text_encodings = None,  # as well as CLIP text encodings
+        text_mask = None,       # text mask <- may eventually opt for the learned padding tokens technique from DALL-E1 to reduce complexity
+        *args,
+        **kwargs
+    ):
+        assert exists(text) ^ exists(text_embed), 'either text or text embedding must be supplied'
+        assert exists(image) ^ exists(image_embed), 'either text or text embedding must be supplied'
+        assert not (self.condition_on_text_encodings and (not exists(text_encodings) and not exists(text))), 'text encodings must be present if you specified you wish to condition on it on initialization'

-        times = torch.randint(0, self.num_timesteps, (b,), device = device, dtype = torch.long)
-        image_embed = self.get_image_embed(image)
-        text_cond = self.get_text_cond(text)
+        if exists(image):
+            image_embed = self.get_image_embed(image)
+
+        # calculate text conditionings, based on what is passed in
+
+        if exists(text):
+            text_cond = self.get_text_cond(text)
+        else:
+            text_cond = dict(
+                text_embed = text_embed,
+                text_encodings = text_encodings,
+                mask = text_mask
+            )
+
+        # timestep conditioning from ddpm
+
+        batch, device = image_embed.shape[0], image_embed.device
+        times = torch.randint(0, self.num_timesteps, (batch,), device = device, dtype = torch.long)
+
+        # calculate forward loss

        loss = self.p_losses(image_embed, times, text_cond = text_cond, *args, **kwargs)
        return loss
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.46',
+  version = '0.0.48',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	7ba6357c05	allow for training the Prior network with precomputed CLIP embeddings (or text encodings)	2022-04-26 09:29:51 -07:00
Phil Wang	76e063e8b7	refactor so that the causal transformer in the diffusion prior network can be conditioned without text encodings (for LAION's parallel efforts, although the paper seems to suggest that text encodings are needed)	2022-04-26 09:00:11 -07:00