fix a bug with classifier free guidance, thanks to @xiankgx again!

todo
fix example in readme, since api changed
2026-02-23 08:55:40 +01:00 · 2022-04-30 06:34:57 -07:00 · 2022-04-29 15:40:51 -07:00 · 2022-04-29 13:40:55 -07:00 · 2022-04-29 13:05:01 -07:00
4 changed files with 49 additions and 15 deletions
--- a/README.md
+++ b/README.md
@@ -430,8 +430,8 @@ images = torch.randn(4, 3, 256, 256).cuda()
 # precompute the text and image embeddings
 # here using the diffusion prior class, but could be done with CLIP alone
-clip_image_embeds = diffusion_prior.get_image_embed(images)
+clip_image_embeds = diffusion_prior.clip.embed_image(images).image_embed
-clip_text_embeds = diffusion_prior.get_text_cond(text).get('text_embed')
+clip_text_embeds = diffusion_prior.clip.embed_text(text).text_embed
 # feed text and images into diffusion prior network
@@ -741,6 +741,7 @@ Once built, images will be saved to the same directory the command is invoked
 - [ ] become an expert with unets, cleanup unet code, make it fully configurable, port all learnings over to https://github.com/lucidrains/x-unet
 - [ ] copy the cascading ddpm code to a separate repo (perhaps https://github.com/lucidrains/denoising-diffusion-pytorch) as the main contribution of dalle2 really is just the prior network
 - [ ] transcribe code to Jax, which lowers the activation energy for distributed training, given access to TPUs
 - [ ] just take care of the training for the decoder in a wrapper class, as each unet in the cascade will need its own optimizer
 - [ ] train on a toy task, offer in colab
 - [ ] think about how best to design a declarative training config that handles preencoding for prior and training of multiple networks in decoder
 - [ ] extend diffusion head to use diffusion-gan (potentially using lightweight-gan) to speed up inference
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -3,6 +3,7 @@ from tqdm import tqdm
 from inspect import isfunction
 from functools import partial
 from contextlib import contextmanager
 from collections import namedtuple
 import torch
 import torch.nn.functional as F
@@ -102,6 +103,9 @@ def unnormalize_img(normed_img):
 # clip related adapters
 EmbeddedText = namedtuple('EmbedTextReturn', ['text_embed', 'text_encodings', 'text_mask'])
 EmbeddedImage = namedtuple('EmbedImageReturn', ['image_embed', 'image_encodings'])
 class BaseClipAdapter(nn.Module):
    def __init__(self, clip):
        super().__init__()
@@ -153,7 +157,7 @@ class XClipAdapter(BaseClipAdapter):
        encoder_output = self.clip.text_transformer(text)
        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
        text_embed = self.clip.to_text_latent(text_cls)
-        return l2norm(text_embed), text_encodings, text_mask
+        return EmbeddedText(l2norm(text_embed), text_encodings, text_mask)
    @torch.no_grad()
    def embed_image(self, image):
@@ -161,7 +165,7 @@ class XClipAdapter(BaseClipAdapter):
        encoder_output = self.clip.visual_transformer(image)
        image_cls, image_encodings = encoder_output[:, 0], encoder_output[:, 1:]
        image_embed = self.clip.to_visual_latent(image_cls)
-        return l2norm(image_embed), image_encodings
+        return EmbeddedImage(l2norm(image_embed), image_encodings)
 class OpenAIClipAdapter(BaseClipAdapter):
    def __init__(
@@ -219,7 +223,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
        text_embed = self.clip.encode_text(text)
        text_encodings = self.text_encodings
        del self.text_encodings
-        return text_embed.float(), text_encodings.float(), text_mask
+        return EmbeddedText(text_embed.float(), text_encodings.float(), text_mask)
    @torch.no_grad()
    def embed_image(self, image):
@@ -227,7 +231,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
        image = resize_image_to(image, self.image_size)
        image = self.clip_normalize(unnormalize_img(image))
        image_embed = self.clip.encode_image(image)
-        return image_embed.float(), None
+        return EmbeddedImage(image_embed.float(), None)
 # classifier free guidance functions
@@ -684,14 +688,14 @@ class DiffusionPriorNetwork(nn.Module):
        # classifier free guidance
-        cond_prob_mask = prob_mask_like((batch,), cond_drop_prob, device = device)
+        keep_mask = prob_mask_like((batch,), 1 - cond_drop_prob, device = device)
-        cond_prob_mask = rearrange(cond_prob_mask, 'b -> b 1')
+        keep_mask = rearrange(keep_mask, 'b -> b 1')
-        mask &= cond_prob_mask
+        mask &= keep_mask
        # whether text embedding is masked or not depends on the classifier free guidance conditional masking
-        mask = torch.cat((mask, cond_prob_mask), dim = 1)
+        mask = torch.cat((mask, keep_mask), dim = 1)
        # whether text embedding is used for conditioning depends on whether text encodings are available for attention (for classifier free guidance, even though it seems from the paper it was not used in the prior ddpm, as the objective is different)
        # but let's just do it right
@@ -1204,8 +1208,8 @@ class Unet(nn.Module):
        # conditional dropout
-        cond_prob_mask = prob_mask_like((batch_size,), cond_drop_prob, device = device)
+        keep_mask = prob_mask_like((batch_size,), 1 - cond_drop_prob, device = device)
-        cond_prob_mask = rearrange(cond_prob_mask, 'b -> b 1 1')
+        keep_mask = rearrange(keep_mask, 'b -> b 1 1')
        # mask out image embedding depending on condition dropout
        # for classifier free guidance
@@ -1216,7 +1220,7 @@ class Unet(nn.Module):
            image_tokens = self.image_to_cond(image_embed)
            image_tokens = torch.where(
-                cond_prob_mask,
+                keep_mask,
                image_tokens,
                self.null_image_embed
            )
@@ -1228,7 +1232,7 @@ class Unet(nn.Module):
        if exists(text_encodings) and self.cond_on_text_encodings:
            text_tokens = self.text_to_cond(text_encodings)
            text_tokens = torch.where(
-                cond_prob_mask,
+                keep_mask,
                text_tokens,
                self.null_text_embed[:, :text_tokens.shape[1]]
            )
--- a/dalle2_pytorch/optimizer.py
+++ b/dalle2_pytorch/optimizer.py
@@ -0,0 +1,29 @@
 from torch.optim import AdamW, Adam
 def _split_params_by_weight_decay(params):
    """Partition parameters into ordered (weight-decay, no-weight-decay) lists.
    Parameters with `ndim < 2` go to the no-weight-decay list, all others to
    the weight-decay list. `params` is iterated exactly once, so one-shot
    iterables (e.g. a generator) are handled correctly. Repeated entries are
    dropped with the same hash/equality semantics as `set(params)`, while the
    first-seen order is preserved.
    """
    # dict.fromkeys de-duplicates like a set but keeps insertion order
    unique_params = list(dict.fromkeys(params))
    wd_params = [param for param in unique_params if param.ndim >= 2]
    no_wd_params = [param for param in unique_params if param.ndim < 2]
    return wd_params, no_wd_params
 def separate_weight_decayable_params(params):
    """Split `params` into a set that receives weight decay and a set that does not.
    Returns a tuple `(wd_params, no_wd_params)` of sets, where `no_wd_params`
    holds every parameter with `ndim < 2` and `wd_params` holds the rest.
    `params` may be any iterable, including a generator: it is consumed once.
    """
    wd_params, no_wd_params = _split_params_by_weight_decay(params)
    return set(wd_params), set(no_wd_params)
 def get_optimizer(
    params,
    lr = 3e-4,
    wd = 1e-2,
    betas = (0.9, 0.999),
    filter_by_requires_grad = False
 ):
    """Build an Adam / AdamW optimizer for `params`.
    params: iterable of parameters to optimize.
    lr: learning rate passed to the optimizer.
    wd: weight decay; when it equals 0 a plain `Adam` is returned.
    betas: betas passed to the optimizer.
    filter_by_requires_grad: when true, parameters whose `requires_grad`
        is falsy are dropped before the optimizer is built.
    Returns `Adam(params)` if `wd == 0`, otherwise an `AdamW` with two param
    groups: group 0 uses `wd`, group 1 (parameters with `ndim < 2`) uses a
    weight decay of 0.
    """
    if filter_by_requires_grad:
        params = [param for param in params if param.requires_grad]
    if wd == 0:
        return Adam(params, lr = lr, betas = betas)
    # keep the groups as ordered lists: torch.optim asks for parameter
    # collections with a deterministic order, which list(set(...)) does not give
    wd_params, no_wd_params = _split_params_by_weight_decay(params)
    param_groups = [
        {'params': wd_params},
        {'params': no_wd_params, 'weight_decay': 0},
    ]
    return AdamW(param_groups, lr = lr, weight_decay = wd, betas = betas)
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.67',
+  version = '0.0.72',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	0d1c07c803	fix a bug with classifier free guidance, thanks to @xiankgx again!	2022-04-30 06:34:57 -07:00
Phil Wang	a389f81138	todo	2022-04-29 15:40:51 -07:00
Phil Wang	0283556608	fix example in readme, since api changed	2022-04-29 13:40:55 -07:00
Phil Wang	5063d192b6	now completely OpenAI CLIP compatible for training; just take care of the logic for AdamW and transformers; used namedtuples for clip adapter embedding outputs	2022-04-29 13:05:01 -07:00