Compare commits

5 commits: 0.1.7 ... 0.1.9

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Phil Wang | 4010aec033 | turn off classifier free guidance if predicting x_start for diffusion prior | 2022-05-07 09:38:17 -07:00 |
| Phil Wang | c87b84a259 | todo | 2022-05-07 09:21:08 -07:00 |
| Phil Wang | 8b05468653 | todo | 2022-05-07 08:33:45 -07:00 |
| Phil Wang | 830afd3c15 | sinusoidal embed time embeddings for diffusion prior as well, for continuous version | 2022-05-07 08:32:43 -07:00 |
| Phil Wang | 8f93729d19 | when in doubt, make it a hyperparameter | 2022-05-07 07:52:17 -07:00 |
3 changed files with 16 additions and 7 deletions

README.md

@@ -981,6 +981,8 @@ Once built, images will be saved to the same directory the command is invoked
 - [ ] make sure FILIP works with DALL-E2 from x-clip https://arxiv.org/abs/2111.07783
 - [ ] make sure resnet hyperparameters can be configurable across unet depth (groups and expansion factor)
 - [ ] offer save / load methods on the trainer classes to automatically take care of state dicts for scalers / optimizers / saving versions and checking for breaking changes
+- [ ] offer setting in diffusion prior to split time and image embeddings into multiple tokens, configurable, for more surface area during attention
+- [ ] bring in skip-layer excitations (from lightweight gan paper) to see if it helps for either decoder of unet or vqgan-vae training

 ## Citations

dalle2_pytorch/dalle2_pytorch.py

@@ -706,7 +706,7 @@ class DiffusionPriorNetwork(nn.Module):
         **kwargs
     ):
         super().__init__()
-        self.time_embeddings = nn.Embedding(num_timesteps, dim) if exists(num_timesteps) else nn.Sequential(Rearrange('b -> b 1'), MLP(1, dim)) # also offer a continuous version of timestep embeddings, with a 2 layer MLP
+        self.time_embeddings = nn.Embedding(num_timesteps, dim) if exists(num_timesteps) else nn.Sequential(SinusoidalPosEmb(dim), MLP(dim, dim)) # also offer a continuous version of timestep embeddings, with a 2 layer MLP

         self.learned_query = nn.Parameter(torch.randn(dim))
         self.causal_transformer = CausalTransformer(dim = dim, **kwargs)
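
Commit 830afd3c15 replaces the `Rearrange('b -> b 1') → MLP(1, dim)` continuous time embedding with a sinusoidal embedding feeding the same 2 layer MLP. Unlike `nn.Embedding`, a sinusoidal embedding accepts non-integer timesteps, which is what makes the continuous version work. A minimal sketch of a transformer-style `SinusoidalPosEmb` (the repo's exact implementation may differ in details such as frequency spacing):

```python
import math
import torch
from torch import nn

class SinusoidalPosEmb(nn.Module):
    # transformer-style sinusoidal embedding, here applied to diffusion
    # timesteps (possibly continuous) rather than token positions
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        half_dim = self.dim // 2
        freqs = torch.exp(torch.arange(half_dim, device = t.device) * -(math.log(10000) / (half_dim - 1)))
        args = t[:, None].float() * freqs[None, :]
        return torch.cat((args.sin(), args.cos()), dim = -1)  # (batch, dim)

# usage sketch, mirroring the changed line above:
# time_embeddings = nn.Sequential(SinusoidalPosEmb(dim), MLP(dim, dim))
```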
@@ -800,13 +800,14 @@ class DiffusionPrior(BaseGaussianDiffusion):
         image_size = None,
         image_channels = 3,
         timesteps = 1000,
-        cond_drop_prob = 0.2,
+        cond_drop_prob = 0.,
         loss_type = "l1",
         predict_x_start = True,
         beta_schedule = "cosine",
         condition_on_text_encodings = True, # the paper suggests this is needed, but you can turn it off for your CLIP preprocessed text embed -> image embed training
         sampling_clamp_l2norm = False,
         training_clamp_l2norm = False,
+        init_image_embed_l2norm = False,
         image_embed_scale = None, # this is for scaling the l2-normed image embedding, so it is more suitable for gaussian diffusion, as outlined by Katherine (@crowsonkb) https://github.com/lucidrains/DALLE2-pytorch/issues/60#issue-1226116132
         clip_adapter_overrides = dict()
     ):
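
The `image_embed_scale` comment is worth unpacking: an l2-normed d-dimensional embedding has per-coordinate magnitude around 1/sqrt(d), far smaller than the unit-variance Gaussian noise the diffusion process assumes. A quick check, assuming a sqrt(d) scale as a natural choice (the `l2norm` helper here is just `F.normalize`):

```python
import torch
import torch.nn.functional as F

def l2norm(t):
    return F.normalize(t, dim = -1)

dim = 512
image_embed = l2norm(torch.randn(4, dim))
print(image_embed.std())                 # ~ 1 / sqrt(512) ≈ 0.044, too small for diffusion
print((image_embed * dim ** 0.5).std())  # ~ 1.0, matching the N(0, 1) noise scale
```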
@@ -833,7 +834,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
         self.image_embed_dim = default(image_embed_dim, lambda: clip.dim_latent)
         self.channels = default(image_channels, lambda: clip.image_channels)

-        self.cond_drop_prob = cond_drop_prob
+        self.cond_drop_prob = cond_drop_prob if not predict_x_start else 0.
         self.condition_on_text_encodings = condition_on_text_encodings

         # in paper, they do not predict the noise, but predict x0 directly for image embedding, claiming empirically better results. I'll just offer both.
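
Commit 4010aec033 ties the two `cond_drop_prob` changes together: classifier-free guidance requires training with conditioning randomly dropped (`cond_drop_prob > 0`) so that an unconditional prediction exists to interpolate against, and with `predict_x_start = True` the prior now forces the drop probability to zero, disabling guidance. A sketch of the interpolation the drop enables (the `cond_drop_prob = 1.` keyword on the forward call is an assumption about the network's API, not taken from this diff):

```python
def forward_with_cond_scale(net, x, t, text_cond, cond_scale = 1.):
    # classifier-free guidance: push the conditional prediction away from the
    # unconditional one. only meaningful if the net trained with cond_drop_prob > 0;
    # with predict_x_start = True that is now never the case, so cond_scale stays 1.
    pred = net(x, t, **text_cond)
    if cond_scale == 1.:
        return pred
    null_pred = net(x, t, **text_cond, cond_drop_prob = 1.)  # assumed API: drop all conditioning
    return null_pred + (pred - null_pred) * cond_scale
```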
@@ -845,6 +846,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
         # whether to force an l2norm, similar to clipping denoised, when sampling
         self.sampling_clamp_l2norm = sampling_clamp_l2norm
         self.training_clamp_l2norm = training_clamp_l2norm
+        self.init_image_embed_l2norm = init_image_embed_l2norm

     def p_mean_variance(self, x, t, text_cond, clip_denoised: bool):
         pred = self.net(x, t, **text_cond)
@@ -879,11 +881,16 @@ class DiffusionPrior(BaseGaussianDiffusion):
         device = self.betas.device
         b = shape[0]

-        img = torch.randn(shape, device=device)
+        image_embed = torch.randn(shape, device = device)
+
+        if self.init_image_embed_l2norm:
+            image_embed = l2norm(image_embed) * self.image_embed_scale

         for i in tqdm(reversed(range(0, self.num_timesteps)), desc='sampling loop time step', total=self.num_timesteps):
-            img = self.p_sample(img, torch.full((b,), i, device = device, dtype = torch.long), text_cond = text_cond)
-        return img
+            times = torch.full((b,), i, device = device, dtype = torch.long)
+            image_embed = self.p_sample(image_embed, times, text_cond = text_cond)
+
+        return image_embed

     def p_losses(self, image_embed, times, text_cond, noise = None):
         noise = default(noise, lambda: torch.randn_like(image_embed))
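
Taken together, the three new l2norm flags apply the same projection at different points in the pipeline. A sketch of the shared helper and where each flag would use it (only the flag names appear in this diff; the helper name is an assumption):

```python
import torch
import torch.nn.functional as F

def l2norm(t):
    return F.normalize(t, dim = -1)

def l2norm_clamp_embed(image_embed, image_embed_scale):
    # project an embedding back onto the sphere of radius image_embed_scale
    return l2norm(image_embed) * image_embed_scale

# where each flag would apply the projection:
#   init_image_embed_l2norm -> the starting noise in p_sample_loop (shown above)
#   sampling_clamp_l2norm   -> the denoised estimate at each p_sample step,
#                              analogous to clip_denoised for pixel diffusion
#   training_clamp_l2norm   -> the network's x_start prediction inside p_losses
```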

setup.py

@@ -10,7 +10,7 @@ setup(
       'dream = dalle2_pytorch.cli:dream'
     ],
   },
-  version = '0.1.6',
+  version = '0.1.9',
   license='MIT',
   description = 'DALL-E 2',
   author = 'Phil Wang',