allow for final l2norm clamping of the sampled image embed

more accurate readme
fix misnamed variable, thanks to @nousr
2026-02-13 12:04:24 +01:00 · 2022-07-10 09:44:38 -07:00 · 2022-07-09 20:57:26 -07:00 · 2022-07-09 19:01:37 -07:00 · 2022-07-09 18:38:40 -07:00 · 2022-07-09 17:31:54 -07:00
3 changed files with 31 additions and 13 deletions
--- a/README.md
+++ b/README.md
@@ -355,7 +355,8 @@ prior_network = DiffusionPriorNetwork(
 diffusion_prior = DiffusionPrior(
    net = prior_network,
    clip = clip,
-    timesteps = 100,
+    timesteps = 1000,
+    sample_timesteps = 64,
    cond_drop_prob = 0.2
 ).cuda()

--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -922,11 +922,12 @@ class DiffusionPrior(nn.Module):
        loss_type = "l2",
        predict_x_start = True,
        beta_schedule = "cosine",
-        condition_on_text_encodings = True, # the paper suggests this is needed, but you can turn it off for your CLIP preprocessed text embed -> image embed training
-        sampling_clamp_l2norm = False,
+        condition_on_text_encodings = True,  # the paper suggests this is needed, but you can turn it off for your CLIP preprocessed text embed -> image embed training
+        sampling_clamp_l2norm = False,       # whether to l2norm clamp the image embed at each denoising iteration (analogous to -1 to 1 clipping for usual DDPMs)
+        sampling_final_clamp_l2norm = False, # whether to l2norm the final image embedding output (this is also done for images in ddpm)
        training_clamp_l2norm = False,
        init_image_embed_l2norm = False,
-        image_embed_scale = None,           # this is for scaling the l2-normed image embedding, so it is more suitable for gaussian diffusion, as outlined by Katherine (@crowsonkb) https://github.com/lucidrains/DALLE2-pytorch/issues/60#issue-1226116132
+        image_embed_scale = None,            # this is for scaling the l2-normed image embedding, so it is more suitable for gaussian diffusion, as outlined by Katherine (@crowsonkb) https://github.com/lucidrains/DALLE2-pytorch/issues/60#issue-1226116132
        clip_adapter_overrides = dict()
    ):
        super().__init__()
@@ -963,23 +964,32 @@ class DiffusionPrior(nn.Module):
        self.condition_on_text_encodings = condition_on_text_encodings

        # in paper, they do not predict the noise, but predict x0 directly for image embedding, claiming empirically better results. I'll just offer both.
+
        self.predict_x_start = predict_x_start

        # @crowsonkb 's suggestion - https://github.com/lucidrains/DALLE2-pytorch/issues/60#issue-1226116132
+
        self.image_embed_scale = default(image_embed_scale, self.image_embed_dim ** 0.5)

        # whether to force an l2norm, similar to clipping denoised, when sampling
+
        self.sampling_clamp_l2norm = sampling_clamp_l2norm
+        self.sampling_final_clamp_l2norm = sampling_final_clamp_l2norm
+
        self.training_clamp_l2norm = training_clamp_l2norm
        self.init_image_embed_l2norm = init_image_embed_l2norm

        # device tracker
+
        self.register_buffer('_dummy', torch.tensor([True]), persistent = False)

    @property
    def device(self):
        return self._dummy.device

+    def l2norm_clamp_embed(self, image_embed):
+        return l2norm(image_embed) * self.image_embed_scale
+
    def p_mean_variance(self, x, t, text_cond, clip_denoised = False, cond_scale = 1.):
        assert not (cond_scale != 1. and not self.can_classifier_guidance), 'the model was not trained with conditional dropout, and thus one cannot use classifier free guidance (cond_scale anything other than 1)'

@@ -1020,6 +1030,9 @@ class DiffusionPrior(nn.Module):
            times = torch.full((batch,), i, device = device, dtype = torch.long)
            image_embed = self.p_sample(image_embed, times, text_cond = text_cond, cond_scale = cond_scale)

+        if self.sampling_final_clamp_l2norm and self.predict_x_start:
+            image_embed = self.l2norm_clamp_embed(image_embed)
+
        return image_embed

    @torch.no_grad()
@@ -1055,15 +1068,18 @@ class DiffusionPrior(nn.Module):
                x_start.clamp_(-1., 1.)

            if self.predict_x_start and self.sampling_clamp_l2norm:
-                x_start = l2norm(x_start) * self.image_embed_scale
+                x_start = self.l2norm_clamp_embed(x_start)

            c1 = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
            c2 = ((1 - alpha_next) - torch.square(c1)).sqrt()
-            new_noise = torch.randn_like(image_embed)
+            noise = torch.randn_like(image_embed) if time_next > 0 else 0.

-            img = x_start * alpha_next.sqrt() + \
-                  c1 * new_noise + \
-                  c2 * pred_noise
+            image_embed = x_start * alpha_next.sqrt() + \
+                          c1 * noise + \
+                          c2 * pred_noise
+
+        if self.predict_x_start and self.sampling_final_clamp_l2norm:
+            image_embed = self.l2norm_clamp_embed(image_embed)

        return image_embed

@@ -1091,7 +1107,7 @@ class DiffusionPrior(nn.Module):
        )

        if self.predict_x_start and self.training_clamp_l2norm:
-            pred = l2norm(pred) * self.image_embed_scale
+            pred = self.l2norm_clamp_embed(pred)

        target = noise if not self.predict_x_start else image_embed

@@ -2047,7 +2063,7 @@ class Decoder(nn.Module):
        self.noise_schedulers = nn.ModuleList([])

        for ind, (unet_beta_schedule, unet_p2_loss_weight_gamma, sample_timesteps) in enumerate(zip(beta_schedule, p2_loss_weight_gamma, self.sample_timesteps)):
-            assert sample_timesteps <= timesteps, f'sampling timesteps {sample_timesteps} must be less than or equal to the number of training timesteps {timesteps} for unet {ind + 1}'
+            assert not exists(sample_timesteps) or sample_timesteps <= timesteps, f'sampling timesteps {sample_timesteps} must be less than or equal to the number of training timesteps {timesteps} for unet {ind + 1}'

            noise_scheduler = NoiseScheduler(
                beta_schedule = unet_beta_schedule,
@@ -2275,9 +2291,10 @@ class Decoder(nn.Module):

            c1 = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
            c2 = ((1 - alpha_next) - torch.square(c1)).sqrt()
+            noise = torch.randn_like(img) if time_next > 0 else 0.

            img = x_start * alpha_next.sqrt() + \
-                  c1 * torch.randn_like(img) + \
+                  c1 * noise + \
                  c2 * pred_noise

        img = self.unnormalize_img(img)
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.19.2'
+__version__ = '0.19.6'
Author	SHA1	Message	Date
Phil Wang	7ea314e2f0	allow for final l2norm clamping of the sampled image embed	2022-07-10 09:44:38 -07:00
Phil Wang	4173e88121	more accurate readme	2022-07-09 20:57:26 -07:00
Phil Wang	3dae43fa0e	fix misnamed variable, thanks to @nousr	2022-07-09 19:01:37 -07:00
Phil Wang	a598820012	do not noise for the last step in ddim	2022-07-09 18:38:40 -07:00
Phil Wang	4878762627	fix for small validation bug for sampling steps	2022-07-09 17:31:54 -07:00