always re-derive the predicted noise from the clipped x0 when sampling with ddim under the predict-noise objective

bump to a newer release of clip-anytorch that allows text encodings shorter than the maximum context length
fix for self conditioning in diffusion prior network https://github.com/lucidrains/DALLE2-pytorch/issues/273
2026-02-12 11:34:29 +01:00 · 2023-03-05 10:45:44 -08:00 · 2023-03-04 09:37:25 -08:00 · 2023-02-11 17:18:40 -08:00 · 2022-12-23 13:23:09 -08:00 · 2022-12-15 10:54:21 -08:00
3 changed files with 8 additions and 12 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -360,6 +360,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
        is_eos_id = (text == self.eos_id)
        text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
        text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
+        text_mask = text_mask & (text != 0)
        assert not self.cleared

        text_embed = self.clip.encode_text(text)
@@ -434,6 +435,7 @@ class OpenClipAdapter(BaseClipAdapter):
        is_eos_id = (text == self.eos_id)
        text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
        text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
+        text_mask = text_mask & (text != 0)
        assert not self.cleared

        text_embed = self.clip.encode_text(text)
@@ -1122,7 +1124,7 @@ class DiffusionPriorNetwork(nn.Module):
        learned_queries = repeat(self.learned_query, 'd -> b 1 d', b = batch)

        if self.self_cond:
-            learned_queries = torch.cat((image_embed, self_cond), dim = -2)
+            learned_queries = torch.cat((self_cond, learned_queries), dim = -2)

        tokens = torch.cat((
            text_encodings,
@@ -1332,10 +1334,7 @@ class DiffusionPrior(nn.Module):

            # predict noise

-            if self.predict_x_start or self.predict_v:
-                pred_noise = self.noise_scheduler.predict_noise_from_start(image_embed, t = time_cond, x0 = x_start)
-            else:
-                pred_noise = pred
+            pred_noise = self.noise_scheduler.predict_noise_from_start(image_embed, t = time_cond, x0 = x_start)

            if time_next < 0:
                image_embed = x_start
@@ -2494,7 +2493,7 @@ class Decoder(nn.Module):
        dynamic_thres_percentile = 0.95,
        p2_loss_weight_gamma = 0.,                  # p2 loss weight, from https://arxiv.org/abs/2204.00227 - 0 is equivalent to weight of 1 across time - 1. is recommended
        p2_loss_weight_k = 1,
-        ddim_sampling_eta = 1.                      # can be set to 0. for deterministic sampling afaict
+        ddim_sampling_eta = 0.                      # can be set to 0. for deterministic sampling afaict
    ):
        super().__init__()

@@ -2973,10 +2972,7 @@ class Decoder(nn.Module):

                # predict noise

-                if predict_x_start or predict_v:
-                    pred_noise = noise_scheduler.predict_noise_from_start(img, t = time_cond, x0 = x_start)
-                else:
-                    pred_noise = pred
+                pred_noise = noise_scheduler.predict_noise_from_start(img, t = time_cond, x0 = x_start)

                c1 = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
                c2 = ((1 - alpha_next) - torch.square(c1)).sqrt()
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '1.11.2'
+__version__ = '1.12.3'
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ setup(
    'accelerate',
    'click',
    'open-clip-torch>=2.0.0,<3.0.0',
-    'clip-anytorch>=2.4.0',
+    'clip-anytorch>=2.5.2',
    'coca-pytorch>=0.0.5',
    'ema-pytorch>=0.0.7',
    'einops>=0.4',
Author	SHA1	Message	Date
Phil Wang	848e8a480a	always rederive the predicted noise from the clipped x0 for ddim + predict noise objective	2023-03-05 10:45:44 -08:00
Phil Wang	cc58f75474	bump to newer package of clip-anytorch that allows for text encodings < maximum context length	2023-03-04 09:37:25 -08:00
Phil Wang	3b2cf7b0bc	fix for self conditioning in diffusion prior network https://github.com/lucidrains/DALLE2-pytorch/issues/273	2023-02-11 17:18:40 -08:00
Phil Wang	984d62a373	default ddim sampling eta to 0	2022-12-23 13:23:09 -08:00
Phil Wang	683dd98b96	extra insurance in case eos id is not there	2022-12-15 10:54:21 -08:00