make sure lowres conditioning image is properly normalized to -1 to 1 for cascading ddpm

fix everything around automatically normalizing images to the -1 to 1 range for ddpm training
2026-02-12 11:34:29 +01:00 · 2022-05-14 01:23:54 -07:00 · 2022-05-14 01:17:11 -07:00
2 changed files with 26 additions and 8 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -1,7 +1,7 @@
 import math
 from tqdm import tqdm
 from inspect import isfunction
-from functools import partial
+from functools import partial, wraps
 from contextlib import contextmanager
 from collections import namedtuple
 from pathlib import Path
@@ -45,6 +45,14 @@ def exists(val):
 def identity(t, *args, **kwargs):
    return t

+def maybe(fn):
+    @wraps(fn)
+    def inner(x):
+        if not exists(x):
+            return x
+        return fn(x)
+    return inner
+
 def default(val, d):
    if exists(val):
        return val
@@ -114,10 +122,10 @@ def resize_image_to(image, target_image_size):
 # ddpms expect images to be in the range of -1 to 1
 # but CLIP may otherwise

-def normalize_img(img):
+def normalize_neg_one_to_one(img):
    return img * 2 - 1

-def unnormalize_img(normed_img):
+def unnormalize_zero_to_one(normed_img):
    return (normed_img + 1) * 0.5

 # clip related adapters
@@ -1037,7 +1045,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
        assert not (self.condition_on_text_encodings and (not exists(text_encodings) and not exists(text))), 'text encodings must be present if you specified you wish to condition on it on initialization'

        if exists(image):
-            image_embed, _ = self.clip.embed_image(unnormalize_img(image))
+            image_embed, _ = self.clip.embed_image(image)

        # calculate text conditionings, based on what is passed in

@@ -1821,7 +1829,7 @@ class Decoder(BaseGaussianDiffusion):
            # eq 15 - https://arxiv.org/abs/2102.09672
            min_log = extract(self.posterior_log_variance_clipped, t, x.shape)
            max_log = extract(torch.log(self.betas), t, x.shape)
-            var_interp_frac = unnormalize_img(var_interp_frac_unnormalized)
+            var_interp_frac = unnormalize_zero_to_one(var_interp_frac_unnormalized)

            posterior_log_variance = var_interp_frac * max_log + (1 - var_interp_frac) * min_log
            posterior_variance = posterior_log_variance.exp()
@@ -1844,6 +1852,8 @@ class Decoder(BaseGaussianDiffusion):
        b = shape[0]
        img = torch.randn(shape, device = device)

+        lowres_cond_img = maybe(normalize_neg_one_to_one)(lowres_cond_img)
+
        for i in tqdm(reversed(range(0, self.num_timesteps)), desc = 'sampling loop time step', total = self.num_timesteps):
            img = self.p_sample(
                unet,
@@ -1859,11 +1869,19 @@ class Decoder(BaseGaussianDiffusion):
                clip_denoised = clip_denoised
            )

-        return img
+        unnormalize_img = unnormalize_zero_to_one(img)
+        return unnormalize_img

    def p_losses(self, unet, x_start, times, *, image_embed, lowres_cond_img = None, text_encodings = None, text_mask = None, predict_x_start = False, noise = None, learned_variance = False, clip_denoised = False):
        noise = default(noise, lambda: torch.randn_like(x_start))

+        # normalize to [-1, 1]
+
+        x_start = normalize_neg_one_to_one(x_start)
+        lowres_cond_img = maybe(normalize_neg_one_to_one)(lowres_cond_img)
+
+        # get x_t
+
        x_noisy = self.q_sample(x_start = x_start, t = times, noise = noise)

        model_output = unet(
@@ -2011,7 +2029,7 @@ class Decoder(BaseGaussianDiffusion):

        if not exists(image_embed):
            assert exists(self.clip), 'if you want to derive CLIP image embeddings automatically, you must supply `clip` to the decoder on init'
-            image_embed, _ = self.clip.embed_image(unnormalize_img(image))
+            image_embed, _ = self.clip.embed_image(image)

        text_encodings = text_mask = None
        if exists(text) and not exists(text_encodings):
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.2.15',
+  version = '0.2.17',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	5d27029e98	make sure lowres conditioning image is properly normalized to -1 to 1 for cascading ddpm	2022-05-14 01:23:54 -07:00
Phil Wang	3115fa17b3	fix everything around normalizing images to -1 to 1 for ddpm training automatically	2022-05-14 01:17:11 -07:00