expose num_steps_taken helper method on trainer to retrieve number of training steps of each unet

allow for control over use of nearest interp method of downsampling low res conditioning, in addition to being able to turn it off
extra insurance that diffusion prior is on the correct device, when using trainer with accelerator or device was given
2026-02-12 11:34:29 +01:00 · 2022-07-08 13:00:56 -07:00 · 2022-07-08 11:44:43 -07:00 · 2022-07-07 10:08:33 -07:00 · 2022-07-07 09:41:49 -07:00 · 2022-07-07 07:43:41 -07:00
4 changed files with 56 additions and 20 deletions
--- a/README.md
+++ b/README.md
@@ -44,6 +44,7 @@ This library would not have gotten to this working state without the help of
 - <a href="https://github.com/krish240574">Kumar</a> for working on the initial diffusion training script
 - <a href="https://github.com/rom1504">Romain</a> for the pull request reviews and project management
 - <a href="https://github.com/Ciaohe">He Cao</a> and <a href="https://github.com/xiankgx">xiankgx</a> for the Q&A and for identifying of critical bugs
+- <a href="https://github.com/marunine">Marunine</a> for identifying issues with resizing of the low resolution conditioner, when training the upsampler, in addition to various other bug fixes
 - <a href="https://github.com/crowsonkb">Katherine</a> for her advice
 - <a href="https://stability.ai/">Stability AI</a> for the generous sponsorship
 - <a href="https://huggingface.co">🤗 Huggingface</a> and in particular <a href="https://github.com/sgugger">Sylvain</a> for the <a href="https://github.com/huggingface/accelerate">Accelerate</a> library
@@ -581,7 +582,8 @@ unet1 = Unet(
    image_embed_dim = 512,
    cond_dim = 128,
    channels = 3,
-    dim_mults=(1, 2, 4, 8)
+    dim_mults=(1, 2, 4, 8),
+    cond_on_text_encodings = True  # set to True for any unets that need to be conditioned on text encodings (ex. first unet in cascade)
 ).cuda()

 unet2 = Unet(
@@ -598,12 +600,11 @@ decoder = Decoder(
    clip = clip,
    timesteps = 100,
    image_cond_drop_prob = 0.1,
-    text_cond_drop_prob = 0.5,
-    condition_on_text_encodings = False  # set this to True if you wish to condition on text during training and sampling
+    text_cond_drop_prob = 0.5
 ).cuda()

 for unet_number in (1, 2):
-    loss = decoder(images, unet_number = unet_number) # this can optionally be decoder(images, text) if you wish to condition on the text encodings as well, though it was hinted in the paper it didn't do much
+    loss = decoder(images, text = text, unet_number = unet_number) # text is passed here because unet1 above sets cond_on_text_encodings = True, though it was hinted in the paper that conditioning on the text encodings didn't do much
    loss.backward()

 # do above for many steps
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -125,14 +125,28 @@ def log(t, eps = 1e-12):
 def l2norm(t):
    return F.normalize(t, dim = -1)

-def resize_image_to(image, target_image_size):
+def resize_image_to(
+    image,
+    target_image_size,
+    clamp_range = None,  # optional (min, max) pair; when given, the resized output is clamped to it
+    nearest = False,     # when True, resize with F.interpolate(mode = 'nearest') instead of resize()
+    **kwargs             # forwarded to resize() on the non-nearest path only
+):
    orig_image_size = image.shape[-1]

    if orig_image_size == target_image_size:
        return image

-    scale_factors = target_image_size / orig_image_size
-    return resize(image, scale_factors = scale_factors)
+    if not nearest:
+        scale_factors = target_image_size / orig_image_size
+        out = resize(image, scale_factors = scale_factors, **kwargs)
+    else:
+        out = F.interpolate(image, target_image_size, mode = 'nearest')  # align_corners must stay unset: F.interpolate raises ValueError when it is passed with mode = 'nearest'
+
+    if exists(clamp_range):
+        out = out.clamp(*clamp_range)  # keeps resampling overshoot inside the requested range
+
+    return out

 # image normalization functions
 # ddpms expect images to be in the range of -1 to 1
@@ -1776,11 +1790,17 @@ class LowresConditioner(nn.Module):
    def __init__(
        self,
        downsample_first = True,
+        downsample_mode_nearest = False,
        blur_sigma = 0.6,
        blur_kernel_size = 3,
+        input_image_range = None
    ):
        super().__init__()
        self.downsample_first = downsample_first
+        self.downsample_mode_nearest = downsample_mode_nearest
+
+        self.input_image_range = input_image_range
+
        self.blur_sigma = blur_sigma
        self.blur_kernel_size = blur_kernel_size

@@ -1794,7 +1814,7 @@ class LowresConditioner(nn.Module):
        blur_kernel_size = None
    ):
        if self.training and self.downsample_first and exists(downsample_image_size):
-            cond_fmap = resize_image_to(cond_fmap, downsample_image_size)
+            cond_fmap = resize_image_to(cond_fmap, downsample_image_size, clamp_range = self.input_image_range, nearest = self.downsample_mode_nearest)

        if self.training:
            # when training, blur the low resolution conditional image
@@ -1814,7 +1834,7 @@ class LowresConditioner(nn.Module):

            cond_fmap = gaussian_blur2d(cond_fmap, cast_tuple(blur_kernel_size, 2), cast_tuple(blur_sigma, 2))

-        cond_fmap = resize_image_to(cond_fmap, target_image_size)
+        cond_fmap = resize_image_to(cond_fmap, target_image_size, clamp_range = self.input_image_range)

        return cond_fmap

@@ -1837,6 +1857,7 @@ class Decoder(nn.Module):
        image_sizes = None,                         # for cascading ddpm, image size at each stage
        random_crop_sizes = None,                   # whether to random crop the image at that stage in the cascade (super resoluting convolutions at the end may be able to generalize on smaller crops)
        lowres_downsample_first = True,             # cascading ddpm - resizes to lower resolution, then to next conditional resolution + blur
+        lowres_downsample_mode_nearest = False,     # cascading ddpm - whether to use nearest mode downsampling for lower resolution
        blur_sigma = 0.6,                           # cascading ddpm - blur sigma
        blur_kernel_size = 3,                       # cascading ddpm - blur kernel size
        clip_denoised = True,
@@ -1930,10 +1951,6 @@ class Decoder(nn.Module):
            self.unets.append(one_unet)
            self.vaes.append(one_vae.copy_for_eval())

-        # determine from unets whether conditioning on text encoding is needed
-
-        self.condition_on_text_encodings = any([unet.cond_on_text_encodings for unet in self.unets])
-
        # create noise schedulers per unet

        if not exists(beta_schedule):
@@ -1972,6 +1989,10 @@ class Decoder(nn.Module):

        self.predict_x_start = cast_tuple(predict_x_start, len(unets)) if not predict_x_start_for_latent_diffusion else tuple(map(lambda t: isinstance(t, VQGanVAE), self.vaes))

+        # input image range
+
+        self.input_image_range = (-1. if not auto_normalize_img else 0., 1.)
+
        # cascading ddpm related stuff

        lowres_conditions = tuple(map(lambda t: t.lowres_cond, self.unets))
@@ -1979,8 +2000,10 @@ class Decoder(nn.Module):

        self.to_lowres_cond = LowresConditioner(
            downsample_first = lowres_downsample_first,
+            downsample_mode_nearest = lowres_downsample_mode_nearest,
            blur_sigma = blur_sigma,
            blur_kernel_size = blur_kernel_size,
+            input_image_range = self.input_image_range
        )

        # classifier free guidance
@@ -2012,6 +2035,10 @@ class Decoder(nn.Module):
    def device(self):
        return self._dummy.device

+    @property
+    def condition_on_text_encodings(self):
+        return any([unet.cond_on_text_encodings for unet in self.unets])
+
    def get_unet(self, unet_number):
        assert 0 < unet_number <= len(self.unets)
        index = unet_number - 1
--- a/dalle2_pytorch/trainer.py
+++ b/dalle2_pytorch/trainer.py
@@ -192,6 +192,7 @@ class DiffusionPriorTrainer(nn.Module):
            self.device = diffusion_prior_device
        else:
            self.device = accelerator.device if exists(accelerator) else device
+            diffusion_prior.to(self.device)

        # save model

@@ -526,6 +527,17 @@ class DecoderTrainer(nn.Module):

        self.warmup_schedulers = warmup_schedulers

+    def validate_and_return_unet_number(self, unet_number = None):
+        if self.num_unets == 1:
+            unet_number = default(unet_number, 1)
+
+        assert exists(unet_number) and 1 <= unet_number <= self.num_unets
+        return unet_number
+
+    def num_steps_taken(self, unet_number = None):
+        unet_number = self.validate_and_return_unet_number(unet_number)
+        return self.steps[unet_number - 1].item()
+
    def save(self, path, overwrite = True, **kwargs):
        path = Path(path)
        assert not (path.exists() and not overwrite)
@@ -594,10 +606,7 @@ class DecoderTrainer(nn.Module):
        self.steps += F.one_hot(unet_index_tensor, num_classes = len(self.steps))

    def update(self, unet_number = None):
-        if self.num_unets == 1:
-            unet_number = default(unet_number, 1)
-
-        assert exists(unet_number) and 1 <= unet_number <= self.num_unets
+        unet_number = self.validate_and_return_unet_number(unet_number)
        index = unet_number - 1

        optimizer = getattr(self, f'optim{index}')
@@ -663,8 +672,7 @@ class DecoderTrainer(nn.Module):
        max_batch_size = None,
        **kwargs
    ):
-        if self.num_unets == 1:
-            unet_number = default(unet_number, 1)
+        unet_number = self.validate_and_return_unet_number(unet_number)

        total_loss = 0.

--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.16.14'
+__version__ = '0.16.19'
Author	SHA1	Message	Date
Phil Wang	d7bc5fbedd	expose num_steps_taken helper method on trainer to retrieve number of training steps of each unet	2022-07-08 13:00:56 -07:00
Phil Wang	8c823affff	allow for control over use of nearest interp method of downsampling low res conditioning, in addition to being able to turn it off	2022-07-08 11:44:43 -07:00
Phil Wang	ec7cab01d9	extra insurance that diffusion prior is on the correct device, when using trainer with accelerator or device was given	2022-07-07 10:08:33 -07:00
Phil Wang	46be8c32d3	fix a potential issue in the low resolution conditioner, when downsampling and then upsampling using resize right, thanks to @marunine	2022-07-07 09:41:49 -07:00
Phil Wang	900f086a6d	fix condition_on_text_encodings in dalle2 orchestrator class, fix readme	2022-07-07 07:43:41 -07:00