let the neural network peek at the low resolution conditioning one last time before making prediction, for upsamplers

just always use nearest neighbor interpolation when resizing for low resolution conditioning, for https://github.com/lucidrains/DALLE2-pytorch/pull/181
allow for using classifier free guidance for some unets but not others, by passing in a tuple of cond_scale during sampling for decoder, just in case it is causing issues for upsamplers
2026-02-12 11:34:29 +01:00 · 2022-07-14 10:27:04 -07:00 · 2022-07-13 20:59:43 -07:00 · 2022-07-13 13:12:30 -07:00 · 2022-07-13 12:56:02 -07:00
2 changed files with 25 additions and 18 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -146,7 +146,7 @@ def resize_image_to(
        scale_factors = target_image_size / orig_image_size
        out = resize(image, scale_factors = scale_factors, **kwargs)
    else:
-        out = F.interpolate(image, target_image_size, mode = 'nearest', align_corners = False)
+        out = F.interpolate(image, target_image_size, mode = 'nearest')

    if exists(clamp_range):
        out = out.clamp(*clamp_range)
@@ -278,6 +278,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
        import clip
        openai_clip, preprocess = clip.load(name)
        super().__init__(openai_clip)
+        self.eos_id = 49407 # for handling 0 being also '!'

        text_attention_final = self.find_layer('ln_final')
        self.handle = text_attention_final.register_forward_hook(self._hook)
@@ -316,7 +317,10 @@ class OpenAIClipAdapter(BaseClipAdapter):
    @torch.no_grad()
    def embed_text(self, text):
        text = text[..., :self.max_text_len]
-        text_mask = text != 0
+
+        is_eos_id = (text == self.eos_id)
+        text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
+        text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
        assert not self.cleared

        text_embed = self.clip.encode_text(text)
@@ -900,7 +904,7 @@ class DiffusionPriorNetwork(nn.Module):
        null_text_embeds = self.null_text_embed.to(text_encodings.dtype)

        text_encodings = torch.where(
-            rearrange(mask, 'b n -> b n 1'),
+            rearrange(mask, 'b n -> b n 1').clone(),
            text_encodings,
            null_text_embeds
        )
@@ -1727,7 +1731,10 @@ class Unet(nn.Module):
            ]))

        self.final_resnet_block = ResnetBlock(dim * 2, dim, time_cond_dim = time_cond_dim, groups = top_level_resnet_group)
-        self.to_out = nn.Conv2d(dim, self.channels_out, kernel_size = final_conv_kernel_size, padding = final_conv_kernel_size // 2)
+
+        out_dim_in = dim + (channels if lowres_cond else 0)
+
+        self.to_out = nn.Conv2d(out_dim_in, self.channels_out, kernel_size = final_conv_kernel_size, padding = final_conv_kernel_size // 2)

        zero_init_(self.to_out) # since both OpenAI and @crowsonkb are doing it

@@ -1947,13 +1954,16 @@ class Unet(nn.Module):
        x = torch.cat((x, r), dim = 1)

        x = self.final_resnet_block(x, t)
+
+        if exists(lowres_cond_img):
+            x = torch.cat((x, lowres_cond_img), dim = 1)
+
        return self.to_out(x)

 class LowresConditioner(nn.Module):
    def __init__(
        self,
        downsample_first = True,
-        downsample_mode_nearest = False,
        blur_prob = 0.5,
        blur_sigma = 0.6,
        blur_kernel_size = 3,
@@ -1961,8 +1971,6 @@ class LowresConditioner(nn.Module):
    ):
        super().__init__()
        self.downsample_first = downsample_first
-        self.downsample_mode_nearest = downsample_mode_nearest
-
        self.input_image_range = input_image_range

        self.blur_prob = blur_prob
@@ -1979,7 +1987,7 @@ class LowresConditioner(nn.Module):
        blur_kernel_size = None
    ):
        if self.downsample_first and exists(downsample_image_size):
-            cond_fmap = resize_image_to(cond_fmap, downsample_image_size, clamp_range = self.input_image_range, nearest = self.downsample_mode_nearest)
+            cond_fmap = resize_image_to(cond_fmap, downsample_image_size, clamp_range = self.input_image_range, nearest = True)

        # blur is only applied 50% of the time
        # section 3.1 in https://arxiv.org/abs/2106.15282
@@ -2006,7 +2014,7 @@ class LowresConditioner(nn.Module):

            cond_fmap = gaussian_blur2d(cond_fmap, cast_tuple(blur_kernel_size, 2), cast_tuple(blur_sigma, 2))

-        cond_fmap = resize_image_to(cond_fmap, target_image_size, clamp_range = self.input_image_range)
+        cond_fmap = resize_image_to(cond_fmap, target_image_size, clamp_range = self.input_image_range, nearest = True)
        return cond_fmap

 class Decoder(nn.Module):
@@ -2029,7 +2037,6 @@ class Decoder(nn.Module):
        image_sizes = None,                         # for cascading ddpm, image size at each stage
        random_crop_sizes = None,                   # whether to random crop the image at that stage in the cascade (super resoluting convolutions at the end may be able to generalize on smaller crops)
        lowres_downsample_first = True,             # cascading ddpm - resizes to lower resolution, then to next conditional resolution + blur
-        lowres_downsample_mode_nearest = False,     # cascading ddpm - whether to use nearest mode downsampling for lower resolution
        blur_prob = 0.5,                            # cascading ddpm - when training, the gaussian blur is only applied 50% of the time
        blur_sigma = 0.6,                           # cascading ddpm - blur sigma
        blur_kernel_size = 3,                       # cascading ddpm - blur kernel size
@@ -2179,11 +2186,8 @@ class Decoder(nn.Module):
        lowres_conditions = tuple(map(lambda t: t.lowres_cond, self.unets))
        assert lowres_conditions == (False, *((True,) * (len(self.unets) - 1))), 'the first unet must be unconditioned (by low resolution image), and the rest of the unets must have `lowres_cond` set to True'

-        self.lowres_downsample_mode_nearest = lowres_downsample_mode_nearest
-
        self.to_lowres_cond = LowresConditioner(
            downsample_first = lowres_downsample_first,
-            downsample_mode_nearest = lowres_downsample_mode_nearest,
            blur_prob = blur_prob,
            blur_sigma = blur_sigma,
            blur_kernel_size = blur_kernel_size,
@@ -2494,7 +2498,10 @@ class Decoder(nn.Module):
        img = None
        is_cuda = next(self.parameters()).is_cuda

-        for unet_number, unet, vae, channel, image_size, predict_x_start, learned_variance, noise_scheduler, sample_timesteps in tqdm(zip(range(1, len(self.unets) + 1), self.unets, self.vaes, self.sample_channels, self.image_sizes, self.predict_x_start, self.learned_variance, self.noise_schedulers, self.sample_timesteps)):
+        num_unets = len(self.unets)
+        cond_scale = cast_tuple(cond_scale, num_unets)
+
+        for unet_number, unet, vae, channel, image_size, predict_x_start, learned_variance, noise_scheduler, sample_timesteps, unet_cond_scale in tqdm(zip(range(1, num_unets + 1), self.unets, self.vaes, self.sample_channels, self.image_sizes, self.predict_x_start, self.learned_variance, self.noise_schedulers, self.sample_timesteps, cond_scale)):

            context = self.one_unet_in_gpu(unet = unet) if is_cuda and not distributed else null_context()

@@ -2503,7 +2510,7 @@ class Decoder(nn.Module):
                shape = (batch_size, channel, image_size, image_size)

                if unet.lowres_cond:
-                    lowres_cond_img = resize_image_to(img, target_image_size = image_size, clamp_range = self.input_image_range, nearest = self.lowres_downsample_mode_nearest)
+                    lowres_cond_img = resize_image_to(img, target_image_size = image_size, clamp_range = self.input_image_range, nearest = True)

                is_latent_diffusion = isinstance(vae, VQGanVAE)
                image_size = vae.get_encoded_fmap_size(image_size)
@@ -2516,7 +2523,7 @@ class Decoder(nn.Module):
                    shape,
                    image_embed = image_embed,
                    text_encodings = text_encodings,
-                    cond_scale = cond_scale,
+                    cond_scale = unet_cond_scale,
                    predict_x_start = predict_x_start,
                    learned_variance = learned_variance,
                    clip_denoised = not is_latent_diffusion,
@@ -2573,7 +2580,7 @@ class Decoder(nn.Module):
        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

        lowres_cond_img = self.to_lowres_cond(image, target_image_size = target_image_size, downsample_image_size = self.image_sizes[unet_index - 1]) if unet_number > 1 else None
-        image = resize_image_to(image, target_image_size)
+        image = resize_image_to(image, target_image_size, nearest = True)

        if exists(random_crop_size):
            aug = K.RandomCrop((random_crop_size, random_crop_size), p = 1.)
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.23.7'
+__version__ = '0.24.0'
Author	SHA1	Message	Date
Phil Wang	a34f60962a	let the neural network peek at the low resolution conditioning one last time before making prediction, for upsamplers	2022-07-14 10:27:04 -07:00
Phil Wang	0b40cbaa54	just always use nearest neighbor interpolation when resizing for low resolution conditioning, for https://github.com/lucidrains/DALLE2-pytorch/pull/181	2022-07-13 20:59:43 -07:00
Phil Wang	f141144a6d	allow for using classifier free guidance for some unets but not others, by passing in a tuple of cond_scale during sampling for decoder, just in case it is causing issues for upsamplers	2022-07-13 13:12:30 -07:00
Phil Wang	f988207718	hack around some inplace error, also make sure for openai clip text encoding, only tokens after eos_id is masked out	2022-07-13 12:56:02 -07:00