allow for using classifier free guidance for some unets but not others, by passing in a tuple of cond_scale during sampling for decoder, just in case it is causing issues for upsamplers

hack around some inplace error, also make sure for openai clip text encoding, only tokens after eos_id are masked out
foolproof sampling for decoder to always use eval mode (and restore training state afterwards)
2026-02-13 20:34:22 +01:00 · 2022-07-13 13:12:30 -07:00 · 2022-07-13 12:56:02 -07:00 · 2022-07-13 10:21:00 -07:00 · 2022-07-13 10:16:02 -07:00 · 2022-07-13 10:11:49 -07:00
3 changed files with 45 additions and 11 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -278,6 +278,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
        import clip
        openai_clip, preprocess = clip.load(name)
        super().__init__(openai_clip)
+        self.eos_id = 49407 # for handling 0 being also '!'

        text_attention_final = self.find_layer('ln_final')
        self.handle = text_attention_final.register_forward_hook(self._hook)
@@ -316,7 +317,10 @@ class OpenAIClipAdapter(BaseClipAdapter):
    @torch.no_grad()
    def embed_text(self, text):
        text = text[..., :self.max_text_len]
-        text_mask = text != 0
+
+        is_eos_id = (text == self.eos_id)
+        text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
+        text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
        assert not self.cleared

        text_embed = self.clip.encode_text(text)
@@ -900,7 +904,7 @@ class DiffusionPriorNetwork(nn.Module):
        null_text_embeds = self.null_text_embed.to(text_encodings.dtype)

        text_encodings = torch.where(
-            rearrange(mask, 'b n -> b n 1'),
+            rearrange(mask, 'b n -> b n 1').clone(),
            text_encodings,
            null_text_embeds
        )
@@ -1251,6 +1255,14 @@ class DiffusionPrior(nn.Module):

 # decoder

+def NearestUpsample(dim, dim_out = None):
+    dim_out = default(dim_out, dim)
+
+    return nn.Sequential(
+        nn.Upsample(scale_factor = 2, mode = 'nearest'),
+        nn.Conv2d(dim, dim_out, 3, padding = 1)
+    )
+
 class PixelShuffleUpsample(nn.Module):
    """
    code shared by @MalumaDev at DALLE2-pytorch for addressing checkboard artifacts
@@ -1657,7 +1669,7 @@ class Unet(nn.Module):

        # upsample klass

-        upsample_klass = ConvTransposeUpsample if not pixel_shuffle_upsample else PixelShuffleUpsample
+        upsample_klass = NearestUpsample if not pixel_shuffle_upsample else PixelShuffleUpsample

        # give memory efficient unet an initial resnet block

@@ -1946,6 +1958,7 @@ class LowresConditioner(nn.Module):
        self,
        downsample_first = True,
        downsample_mode_nearest = False,
+        blur_prob = 0.5,
        blur_sigma = 0.6,
        blur_kernel_size = 3,
        input_image_range = None
@@ -1956,6 +1969,7 @@ class LowresConditioner(nn.Module):

        self.input_image_range = input_image_range

+        self.blur_prob = blur_prob
        self.blur_sigma = blur_sigma
        self.blur_kernel_size = blur_kernel_size

@@ -1968,20 +1982,27 @@ class LowresConditioner(nn.Module):
        blur_sigma = None,
        blur_kernel_size = None
    ):
-        if self.training and self.downsample_first and exists(downsample_image_size):
+        if self.downsample_first and exists(downsample_image_size):
            cond_fmap = resize_image_to(cond_fmap, downsample_image_size, clamp_range = self.input_image_range, nearest = self.downsample_mode_nearest)

-        if self.training:
+        # blur is only applied 50% of the time
+        # section 3.1 in https://arxiv.org/abs/2106.15282
+
+        if random.random() < self.blur_prob:
+
            # when training, blur the low resolution conditional image
+
            blur_sigma = default(blur_sigma, self.blur_sigma)
            blur_kernel_size = default(blur_kernel_size, self.blur_kernel_size)

            # allow for drawing a random sigma between lo and hi float values
+
            if isinstance(blur_sigma, tuple):
                blur_sigma = tuple(map(float, blur_sigma))
                blur_sigma = random.uniform(*blur_sigma)

            # allow for drawing a random kernel size between lo and hi int values
+
            if isinstance(blur_kernel_size, tuple):
                blur_kernel_size = tuple(map(int, blur_kernel_size))
                kernel_size_lo, kernel_size_hi = blur_kernel_size
@@ -1990,7 +2011,6 @@ class LowresConditioner(nn.Module):
            cond_fmap = gaussian_blur2d(cond_fmap, cast_tuple(blur_kernel_size, 2), cast_tuple(blur_sigma, 2))

        cond_fmap = resize_image_to(cond_fmap, target_image_size, clamp_range = self.input_image_range)
-
        return cond_fmap

 class Decoder(nn.Module):
@@ -2014,6 +2034,7 @@ class Decoder(nn.Module):
        random_crop_sizes = None,                   # whether to random crop the image at that stage in the cascade (super resoluting convolutions at the end may be able to generalize on smaller crops)
        lowres_downsample_first = True,             # cascading ddpm - resizes to lower resolution, then to next conditional resolution + blur
        lowres_downsample_mode_nearest = False,     # cascading ddpm - whether to use nearest mode downsampling for lower resolution
+        blur_prob = 0.5,                            # cascading ddpm - when training, the gaussian blur is only applied 50% of the time
        blur_sigma = 0.6,                           # cascading ddpm - blur sigma
        blur_kernel_size = 3,                       # cascading ddpm - blur kernel size
        clip_denoised = True,
@@ -2162,9 +2183,12 @@ class Decoder(nn.Module):
        lowres_conditions = tuple(map(lambda t: t.lowres_cond, self.unets))
        assert lowres_conditions == (False, *((True,) * (len(self.unets) - 1))), 'the first unet must be unconditioned (by low resolution image), and the rest of the unets must have `lowres_cond` set to True'

+        self.lowres_downsample_mode_nearest = lowres_downsample_mode_nearest
+
        self.to_lowres_cond = LowresConditioner(
            downsample_first = lowres_downsample_first,
            downsample_mode_nearest = lowres_downsample_mode_nearest,
+            blur_prob = blur_prob,
            blur_sigma = blur_sigma,
            blur_kernel_size = blur_kernel_size,
            input_image_range = self.input_image_range
@@ -2474,7 +2498,10 @@ class Decoder(nn.Module):
        img = None
        is_cuda = next(self.parameters()).is_cuda

-        for unet_number, unet, vae, channel, image_size, predict_x_start, learned_variance, noise_scheduler, sample_timesteps in tqdm(zip(range(1, len(self.unets) + 1), self.unets, self.vaes, self.sample_channels, self.image_sizes, self.predict_x_start, self.learned_variance, self.noise_schedulers, self.sample_timesteps)):
+        num_unets = len(self.unets)
+        cond_scale = cast_tuple(cond_scale, num_unets)
+
+        for unet_number, unet, vae, channel, image_size, predict_x_start, learned_variance, noise_scheduler, sample_timesteps, unet_cond_scale in tqdm(zip(range(1, num_unets + 1), self.unets, self.vaes, self.sample_channels, self.image_sizes, self.predict_x_start, self.learned_variance, self.noise_schedulers, self.sample_timesteps, cond_scale)):

            context = self.one_unet_in_gpu(unet = unet) if is_cuda and not distributed else null_context()

@@ -2483,7 +2510,7 @@ class Decoder(nn.Module):
                shape = (batch_size, channel, image_size, image_size)

                if unet.lowres_cond:
-                    lowres_cond_img = self.to_lowres_cond(img, target_image_size = image_size)
+                    lowres_cond_img = resize_image_to(img, target_image_size = image_size, clamp_range = self.input_image_range, nearest = self.lowres_downsample_mode_nearest)

                is_latent_diffusion = isinstance(vae, VQGanVAE)
                image_size = vae.get_encoded_fmap_size(image_size)
@@ -2496,7 +2523,7 @@ class Decoder(nn.Module):
                    shape,
                    image_embed = image_embed,
                    text_encodings = text_encodings,
-                    cond_scale = cond_scale,
+                    cond_scale = unet_cond_scale,
                    predict_x_start = predict_x_start,
                    learned_variance = learned_variance,
                    clip_denoised = not is_latent_diffusion,
--- a/dalle2_pytorch/trainer.py
+++ b/dalle2_pytorch/trainer.py
@@ -673,8 +673,14 @@ class DecoderTrainer(nn.Module):
    def sample(self, *args, **kwargs):
        distributed = self.accelerator.num_processes > 1
        base_decoder = self.accelerator.unwrap_model(self.decoder)
+
+        was_training = base_decoder.training
+        base_decoder.eval()
+
        if kwargs.pop('use_non_ema', False) or not self.use_ema:
-            return base_decoder.sample(*args, **kwargs, distributed = distributed)
+            out = base_decoder.sample(*args, **kwargs, distributed = distributed)
+            base_decoder.train(was_training)
+            return out

        trainable_unets = self.accelerator.unwrap_model(self.decoder).unets
        base_decoder.unets = self.unets                  # swap in exponential moving averaged unets for sampling
@@ -687,6 +693,7 @@ class DecoderTrainer(nn.Module):
        for ema in self.ema_unets:
            ema.restore_ema_model_device()

+        base_decoder.train(was_training)
        return output

    @torch.no_grad()
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.23.4'
+__version__ = '0.23.9'
Author	SHA1	Message	Date
Phil Wang	f141144a6d	allow for using classifier free guidance for some unets but not others, by passing in a tuple of cond_scale during sampling for decoder, just in case it is causing issues for upsamplers	2022-07-13 13:12:30 -07:00
Phil Wang	f988207718	hack around some inplace error, also make sure for openai clip text encoding, only tokens after eos_id are masked out	2022-07-13 12:56:02 -07:00
Phil Wang	b2073219f0	foolproof sampling for decoder to always use eval mode (and restore training state afterwards)	2022-07-13 10:21:00 -07:00
Phil Wang	cc0f7a935c	fix non pixel shuffle upsample	2022-07-13 10:16:02 -07:00
Phil Wang	95a512cb65	fix a potential bug with conditioning with blurred low resolution image, blur should be applied only 50% of the time	2022-07-13 10:11:49 -07:00