hack around some in-place error, also make sure that for openai clip text encoding, only tokens after eos_id are masked out

foolproof sampling for decoder to always use eval mode (and restore training state afterwards)
fix non-pixel-shuffle upsample
2026-02-13 12:04:24 +01:00 · 2022-07-13 12:56:02 -07:00 · 2022-07-13 10:21:00 -07:00 · 2022-07-13 10:16:02 -07:00
3 changed files with 24 additions and 5 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -278,6 +278,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
        import clip
        openai_clip, preprocess = clip.load(name)
        super().__init__(openai_clip)
+        self.eos_id = 49407 # for handling 0 being also '!'

        text_attention_final = self.find_layer('ln_final')
        self.handle = text_attention_final.register_forward_hook(self._hook)
@@ -316,7 +317,10 @@ class OpenAIClipAdapter(BaseClipAdapter):
    @torch.no_grad()
    def embed_text(self, text):
        text = text[..., :self.max_text_len]
-        text_mask = text != 0
+
+        is_eos_id = (text == self.eos_id)
+        text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
+        text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
        assert not self.cleared

        text_embed = self.clip.encode_text(text)
@@ -900,7 +904,7 @@ class DiffusionPriorNetwork(nn.Module):
        null_text_embeds = self.null_text_embed.to(text_encodings.dtype)

        text_encodings = torch.where(
-            rearrange(mask, 'b n -> b n 1'),
+            rearrange(mask, 'b n -> b n 1').clone(),
            text_encodings,
            null_text_embeds
        )
@@ -1251,6 +1255,14 @@ class DiffusionPrior(nn.Module):

 # decoder

+def NearestUpsample(dim, dim_out = None):
+    dim_out = default(dim_out, dim)
+
+    return nn.Sequential(
+        nn.Upsample(scale_factor = 2, mode = 'nearest'),
+        nn.Conv2d(dim, dim_out, 3, padding = 1)
+    )
+
 class PixelShuffleUpsample(nn.Module):
    """
    code shared by @MalumaDev at DALLE2-pytorch for addressing checkboard artifacts
@@ -1657,7 +1669,7 @@ class Unet(nn.Module):

        # upsample klass

-        upsample_klass = ConvTransposeUpsample if not pixel_shuffle_upsample else PixelShuffleUpsample
+        upsample_klass = NearestUpsample if not pixel_shuffle_upsample else PixelShuffleUpsample

        # give memory efficient unet an initial resnet block

--- a/dalle2_pytorch/trainer.py
+++ b/dalle2_pytorch/trainer.py
@@ -673,8 +673,14 @@ class DecoderTrainer(nn.Module):
    def sample(self, *args, **kwargs):
        distributed = self.accelerator.num_processes > 1
        base_decoder = self.accelerator.unwrap_model(self.decoder)
+
+        was_training = base_decoder.training
+        base_decoder.eval()
+
        if kwargs.pop('use_non_ema', False) or not self.use_ema:
-            return base_decoder.sample(*args, **kwargs, distributed = distributed)
+            out = base_decoder.sample(*args, **kwargs, distributed = distributed)
+            base_decoder.train(was_training)
+            return out

        trainable_unets = self.accelerator.unwrap_model(self.decoder).unets
        base_decoder.unets = self.unets                  # swap in exponential moving averaged unets for sampling
@@ -687,6 +693,7 @@ class DecoderTrainer(nn.Module):
        for ema in self.ema_unets:
            ema.restore_ema_model_device()

+        base_decoder.train(was_training)
        return output

    @torch.no_grad()
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.23.5'
+__version__ = '0.23.8'
Author	SHA1	Message	Date
Phil Wang	f988207718	hack around some in-place error, also make sure that for openai clip text encoding, only tokens after eos_id are masked out	2022-07-13 12:56:02 -07:00
Phil Wang	b2073219f0	foolproof sampling for decoder to always use eval mode (and restore training state afterwards)	2022-07-13 10:21:00 -07:00
Phil Wang	cc0f7a935c	fix non-pixel-shuffle upsample	2022-07-13 10:16:02 -07:00