one more fix for text mask, if the length of the text encoding exceeds max_text_len, add an assert for better error msg

generate text mask within the unet and diffusion prior itself from the text encodings, if not given
shoutout for @MalumaDev
2026-02-13 21:34:21 +01:00 · 2022-07-12 15:01:46 -07:00 · 2022-07-12 12:54:59 -07:00 · 2022-07-11 16:12:35 -07:00
3 changed files with 12 additions and 14 deletions
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ This library would not have gotten to this working state without the help of
 - <a href="https://github.com/rom1504">Romain</a> for the pull request reviews and project management
 - <a href="https://github.com/Ciaohe">He Cao</a> and <a href="https://github.com/xiankgx">xiankgx</a> for the Q&A and for identifying critical bugs
 - <a href="https://github.com/marunine">Marunine</a> for identifying issues with resizing of the low resolution conditioner, when training the upsampler, in addition to various other bug fixes
+- <a href="https://github.com/malumadev">MalumaDev</a> for proposing the use of pixel shuffle upsampler for fixing checkerboard artifacts
 - <a href="https://github.com/crowsonkb">Katherine</a> for her advice
 - <a href="https://stability.ai/">Stability AI</a> for the generous sponsorship
 - <a href="https://huggingface.co">🤗 Huggingface</a> and in particular <a href="https://github.com/sgugger">Sylvain</a> for the <a href="https://github.com/huggingface/accelerate">Accelerate</a> library
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -872,7 +872,7 @@ class DiffusionPriorNetwork(nn.Module):
            text_encodings = torch.empty((batch, 0, dim), device = device, dtype = dtype)

        if not exists(mask):
-            mask = torch.ones((batch, text_encodings.shape[-2]), device = device, dtype = torch.bool)
+            mask = torch.any(text_encodings != 0., dim = -1)

        # classifier free guidance

@@ -1205,7 +1205,6 @@ class DiffusionPrior(nn.Module):

        if self.condition_on_text_encodings:
            assert exists(text_encodings), 'text encodings must be present for diffusion prior if specified'
-            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}

        # timestep conditioning from ddpm
@@ -1819,21 +1818,25 @@ class Unet(nn.Module):
        if exists(text_encodings) and self.cond_on_text_encodings:
            assert self.text_embed_dim == text_encodings.shape[-1], f'the text encodings you are passing in have a dimension of {text_encodings.shape[-1]}, but the unet was created with text_embed_dim of {self.text_embed_dim}.'

+            if not exists(text_mask):
+                text_mask = torch.any(text_encodings != 0., dim = -1)
+
            text_tokens = self.text_to_cond(text_encodings)
+
            text_tokens = text_tokens[:, :self.max_text_len]
+            text_mask = text_mask[:, :self.max_text_len]

            text_tokens_len = text_tokens.shape[1]
            remainder = self.max_text_len - text_tokens_len

            if remainder > 0:
                text_tokens = F.pad(text_tokens, (0, 0, 0, remainder))
+                text_mask = F.pad(text_mask, (0, remainder), value = False)

-            if exists(text_mask):
-                if remainder > 0:
-                    text_mask = F.pad(text_mask, (0, remainder), value = False)
+            text_mask = rearrange(text_mask, 'b n -> b n 1')

-                text_mask = rearrange(text_mask, 'b n -> b n 1')
-                text_keep_mask = text_mask & text_keep_mask
+            assert text_mask.shape[0] == text_keep_mask.shape[0], f'text_mask has shape of {text_mask.shape} while text_keep_mask has shape {text_keep_mask.shape}'
+            text_keep_mask = text_mask & text_keep_mask

            null_text_embed = self.null_text_embed.to(text_tokens.dtype) # for some reason pytorch AMP not working

@@ -2440,9 +2443,6 @@ class Decoder(nn.Module):
        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

-        if self.condition_on_text_encodings:
-            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
-
        img = None
        is_cuda = next(self.parameters()).is_cuda

@@ -2526,9 +2526,6 @@ class Decoder(nn.Module):
        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

-        if self.condition_on_text_encodings:
-            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
-
        lowres_cond_img = self.to_lowres_cond(image, target_image_size = target_image_size, downsample_image_size = self.image_sizes[unet_index - 1]) if unet_number > 1 else None
        image = resize_image_to(image, target_image_size)

--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.21.0'
+__version__ = '0.21.2'
Author	SHA1	Message	Date
Phil Wang	1ec4dbe64f	one more fix for text mask, if the length of the text encoding exceeds max_text_len, add an assert for better error msg	2022-07-12 15:01:46 -07:00
Phil Wang	e0835acca9	generate text mask within the unet and diffusion prior itself from the text encodings, if not given	2022-07-12 12:54:59 -07:00
Phil Wang	e055793e5d	shoutout for @MalumaDev	2022-07-11 16:12:35 -07:00