0.22.3

add setting to attend to all text encodings regardless of padding, for diffusion prior
make sure text encodings being passed in has the correct batch dimension
2026-02-12 11:34:29 +01:00 · 2022-07-12 17:08:31 -07:00 · 2022-07-12 17:08:12 -07:00 · 2022-07-12 16:00:19 -07:00 · 2022-07-12 15:40:31 -07:00 · 2022-07-12 15:33:13 -07:00
5 changed files with 80 additions and 62 deletions
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ This library would not have gotten to this working state without the help of
 - <a href="https://github.com/rom1504">Romain</a> for the pull request reviews and project management
 - <a href="https://github.com/Ciaohe">He Cao</a> and <a href="https://github.com/xiankgx">xiankgx</a> for the Q&A and for identifying critical bugs
 - <a href="https://github.com/marunine">Marunine</a> for identifying issues with resizing of the low resolution conditioner, when training the upsampler, in addition to various other bug fixes
+- <a href="https://github.com/malumadev">MalumaDev</a> for proposing the use of a pixel shuffle upsampler for fixing checkerboard artifacts
 - <a href="https://github.com/crowsonkb">Katherine</a> for her advice
 - <a href="https://stability.ai/">Stability AI</a> for the generous sponsorship
 - <a href="https://huggingface.co">🤗 Huggingface</a> and in particular <a href="https://github.com/sgugger">Sylvain</a> for the <a href="https://github.com/huggingface/accelerate">Accelerate</a> library
@@ -420,7 +421,7 @@ For the layperson, no worries, training will all be automated into a CLI tool, a

 ## Training on Preprocessed CLIP Embeddings

-It is likely, when scaling up, that you would first preprocess your images and text into corresponding embeddings before training the prior network. You can do so easily by simply passing in `image_embed`, `text_embed`, and optionally `text_encodings` and `text_mask`
+It is likely, when scaling up, that you would first preprocess your images and text into corresponding embeddings before training the prior network. You can do so easily by simply passing in `image_embed`, `text_embed`, and optionally `text_encodings`

 Working example below

--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -77,6 +77,11 @@ def cast_tuple(val, length = None):
 def module_device(module):
    return next(module.parameters()).device

+def zero_init_(m):
+    nn.init.zeros_(m.weight)
+    if exists(m.bias):
+        nn.init.zeros_(m.bias)
+
@contextmanager
 def null_context(*args, **kwargs):
    yield
@@ -160,7 +165,7 @@ def unnormalize_zero_to_one(normed_img):

 # clip related adapters

-EmbeddedText = namedtuple('EmbedTextReturn', ['text_embed', 'text_encodings', 'text_mask'])
+EmbeddedText = namedtuple('EmbedTextReturn', ['text_embed', 'text_encodings'])
 EmbeddedImage = namedtuple('EmbedImageReturn', ['image_embed', 'image_encodings'])

 class BaseClipAdapter(nn.Module):
@@ -221,7 +226,7 @@ class XClipAdapter(BaseClipAdapter):
        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
        text_embed = self.clip.to_text_latent(text_cls)
        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
-        return EmbeddedText(l2norm(text_embed), text_encodings, text_mask)
+        return EmbeddedText(l2norm(text_embed), text_encodings)

    @torch.no_grad()
    def embed_image(self, image):
@@ -257,7 +262,7 @@ class CoCaAdapter(BaseClipAdapter):
        text_mask = text != 0
        text_embed, text_encodings = self.clip.embed_text(text)
        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
-        return EmbeddedText(text_embed, text_encodings, text_mask)
+        return EmbeddedText(text_embed, text_encodings)

    @torch.no_grad()
    def embed_image(self, image):
@@ -318,7 +323,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
        text_encodings = self.text_encodings
        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
        del self.text_encodings
-        return EmbeddedText(l2norm(text_embed.float()), text_encodings.float(), text_mask)
+        return EmbeddedText(l2norm(text_embed.float()), text_encodings.float())

    @torch.no_grad()
    def embed_image(self, image):
@@ -801,6 +806,7 @@ class DiffusionPriorNetwork(nn.Module):
        num_time_embeds = 1,
        num_image_embeds = 1,
        num_text_embeds = 1,
+        attend_all_text_encodings = True,
        **kwargs
    ):
        super().__init__()
@@ -826,6 +832,8 @@ class DiffusionPriorNetwork(nn.Module):
        self.learned_query = nn.Parameter(torch.randn(dim))
        self.causal_transformer = CausalTransformer(dim = dim, **kwargs)

+        self.attend_all_text_encodings = attend_all_text_encodings
+
    def forward_with_cond_scale(
        self,
        *args,
@@ -847,7 +855,6 @@ class DiffusionPriorNetwork(nn.Module):
        *,
        text_embed,
        text_encodings = None,
-        mask = None,
        cond_drop_prob = 0.
    ):
        batch, dim, device, dtype = *image_embed.shape, image_embed.device, image_embed.dtype
@@ -866,8 +873,10 @@ class DiffusionPriorNetwork(nn.Module):
        if not exists(text_encodings):
            text_encodings = torch.empty((batch, 0, dim), device = device, dtype = dtype)

-        if not exists(mask):
+        if self.attend_all_text_encodings:
            mask = torch.ones((batch, text_encodings.shape[-2]), device = device, dtype = torch.bool)
+        else:
+            mask = torch.any(text_encodings != 0., dim = -1)

        # classifier free guidance

@@ -884,9 +893,8 @@ class DiffusionPriorNetwork(nn.Module):
        # whether text embedding is used for conditioning depends on whether text encodings are available for attention (for classifier free guidance, even though it seems from the paper it was not used in the prior ddpm, as the objective is different)
        # but let's just do it right

-        if exists(mask):
-            attend_padding = 1 + num_time_embeds + num_image_embeds # 1 for learned queries + number of image embeds + time embeds
-            mask = F.pad(mask, (0, attend_padding), value = True) # extend mask for text embedding, noised image embedding, time step embedding, and learned query
+        attend_padding = 1 + num_time_embeds + num_image_embeds # 1 for learned queries + number of image embeds + time embeds
+        mask = F.pad(mask, (0, attend_padding), value = True) # extend mask for text embedding, noised image embedding, time step embedding, and learned query

        time_embed = self.to_time_embeds(diffusion_timesteps)

@@ -1147,12 +1155,12 @@ class DiffusionPrior(nn.Module):
        batch_size = text.shape[0]
        image_embed_dim = self.image_embed_dim

-        text_embed, text_encodings, text_mask = self.clip.embed_text(text)
+        text_embed, text_encodings = self.clip.embed_text(text)

        text_cond = dict(text_embed = text_embed)

        if self.condition_on_text_encodings:
-            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
+            text_cond = {**text_cond, 'text_encodings': text_encodings}

        image_embeds = self.p_sample_loop((batch_size, image_embed_dim), text_cond = text_cond, cond_scale = cond_scale, timesteps = timesteps)

@@ -1180,7 +1188,6 @@ class DiffusionPrior(nn.Module):
        text_embed = None,      # allow for training on preprocessed CLIP text and image embeddings
        image_embed = None,
        text_encodings = None,  # as well as CLIP text encodings
-        text_mask = None,       # text mask <- may eventually opt for the learned padding tokens technique from DALL-E1 to reduce complexity
        *args,
        **kwargs
    ):
@@ -1194,14 +1201,13 @@ class DiffusionPrior(nn.Module):
        # calculate text conditionings, based on what is passed in

        if exists(text):
-            text_embed, text_encodings, text_mask = self.clip.embed_text(text)
+            text_embed, text_encodings = self.clip.embed_text(text)

        text_cond = dict(text_embed = text_embed)

        if self.condition_on_text_encodings:
            assert exists(text_encodings), 'text encodings must be present for diffusion prior if specified'
-            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
-            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
+            text_cond = {**text_cond, 'text_encodings': text_encodings}

        # timestep conditioning from ddpm

@@ -1218,16 +1224,35 @@ class DiffusionPrior(nn.Module):

 # decoder

-def ConvTransposeUpsample(dim, dim_out = None):
-    dim_out = default(dim_out, dim)
-    return nn.ConvTranspose2d(dim, dim_out, 4, 2, 1)
+class PixelShuffleUpsample(nn.Module):
+    """
+    code shared by @MalumaDev at DALLE2-pytorch for addressing checkboard artifacts
+    https://arxiv.org/ftp/arxiv/papers/1707/1707.02937.pdf
+    """
+    def __init__(self, dim, dim_out = None):
+        super().__init__()
+        dim_out = default(dim_out, dim)
+        conv = nn.Conv2d(dim, dim_out * 4, 1)

-def NearestUpsample(dim, dim_out = None):
-    dim_out = default(dim_out, dim)
-    return nn.Sequential(
-        nn.Upsample(scale_factor = 2, mode = 'nearest'),
-        nn.Conv2d(dim, dim_out, 3, padding = 1)
-    )
+        self.net = nn.Sequential(
+            conv,
+            nn.SiLU(),
+            nn.PixelShuffle(2)
+        )
+
+        self.init_conv_(conv)
+
+    def init_conv_(self, conv):
+        o, i, h, w = conv.weight.shape
+        conv_weight = torch.empty(o // 4, i, h, w)
+        nn.init.kaiming_uniform_(conv_weight)
+        conv_weight = repeat(conv_weight, 'o ... -> (o 4) ...')
+
+        conv.weight.data.copy_(conv_weight)
+        nn.init.zeros_(conv.bias.data)
+
+    def forward(self, x):
+        return self.net(x)

 def Downsample(dim, *, dim_out = None):
    dim_out = default(dim_out, dim)
@@ -1491,7 +1516,7 @@ class Unet(nn.Module):
        cross_embed_downsample_kernel_sizes = (2, 4),
        memory_efficient = False,
        scale_skip_connection = False,
-        nearest_upsample = False,
+        pixel_shuffle_upsample = True,
        final_conv_kernel_size = 1,
        **kwargs
    ):
@@ -1605,7 +1630,7 @@ class Unet(nn.Module):

        # upsample klass

-        upsample_klass = ConvTransposeUpsample if not nearest_upsample else NearestUpsample
+        upsample_klass = ConvTransposeUpsample if not pixel_shuffle_upsample else PixelShuffleUpsample

        # give memory efficient unet an initial resnet block

@@ -1669,6 +1694,8 @@ class Unet(nn.Module):
        self.final_resnet_block = ResnetBlock(dim * 2, dim, time_cond_dim = time_cond_dim, groups = top_level_resnet_group)
        self.to_out = nn.Conv2d(dim, self.channels_out, kernel_size = final_conv_kernel_size, padding = final_conv_kernel_size // 2)

+        zero_init_(self.to_out) # since both OpenAI and @crowsonkb are doing it
+
    # if the current settings for the unet are not correct
    # for cascading DDPM, then reinit the unet with the right settings
    def cast_model_parameters(
@@ -1719,7 +1746,6 @@ class Unet(nn.Module):
        image_embed,
        lowres_cond_img = None,
        text_encodings = None,
-        text_mask = None,
        image_cond_drop_prob = 0.,
        text_cond_drop_prob = 0.,
        blur_sigma = None,
@@ -1791,23 +1817,27 @@ class Unet(nn.Module):
        text_tokens = None

        if exists(text_encodings) and self.cond_on_text_encodings:
+            assert text_encodings.shape[0] == batch_size, f'the text encodings being passed into the unet does not have the proper batch size - text encoding shape {text_encodings.shape} - required batch size is {batch_size}'
            assert self.text_embed_dim == text_encodings.shape[-1], f'the text encodings you are passing in have a dimension of {text_encodings.shape[-1]}, but the unet was created with text_embed_dim of {self.text_embed_dim}.'

+            text_mask = torch.any(text_encodings != 0., dim = -1)
+
            text_tokens = self.text_to_cond(text_encodings)
+
            text_tokens = text_tokens[:, :self.max_text_len]
+            text_mask = text_mask[:, :self.max_text_len]

            text_tokens_len = text_tokens.shape[1]
            remainder = self.max_text_len - text_tokens_len

            if remainder > 0:
                text_tokens = F.pad(text_tokens, (0, 0, 0, remainder))
+                text_mask = F.pad(text_mask, (0, remainder), value = False)

-            if exists(text_mask):
-                if remainder > 0:
-                    text_mask = F.pad(text_mask, (0, remainder), value = False)
+            text_mask = rearrange(text_mask, 'b n -> b n 1')

-                text_mask = rearrange(text_mask, 'b n -> b n 1')
-                text_keep_mask = text_mask & text_keep_mask
+            assert text_mask.shape[0] == text_keep_mask.shape[0], f'text_mask has shape of {text_mask.shape} while text_keep_mask has shape {text_keep_mask.shape}. text encoding is of shape {text_encodings.shape}'
+            text_keep_mask = text_mask & text_keep_mask

            null_text_embed = self.null_text_embed.to(text_tokens.dtype) # for some reason pytorch AMP not working

@@ -2189,10 +2219,10 @@ class Decoder(nn.Module):
        x = x.clamp(-s, s) / s
        return x

-    def p_mean_variance(self, unet, x, t, image_embed, noise_scheduler, text_encodings = None, text_mask = None, lowres_cond_img = None, clip_denoised = True, predict_x_start = False, learned_variance = False, cond_scale = 1., model_output = None):
+    def p_mean_variance(self, unet, x, t, image_embed, noise_scheduler, text_encodings = None, lowres_cond_img = None, clip_denoised = True, predict_x_start = False, learned_variance = False, cond_scale = 1., model_output = None):
        assert not (cond_scale != 1. and not self.can_classifier_guidance), 'the decoder was not trained with conditional dropout, and thus one cannot use classifier free guidance (cond_scale anything other than 1)'

-        pred = default(model_output, lambda: unet.forward_with_cond_scale(x, t, image_embed = image_embed, text_encodings = text_encodings, text_mask = text_mask, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img))
+        pred = default(model_output, lambda: unet.forward_with_cond_scale(x, t, image_embed = image_embed, text_encodings = text_encodings, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img))

        if learned_variance:
            pred, var_interp_frac_unnormalized = pred.chunk(2, dim = 1)
@@ -2224,16 +2254,16 @@ class Decoder(nn.Module):
        return model_mean, posterior_variance, posterior_log_variance

    @torch.no_grad()
-    def p_sample(self, unet, x, t, image_embed, noise_scheduler, text_encodings = None, text_mask = None, cond_scale = 1., lowres_cond_img = None, predict_x_start = False, learned_variance = False, clip_denoised = True):
+    def p_sample(self, unet, x, t, image_embed, noise_scheduler, text_encodings = None, cond_scale = 1., lowres_cond_img = None, predict_x_start = False, learned_variance = False, clip_denoised = True):
        b, *_, device = *x.shape, x.device
-        model_mean, _, model_log_variance = self.p_mean_variance(unet, x = x, t = t, image_embed = image_embed, text_encodings = text_encodings, text_mask = text_mask, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img, clip_denoised = clip_denoised, predict_x_start = predict_x_start, noise_scheduler = noise_scheduler, learned_variance = learned_variance)
+        model_mean, _, model_log_variance = self.p_mean_variance(unet, x = x, t = t, image_embed = image_embed, text_encodings = text_encodings, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img, clip_denoised = clip_denoised, predict_x_start = predict_x_start, noise_scheduler = noise_scheduler, learned_variance = learned_variance)
        noise = torch.randn_like(x)
        # no noise when t == 0
        nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
        return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise

    @torch.no_grad()
-    def p_sample_loop_ddpm(self, unet, shape, image_embed, noise_scheduler, predict_x_start = False, learned_variance = False, clip_denoised = True, lowres_cond_img = None, text_encodings = None, text_mask = None, cond_scale = 1, is_latent_diffusion = False):
+    def p_sample_loop_ddpm(self, unet, shape, image_embed, noise_scheduler, predict_x_start = False, learned_variance = False, clip_denoised = True, lowres_cond_img = None, text_encodings = None, cond_scale = 1, is_latent_diffusion = False):
        device = self.device

        b = shape[0]
@@ -2249,7 +2279,6 @@ class Decoder(nn.Module):
                torch.full((b,), i, device = device, dtype = torch.long),
                image_embed = image_embed,
                text_encodings = text_encodings,
-                text_mask = text_mask,
                cond_scale = cond_scale,
                lowres_cond_img = lowres_cond_img,
                predict_x_start = predict_x_start,
@@ -2262,7 +2291,7 @@ class Decoder(nn.Module):
        return unnormalize_img

    @torch.no_grad()
-    def p_sample_loop_ddim(self, unet, shape, image_embed, noise_scheduler, timesteps, eta = 1., predict_x_start = False, learned_variance = False, clip_denoised = True, lowres_cond_img = None, text_encodings = None, text_mask = None, cond_scale = 1, is_latent_diffusion = False):
+    def p_sample_loop_ddim(self, unet, shape, image_embed, noise_scheduler, timesteps, eta = 1., predict_x_start = False, learned_variance = False, clip_denoised = True, lowres_cond_img = None, text_encodings = None, cond_scale = 1, is_latent_diffusion = False):
        batch, device, total_timesteps, alphas, eta = shape[0], self.device, noise_scheduler.num_timesteps, noise_scheduler.alphas_cumprod_prev, self.ddim_sampling_eta

        times = torch.linspace(0., total_timesteps, steps = timesteps + 2)[:-1]
@@ -2278,7 +2307,7 @@ class Decoder(nn.Module):

            time_cond = torch.full((batch,), time, device = device, dtype = torch.long)

-            pred = unet.forward_with_cond_scale(img, time_cond, image_embed = image_embed, text_encodings = text_encodings, text_mask = text_mask, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img)
+            pred = unet.forward_with_cond_scale(img, time_cond, image_embed = image_embed, text_encodings = text_encodings, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img)

            if learned_variance:
                pred, _ = pred.chunk(2, dim = 1)
@@ -2317,7 +2346,7 @@ class Decoder(nn.Module):

        return self.p_sample_loop_ddim(*args, noise_scheduler = noise_scheduler, timesteps = timesteps, **kwargs)

-    def p_losses(self, unet, x_start, times, *, image_embed, noise_scheduler, lowres_cond_img = None, text_encodings = None, text_mask = None, predict_x_start = False, noise = None, learned_variance = False, clip_denoised = False, is_latent_diffusion = False):
+    def p_losses(self, unet, x_start, times, *, image_embed, noise_scheduler, lowres_cond_img = None, text_encodings = None, predict_x_start = False, noise = None, learned_variance = False, clip_denoised = False, is_latent_diffusion = False):
        noise = default(noise, lambda: torch.randn_like(x_start))

        # normalize to [-1, 1]
@@ -2335,7 +2364,6 @@ class Decoder(nn.Module):
            times,
            image_embed = image_embed,
            text_encodings = text_encodings,
-            text_mask = text_mask,
            lowres_cond_img = lowres_cond_img,
            image_cond_drop_prob = self.image_cond_drop_prob,
            text_cond_drop_prob = self.text_cond_drop_prob,
@@ -2395,7 +2423,6 @@ class Decoder(nn.Module):
        self,
        image_embed = None,
        text = None,
-        text_mask = None,
        text_encodings = None,
        batch_size = 1,
        cond_scale = 1.,
@@ -2409,14 +2436,11 @@ class Decoder(nn.Module):

        if exists(text) and not exists(text_encodings) and not self.unconditional:
            assert exists(self.clip)
-            _, text_encodings, text_mask = self.clip.embed_text(text)
+            _, text_encodings = self.clip.embed_text(text)

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

-        if self.condition_on_text_encodings:
-            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
-
        img = None
        is_cuda = next(self.parameters()).is_cuda

@@ -2442,7 +2466,6 @@ class Decoder(nn.Module):
                    shape,
                    image_embed = image_embed,
                    text_encodings = text_encodings,
-                    text_mask = text_mask,
                    cond_scale = cond_scale,
                    predict_x_start = predict_x_start,
                    learned_variance = learned_variance,
@@ -2466,7 +2489,6 @@ class Decoder(nn.Module):
        text = None,
        image_embed = None,
        text_encodings = None,
-        text_mask = None,
        unet_number = None,
        return_lowres_cond_image = False # whether to return the low resolution conditioning images, for debugging upsampler purposes
    ):
@@ -2495,14 +2517,11 @@ class Decoder(nn.Module):

        if exists(text) and not exists(text_encodings) and not self.unconditional:
            assert exists(self.clip), 'if you are passing in raw text, you need to supply `clip` to the decoder'
-            _, text_encodings, text_mask = self.clip.embed_text(text)
+            _, text_encodings = self.clip.embed_text(text)

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

-        if self.condition_on_text_encodings:
-            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
-
        lowres_cond_img = self.to_lowres_cond(image, target_image_size = target_image_size, downsample_image_size = self.image_sizes[unet_index - 1]) if unet_number > 1 else None
        image = resize_image_to(image, target_image_size)

@@ -2521,7 +2540,7 @@ class Decoder(nn.Module):
            image = vae.encode(image)
            lowres_cond_img = maybe(vae.encode)(lowres_cond_img)

-        losses = self.p_losses(unet, image, times, image_embed = image_embed, text_encodings = text_encodings, text_mask = text_mask, lowres_cond_img = lowres_cond_img, predict_x_start = predict_x_start, learned_variance = learned_variance, is_latent_diffusion = is_latent_diffusion, noise_scheduler = noise_scheduler)
+        losses = self.p_losses(unet, image, times, image_embed = image_embed, text_encodings = text_encodings, lowres_cond_img = lowres_cond_img, predict_x_start = predict_x_start, learned_variance = learned_variance, is_latent_diffusion = is_latent_diffusion, noise_scheduler = noise_scheduler)

        if not return_lowres_cond_image:
            return losses
--- a/dalle2_pytorch/train_configs.py
+++ b/dalle2_pytorch/train_configs.py
@@ -133,6 +133,7 @@ class DiffusionPriorNetworkConfig(BaseModel):
    num_time_embeds: int = 1
    num_image_embeds: int = 1
    num_text_embeds: int = 1
+    attend_all_text_encodings: bool = True
    dim_head: int = 64
    heads: int = 8
    ff_mult: int = 4
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.20.0'
+__version__ = '0.22.3'
--- a/train_diffusion_prior.py
+++ b/train_diffusion_prior.py
@@ -126,9 +126,9 @@ def report_cosine_sims(

        # we are text conditioned, we produce an embedding from the tokenized text
        if text_conditioned:
-            text_embedding, text_encodings, text_mask = trainer.embed_text(text_data)
+            text_embedding, text_encodings = trainer.embed_text(text_data)
            text_cond = dict(
-                text_embed=text_embedding, text_encodings=text_encodings, mask=text_mask
+                text_embed=text_embedding, text_encodings=text_encodings
            )
        else:
            text_embedding = text_data
@@ -146,15 +146,12 @@ def report_cosine_sims(

        if text_conditioned:
            text_encodings_shuffled = text_encodings[rolled_idx]
-            text_mask_shuffled = text_mask[rolled_idx]
        else:
            text_encodings_shuffled = None
-            text_mask_shuffled = None

        text_cond_shuffled = dict(
            text_embed=text_embed_shuffled,
-            text_encodings=text_encodings_shuffled,
-            mask=text_mask_shuffled,
+            text_encodings=text_encodings_shuffled
        )

        # prepare the text embedding
Author	SHA1	Message	Date
Phil Wang	cd26c6b17d	0.22.3	2022-07-12 17:08:31 -07:00
Phil Wang	775abc4df6	add setting to attend to all text encodings regardless of padding, for diffusion prior	2022-07-12 17:08:12 -07:00
Phil Wang	11b1d533a0	make sure text encodings being passed in has the correct batch dimension	2022-07-12 16:00:19 -07:00
Phil Wang	e76e89f9eb	remove text masking altogether in favor of deriving from text encodings (padded text encodings must be pad value of 0.)	2022-07-12 15:40:31 -07:00
Phil Wang	bb3ff0ac67	protect against bad text mask being passed into decoder	2022-07-12 15:33:13 -07:00
Phil Wang	1ec4dbe64f	one more fix for text mask, if the length of the text encoding exceeds max_text_len, add an assert for better error msg	2022-07-12 15:01:46 -07:00
Phil Wang	e0835acca9	generate text mask within the unet and diffusion prior itself from the text encodings, if not given	2022-07-12 12:54:59 -07:00
Phil Wang	e055793e5d	shoutout for @MalumaDev	2022-07-11 16:12:35 -07:00
Phil Wang	1d9ef99288	add PixelShuffleUpsample thanks to @MalumaDev and @marunine for running the experiment and verifying absence of checkerboard artifacts	2022-07-11 16:07:23 -07:00
Phil Wang	bdd62c24b3	zero init final projection in unet, since openai and @crowsonkb are both doing it	2022-07-11 13:22:06 -07:00