0.22.3

add setting to attend to all text encodings regardless of padding, for diffusion prior
make sure text encodings being passed in have the correct batch dimension
2026-02-13 03:54:35 +01:00 · 2022-07-12 17:08:31 -07:00 · 2022-07-12 17:08:12 -07:00 · 2022-07-12 16:00:19 -07:00
3 changed files with 10 additions and 3 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -806,6 +806,7 @@ class DiffusionPriorNetwork(nn.Module):
        num_time_embeds = 1,
        num_image_embeds = 1,
        num_text_embeds = 1,
+        attend_all_text_encodings = True,
        **kwargs
    ):
        super().__init__()
@@ -831,6 +832,8 @@ class DiffusionPriorNetwork(nn.Module):
        self.learned_query = nn.Parameter(torch.randn(dim))
        self.causal_transformer = CausalTransformer(dim = dim, **kwargs)

+        self.attend_all_text_encodings = attend_all_text_encodings
+
    def forward_with_cond_scale(
        self,
        *args,
@@ -852,7 +855,6 @@ class DiffusionPriorNetwork(nn.Module):
        *,
        text_embed,
        text_encodings = None,
-        mask = None,
        cond_drop_prob = 0.
    ):
        batch, dim, device, dtype = *image_embed.shape, image_embed.device, image_embed.dtype
@@ -871,7 +873,10 @@ class DiffusionPriorNetwork(nn.Module):
        if not exists(text_encodings):
            text_encodings = torch.empty((batch, 0, dim), device = device, dtype = dtype)

-        mask = torch.any(text_encodings != 0., dim = -1)
+        if self.attend_all_text_encodings:
+            mask = torch.ones((batch, text_encodings.shape[-2]), device = device, dtype = torch.bool)
+        else:
+            mask = torch.any(text_encodings != 0., dim = -1)

        # classifier free guidance

@@ -1812,6 +1817,7 @@ class Unet(nn.Module):
        text_tokens = None

        if exists(text_encodings) and self.cond_on_text_encodings:
+            assert text_encodings.shape[0] == batch_size, f'the text encodings being passed into the unet does not have the proper batch size - text encoding shape {text_encodings.shape} - required batch size is {batch_size}'
            assert self.text_embed_dim == text_encodings.shape[-1], f'the text encodings you are passing in have a dimension of {text_encodings.shape[-1]}, but the unet was created with text_embed_dim of {self.text_embed_dim}.'

            text_mask = torch.any(text_encodings != 0., dim = -1)
--- a/dalle2_pytorch/train_configs.py
+++ b/dalle2_pytorch/train_configs.py
@@ -133,6 +133,7 @@ class DiffusionPriorNetworkConfig(BaseModel):
    num_time_embeds: int = 1
    num_image_embeds: int = 1
    num_text_embeds: int = 1
+    attend_all_text_encodings: bool = True
    dim_head: int = 64
    heads: int = 8
    ff_mult: int = 4
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.22.0'
+__version__ = '0.22.3'
Author	SHA1	Message	Date
Phil Wang	cd26c6b17d	0.22.3	2022-07-12 17:08:31 -07:00
Phil Wang	775abc4df6	add setting to attend to all text encodings regardless of padding, for diffusion prior	2022-07-12 17:08:12 -07:00
Phil Wang	11b1d533a0	make sure text encodings being passed in have the correct batch dimension	2022-07-12 16:00:19 -07:00