cast attention matrix back to its original (pre-softmax) dtype after the float32 softmax in attention

make it work for @ethancohen123
2026-02-13 03:54:35 +01:00 · 2022-08-20 10:56:01 -07:00 · 2022-08-19 11:28:58 -07:00
2 changed files with 12 additions and 3 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -250,9 +250,15 @@ class XClipAdapter(BaseClipAdapter):
        text = text[..., :self.max_text_len]
        text_mask = text != 0
        encoder_output = self.clip.text_transformer(text)
-        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
+
+        encoder_output_is_cls = encoder_output.ndim == 3
+
+        text_cls, text_encodings = (encoder_output[:, 0], encoder_output[:, 1:]) if encoder_output_is_cls else (encoder_output, None)
        text_embed = self.clip.to_text_latent(text_cls)
-        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
+
+        if exists(text_encodings):
+            text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
+
        return EmbeddedText(l2norm(text_embed), text_encodings)

    @torch.no_grad()
@@ -873,6 +879,8 @@ class Attention(nn.Module):
        # attention

        attn = sim.softmax(dim = -1, dtype = torch.float32)
+        attn = attn.type(sim.dtype)
+
        attn = self.dropout(attn)

        # aggregate values
@@ -1631,6 +1639,7 @@ class CrossAttention(nn.Module):
            sim = sim.masked_fill(~mask, max_neg_value)

        attn = sim.softmax(dim = -1, dtype = torch.float32)
+        attn = attn.type(sim.dtype)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '1.8.2'
+__version__ = '1.8.4'
Author	SHA1	Message	Date
Phil Wang	083508ff8e	cast attention matrix back to original dtype pre-softmax in attention	2022-08-20 10:56:01 -07:00
Phil Wang	7762edd0ff	make it work for @ethancohen123	2022-08-19 11:28:58 -07:00