use cross attention for conditioning unet based on image embedding tokens (which opens up the door on conditioning on text encodings as well)

start using swish glu everywhere, given success of PaLM
better naming
2026-02-12 19:44:26 +01:00 · 2022-04-14 10:10:04 -07:00 · 2022-04-14 09:34:32 -07:00 · 2022-04-14 09:24:31 -07:00 · 2022-04-14 09:21:51 -07:00 · 2022-04-14 09:16:09 -07:00
3 changed files with 165 additions and 53 deletions
--- a/README.md
+++ b/README.md
@@ -22,19 +22,11 @@ For all of you emailing me (there is a lot), the best way to contribute is throu
 $ pip install dalle2-pytorch
 ```

-## CLI Usage (work in progress)
-
-```bash
-$ dream 'sharing a sunset at the summit of mount everest with my dog'
-```
-
-Once built, images will be saved to the same directory the command is invoked
-
-## Training (for deep learning practitioners)
+## Usage

 To train DALLE-2 is a 3 step process, with the training of CLIP being the most important

-To train CLIP, you can either use <a href="https://github.com/lucidrains/x-clip">x-clip</a> package, or join the LAION discord, where a lot of replication efforts are already underway.
+To train CLIP, you can either use the <a href="https://github.com/lucidrains/x-clip">x-clip</a> package, or join the LAION discord, where a lot of replication efforts are already <a href="https://github.com/mlfoundations/open_clip">underway</a>.

 This repository will demonstrate integration with `x-clip` for starters

@@ -109,7 +101,7 @@ clip = CLIP(
 unet = Unet(
    dim = 128,
    image_embed_dim = 512,
-    time_dim = 128,
+    cond_dim = 128,
    channels = 3,
    dim_mults=(1, 2, 4, 8)
 ).cuda()
@@ -272,7 +264,7 @@ loss.backward()
 unet = Unet(
    dim = 128,
    image_embed_dim = 512,
-    time_dim = 128,
+    cond_dim = 128,
    channels = 3,
    dim_mults=(1, 2, 4, 8)
 ).cuda()
@@ -306,6 +298,18 @@ Everything in this readme should run without error

 For the layperson, no worries, training will all be automated into a CLI tool, at least for small scale training.

+## CLI Usage (work in progress)
+
+```bash
+$ dream 'sharing a sunset at the summit of mount everest with my dog'
+```
+
+Once built, images will be saved to the directory the command is invoked from
+
+## Training wrapper (wip)
+
+Offer training wrappers
+
 ## Training CLI (wip)

 <a href="https://github.com/lucidrains/stylegan2-pytorch">template</a>
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -11,6 +11,8 @@ from einops.layers.torch import Rearrange
 from einops_exts import rearrange_many, repeat_many, check_shape
 from einops_exts.torch import EinopsToAndFrom

+from kornia.filters import filter2d
+
 from dalle2_pytorch.tokenizer import tokenizer

 # use x-clip
@@ -116,14 +118,13 @@ class ChanRMSNorm(RMSNorm):
        inv_norm = torch.rsqrt(squared_sum + self.eps)
        return x * inv_norm * rearrange(self.gamma, 'c -> 1 c 1 1') * self.scale

-class PreNormResidual(nn.Module):
-    def __init__(self, dim, fn):
+class Residual(nn.Module):
+    def __init__(self, fn):
        super().__init__()
        self.fn = fn
-        self.norm = RMSNorm(dim)

    def forward(self, x, **kwargs):
-        return self.fn(self.norm(x), **kwargs) + x
+        return self.fn(x, **kwargs) + x

 # mlp

@@ -162,12 +163,21 @@ class MLP(nn.Module):

 # feedforward

-def FeedForward(dim, mult = 4, dropout = 0.):
+class SwiGLU(nn.Module):
+    """ gated activation: one half of the last dimension is gated by the SiLU of the other half - used successfully in https://arxiv.org/abs/2204.02311 """
+    def forward(self, x):
+        x, gate = x.chunk(2, dim = -1)
+        return x * F.silu(gate)
+
+def FeedForward(dim, mult = 4, dropout = 0., post_activation_norm = False):
+    """ post-activation norm https://arxiv.org/abs/2110.09456 """
+
    inner_dim = int(mult * dim)
    return nn.Sequential(
        RMSNorm(dim),
-        nn.Linear(dim, inner_dim, bias = False),
-        nn.GELU(),
+        nn.Linear(dim, inner_dim * 2, bias = False),
+        SwiGLU(),
+        RMSNorm(inner_dim) if post_activation_norm else nn.Identity(),
        nn.Dropout(dropout),
        nn.Linear(inner_dim, dim, bias = False)
    )
@@ -229,6 +239,7 @@ class Attention(nn.Module):

        sim = sim - sim.amax(dim = -1, keepdim = True)
        attn = sim.softmax(dim = -1)
+        attn = self.dropout(attn)

        out = einsum('b h i j, b j d -> b h i d', attn, v)

@@ -285,17 +296,16 @@ class DiffusionPriorNetwork(nn.Module):

    def forward_with_cond_scale(
        self,
-        x,
        *args,
        cond_scale = 1.,
        **kwargs
    ):
-        logits = self.forward(x, *args, **kwargs)
+        logits = self.forward(*args, **kwargs)

        if cond_scale == 1:
            return logits

-        null_logits = self.forward(x, *args, cond_drop_prob = 1., **kwargs)
+        null_logits = self.forward(*args, cond_drop_prob = 1., **kwargs)
        return null_logits + (logits - null_logits) * cond_scale

    def forward(
@@ -315,8 +325,15 @@ class DiffusionPriorNetwork(nn.Module):

        text_embed, image_embed = rearrange_many((text_embed, image_embed), 'b d -> b 1 d')

+        # whether text embedding is used for conditioning depends on whether text encodings are available for attention (for classifier free guidance, even though it seems from the paper it was not used in the prior ddpm, as the objective is different)
+        # but let's just do it right
+
        if exists(mask):
-            mask = F.pad(mask, (0, 3), value = True) # extend mask for text embedding, noised image embedding, time step embedding, and learned query
+            not_all_masked_out = mask.any(dim = -1)
+            mask = torch.cat((mask, rearrange(not_all_masked_out, 'b -> b 1')), dim = 1)
+
+        if exists(mask):
+            mask = F.pad(mask, (0, 2), value = True) # extend mask for text embedding, noised image embedding, time step embedding, and learned query

        time_embed = self.time_embeddings(diffusion_timesteps)
        time_embed = rearrange(time_embed, 'b d -> b 1 d')
@@ -554,6 +571,17 @@ def Upsample(dim):
 def Downsample(dim):
    return nn.Conv2d(dim, dim, 4, 2, 1)

+class Blur(nn.Module):
+    """ blurs `x` with the 3x3 kernel given by the outer product of [1, 2, 1] with itself - `x` is handed straight to kornia's filter2d """
+    def __init__(self):
+        super().__init__()
+        self.register_buffer('filt', torch.Tensor([1, 2, 1])) # a buffer, so the kernel follows the module across devices
+
+    def forward(self, x):
+        filt = self.filt
+        filt = rearrange(filt, 'j -> 1 1 j') * rearrange(filt, 'i -> 1 i 1') # outer product, with the leading kernel dimension filter2d expects
+        return filter2d(x, filt, normalized = True)
+
 class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
@@ -581,10 +609,17 @@ class ConvNextBlock(nn.Module):
        super().__init__()
        need_projection = dim != dim_out

-        self.mlp = nn.Sequential(
-            nn.GELU(),
-            nn.Linear(cond_dim, dim)
-        ) if exists(cond_dim) else None
+        self.cross_attn = None
+
+        if exists(cond_dim):
+            self.cross_attn = EinopsToAndFrom(
+                'b c h w',
+                'b (h w) c',
+                CrossAttention(
+                    dim = dim,
+                    context_dim = cond_dim
+                )
+            )

        self.ds_conv = nn.Conv2d(dim, dim, 7, padding = 3, groups = dim)

@@ -601,21 +636,82 @@ class ConvNextBlock(nn.Module):
    def forward(self, x, cond = None):
        h = self.ds_conv(x)

-        if exists(self.mlp):
+        if exists(self.cross_attn):
            assert exists(cond)
-            condition = self.mlp(cond)
-            h = h + rearrange(condition, 'b c -> b c 1 1')
+            h = self.cross_attn(h, context = cond) + h

        h = self.net(h)
+
        return h + self.res_conv(x)

+class CrossAttention(nn.Module):
+    """ multi-head attention from `x` (queries) to `context` (keys / values), with a learned null key / value prepended """
+    def __init__(
+        self,
+        dim,
+        *,
+        context_dim = None,
+        dim_head = 64,
+        heads = 8,
+        dropout = 0.,
+    ):
+        super().__init__()
+        self.scale = dim_head ** -0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        context_dim = default(context_dim, dim) # context shares the query dimension unless told otherwise
+
+        self.norm = RMSNorm(dim)
+        self.norm_context = RMSNorm(context_dim)
+        self.dropout = nn.Dropout(dropout)
+
+        self.null_kv = nn.Parameter(torch.randn(2, dim_head)) # row 0 is the null key, row 1 the null value
+        self.to_q = nn.Linear(dim, inner_dim, bias = False)
+        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias = False)
+        self.to_out = nn.Linear(inner_dim, dim, bias = False)
+
+    def forward(self, x, context, mask = None):
+        """ x: (b, n, dim) - context: (b, m, context_dim) - mask: optional bool (b, m), False = do not attend - returns (b, n, dim) """
+        b = x.shape[0]
+
+        x = self.norm(x)
+        context = self.norm_context(context)
+
+        q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))
+        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = self.heads)
+
+        # prepend the learned null key / value, so every query always has one position it may attend to
+
+        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> b h 1 d', h = self.heads, b = b)
+        k = torch.cat((nk, k), dim = -2)
+        v = torch.cat((nv, v), dim = -2)
+
+        q = q * self.scale
+        sim = einsum('b h i d, b h j d -> b h i j', q, k)
+        max_neg_value = -torch.finfo(sim.dtype).max
+
+        if exists(mask):
+            mask = F.pad(mask, (1, 0), value = True) # the null key / value is never masked out
+            mask = rearrange(mask, 'b j -> b 1 1 j')
+            sim = sim.masked_fill(~mask, max_neg_value)
+
+        sim = sim - sim.amax(dim = -1, keepdim = True) # subtract the row max for numerical stability, the softmax is unchanged
+        attn = sim.softmax(dim = -1)
+        attn = self.dropout(attn) # `dropout` was accepted by the constructor but never applied
+
+        out = einsum('b h i j, b h j d -> b h i d', attn, v)
+        out = rearrange(out, 'b h n d -> b n (h d)')
+        return self.to_out(out)
+
 class Unet(nn.Module):
    def __init__(
        self,
        dim,
        *,
        image_embed_dim,
-        time_dim = None,
+        cond_dim = None,
+        num_image_tokens = 4,
        out_dim = None,
        dim_mults=(1, 2, 4, 8),
        channels = 3,
@@ -626,18 +722,28 @@ class Unet(nn.Module):
        dims = [channels, *map(lambda m: dim * m, dim_mults)]
        in_out = list(zip(dims[:-1], dims[1:]))

-        time_dim = default(time_dim, dim)
+        # time and image embeddings
+
+        cond_dim = default(cond_dim, dim)

        self.time_mlp = nn.Sequential(
            SinusoidalPosEmb(dim),
            nn.Linear(dim, dim * 4),
            nn.GELU(),
-            nn.Linear(dim * 4, dim)
+            nn.Linear(dim * 4, cond_dim),
+            Rearrange('b d -> b 1 d')
        )

-        self.null_image_embed = nn.Parameter(torch.randn(image_embed_dim))
+        self.image_to_cond = nn.Sequential(
+            nn.Linear(image_embed_dim, cond_dim * num_image_tokens),
+            Rearrange('b (n d) -> b n d', n = num_image_tokens)
+        ) if image_embed_dim != cond_dim else nn.Identity()

-        cond_dim = time_dim + image_embed_dim
+        # for classifier free guidance
+
+        self.null_image_embed = nn.Parameter(torch.randn(1, num_image_tokens, cond_dim))
+
+        # layers

        self.downs = nn.ModuleList([])
        self.ups = nn.ModuleList([])
@@ -647,7 +753,7 @@ class Unet(nn.Module):
            is_last = ind >= (num_resolutions - 1)

            self.downs.append(nn.ModuleList([
-                ConvNextBlock(dim_in, dim_out, cond_dim = cond_dim, norm = ind != 0),
+                ConvNextBlock(dim_in, dim_out, norm = ind != 0),
                ConvNextBlock(dim_out, dim_out, cond_dim = cond_dim),
                Downsample(dim_out) if not is_last else nn.Identity()
            ]))
@@ -655,7 +761,7 @@ class Unet(nn.Module):
        mid_dim = dims[-1]

        self.mid_block1 = ConvNextBlock(mid_dim, mid_dim, cond_dim = cond_dim)
-        self.mid_attn = EinopsToAndFrom('b c h w', 'b (h w) c', PreNormResidual(mid_dim, Attention(mid_dim)))
+        self.mid_attn = EinopsToAndFrom('b c h w', 'b (h w) c', Residual(Attention(mid_dim)))
        self.mid_block2 = ConvNextBlock(mid_dim, mid_dim, cond_dim = cond_dim)

        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
@@ -675,17 +781,16 @@ class Unet(nn.Module):

    def forward_with_cond_scale(
        self,
-        x,
        *args,
        cond_scale = 1.,
        **kwargs
    ):
-        logits = self.forward(x, *args, **kwargs)
+        logits = self.forward(*args, **kwargs)

        if cond_scale == 1:
            return logits

-        null_logits = self.forward(x, *args, cond_drop_prob = 1., **kwargs)
+        null_logits = self.forward(*args, cond_drop_prob = 1., **kwargs)
        return null_logits + (logits - null_logits) * cond_scale

    def forward(
@@ -698,37 +803,39 @@ class Unet(nn.Module):
        cond_drop_prob = 0.
    ):
        batch_size, device = x.shape[0], x.device
-        t = self.time_mlp(time)
+        time_tokens = self.time_mlp(time)

        cond_prob_mask = prob_mask_like((batch_size,), cond_drop_prob, device = device)

        # mask out image embedding depending on condition dropout
        # for classifier free guidance

-        image_embed = torch.where(
-            rearrange(cond_prob_mask, 'b -> b 1'),
-            image_embed,
-            rearrange(self.null_image_embed, 'd -> 1 d')
+        image_tokens = self.image_to_cond(image_embed)
+
+        image_tokens = torch.where(
+            rearrange(cond_prob_mask, 'b -> b 1 1'),
+            image_tokens,
+            self.null_image_embed
        )

-        t = torch.cat((t, image_embed), dim = -1)
+        c = torch.cat((time_tokens, image_tokens), dim = -2) # c for condition

        hiddens = []

        for convnext, convnext2, downsample in self.downs:
-            x = convnext(x, t)
-            x = convnext2(x, t)
+            x = convnext(x, c)
+            x = convnext2(x, c)
            hiddens.append(x)
            x = downsample(x)

-        x = self.mid_block1(x, t)
+        x = self.mid_block1(x, c)
        x = self.mid_attn(x)
-        x = self.mid_block2(x, t)
+        x = self.mid_block2(x, c)

        for convnext, convnext2, upsample in self.ups:
            x = torch.cat((x, hiddens.pop()), dim=1)
-            x = convnext(x, t)
-            x = convnext2(x, t)
+            x = convnext(x, c)
+            x = convnext2(x, c)
            x = upsample(x)

        return self.final_conv(x)
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.8',
+  version = '0.0.11',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
@@ -25,6 +25,7 @@ setup(
    'click',
    'einops>=0.4',
    'einops-exts>=0.0.3',
+    'kornia>=0.5.4',
    'pillow',
    'torch>=1.10',
    'torchvision',
Author	SHA1	Message	Date
Phil Wang	68e9883f59	use cross attention for conditioning unet based on image embedding tokens (which opens up the door on conditioning on text encodings as well	2022-04-14 10:10:04 -07:00
Phil Wang	95b018374a	start using swish glu everywhere, given success of PaLM	2022-04-14 09:34:32 -07:00
Phil Wang	8b5c2385b0	better naming	2022-04-14 09:24:31 -07:00
Phil Wang	f2c52d8239	fix bug with classifier free guidance for prior network, even though it seems it may not be used	2022-04-14 09:21:51 -07:00
Phil Wang	97e951221b	bring in blur, as it will be used somewhere in the cascading DDPM in the decoder eventually, once i figure it out	2022-04-14 09:16:09 -07:00
Phil Wang	e1b0c140f1	cleanup readme	2022-04-14 08:51:22 -07:00
Phil Wang	5989569a44	link to OpenCLIP effort	2022-04-14 08:31:15 -07:00
Phil Wang	82464d7bd3	per-fect	2022-04-14 08:30:07 -07:00