complete vit-vqgan from https://arxiv.org/abs/2110.04627

2026-02-12 19:44:26 +01:00 · 2022-04-26 17:04:18 -07:00
6 changed files with 193 additions and 95 deletions
--- a/README.md
+++ b/README.md
@@ -499,12 +499,10 @@ loss.backward()

 ### DALL-E2 with Latent Diffusion

-This repository decides to take the next step and offer DALL-E v2 combined with <a href="https://huggingface.co/spaces/multimodalart/latentdiffusion">latent diffusion</a>, from Rombach et al.
+This repository decides to take the next step and offer DALL-E2 combined with <a href="https://huggingface.co/spaces/multimodalart/latentdiffusion">latent diffusion</a>, from Rombach et al.

 You can use it as follows. Latent diffusion can be limited to just the first U-Net in the cascade, or to any number you wish.

-The repository also comes equipped with all the necessary settings to recreate `ViT-VQGan` from the <a href="https://arxiv.org/abs/2110.04627">Improved VQGans</a> paper. Furthermore, the <a href="https://github.com/lucidrains/vector-quantize-pytorch">vector quantization</a> library also comes equipped to do <a href="https://arxiv.org/abs/2203.01941">residual or multi-headed quantization</a>, which I believe will give an even further boost in performance to the autoencoder.
-
 ```python
 import torch
 from dalle2_pytorch import Unet, Decoder, CLIP, VQGanVAE
@@ -647,12 +645,11 @@ Once built, images will be saved to the same directory the command is invoked
 - [x] use attention-based upsampling https://arxiv.org/abs/2112.11435
 - [x] use inheritance just this once for sharing logic between decoder and prior network ddpms
 - [x] bring in vit-vqgan https://arxiv.org/abs/2110.04627 for the latent diffusion
- [x] abstract interface for CLIP adapter class, so other CLIPs can be brought in
+- [ ] abstract interface for CLIP adapter class, so other CLIPs can be brought in
 - [ ] become an expert with unets, cleanup unet code, make it fully configurable, port all learnings over to https://github.com/lucidrains/x-unet
 - [ ] copy the cascading ddpm code to a separate repo (perhaps https://github.com/lucidrains/denoising-diffusion-pytorch) as the main contribution of dalle2 really is just the prior network
 - [ ] transcribe code to Jax, which lowers the activation energy for distributed training, given access to TPUs
 - [ ] train on a toy task, offer in colab
- [ ] think about how best to design a declarative training config that handles preencoding for prior and training of multiple networks in decoder
 - [ ] extend diffusion head to use diffusion-gan (potentially using lightweight-gan) to speed up inference
 - [ ] bring in tools to train vqgan-vae

@@ -700,6 +697,16 @@ Once built, images will be saved to the same directory the command is invoked
 }
 ```

+```bibtex
+@article{Arar2021LearnedQF,
+    title   = {Learned Queries for Efficient Local Attention},
+    author  = {Moab Arar and Ariel Shamir and Amit H. Bermano},
+    journal = {ArXiv},
+    year    = {2021},
+    volume  = {abs/2112.11435}
+}
+```
+
 ```bibtex
@article{Yu2021VectorquantizedIM,
    title   = {Vector-quantized Image Modeling with Improved VQGAN},
--- a/dalle2_pytorch/attention.py
+++ b/dalle2_pytorch/attention.py
@@ -0,0 +1,130 @@
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+from einops import rearrange, repeat
+
+class LayerNormChan(nn.Module):  # layer norm taken across dim 1 (channels), with a learned gain and no bias
+    def __init__(
+        self,
+        dim,  # number of channels; sizes the learned gain
+        eps = 1e-5  # added to the variance before the square root, guards against division by zero
+    ):
+        super().__init__()
+        self.eps = eps
+        self.gamma = nn.Parameter(torch.ones(1, dim, 1, 1))  # broadcasts over dims 0, 2, 3 - assumes a (batch, dim, height, width) input
+
+    def forward(self, x):
+        var = torch.var(x, dim = 1, unbiased = False, keepdim = True)  # biased (population) variance across dim 1
+        mean = torch.mean(x, dim = 1, keepdim = True)
+        return (x - mean) / (var + self.eps).sqrt() * self.gamma
+
+# attention-based upsampling
+# from https://arxiv.org/abs/2112.11435
+
+class QueryAndAttend(nn.Module):  # learned queries attend over a local window_size x window_size neighbourhood of every position
+    def __init__(
+        self,
+        *,
+        dim,
+        num_queries = 1,
+        dim_head = 32,
+        heads = 8,
+        window_size = 3  # NOTE(review): the wsz // 2 padding in forward only preserves height / width for odd sizes
+    ):
+        super().__init__()
+        self.scale = dim_head ** -0.5
+        inner_dim = dim_head * heads
+        self.heads = heads
+        self.dim_head = dim_head
+        self.window_size = window_size
+        self.num_queries = num_queries
+
+        self.rel_pos_bias = nn.Parameter(torch.randn(heads, num_queries, window_size * window_size, 1, 1))  # one bias per (head, query, window slot), broadcast over x and y
+
+        self.queries = nn.Parameter(torch.randn(heads, num_queries, dim_head))  # learned parameters, not computed from the input
+        self.to_kv = nn.Conv2d(dim, dim_head * 2, 1, bias = False)  # a single key / value map of dim_head channels each, shared by all heads
+
+        self.to_out = nn.Sequential(
+            nn.Conv2d(inner_dim, dim * 2, 1, bias = False),
+            nn.Tanh(),
+            nn.Conv2d(dim * 2, dim, 1, bias = False)
+        )
+
+    def forward(self, x):
+        """
+        einstein notation
+        b - batch
+        h - heads
+        l - num queries
+        d - head dimension
+        x - height
+        y - width
+        j - source sequence for attending to (kernel size squared in this case)
+        """
+
+        wsz, heads, dim_head, num_queries = self.window_size, self.heads, self.dim_head, self.num_queries
+        batch, _, height, width = x.shape
+
+        is_one_query = self.num_queries == 1
+
+        # queries, keys, values
+
+        q = self.queries * self.scale
+        k, v = self.to_kv(x).chunk(2, dim = 1)
+
+        # similarities
+
+        sim = einsum('h l d, b d x y -> b h l x y', q, k)
+        sim = rearrange(sim, 'b ... x y -> b (...) x y')
+
+        # unfold the similarity scores, padding the border with the most negative finite value (not -inf) so padded slots get ~0 attention
+
+        mask_value = -torch.finfo(sim.dtype).max
+        sim = F.pad(sim, ((wsz // 2,) * 4), value = mask_value)
+        sim = F.unfold(sim, kernel_size = wsz)
+        sim = rearrange(sim, 'b (h l j) (x y) -> b h l j x y', h = heads, l = num_queries, x = height, y = width)
+
+        # rel pos bias
+
+        sim = sim + self.rel_pos_bias
+
+        # numerically stable attention (dim -3 is j, the window axis)
+
+        sim = sim - sim.amax(dim = -3, keepdim = True).detach()
+        attn = sim.softmax(dim = -3)
+
+        # unfold values
+
+        v = F.pad(v, ((wsz // 2,) * 4), value = 0.)
+        v = F.unfold(v, kernel_size = wsz)
+        v = rearrange(v, 'b (d j) (x y) -> b d j x y', d = dim_head, x = height, y = width)
+
+        # aggregate values
+
+        out = einsum('b h l j x y, b d j x y -> b l h d x y', attn, v)
+
+        # combine heads
+
+        out = rearrange(out, 'b l h d x y -> (b l) (h d) x y')
+        out = self.to_out(out)
+        out = rearrange(out, '(b l) d x y -> b l d x y', b = batch)
+
+        # drop the singleton query axis when there is only one query, giving (b, d, x, y)
+
+        if is_one_query:
+            out = rearrange(out, 'b 1 ... -> b ...')
+
+        return out
+
+class QueryAttnUpsample(nn.Module):  # 2x spatial upsampling: four learned queries per position fill a 2 x 2 output patch
+    def __init__(self, dim, **kwargs):  # kwargs are forwarded to QueryAndAttend
+        super().__init__()
+        self.norm = LayerNormChan(dim)
+        self.qna = QueryAndAttend(dim = dim, num_queries = 4, **kwargs)  # 4 must equal w1 * w2 in the rearrange in forward
+
+    def forward(self, x):
+        x = self.norm(x)
+        out = self.qna(x)  # (b, 4, c, h, w) - one feature map per query
+        out = rearrange(out, 'b (w1 w2) c h w -> b c (h w1) (w w2)', w1 = 2, w2 = 2)  # interleave the four maps into (b, c, 2h, 2w)
+        return out
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -17,6 +17,7 @@ from kornia.filters import gaussian_blur2d

 from dalle2_pytorch.tokenizer import tokenizer
 from dalle2_pytorch.vqgan_vae import NullVQGanVAE, VQGanVAE
+from dalle2_pytorch.attention import QueryAttnUpsample

 # use x-clip

@@ -35,10 +36,6 @@ def default(val, d):
 def cast_tuple(val, length = 1):
    return val if isinstance(val, tuple) else ((val,) * length)

-@contextmanager
-def null_context(*args, **kwargs):
-    yield
-
 def eval_decorator(fn):
    def inner(model, *args, **kwargs):
        was_training = model.training
@@ -89,59 +86,6 @@ def resize_image_to(t, image_size, mode = 'bilinear'): # take a look at https://

    return F.interpolate(t, size = shape, mode = mode, align_corners = False)

-# clip related adapters
-
-class BaseClipAdapter(nn.Module):
-    def __init__(self, clip):
-        super().__init__()
-        self.clip = clip
-
-    @property
-    def dim_latent(self):
-        raise NotImplementedError
-
-    @property
-    def image_size(self):
-        raise NotImplementedError
-
-    @property
-    def image_channels(self):
-        raise NotImplementedError
-
-    def embed_text(self, text):
-        raise NotImplementedError
-
-    def embed_image(self, image):
-        raise NotImplementedError
-
-class XClipAdapter(BaseClipAdapter):
-    @property
-    def dim_latent(self):
-        return self.clip.dim_latent
-
-    @property
-    def image_size(self):
-        return self.clip.image_size
-
-    @property
-    def image_channels(self):
-        return self.clip.image_channels
-
-    @torch.no_grad()
-    def embed_text(self, text):
-        encoder_output = self.clip.text_transformer(text)
-        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
-        text_embed = self.clip.to_text_latent(text_cls)
-        return l2norm(text_embed), text_encodings
-
-    @torch.no_grad()
-    def embed_image(self, image):
-        image = resize_image_to(image, self.image_size)
-        encoder_output = self.clip.visual_transformer(image)
-        image_cls, image_encodings = encoder_output[:, 0], encoder_output[:, 1:]
-        image_embed = self.clip.to_visual_latent(image_cls)
-        return l2norm(image_embed), image_encodings
-
 # classifier free guidance functions

 def prob_mask_like(shape, prob, device):
@@ -648,7 +592,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
        if exists(clip):
            assert isinstance(clip, CLIP)
            freeze_model_and_make_eval_(clip)
-            self.clip = XClipAdapter(clip)
+            self.clip = clip
        else:
            assert exists(image_embed_dim), 'latent dimension must be given, if training prior network without CLIP given'
            self.clip = None
@@ -663,6 +607,29 @@ class DiffusionPrior(BaseGaussianDiffusion):
        self.predict_x_start = predict_x_start
        # in paper, they do not predict the noise, but predict x0 directly for image embedding, claiming empirically better results. I'll just offer both.

+    @torch.no_grad()
+    def get_image_embed(self, image):  # l2-normalised CLIP image embedding; NOTE(review): no resize to the CLIP input size here, unlike the removed XClipAdapter.embed_image
+        assert exists(self.clip)  # self.clip is None when the prior was built without a CLIP
+
+        image_encoding = self.clip.visual_transformer(image)
+        image_cls = image_encoding[:, 0]  # first position along dim 1 - presumably the CLS token, confirm against the CLIP in use
+        image_embed = self.clip.to_visual_latent(image_cls)
+        return l2norm(image_embed)
+
+    @torch.no_grad()
+    def get_text_cond(self, text):  # builds the keyword dict that is splatted into self.net as text conditioning
+        assert exists(self.clip)  # self.clip is None when the prior was built without a CLIP
+
+        text_encodings = self.clip.text_transformer(text)
+        text_cls, text_encodings = text_encodings[:, 0], text_encodings[:, 1:]  # split first position (presumably CLS) from the remaining positions
+        text_embed = self.clip.to_text_latent(text_cls)
+        text_embed = l2norm(text_embed)
+
+        if not self.condition_on_text_encodings:  # embedding-only conditioning: no encodings, no mask
+            return dict(text_embed = text_embed)
+
+        return dict(text_encodings = text_encodings, text_embed = text_embed, mask = text != 0)  # assumes token id 0 is padding - TODO confirm
+
    def p_mean_variance(self, x, t, text_cond, clip_denoised: bool):
        pred = self.net(x, t, **text_cond)

@@ -734,12 +701,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
        batch_size = text.shape[0]
        image_embed_dim = self.image_embed_dim

-        text_embed, text_encodings = self.clip.embed_text(text)
-
-        text_cond = dict(text_embed = text_embed)
-
-        if self.condition_on_text_encodings:
-            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
+        text_cond = self.get_text_cond(text)

        image_embeds = self.p_sample_loop((batch_size, image_embed_dim), text_cond = text_cond)
        text_embeds = text_cond['text_embed']
@@ -771,18 +733,18 @@ class DiffusionPrior(BaseGaussianDiffusion):
        assert not (self.condition_on_text_encodings and (not exists(text_encodings) and not exists(text))), 'text encodings must be present if you specified you wish to condition on it on initialization'

        if exists(image):
-            image_embed, _ = self.clip.embed_image(image)
+            image_embed = self.get_image_embed(image)

        # calculate text conditionings, based on what is passed in

        if exists(text):
-            text_embed, text_encodings = self.clip.embed_text(text)
-            text_mask = text != 0
-
-        text_cond = dict(text_embed = text_embed)
-
-        if self.condition_on_text_encodings:
-            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
+            text_cond = self.get_text_cond(text)
+        else:
+            text_cond = dict(
+                text_embed = text_embed,
+                text_encodings = text_encodings,
+                mask = text_mask
+            )

        # timestep conditioning from ddpm

@@ -1243,9 +1205,7 @@ class Decoder(BaseGaussianDiffusion):
            loss_type = loss_type
        )

-        if isinstance(clip, CLIP):
-            clip = XClipAdapter(clip)
-
+        assert isinstance(clip, CLIP)
        freeze_model_and_make_eval_(clip)
        self.clip = clip
        self.clip_image_size = clip.image_size
@@ -1327,6 +1287,10 @@ class Decoder(BaseGaussianDiffusion):
        yield
        unet.cpu()

+    @torch.no_grad()
+    def get_text_encodings(self, text):  # text transformer outputs with the first position dropped
+        text_encodings = self.clip.text_transformer(text)
+        return text_encodings[:, 1:]  # position 0 is skipped - presumably the CLS token, confirm against the CLIP in use

    @torch.no_grad()
    def get_image_embed(self, image):
@@ -1412,19 +1376,14 @@ class Decoder(BaseGaussianDiffusion):
    def sample(self, image_embed, text = None, cond_scale = 1.):
        batch_size = image_embed.shape[0]

-        text_encodings = None
-        if exists(text):
-            _, text_encodings = self.clip.embed_text(text)
+        text_encodings = self.get_text_encodings(text) if exists(text) else None

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'

        img = None

        for unet, vae, channel, image_size, predict_x_start in tqdm(zip(self.unets, self.vaes, self.sample_channels, self.image_sizes, self.predict_x_start)):
-
-            context = self.one_unet_in_gpu(unet = unet) if image_embed.is_cuda else null_context()
-
-            with context:
+            with self.one_unet_in_gpu(unet = unet):
                lowres_cond_img = None
                shape = (batch_size, channel, image_size, image_size)

@@ -1477,11 +1436,9 @@ class Decoder(BaseGaussianDiffusion):
        times = torch.randint(0, self.num_timesteps, (b,), device = device, dtype = torch.long)

        if not exists(image_embed):
-            image_embed, _ = self.clip.embed_image(image)
+            image_embed = self.get_image_embed(image)

-        text_encodings = None
-        if exists(text) and not exists(text_encodings):
-            _, text_encodings = self.clip.embed_text(text)
+        text_encodings = self.get_text_encodings(text) if exists(text) and not exists(text_encodings) else text_encodings  # keep encodings the caller already supplied instead of resetting them to None

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'

--- a/dalle2_pytorch/train.py
+++ b/dalle2_pytorch/train.py
@@ -35,7 +35,7 @@ class EMA(nn.Module):

        self.update_moving_average(self.ema_model, self.online_model)

-    def update_moving_average(self, ma_model, current_model):
+    def update_moving_average(self, ma_model, current_model):
        def calculate_ema(beta, old, new):
            if not exists(old):
                return new
--- a/dalle2_pytorch/vqgan_vae.py
+++ b/dalle2_pytorch/vqgan_vae.py
@@ -15,6 +15,8 @@ from einops import rearrange, reduce, repeat
 from einops_exts import rearrange_many
 from einops.layers.torch import Rearrange

+from dalle2_pytorch.attention import QueryAttnUpsample
+
 # constants

 MList = nn.ModuleList
@@ -493,10 +495,12 @@ class ViTEncDec(nn.Module):
                layers = layers
            ),
            nn.Sequential(
-                nn.Linear(dim, dim * 4, bias = False),
+                nn.Linear(dim, dim * 2, bias = False),
                nn.Tanh(),
-                nn.Linear(dim * 4, input_dim, bias = False),
+                nn.Linear(dim * 2, dim, bias = False),
            ),
+            nn.LayerNorm(dim),
+            nn.Linear(dim, input_dim),
            RearrangeImage(),
            Rearrange('b h w (p1 p2 c) -> b c (h p1) (w p2)', p1 = patch_size, p2 = patch_size)
        )
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.58',
+  version = '0.0.53',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',