make kernel size and sigma for gaussian blur for cascading DDPM overridable at forward. also make sure unets are wrapped in a modulelist so that at sample time, blurring does not happen

2026-02-12 11:34:29 +01:00 · 2022-04-18 12:00:47 -07:00
7 changed files with 339 additions and 1738 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <img src="./dalle2.png" width="450px"></img>

-## DALL-E 2 - Pytorch
+## DALL-E 2 - Pytorch (wip)

 Implementation of <a href="https://openai.com/dall-e-2/">DALL-E 2</a>, OpenAI's updated text-to-image synthesis neural network, in Pytorch.

@@ -10,9 +10,11 @@ The main novelty seems to be an extra layer of indirection with the prior networ

 This model is SOTA for text-to-image for now.

+This repository may also explore an extension that uses <a href="https://huggingface.co/spaces/multimodalart/latentdiffusion">latent diffusion</a>, from Rombach et al., in the decoder.
+
 Please join <a href="https://discord.gg/xBPBXfcFHd"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a> if you are interested in helping out with the replication

-There was enough interest for a <a href="https://github.com/lucidrains/dalle2-jax">Jax version</a>. I will also eventually extend this to <a href="https://github.com/lucidrains/dalle2-video">text to video</a>, once the repository is in a good place.
+There was enough interest for a Jax version. It will be completed after the Pytorch version shows signs of life on my toy tasks. <a href="https://github.com/lucidrains/dalle2-jax">Placeholder repository</a>

 ## Install

@@ -195,10 +197,10 @@ clip = CLIP(
    dim_image = 512,
    dim_latent = 512,
    num_text_tokens = 49408,
-    text_enc_depth = 6,
+    text_enc_depth = 1,
    text_seq_len = 256,
    text_heads = 8,
-    visual_enc_depth = 6,
+    visual_enc_depth = 1,
    visual_image_size = 256,
    visual_patch_size = 32,
    visual_heads = 8
@@ -207,28 +209,28 @@ clip = CLIP(
 # 2 unets for the decoder (a la cascading DDPM)

 unet1 = Unet(
-    dim = 32,
+    dim = 16,
    image_embed_dim = 512,
-    cond_dim = 128,
    channels = 3,
    dim_mults = (1, 2, 4, 8)
 ).cuda()

 unet2 = Unet(
-    dim = 32,
+    dim = 16,
    image_embed_dim = 512,
+    lowres_cond = True,         # subsequent unets must have this turned on (and first unet must have this turned off)
    cond_dim = 128,
    channels = 3,
    dim_mults = (1, 2, 4, 8, 16)
 ).cuda()

-# decoder, which contains the unet(s) and clip
+# decoder, which contains the unet and clip

 decoder = Decoder(
    clip = clip,
    unet = (unet1, unet2),            # insert both unets in order of low resolution to highest resolution (you can have as many stages as you want here)
-    image_sizes = (256, 512),         # resolutions, 256 for first unet, 512 for second. these must be unique and in ascending order (matches with the unets passed in)
-    timesteps = 1000,
+    image_sizes = (256, 512),         # resolutions, 256 for first unet, 512 for second
+    timesteps = 100,
    cond_drop_prob = 0.2
 ).cuda()

@@ -246,9 +248,16 @@ loss = decoder(images, unet_number = 2)
 loss.backward()

 # do the above for many steps for both unets
+
+# then it will learn to generate images based on the CLIP image embeddings
+
+# chaining the unets from lowest resolution to highest resolution (thus cascading)
+
+mock_image_embed = torch.randn(1, 512).cuda()
+images = decoder.sample(mock_image_embed) # (1, 3, 512, 512)
 ```

-Finally, to generate the DALL-E2 images from text. Insert the trained `DiffusionPrior` as well as the `Decoder` (which wraps `CLIP`, the causal transformer, and unet(s))
+Finally, to generate the DALL-E2 images from text, insert the trained `DiffusionPrior` as well as the `Decoder` (which together contain `CLIP`, a unet, and a causal transformer)

 ```python
 from dalle2_pytorch import DALLE2
@@ -340,7 +349,8 @@ unet2 = Unet(
    image_embed_dim = 512,
    cond_dim = 128,
    channels = 3,
-    dim_mults = (1, 2, 4, 8, 16)
+    dim_mults = (1, 2, 4, 8, 16),
+    lowres_cond = True
 ).cuda()

 decoder = Decoder(
@@ -348,8 +358,7 @@ decoder = Decoder(
    image_sizes = (128, 256),
    clip = clip,
    timesteps = 100,
-    cond_drop_prob = 0.2,
-    condition_on_text_encodings = False  # set this to True if you wish to condition on text during training and sampling
+    cond_drop_prob = 0.2
 ).cuda()

 for unet_number in (1, 2):
@@ -377,247 +386,7 @@ You can also train the decoder on images of greater than the size (say 512x512)

 For the layperson, no worries, training will all be automated into a CLI tool, at least for small scale training.

-## Training on Preprocessed CLIP Embeddings
-
-It is likely, when scaling up, that you would first preprocess your images and text into corresponding embeddings before training the prior network. You can do so easily by simply passing in `image_embed`, `text_embed`, and optionally `text_encodings` and `text_mask`
-
-Working example below
-
-```python
-import torch
-from dalle2_pytorch import DiffusionPriorNetwork, DiffusionPrior, CLIP
-
-# get trained CLIP from step one
-
-clip = CLIP(
-    dim_text = 512,
-    dim_image = 512,
-    dim_latent = 512,
-    num_text_tokens = 49408,
-    text_enc_depth = 6,
-    text_seq_len = 256,
-    text_heads = 8,
-    visual_enc_depth = 6,
-    visual_image_size = 256,
-    visual_patch_size = 32,
-    visual_heads = 8,
-).cuda()
-
-# setup prior network, which contains an autoregressive transformer
-
-prior_network = DiffusionPriorNetwork(
-    dim = 512,
-    depth = 6,
-    dim_head = 64,
-    heads = 8
-).cuda()
-
-# diffusion prior network, which contains the CLIP and network (with transformer) above
-
-diffusion_prior = DiffusionPrior(
-    net = prior_network,
-    clip = clip,
-    timesteps = 100,
-    cond_drop_prob = 0.2,
-    condition_on_text_encodings = False  # this probably should be true, but just to get Laion started
-).cuda()
-
-# mock data
-
-text = torch.randint(0, 49408, (4, 256)).cuda()
-images = torch.randn(4, 3, 256, 256).cuda()
-
-# precompute the text and image embeddings
-# here using the diffusion prior class, but could be done with CLIP alone
-
-clip_image_embeds = diffusion_prior.get_image_embed(images)
-clip_text_embeds = diffusion_prior.get_text_cond(text).get('text_embed')
-
-# feed text and images into diffusion prior network
-
-loss = diffusion_prior(
-    text_embed = clip_text_embeds,
-    image_embed = clip_image_embeds
-)
-
-loss.backward()
-
-# do the above for many many many steps
-# now the diffusion prior can generate image embeddings from the text embeddings
-```
-
-You can also completely go `CLIP`-less, in which case you will need to pass in the `image_embed_dim` into the `DiffusionPrior` on initialization
-
-```python
-import torch
-from dalle2_pytorch import DiffusionPriorNetwork, DiffusionPrior
-
-# setup prior network, which contains an autoregressive transformer
-
-prior_network = DiffusionPriorNetwork(
-    dim = 512,
-    depth = 6,
-    dim_head = 64,
-    heads = 8
-).cuda()
-
-# diffusion prior network, which contains the CLIP and network (with transformer) above
-
-diffusion_prior = DiffusionPrior(
-    net = prior_network,
-    image_embed_dim = 512,               # this needs to be set
-    timesteps = 100,
-    cond_drop_prob = 0.2,
-    condition_on_text_encodings = False  # this probably should be true, but just to get Laion started
-).cuda()
-
-# mock data
-
-text = torch.randint(0, 49408, (4, 256)).cuda()
-images = torch.randn(4, 3, 256, 256).cuda()
-
-# precompute the text and image embeddings
-# here using the diffusion prior class, but could be done with CLIP alone
-
-clip_image_embeds = torch.randn(4, 512).cuda()
-clip_text_embeds = torch.randn(4, 512).cuda()
-
-# feed text and images into diffusion prior network
-
-loss = diffusion_prior(
-    text_embed = clip_text_embeds,
-    image_embed = clip_image_embeds
-)
-
-loss.backward()
-
-# do the above for many many many steps
-# now the diffusion prior can generate image embeddings from the text embeddings
-```
-
-## Experimental
-
-### DALL-E2 with Latent Diffusion
-
-This repository decides to take the next step and offer DALL-E v2 combined with <a href="https://huggingface.co/spaces/multimodalart/latentdiffusion">latent diffusion</a>, from Rombach et al.
-
-You can use it as follows. Latent diffusion can be limited to just the first U-Net in the cascade, or to any number you wish.
-
-The repository also comes equipped with all the necessary settings to recreate `ViT-VQGan` from the <a href="https://arxiv.org/abs/2110.04627">Improved VQGans</a> paper. Furthermore, the <a href="https://github.com/lucidrains/vector-quantize-pytorch">vector quantization</a> library also comes equipped to do <a href="https://arxiv.org/abs/2203.01941">residual or multi-headed quantization</a>, which I believe will give an even further boost in performance to the autoencoder.
-
-```python
-import torch
-from dalle2_pytorch import Unet, Decoder, CLIP, VQGanVAE
-
-# trained clip from step 1
-
-clip = CLIP(
-    dim_text = 512,
-    dim_image = 512,
-    dim_latent = 512,
-    num_text_tokens = 49408,
-    text_enc_depth = 1,
-    text_seq_len = 256,
-    text_heads = 8,
-    visual_enc_depth = 1,
-    visual_image_size = 256,
-    visual_patch_size = 32,
-    visual_heads = 8
-)
-
-# 3 unets for the decoder (a la cascading DDPM)
-
-# first two unets are doing latent diffusion
-# vqgan-vae must be trained before hand
-
-vae1 = VQGanVAE(
-    dim = 32,
-    image_size = 256,
-    layers = 3,
-    layer_mults = (1, 2, 4)
-)
-
-vae2 = VQGanVAE(
-    dim = 32,
-    image_size = 512,
-    layers = 3,
-    layer_mults = (1, 2, 4)
-)
-
-unet1 = Unet(
-    dim = 32,
-    image_embed_dim = 512,
-    cond_dim = 128,
-    channels = 3,
-    sparse_attn = True,
-    sparse_attn_window = 2,
-    dim_mults = (1, 2, 4, 8)
-)
-
-unet2 = Unet(
-    dim = 32,
-    image_embed_dim = 512,
-    channels = 3,
-    dim_mults = (1, 2, 4, 8, 16),
-    cond_on_image_embeds = True,
-    cond_on_text_encodings = False
-)
-
-unet3 = Unet(
-    dim = 32,
-    image_embed_dim = 512,
-    channels = 3,
-    dim_mults = (1, 2, 4, 8, 16),
-    cond_on_image_embeds = True,
-    cond_on_text_encodings = False,
-    attend_at_middle = False
-)
-
-# decoder, which contains the unet(s) and clip
-
-decoder = Decoder(
-    clip = clip,
-    vae = (vae1, vae2),                # latent diffusion for unet1 (vae1) and unet2 (vae2), but not for the last unet3
-    unet = (unet1, unet2, unet3),      # insert unets in order of low resolution to highest resolution (you can have as many stages as you want here)
-    image_sizes = (256, 512, 1024),    # resolutions, 256 for first unet, 512 for second, 1024 for third
-    timesteps = 100,
-    cond_drop_prob = 0.2
-).cuda()
-
-# mock images (get a lot of this)
-
-images = torch.randn(1, 3, 1024, 1024).cuda()
-
-# feed images into decoder, specifying which unet you want to train
-# each unet can be trained separately, which is one of the benefits of the cascading DDPM scheme
-
-with decoder.one_unet_in_gpu(1):
-    loss = decoder(images, unet_number = 1)
-    loss.backward()
-
-with decoder.one_unet_in_gpu(2):
-    loss = decoder(images, unet_number = 2)
-    loss.backward()
-
-with decoder.one_unet_in_gpu(3):
-    loss = decoder(images, unet_number = 3)
-    loss.backward()
-
-# do the above for many steps for both unets
-
-# then it will learn to generate images based on the CLIP image embeddings
-
-# chaining the unets from lowest resolution to highest resolution (thus cascading)
-
-mock_image_embed = torch.randn(1, 512).cuda()
-images = decoder.sample(mock_image_embed) # (1, 3, 1024, 1024)
-```
-
-## Training wrapper (wip)
-
-Offer training wrappers
-
-## CLI (wip)
+## CLI Usage (work in progress)

 ```bash
 $ dream 'sharing a sunset at the summit of mount everest with my dog'
@@ -625,7 +394,9 @@ $ dream 'sharing a sunset at the summit of mount everest with my dog'

 Once built, images will be saved to the same directory the command is invoked

-<a href="https://github.com/lucidrains/big-sleep">template</a>
+## Training wrapper (wip)
+
+Offer training wrappers

 ## Training CLI (wip)

@@ -639,22 +410,14 @@ Once built, images will be saved to the same directory the command is invoked
 - [x] augment unet so that it can also be conditioned on text encodings (although in paper they hinted this didn't make much a difference)
 - [x] figure out all the current bag of tricks needed to make DDPMs great (starting with the blur trick mentioned in paper)
 - [x] build the cascading ddpm by having Decoder class manage multiple unets at different resolutions
- [x] add efficient attention in unet
- [x] be able to finely customize what to condition on (text, image embed) for specific unet in the cascade (super resolution ddpms near the end may not need too much conditioning)
- [x] offload unets not being trained on to CPU for memory efficiency (for training each resolution unets separately)
- [x] build out latent diffusion architecture, with the vq-reg variant (vqgan-vae), make it completely optional and compatible with cascading ddpms
- [x] for decoder, allow ability to customize objective (predict epsilon vs x0), in case latent diffusion does better with prediction of x0
- [x] use attention-based upsampling https://arxiv.org/abs/2112.11435
- [x] use inheritance just this once for sharing logic between decoder and prior network ddpms
- [x] bring in vit-vqgan https://arxiv.org/abs/2110.04627 for the latent diffusion
- [x] abstract interface for CLIP adapter class, so other CLIPs can be brought in
- [ ] become an expert with unets, cleanup unet code, make it fully configurable, port all learnings over to https://github.com/lucidrains/x-unet
- [ ] copy the cascading ddpm code to a separate repo (perhaps https://github.com/lucidrains/denoising-diffusion-pytorch) as the main contribution of dalle2 really is just the prior network
- [ ] transcribe code to Jax, which lowers the activation energy for distributed training, given access to TPUs
+- [ ] use an image resolution cutoff and do cross attention conditioning only if resources allow, and MLP + sum conditioning on rest
+- [ ] make unet more configurable
+- [ ] figure out some factory methods to make cascading unet instantiations less error-prone
+- [ ] offload unets not being trained on to CPU for memory efficiency (for training each resolution unets separately)
 - [ ] train on a toy task, offer in colab
- [ ] think about how best to design a declarative training config that handles preencoding for prior and training of multiple networks in decoder
- [ ] extend diffusion head to use diffusion-gan (potentially using lightweight-gan) to speed up inference
- [ ] bring in tools to train vqgan-vae
+- [ ] add attention to unet - apply some personal tricks with efficient attention - use the sparse attention mechanism from https://github.com/lucidrains/vit-pytorch#maxvit
+- [ ] build out latent diffusion architecture in separate file, as it is not faithful to dalle-2 (but offer it as a setting)
+- [ ] consider U2-net for decoder https://arxiv.org/abs/2005.09007 (also in separate file as experimental) build out https://github.com/lucidrains/x-unet

 ## Citations

@@ -686,27 +449,20 @@ Once built, images will be saved to the same directory the command is invoked

 ```bibtex
@inproceedings{Liu2022ACF,
-    title   = {A ConvNet for the 2020https://arxiv.org/abs/2112.11435s},
+    title   = {A ConvNet for the 2020s},
    author  = {Zhuang Liu and Hanzi Mao and Chaozheng Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie},
    year    = {2022}
 }
 ```

 ```bibtex
-@inproceedings{Tu2022MaxViTMV,
-    title   = {MaxViT: Multi-Axis Vision Transformer},
-    author  = {Zhe-Wei Tu and Hossein Talebi and Han Zhang and Feng Yang and Peyman Milanfar and Alan Conrad Bovik and Yinxiao Li},
-    year    = {2022}
-}
-```
-
-```bibtex
-@article{Yu2021VectorquantizedIM,
-    title   = {Vector-quantized Image Modeling with Improved VQGAN},
-    author  = {Jiahui Yu and Xin Li and Jing Yu Koh and Han Zhang and Ruoming Pang and James Qin and Alexander Ku and Yuanzhong Xu and Jason Baldridge and Yonghui Wu},
-    journal = {ArXiv},
-    year    = {2021},
-    volume  = {abs/2110.04627}
+@misc{zhang2019root,
+    title   = {Root Mean Square Layer Normalization},
+    author  = {Biao Zhang and Rico Sennrich},
+    year    = {2019},
+    eprint  = {1910.07467},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG}
 }
 ```

--- a/dalle2_pytorch/init.py
+++ b/dalle2_pytorch/init.py
@@ -1,4 +1,2 @@
 from dalle2_pytorch.dalle2_pytorch import DALLE2, DiffusionPriorNetwork, DiffusionPrior, Unet, Decoder
-
-from dalle2_pytorch.vqgan_vae import VQGanVAE
 from x_clip import CLIP
--- a/dalle2_pytorch/cli.py
+++ b/dalle2_pytorch/cli.py
@@ -1,51 +1,9 @@
 import click
-import torch
-import torchvision.transforms as T
-from pathlib import Path
-
-from dalle2_pytorch import DALLE2, Decoder, DiffusionPrior
-
-def safeget(dictionary, keys, default = None):
-    return reduce(lambda d, key: d.get(key, default) if isinstance(d, dict) else default, keys.split('.'), dictionary)
-
-def simple_slugify(text, max_length = 255):
-    return text.replace("-", "_").replace(",", "").replace(" ", "_").replace("|", "--").strip('-_')[:max_length]
-
-def get_pkg_version():
-    from pkg_resources import get_distribution
-    return get_distribution('dalle2_pytorch').version

 def main():
    pass

@click.command()
-@click.option('--model', default = './dalle2.pt', help = 'path to trained DALL-E2 model')
-@click.option('--cond_scale', default = 2, help = 'conditioning scale (classifier free guidance) in decoder')
@click.argument('text')
-def dream(
-    model,
-    cond_scale,
-    text
-):
-    model_path = Path(model)
-    full_model_path = str(model_path.resolve())
-    assert model_path.exists(), f'model not found at {full_model_path}'
-    loaded = torch.load(str(model_path))
-
-    version = safeget(loaded, 'version')
-    print(f'loading DALL-E2 from {full_model_path}, saved at version {version} - current package version is {get_pkg_version()}')
-
-    prior_init_params = safeget(loaded, 'init_params.prior')
-    decoder_init_params = safeget(loaded, 'init_params.decoder')
-    model_params = safeget(loaded, 'model_params')
-
-    prior = DiffusionPrior(**prior_init_params)
-    decoder = Decoder(**decoder_init_params)
-
-    dalle2 = DALLE2(prior, decoder)
-    dalle2.load_state_dict(model_params)
-
-    image = dalle2(text, cond_scale = cond_scale)
-
-    pil_image = T.ToPILImage()(image)
-    return pil_image.save(f'./{simple_slugify(text)}.png')
+def dream(text):
+    return image
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
--- a/dalle2_pytorch/train.py
+++ b/dalle2_pytorch/train.py
@@ -1,53 +0,0 @@
-import copy
-import torch
-from torch import nn
-
-# exponential moving average wrapper
-
-class EMA(nn.Module):
-    def __init__(
-        self,
-        model,
-        beta = 0.99,
-        ema_update_after_step = 1000,
-        ema_update_every = 10,
-    ):
-        super().__init__()
-        self.beta = beta
-        self.online_model = model
-        self.ema_model = copy.deepcopy(model)
-
-        self.ema_update_after_step = ema_update_after_step # only start EMA after this step number, starting at 0
-        self.ema_update_every = ema_update_every
-
-        self.register_buffer('initted', torch.Tensor([False]))
-        self.register_buffer('step', torch.tensor([0.]))
-
-    def update(self):
-        self.step += 1
-
-        if self.step <= self.ema_update_after_step or (self.step % self.ema_update_every) != 0:
-            return
-
-        if not self.initted:
-            self.ema_model.state_dict(self.online_model.state_dict())
-            self.initted.data.copy_(torch.Tensor([True]))
-
-        self.update_moving_average(self.ema_model, self.online_model)
-
-    def update_moving_average(self, ma_model, current_model):
-        def calculate_ema(beta, old, new):
-            if not exists(old):
-                return new
-            return old * beta + (1 - beta) * new
-
-        for current_params, ma_params in zip(current_model.parameters(), ma_model.parameters()):
-            old_weight, up_weight = ma_params.data, current_params.data
-            ma_params.data = calculate_ema(self.beta, old_weight, up_weight)
-
-        for current_buffer, ma_buffer in zip(current_model.buffers(), ma_model.buffers()):
-            new_buffer_value = calculate_ema(self.beta, ma_buffer, current_buffer)
-            ma_buffer.copy_(new_buffer_value)
-
-    def __call__(self, *args, **kwargs):
-        return self.ema_model(*args, **kwargs)
--- a/dalle2_pytorch/vqgan_vae.py
+++ b/dalle2_pytorch/vqgan_vae.py
@@ -1,755 +0,0 @@
-import copy
-import math
-from math import sqrt
-from functools import partial, wraps
-
-from vector_quantize_pytorch import VectorQuantize as VQ
-
-import torch
-from torch import nn, einsum
-import torch.nn.functional as F
-from torch.autograd import grad as torch_grad
-import torchvision
-
-from einops import rearrange, reduce, repeat
-from einops_exts import rearrange_many
-from einops.layers.torch import Rearrange
-
-# constants
-
-MList = nn.ModuleList
-
-# helper functions
-
-def exists(val):
-    return val is not None
-
-def default(val, d):
-    return val if exists(val) else d
-
-# decorators
-
-def eval_decorator(fn):
-    def inner(model, *args, **kwargs):
-        was_training = model.training
-        model.eval()
-        out = fn(model, *args, **kwargs)
-        model.train(was_training)
-        return out
-    return inner
-
-def remove_vgg(fn):
-    @wraps(fn)
-    def inner(self, *args, **kwargs):
-        has_vgg = hasattr(self, 'vgg')
-        if has_vgg:
-            vgg = self.vgg
-            delattr(self, 'vgg')
-
-        out = fn(self, *args, **kwargs)
-
-        if has_vgg:
-            self.vgg = vgg
-
-        return out
-    return inner
-
-# keyword argument helpers
-
-def pick_and_pop(keys, d):
-    values = list(map(lambda key: d.pop(key), keys))
-    return dict(zip(keys, values))
-
-def group_dict_by_key(cond, d):
-    return_val = [dict(),dict()]
-    for key in d.keys():
-        match = bool(cond(key))
-        ind = int(not match)
-        return_val[ind][key] = d[key]
-    return (*return_val,)
-
-def string_begins_with(prefix, str):
-    return str.startswith(prefix)
-
-def group_by_key_prefix(prefix, d):
-    return group_dict_by_key(partial(string_begins_with, prefix), d)
-
-def groupby_prefix_and_trim(prefix, d):
-    kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
-    kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
-    return kwargs_without_prefix, kwargs
-
-# tensor helper functions
-
-def log(t, eps = 1e-10):
-    return torch.log(t + eps)
-
-def gradient_penalty(images, output, weight = 10):
-    batch_size = images.shape[0]
-    gradients = torch_grad(outputs = output, inputs = images,
-                           grad_outputs = torch.ones(output.size(), device = images.device),
-                           create_graph = True, retain_graph = True, only_inputs = True)[0]
-
-    gradients = rearrange(gradients, 'b ... -> b (...)')
-    return weight * ((gradients.norm(2, dim = 1) - 1) ** 2).mean()
-
-def l2norm(t):
-    return F.normalize(t, dim = -1)
-
-def leaky_relu(p = 0.1):
-    return nn.LeakyReLU(0.1)
-
-def stable_softmax(t, dim = -1, alpha = 32 ** 2):
-    t = t / alpha
-    t = t - torch.amax(t, dim = dim, keepdim = True).detach()
-    return (t * alpha).softmax(dim = dim)
-
-def safe_div(numer, denom, eps = 1e-8):
-    return numer / (denom + eps)
-
-# gan losses
-
-def hinge_discr_loss(fake, real):
-    return (F.relu(1 + fake) + F.relu(1 - real)).mean()
-
-def hinge_gen_loss(fake):
-    return -fake.mean()
-
-def bce_discr_loss(fake, real):
-    return (-log(1 - torch.sigmoid(fake)) - log(torch.sigmoid(real))).mean()
-
-def bce_gen_loss(fake):
-    return -log(torch.sigmoid(fake)).mean()
-
-def grad_layer_wrt_loss(loss, layer):
-    return torch_grad(
-        outputs = loss,
-        inputs = layer,
-        grad_outputs = torch.ones_like(loss),
-        retain_graph = True
-    )[0].detach()
-
-# vqgan vae
-
-class LayerNormChan(nn.Module):
-    def __init__(
-        self,
-        dim,
-        eps = 1e-5
-    ):
-        super().__init__()
-        self.eps = eps
-        self.gamma = nn.Parameter(torch.ones(1, dim, 1, 1))
-
-    def forward(self, x):
-        var = torch.var(x, dim = 1, unbiased = False, keepdim = True)
-        mean = torch.mean(x, dim = 1, keepdim = True)
-        return (x - mean) / (var + self.eps).sqrt() * self.gamma
-
-# discriminator
-
-class Discriminator(nn.Module):
-    def __init__(
-        self,
-        dims,
-        channels = 3,
-        groups = 16,
-        init_kernel_size = 5
-    ):
-        super().__init__()
-        dim_pairs = zip(dims[:-1], dims[1:])
-
-        self.layers = MList([nn.Sequential(nn.Conv2d(channels, dims[0], init_kernel_size, padding = init_kernel_size // 2), leaky_relu())])
-
-        for dim_in, dim_out in dim_pairs:
-            self.layers.append(nn.Sequential(
-                nn.Conv2d(dim_in, dim_out, 4, stride = 2, padding = 1),
-                nn.GroupNorm(groups, dim_out),
-                leaky_relu()
-            ))
-
-        dim = dims[-1]
-        self.to_logits = nn.Sequential( # return 5 x 5, for PatchGAN-esque training
-            nn.Conv2d(dim, dim, 1),
-            leaky_relu(),
-            nn.Conv2d(dim, 1, 4)
-        )
-
-    def forward(self, x):
-        for net in self.layers:
-            x = net(x)
-
-        return self.to_logits(x)
-
-# positional encoding
-
-class ContinuousPositionBias(nn.Module):
-    """ from https://arxiv.org/abs/2111.09883 """
-
-    def __init__(self, *, dim, heads, layers = 2):
-        super().__init__()
-        self.net = MList([])
-        self.net.append(nn.Sequential(nn.Linear(2, dim), leaky_relu()))
-
-        for _ in range(layers - 1):
-            self.net.append(nn.Sequential(nn.Linear(dim, dim), leaky_relu()))
-
-        self.net.append(nn.Linear(dim, heads))
-        self.register_buffer('rel_pos', None, persistent = False)
-
-    def forward(self, x):
-        n, device = x.shape[-1], x.device
-        fmap_size = int(sqrt(n))
-
-        if not exists(self.rel_pos):
-            pos = torch.arange(fmap_size, device = device)
-            grid = torch.stack(torch.meshgrid(pos, pos, indexing = 'ij'))
-            grid = rearrange(grid, 'c i j -> (i j) c')
-            rel_pos = rearrange(grid, 'i c -> i 1 c') - rearrange(grid, 'j c -> 1 j c')
-            rel_pos = torch.sign(rel_pos) * torch.log(rel_pos.abs() + 1)
-            self.register_buffer('rel_pos', rel_pos, persistent = False)
-
-        rel_pos = self.rel_pos.float()
-
-        for layer in self.net:
-            rel_pos = layer(rel_pos)
-
-        bias = rearrange(rel_pos, 'i j h -> h i j')
-        return x + bias
-
-# resnet encoder / decoder
-
-class ResnetEncDec(nn.Module):
-    def __init__(
-        self,
-        dim,
-        *,
-        channels = 3,
-        layers = 4,
-        layer_mults = None,
-        num_resnet_blocks = 1,
-        resnet_groups = 16,
-        first_conv_kernel_size = 5,
-        use_attn = True,
-        attn_dim_head = 64,
-        attn_heads = 8,
-        attn_dropout = 0.,
-    ):
-        super().__init__()
-        assert dim % resnet_groups == 0, f'dimension {dim} must be divisible by {resnet_groups} (groups for the groupnorm)'
-
-        self.layers = layers
-
-        self.encoders = MList([])
-        self.decoders = MList([])
-
-        layer_mults = default(layer_mults, list(map(lambda t: 2 ** t, range(layers))))
-        assert len(layer_mults) == layers, 'layer multipliers must be equal to designated number of layers'
-
-        layer_dims = [dim * mult for mult in layer_mults]
-        dims = (dim, *layer_dims)
-
-        self.encoded_dim = dims[-1]
-
-        dim_pairs = zip(dims[:-1], dims[1:])
-
-        append = lambda arr, t: arr.append(t)
-        prepend = lambda arr, t: arr.insert(0, t)
-
-        if not isinstance(num_resnet_blocks, tuple):
-            num_resnet_blocks = (*((0,) * (layers - 1)), num_resnet_blocks)
-
-        if not isinstance(use_attn, tuple):
-            use_attn = (*((False,) * (layers - 1)), use_attn)
-
-        assert len(num_resnet_blocks) == layers, 'number of resnet blocks config must be equal to number of layers'
-        assert len(use_attn) == layers
-
-        for layer_index, (dim_in, dim_out), layer_num_resnet_blocks, layer_use_attn in zip(range(layers), dim_pairs, num_resnet_blocks, use_attn):
-            append(self.encoders, nn.Sequential(nn.Conv2d(dim_in, dim_out, 4, stride = 2, padding = 1), leaky_relu()))
-            prepend(self.decoders, nn.Sequential(nn.ConvTranspose2d(dim_out, dim_in, 4, 2, 1), leaky_relu()))
-
-            if layer_use_attn:
-                prepend(self.decoders, VQGanAttention(dim = dim_out, heads = attn_heads, dim_head = attn_dim_head, dropout = attn_dropout))
-
-            for _ in range(layer_num_resnet_blocks):
-                append(self.encoders, ResBlock(dim_out, groups = resnet_groups))
-                prepend(self.decoders, GLUResBlock(dim_out, groups = resnet_groups))
-
-            if layer_use_attn:
-                append(self.encoders, VQGanAttention(dim = dim_out, heads = attn_heads, dim_head = attn_dim_head, dropout = attn_dropout))
-
-        prepend(self.encoders, nn.Conv2d(channels, dim, first_conv_kernel_size, padding = first_conv_kernel_size // 2))
-        append(self.decoders, nn.Conv2d(dim, channels, 1))
-
-    def get_encoded_fmap_size(self, image_size):
-        return image_size // (2 ** self.layers)
-
-    def encode(self, x):
-        for enc in self.encoders:
-            x = enc(x)
-        return x
-
-    def decode(self, x):
-        for dec in self.decoders:
-            x = dec(x)
-        return x
-
-class GLUResBlock(nn.Module):
-    def __init__(self, chan, groups = 16):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.Conv2d(chan, chan * 2, 3, padding = 1),
-            nn.GLU(dim = 1),
-            nn.GroupNorm(groups, chan),
-            nn.Conv2d(chan, chan * 2, 3, padding = 1),
-            nn.GLU(dim = 1),
-            nn.GroupNorm(groups, chan),
-            nn.Conv2d(chan, chan, 1)
-        )
-
-    def forward(self, x):
-        return self.net(x) + x
-
-class ResBlock(nn.Module):
-    def __init__(self, chan, groups = 16):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.Conv2d(chan, chan, 3, padding = 1),
-            nn.GroupNorm(groups, chan),
-            leaky_relu(),
-            nn.Conv2d(chan, chan, 3, padding = 1),
-            nn.GroupNorm(groups, chan),
-            leaky_relu(),
-            nn.Conv2d(chan, chan, 1)
-        )
-
-    def forward(self, x):
-        return self.net(x) + x
-
-# vqgan attention layer
-
-class VQGanAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        dim,
-        dim_head = 64,
-        heads = 8,
-        dropout = 0.
-    ):
-        super().__init__()
-        self.heads = heads
-        self.scale = dim_head ** -0.5
-        inner_dim = heads * dim_head
-
-        self.dropout = nn.Dropout(dropout)
-        self.pre_norm = LayerNormChan(dim)
-
-        self.cpb = ContinuousPositionBias(dim = dim // 4, heads = heads)
-        self.to_qkv = nn.Conv2d(dim, inner_dim * 3, 1, bias = False)
-        self.to_out = nn.Conv2d(inner_dim, dim, 1, bias = False)
-
-    def forward(self, x):
-        h = self.heads
-        height, width, residual = *x.shape[-2:], x.clone()
-
-        x = self.pre_norm(x)
-
-        q, k, v = self.to_qkv(x).chunk(3, dim = 1)
-
-        q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h = h), (q, k, v))
-
-        sim = einsum('b h c i, b h c j -> b h i j', q, k) * self.scale
-
-        sim = self.cpb(sim)
-
-        attn = stable_softmax(sim, dim = -1)
-        attn = self.dropout(attn)
-
-        out = einsum('b h i j, b h c j -> b h c i', attn, v)
-        out = rearrange(out, 'b h c (x y) -> b (h c) x y', x = height, y = width)
-        out = self.to_out(out)
-
-        return out + residual
-
-# ViT encoder / decoder
-
-class RearrangeImage(nn.Module):
-    def forward(self, x):
-        n = x.shape[1]
-        w = h = int(sqrt(n))
-        return rearrange(x, 'b (h w) ... -> b h w ...', h = h, w = w)
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        dim,
-        *,
-        heads = 8,
-        dim_head = 32
-    ):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        self.heads = heads
-        self.scale = dim_head ** -0.5
-        inner_dim = dim_head * heads
-
-        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
-        self.to_out = nn.Linear(inner_dim, dim)
-
-    def forward(self, x):
-        h = self.heads
-
-        x = self.norm(x)
-
-        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
-        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = h)
-
-        q = q * self.scale
-        sim = einsum('b h i d, b h j d -> b h i j', q, k)
-
-        sim = sim - sim.amax(dim = -1, keepdim = True).detach()
-        attn = sim.softmax(dim = -1)
-
-        out = einsum('b h i j, b h j d -> b h i d', attn, v)
-
-        out = rearrange(out, 'b h n d -> b n (h d)')
-        return self.to_out(out)
-
-def FeedForward(dim, mult = 4):
-    return nn.Sequential(
-        nn.LayerNorm(dim),
-        nn.Linear(dim, dim * mult, bias = False),
-        nn.GELU(),
-        nn.Linear(dim * mult, dim, bias = False)
-    )
-
-class Transformer(nn.Module):
-    def __init__(
-        self,
-        dim,
-        *,
-        layers,
-        dim_head = 32,
-        heads = 8,
-        ff_mult = 4
-    ):
-        super().__init__()
-        self.layers = nn.ModuleList([])
-        for _ in range(layers):
-            self.layers.append(nn.ModuleList([
-                Attention(dim = dim, dim_head = dim_head, heads = heads),
-                FeedForward(dim = dim, mult = ff_mult)
-            ]))
-
-        self.norm = nn.LayerNorm(dim)
-
-    def forward(self, x):
-        for attn, ff in self.layers:
-            x = attn(x) + x
-            x = ff(x) + x
-
-        return self.norm(x)
-
-class ViTEncDec(nn.Module):
-    def __init__(
-        self,
-        dim,
-        channels = 3,
-        layers = 4,
-        patch_size = 8,
-        dim_head = 32,
-        heads = 8,
-        ff_mult = 4
-    ):
-        super().__init__()
-        self.encoded_dim = dim
-        self.patch_size = patch_size
-
-        input_dim = channels * (patch_size ** 2)
-
-        self.encoder = nn.Sequential(
-            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.Linear(input_dim, dim),
-            Transformer(
-                dim = dim,
-                dim_head = dim_head,
-                heads = heads,
-                ff_mult = ff_mult,
-                layers = layers
-            ),
-            RearrangeImage(),
-            Rearrange('b h w c -> b c h w')
-        )
-
-        self.decoder = nn.Sequential(
-            Rearrange('b c h w -> b (h w) c'),
-            Transformer(
-                dim = dim,
-                dim_head = dim_head,
-                heads = heads,
-                ff_mult = ff_mult,
-                layers = layers
-            ),
-            nn.Sequential(
-                nn.Linear(dim, dim * 4, bias = False),
-                nn.Tanh(),
-                nn.Linear(dim * 4, input_dim, bias = False),
-            ),
-            RearrangeImage(),
-            Rearrange('b h w (p1 p2 c) -> b c (h p1) (w p2)', p1 = patch_size, p2 = patch_size)
-        )
-
-    def get_encoded_fmap_size(self, image_size):
-        return image_size // self.patch_size
-
-    def encode(self, x):
-        return self.encoder(x)
-
-    def decode(self, x):
-        return self.decoder(x)
-
-# main vqgan-vae classes
-
-class NullVQGanVAE(nn.Module):
-    def __init__(
-        self,
-        *,
-        channels
-    ):
-        super().__init__()
-        self.encoded_dim = channels
-        self.layers = 0
-
-    def get_encoded_fmap_size(self, size):
-        return size
-
-    def copy_for_eval(self):
-        return self
-
-    def encode(self, x):
-        return x
-
-    def decode(self, x):
-        return x
-
-class VQGanVAE(nn.Module):
-    def __init__(
-        self,
-        *,
-        dim,
-        image_size,
-        channels = 3,
-        layers = 4,
-        l2_recon_loss = False,
-        use_hinge_loss = True,
-        vgg = None,
-        vq_codebook_size = 512,
-        vq_decay = 0.8,
-        vq_commitment_weight = 1.,
-        vq_kmeans_init = True,
-        vq_use_cosine_sim = True,
-        use_vgg_and_gan = True,
-        vae_type = 'resnet',
-        discr_layers = 4,
-        **kwargs
-    ):
-        super().__init__()
-        vq_kwargs, kwargs = groupby_prefix_and_trim('vq_', kwargs)
-        encdec_kwargs, kwargs = groupby_prefix_and_trim('encdec_', kwargs)
-
-        self.image_size = image_size
-        self.channels = channels
-        self.codebook_size = vq_codebook_size
-
-        if vae_type == 'resnet':
-            enc_dec_klass = ResnetEncDec
-        elif vae_type == 'vit':
-            enc_dec_klass = ViTEncDec
-        else:
-            raise ValueError(f'{vae_type} not valid')
-
-        self.enc_dec = enc_dec_klass(
-            dim = dim,
-            channels = channels,
-            layers = layers,
-            **encdec_kwargs
-        )
-
-        self.vq = VQ(
-            dim = self.enc_dec.encoded_dim,
-            codebook_size = vq_codebook_size,
-            decay = vq_decay,
-            commitment_weight = vq_commitment_weight,
-            accept_image_fmap = True,
-            kmeans_init = vq_kmeans_init,
-            use_cosine_sim = vq_use_cosine_sim,
-            **vq_kwargs
-        )
-
-        # reconstruction loss
-
-        self.recon_loss_fn = F.mse_loss if l2_recon_loss else F.l1_loss
-
-        # turn off GAN and perceptual loss if grayscale
-
-        self.vgg = None
-        self.discr = None
-        self.use_vgg_and_gan = use_vgg_and_gan
-
-        if not use_vgg_and_gan:
-            return
-
-        # preceptual loss
-
-        if exists(vgg):
-            self.vgg = vgg
-        else:
-            self.vgg = torchvision.models.vgg16(pretrained = True)
-            self.vgg.classifier = nn.Sequential(*self.vgg.classifier[:-2])
-
-        # gan related losses
-
-        layer_mults = list(map(lambda t: 2 ** t, range(discr_layers)))
-        layer_dims = [dim * mult for mult in layer_mults]
-        dims = (dim, *layer_dims)
-
-        self.discr = Discriminator(dims = dims, channels = channels)
-
-        self.discr_loss = hinge_discr_loss if use_hinge_loss else bce_discr_loss
-        self.gen_loss = hinge_gen_loss if use_hinge_loss else bce_gen_loss
-
-    @property
-    def encoded_dim(self):
-        return self.enc_dec.encoded_dim
-
-    def get_encoded_fmap_size(self, image_size):
-        return self.enc_dec.get_encoded_fmap_size(image_size)
-
-    def copy_for_eval(self):
-        device = next(self.parameters()).device
-        vae_copy = copy.deepcopy(self.cpu())
-
-        if vae_copy.use_vgg_and_gan:
-            del vae_copy.discr
-            del vae_copy.vgg
-
-        vae_copy.eval()
-        return vae_copy.to(device)
-
-    @remove_vgg
-    def state_dict(self, *args, **kwargs):
-        return super().state_dict(*args, **kwargs)
-
-    @remove_vgg
-    def load_state_dict(self, *args, **kwargs):
-        return super().load_state_dict(*args, **kwargs)
-
-    @property
-    def codebook(self):
-        return self.vq.codebook
-
-    def encode(self, fmap):
-        fmap = self.enc_dec.encode(fmap)
-        return fmap
-
-    def decode(self, fmap, return_indices_and_loss = False):
-        fmap, indices, commit_loss = self.vq(fmap)
-
-        fmap = self.enc_dec.decode(fmap)
-
-        if not return_indices_and_loss:
-            return fmap
-
-        return fmap, indices, commit_loss
-
-    def forward(
-        self,
-        img,
-        return_loss = False,
-        return_discr_loss = False,
-        return_recons = False,
-        add_gradient_penalty = True
-    ):
-        batch, channels, height, width, device = *img.shape, img.device
-        assert height == self.image_size and width == self.image_size, 'height and width of input image must be equal to {self.image_size}'
-        assert channels == self.channels, 'number of channels on image or sketch is not equal to the channels set on this VQGanVAE'
-
-        fmap = self.encode(img)
-
-        fmap, indices, commit_loss = self.decode(fmap, return_indices_and_loss = True)
-
-        if not return_loss and not return_discr_loss:
-            return fmap
-
-        assert return_loss ^ return_discr_loss, 'you should either return autoencoder loss or discriminator loss, but not both'
-
-        # whether to return discriminator loss
-
-        if return_discr_loss:
-            assert exists(self.discr), 'discriminator must exist to train it'
-
-            fmap.detach_()
-            img.requires_grad_()
-
-            fmap_discr_logits, img_discr_logits = map(self.discr, (fmap, img))
-
-            discr_loss = self.discr_loss(fmap_discr_logits, img_discr_logits)
-
-            if add_gradient_penalty:
-                gp = gradient_penalty(img, img_discr_logits)
-                loss = discr_loss + gp
-
-            if return_recons:
-                return loss, fmap
-
-            return loss
-
-        # reconstruction loss
-
-        recon_loss = self.recon_loss_fn(fmap, img)
-
-        # early return if training on grayscale
-
-        if not self.use_vgg_and_gan:
-            if return_recons:
-                return recon_loss, fmap
-
-            return recon_loss
-
-        # perceptual loss
-
-        img_vgg_input = img
-        fmap_vgg_input = fmap
-
-        if img.shape[1] == 1:
-            # handle grayscale for vgg
-            img_vgg_input, fmap_vgg_input = map(lambda t: repeat(t, 'b 1 ... -> b c ...', c = 3), (img_vgg_input, fmap_vgg_input))
-
-        img_vgg_feats = self.vgg(img_vgg_input)
-        recon_vgg_feats = self.vgg(fmap_vgg_input)
-        perceptual_loss = F.mse_loss(img_vgg_feats, recon_vgg_feats)
-
-        # generator loss
-
-        gen_loss = self.gen_loss(self.discr(fmap))
-
-        # calculate adaptive weight
-
-        last_dec_layer = self.decoders[-1].weight
-
-        norm_grad_wrt_gen_loss = grad_layer_wrt_loss(gen_loss, last_dec_layer).norm(p = 2)
-        norm_grad_wrt_perceptual_loss = grad_layer_wrt_loss(perceptual_loss, last_dec_layer).norm(p = 2)
-
-        adaptive_weight = safe_div(norm_grad_wrt_perceptual_loss, norm_grad_wrt_gen_loss)
-        adaptive_weight.clamp_(max = 1e4)
-
-        # combine losses
-
-        loss = recon_loss + perceptual_loss + commit_loss + adaptive_weight * gen_loss
-
-        if return_recons:
-            return loss, fmap
-
-        return loss
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.59',
+  version = '0.0.21',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
@@ -30,7 +30,6 @@ setup(
    'torch>=1.10',
    'torchvision',
    'tqdm',
-    'vector-quantize-pytorch',
    'x-clip>=0.4.4',
    'youtokentome'
  ],