project management

add convnext backbone for vqgan-vae, still need to fix groupnorms in resnet encdec
let researchers do the hyperparameter search
2026-02-12 11:34:29 +01:00 · 2022-05-01 09:32:57 -07:00 · 2022-05-01 09:32:24 -07:00 · 2022-05-01 08:46:21 -07:00
4 changed files with 111 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -820,13 +820,13 @@ Once built, images will be saved to the same directory the command is invoked
 - [x] take care of mixed precision as well as gradient accumulation within decoder trainer
 - [x] just take care of the training for the decoder in a wrapper class, as each unet in the cascade will need its own optimizer
 - [x] bring in tools to train vqgan-vae
+- [x] add convnext backbone for vqgan-vae (in addition to vit [vit-vqgan] + resnet)
 - [ ] become an expert with unets, cleanup unet code, make it fully configurable, port all learnings over to https://github.com/lucidrains/x-unet
 - [ ] copy the cascading ddpm code to a separate repo (perhaps https://github.com/lucidrains/denoising-diffusion-pytorch) as the main contribution of dalle2 really is just the prior network
 - [ ] transcribe code to Jax, which lowers the activation energy for distributed training, given access to TPUs
 - [ ] train on a toy task, offer in colab
 - [ ] think about how best to design a declarative training config that handles preencoding for prior and training of multiple networks in decoder
 - [ ] extend diffusion head to use diffusion-gan (potentially using lightweight-gan) to speed up inference
-- [ ] add convnext backbone for vqgan-vae (in addition to vit [vit-vqgan] + resnet)

 ## Citations

--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -1072,6 +1072,8 @@ class Unet(nn.Module):
        cond_on_text_encodings = False,
        max_text_len = 256,
        cond_on_image_embeds = False,
+        init_dim = None,
+        init_conv_kernel_size = 7
    ):
        super().__init__()
        # save locals to take care of some hyperparameters for cascading DDPM
@@ -1089,9 +1091,10 @@ class Unet(nn.Module):
        self.channels = channels

        init_channels = channels if not lowres_cond else channels * 2 # in cascading diffusion, one concats the low resolution image, blurred, for conditioning the higher resolution synthesis
-        init_dim = dim // 2
+        init_dim = default(init_dim, dim // 2)

-        self.init_conv = nn.Conv2d(init_channels, init_dim, 7, padding = 3)
+        assert (init_conv_kernel_size % 2) == 1
+        self.init_conv = nn.Conv2d(init_channels, init_dim, init_conv_kernel_size, padding = init_conv_kernel_size // 2)

        dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
        in_out = list(zip(dims[:-1], dims[1:]))
--- a/dalle2_pytorch/vqgan_vae.py
+++ b/dalle2_pytorch/vqgan_vae.py
@@ -327,6 +327,108 @@ class ResBlock(nn.Module):
    def forward(self, x):
        return self.net(x) + x

+# convnext enc dec
+
+class ChanLayerNorm(nn.Module):
+    def __init__(self, dim, eps = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
+
+    def forward(self, x):
+        var = torch.var(x, dim = 1, unbiased = False, keepdim = True)
+        mean = torch.mean(x, dim = 1, keepdim = True)
+        return (x - mean) / (var + self.eps).sqrt() * self.g
+
+class ConvNext(nn.Module):
+    def __init__(self, dim, mult = 4, kernel_size = 3, ds_kernel_size = 7):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        self.net = nn.Sequential(
+            nn.Conv2d(dim, dim, ds_kernel_size, padding = ds_kernel_size // 2, groups = dim),
+            ChanLayerNorm(dim),
+            nn.Conv2d(dim, inner_dim, kernel_size, padding = kernel_size // 2),
+            nn.GELU(),
+            nn.Conv2d(inner_dim, dim, kernel_size, padding = kernel_size // 2)
+        )
+
+    def forward(self, x):
+        return self.net(x) + x
+
+class ConvNextEncDec(nn.Module):
+    def __init__(
+        self,
+        dim,
+        *,
+        channels = 3,
+        layers = 4,
+        layer_mults = None,
+        num_blocks = 1,
+        first_conv_kernel_size = 5,
+        use_attn = True,
+        attn_dim_head = 64,
+        attn_heads = 8,
+        attn_dropout = 0.,
+    ):
+        super().__init__()
+
+        self.layers = layers
+
+        self.encoders = MList([])
+        self.decoders = MList([])
+
+        layer_mults = default(layer_mults, list(map(lambda t: 2 ** t, range(layers))))
+        assert len(layer_mults) == layers, 'layer multipliers must be equal to designated number of layers'
+
+        layer_dims = [dim * mult for mult in layer_mults]
+        dims = (dim, *layer_dims)
+
+        self.encoded_dim = dims[-1]
+
+        dim_pairs = zip(dims[:-1], dims[1:])
+
+        append = lambda arr, t: arr.append(t)
+        prepend = lambda arr, t: arr.insert(0, t)
+
+        if not isinstance(num_blocks, tuple):
+            num_blocks = (*((0,) * (layers - 1)), num_blocks)
+
+        if not isinstance(use_attn, tuple):
+            use_attn = (*((False,) * (layers - 1)), use_attn)
+
+        assert len(num_blocks) == layers, 'number of blocks config must be equal to number of layers'
+        assert len(use_attn) == layers
+
+        for layer_index, (dim_in, dim_out), layer_num_blocks, layer_use_attn in zip(range(layers), dim_pairs, num_blocks, use_attn):
+            append(self.encoders, nn.Sequential(nn.Conv2d(dim_in, dim_out, 4, stride = 2, padding = 1), leaky_relu()))
+            prepend(self.decoders, nn.Sequential(nn.ConvTranspose2d(dim_out, dim_in, 4, 2, 1), leaky_relu()))
+
+            if layer_use_attn:
+                prepend(self.decoders, VQGanAttention(dim = dim_out, heads = attn_heads, dim_head = attn_dim_head, dropout = attn_dropout))
+
+            for _ in range(layer_num_blocks):
+                append(self.encoders, ConvNext(dim_out))
+                prepend(self.decoders, ConvNext(dim_out))
+
+            if layer_use_attn:
+                append(self.encoders, VQGanAttention(dim = dim_out, heads = attn_heads, dim_head = attn_dim_head, dropout = attn_dropout))
+
+        prepend(self.encoders, nn.Conv2d(channels, dim, first_conv_kernel_size, padding = first_conv_kernel_size // 2))
+        append(self.decoders, nn.Conv2d(dim, channels, 1))
+
+    def get_encoded_fmap_size(self, image_size):
+        return image_size // (2 ** self.layers)
+
+    def encode(self, x):
+        for enc in self.encoders:
+            x = enc(x)
+        return x
+
+    def decode(self, x):
+        for dec in self.decoders:
+            x = dec(x)
+        return x
+
 # vqgan attention layer

 class VQGanAttention(nn.Module):
@@ -568,6 +670,8 @@ class VQGanVAE(nn.Module):
            enc_dec_klass = ResnetEncDec
        elif vae_type == 'vit':
            enc_dec_klass = ViTEncDec
+        elif vae_type == 'convnext':
+            enc_dec_klass = ConvNextEncDec
        else:
            raise ValueError(f'{vae_type} not valid')

--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.84',
+  version = '0.0.86',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	8b9bbec7d1	project management	2022-05-01 09:32:57 -07:00
Phil Wang	1bb9fc9829	add convnext backbone for vqgan-vae, still need to fix groupnorms in resnet encdec	2022-05-01 09:32:24 -07:00
Phil Wang	5e421bd5bb	let researchers do the hyperparameter search	2022-05-01 08:46:21 -07:00