mirror of https://github.com/lucidrains/DALLE2-pytorch.git
make sure resnet groups in unet is finely customizable
@@ -999,6 +999,7 @@ Once built, images will be saved to the same directory the command is invoked
 - [x] make sure DDPMs can be run with traditional resnet blocks (but leave convnext as an option for experimentation)
 - [x] make sure for the latter unets in the cascade, one can train on crops for learning super resolution (constrain the unet to be only convolutions in that case, or allow conv-like attention with rel pos bias)
 - [x] offer setting in diffusion prior to split time and image embeddings into multiple tokens, configurable, for more surface area during attention
+- [x] make sure resnet hyperparameters can be configurable across unet depth (groups and expansion factor)
 - [ ] become an expert with unets, cleanup unet code, make it fully configurable, port all learnings over to https://github.com/lucidrains/x-unet (test out unet² in ddpm repo) - consider https://github.com/lucidrains/uformer-pytorch attention-based unet
 - [ ] make sure the cascading ddpm in the repository can be trained unconditionally, offer a one-line CLI tool for training on a folder of images
 - [ ] transcribe code to Jax, which lowers the activation energy for distributed training, given access to TPUs
@@ -1012,7 +1013,6 @@ Once built, images will be saved to the same directory the command is invoked
 - [ ] use an experimental tracker agnostic setup, as done <a href="https://github.com/lucidrains/tf-bind-transformer#simple-trainer-class-for-fine-tuning">here</a>
 - [ ] interface out the vqgan-vae so a pretrained one can be pulled off the shelf to validate latent diffusion + DALL-E2
 - [ ] make sure FILIP works with DALL-E2 from x-clip https://arxiv.org/abs/2111.07783
-- [ ] make sure resnet hyperparameters can be configurable across unet depth (groups and expansion factor)
 - [ ] offer save / load methods on the trainer classes to automatically take care of state dicts for scalers / optimizers / saving versions and checking for breaking changes
 - [ ] bring in skip-layer excitations (from lightweight gan paper) to see if it helps for either decoder of unet or vqgan-vae training
 
@@ -1251,8 +1251,7 @@ class Unet(nn.Module):
         cond_on_image_embeds = False,
         init_dim = None,
         init_conv_kernel_size = 7,
-        block_type = 'resnet',
-        block_resnet_groups = 8,
+        resnet_groups = 8,
         **kwargs
     ):
         super().__init__()
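With this change, resnet_groups accepts either a single int or one value per unet resolution. A minimal sketch of how the new argument might be passed, reusing the constructor arguments from the repository README (the concrete values here are illustrative, not prescriptive):

from dalle2_pytorch import Unet

# resnet_groups may now be a single int (broadcast to every resolution)
# or a tuple with one group count per unet depth
unet = Unet(
    dim = 128,
    image_embed_dim = 512,
    cond_dim = 128,
    channels = 3,
    dim_mults = (1, 2, 4, 8),
    resnet_groups = (8, 8, 16, 16)  # one entry per resolution in dim_mults
)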
@@ -1330,7 +1329,9 @@
 
         # resnet block klass
 
-        block_klass = partial(ResnetBlock, groups = block_resnet_groups)
+        resnet_groups = cast_tuple(resnet_groups, len(in_out))
+
+        assert len(resnet_groups) == len(in_out)
 
         # layers
 
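For reference, cast_tuple is the small helper the new code leans on to broadcast a bare int into a per-resolution tuple. A sketch, assuming the usual definition this codebase carries:

def cast_tuple(val, length = 1):
    # leave tuples untouched, otherwise repeat the scalar `length` times
    return val if isinstance(val, tuple) else ((val,) * length)

cast_tuple(8, 4)               # (8, 8, 8, 8)
cast_tuple((8, 8, 16, 16), 4)  # (8, 8, 16, 16) - passed through unchanged

The assert that follows it then guarantees a user-supplied tuple has exactly one group count per resolution.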
@@ -1338,38 +1339,39 @@
         self.ups = nn.ModuleList([])
         num_resolutions = len(in_out)
 
-        for ind, (dim_in, dim_out) in enumerate(in_out):
+        for ind, ((dim_in, dim_out), groups) in enumerate(zip(in_out, resnet_groups)):
             is_first = ind == 0
             is_last = ind >= (num_resolutions - 1)
             layer_cond_dim = cond_dim if not is_first else None
 
             self.downs.append(nn.ModuleList([
-                block_klass(dim_in, dim_out, time_cond_dim = time_cond_dim),
+                ResnetBlock(dim_in, dim_out, time_cond_dim = time_cond_dim, groups = groups),
                 Residual(LinearAttention(dim_out, **attn_kwargs)) if sparse_attn else nn.Identity(),
-                block_klass(dim_out, dim_out, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim),
+                ResnetBlock(dim_out, dim_out, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups),
                 Downsample(dim_out) if not is_last else nn.Identity()
             ]))
 
         mid_dim = dims[-1]
 
-        self.mid_block1 = block_klass(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim)
+        self.mid_block1 = ResnetBlock(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[-1])
         self.mid_attn = EinopsToAndFrom('b c h w', 'b (h w) c', Residual(Attention(mid_dim, **attn_kwargs))) if attend_at_middle else None
-        self.mid_block2 = block_klass(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim)
+        self.mid_block2 = ResnetBlock(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[-1])
 
-        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+        for ind, ((dim_in, dim_out), groups) in enumerate(zip(reversed(in_out[1:]), reversed(resnet_groups))):
             is_last = ind >= (num_resolutions - 2)
             layer_cond_dim = cond_dim if not is_last else None
 
             self.ups.append(nn.ModuleList([
-                block_klass(dim_out * 2, dim_in, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim),
+                ResnetBlock(dim_out * 2, dim_in, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups),
                 Residual(LinearAttention(dim_in, **attn_kwargs)) if sparse_attn else nn.Identity(),
-                block_klass(dim_in, dim_in, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim),
+                ResnetBlock(dim_in, dim_in, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups),
                 Upsample(dim_in)
             ]))
 
         out_dim = default(out_dim, channels)
 
         self.final_conv = nn.Sequential(
-            block_klass(dim, dim),
+            ResnetBlock(dim, dim, groups = resnet_groups[0]),
             nn.Conv2d(dim, out_dim, 1)
         )
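One caveat when choosing per-resolution group counts: the resnet blocks in this repository normalize with nn.GroupNorm, which requires the channel count at each depth to be divisible by the group count. A quick standalone sanity check:

import torch
from torch import nn

# GroupNorm requires num_channels % num_groups == 0, which is why
# group counts may need to vary as channel widths grow with depth
norm = nn.GroupNorm(num_groups = 16, num_channels = 512)
x = torch.randn(1, 512, 8, 8)
print(norm(x).shape)  # torch.Size([1, 512, 8, 8])

A group count that does not divide the channel width at some depth (say 16 groups against 120 channels) errors out at construction, so the per-depth tuple should track the widths produced by dim and dim_mults.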