Compare commits


2 Commits

4 changed files with 23 additions and 4 deletions

View File

@@ -1285,4 +1285,14 @@ For detailed information on training the diffusion prior, please refer to the [d
}
```
```bibtex
@article{Sunkara2022NoMS,
    title   = {No More Strided Convolutions or Pooling: A New CNN Building Block for Low-Resolution Images and Small Objects},
    author  = {Raja Sunkara and Tie Luo},
    journal = {ArXiv},
    year    = {2022},
    volume  = {abs/2208.03641}
}
```
*Creating noise from data is easy; creating data from noise is generative modeling.* - <a href="https://arxiv.org/abs/2011.13456">Yang Song's paper</a>

View File

@@ -1166,6 +1166,10 @@ class DiffusionPrior(nn.Module):
        self.net = net
        self.image_embed_dim = default(image_embed_dim, lambda: clip.dim_latent)
        assert net.dim == self.image_embed_dim, f'your diffusion prior network has a dimension of {net.dim}, but you set your image embedding dimension (keyword image_embed_dim) on DiffusionPrior to {self.image_embed_dim}'
        assert not exists(clip) or clip.dim_latent == self.image_embed_dim, f'you passed in a CLIP to the diffusion prior with latent dimensions of {clip.dim_latent}, but your image embedding dimension (keyword image_embed_dim) for the DiffusionPrior was set to {self.image_embed_dim}'
        self.channels = default(image_channels, lambda: clip.image_channels)
        self.text_cond_drop_prob = default(text_cond_drop_prob, cond_drop_prob)
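The two assertions tie three numbers together: the prior network's `dim`, the `image_embed_dim` given to `DiffusionPrior`, and, when a CLIP adapter is passed, its `dim_latent`. A minimal configuration sketch that satisfies both checks, assuming the README-style constructors (512 is the latent width of the default ViT-B/32 OpenAI adapter; the remaining hyperparameters are illustrative):

```python
from dalle2_pytorch import DiffusionPriorNetwork, DiffusionPrior, OpenAIClipAdapter

clip = OpenAIClipAdapter()          # default ViT-B/32 adapter, dim_latent = 512

prior_network = DiffusionPriorNetwork(
    dim = 512,                      # must equal image_embed_dim below (first assert)
    depth = 6,
    dim_head = 64,
    heads = 8
)

diffusion_prior = DiffusionPrior(
    net = prior_network,
    clip = clip,                    # optional; if passed, clip.dim_latent must also be 512 (second assert)
    image_embed_dim = 512,
    timesteps = 100,
    cond_drop_prob = 0.2
)
```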
@@ -1479,9 +1483,14 @@ class PixelShuffleUpsample(nn.Module):
    def forward(self, x):
        return self.net(x)

-def Downsample(dim, *, dim_out = None):
+def Downsample(dim, dim_out = None):
+    # https://arxiv.org/abs/2208.03641 shows this is the most optimal way to downsample
+    # named SP-conv in the paper, but basically a pixel unshuffle
    dim_out = default(dim_out, dim)
-    return nn.Conv2d(dim, dim_out, 4, 2, 1)
+    return nn.Sequential(
+        Rearrange('b c (h s1) (w s2) -> b (c s1 s2) h w', s1 = 2, s2 = 2),
+        nn.Conv2d(dim * 4, dim_out, 1)
+    )

class WeightStandardizedConv2d(nn.Conv2d):
    """

View File

@@ -519,7 +519,7 @@ class DecoderTrainer(nn.Module):
        clip = decoder.clip
        clip.to(precision_type)

-        decoder, train_dataloader, *optimizers = list(self.accelerator.prepare(decoder, dataloaders['train'], *optimizers))
+        decoder, *optimizers = list(self.accelerator.prepare(decoder, *optimizers))

        self.decoder = decoder
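The trainer change drops the training dataloader from `accelerator.prepare`, so only the decoder and its optimizers are wrapped by HuggingFace Accelerate, and the dataloader is consumed as-is. A rough, self-contained sketch of that pattern with stand-in modules (none of these names come from the repo's trainer):

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

# stand-ins for the decoder, its optimizer and the training dataloader
decoder = nn.Linear(8, 8)
optimizer = torch.optim.Adam(decoder.parameters(), lr = 3e-4)
train_dataloader = DataLoader(TensorDataset(torch.randn(16, 8)), batch_size = 4)

accelerator = Accelerator()

# only the model and optimizer go through prepare(); it returns them in the same order
decoder, optimizer = accelerator.prepare(decoder, optimizer)

for (batch,) in train_dataloader:
    batch = batch.to(accelerator.device)   # the dataloader was not prepared, so device placement is manual
    loss = decoder(batch).pow(2).mean()
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()
```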

View File

@@ -1 +1 @@
-__version__ = '1.9.0'
+__version__ = '1.10.1'