make kernel size and sigma for gaussian blur for cascading DDPM overridable at forward. also make sure unets are wrapped in a modulelist so that at sample time, blurring does not happen

readme
2026-02-13 12:04:24 +01:00 · 2022-04-18 12:00:47 -07:00 · 2022-04-18 11:52:25 -07:00
3 changed files with 19 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -109,7 +109,7 @@ unet = Unet(
 # decoder, which contains the unet and clip

 decoder = Decoder(
-    net = unet,
+    unet = unet,
    clip = clip,
    timesteps = 100,
    cond_drop_prob = 0.2
@@ -182,9 +182,9 @@ loss.backward()
 # now the diffusion prior can generate image embeddings from the text embeddings
 ```

-In the paper, they actually used a <a href="https://cascaded-diffusion.github.io/">recently discovered technique</a>, from <a href="http://www.jonathanho.me/">Jonathan Ho</a> himself (original author of DDPMs, from which DALL-E2 is based).
+In the paper, they actually used a <a href="https://cascaded-diffusion.github.io/">recently discovered technique</a>, from <a href="http://www.jonathanho.me/">Jonathan Ho</a> himself (original author of DDPMs, the core technique used in DALL-E v2) for high resolution image synthesis.

-This can easily be used within the framework offered in this repository as so
+This can easily be used within this framework as so

 ```python
 import torch
@@ -218,7 +218,7 @@ unet1 = Unet(
 unet2 = Unet(
    dim = 16,
    image_embed_dim = 512,
-    lowres_cond = True,         # subsequence unets must have this turned on (and first unet must have this turned off)
+    lowres_cond = True,         # subsequent unets must have this turned on (and first unet must have this turned off)
    cond_dim = 128,
    channels = 3,
    dim_mults = (1, 2, 4, 8, 16)
@@ -412,6 +412,8 @@ Offer training wrappers
 - [x] build the cascading ddpm by having Decoder class manage multiple unets at different resolutions
 - [ ] use an image resolution cutoff and do cross attention conditioning only if resources allow, and MLP + sum conditioning on rest
 - [ ] make unet more configurable
+- [ ] figure out some factory methods to make cascading unet instantiations less error-prone
+- [ ] offload unets not being trained on to CPU for memory efficiency (for training each resolution unets separately)
 - [ ] train on a toy task, offer in colab
 - [ ] add attention to unet - apply some personal tricks with efficient attention - use the sparse attention mechanism from https://github.com/lucidrains/vit-pytorch#maxvit
 - [ ] build out latent diffusion architecture in separate file, as it is not faithful to dalle-2 (but offer it as a setting)
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -1,6 +1,7 @@
 import math
 from tqdm import tqdm
 from inspect import isfunction
+from functools import partial

 import torch
 import torch.nn.functional as F
@@ -12,6 +13,7 @@ from einops_exts import rearrange_many, repeat_many, check_shape
 from einops_exts.torch import EinopsToAndFrom

 from kornia.filters.gaussian import GaussianBlur2d
+from kornia.filters import gaussian_blur2d

 from dalle2_pytorch.tokenizer import tokenizer

@@ -811,6 +813,7 @@ class Unet(nn.Module):
        lowres_cond = False, # for cascading diffusion - https://cascaded-diffusion.github.io/
        lowres_cond_upsample_mode = 'bilinear',
        blur_sigma = 0.1,
+        blur_kernel_size = 3,
        attend_at_middle = True, # whether to have a layer of attention at the bottleneck (can turn off for higher resolution in cascading DDPM, before bringing in efficient attention)
    ):
        super().__init__()
@@ -819,7 +822,8 @@ class Unet(nn.Module):

        self.lowres_cond = lowres_cond
        self.lowres_cond_upsample_mode = lowres_cond_upsample_mode
-        self.lowres_cond_blur = GaussianBlur2d((3, 3), (blur_sigma, blur_sigma))
+        self.lowres_blur_kernel_size = blur_kernel_size
+        self.lowres_blur_sigma = blur_sigma

        # determine dimensions

@@ -915,7 +919,9 @@ class Unet(nn.Module):
        image_embed,
        lowres_cond_img = None,
        text_encodings = None,
-        cond_drop_prob = 0.
+        cond_drop_prob = 0.,
+        blur_sigma = None,
+        blur_kernel_size = None
    ):
        batch_size, device = x.shape[0], x.device

@@ -926,7 +932,9 @@ class Unet(nn.Module):
        if exists(lowres_cond_img):
            if self.training:
                # when training, blur the low resolution conditional image
-                lowres_cond_img = self.lowres_cond_blur(lowres_cond_img)
+                blur_sigma = default(blur_sigma, self.lowres_blur_sigma)
+                blur_kernel_size = default(blur_kernel_size, self.lowres_blur_kernel_size)
+                lowres_cond_img = gaussian_blur2d(lowres_cond_img, cast_tuple(blur_kernel_size, 2), cast_tuple(blur_sigma, 2))

            lowres_cond_img = resize_image_to(lowres_cond_img, x.shape[-2:], mode = self.lowres_cond_upsample_mode)
            x = torch.cat((x, lowres_cond_img), dim = 1)
@@ -1014,7 +1022,7 @@ class Decoder(nn.Module):
        self.clip_image_size = clip.image_size
        self.channels = clip.image_channels

-        self.unets = cast_tuple(unet)
+        self.unets = nn.ModuleList(unet)
        image_sizes = default(image_sizes, (clip.image_size,))
        image_sizes = tuple(sorted(set(image_sizes)))

--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.20',
+  version = '0.0.21',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	5a731cc936	make kernel size and sigma for gaussian blur for cascading DDPM overridable at forward. also make sure unets are wrapped in a modulelist so that at sample time, blurring does not happen	2022-04-18 12:00:47 -07:00
Phil Wang	6cddefad26	readme	2022-04-18 11:52:25 -07:00