Compare commits


4 Commits

Author               SHA1        Message                                                                      Date
Phil Wang            6647050c33  fix missing residual for highest resolution of the unet                     2022-06-15 18:01:19 -07:00
Giorgos Zachariadis  b4c3e5b854  changed str in order to avoid confusions and collisions with Python (#147)  2022-06-15 13:41:16 -07:00
Phil Wang            b7f9607258  make memory efficient unet design from imagen toggle-able                   2022-06-15 13:40:26 -07:00
Phil Wang            2219348a6e  adopt similar unet architecture as imagen                                    2022-06-15 12:18:21 -07:00
3 changed files with 28 additions and 13 deletions

View File

@@ -1084,8 +1084,9 @@ class DiffusionPrior(BaseGaussianDiffusion):
 def Upsample(dim):
     return nn.ConvTranspose2d(dim, dim, 4, 2, 1)

-def Downsample(dim):
-    return nn.Conv2d(dim, dim, 4, 2, 1)
+def Downsample(dim, *, dim_out = None):
+    dim_out = default(dim_out, dim)
+    return nn.Conv2d(dim, dim_out, 4, 2, 1)

 class SinusoidalPosEmb(nn.Module):
     def __init__(self, dim):
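For orientation, the reworked Downsample can now change the channel count at the downsampling step, which the memory-efficient path below relies on. A minimal standalone sketch, assuming only torch; the default helper here mirrors the repo's (use the fallback when the value is None):

import torch
from torch import nn

def default(val, d):
    # mirrors the repo's helper: use the fallback when val is None
    return val if val is not None else d

def Downsample(dim, *, dim_out = None):
    dim_out = default(dim_out, dim)
    return nn.Conv2d(dim, dim_out, 4, 2, 1)  # halves the spatial resolution

x = torch.randn(1, 64, 32, 32)
print(Downsample(64)(x).shape)                 # torch.Size([1, 64, 16, 16])
print(Downsample(64, dim_out = 128)(x).shape)  # torch.Size([1, 128, 16, 16])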
@@ -1351,6 +1352,7 @@ class Unet(nn.Module):
         init_cross_embed_kernel_sizes = (3, 7, 15),
         cross_embed_downsample = False,
         cross_embed_downsample_kernel_sizes = (2, 4),
+        memory_efficient = False,
         **kwargs
     ):
         super().__init__()
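The new design is opt-in at construction time. A hedged usage sketch; the surrounding arguments follow the project's README and may differ between versions:

from dalle2_pytorch import Unet

unet = Unet(
    dim = 128,
    image_embed_dim = 512,   # illustrative values, not prescriptive
    cond_dim = 128,
    channels = 3,
    dim_mults = (1, 2, 4, 8),
    memory_efficient = True  # the new toggle: downsample first, as in Imagen
)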
@@ -1370,7 +1372,7 @@ class Unet(nn.Module):
         self.channels_out = default(channels_out, channels)

         init_channels = channels if not lowres_cond else channels * 2 # in cascading diffusion, one concats the low resolution image, blurred, for conditioning the higher resolution synthesis
-        init_dim = default(init_dim, dim // 3 * 2)
+        init_dim = default(init_dim, dim)

         self.init_conv = CrossEmbedLayer(init_channels, dim_out = init_dim, kernel_sizes = init_cross_embed_kernel_sizes, stride = 1)
@@ -1461,10 +1463,11 @@ class Unet(nn.Module):
             layer_cond_dim = cond_dim if not is_first else None

             self.downs.append(nn.ModuleList([
-                ResnetBlock(dim_in, dim_out, time_cond_dim = time_cond_dim, groups = groups),
+                downsample_klass(dim_in, dim_out = dim_out) if memory_efficient else None,
+                ResnetBlock(dim_out if memory_efficient else dim_in, dim_out, time_cond_dim = time_cond_dim, groups = groups),
                 Residual(LinearAttention(dim_out, **attn_kwargs)) if sparse_attn else nn.Identity(),
                 nn.ModuleList([ResnetBlock(dim_out, dim_out, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups) for _ in range(layer_num_resnet_blocks)]),
-                downsample_klass(dim_out) if not is_last else nn.Identity()
+                downsample_klass(dim_out) if not is_last and not memory_efficient else None
             ]))

         mid_dim = dims[-1]
@@ -1473,7 +1476,9 @@ class Unet(nn.Module):
         self.mid_attn = EinopsToAndFrom('b c h w', 'b (h w) c', Residual(Attention(mid_dim, **attn_kwargs))) if attend_at_middle else None
         self.mid_block2 = ResnetBlock(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[-1])

-        for ind, ((dim_in, dim_out), groups, layer_num_resnet_blocks) in enumerate(zip(reversed(in_out[1:]), reversed(resnet_groups), reversed(num_resnet_blocks))):
+        up_in_out_slice = slice(1 if not memory_efficient else None, None)
+
+        for ind, ((dim_in, dim_out), groups, layer_num_resnet_blocks) in enumerate(zip(reversed(in_out[up_in_out_slice]), reversed(resnet_groups), reversed(num_resnet_blocks))):
             is_last = ind >= (num_resolutions - 2)
             layer_cond_dim = cond_dim if not is_last else None
@@ -1484,8 +1489,10 @@ class Unet(nn.Module):
                 Upsample(dim_in)
             ]))

+        final_dim_in = dim * (1 if memory_efficient else 2)
+
         self.final_conv = nn.Sequential(
-            ResnetBlock(dim, dim, groups = resnet_groups[0]),
+            ResnetBlock(final_dim_in, dim, groups = resnet_groups[0]),
             nn.Conv2d(dim, self.channels_out, 1)
         )
@@ -1654,7 +1661,10 @@ class Unet(nn.Module):
         hiddens = []

-        for init_block, sparse_attn, resnet_blocks, downsample in self.downs:
+        for pre_downsample, init_block, sparse_attn, resnet_blocks, post_downsample in self.downs:
+            if exists(pre_downsample):
+                x = pre_downsample(x)
+
             x = init_block(x, c, t)
             x = sparse_attn(x)
@@ -1662,7 +1672,9 @@ class Unet(nn.Module):
                 x = resnet_block(x, c, t)

             hiddens.append(x)
-            x = downsample(x)
+
+            if exists(post_downsample):
+                x = post_downsample(x)

         x = self.mid_block1(x, mid_c, t)
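Put together, each stage of the down path now carries exactly one of pre_downsample or post_downsample, so the resnet blocks run either after or before the resolution drop. A condensed schematic of the control flow in the two hunks above, with conditioning and skip connections omitted:

def exists(val):
    return val is not None  # mirrors the repo's helper

def down_stage(x, pre_downsample, blocks, post_downsample):
    if exists(pre_downsample):   # memory_efficient = True: shrink first (Imagen ordering)
        x = pre_downsample(x)
    for block in blocks:         # blocks then run at the smaller, cheaper resolution
        x = block(x)
    if exists(post_downsample):  # memory_efficient = False: original ordering, shrink last
        x = post_downsample(x)
    return x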
@@ -1672,7 +1684,7 @@ class Unet(nn.Module):
         x = self.mid_block2(x, mid_c, t)

         for init_block, sparse_attn, resnet_blocks, upsample in self.ups:
-            x = torch.cat((x, hiddens.pop()), dim=1)
+            x = torch.cat((x, hiddens.pop()), dim = 1)
             x = init_block(x, c, t)
             x = sparse_attn(x)
@@ -1681,6 +1693,9 @@ class Unet(nn.Module):
             x = upsample(x)

+        if len(hiddens) > 0:
+            x = torch.cat((x, hiddens.pop()), dim = 1)
+
         return self.final_conv(x)

 class LowresConditioner(nn.Module):

View File

@@ -1 +1 @@
-__version__ = '0.7.1'
+__version__ = '0.9.0'

View File

@@ -68,8 +68,8 @@ def group_dict_by_key(cond, d):
         return_val[ind][key] = d[key]
     return (*return_val,)

-def string_begins_with(prefix, str):
-    return str.startswith(prefix)
+def string_begins_with(prefix, string_input):
+    return string_input.startswith(prefix)

 def group_by_key_prefix(prefix, d):
     return group_dict_by_key(partial(string_begins_with, prefix), d)
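The rename is more than cosmetic: a parameter named str shadows the built-in for the whole function body, so any later call to str() inside it would fail. A small demonstration of the hazard, with illustrative function names:

def begins_with_shadowed(prefix, str):
    # 'str' is now the argument; calling str(42) in this body would raise
    # TypeError: 'str' object is not callable
    return str.startswith(prefix)

def begins_with_fixed(prefix, string_input):
    return string_input.startswith(prefix)

print(begins_with_fixed('group', 'group_by_key_prefix'))  # True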