complete contextmanager method for keeping only one unet in GPU during training or inference

give time tokens a surface area of 2 tokens as default, make it so researcher can customize which unet actually is conditioned on image embeddings and/or text encodings
2026-02-13 03:54:35 +01:00 · 2022-04-20 10:46:13 -07:00 · 2022-04-20 10:04:47 -07:00
4 changed files with 52 additions and 18 deletions
--- a/README.md
+++ b/README.md
@@ -410,8 +410,8 @@ Offer training wrappers
 - [x] figure out all the current bag of tricks needed to make DDPMs great (starting with the blur trick mentioned in paper)
 - [x] build the cascading ddpm by having Decoder class manage multiple unets at different resolutions
 - [x] add efficient attention in unet
- [ ] be able to finely customize what to condition on (text, image embed) for specific unet in the cascade (super resolution ddpms near the end may not need too much conditioning)
- [ ] offload unets not being trained on to CPU for memory efficiency (for training each resolution unets separately)
+- [x] be able to finely customize what to condition on (text, image embed) for specific unet in the cascade (super resolution ddpms near the end may not need too much conditioning)
+- [x] offload unets not being trained on to CPU for memory efficiency (for training each resolution unets separately)
 - [ ] build out latent diffusion architecture in separate file, as it is not faithful to dalle-2 (but offer it as a setting)
 - [ ] become an expert with unets, cleanup unet code, make it fully configurable, port all learnings over to https://github.com/lucidrains/x-unet
 - [ ] train on a toy task, offer in colab
--- a/dalle2_pytorch/cli.py
+++ b/dalle2_pytorch/cli.py
@@ -6,4 +6,4 @@ def main():
@click.command()
@click.argument('text')
 def dream(text):
-    return image
+    return 'not ready yet'
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -2,6 +2,7 @@ import math
 from tqdm import tqdm
 from inspect import isfunction
 from functools import partial
+from contextlib import contextmanager

 import torch
 import torch.nn.functional as F
@@ -820,6 +821,7 @@ class Unet(nn.Module):
        image_embed_dim,
        cond_dim = None,
        num_image_tokens = 4,
+        num_time_tokens = 2,
        out_dim = None,
        dim_mults=(1, 2, 4, 8),
        channels = 3,
@@ -830,6 +832,8 @@ class Unet(nn.Module):
        sparse_attn = False,
        sparse_attn_window = 8,  # window size for sparse attention
        attend_at_middle = True, # whether to have a layer of attention at the bottleneck (can turn off for higher resolution in cascading DDPM, before bringing in efficient attention)
+        cond_on_text_encodings = False,
+        cond_on_image_embeds = False,
    ):
        super().__init__()
        # save locals to take care of some hyperparameters for cascading DDPM
@@ -862,8 +866,8 @@ class Unet(nn.Module):
            SinusoidalPosEmb(dim),
            nn.Linear(dim, dim * 4),
            nn.GELU(),
-            nn.Linear(dim * 4, cond_dim),
-            Rearrange('b d -> b 1 d')
+            nn.Linear(dim * 4, cond_dim * num_time_tokens),
+            Rearrange('b (r d) -> b r d', r = num_time_tokens)
        )

        self.image_to_cond = nn.Sequential(
@@ -873,6 +877,12 @@ class Unet(nn.Module):

        self.text_to_cond = nn.LazyLinear(cond_dim)

+        # finer control over whether to condition on image embeddings and text encodings
+        # so one can have the later unets in the cascading DDPMs focus only on super-resolution
+
+        self.cond_on_text_encodings = cond_on_text_encodings
+        self.cond_on_image_embeds = cond_on_image_embeds
+
        # for classifier free guidance

        self.null_image_embed = nn.Parameter(torch.randn(1, num_image_tokens, cond_dim))
@@ -982,17 +992,22 @@ class Unet(nn.Module):
        # mask out image embedding depending on condition dropout
        # for classifier free guidance

-        image_tokens = self.image_to_cond(image_embed)
+        image_tokens = None

-        image_tokens = torch.where(
-            cond_prob_mask,
-            image_tokens,
-            self.null_image_embed
-        )
+        if self.cond_on_image_embeds:
+            image_tokens = self.image_to_cond(image_embed)
+
+            image_tokens = torch.where(
+                cond_prob_mask,
+                image_tokens,
+                self.null_image_embed
+            )

        # take care of text encodings (optional)

-        if exists(text_encodings):
+        text_tokens = None
+
+        if exists(text_encodings) and self.cond_on_text_encodings:
            text_tokens = self.text_to_cond(text_encodings)
            text_tokens = torch.where(
                cond_prob_mask,
@@ -1002,12 +1017,15 @@ class Unet(nn.Module):

        # main conditioning tokens (c)

-        c = torch.cat((time_tokens, image_tokens), dim = -2)
+        c = time_tokens
+
+        if exists(image_tokens):
+            c = torch.cat((c, image_tokens), dim = -2)

        # text and image conditioning tokens (mid_c)
        # to save on compute, only do cross attention based conditioning on the inner most layers of the Unet

-        mid_c = c if not exists(text_encodings) else torch.cat((c, text_tokens), dim = -2)
+        mid_c = c if not exists(text_tokens) else torch.cat((c, text_tokens), dim = -2)

        # go through the layers of the unet, down and up

@@ -1124,6 +1142,20 @@ class Decoder(nn.Module):
        self.register_buffer('posterior_mean_coef1', betas * torch.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))
        self.register_buffer('posterior_mean_coef2', (1. - alphas_cumprod_prev) * torch.sqrt(alphas) / (1. - alphas_cumprod))

+    @contextmanager
+    def one_unet_in_gpu(self, unet_number):
+        assert 0 < unet_number <= len(self.unets)
+        index = unet_number - 1
+        self.cuda()
+        self.unets.cpu()
+
+        unet = self.unets[index]
+        unet.cuda()
+
+        yield
+
+        self.unets.cpu()
+
    def get_text_encodings(self, text):
        # encode with the CLIP text transformer, then drop index 0 along dim 1
        # (presumably a leading special token - confirm against the CLIP wrapper)
        return self.clip.text_transformer(text)[:, 1:]
@@ -1228,9 +1260,11 @@ class Decoder(nn.Module):
        text_encodings = self.get_text_encodings(text) if exists(text) else None

        img = None
-        for unet, image_size in tqdm(zip(self.unets, self.image_sizes)):
-            shape = (batch_size, channels, image_size, image_size)
-            img = self.p_sample_loop(unet, shape, image_embed = image_embed, text_encodings = text_encodings, cond_scale = cond_scale, lowres_cond_img = img)
+
+        for ind, (unet, image_size) in tqdm(enumerate(zip(self.unets, self.image_sizes))):
+            with self.one_unet_in_gpu(ind + 1):
+                shape = (batch_size, channels, image_size, image_size)
+                img = self.p_sample_loop(unet, shape, image_embed = image_embed, text_encodings = text_encodings, cond_scale = cond_scale, lowres_cond_img = img)

        return img

--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.26',
+  version = '0.0.28',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	27a33e1b20	complete contextmanager method for keeping only one unet in GPU during training or inference	2022-04-20 10:46:13 -07:00
Phil Wang	6f941a219a	give time tokens a surface area of 2 tokens as default, make it so researcher can customize which unet actually is conditioned on image embeddings and/or text encodings	2022-04-20 10:04:47 -07:00