allows one to shortcut sampling at a specific unet number, if one were to be training in stages

DecoderTrainer sample method uses the exponentially moving averaged unets
allow for division of loss prior to scaling, for gradient accumulation purposes
2026-02-12 11:34:29 +01:00 · 2022-04-30 15:10:25 -07:00 · 2022-04-30 14:55:34 -07:00 · 2022-04-30 12:56:47 -07:00 · 2022-04-30 12:27:24 -07:00 · 2022-04-30 11:56:05 -07:00
4 changed files with 94 additions and 10 deletions
--- a/README.md
+++ b/README.md
@@ -760,7 +760,7 @@ decoder = Decoder(
    unet = (unet1, unet2),
    image_sizes = (128, 256),
    clip = clip,
-    timesteps = 1,
+    timesteps = 1000,
    condition_on_text_encodings = True
 ).cuda()

@@ -778,6 +778,12 @@ for unet_number in (1, 2):
    loss.backward()

    decoder_trainer.update(unet_number) # update the specific unet as well as its exponential moving average
+
+# after much training
+# you can sample from the exponentially moving averaged unets as so
+
+mock_image_embed = torch.randn(4, 512).cuda()
+images = decoder_trainer.sample(mock_image_embed, text = text) # (4, 3, 256, 256)
 ```

 ## CLI (wip)
@@ -811,7 +817,7 @@ Once built, images will be saved to the same directory the command is invoked
 - [x] use inheritance just this once for sharing logic between decoder and prior network ddpms
 - [x] bring in vit-vqgan https://arxiv.org/abs/2110.04627 for the latent diffusion
 - [x] abstract interface for CLIP adapter class, so other CLIPs can be brought in
- [ ] take care of mixed precision as well as gradient accumulation within decoder trainer
+- [x] take care of mixed precision as well as gradient accumulation within decoder trainer
 - [ ] become an expert with unets, cleanup unet code, make it fully configurable, port all learnings over to https://github.com/lucidrains/x-unet
 - [ ] copy the cascading ddpm code to a separate repo (perhaps https://github.com/lucidrains/denoising-diffusion-pytorch) as the main contribution of dalle2 really is just the prior network
 - [ ] transcribe code to Jax, which lowers the activation energy for distributed training, given access to TPUs
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -1540,7 +1540,13 @@ class Decoder(BaseGaussianDiffusion):

    @torch.no_grad()
    @eval_decorator
-    def sample(self, image_embed, text = None, cond_scale = 1.):
+    def sample(
+        self,
+        image_embed,
+        text = None,
+        cond_scale = 1.,
+        stop_at_unet_number = None
+    ):
        batch_size = image_embed.shape[0]

        text_encodings = text_mask = None
@@ -1552,7 +1558,7 @@ class Decoder(BaseGaussianDiffusion):

        img = None

-        for unet, vae, channel, image_size, predict_x_start in tqdm(zip(self.unets, self.vaes, self.sample_channels, self.image_sizes, self.predict_x_start)):
+        for unet_number, unet, vae, channel, image_size, predict_x_start in tqdm(zip(range(1, len(self.unets) + 1), self.unets, self.vaes, self.sample_channels, self.image_sizes, self.predict_x_start)):

            context = self.one_unet_in_gpu(unet = unet) if image_embed.is_cuda else null_context()

@@ -1584,6 +1590,9 @@ class Decoder(BaseGaussianDiffusion):

                img = vae.decode(img)

+            if exists(stop_at_unet_number) and stop_at_unet_number == unet_number:
+                break
+
        return img

    def forward(
--- a/dalle2_pytorch/train.py
+++ b/dalle2_pytorch/train.py
@@ -3,12 +3,19 @@ from functools import partial

 import torch
 from torch import nn
+from torch.cuda.amp import autocast, GradScaler

 from dalle2_pytorch.dalle2_pytorch import Decoder
 from dalle2_pytorch.optimizer import get_optimizer

 # helper functions

+def exists(val):
+    return val is not None
+
+def cast_tuple(val, length = 1):
+    return val if isinstance(val, tuple) else ((val,) * length)
+
 def pick_and_pop(keys, d):
    values = list(map(lambda key: d.pop(key), keys))
    return dict(zip(keys, values))
@@ -89,6 +96,10 @@ class DecoderTrainer(nn.Module):
        self,
        decoder,
        use_ema = True,
+        lr = 3e-4,
+        wd = 1e-2,
+        max_grad_norm = None,
+        amp = False,
        **kwargs
    ):
        super().__init__()
@@ -106,24 +117,82 @@ class DecoderTrainer(nn.Module):

        self.ema_unets = nn.ModuleList([])

-        for ind, unet in enumerate(self.decoder.unets):
-            optimizer = get_optimizer(unet.parameters(), **kwargs)
+        self.amp = amp
+
+        # be able to finely customize learning rate, weight decay
+        # per unet
+
+        lr, wd = map(partial(cast_tuple, length = self.num_unets), (lr, wd))
+
+        for ind, (unet, unet_lr, unet_wd) in enumerate(zip(self.decoder.unets, lr, wd)):
+            optimizer = get_optimizer(
+                unet.parameters(),
+                lr = unet_lr,
+                wd = unet_wd,
+                **kwargs
+            )
+
            setattr(self, f'optim{ind}', optimizer) # cannot use pytorch ModuleList for some reason with optimizers

            if self.use_ema:
                self.ema_unets.append(EMA(unet, **ema_kwargs))

+            scaler = GradScaler(enabled = amp)
+            setattr(self, f'scaler{ind}', scaler)
+
+        # gradient clipping if needed
+
+        self.max_grad_norm = max_grad_norm
+
+    @property
+    def unets(self):
+        return nn.ModuleList([ema.ema_model for ema in self.ema_unets])
+
+    def scale(self, loss, *, unet_number):
+        assert 1 <= unet_number <= self.num_unets
+        index = unet_number - 1
+        scaler = getattr(self, f'scaler{index}')
+        return scaler.scale(loss)
+
    def update(self, unet_number):
        assert 1 <= unet_number <= self.num_unets
        index = unet_number - 1
+        unet = self.decoder.unets[index]
+
+        if exists(self.max_grad_norm):
+            nn.utils.clip_grad_norm_(unet.parameters(), self.max_grad_norm)

        optimizer = getattr(self, f'optim{index}')
-        optimizer.step()
+        scaler = getattr(self, f'scaler{index}')
+
+        scaler.step(optimizer)
+        scaler.update()
        optimizer.zero_grad()

        if self.use_ema:
            ema_unet = self.ema_unets[index]
            ema_unet.update()

-    def forward(self, x, *, unet_number, **kwargs):
-        return self.decoder(x, unet_number = unet_number, **kwargs)
+    @torch.no_grad()
+    def sample(self, *args, **kwargs):
+        if self.use_ema:
+            trainable_unets = self.decoder.unets
+            self.decoder.unets = self.unets                  # swap in exponential moving averaged unets for sampling
+
+        output = self.decoder.sample(*args, **kwargs)
+
+        if self.use_ema:
+            self.decoder.unets = trainable_unets             # restore original training unets
+        return output
+
+    def forward(
+        self,
+        x,
+        *,
+        unet_number,
+        divisor = 1,
+        **kwargs
+    ):
+        with autocast(enabled = self.amp):
+            loss = self.decoder(x, unet_number = unet_number, **kwargs)
+        return self.scale(loss / divisor, unet_number = unet_number)
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.77',
+  version = '0.0.82',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	8260fc933a	allows one to shortcut sampling at a specific unet number, if one were to be training in stages	2022-04-30 15:10:25 -07:00
Phil Wang	ebe01749ed	DecoderTrainer sample method uses the exponentially moving averaged unets	2022-04-30 14:55:34 -07:00
Phil Wang	63195cc2cb	allow for division of loss prior to scaling, for gradient accumulation purposes	2022-04-30 12:56:47 -07:00
Phil Wang	a2ef69af66	take care of mixed precision, and make gradient accumulation do-able externally	2022-04-30 12:27:24 -07:00
Phil Wang	5fff22834e	be able to finely customize learning parameters for each unet, take care of gradient clipping	2022-04-30 11:56:05 -07:00