Compare commits


8 Commits

5 changed files with 68 additions and 19 deletions


@@ -14,6 +14,16 @@ Please join <a href="https://discord.gg/xBPBXfcFHd"><img alt="Join us on Discord
 There was enough interest for a <a href="https://github.com/lucidrains/dalle2-jax">Jax version</a>. I will also eventually extend this to <a href="https://github.com/lucidrains/dalle2-video">text to video</a>, once the repository is in a good place.
 
+## Status
+
+- A research group has used the code in this repository to train a functional diffusion prior for their CLIP generations. They will share their work once they release their preprint. This, and <a href="https://github.com/crowsonkb">Katherine's</a> own experiments, validate OpenAI's finding that the extra prior increases the variety of generations.
+
+- The decoder is now verified working for unconditional generation on my experimental setup with Oxford flowers. Two researchers have also confirmed that the decoder works for them.
+
+<img src="./samples/oxford.png" width="600px" />
+
+*ongoing at 21k steps*
+
 ## Install
 
 ```bash
@@ -814,8 +824,8 @@ clip = CLIP(
 # mock data
 
-text = torch.randint(0, 49408, (32, 256)).cuda()
-images = torch.randn(32, 3, 256, 256).cuda()
+text = torch.randint(0, 49408, (512, 256)).cuda()
+images = torch.randn(512, 3, 256, 256).cuda()
 
 # prior networks (with transformer)
@@ -848,7 +858,7 @@ diffusion_prior_trainer.update() # this will update the optimizer as well as th
 # after much of the above three lines in a loop
 # you can sample from the exponential moving average of the diffusion prior identically to how you do so for DiffusionPrior
 
-image_embeds = diffusion_prior_trainer.sample(text) # (4, 512) - exponential moving averaged image embeddings
+image_embeds = diffusion_prior_trainer.sample(text, max_batch_size = 4) # (512, 512) - exponential moving averaged image embeddings
 ```
 
 ## Bonus
@@ -861,7 +871,7 @@ ex.
 ```python
 import torch
-from dalle2_pytorch import Unet, Decoder
+from dalle2_pytorch import Unet, Decoder, DecoderTrainer
 
 # unet for the cascading ddpm
@@ -884,20 +894,24 @@ decoder = Decoder(
     unconditional = True
 ).cuda()
 
-# mock images (get a lot of this)
+# decoder trainer
+
+decoder_trainer = DecoderTrainer(decoder)
+
+# images (get a lot of this)
 
 images = torch.randn(1, 3, 512, 512).cuda()
 
 # feed images into decoder
 
 for i in (1, 2):
-    loss = decoder(images, unet_number = i)
-    loss.backward()
+    loss = decoder_trainer(images, unet_number = i)
+    decoder_trainer.update(unet_number = i)
 
-# do the above for many many many many steps
+# do the above for many many many many images
 # then it will learn to generate images
 
-images = decoder.sample(batch_size = 2) # (2, 3, 512, 512)
+images = decoder_trainer.sample(batch_size = 36, max_batch_size = 4) # (36, 3, 512, 512)
 ```
 
 ## Dataloaders


@@ -7,7 +7,7 @@ def separate_weight_decayable_params(params):
 def get_optimizer(
     params,
-    lr = 2e-5,
+    lr = 1e-4,
     wd = 1e-2,
     betas = (0.9, 0.999),
     eps = 1e-8,
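For context, the new default learning rate pairs with a standard AdamW setup. A minimal sketch, assuming the hyperparameters are forwarded as-is (the hunk header suggests the real function also separates out weight-decayable params via `separate_weight_decayable_params`):

```python
from torch.optim import AdamW

# minimal sketch with the new defaults; assumes the hyperparameters are simply
# forwarded to AdamW (the actual get_optimizer also splits params into
# weight-decayable and non-decayable groups)
def get_optimizer(params, lr = 1e-4, wd = 1e-2, betas = (0.9, 0.999), eps = 1e-8):
    return AdamW(params, lr = lr, weight_decay = wd, betas = betas, eps = eps)
```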


@@ -47,6 +47,14 @@ def groupby_prefix_and_trim(prefix, d):
     kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
     return kwargs_without_prefix, kwargs
 
+def num_to_groups(num, divisor):
+    groups = num // divisor
+    remainder = num % divisor
+    arr = [divisor] * groups
+    if remainder > 0:
+        arr.append(remainder)
+    return arr
+
 # decorators
 
 def cast_torch_tensor(fn):
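For reference, the new `num_to_groups` helper splits a count into full groups of `divisor` plus an optional smaller remainder group:

```python
# num_to_groups exactly as added above
def num_to_groups(num, divisor):
    groups = num // divisor
    remainder = num % divisor
    arr = [divisor] * groups
    if remainder > 0:
        arr.append(remainder)
    return arr

print(num_to_groups(36, 4))  # [4, 4, 4, 4, 4, 4, 4, 4, 4]
print(num_to_groups(10, 4))  # [4, 4, 2] - the remainder becomes a final smaller group
```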
@@ -227,6 +235,16 @@ class EMA(nn.Module):
 
 # diffusion prior trainer
 
+def prior_sample_in_chunks(fn):
+    @wraps(fn)
+    def inner(self, *args, max_batch_size = None, **kwargs):
+        if not exists(max_batch_size):
+            return fn(self, *args, **kwargs)
+
+        outputs = [fn(self, *chunked_args, **chunked_kwargs) for _, (chunked_args, chunked_kwargs) in split_args_and_kwargs(*args, split_size = max_batch_size, **kwargs)]
+        return torch.cat(outputs, dim = 0)
+    return inner
+
 class DiffusionPriorTrainer(nn.Module):
     def __init__(
         self,
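The effect of the new decorator, sketched standalone: when `max_batch_size` is given, the input batch is split into chunks of at most that size, the wrapped sampler runs once per chunk, and the chunk outputs are concatenated along the batch dimension. A minimal illustration with a hypothetical `sample_in_chunks` helper and a stand-in sampler (the library version splits args and kwargs generically via `split_args_and_kwargs`):

```python
import torch

# hypothetical standalone helper mirroring prior_sample_in_chunks
def sample_in_chunks(sample_fn, text, max_batch_size = None):
    if max_batch_size is None:
        return sample_fn(text)
    # split the batch dimension into chunks of at most max_batch_size,
    # sample each chunk, then concatenate the results
    chunks = text.split(max_batch_size, dim = 0)
    return torch.cat([sample_fn(chunk) for chunk in chunks], dim = 0)

fake_sample = lambda t: torch.randn(t.shape[0], 512)  # stand-in for the EMA prior's sample
text = torch.randint(0, 49408, (512, 256))
image_embeds = sample_in_chunks(fake_sample, text, max_batch_size = 4)
print(image_embeds.shape)  # torch.Size([512, 512])
```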
@@ -287,11 +305,13 @@ class DiffusionPriorTrainer(nn.Module):
 
     @torch.no_grad()
     @cast_torch_tensor
+    @prior_sample_in_chunks
     def p_sample_loop(self, *args, **kwargs):
         return self.ema_diffusion_prior.ema_model.p_sample_loop(*args, **kwargs)
 
     @torch.no_grad()
     @cast_torch_tensor
+    @prior_sample_in_chunks
     def sample(self, *args, **kwargs):
         return self.ema_diffusion_prior.ema_model.sample(*args, **kwargs)
@@ -322,15 +342,31 @@ class DiffusionPriorTrainer(nn.Module):
 
 # decoder trainer
 
+def decoder_sample_in_chunks(fn):
+    @wraps(fn)
+    def inner(self, *args, max_batch_size = None, **kwargs):
+        if not exists(max_batch_size):
+            return fn(self, *args, **kwargs)
+
+        if self.decoder.unconditional:
+            batch_size = kwargs.get('batch_size')
+            batch_sizes = num_to_groups(batch_size, max_batch_size)
+            outputs = [fn(self, *args, **{**kwargs, 'batch_size': sub_batch_size}) for sub_batch_size in batch_sizes]
+        else:
+            outputs = [fn(self, *chunked_args, **chunked_kwargs) for _, (chunked_args, chunked_kwargs) in split_args_and_kwargs(*args, split_size = max_batch_size, **kwargs)]
+
+        return torch.cat(outputs, dim = 0)
+    return inner
+
 class DecoderTrainer(nn.Module):
     def __init__(
         self,
         decoder,
         use_ema = True,
-        lr = 2e-5,
+        lr = 1e-4,
         wd = 1e-2,
         eps = 1e-8,
-        max_grad_norm = None,
+        max_grad_norm = 0.5,
         amp = False,
         **kwargs
     ):
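The unconditional branch above has no input tensor to split, so it partitions the `batch_size` keyword itself via `num_to_groups`. An illustrative sketch (`fake_sample` is a hypothetical stand-in for `decoder.sample`):

```python
import torch

def num_to_groups(num, divisor):
    groups, remainder = divmod(num, divisor)
    return [divisor] * groups + ([remainder] if remainder > 0 else [])

# fake_sample stands in for decoder.sample in the unconditional case:
# it takes only a batch_size, so the keyword itself is what gets partitioned
fake_sample = lambda batch_size: torch.randn(batch_size, 3, 64, 64)

sub_batches = num_to_groups(36, 4)  # [4, 4, 4, 4, 4, 4, 4, 4, 4]
images = torch.cat([fake_sample(b) for b in sub_batches], dim = 0)
print(images.shape)  # torch.Size([36, 3, 64, 64])
```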
@@ -411,18 +447,17 @@ class DecoderTrainer(nn.Module):
 
     @torch.no_grad()
     @cast_torch_tensor
+    @decoder_sample_in_chunks
     def sample(self, *args, **kwargs):
-        if kwargs.pop('use_non_ema', False):
+        if kwargs.pop('use_non_ema', False) or not self.use_ema:
             return self.decoder.sample(*args, **kwargs)
 
-        if self.use_ema:
-            trainable_unets = self.decoder.unets
-            self.decoder.unets = self.unets # swap in exponential moving averaged unets for sampling
+        trainable_unets = self.decoder.unets
+        self.decoder.unets = self.unets # swap in exponential moving averaged unets for sampling
 
         output = self.decoder.sample(*args, **kwargs)
 
-        if self.use_ema:
-            self.decoder.unets = trainable_unets # restore original training unets
+        self.decoder.unets = trainable_unets # restore original training unets
 
         # cast the ema_model unets back to original device
         for ema in self.ema_unets:
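One note on the simplified method: since it now returns early when EMA is disabled, the swap/restore around sampling runs unconditionally. A hedged sketch of the same pattern written with try/finally, so the training unets are restored even if sampling raises (hypothetical names, not the library's code):

```python
# hedged sketch of the swap-in / swap-out pattern above; `decoder` and
# `ema_unets` are stand-ins, not the library's actual objects
def sample_with_ema(decoder, ema_unets, *args, **kwargs):
    trainable_unets = decoder.unets
    decoder.unets = ema_unets            # swap in EMA unets for sampling
    try:
        return decoder.sample(*args, **kwargs)
    finally:
        decoder.unets = trainable_unets  # always restore the training unets
```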

samples/oxford.png (new binary file, 985 KiB)


@@ -10,7 +10,7 @@ setup(
             'dream = dalle2_pytorch.cli:dream'
         ],
     },
-    version = '0.2.44',
+    version = '0.3.2',
     license='MIT',
     description = 'DALL-E 2',
     author = 'Phil Wang',