update sample, and set default gradient clipping value for decoder training

samples
last update
2026-02-12 11:34:29 +01:00 · 2022-05-16 17:38:30 -07:00 · 2022-05-16 13:46:35 -07:00 · 2022-05-16 13:38:33 -07:00 · 2022-05-16 13:33:54 -07:00 · 2022-05-16 12:57:31 -07:00
5 changed files with 34 additions and 14 deletions
--- a/README.md
+++ b/README.md
@@ -18,7 +18,11 @@ There was enough interest for a <a href="https://github.com/lucidrains/dalle2-ja

 - A research group has used the code in this repository to train a functional diffusion prior for their CLIP generations. Will share their work once they release their preprint. This, and <a href="https://github.com/crowsonkb">Katherine's</a> own experiments, validate OpenAI's finding that the extra prior increases variety of generations.

- Decoder is now verified working for unconditional generation on my experimental setup for Oxford flowers
+- Decoder is now verified working for unconditional generation on my experimental setup for Oxford flowers. Two researchers have also confirmed that the Decoder is working for them.
+
+<img src="./samples/oxford.png" width="600px" />
+
+*ongoing at 21k steps*

 ## Install

@@ -820,8 +824,8 @@ clip = CLIP(

 # mock data

-text = torch.randint(0, 49408, (32, 256)).cuda()
-images = torch.randn(32, 3, 256, 256).cuda()
+text = torch.randint(0, 49408, (512, 256)).cuda()
+images = torch.randn(512, 3, 256, 256).cuda()

 # prior networks (with transformer)

@@ -854,7 +858,7 @@ diffusion_prior_trainer.update()  # this will update the optimizer as well as th
 # after much of the above three lines in a loop
 # you can sample from the exponential moving average of the diffusion prior identically to how you do so for DiffusionPrior

-image_embeds = diffusion_prior_trainer.sample(text) # (4, 512) - exponential moving averaged image embeddings
+image_embeds = diffusion_prior_trainer.sample(text, max_batch_size = 4) # (512, 512) - exponential moving averaged image embeddings
 ```

 ## Bonus
@@ -867,7 +871,7 @@ ex.

 ```python
 import torch
-from dalle2_pytorch import Unet, Decoder
+from dalle2_pytorch import Unet, Decoder, DecoderTrainer

 # unet for the cascading ddpm

@@ -890,20 +894,24 @@ decoder = Decoder(
    unconditional = True
 ).cuda()

-# mock images (get a lot of this)
+# decoder trainer
+
+decoder_trainer = DecoderTrainer(decoder)
+
+# images (get a lot of these)

 images = torch.randn(1, 3, 512, 512).cuda()

 # feed images into decoder

 for i in (1, 2):
-    loss = decoder(images, unet_number = i)
-    loss.backward()
+    loss = decoder_trainer(images, unet_number = i)
+    decoder_trainer.update(unet_number = i)

-# do the above for many many many many steps
+# do the above for many many many many images
 # then it will learn to generate images

-images = decoder.sample(batch_size = 2) # (2, 3, 512, 512)
+images = decoder_trainer.sample(batch_size = 36, max_batch_size = 4) # (36, 3, 512, 512)
 ```

 ## Dataloaders
--- a/dalle2_pytorch/optimizer.py
+++ b/dalle2_pytorch/optimizer.py
@@ -7,7 +7,7 @@ def separate_weight_decayable_params(params):

 def get_optimizer(
    params,
-    lr = 2e-5,
+    lr = 1e-4,
    wd = 1e-2,
    betas = (0.9, 0.999),
    eps = 1e-8,
--- a/dalle2_pytorch/trainer.py
+++ b/dalle2_pytorch/trainer.py
@@ -235,6 +235,16 @@ class EMA(nn.Module):

 # diffusion prior trainer

+def prior_sample_in_chunks(fn):
+    @wraps(fn)
+    def inner(self, *args, max_batch_size = None, **kwargs):
+        if not exists(max_batch_size):
+            return fn(self, *args, **kwargs)
+
+        outputs = [fn(self, *chunked_args, **chunked_kwargs) for _, (chunked_args, chunked_kwargs) in split_args_and_kwargs(*args, split_size = max_batch_size, **kwargs)]
+        return torch.cat(outputs, dim = 0)
+    return inner
+
 class DiffusionPriorTrainer(nn.Module):
    def __init__(
        self,
@@ -295,11 +305,13 @@ class DiffusionPriorTrainer(nn.Module):

    @torch.no_grad()
    @cast_torch_tensor
+    @prior_sample_in_chunks
    def p_sample_loop(self, *args, **kwargs):
        return self.ema_diffusion_prior.ema_model.p_sample_loop(*args, **kwargs)

    @torch.no_grad()
    @cast_torch_tensor
+    @prior_sample_in_chunks
    def sample(self, *args, **kwargs):
        return self.ema_diffusion_prior.ema_model.sample(*args, **kwargs)

@@ -351,10 +363,10 @@ class DecoderTrainer(nn.Module):
        self,
        decoder,
        use_ema = True,
-        lr = 2e-5,
+        lr = 1e-4,
        wd = 1e-2,
        eps = 1e-8,
-        max_grad_norm = None,
+        max_grad_norm = 0.5,
        amp = False,
        **kwargs
    ):
--- a/samples/oxford.png
+++ b/samples/oxford.png
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.2.46',
+  version = '0.3.2',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	bb86ab2404	update sample, and set default gradient clipping value for decoder training	2022-05-16 17:38:30 -07:00
Phil Wang	ae056dd67c	samples	2022-05-16 13:46:35 -07:00
Phil Wang	033d6b0ce8	last update	2022-05-16 13:38:33 -07:00
Phil Wang	c7ea8748db	default decoder learning rate to what was in the paper	2022-05-16 13:33:54 -07:00
Phil Wang	13382885d9	final update to dalle2 repository for a while - sampling from prior in chunks automatically with max_batch_size keyword given	2022-05-16 12:57:31 -07:00
Phil Wang	c3d4a7ffe4	update working unconditional decoder example	2022-05-16 12:50:07 -07:00