quick fix for @marunine

update sample, and set default gradient clipping value for decoder training
samples
2026-02-12 11:34:29 +01:00 · 2022-05-18 20:22:52 -07:00 · 2022-05-16 17:38:30 -07:00 · 2022-05-16 13:46:35 -07:00 · 2022-05-16 13:38:33 -07:00
5 changed files with 17 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -18,7 +18,11 @@ There was enough interest for a <a href="https://github.com/lucidrains/dalle2-ja

 - A research group has used the code in this repository to train a functional diffusion prior for their CLIP generations. Will share their work once they release their preprint. This, and <a href="https://github.com/crowsonkb">Katherine's</a> own experiments, validate OpenAI's finding that the extra prior increases variety of generations.

- Decoder is now verified working for unconditional generation on my experimental setup for Oxford flowers
+- Decoder is now verified working for unconditional generation on my experimental setup for Oxford flowers. Two researchers have also confirmed that the Decoder is working for them.
+
+<img src="./samples/oxford.png" width="600px" />
+
+*ongoing at 21k steps*

 ## Install

--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -1697,7 +1697,8 @@ class Decoder(BaseGaussianDiffusion):
        clip_adapter_overrides = dict(),
        learned_variance = True,
        vb_loss_weight = 0.001,
-        unconditional = False
+        unconditional = False,
+        auto_normalize_img = True,                  # whether to take care of normalizing the image from [0, 1] to [-1, 1] and back automatically - you can turn this off if you want to pass in the [-1, 1] ranged image yourself from the dataloader
    ):
        super().__init__(
            beta_schedule = beta_schedule,
@@ -1806,6 +1807,10 @@ class Decoder(BaseGaussianDiffusion):
        self.clip_denoised = clip_denoised
        self.clip_x_start = clip_x_start

+        # normalize and unnormalize image functions
+        self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity
+        self.unnormalize_img = unnormalize_zero_to_one if auto_normalize_img else identity
+
    def get_unet(self, unet_number):
        assert 0 < unet_number <= len(self.unets)
        index = unet_number - 1
@@ -1877,7 +1882,7 @@ class Decoder(BaseGaussianDiffusion):
        img = torch.randn(shape, device = device)

        if not is_latent_diffusion:
-            lowres_cond_img = maybe(normalize_neg_one_to_one)(lowres_cond_img)
+            lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img)

        for i in tqdm(reversed(range(0, self.num_timesteps)), desc = 'sampling loop time step', total = self.num_timesteps):
            img = self.p_sample(
@@ -1894,7 +1899,7 @@ class Decoder(BaseGaussianDiffusion):
                clip_denoised = clip_denoised
            )

-        unnormalize_img = unnormalize_zero_to_one(img)
+        unnormalize_img = self.unnormalize_img(img)
        return unnormalize_img

    def p_losses(self, unet, x_start, times, *, image_embed, lowres_cond_img = None, text_encodings = None, text_mask = None, predict_x_start = False, noise = None, learned_variance = False, clip_denoised = False, is_latent_diffusion = False):
@@ -1903,8 +1908,8 @@ class Decoder(BaseGaussianDiffusion):
        # normalize to [-1, 1]

        if not is_latent_diffusion:
-            x_start = normalize_neg_one_to_one(x_start)
-            lowres_cond_img = maybe(normalize_neg_one_to_one)(lowres_cond_img)
+            x_start = self.normalize_img(x_start)
+            lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img)

        # get x_t

--- a/dalle2_pytorch/trainer.py
+++ b/dalle2_pytorch/trainer.py
@@ -366,7 +366,7 @@ class DecoderTrainer(nn.Module):
        lr = 1e-4,
        wd = 1e-2,
        eps = 1e-8,
-        max_grad_norm = None,
+        max_grad_norm = 0.5,
        amp = False,
        **kwargs
    ):
--- a/samples/oxford.png
+++ b/samples/oxford.png
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.3.1',
+  version = '0.3.3',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	db0642c4cd	quick fix for @marunine	2022-05-18 20:22:52 -07:00
Phil Wang	bb86ab2404	update sample, and set default gradient clipping value for decoder training	2022-05-16 17:38:30 -07:00
Phil Wang	ae056dd67c	samples	2022-05-16 13:46:35 -07:00
Phil Wang	033d6b0ce8	last update	2022-05-16 13:38:33 -07:00