just use an assert to make sure the clip image channels are never different from the channels of the diffusion prior and decoder, if clip is given

2026-02-12 11:34:29 +01:00 · 2022-05-22 22:34:33 -07:00
5 changed files with 21 additions and 67 deletions
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ This model is SOTA for text-to-image for now.

 Please join <a href="https://discord.gg/xBPBXfcFHd"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a> if you are interested in helping out with the replication with the <a href="https://laion.ai/">LAION</a> community | <a href="https://www.youtube.com/watch?v=AIOE1l1W0Tw">Yannic Interview</a>

-As of 5/23/22, it is no longer SOTA. SOTA will be <a href="https://github.com/lucidrains/imagen-pytorch">here</a>. Jax versions as well as text-to-video project will be shifted towards the Imagen architecture, as it is way simpler.
+There was enough interest for a <a href="https://github.com/lucidrains/dalle2-jax">Jax version</a>. I will also eventually extend this to <a href="https://github.com/lucidrains/dalle2-video">text to video</a>, once the repository is in a good place.

 ## Status

@@ -26,7 +26,7 @@ As of 5/23/22, it is no longer SOTA. SOTA will be <a href="https://github.com/lu

 ## Pre-Trained Models
 - LAION is training prior models. Checkpoints are available on <a href="https://huggingface.co/zenglishuci/conditioned-prior">🤗huggingface</a> and the training statistics are available on <a href="https://wandb.ai/nousr_laion/conditioned-prior/reports/LAION-DALLE2-PyTorch-Prior--VmlldzoyMDI2OTIx">🐝WANDB</a>.
- Decoder - <a href="https://wandb.ai/veldrovive/dalle2_train_decoder/runs/jkrtg0so?workspace=user-veldrovive">In-progress test run</a> 🚧
+- Decoder 🚧
 - DALL-E 2 🚧

 ## Install
@@ -1195,12 +1195,4 @@ This library would not have gotten to this working state without the help of
 }
 ```

-```bibtex
-@misc{Saharia2022,
-    title   = {Imagen: unprecedented photorealism × deep level of language understanding},
-    author  = {Chitwan Saharia*, William Chan*, Saurabh Saxena†, Lala Li†, Jay Whang†, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho†, David Fleet†, Mohammad Norouzi*},
-    year    = {2022}
-}
-```
-
 *Creating noise from data is easy; creating data from noise is generative modeling.* - <a href="https://arxiv.org/abs/2011.13456">Yang Song's paper</a>
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -890,7 +890,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
        )

        if exists(clip):
-            assert image_channels == clip.image_channels, f'channels of image ({image_channels}) should be equal to the channels that CLIP accepts ({clip.image_channels})'
+            assert image_channels == clip.image_channels, f'channels of image ({channels}) should be equal to the channels that CLIP accepts ({clip.image_channels})'

            if isinstance(clip, CLIP):
                clip = XClipAdapter(clip, **clip_adapter_overrides)
@@ -1107,20 +1107,13 @@ class Block(nn.Module):
        groups = 8
    ):
        super().__init__()
-        self.project = nn.Conv2d(dim, dim_out, 3, padding = 1)
-        self.norm = nn.GroupNorm(groups, dim_out)
-        self.act = nn.SiLU()
-
-    def forward(self, x, scale_shift = None):
-        x = self.project(x)
-        x = self.norm(x)
-
-        if exists(scale_shift):
-            scale, shift = scale_shift
-            x = x * (scale + 1) + shift
-
-        x = self.act(x)
-        return x
+        self.block = nn.Sequential(
+            nn.Conv2d(dim, dim_out, 3, padding = 1),
+            nn.GroupNorm(groups, dim_out),
+            nn.SiLU()
+        )
+    def forward(self, x):
+        return self.block(x)

 class ResnetBlock(nn.Module):
    def __init__(
@@ -1139,7 +1132,7 @@ class ResnetBlock(nn.Module):
        if exists(time_cond_dim):
            self.time_mlp = nn.Sequential(
                nn.SiLU(),
-                nn.Linear(time_cond_dim, dim_out * 2)
+                nn.Linear(time_cond_dim, dim_out)
            )

        self.cross_attn = None
@@ -1159,14 +1152,11 @@ class ResnetBlock(nn.Module):
        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()

    def forward(self, x, cond = None, time_emb = None):
+        h = self.block1(x)

-        scale_shift = None
        if exists(self.time_mlp) and exists(time_emb):
            time_emb = self.time_mlp(time_emb)
-            time_emb = rearrange(time_emb, 'b c -> b c 1 1')
-            scale_shift = time_emb.chunk(2, dim = 1)
-
-        h = self.block1(x, scale_shift = scale_shift)
+            h = rearrange(time_emb, 'b c -> b c 1 1') + h

        if exists(self.cross_attn):
            assert exists(cond)
@@ -1714,8 +1704,6 @@ class Decoder(BaseGaussianDiffusion):
        vb_loss_weight = 0.001,
        unconditional = False,
        auto_normalize_img = True,                  # whether to take care of normalizing the image from [0, 1] to [-1, 1] and back automatically - you can turn this off if you want to pass in the [-1, 1] ranged image yourself from the dataloader
-        use_dynamic_thres = False,                  # from the Imagen paper
-        dynamic_thres_percentile = 0.9
    ):
        super().__init__(
            beta_schedule = beta_schedule,
@@ -1838,11 +1826,6 @@ class Decoder(BaseGaussianDiffusion):
        self.clip_denoised = clip_denoised
        self.clip_x_start = clip_x_start

-        # dynamic thresholding settings, if clipping denoised during sampling
-
-        self.use_dynamic_thres = use_dynamic_thres
-        self.dynamic_thres_percentile = dynamic_thres_percentile
-
        # normalize and unnormalize image functions

        self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity
@@ -1885,21 +1868,7 @@ class Decoder(BaseGaussianDiffusion):
            x_recon = self.predict_start_from_noise(x, t = t, noise = pred)

        if clip_denoised:
-            # s is the threshold amount
-            # static thresholding would just be s = 1
-            s = 1.
-            if self.use_dynamic_thres:
-                s = torch.quantile(
-                    rearrange(x_recon, 'b ... -> b (...)').abs(),
-                    self.dynamic_thres_percentile,
-                    dim = -1
-                )
-
-                s.clamp_(min = 1.)
-                s = s.view(-1, *((1,) * (x_recon.ndim - 1)))
-
-            # clip by threshold, depending on whether static or dynamic
-            x_recon = x_recon.clamp(-s, s) / s
+            x_recon.clamp_(-1., 1.)

        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)

--- a/dalle2_pytorch/optimizer.py
+++ b/dalle2_pytorch/optimizer.py
@@ -12,7 +12,6 @@ def get_optimizer(
    betas = (0.9, 0.999),
    eps = 1e-8,
    filter_by_requires_grad = False,
-    group_wd_params = True,
    **kwargs
 ):
    if filter_by_requires_grad:
@@ -22,13 +21,11 @@ def get_optimizer(
        return Adam(params, lr = lr, betas = betas, eps = eps)

    params = set(params)
+    wd_params, no_wd_params = separate_weight_decayable_params(params)

-    if group_wd_params:
-        wd_params, no_wd_params = separate_weight_decayable_params(params)
+    param_groups = [
+        {'params': list(wd_params)},
+        {'params': list(no_wd_params), 'weight_decay': 0},
+    ]

-        params = [
-            {'params': list(wd_params)},
-            {'params': list(no_wd_params), 'weight_decay': 0},
-        ]
-
-    return AdamW(params, lr = lr, weight_decay = wd, betas = betas, eps = eps)
+    return AdamW(param_groups, lr = lr, weight_decay = wd, betas = betas, eps = eps)
--- a/dalle2_pytorch/trainer.py
+++ b/dalle2_pytorch/trainer.py
@@ -254,7 +254,6 @@ class DiffusionPriorTrainer(nn.Module):
        eps = 1e-6,
        max_grad_norm = None,
        amp = False,
-        group_wd_params = True,
        **kwargs
    ):
        super().__init__()
@@ -280,7 +279,6 @@ class DiffusionPriorTrainer(nn.Module):
            lr = lr,
            wd = wd,
            eps = eps,
-            group_wd_params = group_wd_params,
            **kwargs
        )

@@ -412,7 +410,6 @@ class DecoderTrainer(nn.Module):
        eps = 1e-8,
        max_grad_norm = 0.5,
        amp = False,
-        group_wd_params = True,
        **kwargs
    ):
        super().__init__()
@@ -438,7 +435,6 @@ class DecoderTrainer(nn.Module):
                lr = unet_lr,
                wd = unet_wd,
                eps = unet_eps,
-                group_wd_params = group_wd_params,
                **kwargs
            )

--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.5.2',
+  version = '0.4.12',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',