Compare commits


1 commit
1.8.4 ... 1.7.0

4 changed files with 18 additions and 61 deletions


@@ -49,7 +49,6 @@ This library would not have gotten to this working state without the help of
 - <a href="https://github.com/crowsonkb">Katherine</a> for her advice
 - <a href="https://stability.ai/">Stability AI</a> for the generous sponsorship
 - <a href="https://huggingface.co">🤗 Huggingface</a> and in particular <a href="https://github.com/sgugger">Sylvain</a> for the <a href="https://github.com/huggingface/accelerate">Accelerate</a> library
-- <a href="https://github.com/arogozhnikov">Alex</a> for <a href="https://github.com/arogozhnikov/einops">einops</a>, indispensable tool for tensor manipulation
 ... and many others. Thank you! 🙏
@@ -1275,14 +1274,4 @@ For detailed information on training the diffusion prior, please refer to the [d
 }
 ```
-```bibtex
-@inproceedings{rogozhnikov2022einops,
-    title = {Einops: Clear and Reliable Tensor Manipulations with Einstein-like Notation},
-    author = {Alex Rogozhnikov},
-    booktitle = {International Conference on Learning Representations},
-    year = {2022},
-    url = {https://openreview.net/forum?id=oapKSVM2bcj}
-}
-```
 *Creating noise from data is easy; creating data from noise is generative modeling.* - <a href="https://arxiv.org/abs/2011.13456">Yang Song's paper</a>


@@ -250,15 +250,9 @@ class XClipAdapter(BaseClipAdapter):
         text = text[..., :self.max_text_len]
         text_mask = text != 0
         encoder_output = self.clip.text_transformer(text)
-        encoder_output_is_cls = encoder_output.ndim == 3
-        text_cls, text_encodings = (encoder_output[:, 0], encoder_output[:, 1:]) if encoder_output_is_cls else (encoder_output, None)
+        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
         text_embed = self.clip.to_text_latent(text_cls)
-        if exists(text_encodings):
-            text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
+        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
         return EmbeddedText(l2norm(text_embed), text_encodings)
     @torch.no_grad()
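
Note on the hunk above: 1.8.4 appears to handle CLIP text transformers that return either a full token sequence (`ndim == 3`, with the CLS token at position 0) or only a pooled embedding, while 1.7.0 assumes the sequence form; in both versions the per-token encodings are zeroed at padded positions. A toy, self-contained illustration of that masking step, with made-up token ids and shapes:

```python
import torch

# hypothetical batch of 2 captions, 5 token positions, 8-dim encodings (0 is the pad id)
text = torch.tensor([[49406, 320, 1125, 49407, 0],
                     [49406, 1125, 49407, 0, 0]])
text_encodings = torch.randn(2, 5, 8)

text_mask = text != 0  # True wherever a real token sits

# mirror of text_encodings.masked_fill(~text_mask[..., None], 0.) from the diff
text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)

print(text_encodings[0, 4].abs().sum())  # tensor(0.) -> the padded slot was zeroed
```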
@@ -879,8 +873,6 @@ class Attention(nn.Module):
         # attention
         attn = sim.softmax(dim = -1, dtype = torch.float32)
-        attn = attn.type(sim.dtype)
         attn = self.dropout(attn)
         # aggregate values
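
In both versions the attention weights come from a float32 softmax, which guards against overflow and underflow when the logits are in half precision; the removed line is the 1.8.4 addition that casts the probabilities back to the logits' dtype before they are used downstream. A minimal sketch of the pattern (the shapes are made up, and this is not the library's full attention module):

```python
import torch

# pretend attention logits produced under fp16 autocast: (batch, heads, queries, keys)
sim = torch.randn(1, 8, 16, 16, dtype = torch.float16)

attn = sim.softmax(dim = -1, dtype = torch.float32)  # softmax accumulated in float32
attn = attn.type(sim.dtype)                          # cast back so later matmuls stay in fp16

assert attn.dtype == torch.float16
```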
@@ -1464,6 +1456,10 @@ class WeightStandardizedConv2d(nn.Conv2d):
     https://arxiv.org/abs/1903.10520
     weight standardization purportedly works synergistically with group normalization
     """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
     def forward(self, x):
         eps = 1e-5 if x.dtype == torch.float32 else 1e-3
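
The docstring is the only explanation of `WeightStandardizedConv2d` in this diff: the convolution weight is standardized per output channel (zero mean, unit variance) before the convolution, with the dtype-dependent `eps` shown in `forward`. The sketch below is an illustrative re-implementation of that idea, not necessarily the library's exact code:

```python
import torch
import torch.nn.functional as F
from torch import nn

class WeightStandardizedConv2d(nn.Conv2d):
    """standardize the weight per output channel before convolving
    (https://arxiv.org/abs/1903.10520); said to pair well with group norm"""
    def forward(self, x):
        eps = 1e-5 if x.dtype == torch.float32 else 1e-3
        weight = self.weight
        mean = weight.mean(dim = (1, 2, 3), keepdim = True)
        var = weight.var(dim = (1, 2, 3), unbiased = False, keepdim = True)
        normalized_weight = (weight - mean) / (var + eps).sqrt()
        return F.conv2d(x, normalized_weight, self.bias, self.stride, self.padding, self.dilation, self.groups)

conv = WeightStandardizedConv2d(3, 16, 3, padding = 1)
out = conv(torch.randn(1, 3, 32, 32))  # same shape behavior as a plain Conv2d
```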
@@ -1639,7 +1635,6 @@ class CrossAttention(nn.Module):
             sim = sim.masked_fill(~mask, max_neg_value)
         attn = sim.softmax(dim = -1, dtype = torch.float32)
-        attn = attn.type(sim.dtype)
         out = einsum('b h i j, b h j d -> b h i d', attn, v)
         out = rearrange(out, 'b h n d -> b n (h d)')
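
For context on the `masked_fill(~mask, max_neg_value)` line kept by both versions: padded key positions are pushed to a very large negative logit before the softmax, so they end up with near-zero attention weight. A toy illustration; `max_neg_value` is assumed here to be the dtype's most negative finite value, since its definition is not part of this diff:

```python
import torch

sim = torch.randn(1, 1, 3, 4)                       # (batch, heads, queries, keys) logits
mask = torch.tensor([[True, True, False, False]])   # only the first two keys are real tokens
mask = mask[:, None, None, :]                       # broadcast over heads and queries

max_neg_value = -torch.finfo(sim.dtype).max
sim = sim.masked_fill(~mask, max_neg_value)

attn = sim.softmax(dim = -1, dtype = torch.float32).type(sim.dtype)
print(attn[0, 0, 0])  # the last two weights are ~0
```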


@@ -9,7 +9,7 @@ from collections.abc import Iterable
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
+from torch.optim.lr_scheduler import LambdaLR
 from torch.cuda.amp import autocast, GradScaler
 from dalle2_pytorch.dalle2_pytorch import Decoder, DiffusionPrior
@@ -181,8 +181,7 @@ class DiffusionPriorTrainer(nn.Module):
         eps = 1e-6,
         max_grad_norm = None,
         group_wd_params = True,
-        warmup_steps = None,
-        cosine_decay_max_steps = None,
+        warmup_steps = 1,
         **kwargs
     ):
         super().__init__()
@@ -234,11 +233,8 @@ class DiffusionPriorTrainer(nn.Module):
             **self.optim_kwargs,
             **kwargs
         )
-        if exists(cosine_decay_max_steps):
-            self.scheduler = CosineAnnealingLR(optimizer, T_max = cosine_decay_max_steps)
-        else:
-            self.scheduler = LambdaLR(self.optimizer, lr_lambda = lambda _: 1.0)
+        self.scheduler = LambdaLR(self.optimizer, lr_lambda = lambda _: 1.0)
         self.warmup_scheduler = warmup.LinearWarmup(self.optimizer, warmup_period = warmup_steps) if exists(warmup_steps) else None
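
The removed branch is where 1.8.4 lets the prior trainer decay the learning rate: if `cosine_decay_max_steps` is given it builds a `CosineAnnealingLR`, otherwise it falls back to a constant `LambdaLR`, and in either case the optimizer can be wrapped in a `pytorch-warmup` linear warmup. A standalone sketch of that construction; the `import pytorch_warmup as warmup` line and all hyperparameter values are assumptions, and the linear layer stands in for the diffusion prior:

```python
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
import pytorch_warmup as warmup  # assumed import behind the `warmup.LinearWarmup` calls

model = nn.Linear(8, 8)  # placeholder for the diffusion prior
optimizer = AdamW(model.parameters(), lr = 1e-4)

warmup_steps = 500
cosine_decay_max_steps = 10_000  # None would mean "keep the learning rate constant"

if cosine_decay_max_steps is not None:
    scheduler = CosineAnnealingLR(optimizer, T_max = cosine_decay_max_steps)
else:
    scheduler = LambdaLR(optimizer, lr_lambda = lambda _: 1.0)

warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period = warmup_steps) if warmup_steps is not None else None
```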
@@ -275,7 +271,6 @@ class DiffusionPriorTrainer(nn.Module):
         # FIXME: LambdaLR can't be saved due to pickling issues
         save_obj = dict(
             optimizer = self.optimizer.state_dict(),
-            scheduler = self.scheduler.state_dict(),
             warmup_scheduler = self.warmup_scheduler,
             model = self.accelerator.unwrap_model(self.diffusion_prior).state_dict(),
             version = version.parse(__version__),
@@ -322,9 +317,7 @@ class DiffusionPriorTrainer(nn.Module):
         # unwrap the model when loading from checkpoint
         self.accelerator.unwrap_model(self.diffusion_prior).load_state_dict(loaded_obj['model'], strict = strict)
         self.step.copy_(torch.ones_like(self.step, device=self.device) * loaded_obj['step'].to(self.device))
         self.optimizer.load_state_dict(loaded_obj['optimizer'])
-        self.scheduler.load_state_dict(loaded_obj['scheduler'])
         # set warmupstep
         if exists(self.warmup_scheduler):
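
Read together with the save hunk above it, this shows that 1.8.4 checkpoints the scheduler's `state_dict()` alongside the optimizer's and restores both on load, while 1.7.0 only round-trips the optimizer. A minimal sketch of that round trip; the modules and the checkpoint path are placeholders:

```python
import torch
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

model = nn.Linear(4, 4)
optimizer = AdamW(model.parameters(), lr = 1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max = 1000)

# save state dicts rather than the objects themselves (the FIXME above notes that
# a LambdaLR object does not pickle cleanly, but its state dict does)
save_obj = dict(
    model = model.state_dict(),
    optimizer = optimizer.state_dict(),
    scheduler = scheduler.state_dict(),
)
torch.save(save_obj, 'checkpoint.pt')

# load: restore each component from its own state dict
loaded_obj = torch.load('checkpoint.pt')
model.load_state_dict(loaded_obj['model'])
optimizer.load_state_dict(loaded_obj['optimizer'])
scheduler.load_state_dict(loaded_obj['scheduler'])
```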
@@ -357,8 +350,7 @@ class DiffusionPriorTrainer(nn.Module):
         # accelerator will ocassionally skip optimizer steps in a "dynamic loss scaling strategy"
         if not self.accelerator.optimizer_step_was_skipped:
-            sched_context = self.warmup_scheduler.dampening if exists(self.warmup_scheduler) else nullcontext
-            with sched_context():
+            with self.warmup_scheduler.dampening():
                 self.scheduler.step()
         if self.use_ema:
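
The comment about skipped optimizer steps refers to dynamic loss scaling: when inf/nan gradients are detected the optimizer step is skipped, and stepping the lr scheduler anyway would let it run ahead of the optimizer. 1.8.4 also tolerates a missing warmup scheduler by falling back to `nullcontext`. Below is a sketch of the same guard written against plain `GradScaler` instead of `accelerate`; the scale-decrease check is a common heuristic for detecting a skipped step and is an assumption here, as are the module names and values:

```python
import torch
from contextlib import nullcontext
from torch import nn
from torch.cuda.amp import GradScaler
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import pytorch_warmup as warmup

model = nn.Linear(4, 4)
optimizer = AdamW(model.parameters(), lr = 1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max = 1000)
warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period = 100)
scaler = GradScaler(enabled = False)  # set enabled = True on CUDA with fp16

loss = model(torch.randn(2, 4)).sum()
scaler.scale(loss).backward()

scale_before = scaler.get_scale()
scaler.step(optimizer)   # silently skipped if the scaled grads contain inf/nan
scaler.update()
step_was_skipped = scaler.get_scale() < scale_before  # the scale shrinks after a skipped step

if not step_was_skipped:
    # dampening() rescales the lr during warmup; fall back to a no-op context without warmup
    sched_context = warmup_scheduler.dampening if warmup_scheduler is not None else nullcontext
    with sched_context():
        scheduler.step()
```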
@@ -441,7 +433,6 @@ class DecoderTrainer(nn.Module):
         wd = 1e-2,
         eps = 1e-8,
         warmup_steps = None,
-        cosine_decay_max_steps = None,
         max_grad_norm = 0.5,
         amp = False,
         group_wd_params = True,
@@ -463,7 +454,7 @@ class DecoderTrainer(nn.Module):
         # be able to finely customize learning rate, weight decay
         # per unet
-        lr, wd, eps, warmup_steps, cosine_decay_max_steps = map(partial(cast_tuple, length = self.num_unets), (lr, wd, eps, warmup_steps, cosine_decay_max_steps))
+        lr, wd, eps, warmup_steps = map(partial(cast_tuple, length = self.num_unets), (lr, wd, eps, warmup_steps))
         assert all([unet_lr <= 1e-2 for unet_lr in lr]), 'your learning rate is too high, recommend sticking with 1e-4, at most 5e-4'
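
`cast_tuple` is used here to broadcast scalar hyperparameters to one value per unet, so lr, wd, eps and the warmup/decay settings can differ per stage. Its real definition is not part of this diff; the helper below is only a guess at its behavior, for illustration:

```python
from functools import partial

def cast_tuple(val, length = 1):
    # assumed behavior: leave tuples untouched, otherwise repeat the scalar `length` times
    return val if isinstance(val, tuple) else (val,) * length

num_unets = 3
lr, wd, warmup_steps = 1e-4, 1e-2, (500, 1000, 1000)

lr, wd, warmup_steps = map(partial(cast_tuple, length = num_unets), (lr, wd, warmup_steps))
print(lr)            # (0.0001, 0.0001, 0.0001)
print(warmup_steps)  # (500, 1000, 1000)
```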
@@ -471,7 +462,7 @@ class DecoderTrainer(nn.Module):
         schedulers = []
         warmup_schedulers = []
-        for unet, unet_lr, unet_wd, unet_eps, unet_warmup_steps, unet_cosine_decay_max_steps in zip(decoder.unets, lr, wd, eps, warmup_steps, cosine_decay_max_steps):
+        for unet, unet_lr, unet_wd, unet_eps, unet_warmup_steps in zip(decoder.unets, lr, wd, eps, warmup_steps):
             if isinstance(unet, nn.Identity):
                 optimizers.append(None)
                 schedulers.append(None)
@@ -487,11 +478,7 @@ class DecoderTrainer(nn.Module):
             )
             optimizers.append(optimizer)
-            if exists(unet_cosine_decay_max_steps):
-                scheduler = CosineAnnealingLR(optimizer, T_max = unet_cosine_decay_max_steps)
-            else:
-                scheduler = LambdaLR(optimizer, lr_lambda = lambda step: 1.0)
+            scheduler = LambdaLR(optimizer, lr_lambda = lambda step: 1.0)
             warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period = unet_warmup_steps) if exists(unet_warmup_steps) else None
             warmup_schedulers.append(warmup_scheduler)
@@ -571,15 +558,9 @@ class DecoderTrainer(nn.Module):
         for ind in range(0, self.num_unets):
             optimizer_key = f'optim{ind}'
-            scheduler_key = f'sched{ind}'
             optimizer = getattr(self, optimizer_key)
-            scheduler = getattr(self, scheduler_key)
-            optimizer_state_dict = optimizer.state_dict() if exists(optimizer) else None
-            scheduler_state_dict = scheduler.state_dict() if exists(scheduler) else None
-            save_obj = {**save_obj, optimizer_key: optimizer_state_dict, scheduler_key: scheduler_state_dict}
+            state_dict = optimizer.state_dict() if optimizer is not None else None
+            save_obj = {**save_obj, optimizer_key: state_dict}
         if self.use_ema:
             save_obj = {**save_obj, 'ema': self.ema_unets.state_dict()}
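
In the save loop above, 1.8.4 stores one optimizer state dict and one scheduler state dict per unet under the keys `optim{ind}` and `sched{ind}`, with `None` placeholders for `nn.Identity` unets that have no optimizer. A small sketch of building such a save object; the unets and the file name are hypothetical:

```python
import torch
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

# two trainable stand-in unets plus one frozen placeholder
unets = [nn.Linear(4, 4), nn.Identity(), nn.Linear(4, 4)]
optimizers = [AdamW(u.parameters(), lr = 1e-4) if not isinstance(u, nn.Identity) else None for u in unets]
schedulers = [LambdaLR(o, lr_lambda = lambda step: 1.0) if o is not None else None for o in optimizers]

save_obj = {}
for ind, (optimizer, scheduler) in enumerate(zip(optimizers, schedulers)):
    optimizer_state_dict = optimizer.state_dict() if optimizer is not None else None
    scheduler_state_dict = scheduler.state_dict() if scheduler is not None else None
    save_obj = {**save_obj, f'optim{ind}': optimizer_state_dict, f'sched{ind}': scheduler_state_dict}

torch.save(save_obj, 'decoder_checkpoint.pt')
```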
@@ -600,18 +581,10 @@ class DecoderTrainer(nn.Module):
             optimizer_key = f'optim{ind}'
             optimizer = getattr(self, optimizer_key)
-            scheduler_key = f'sched{ind}'
-            scheduler = getattr(self, scheduler_key)
             warmup_scheduler = self.warmup_schedulers[ind]
-            if exists(optimizer):
+            if optimizer is not None:
                 optimizer.load_state_dict(loaded_obj[optimizer_key])
-            if exists(scheduler):
-                scheduler.load_state_dict(loaded_obj[scheduler_key])
             if exists(warmup_scheduler):
                 warmup_scheduler.last_step = last_step
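
On load, both versions push the checkpointed step count back into the `pytorch-warmup` scheduler through its `last_step` attribute, so a resumed run does not restart the warmup from zero. A toy illustration; the warmup period and step value are made up:

```python
from torch import nn
from torch.optim import AdamW
import pytorch_warmup as warmup

model = nn.Linear(4, 4)
optimizer = AdamW(model.parameters(), lr = 1e-4)
warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period = 1000)

last_step = 250  # e.g. the step counter stored in the checkpoint
if warmup_scheduler is not None:
    warmup_scheduler.last_step = last_step  # resume the warmup where training left off
```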


@@ -1 +1 @@
-__version__ = '1.8.4'
+__version__ = '1.7.0'