Mirror of https://github.com/lucidrains/DALLE2-pytorch.git, synced 2026-02-23 22:34:45 +01:00

Compare commits: 1 commit (740d644050)

README.md (10 lines changed)
````diff
@@ -1047,14 +1047,4 @@ Once built, images will be saved to the same directory the command is invoked
 }
 ```
 
-```bibtex
-@article{Yu2022CoCaCC,
-    title   = {CoCa: Contrastive Captioners are Image-Text Foundation Models},
-    author  = {Jiahui Yu and Zirui Wang and Vijay Vasudevan and Legg Yeung and Mojtaba Seyedhosseini and Yonghui Wu},
-    journal = {ArXiv},
-    year    = {2022},
-    volume  = {abs/2205.01917}
-}
-```
-
 *Creating noise from data is easy; creating data from noise is generative modeling.* - <a href="https://arxiv.org/abs/2011.13456">Yang Song's paper</a>
````
dalle2_pytorch/dalle2_pytorch.py

```diff
@@ -23,14 +23,9 @@ from dalle2_pytorch.vqgan_vae import NullVQGanVAE, VQGanVAE
 
 from resize_right import resize
 
-# rotary embeddings
-
-from rotary_embedding_torch import RotaryEmbedding
-
 # use x-clip
 
 from x_clip import CLIP
-from coca_pytorch import CoCa
 
 # helper functions
 
```
```diff
@@ -118,10 +113,9 @@ EmbeddedText = namedtuple('EmbedTextReturn', ['text_embed', 'text_encodings', 't
 EmbeddedImage = namedtuple('EmbedImageReturn', ['image_embed', 'image_encodings'])
 
 class BaseClipAdapter(nn.Module):
-    def __init__(self, clip, **kwargs):
+    def __init__(self, clip):
         super().__init__()
         self.clip = clip
-        self.overrides = kwargs
 
     @property
     def dim_latent(self):
```
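The two lines removed here were the hook for adapters whose wrapped model cannot report its own settings: extra constructor keywords were stashed on `self.overrides`, and subclasses asserted their presence on demand (see the `CoCaAdapter` removed in the next hunk). A minimal sketch of the pattern; `NeedsOverridesAdapter` is a hypothetical stand-in, not a class from the library:

```python
from torch import nn

class BaseClipAdapter(nn.Module):
    # pre-change form: extra keyword arguments are stashed as overrides
    def __init__(self, clip, **kwargs):
        super().__init__()
        self.clip = clip
        self.overrides = kwargs

class NeedsOverridesAdapter(BaseClipAdapter):
    # hypothetical adapter whose wrapped model cannot report its input resolution
    @property
    def image_size(self):
        assert 'image_size' in self.overrides
        return self.overrides['image_size']

adapter = NeedsOverridesAdapter(nn.Identity(), image_size = 256)
print(adapter.image_size)  # 256
```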
```diff
@@ -179,39 +173,6 @@ class XClipAdapter(BaseClipAdapter):
         image_embed = self.clip.to_visual_latent(image_cls)
         return EmbeddedImage(l2norm(image_embed), image_encodings)
 
-class CoCaAdapter(BaseClipAdapter):
-    @property
-    def dim_latent(self):
-        return self.clip.dim
-
-    @property
-    def image_size(self):
-        assert 'image_size' in self.overrides
-        return self.overrides['image_size']
-
-    @property
-    def image_channels(self):
-        assert 'image_channels' in self.overrides
-        return self.overrides['image_channels']
-
-    @property
-    def max_text_len(self):
-        assert 'max_text_len' in self.overrides
-        return self.overrides['max_text_len']
-
-    @torch.no_grad()
-    def embed_text(self, text):
-        text = text[..., :self.max_text_len]
-        text_mask = text != 0
-        text_embed, text_encodings = self.clip.embed_text(text)
-        return EmbeddedText(text_embed, text_encodings, text_mask)
-
-    @torch.no_grad()
-    def embed_image(self, image):
-        image = resize_image_to(image, self.image_size)
-        image_embed, image_encodings = self.clip.embed_image(image)
-        return EmbeddedImage(image_embed, image_encodings)
-
 class OpenAIClipAdapter(BaseClipAdapter):
     def __init__(
         self,
```
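The removed `embed_text` documents the tokenizer contract: sequences are truncated to `max_text_len`, and token id 0 is treated as padding when building the attention mask. A standalone sketch of just that masking step, using made-up token ids:

```python
import torch

max_text_len = 8
text = torch.tensor([[5, 12, 7, 0, 0, 0, 0, 0, 0, 0]])  # 0 is the padding id

text = text[..., :max_text_len]  # truncate to the adapter's maximum length
text_mask = text != 0            # True wherever a real token is present
print(text_mask)
# tensor([[ True,  True,  True, False, False, False, False, False]])
```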
```diff
@@ -570,8 +531,7 @@ class Attention(nn.Module):
         heads = 8,
         dropout = 0.,
         causal = False,
-        post_norm = False,
-        rotary_emb = None
+        post_norm = False
     ):
         super().__init__()
         self.scale = dim_head ** -0.5
```
```diff
@@ -587,8 +547,6 @@ class Attention(nn.Module):
         self.to_q = nn.Linear(dim, inner_dim, bias = False)
         self.to_kv = nn.Linear(dim, dim_head * 2, bias = False)
 
-        self.rotary_emb = rotary_emb
-
         self.to_out = nn.Sequential(
             nn.Linear(inner_dim, dim, bias = False),
             LayerNorm(dim) if post_norm else nn.Identity()
```
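Note that `to_kv` projects to `dim_head * 2` rather than `inner_dim * 2`: a single key/value head is shared across all query heads, which is why the similarity einsum further down reads `'b h i d, b j d -> b h i j'` with no head axis on `k`. A shape check under that reading:

```python
import torch
from torch import einsum

b, h, n, d = 2, 8, 16, 64    # batch, query heads, sequence length, dim_head
q = torch.randn(b, h, n, d)  # per-head queries
k = torch.randn(b, n, d)     # one key head shared by every query head

sim = einsum('b h i d, b j d -> b h i j', q, k)
print(sim.shape)  # torch.Size([2, 8, 16, 16])
```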
```diff
@@ -601,12 +559,6 @@ class Attention(nn.Module):
         q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1))
 
         q = rearrange(q, 'b n (h d) -> b h n d', h = self.heads)
-        q = q * self.scale
-
-        # rotary embeddings
-
-        if exists(self.rotary_emb):
-            q, k = map(self.rotary_emb.rotate_queries_or_keys, (q, k))
 
         # add null key / value for classifier free guidance in prior net
 
```
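For reference, the removed call pair came from rotary-embedding-torch, whose `rotate_queries_or_keys` applies a position-dependent rotation along the sequence axis and leaves tensor shapes unchanged; per the construction line removed in the `CausalTransformer` hunk below, only the first `min(32, dim_head)` feature dimensions were rotated. A minimal sketch, assuming the library is installed:

```python
import torch
from rotary_embedding_torch import RotaryEmbedding

rotary_emb = RotaryEmbedding(dim = 32)  # mirrors the removed min(32, dim_head) construction

q = torch.randn(1, 8, 16, 64)           # (batch, heads, seq, dim_head)
k = torch.randn(1, 8, 16, 64)

# rotate queries and keys in place of absolute position embeddings;
# feature dimensions past the first 32 pass through unrotated
q, k = map(rotary_emb.rotate_queries_or_keys, (q, k))
print(q.shape)  # unchanged: torch.Size([1, 8, 16, 64])
```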
```diff
@@ -614,7 +566,7 @@ class Attention(nn.Module):
         k = torch.cat((nk, k), dim = -2)
         v = torch.cat((nv, v), dim = -2)
 
-        # calculate query / key similarities
+        q = q * self.scale
 
         sim = einsum('b h i d, b j d -> b h i j', q, k)
 
```
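Moving `q = q * self.scale` from right after the rearrange to right before the einsum is behavior-preserving: the dot product is bilinear, and the rotary rotation that used to sit in between is norm-preserving, so the scale factor commutes past it. A quick numeric check of the bilinearity half of that claim:

```python
import torch

q = torch.randn(2, 8, 16, 64)
k = torch.randn(2, 16, 64)
scale = 64 ** -0.5

# scaling the queries first, or scaling the similarities after, gives the same logits
early = torch.einsum('b h i d, b j d -> b h i j', q * scale, k)
late  = torch.einsum('b h i d, b j d -> b h i j', q, k) * scale
print(torch.allclose(early, late, atol = 1e-6))  # True
```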
```diff
@@ -664,18 +616,15 @@ class CausalTransformer(nn.Module):
         attn_dropout = 0.,
         ff_dropout = 0.,
         final_proj = True,
-        normformer = False,
-        rotary_emb = True
+        normformer = False
     ):
         super().__init__()
         self.rel_pos_bias = RelPosBias(heads = heads)
 
-        rotary_emb = RotaryEmbedding(dim = min(32, dim_head)) if rotary_emb else None
-
         self.layers = nn.ModuleList([])
         for _ in range(depth):
             self.layers.append(nn.ModuleList([
-                Attention(dim = dim, causal = True, dim_head = dim_head, heads = heads, dropout = attn_dropout, post_norm = normformer, rotary_emb = rotary_emb),
+                Attention(dim = dim, causal = True, dim_head = dim_head, heads = heads, dropout = attn_dropout, post_norm = normformer),
                 FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout, post_activation_norm = normformer)
             ]))
 
```
```diff
@@ -806,7 +755,6 @@ class DiffusionPrior(BaseGaussianDiffusion):
         condition_on_text_encodings = True, # the paper suggests this is needed, but you can turn it off for your CLIP preprocessed text embed -> image embed training
         sampling_clamp_l2norm = False,
         image_embed_scale = None, # this is for scaling the l2-normed image embedding, so it is more suitable for gaussian diffusion, as outlined by Katherine (@crowsonkb) https://github.com/lucidrains/DALLE2-pytorch/issues/60#issue-1226116132
-        clip_adapter_overrides = dict()
     ):
         super().__init__(
             beta_schedule = beta_schedule,
```
```diff
@@ -816,9 +764,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
 
         if exists(clip):
             if isinstance(clip, CLIP):
-                clip = XClipAdapter(clip, **clip_adapter_overrides)
-            elif isinstance(clip, CoCa):
-                clip = CoCaAdapter(clip, **clip_adapter_overrides)
+                clip = XClipAdapter(clip)
 
             assert isinstance(clip, BaseClipAdapter)
             freeze_model_and_make_eval_(clip)
```
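With the `CoCa` branch gone, only x-clip's `CLIP` gets wrapped automatically; any other model now has to be passed in already wrapped in a `BaseClipAdapter` subclass. A self-contained sketch of that dispatch shape; the class bodies here are stand-ins for the library's, kept minimal only to make the snippet runnable:

```python
from torch import nn

class CLIP(nn.Module): pass                # stand-in for x_clip.CLIP

class BaseClipAdapter(nn.Module):          # stand-in for the library's adapter base
    def __init__(self, clip):
        super().__init__()
        self.clip = clip

class XClipAdapter(BaseClipAdapter): pass  # stand-in for the library's x-clip adapter

def wrap_clip(clip):
    # post-change behavior: known CLIP types are adapted, anything else must
    # already be an adapter
    if isinstance(clip, CLIP):
        clip = XClipAdapter(clip)
    assert isinstance(clip, BaseClipAdapter)
    return clip

print(type(wrap_clip(CLIP())).__name__)  # XClipAdapter
```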
```diff
@@ -1541,8 +1487,7 @@ class Decoder(BaseGaussianDiffusion):
         blur_kernel_size = 3, # cascading ddpm - blur kernel size
         condition_on_text_encodings = False, # the paper suggested that this didn't do much in the decoder, but i'm allowing the option for experimentation
         clip_denoised = True,
-        clip_x_start = True,
-        clip_adapter_overrides = dict()
+        clip_x_start = True
     ):
         super().__init__(
             beta_schedule = beta_schedule,
```
```diff
@@ -1555,9 +1500,7 @@ class Decoder(BaseGaussianDiffusion):
         self.clip = None
         if exists(clip):
             if isinstance(clip, CLIP):
-                clip = XClipAdapter(clip, **clip_adapter_overrides)
-            elif isinstance(clip, CoCa):
-                clip = CoCaAdapter(clip, **clip_adapter_overrides)
+                clip = XClipAdapter(clip)
 
             freeze_model_and_make_eval_(clip)
             assert isinstance(clip, BaseClipAdapter)
```
```diff
@@ -111,6 +111,11 @@ class DiffusionPriorTrainer(nn.Module):
         # exponential moving average
 
         self.use_ema = use_ema
 
+        if use_ema:
+            has_lazy_linear = any([type(module) == nn.LazyLinear for module in diffusion_prior.modules()])
+            assert not has_lazy_linear, 'you must set the text_embed_dim on your u-nets if you plan on doing automatic exponential moving average'
+
         if self.use_ema:
             self.ema_diffusion_prior = EMA(diffusion_prior, **ema_kwargs)
 
```
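The new guard exists because `nn.LazyLinear` leaves its parameters uninitialized until the first forward pass, and building an EMA copy requires materialized weights to clone. A small demonstration of the same detection, and of how a dummy forward materializes the layer (after which `type(module)` is plain `nn.Linear`):

```python
import torch
from torch import nn

model = nn.Sequential(nn.LazyLinear(16), nn.ReLU(), nn.Linear(16, 4))

# same check as the diff: is any module still lazy?
has_lazy_linear = any([type(module) == nn.LazyLinear for module in model.modules()])
print(has_lazy_linear)  # True: the first layer has uninitialized parameters

model(torch.randn(2, 8))  # one dummy forward infers in_features and materializes it
print(any([type(m) == nn.LazyLinear for m in model.modules()]))  # False: it is now a plain nn.Linear
```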
setup.py (3 lines changed)

```diff
@@ -10,7 +10,7 @@ setup(
       'dream = dalle2_pytorch.cli:dream'
     ],
   },
-  version = '0.1.0',
+  version = '0.0.108',
   license='MIT',
   description = 'DALL-E 2',
   author = 'Phil Wang',
```
```diff
@@ -24,7 +24,6 @@ setup(
   install_requires=[
     'click',
     'clip-anytorch',
-    'coca-pytorch>=0.0.5',
     'einops>=0.4',
     'einops-exts>=0.0.3',
     'embedding-reader',
```