Compare commits


1 commit

7 changed files with 41 additions and 74 deletions

View File

@@ -1,10 +1,3 @@
-import torch
-from packaging import version
-if version.parse(torch.__version__) >= version.parse('2.0.0'):
-    from einops._torch_specific import allow_ops_in_compiled_graph
-    allow_ops_in_compiled_graph()
 from dalle2_pytorch.version import __version__
 from dalle2_pytorch.dalle2_pytorch import DALLE2, DiffusionPriorNetwork, DiffusionPrior, Unet, Decoder
 from dalle2_pytorch.dalle2_pytorch import OpenAIClipAdapter, OpenClipAdapter

View File

@@ -12,8 +12,10 @@ from torch.utils.checkpoint import checkpoint
 from torch import nn, einsum
 import torchvision.transforms as T
-from einops import rearrange, repeat, reduce, pack, unpack
+from einops import rearrange, repeat, reduce
 from einops.layers.torch import Rearrange
+from einops_exts import rearrange_many, repeat_many, check_shape
+from einops_exts.torch import EinopsToAndFrom
 from kornia.filters import gaussian_blur2d
 import kornia.augmentation as K
@@ -358,7 +360,6 @@ class OpenAIClipAdapter(BaseClipAdapter):
         is_eos_id = (text == self.eos_id)
         text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
         text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
-        text_mask = text_mask & (text != 0)
         assert not self.cleared
         text_embed = self.clip.encode_text(text)
@@ -433,7 +434,6 @@ class OpenClipAdapter(BaseClipAdapter):
         is_eos_id = (text == self.eos_id)
         text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
         text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
-        text_mask = text_mask & (text != 0)
         assert not self.cleared
         text_embed = self.clip.encode_text(text)
@@ -629,7 +629,7 @@ class NoiseScheduler(nn.Module):
     def calculate_v(self, x_start, t, noise = None):
         return (
-            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * noise -
+            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * noise +
             extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * x_start
         )
@@ -667,23 +667,6 @@ class NoiseScheduler(nn.Module):
             return loss
         return loss * extract(self.p2_loss_weight, times, loss.shape)
-# rearrange image to sequence
-class RearrangeToSequence(nn.Module):
-    def __init__(self, fn):
-        super().__init__()
-        self.fn = fn
-    def forward(self, x):
-        x = rearrange(x, 'b c ... -> b ... c')
-        x, ps = pack([x], 'b * c')
-        x = self.fn(x)
-        x, = unpack(x, ps, 'b * c')
-        x = rearrange(x, 'b ... c -> b c ...')
-        return x
 # diffusion prior
 class LayerNorm(nn.Module):
@@ -882,7 +865,7 @@ class Attention(nn.Module):
         # add null key / value for classifier free guidance in prior net
-        nk, nv = map(lambda t: repeat(t, 'd -> b 1 d', b = b), self.null_kv.unbind(dim = -2))
+        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> b 1 d', b = b)
         k = torch.cat((nk, k), dim = -2)
         v = torch.cat((nv, v), dim = -2)
@@ -1139,7 +1122,7 @@ class DiffusionPriorNetwork(nn.Module):
         learned_queries = repeat(self.learned_query, 'd -> b 1 d', b = batch)
         if self.self_cond:
-            learned_queries = torch.cat((self_cond, learned_queries), dim = -2)
+            learned_queries = torch.cat((image_embed, self_cond), dim = -2)
         tokens = torch.cat((
             text_encodings,
@@ -1337,7 +1320,7 @@ class DiffusionPrior(nn.Module):
         elif self.predict_x_start:
             x_start = pred
         else:
-            x_start = self.noise_scheduler.predict_start_from_noise(image_embed, t = time_cond, noise = pred)
+            x_start = self.noise_scheduler.predict_start_from_noise(image_embed, t = time_cond, noise = pred_noise)
         # clip x0 before maybe predicting noise
@@ -1349,7 +1332,10 @@ class DiffusionPrior(nn.Module):
         # predict noise
-        pred_noise = self.noise_scheduler.predict_noise_from_start(image_embed, t = time_cond, x0 = x_start)
+        if self.predict_x_start or self.predict_v:
+            pred_noise = self.noise_scheduler.predict_noise_from_start(image_embed, t = time_cond, x0 = x_start)
+        else:
+            pred_noise = pred
         if time_next < 0:
             image_embed = x_start
@@ -1644,10 +1630,14 @@ class ResnetBlock(nn.Module):
         self.cross_attn = None
         if exists(cond_dim):
-            self.cross_attn = CrossAttention(
-                dim = dim_out,
-                context_dim = cond_dim,
-                cosine_sim = cosine_sim_cross_attn
+            self.cross_attn = EinopsToAndFrom(
+                'b c h w',
+                'b (h w) c',
+                CrossAttention(
+                    dim = dim_out,
+                    context_dim = cond_dim,
+                    cosine_sim = cosine_sim_cross_attn
+                )
             )
         self.block1 = Block(dim, dim_out, groups = groups, weight_standardization = weight_standardization)
@@ -1666,15 +1656,8 @@ class ResnetBlock(nn.Module):
         if exists(self.cross_attn):
             assert exists(cond)
-            h = rearrange(h, 'b c ... -> b ... c')
-            h, ps = pack([h], 'b * c')
             h = self.cross_attn(h, context = cond) + h
-            h, = unpack(h, ps, 'b * c')
-            h = rearrange(h, 'b ... c -> b c ...')
         h = self.block2(h)
         return h + self.res_conv(x)
@@ -1720,11 +1703,11 @@ class CrossAttention(nn.Module):
         q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), (q, k, v))
+        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = self.heads)
         # add null key / value for classifier free guidance in prior net
-        nk, nv = map(lambda t: repeat(t, 'd -> b h 1 d', h = self.heads, b = b), self.null_kv.unbind(dim = -2))
+        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> b h 1 d', h = self.heads, b = b)
         k = torch.cat((nk, k), dim = -2)
         v = torch.cat((nv, v), dim = -2)
@@ -1777,7 +1760,7 @@ class LinearAttention(nn.Module):
         fmap = self.norm(fmap)
         q, k, v = self.to_qkv(fmap).chunk(3, dim = 1)
-        q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> (b h) (x y) c', h = h), (q, k, v))
+        q, k, v = rearrange_many((q, k, v), 'b (h c) x y -> (b h) (x y) c', h = h)
         q = q.softmax(dim = -1)
         k = k.softmax(dim = -2)
@@ -2011,7 +1994,7 @@ class Unet(nn.Module):
         self_attn = cast_tuple(self_attn, num_stages)
-        create_self_attn = lambda dim: RearrangeToSequence(Residual(Attention(dim, **attn_kwargs)))
+        create_self_attn = lambda dim: EinopsToAndFrom('b c h w', 'b (h w) c', Residual(Attention(dim, **attn_kwargs)))
         # resnet block klass
@@ -2511,7 +2494,7 @@ class Decoder(nn.Module):
         dynamic_thres_percentile = 0.95,
         p2_loss_weight_gamma = 0., # p2 loss weight, from https://arxiv.org/abs/2204.00227 - 0 is equivalent to weight of 1 across time - 1. is recommended
         p2_loss_weight_k = 1,
-        ddim_sampling_eta = 0. # can be set to 0. for deterministic sampling afaict
+        ddim_sampling_eta = 1. # can be set to 0. for deterministic sampling afaict
     ):
         super().__init__()
@@ -2745,16 +2728,11 @@ class Decoder(nn.Module):
         if exists(unet_number):
             unet = self.get_unet(unet_number)
-        # devices
-        cuda, cpu = torch.device('cuda'), torch.device('cpu')
         self.cuda()
         devices = [module_device(unet) for unet in self.unets]
-        self.unets.to(cpu)
-        unet.to(cuda)
+        self.unets.cpu()
+        unet.cuda()
         yield
@@ -2995,7 +2973,10 @@ class Decoder(nn.Module):
         # predict noise
-        pred_noise = noise_scheduler.predict_noise_from_start(img, t = time_cond, x0 = x_start)
+        if predict_x_start or predict_v:
+            pred_noise = noise_scheduler.predict_noise_from_start(img, t = time_cond, x0 = x_start)
+        else:
+            pred_noise = pred
         c1 = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
         c2 = ((1 - alpha_next) - torch.square(c1)).sqrt()
@@ -3137,8 +3118,7 @@ class Decoder(nn.Module):
         distributed = False,
         inpaint_image = None,
         inpaint_mask = None,
-        inpaint_resample_times = 5,
-        one_unet_in_gpu_at_time = True
+        inpaint_resample_times = 5
     ):
         assert self.unconditional or exists(image_embed), 'image embed must be present on sampling from decoder unless if trained unconditionally'
@@ -3161,7 +3141,6 @@ class Decoder(nn.Module):
             assert image.shape[0] == batch_size, 'image must have batch size of {} if starting at unet number > 1'.format(batch_size)
             prev_unet_output_size = self.image_sizes[start_at_unet_number - 2]
             img = resize_image_to(image, prev_unet_output_size, nearest = True)
         is_cuda = next(self.parameters()).is_cuda
         num_unets = self.num_unets
@@ -3171,7 +3150,7 @@ class Decoder(nn.Module):
             if unet_number < start_at_unet_number:
                 continue # It's the easiest way to do it
-            context = self.one_unet_in_gpu(unet = unet) if is_cuda and one_unet_in_gpu_at_time else null_context()
+            context = self.one_unet_in_gpu(unet = unet) if is_cuda else null_context()
             with context:
                 # prepare low resolution conditioning for upsamplers
@@ -3248,7 +3227,7 @@ class Decoder(nn.Module):
         learned_variance = self.learned_variance[unet_index]
         b, c, h, w, device, = *image.shape, image.device
-        assert image.shape[1] == self.channels
+        check_shape(image, 'b c h w', c = self.channels)
         assert h >= target_image_size and w >= target_image_size
         times = torch.randint(0, noise_scheduler.num_timesteps, (b,), device = device, dtype = torch.long)

View File

@@ -4,13 +4,11 @@ from pydantic import BaseModel, validator, root_validator
 from typing import List, Optional, Union, Tuple, Dict, Any, TypeVar
 from x_clip import CLIP as XCLIP
-from open_clip import list_pretrained
 from coca_pytorch import CoCa
 from dalle2_pytorch.dalle2_pytorch import (
     CoCaAdapter,
     OpenAIClipAdapter,
-    OpenClipAdapter,
     Unet,
     Decoder,
     DiffusionPrior,
@@ -119,10 +117,6 @@ class AdapterConfig(BaseModel):
     def create(self):
         if self.make == "openai":
             return OpenAIClipAdapter(self.model)
-        elif self.make == "open_clip":
-            pretrained = dict(list_pretrained())
-            checkpoint = pretrained[self.model]
-            return OpenClipAdapter(name=self.model, pretrained=checkpoint)
         elif self.make == "x-clip":
             return XClipAdapter(XCLIP(**self.base_model_kwargs))
         elif self.make == "coca":

View File

@@ -236,7 +236,7 @@ class DiffusionPriorTrainer(nn.Module):
         )
         if exists(cosine_decay_max_steps):
-            self.scheduler = CosineAnnealingLR(self.optimizer, T_max = cosine_decay_max_steps)
+            self.scheduler = CosineAnnealingLR(optimizer, T_max = cosine_decay_max_steps)
         else:
             self.scheduler = LambdaLR(self.optimizer, lr_lambda = lambda _: 1.0)

View File

@@ -1 +1 @@
-__version__ = '1.14.2'
+__version__ = '1.11.0'

View File

@@ -11,7 +11,8 @@ import torch.nn.functional as F
 from torch.autograd import grad as torch_grad
 import torchvision
-from einops import rearrange, reduce, repeat, pack, unpack
+from einops import rearrange, reduce, repeat
+from einops_exts import rearrange_many
 from einops.layers.torch import Rearrange
 # constants
@@ -407,7 +408,7 @@ class Attention(nn.Module):
         x = self.norm(x)
         q, k, v = self.to_qkv(x).chunk(3, dim = -1)
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
+        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = h)
         q = q * self.scale
         sim = einsum('b h i d, b h j d -> b h i j', q, k)

View File

@@ -26,11 +26,11 @@ setup(
   install_requires=[
     'accelerate',
     'click',
-    'open-clip-torch>=2.0.0,<3.0.0',
-    'clip-anytorch>=2.5.2',
+    'clip-anytorch>=2.4.0',
     'coca-pytorch>=0.0.5',
     'ema-pytorch>=0.0.7',
-    'einops>=0.6.1',
+    'einops>=0.4',
+    'einops-exts>=0.0.3',
     'embedding-reader',
     'kornia>=0.5.4',
     'numpy',