allow text encodings and text mask to be passed in on forward and sampling for Decoder class

2026-02-12 11:34:29 +01:00 · 2022-05-16 10:25:06 -07:00
7 changed files with 31 additions and 159 deletions
--- a/README.md
+++ b/README.md
@@ -14,16 +14,6 @@ Please join <a href="https://discord.gg/xBPBXfcFHd"><img alt="Join us on Discord

 There was enough interest for a <a href="https://github.com/lucidrains/dalle2-jax">Jax version</a>. I will also eventually extend this to <a href="https://github.com/lucidrains/dalle2-video">text to video</a>, once the repository is in a good place.

-## Status
-
- A research group has used the code in this repository to train a functional diffusion prior for their CLIP generations. Will share their work once they release their preprint. This, and <a href="https://github.com/crowsonkb">Katherine's</a> own experiments, validate OpenAI's finding that the extra prior increases variety of generations.
-
- Decoder is now verified working for unconditional generation on my experimental setup for Oxford flowers. 2 researchers have also confirmed Decoder is working for them.
-
-<img src="./samples/oxford.png" width="600px" />
-
-*ongoing at 21k steps*
-
 ## Install

 ```bash
@@ -824,8 +814,8 @@ clip = CLIP(

 # mock data

-text = torch.randint(0, 49408, (512, 256)).cuda()
-images = torch.randn(512, 3, 256, 256).cuda()
+text = torch.randint(0, 49408, (32, 256)).cuda()
+images = torch.randn(32, 3, 256, 256).cuda()

 # prior networks (with transformer)

@@ -858,7 +848,7 @@ diffusion_prior_trainer.update()  # this will update the optimizer as well as th
 # after much of the above three lines in a loop
 # you can sample from the exponential moving average of the diffusion prior identically to how you do so for DiffusionPrior

-image_embeds = diffusion_prior_trainer.sample(text, max_batch_size = 4) # (512, 512) - exponential moving averaged image embeddings
+image_embeds = diffusion_prior_trainer.sample(text) # (32, 512) - exponential moving averaged image embeddings
 ```

 ## Bonus
@@ -871,7 +861,7 @@ ex.

 ```python
 import torch
-from dalle2_pytorch import Unet, Decoder, DecoderTrainer
+from dalle2_pytorch import Unet, Decoder

 # unet for the cascading ddpm

@@ -894,24 +884,20 @@ decoder = Decoder(
    unconditional = True
 ).cuda()

-# decoder trainer
-
-decoder_trainer = DecoderTrainer(decoder)
-
-# images (get a lot of this)
+# mock images (get a lot of this)

 images = torch.randn(1, 3, 512, 512).cuda()

 # feed images into decoder

 for i in (1, 2):
-    loss = decoder_trainer(images, unet_number = i)
-    decoder_trainer.update(unet_number = i)
+    loss = decoder(images, unet_number = i)
+    loss.backward()

-# do the above for many many many many images
+# do the above for many many many many steps
 # then it will learn to generate images

-images = decoder_trainer.sample(batch_size = 36, max_batch_size = 4) # (36, 3, 512, 512)
+images = decoder.sample(batch_size = 2) # (2, 3, 512, 512)
 ```

 ## Dataloaders
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -1697,8 +1697,7 @@ class Decoder(BaseGaussianDiffusion):
        clip_adapter_overrides = dict(),
        learned_variance = True,
        vb_loss_weight = 0.001,
-        unconditional = False,
-        auto_normalize_img = True,                  # whether to take care of normalizing the image from [0, 1] to [-1, 1] and back automatically - you can turn this off if you want to pass in the [-1, 1] ranged image yourself from the dataloader
+        unconditional = False
    ):
        super().__init__(
            beta_schedule = beta_schedule,
@@ -1807,10 +1806,6 @@ class Decoder(BaseGaussianDiffusion):
        self.clip_denoised = clip_denoised
        self.clip_x_start = clip_x_start

-        # normalize and unnormalize image functions
-        self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity
-        self.unnormalize_img = unnormalize_zero_to_one if auto_normalize_img else identity
-
    def get_unet(self, unet_number):
        assert 0 < unet_number <= len(self.unets)
        index = unet_number - 1
@@ -1875,14 +1870,13 @@ class Decoder(BaseGaussianDiffusion):
        return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise

    @torch.no_grad()
-    def p_sample_loop(self, unet, shape, image_embed, predict_x_start = False, learned_variance = False, clip_denoised = True, lowres_cond_img = None, text_encodings = None, text_mask = None, cond_scale = 1, is_latent_diffusion = False):
+    def p_sample_loop(self, unet, shape, image_embed, predict_x_start = False, learned_variance = False, clip_denoised = True, lowres_cond_img = None, text_encodings = None, text_mask = None, cond_scale = 1):
        device = self.betas.device

        b = shape[0]
        img = torch.randn(shape, device = device)

-        if not is_latent_diffusion:
-            lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img)
+        lowres_cond_img = maybe(normalize_neg_one_to_one)(lowres_cond_img)

        for i in tqdm(reversed(range(0, self.num_timesteps)), desc = 'sampling loop time step', total = self.num_timesteps):
            img = self.p_sample(
@@ -1899,17 +1893,16 @@ class Decoder(BaseGaussianDiffusion):
                clip_denoised = clip_denoised
            )

-        unnormalize_img = self.unnormalize_img(img)
+        unnormalize_img = unnormalize_zero_to_one(img)
        return unnormalize_img

-    def p_losses(self, unet, x_start, times, *, image_embed, lowres_cond_img = None, text_encodings = None, text_mask = None, predict_x_start = False, noise = None, learned_variance = False, clip_denoised = False, is_latent_diffusion = False):
+    def p_losses(self, unet, x_start, times, *, image_embed, lowres_cond_img = None, text_encodings = None, text_mask = None, predict_x_start = False, noise = None, learned_variance = False, clip_denoised = False):
        noise = default(noise, lambda: torch.randn_like(x_start))

        # normalize to [-1, 1]

-        if not is_latent_diffusion:
-            x_start = self.normalize_img(x_start)
-            lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img)
+        x_start = normalize_neg_one_to_one(x_start)
+        lowres_cond_img = maybe(normalize_neg_one_to_one)(lowres_cond_img)

        # get x_t

@@ -1987,7 +1980,7 @@ class Decoder(BaseGaussianDiffusion):
            batch_size = image_embed.shape[0]

        if exists(text) and not exists(text_encodings) and not self.unconditional:
-            assert exists(self.clip)
+            assert exists(self.clip)
            _, text_encodings, text_mask = self.clip.embed_text(text)

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
@@ -2023,8 +2016,7 @@ class Decoder(BaseGaussianDiffusion):
                    predict_x_start = predict_x_start,
                    learned_variance = learned_variance,
                    clip_denoised = not is_latent_diffusion,
-                    lowres_cond_img = lowres_cond_img,
-                    is_latent_diffusion = is_latent_diffusion
+                    lowres_cond_img = lowres_cond_img
                )

                img = vae.decode(img)
@@ -2083,14 +2075,12 @@ class Decoder(BaseGaussianDiffusion):
            image = aug(image)
            lowres_cond_img = aug(lowres_cond_img, params = aug._params)

-        is_latent_diffusion = not isinstance(vae, NullVQGanVAE)
-
        vae.eval()
        with torch.no_grad():
            image = vae.encode(image)
            lowres_cond_img = maybe(vae.encode)(lowres_cond_img)

-        return self.p_losses(unet, image, times, image_embed = image_embed, text_encodings = text_encodings, text_mask = text_mask, lowres_cond_img = lowres_cond_img, predict_x_start = predict_x_start, learned_variance = learned_variance, is_latent_diffusion = is_latent_diffusion)
+        return self.p_losses(unet, image, times, image_embed = image_embed, text_encodings = text_encodings, text_mask = text_mask, lowres_cond_img = lowres_cond_img, predict_x_start = predict_x_start, learned_variance = learned_variance)

 # main class

--- a/dalle2_pytorch/dataloaders/simple_image_only_dataloader.py
+++ b/dalle2_pytorch/dataloaders/simple_image_only_dataloader.py
@@ -1,59 +0,0 @@
-from pathlib import Path
-
-import torch
-from torch.utils import data
-from torchvision import transforms, utils
-
-from PIL import Image
-
-# helpers functions
-
-def cycle(dl):
-    while True:
-        for data in dl:
-            yield data
-
-# dataset and dataloader
-
-class Dataset(data.Dataset):
-    def __init__(
-        self,
-        folder,
-        image_size,
-        exts = ['jpg', 'jpeg', 'png']
-    ):
-        super().__init__()
-        self.folder = folder
-        self.image_size = image_size
-        self.paths = [p for ext in exts for p in Path(f'{folder}').glob(f'**/*.{ext}')]
-
-        self.transform = transforms.Compose([
-            transforms.Resize(image_size),
-            transforms.RandomHorizontalFlip(),
-            transforms.CenterCrop(image_size),
-            transforms.ToTensor()
-        ])
-
-    def __len__(self):
-        return len(self.paths)
-
-    def __getitem__(self, index):
-        path = self.paths[index]
-        img = Image.open(path)
-        return self.transform(img)
-
-def get_images_dataloader(
-    folder,
-    *,
-    batch_size,
-    image_size,
-    shuffle = True,
-    cycle_dl = True,
-    pin_memory = True
-):
-    ds = Dataset(folder, image_size)
-    dl = data.DataLoader(ds, batch_size = batch_size, shuffle = shuffle, pin_memory = pin_memory)
-
-    if cycle_dl:
-        dl = cycle(dl)
-    return dl
--- a/dalle2_pytorch/optimizer.py
+++ b/dalle2_pytorch/optimizer.py
@@ -7,7 +7,7 @@ def separate_weight_decayable_params(params):

 def get_optimizer(
    params,
-    lr = 1e-4,
+    lr = 2e-5,
    wd = 1e-2,
    betas = (0.9, 0.999),
    eps = 1e-8,
--- a/dalle2_pytorch/trainer.py
+++ b/dalle2_pytorch/trainer.py
@@ -47,14 +47,6 @@ def groupby_prefix_and_trim(prefix, d):
    kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
    return kwargs_without_prefix, kwargs

-def num_to_groups(num, divisor):
-    groups = num // divisor
-    remainder = num % divisor
-    arr = [divisor] * groups
-    if remainder > 0:
-        arr.append(remainder)
-    return arr
-
 # decorators

 def cast_torch_tensor(fn):
@@ -187,8 +179,8 @@ class EMA(nn.Module):
        self.online_model = model
        self.ema_model = copy.deepcopy(model)

+        self.update_after_step = update_after_step # only start EMA after this step number, starting at 0
        self.update_every = update_every
-        self.update_after_step = update_after_step  // update_every # only start EMA after this step number, starting at 0

        self.register_buffer('initted', torch.Tensor([False]))
        self.register_buffer('step', torch.tensor([0.]))
@@ -197,21 +189,14 @@ class EMA(nn.Module):
        device = self.initted.device
        self.ema_model.to(device)

-    def copy_params_from_model_to_ema(self):
-        self.ema_model.state_dict(self.online_model.state_dict())
-
    def update(self):
        self.step += 1

-        if (self.step % self.update_every) != 0:
-            return
-
-        if self.step <= self.update_after_step:
-            self.copy_params_from_model_to_ema()
+        if self.step <= self.update_after_step or (self.step % self.update_every) != 0:
            return

        if not self.initted:
-            self.copy_params_from_model_to_ema()
+            self.ema_model.load_state_dict(self.online_model.state_dict())
            self.initted.data.copy_(torch.Tensor([True]))

        self.update_moving_average(self.ema_model, self.online_model)
@@ -235,16 +220,6 @@ class EMA(nn.Module):

 # diffusion prior trainer

-def prior_sample_in_chunks(fn):
-    @wraps(fn)
-    def inner(self, *args, max_batch_size = None, **kwargs):
-        if not exists(max_batch_size):
-            return fn(self, *args, **kwargs)
-
-        outputs = [fn(self, *chunked_args, **chunked_kwargs) for _, (chunked_args, chunked_kwargs) in split_args_and_kwargs(*args, split_size = max_batch_size, **kwargs)]
-        return torch.cat(outputs, dim = 0)
-    return inner
-
 class DiffusionPriorTrainer(nn.Module):
    def __init__(
        self,
@@ -305,13 +280,11 @@ class DiffusionPriorTrainer(nn.Module):

    @torch.no_grad()
    @cast_torch_tensor
-    @prior_sample_in_chunks
    def p_sample_loop(self, *args, **kwargs):
        return self.ema_diffusion_prior.ema_model.p_sample_loop(*args, **kwargs)

    @torch.no_grad()
    @cast_torch_tensor
-    @prior_sample_in_chunks
    def sample(self, *args, **kwargs):
        return self.ema_diffusion_prior.ema_model.sample(*args, **kwargs)

@@ -342,31 +315,15 @@ class DiffusionPriorTrainer(nn.Module):

 # decoder trainer

-def decoder_sample_in_chunks(fn):
-    @wraps(fn)
-    def inner(self, *args, max_batch_size = None, **kwargs):
-        if not exists(max_batch_size):
-            return fn(self, *args, **kwargs)
-
-        if self.decoder.unconditional:
-            batch_size = kwargs.get('batch_size')
-            batch_sizes = num_to_groups(batch_size, max_batch_size)
-            outputs = [fn(self, *args, **{**kwargs, 'batch_size': sub_batch_size}) for sub_batch_size in batch_sizes]
-        else:
-            outputs = [fn(self, *chunked_args, **chunked_kwargs) for _, (chunked_args, chunked_kwargs) in split_args_and_kwargs(*args, split_size = max_batch_size, **kwargs)]
-
-        return torch.cat(outputs, dim = 0)
-    return inner
-
 class DecoderTrainer(nn.Module):
    def __init__(
        self,
        decoder,
        use_ema = True,
-        lr = 1e-4,
+        lr = 2e-5,
        wd = 1e-2,
        eps = 1e-8,
-        max_grad_norm = 0.5,
+        max_grad_norm = None,
        amp = False,
        **kwargs
    ):
@@ -447,17 +404,15 @@ class DecoderTrainer(nn.Module):

    @torch.no_grad()
    @cast_torch_tensor
-    @decoder_sample_in_chunks
    def sample(self, *args, **kwargs):
-        if kwargs.pop('use_non_ema', False) or not self.use_ema:
-            return self.decoder.sample(*args, **kwargs)
-
-        trainable_unets = self.decoder.unets
-        self.decoder.unets = self.unets                  # swap in exponential moving averaged unets for sampling
+        if self.use_ema:
+            trainable_unets = self.decoder.unets
+            self.decoder.unets = self.unets                  # swap in exponential moving averaged unets for sampling

        output = self.decoder.sample(*args, **kwargs)

-        self.decoder.unets = trainable_unets             # restore original training unets
+        if self.use_ema:
+            self.decoder.unets = trainable_unets             # restore original training unets

        # cast the ema_model unets back to original device
        for ema in self.ema_unets:
--- a/samples/oxford.png
+++ b/samples/oxford.png
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.3.3',
+  version = '0.2.41',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',