now completely OpenAI CLIP compatible for training

add some training fns
fix another bug thanks to @xiankgx
2026-02-12 11:34:29 +01:00 · 2022-04-29 11:26:24 -07:00 · 2022-04-29 09:44:55 -07:00 · 2022-04-29 07:38:32 -07:00 · 2022-04-28 21:59:13 -07:00 · 2022-04-28 20:53:15 -07:00
5 changed files with 245 additions and 60 deletions
--- a/README.md
+++ b/README.md
@@ -495,6 +495,96 @@ loss.backward()
 # now the diffusion prior can generate image embeddings from the text embeddings
 ```

+## OpenAI CLIP
+
+Although there is the possibility they are using an unreleased, more powerful CLIP, you can use one of the released ones, if you do not wish to train your own CLIP from scratch. This will also allow the community to more quickly validate the conclusions of the paper.
+
+First you'll need to install <a href="https://github.com/openai/CLIP#usage">the prerequisites</a>
+
+Then to use a pretrained OpenAI CLIP, simply import `OpenAIClipAdapter` and pass it into the `DiffusionPrior` or `Decoder` like so
+
+```python
+import torch
+from dalle2_pytorch import DALLE2, DiffusionPriorNetwork, DiffusionPrior, Unet, Decoder, OpenAIClipAdapter
+
+# openai pretrained clip - defaults to ViT/B-32
+
+clip = OpenAIClipAdapter()
+
+# mock data
+
+text = torch.randint(0, 49408, (4, 256)).cuda()
+images = torch.randn(4, 3, 256, 256).cuda()
+
+# prior networks (with transformer)
+
+prior_network = DiffusionPriorNetwork(
+    dim = 512,
+    depth = 6,
+    dim_head = 64,
+    heads = 8
+).cuda()
+
+diffusion_prior = DiffusionPrior(
+    net = prior_network,
+    clip = clip,
+    timesteps = 100,
+    cond_drop_prob = 0.2
+).cuda()
+
+loss = diffusion_prior(text, images)
+loss.backward()
+
+# do above for many steps ...
+
+# decoder (with unet)
+
+unet1 = Unet(
+    dim = 128,
+    image_embed_dim = 512,
+    cond_dim = 128,
+    channels = 3,
+    dim_mults=(1, 2, 4, 8)
+).cuda()
+
+unet2 = Unet(
+    dim = 16,
+    image_embed_dim = 512,
+    cond_dim = 128,
+    channels = 3,
+    dim_mults = (1, 2, 4, 8, 16)
+).cuda()
+
+decoder = Decoder(
+    unet = (unet1, unet2),
+    image_sizes = (128, 256),
+    clip = clip,
+    timesteps = 100,
+    cond_drop_prob = 0.2,
+    condition_on_text_encodings = False  # set this to True if you wish to condition on text during training and sampling
+).cuda()
+
+for unet_number in (1, 2):
+    loss = decoder(images, unet_number = unet_number) # this can optionally be decoder(images, text) if you wish to condition on the text encodings as well, though it was hinted in the paper it didn't do much
+    loss.backward()
+
+# do above for many steps
+
+dalle2 = DALLE2(
+    prior = diffusion_prior,
+    decoder = decoder
+)
+
+images = dalle2(
+    ['a butterfly trying to escape a tornado'],
+    cond_scale = 2. # classifier free guidance strength (> 1 would strengthen the condition)
+)
+
+# save your image (in this example, of size 256x256)
+```
+
+Now you'll just have to worry about training the Prior and the Decoder!
+
 ## Experimental

 ### DALL-E2 with Latent Diffusion
--- a/dalle2_pytorch/init.py
+++ b/dalle2_pytorch/init.py
@@ -1,4 +1,5 @@
 from dalle2_pytorch.dalle2_pytorch import DALLE2, DiffusionPriorNetwork, DiffusionPrior, Unet, Decoder
+from dalle2_pytorch.dalle2_pytorch import OpenAIClipAdapter

 from dalle2_pytorch.vqgan_vae import VQGanVAE
 from x_clip import CLIP
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -7,6 +7,7 @@ from contextlib import contextmanager
 import torch
 import torch.nn.functional as F
 from torch import nn, einsum
+import torchvision.transforms as T

 from einops import rearrange, repeat
 from einops.layers.torch import Rearrange
@@ -89,6 +90,16 @@ def resize_image_to(t, image_size, mode = 'bilinear'): # take a look at https://

    return F.interpolate(t, size = shape, mode = mode, align_corners = False)

+# image normalization functions
+# ddpms expect images to be in the range of -1 to 1
+# but CLIP may otherwise
+
+def normalize_img(img):
+    return img * 2 - 1
+
+def unnormalize_img(normed_img):
+    return (normed_img + 1) * 0.5
+
 # clip related adapters

 class BaseClipAdapter(nn.Module):
@@ -108,6 +119,10 @@ class BaseClipAdapter(nn.Module):
    def image_channels(self):
        raise NotImplementedError

+    @property
+    def max_text_len(self):
+        raise NotImplementedError
+
    def embed_text(self, text):
        raise NotImplementedError

@@ -127,12 +142,18 @@ class XClipAdapter(BaseClipAdapter):
    def image_channels(self):
        return self.clip.image_channels

+    @property
+    def max_text_len(self):
+        return self.clip.text_seq_len
+
    @torch.no_grad()
    def embed_text(self, text):
+        text = text[..., :self.max_text_len]
+        text_mask = text != 0
        encoder_output = self.clip.text_transformer(text)
        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
        text_embed = self.clip.to_text_latent(text_cls)
-        return l2norm(text_embed), text_encodings
+        return l2norm(text_embed), text_encodings, text_mask

    @torch.no_grad()
    def embed_image(self, image):
@@ -142,6 +163,72 @@ class XClipAdapter(BaseClipAdapter):
        image_embed = self.clip.to_visual_latent(image_cls)
        return l2norm(image_embed), image_encodings

+class OpenAIClipAdapter(BaseClipAdapter):
+    def __init__(
+        self,
+        name = 'ViT-B/32'
+    ):
+        try:
+            import clip
+        except ImportError:
+            print('you must install openai clip in order to use this adapter - `pip install git+https://github.com/openai/CLIP.git` - more instructions at https://github.com/openai/CLIP#usage')
+
+        openai_clip, _ = clip.load(name)
+        super().__init__(openai_clip)
+
+        text_attention_final = self.find_layer('ln_final')
+        self.handle = text_attention_final.register_forward_hook(self._hook)
+        self.clip_normalize = T.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+        self.cleared = False
+
+    def find_layer(self,  layer):
+        modules = dict([*self.clip.named_modules()])
+        return modules.get(layer, None)
+
+    def clear(self):
+        if self.cleared:
+            return
+
+        self.handle()
+
+    def _hook(self, _, inputs, outputs):
+        self.text_encodings = outputs
+
+    @property
+    def dim_latent(self):
+        return 512
+
+    @property
+    def image_size(self):
+        return self.clip.visual.input_resolution
+
+    @property
+    def image_channels(self):
+        return 3
+
+    @property
+    def max_text_len(self):
+        return self.clip.context_length
+
+    @torch.no_grad()
+    def embed_text(self, text):
+        text = text[..., :self.max_text_len]
+        text_mask = text != 0
+        assert not self.cleared
+
+        text_embed = self.clip.encode_text(text)
+        text_encodings = self.text_encodings
+        del self.text_encodings
+        return text_embed.float(), text_encodings.float(), text_mask
+
+    @torch.no_grad()
+    def embed_image(self, image):
+        assert not self.cleared
+        image = resize_image_to(image, self.image_size)
+        image = self.clip_normalize(unnormalize_img(image))
+        image_embed = self.clip.encode_image(image)
+        return image_embed.float(), None
+
 # classifier free guidance functions

 def prob_mask_like(shape, prob, device):
@@ -222,7 +309,18 @@ class BaseGaussianDiffusion(nn.Module):

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)
+
+        if loss_type == 'l1':
+            loss_fn = F.l1_loss
+        elif loss_type == 'l2':
+            loss_fn = F.mse_loss
+        elif loss_type == 'huber':
+            loss_fn = F.smooth_l1_loss
+        else:
+            raise NotImplementedError()
+
        self.loss_type = loss_type
+        self.loss_fn = loss_fn

        self.register_buffer('betas', betas)
        self.register_buffer('alphas_cumprod', alphas_cumprod)
@@ -646,9 +744,12 @@ class DiffusionPrior(BaseGaussianDiffusion):
        )

        if exists(clip):
-            assert isinstance(clip, CLIP)
+            if isinstance(clip, CLIP):
+                clip = XClipAdapter(clip)
+
+            assert isinstance(clip, BaseClipAdapter)
            freeze_model_and_make_eval_(clip)
-            self.clip = XClipAdapter(clip)
+            self.clip = clip
        else:
            assert exists(image_embed_dim), 'latent dimension must be given, if training prior network without CLIP given'
            self.clip = None
@@ -699,29 +800,21 @@ class DiffusionPrior(BaseGaussianDiffusion):
            img = self.p_sample(img, torch.full((b,), i, device = device, dtype = torch.long), text_cond = text_cond)
        return img

-    def p_losses(self, image_embed, t, text_cond, noise = None):
+    def p_losses(self, image_embed, times, text_cond, noise = None):
        noise = default(noise, lambda: torch.randn_like(image_embed))

-        image_embed_noisy = self.q_sample(x_start = image_embed, t = t, noise = noise)
+        image_embed_noisy = self.q_sample(x_start = image_embed, t = times, noise = noise)

-        x_recon = self.net(
+        pred = self.net(
            image_embed_noisy,
-            t,
+            times,
            cond_drop_prob = self.cond_drop_prob,
            **text_cond
        )

-        to_predict = noise if not self.predict_x_start else image_embed
-
-        if self.loss_type == 'l1':
-            loss = F.l1_loss(to_predict, x_recon)
-        elif self.loss_type == 'l2':
-            loss = F.mse_loss(to_predict, x_recon)
-        elif self.loss_type == "huber":
-            loss = F.smooth_l1_loss(to_predict, x_recon)
-        else:
-            raise NotImplementedError()
+        target = noise if not self.predict_x_start else image_embed

+        loss = self.loss_fn(pred, target)
        return loss

    @torch.no_grad()
@@ -734,13 +827,12 @@ class DiffusionPrior(BaseGaussianDiffusion):
        batch_size = text.shape[0]
        image_embed_dim = self.image_embed_dim

-        text_embed, text_encodings = self.clip.embed_text(text)
+        text_embed, text_encodings, text_mask = self.clip.embed_text(text)

-        text_cond = dict(
-            text_embed = text_embed,
-            text_encodings = text_encodings,
-            mask = text != 0
-        )
+        text_cond = dict(text_embed = text_embed)
+
+        if self.condition_on_text_encodings:
+            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}

        image_embeds = self.p_sample_loop((batch_size, image_embed_dim), text_cond = text_cond)
        text_embeds = text_cond['text_embed']
@@ -777,14 +869,13 @@ class DiffusionPrior(BaseGaussianDiffusion):
        # calculate text conditionings, based on what is passed in

        if exists(text):
-            text_embed, text_encodings = self.clip.embed_text(text)
-            text_mask = text != 0
+            text_embed, text_encodings, text_mask = self.clip.embed_text(text)

-        text_cond = dict(
-            text_embed = text_embed,
-            text_encodings = text_encodings,
-            mask = text_mask
-        )
+        text_cond = dict(text_embed = text_embed)
+
+        if self.condition_on_text_encodings:
+            assert exists(text_encodings), 'text encodings must be present for diffusion prior if specified'
+            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}

        # timestep conditioning from ddpm

@@ -793,8 +884,7 @@ class DiffusionPrior(BaseGaussianDiffusion):

        # calculate forward loss

-        loss = self.p_losses(image_embed, times, text_cond = text_cond, *args, **kwargs)
-        return loss
+        return self.p_losses(image_embed, times, text_cond = text_cond, *args, **kwargs)

 # decoder

@@ -1064,13 +1154,14 @@ class Unet(nn.Module):
        self,
        *,
        lowres_cond,
-        channels
+        channels,
+        cond_on_image_embeds
    ):
-        if lowres_cond == self.lowres_cond and channels == self.channels:
+        if lowres_cond == self.lowres_cond and channels == self.channels and cond_on_image_embeds == self.cond_on_image_embeds:
            return self

-        updated_kwargs = {**self._locals, 'lowres_cond': lowres_cond, 'channels': channels}
-        return self.__class__(**updated_kwargs)
+        updated_kwargs = {'lowres_cond': lowres_cond, 'channels': channels, 'cond_on_image_embeds': cond_on_image_embeds}
+        return self.__class__(**{**self._locals, **updated_kwargs})

    def forward_with_cond_scale(
        self,
@@ -1207,7 +1298,7 @@ class LowresConditioner(nn.Module):
        target_image_size = cast_tuple(target_image_size, 2)

        if self.training and self.downsample_first and exists(downsample_image_size):
-            cond_fmap = resize_image_to(cond_fmap, target_image_size, mode = self.cond_upsample_mode)
+            cond_fmap = resize_image_to(cond_fmap, downsample_image_size, mode = self.cond_upsample_mode)

        if self.training:
            # when training, blur the low resolution conditional image
@@ -1249,6 +1340,8 @@ class Decoder(BaseGaussianDiffusion):
            clip = XClipAdapter(clip)

        freeze_model_and_make_eval_(clip)
+        assert isinstance(clip, BaseClipAdapter)
+
        self.clip = clip
        self.clip_image_size = clip.image_size
        self.channels = clip.image_channels
@@ -1275,6 +1368,7 @@ class Decoder(BaseGaussianDiffusion):

            one_unet = one_unet.cast_model_parameters(
                lowres_cond = not is_first,
+                cond_on_image_embeds = is_first,
                channels = unet_channels
            )

@@ -1332,11 +1426,8 @@ class Decoder(BaseGaussianDiffusion):

    @torch.no_grad()
    def get_image_embed(self, image):
-        image = resize_image_to(image, self.clip_image_size)
-        image_encoding = self.clip.visual_transformer(image)
-        image_cls = image_encoding[:, 0]
-        image_embed = self.clip.to_visual_latent(image_cls)
-        return l2norm(image_embed)
+        image_embed, _ = self.clip.embed_image(image)
+        return image_embed

    def p_mean_variance(self, unet, x, t, image_embed, text_encodings = None, lowres_cond_img = None, clip_denoised = True, predict_x_start = False, cond_scale = 1.):
        pred = unet.forward_with_cond_scale(x, t, image_embed = image_embed, text_encodings = text_encodings, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img)
@@ -1382,14 +1473,14 @@ class Decoder(BaseGaussianDiffusion):

        return img

-    def p_losses(self, unet, x_start, t, *, image_embed, lowres_cond_img = None, text_encodings = None, predict_x_start = False, noise = None):
+    def p_losses(self, unet, x_start, times, *, image_embed, lowres_cond_img = None, text_encodings = None, predict_x_start = False, noise = None):
        noise = default(noise, lambda: torch.randn_like(x_start))

-        x_noisy = self.q_sample(x_start = x_start, t = t, noise = noise)
+        x_noisy = self.q_sample(x_start = x_start, t = times, noise = noise)

-        x_recon = unet(
+        pred = unet(
            x_noisy,
-            t,
+            times,
            image_embed = image_embed,
            text_encodings = text_encodings,
            lowres_cond_img = lowres_cond_img,
@@ -1398,15 +1489,7 @@ class Decoder(BaseGaussianDiffusion):

        target = noise if not predict_x_start else x_start

-        if self.loss_type == 'l1':
-            loss = F.l1_loss(target, x_recon)
-        elif self.loss_type == 'l2':
-            loss = F.mse_loss(target, x_recon)
-        elif self.loss_type == "huber":
-            loss = F.smooth_l1_loss(target, x_recon)
-        else:
-            raise NotImplementedError()
-
+        loss = self.loss_fn(pred, target)
        return loss

    @torch.no_grad()
@@ -1416,9 +1499,10 @@ class Decoder(BaseGaussianDiffusion):

        text_encodings = None
        if exists(text):
-            _, text_encodings = self.clip.embed_text(text)
+            _, text_encodings, _ = self.clip.embed_text(text)

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
+        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

        img = None

@@ -1483,9 +1567,10 @@ class Decoder(BaseGaussianDiffusion):

        text_encodings = None
        if exists(text) and not exists(text_encodings):
-            _, text_encodings = self.clip.embed_text(text)
+            _, text_encodings, _ = self.clip.embed_text(text)

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
+        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

        lowres_cond_img = self.to_lowres_cond(image, target_image_size = target_image_size, downsample_image_size = self.image_sizes[unet_index - 1]) if unet_number > 1 else None
        image = resize_image_to(image, target_image_size)
@@ -1518,12 +1603,15 @@ class DALLE2(nn.Module):
        self.prior_num_samples = prior_num_samples
        self.decoder_need_text_cond = self.decoder.condition_on_text_encodings

+        self.to_pil = T.ToPILImage()
+
    @torch.no_grad()
    @eval_decorator
    def forward(
        self,
        text,
-        cond_scale = 1.
+        cond_scale = 1.,
+        return_pil_images = False
    ):
        device = next(self.parameters()).device
        one_text = isinstance(text, str) or (not is_list_str(text) and text.shape[0] == 1)
@@ -1537,7 +1625,11 @@ class DALLE2(nn.Module):
        text_cond = text if self.decoder_need_text_cond else None
        images = self.decoder.sample(image_embed, text = text_cond, cond_scale = cond_scale)

+        if return_pil_images:
+            images = list(map(self.to_pil, images.unbind(dim = 0)))
+
        if one_text:
            return images[0]

        return images
+
--- a/dalle2_pytorch/vqgan_vae.py
+++ b/dalle2_pytorch/vqgan_vae.py
@@ -545,6 +545,7 @@ class VQGanVAE(nn.Module):
        l2_recon_loss = False,
        use_hinge_loss = True,
        vgg = None,
+        vq_codebook_dim = 256,
        vq_codebook_size = 512,
        vq_decay = 0.8,
        vq_commitment_weight = 1.,
@@ -579,6 +580,7 @@ class VQGanVAE(nn.Module):

        self.vq = VQ(
            dim = self.enc_dec.encoded_dim,
+            codebook_dim = vq_codebook_dim,
            codebook_size = vq_codebook_size,
            decay = vq_decay,
            commitment_weight = vq_commitment_weight,
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.57',
+  version = '0.0.67',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
@@ -31,7 +31,7 @@ setup(
    'torchvision',
    'tqdm',
    'vector-quantize-pytorch',
-    'x-clip>=0.4.4',
+    'x-clip>=0.5.1',
    'youtokentome'
  ],
  classifiers=[
Author	SHA1	Message	Date
Phil Wang	39d3659ad9	now completely OpenAI CLIP compatible for training	2022-04-29 11:26:24 -07:00
Phil Wang	f4a54e475e	add some training fns	2022-04-29 09:44:55 -07:00
Phil Wang	fb662a62f3	fix another bug thanks to @xiankgx	2022-04-29 07:38:32 -07:00
Phil Wang	587c8c9b44	optimize for clarity	2022-04-28 21:59:13 -07:00
Phil Wang	aa900213e7	force first unet in the cascade to be conditioned on image embeds	2022-04-28 20:53:15 -07:00
Phil Wang	cb26187450	vqgan-vae codebook dims should be 256 or smaller	2022-04-28 08:59:03 -07:00
Phil Wang	625ce23f6b	🐛	2022-04-28 07:21:18 -07:00
Phil Wang	dbf4a281f1	make sure another CLIP can actually be passed in, as long as it is wrapped in an adapter extended from BaseClipAdapter	2022-04-27 20:45:27 -07:00
Phil Wang	4ab527e779	some extra asserts for text encoding of diffusion prior and decoder	2022-04-27 20:11:43 -07:00
Phil Wang	d0cdeb3247	add ability for DALL-E2 to return PIL images with `return_pil_images = True` on forward, for those who have no clue about deep learning	2022-04-27 19:58:06 -07:00
Phil Wang	8c610aad9a	only pass text encodings conditioning in diffusion prior if specified on initialization	2022-04-27 19:48:16 -07:00