🐛

make sure another CLIP can actually be passed in, as long as it is wrapped in an adapter extended from BaseClipAdapter
some extra asserts for text encoding of diffusion prior and decoder
2026-02-12 11:34:29 +01:00 · 2022-04-28 07:21:18 -07:00 · 2022-04-27 20:45:27 -07:00 · 2022-04-27 20:11:43 -07:00 · 2022-04-27 19:58:06 -07:00
2 changed files with 22 additions and 7 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -7,6 +7,7 @@ from contextlib import contextmanager
 import torch
 import torch.nn.functional as F
 from torch import nn, einsum
+import torchvision.transforms as T

 from einops import rearrange, repeat
 from einops.layers.torch import Rearrange
@@ -646,9 +647,12 @@ class DiffusionPrior(BaseGaussianDiffusion):
        )

        if exists(clip):
-            assert isinstance(clip, CLIP)
+            if isinstance(clip, CLIP):
+                clip = XClipAdapter(clip)
+
+            assert isinstance(clip, BaseClipAdapter)
            freeze_model_and_make_eval_(clip)
-            self.clip = XClipAdapter(clip)
+            self.clip = clip
        else:
            assert exists(image_embed_dim), 'latent dimension must be given, if training prior network without CLIP given'
            self.clip = None
@@ -739,7 +743,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
        text_cond = dict(text_embed = text_embed)

        if self.condition_on_text_encodings:
-            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
+            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text != 0}

        image_embeds = self.p_sample_loop((batch_size, image_embed_dim), text_cond = text_cond)
        text_embeds = text_cond['text_embed']
@@ -782,6 +786,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
        text_cond = dict(text_embed = text_embed)

        if self.condition_on_text_encodings:
+            assert exists(text_encodings), 'text encodings must be present for diffusion prior if specified'
            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}

        # timestep conditioning from ddpm
@@ -791,8 +796,7 @@ class DiffusionPrior(BaseGaussianDiffusion):

        # calculate forward loss

-        loss = self.p_losses(image_embed, times, text_cond = text_cond, *args, **kwargs)
-        return loss
+        return self.p_losses(image_embed, times, text_cond = text_cond, *args, **kwargs)

 # decoder

@@ -1247,6 +1251,8 @@ class Decoder(BaseGaussianDiffusion):
            clip = XClipAdapter(clip)

        freeze_model_and_make_eval_(clip)
+        assert isinstance(clip, BaseClipAdapter)
+
        self.clip = clip
        self.clip_image_size = clip.image_size
        self.channels = clip.image_channels
@@ -1417,6 +1423,7 @@ class Decoder(BaseGaussianDiffusion):
            _, text_encodings = self.clip.embed_text(text)

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
+        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

        img = None

@@ -1484,6 +1491,7 @@ class Decoder(BaseGaussianDiffusion):
            _, text_encodings = self.clip.embed_text(text)

        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
+        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

        lowres_cond_img = self.to_lowres_cond(image, target_image_size = target_image_size, downsample_image_size = self.image_sizes[unet_index - 1]) if unet_number > 1 else None
        image = resize_image_to(image, target_image_size)
@@ -1516,12 +1524,15 @@ class DALLE2(nn.Module):
        self.prior_num_samples = prior_num_samples
        self.decoder_need_text_cond = self.decoder.condition_on_text_encodings

+        self.to_pil = T.ToPILImage()
+
    @torch.no_grad()
    @eval_decorator
    def forward(
        self,
        text,
-        cond_scale = 1.
+        cond_scale = 1.,
+        return_pil_images = False
    ):
        device = next(self.parameters()).device
        one_text = isinstance(text, str) or (not is_list_str(text) and text.shape[0] == 1)
@@ -1535,7 +1546,11 @@ class DALLE2(nn.Module):
        text_cond = text if self.decoder_need_text_cond else None
        images = self.decoder.sample(image_embed, text = text_cond, cond_scale = cond_scale)

+        if return_pil_images:
+            images = list(map(self.to_pil, images.unbind(dim = 0)))
+
        if one_text:
            return images[0]

        return images
+
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.58',
+  version = '0.0.62',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	625ce23f6b	🐛	2022-04-28 07:21:18 -07:00
Phil Wang	dbf4a281f1	make sure another CLIP can actually be passed in, as long as it is wrapped in an adapter extended from BaseClipAdapter	2022-04-27 20:45:27 -07:00
Phil Wang	4ab527e779	some extra asserts for text encoding of diffusion prior and decoder	2022-04-27 20:11:43 -07:00
Phil Wang	d0cdeb3247	add ability for DALL-E2 to return PIL images with `return_pil_images = True` on forward, for those who have no clue about deep learning	2022-04-27 19:58:06 -07:00