add ability for DALL-E2 to return PIL images with `return_pil_images = True` on forward, for those who have no clue about deep learning

only pass text encodings conditioning in diffusion prior if specified on initialization
2026-02-21 08:14:29 +01:00 · 2022-04-27 19:57:27 -07:00 · 2022-04-27 19:48:16 -07:00
2 changed files with 20 additions and 12 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -7,6 +7,7 @@ from contextlib import contextmanager
 import torch
 import torch.nn.functional as F
 from torch import nn, einsum
+import torchvision.transforms as T
 from einops import rearrange, repeat
 from einops.layers.torch import Rearrange
@@ -736,11 +737,10 @@ class DiffusionPrior(BaseGaussianDiffusion):
        text_embed, text_encodings = self.clip.embed_text(text)
-        text_cond = dict(
+        text_cond = dict(text_embed = text_embed)
-            text_embed = text_embed,
+
-            text_encodings = text_encodings,
+        if self.condition_on_text_encodings:
-            mask = text != 0
+            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
-        )
        image_embeds = self.p_sample_loop((batch_size, image_embed_dim), text_cond = text_cond)
        text_embeds = text_cond['text_embed']
@@ -780,11 +780,10 @@ class DiffusionPrior(BaseGaussianDiffusion):
            text_embed, text_encodings = self.clip.embed_text(text)
            text_mask = text != 0
-        text_cond = dict(
+        text_cond = dict(text_embed = text_embed)
-            text_embed = text_embed,
+
-            text_encodings = text_encodings,
+        if self.condition_on_text_encodings:
-            mask = text_mask
+            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
-        )
        # timestep conditioning from ddpm
@@ -1518,12 +1517,15 @@ class DALLE2(nn.Module):
        self.prior_num_samples = prior_num_samples
        self.decoder_need_text_cond = self.decoder.condition_on_text_encodings
+        self.to_pil = T.ToPILImage()
    @torch.no_grad()
    @eval_decorator
    def forward(
        self,
        text,
-        cond_scale = 1.
+        cond_scale = 1.,
+        return_pil_images = False
    ):
        device = next(self.parameters()).device
        one_text = isinstance(text, str) or (not is_list_str(text) and text.shape[0] == 1)
@@ -1537,7 +1539,13 @@ class DALLE2(nn.Module):
        text_cond = text if self.decoder_need_text_cond else None
        images = self.decoder.sample(image_embed, text = text_cond, cond_scale = cond_scale)
+        if return_pil_images:
+            # do some magic - if the user passed in a string text, or a list of strings
+            # assume they do not know anything about tensors and return PIL Image(s)
+            images = list(map(self.to_pil, images.unbind(dim = 0)))
        if one_text:
            return images[0]
        return images
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.57',
+  version = '0.0.59',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	8c2015fd39	add ability for DALL-E2 to return PIL images with `return_pil_images = True` on forward, for those who have no clue about deep learning	2022-04-27 19:57:27 -07:00
Phil Wang	8c610aad9a	only pass text encodings conditioning in diffusion prior if specified on initialization	2022-04-27 19:48:16 -07:00