align the ema model device back after sampling from the cascading ddpm in the decoder

default to l2 loss
fix typo in README.md (#85)
2026-02-12 11:34:29 +01:00 · 2022-05-11 19:56:54 -07:00 · 2022-05-11 19:24:51 -07:00 · 2022-05-11 13:38:16 -07:00 · 2022-05-11 08:21:39 -07:00
4 changed files with 18 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -508,7 +508,7 @@ To use a pretrained OpenAI CLIP, simply import `OpenAIClipAdapter` and pass it i
 import torch
 from dalle2_pytorch import DALLE2, DiffusionPriorNetwork, DiffusionPrior, Unet, Decoder, OpenAIClipAdapter

-# openai pretrained clip - defaults to ViT/B-32
+# openai pretrained clip - defaults to ViT-B/32

 clip = OpenAIClipAdapter()

--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -831,7 +831,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
        image_channels = 3,
        timesteps = 1000,
        cond_drop_prob = 0.,
-        loss_type = "l1",
+        loss_type = "l2",
        predict_x_start = True,
        beta_schedule = "cosine",
        condition_on_text_encodings = True, # the paper suggests this is needed, but you can turn it off for your CLIP preprocessed text embed -> image embed training
@@ -1492,11 +1492,12 @@ class Unet(nn.Module):

        if self.cond_on_image_embeds:
            image_tokens = self.image_to_cond(image_embed)
+            null_image_embed = self.null_image_embed.to(image_tokens.dtype) # for some reason pytorch AMP not working

            image_tokens = torch.where(
                image_keep_mask,
                image_tokens,
-                self.null_image_embed
+                null_image_embed
            )

        # take care of text encodings (optional)
@@ -1520,10 +1521,12 @@ class Unet(nn.Module):
                text_mask = rearrange(text_mask, 'b n -> b n 1')
                text_keep_mask = text_mask & text_keep_mask

+            null_text_embed = self.null_text_embed.to(text_tokens.dtype) # for some reason pytorch AMP not working
+
            text_tokens = torch.where(
                text_keep_mask,
                text_tokens,
-                self.null_text_embed
+                null_text_embed
            )

        # main conditioning tokens (c)
@@ -1611,7 +1614,7 @@ class Decoder(BaseGaussianDiffusion):
        timesteps = 1000,
        image_cond_drop_prob = 0.1,
        text_cond_drop_prob = 0.5,
-        loss_type = 'l1',
+        loss_type = 'l2',
        beta_schedule = 'cosine',
        predict_x_start = False,
        predict_x_start_for_latent_diffusion = False,
--- a/dalle2_pytorch/train.py
+++ b/dalle2_pytorch/train.py
@@ -105,6 +105,10 @@ class EMA(nn.Module):
        self.register_buffer('initted', torch.Tensor([False]))
        self.register_buffer('step', torch.tensor([0.]))

+    def restore_ema_model_device(self):
+        device = self.initted.device
+        self.ema_model.to(device)
+
    def update(self):
        self.step += 1

@@ -305,6 +309,11 @@ class DecoderTrainer(nn.Module):

        if self.use_ema:
            self.decoder.unets = trainable_unets             # restore original training unets
+
+        # cast the ema_model unets back to original device
+        for ema in self.ema_unets:
+            ema.restore_ema_model_device()
+
        return output

    def forward(
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.2.9',
+  version = '0.2.12',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	924455d97d	align the ema model device back after sampling from the cascading ddpm in the decoder	2022-05-11 19:56:54 -07:00
Phil Wang	6021945fc8	default to l2 loss	2022-05-11 19:24:51 -07:00
Light-V	6f76652d11	fix typo in README.md (#85) - The default config for clip from openai should be ViT-B/32	2022-05-11 13:38:16 -07:00
Phil Wang	3dda2570ed	fix amp issue for https://github.com/lucidrains/DALLE2-pytorch/issues/82	2022-05-11 08:21:39 -07:00