Compare commits


5 Commits

Author     SHA1        Message                                                                                     Date
Phil Wang  14ddbc159c  cleanup                                                                                     2022-04-13 18:24:32 -07:00
Phil Wang  0692f1699f  favorite quote                                                                              2022-04-13 18:17:59 -07:00
Phil Wang  26c4534bc3  readme                                                                                      2022-04-13 18:11:55 -07:00
Phil Wang  5e06cde4cb  always work in the l2normed space for image and text embeddings                            2022-04-13 18:08:42 -07:00
Phil Wang  a1a8a78f21  fix everything and make sure it runs end to end, document everything in readme for public  2022-04-13 18:05:25 -07:00
3 changed files with 10 additions and 9 deletions

README.md

@@ -199,7 +199,7 @@ dalle2 = DALLE2(
     decoder = decoder
 )

-# send the text as a string if you want to use the simple tokenizer from DALL-E1
+# send the text as a string if you want to use the simple tokenizer from DALLE v1
 # or you can do it as token ids, if you have your own tokenizer

 texts = ['glistening morning dew on a flower petal']
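Both input styles the comment above describes look identical at the call site; a minimal sketch, assuming the `dalle2` instance assembled earlier in the README, with `token_ids` as a hypothetical integer tensor from your own tokenizer (padding id 0, matching the `mask = text != 0` logic in the model diff below):

```python
# option 1: raw strings, tokenized internally with the simple DALL-E v1 tokenizer
images = dalle2(['glistening morning dew on a flower petal'])

# option 2: precomputed token ids from your own tokenizer
# (`token_ids` is hypothetical: an integer tensor of shape (batch, seq_len), 0 = padding)
images = dalle2(token_ids)
```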
@@ -212,10 +212,7 @@ Let's see the whole script below
 ```python
-import torch
-from dalle2_pytorch.dalle2_pytorch import DALLE2
-from dalle2_pytorch import DiffusionPriorNetwork, DiffusionPrior, Unet, Decoder, CLIP
+import torch
+from dalle2_pytorch import DALLE2, DiffusionPriorNetwork, DiffusionPrior, Unet, Decoder, CLIP

 clip = CLIP(
     dim_text = 512,
@@ -304,6 +301,8 @@ images = dalle2(['cute puppy chasing after a squirrel'])
 Everything in this readme should run without error
+
+For the layperson, no worries, training will all be automated into a CLI tool, at least for small scale training.

 ## Training CLI (wip)

 <a href="https://github.com/lucidrains/stylegan2-pytorch">template</a>
@@ -365,3 +364,5 @@ Everything in this readme should run without error
     primaryClass = {cs.LG}
 }
 ```
+
+*Creating noise from data is easy; creating data from noise is generative modeling.* - Yang Song's <a href="https://arxiv.org/abs/2011.13456">paper</a>

dalle2_pytorch/dalle2_pytorch.py

@@ -374,12 +374,13 @@ class DiffusionPrior(nn.Module):
         image_encoding = self.clip.visual_transformer(image)
         image_cls = image_encoding[:, 0]
         image_embed = self.clip.to_visual_latent(image_cls)
-        return image_embed
+        return l2norm(image_embed)

     def get_text_cond(self, text):
         text_encodings = self.clip.text_transformer(text)
         text_cls, text_encodings = text_encodings[:, 0], text_encodings[:, 1:]
         text_embed = self.clip.to_text_latent(text_cls)
+        text_embed = l2norm(text_embed)
         return dict(text_encodings = text_encodings, text_embed = text_embed, mask = text != 0)

     def q_mean_variance(self, x_start, t):
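The `l2norm` helper these changes call is not shown in this diff; a minimal sketch of what it plausibly does, assuming the usual unit normalization along the feature dimension:

```python
import torch.nn.functional as F

def l2norm(t):
    # scale each embedding to unit length along the last (feature) dimension,
    # so image and text embeddings live on the same unit hypersphere
    return F.normalize(t, dim = -1)
```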
@@ -750,7 +751,7 @@ class Decoder(nn.Module):
         image_encoding = self.clip.visual_transformer(image)
         image_cls = image_encoding[:, 0]
         image_embed = self.clip.to_visual_latent(image_cls)
-        return image_embed
+        return l2norm(image_embed)

     def q_mean_variance(self, x_start, t):
         mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
@@ -876,7 +877,6 @@ class DALLE2(nn.Module):
         text = [text] if not isinstance(text, (list, tuple)) else text
         text = tokenizer.tokenize(text).to(device)
-        print(text.shape, type(text))

         image_embed = self.prior.sample(text, num_samples_per_batch = self.prior_num_samples)
         images = self.decoder.sample(image_embed)
         return images
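Put together, inference stays a single forward call; a minimal usage sketch, assuming a `DALLE2` instance wired with the `prior` and `decoder` built in the README:

```python
dalle2 = DALLE2(
    prior = prior,
    decoder = decoder
)

# internally: tokenize the text, sample `prior_num_samples` image embedding
# candidates per text with the prior, then render images with the decoder
images = dalle2(['cute puppy chasing after a squirrel'])
```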

setup.py

@@ -10,7 +10,7 @@ setup(
       'dream = dalle2_pytorch.cli:dream'
     ],
   },
-  version = '0.0.4',
+  version = '0.0.6',
   license='MIT',
   description = 'DALL-E 2',
   author = 'Phil Wang',