Mirror of https://github.com/lucidrains/DALLE2-pytorch.git (synced 2026-02-13 20:34:22 +01:00)
Compare commits

6 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 46a2558d53 | |
| | 86109646e3 | |
| | 6a11b9678b | |
| | b90364695d | |
| | 868c001199 | |
| | 032e83b0e0 | |
README.md (16 lines changed)
```diff
@@ -368,7 +368,8 @@ unet1 = Unet(
     image_embed_dim = 512,
     cond_dim = 128,
     channels = 3,
-    dim_mults=(1, 2, 4, 8)
+    dim_mults=(1, 2, 4, 8),
+    cond_on_text_encodings = True # set to True for any unets that need to be conditioned on text encodings
 ).cuda()
 
 unet2 = Unet(
@@ -385,8 +386,7 @@ decoder = Decoder(
     clip = clip,
     timesteps = 100,
     image_cond_drop_prob = 0.1,
-    text_cond_drop_prob = 0.5,
-    condition_on_text_encodings = False # set this to True if you wish to condition on text during training and sampling
+    text_cond_drop_prob = 0.5
 ).cuda()
 
 for unet_number in (1, 2):
```
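Together, these two hunks move text conditioning from a single `condition_on_text_encodings` flag on the `Decoder` to a per-unet `cond_on_text_encodings` setting. A minimal sketch of the updated usage, assuming a `clip` adapter built as in the README's earlier examples (the remaining hyperparameter values are illustrative, not prescribed):

```python
# sketch: text conditioning is now declared on each Unet;
# the Decoder no longer takes condition_on_text_encodings itself
unet1 = Unet(
    dim = 128,                      # illustrative
    image_embed_dim = 512,
    cond_dim = 128,
    channels = 3,
    dim_mults = (1, 2, 4, 8),
    cond_on_text_encodings = True   # this unet is conditioned on text
).cuda()

decoder = Decoder(
    unet = unet1,
    image_sizes = (128,),           # illustrative
    clip = clip,                    # assumed: the CLIP adapter from earlier in the README
    timesteps = 100,
    image_cond_drop_prob = 0.1,
    text_cond_drop_prob = 0.5       # note: no condition_on_text_encodings here
).cuda()
```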
````diff
@@ -1189,4 +1189,14 @@ Once built, images will be saved to the same directory the command is invoked
 }
 ```
+
+```bibtex
+@article{Saharia2021PaletteID,
+    title   = {Palette: Image-to-Image Diffusion Models},
+    author  = {Chitwan Saharia and William Chan and Huiwen Chang and Chris A. Lee and Jonathan Ho and Tim Salimans and David J. Fleet and Mohammad Norouzi},
+    journal = {ArXiv},
+    year    = {2021},
+    volume  = {abs/2111.05826}
+}
+```
+
 *Creating noise from data is easy; creating data from noise is generative modeling.* - <a href="https://arxiv.org/abs/2011.13456">Yang Song's paper</a>
````
dalle2_pytorch/dalle2_pytorch.py

```diff
@@ -1359,6 +1359,7 @@ class Unet(nn.Module):
         cross_embed_downsample = False,
         cross_embed_downsample_kernel_sizes = (2, 4),
         memory_efficient = False,
+        scale_skip_connection = False,
         **kwargs
     ):
         super().__init__()
```
```diff
@@ -1440,6 +1441,10 @@ class Unet(nn.Module):
         self.max_text_len = max_text_len
         self.null_text_embed = nn.Parameter(torch.randn(1, max_text_len, cond_dim))
 
+        # whether to scale skip connection, adopted in Imagen
+
+        self.skip_connect_scale = 1. if not scale_skip_connection else (2 ** -0.5)
+
         # attention related params
 
         attn_kwargs = dict(heads = attn_heads, dim_head = attn_dim_head)
```
```diff
@@ -1687,7 +1692,9 @@ class Unet(nn.Module):
         x = self.mid_block2(x, mid_c, t)
 
         for init_block, sparse_attn, resnet_blocks, upsample in self.ups:
-            x = torch.cat((x, hiddens.pop()), dim = 1)
+            skip_connect = hiddens.pop() * self.skip_connect_scale
+
+            x = torch.cat((x, skip_connect), dim = 1)
             x = init_block(x, c, t)
             x = sparse_attn(x)
 
```
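The `2 ** -0.5` factor is the skip-connection scaling Imagen reports as aiding convergence: saved encoder features are multiplied by 1/sqrt(2) before being concatenated back in on the upsampling path. A standalone sketch of the tensor arithmetic above (shapes are illustrative, not taken from the model):

```python
import torch

scale_skip_connection = True
skip_connect_scale = 1. if not scale_skip_connection else (2 ** -0.5)

x = torch.randn(2, 64, 32, 32)          # feature map on the upsampling path
hiddens = [torch.randn(2, 64, 32, 32)]  # saved encoder features (the "hiddens" stack)

skip_connect = hiddens.pop() * skip_connect_scale  # scale by 1/sqrt(2) ~= 0.707
x = torch.cat((x, skip_connect), dim = 1)          # concat on channels: 64 -> 128

print(x.shape)  # torch.Size([2, 128, 32, 32])
```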
```diff
@@ -1781,13 +1788,6 @@ class Decoder(nn.Module):
     ):
         super().__init__()
 
-        self.unconditional = unconditional
-
-        # text conditioning
-
-        assert not (condition_on_text_encodings and unconditional), 'unconditional decoder image generation cannot be set to True if conditioning on text is present'
-        self.condition_on_text_encodings = condition_on_text_encodings
-
         # clip
 
         self.clip = None
```
```diff
@@ -1819,12 +1819,16 @@ class Decoder(nn.Module):
 
         self.channels = channels
 
-        # automatically take care of ensuring that first unet is unconditional
-        # while the rest of the unets are conditioned on the low resolution image produced by previous unet
+        # verify conditioning method
 
         unets = cast_tuple(unet)
         num_unets = len(unets)
 
+        self.unconditional = unconditional
+
+        # automatically take care of ensuring that first unet is unconditional
+        # while the rest of the unets are conditioned on the low resolution image produced by previous unet
+
         vaes = pad_tuple_to_length(cast_tuple(vae), len(unets), fillvalue = NullVQGanVAE(channels = self.channels))
 
         # whether to use learned variance, defaults to True for the first unet in the cascade, as in paper
```
```diff
@@ -1852,7 +1856,7 @@ class Decoder(nn.Module):
             one_unet = one_unet.cast_model_parameters(
                 lowres_cond = not is_first,
                 cond_on_image_embeds = not unconditional and is_first,
-                cond_on_text_encodings = not unconditional and (is_first or one_unet.cond_on_text_encodings),
+                cond_on_text_encodings = not unconditional and one_unet.cond_on_text_encodings,
                 channels = unet_channels,
                 channels_out = unet_channels_out
             )
```
```diff
@@ -1860,6 +1864,10 @@ class Decoder(nn.Module):
             self.unets.append(one_unet)
             self.vaes.append(one_vae.copy_for_eval())
 
+        # determine from unets whether conditioning on text encoding is needed
+
+        self.condition_on_text_encodings = any([unet.cond_on_text_encodings for unet in self.unets])
+
         # create noise schedulers per unet
 
         if not exists(beta_schedule):
```
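With these hunks the `Decoder` no longer takes `condition_on_text_encodings` as an argument: `cast_model_parameters` stops forcing text conditioning onto the first unet, and the flag is instead derived from whichever unets declare `cond_on_text_encodings`. A sketch of the resulting behavior (constructor arguments besides the unets are illustrative):

```python
# hypothetical two-stage cascade: only the base unet conditions on text
unet1 = Unet(dim = 32, image_embed_dim = 512, dim_mults = (1, 2),
             cond_on_text_encodings = True)
unet2 = Unet(dim = 32, image_embed_dim = 512, dim_mults = (1, 2))

decoder = Decoder(
    unet = (unet1, unet2),
    image_sizes = (64, 256),  # illustrative
    clip = clip,              # assumed: a CLIP adapter built as in the README
    timesteps = 100
)

# derived from the unets rather than passed in
print(decoder.condition_on_text_encodings)  # True, because unet1 conditions on text
```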
dalle2_pytorch/train_configs.py

```diff
@@ -158,6 +158,8 @@ class UnetConfig(BaseModel):
     dim: int
     dim_mults: ListOrTuple(int)
     image_embed_dim: int = None
+    text_embed_dim: int = None
+    cond_on_text_encodings: bool = None
     cond_dim: int = None
     channels: int = 3
     attn_dim_head: int = 32
```
```diff
@@ -170,7 +172,6 @@ class DecoderConfig(BaseModel):
     unets: ListOrTuple(UnetConfig)
     image_size: int = None
     image_sizes: ListOrTuple(int) = None
-    condition_on_text_encodings: bool = False
     clip: Optional[AdapterConfig]  # The clip model to use if embeddings are not provided
     channels: int = 3
     timesteps: int = 1000
```
```diff
@@ -283,21 +284,27 @@ class TrainDecoderConfig(BaseModel):
     def check_has_embeddings(cls, values):
         # Makes sure that enough information is provided to get the embeddings specified for training
         data_config, decoder_config = values.get('data'), values.get('decoder')
-        if data_config is None or decoder_config is None:
+
+        if not exists(data_config) or not exists(decoder_config):
             # Then something else errored and we should just pass through
             return values
-        using_text_embeddings = decoder_config.condition_on_text_encodings
+
+        using_text_embeddings = any([unet.cond_on_text_encodings for unet in decoder_config.unets])
         using_clip = exists(decoder_config.clip)
         img_emb_url = data_config.img_embeddings_url
         text_emb_url = data_config.text_embeddings_url
+
         if using_text_embeddings:
             # Then we need some way to get the embeddings
-            assert using_clip or text_emb_url is not None, 'If condition_on_text_encodings is true, either clip or text_embeddings_url must be provided'
+            assert using_clip or exists(text_emb_url), 'If text conditioning, either clip or text_embeddings_url must be provided'
+
         if using_clip:
             if using_text_embeddings:
-                assert text_emb_url is None or img_emb_url is None, 'Loaded clip, but also provided text_embeddings_url and img_embeddings_url. This is redundant. Remove the clip model or the embeddings'
+                assert not exists(text_emb_url) or not exists(img_emb_url), 'Loaded clip, but also provided text_embeddings_url and img_embeddings_url. This is redundant. Remove the clip model or the text embeddings'
             else:
-                assert img_emb_url is None, 'Loaded clip, but also provided img_embeddings_url. This is redundant. Remove the clip model or the embeddings'
+                assert not exists(img_emb_url), 'Loaded clip, but also provided img_embeddings_url. This is redundant. Remove the clip model or the embeddings'
+
         if text_emb_url:
             assert using_text_embeddings, "Text embeddings are being loaded, but text embeddings are not being conditioned on. This will slow down the dataloader for no reason."
+
         return values
```
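For training configs this is a schema change: `condition_on_text_encodings` disappears from the decoder section, and each unet entry gains optional `cond_on_text_encodings` and `text_embed_dim` fields. A hypothetical decoder fragment under the new schema (all values are illustrative):

```python
# hypothetical "decoder" section of a training config under the new schema
decoder_config = {
    "unets": [
        {   # base unet: conditioned on text
            "dim": 128,
            "dim_mults": [1, 2, 4, 8],
            "image_embed_dim": 512,
            "cond_on_text_encodings": True,
        },
        {   # upsampler unet: image-conditioned only
            "dim": 64,
            "dim_mults": [1, 2, 4],
        },
    ],
    "image_sizes": [64, 256],
    "timesteps": 1000,
    # note: no decoder-level "condition_on_text_encodings" field anymore
}
```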
dalle2_pytorch/version.py

```diff
@@ -1 +1 @@
-__version__ = '0.12.0'
+__version__ = '0.12.4'
```
train_decoder.py

```diff
@@ -596,9 +596,11 @@ def initialize_training(config, config_path):
 
     has_img_embeddings = config.data.img_embeddings_url is not None
     has_text_embeddings = config.data.text_embeddings_url is not None
-    conditioning_on_text = config.decoder.condition_on_text_encodings
+    conditioning_on_text = any([unet.cond_on_text_encodings for unet in config.decoder.unets])
+
     has_clip_model = config.decoder.clip is not None
     data_source_string = ""
+
     if has_img_embeddings:
         data_source_string += "precomputed image embeddings"
     elif has_clip_model:
```
```diff
@@ -622,7 +624,7 @@ def initialize_training(config, config_path):
         inference_device=accelerator.device,
         load_config=config.load,
         evaluate_config=config.evaluate,
-        condition_on_text_encodings=config.decoder.condition_on_text_encodings,
+        condition_on_text_encodings=conditioning_on_text,
         **config.train.dict(),
     )
 
```