zero-initialize the final projection in the Unet, since OpenAI and @crowsonkb are both doing it

make it so that even if the text mask is omitted, it is derived from whether the text encodings are all 0s or not; this simplifies dataloading
Unet parameter count is now shown (#202)
2026-02-14 20:54:27 +01:00 · 2022-07-11 13:22:06 -07:00 · 2022-07-11 10:56:19 -07:00 · 2022-07-10 16:45:59 -07:00
3 changed files with 23 additions and 3 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -77,6 +77,11 @@ def cast_tuple(val, length = None):
 def module_device(module):
    return next(module.parameters()).device

+def zero_init_(m):
+    nn.init.zeros_(m.weight)
+    if exists(m.bias):
+        nn.init.zeros_(m.bias)
+
@contextmanager
 def null_context(*args, **kwargs):
    yield
@@ -220,6 +225,7 @@ class XClipAdapter(BaseClipAdapter):
        encoder_output = self.clip.text_transformer(text)
        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
        text_embed = self.clip.to_text_latent(text_cls)
+        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
        return EmbeddedText(l2norm(text_embed), text_encodings, text_mask)

    @torch.no_grad()
@@ -255,6 +261,7 @@ class CoCaAdapter(BaseClipAdapter):
        text = text[..., :self.max_text_len]
        text_mask = text != 0
        text_embed, text_encodings = self.clip.embed_text(text)
+        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
        return EmbeddedText(text_embed, text_encodings, text_mask)

    @torch.no_grad()
@@ -314,6 +321,7 @@ class OpenAIClipAdapter(BaseClipAdapter):

        text_embed = self.clip.encode_text(text)
        text_encodings = self.text_encodings
+        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
        del self.text_encodings
        return EmbeddedText(l2norm(text_embed.float()), text_encodings.float(), text_mask)

@@ -1197,6 +1205,7 @@ class DiffusionPrior(nn.Module):

        if self.condition_on_text_encodings:
            assert exists(text_encodings), 'text encodings must be present for diffusion prior if specified'
+            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
            text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}

        # timestep conditioning from ddpm
@@ -1665,6 +1674,8 @@ class Unet(nn.Module):
        self.final_resnet_block = ResnetBlock(dim * 2, dim, time_cond_dim = time_cond_dim, groups = top_level_resnet_group)
        self.to_out = nn.Conv2d(dim, self.channels_out, kernel_size = final_conv_kernel_size, padding = final_conv_kernel_size // 2)

+        zero_init_(self.to_out) # since both OpenAI and @crowsonkb are doing it
+
    # if the current settings for the unet are not correct
    # for cascading DDPM, then reinit the unet with the right settings
    def cast_model_parameters(
@@ -2410,6 +2421,9 @@ class Decoder(nn.Module):
        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

+        if self.condition_on_text_encodings:
+            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
+
        img = None
        is_cuda = next(self.parameters()).is_cuda

@@ -2493,6 +2507,9 @@ class Decoder(nn.Module):
        assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
        assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'

+        if self.condition_on_text_encodings:
+            text_mask = default(text_mask, lambda: torch.any(text_encodings != 0., dim = -1))
+
        lowres_cond_img = self.to_lowres_cond(image, target_image_size = target_image_size, downsample_image_size = self.image_sizes[unet_index - 1]) if unet_number > 1 else None
        image = resize_image_to(image, target_image_size)

--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.19.6'
+__version__ = '0.20.1'
--- a/train_decoder.py
+++ b/train_decoder.py
@@ -557,7 +557,7 @@ def initialize_training(config: TrainDecoderConfig, config_path):

    # Create the decoder model and print basic info
    decoder = config.decoder.create()
-    num_parameters = sum(p.numel() for p in decoder.parameters())
+    get_num_parameters = lambda model, only_training=False: sum(p.numel() for p in model.parameters() if (p.requires_grad or not only_training))

    # Create and initialize the tracker if we are the master
    tracker = create_tracker(accelerator, config, config_path, dummy = rank!=0)
@@ -586,7 +586,10 @@ def initialize_training(config: TrainDecoderConfig, config_path):
    accelerator.print(print_ribbon("Loaded Config", repeat=40))
    accelerator.print(f"Running training with {accelerator.num_processes} processes and {accelerator.distributed_type} distributed training")
    accelerator.print(f"Training using {data_source_string}. {'conditioned on text' if conditioning_on_text else 'not conditioned on text'}")
-    accelerator.print(f"Number of parameters: {num_parameters}")
+    accelerator.print(f"Number of parameters: {get_num_parameters(decoder)} total; {get_num_parameters(decoder, only_training=True)} training")
+    for i, unet in enumerate(decoder.unets):
+        accelerator.print(f"Unet {i} has {get_num_parameters(unet)} total; {get_num_parameters(unet, only_training=True)} training")
+
    train(dataloaders, decoder, accelerator,
        tracker=tracker,
        inference_device=accelerator.device,
Author	SHA1	Message	Date
Phil Wang	bdd62c24b3	zero-initialize the final projection in the Unet, since OpenAI and @crowsonkb are both doing it	2022-07-11 13:22:06 -07:00
Phil Wang	1f1557c614	make it so that even if the text mask is omitted, it is derived from whether the text encodings are all 0s or not; this simplifies dataloading	2022-07-11 10:56:19 -07:00
Aidan Dempster	1a217e99e3	Unet parameter count is now shown (#202)	2022-07-10 16:45:59 -07:00