fix issue with mixed precision and gradient clipping

address eval / test loss not being averaged across batches when training diffusion prior https://github.com/lucidrains/DALLE2-pytorch/issues/49
fix calculation of adaptive weight for vit-vqgan, thanks to @CiaoHe
2026-02-12 11:34:29 +01:00 · 2022-05-02 09:20:19 -07:00 · 2022-05-02 08:51:41 -07:00 · 2022-05-02 07:58:14 -07:00 · 2022-05-01 18:02:30 -07:00
5 changed files with 32 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -830,6 +830,7 @@ Once built, images will be saved to the same directory the command is invoked
 - [ ] extend diffusion head to use diffusion-gan (potentially using lightweight-gan) to speed up inference
 - [ ] bring in cross-scale embedding from iclr paper https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/crossformer.py#L14
 - [ ] figure out if possible to augment with external memory, as described in https://arxiv.org/abs/2204.11824
+- [ ] test out grid attention in cascading ddpm locally, decide whether to keep or remove

 ## Citations

--- a/dalle2_pytorch/train.py
+++ b/dalle2_pytorch/train.py
@@ -159,12 +159,13 @@ class DecoderTrainer(nn.Module):
        index = unet_number - 1
        unet = self.decoder.unets[index]

-        if exists(self.max_grad_norm):
-            nn.utils.clip_grad_norm_(unet.parameters(), self.max_grad_norm)
-
        optimizer = getattr(self, f'optim{index}')
        scaler = getattr(self, f'scaler{index}')

+        if exists(self.max_grad_norm):
+            scaler.unscale_(optimizer)
+            nn.utils.clip_grad_norm_(unet.parameters(), self.max_grad_norm)
+
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
--- a/dalle2_pytorch/vqgan_vae.py
+++ b/dalle2_pytorch/vqgan_vae.py
@@ -285,6 +285,10 @@ class ResnetEncDec(nn.Module):
    def get_encoded_fmap_size(self, image_size):
        return image_size // (2 ** self.layers)

+    @property
+    def last_dec_layer(self):
+        return self.decoders[-1].weight
+
    def encode(self, x):
        for enc in self.encoders:
            x = enc(x)
@@ -419,6 +423,10 @@ class ConvNextEncDec(nn.Module):
    def get_encoded_fmap_size(self, image_size):
        return image_size // (2 ** self.layers)

+    @property
+    def last_dec_layer(self):
+        return self.decoders[-1].weight
+
    def encode(self, x):
        for enc in self.encoders:
            x = enc(x)
@@ -606,6 +614,10 @@ class ViTEncDec(nn.Module):
    def get_encoded_fmap_size(self, image_size):
        return image_size // self.patch_size

+    @property
+    def last_dec_layer(self):
+        return self.decoder[-3][-1].weight
+
    def encode(self, x):
        return self.encoder(x)

@@ -843,7 +855,7 @@ class VQGanVAE(nn.Module):

        # calculate adaptive weight

-        last_dec_layer = self.decoders[-1].weight
+        last_dec_layer = self.enc_dec.last_dec_layer

        norm_grad_wrt_gen_loss = grad_layer_wrt_loss(gen_loss, last_dec_layer).norm(p = 2)
        norm_grad_wrt_perceptual_loss = grad_layer_wrt_loss(perceptual_loss, last_dec_layer).norm(p = 2)
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.0.88',
+  version = '0.0.90',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
--- a/train_diffusion_prior.py
+++ b/train_diffusion_prior.py
@@ -17,14 +17,24 @@ os.environ["WANDB_SILENT"] = "true"
 def eval_model(model,device,image_reader,text_reader,start,end,batch_size,loss_type,phase="Validation"):
    model.eval()
    with torch.no_grad():
-        for emb_images,emb_text in zip(image_reader(batch_size=batch_size, start=start, end=end),
+        total_loss = 0.
+        total_samples = 0.
+
+        for emb_images, emb_text in zip(image_reader(batch_size=batch_size, start=start, end=end),
                text_reader(batch_size=batch_size, start=start, end=end)):
+
            emb_images_tensor = torch.tensor(emb_images[0]).to(device)
            emb_text_tensor = torch.tensor(emb_text[0]).to(device)
+
+            batches = emb_images_tensor.shape[0]
+
            loss = model(text_embed = emb_text_tensor, image_embed = emb_images_tensor)

-            # Log to wandb
-            wandb.log({f'{phase} {loss_type}': loss})
+            total_loss += loss.item() * batches
+            total_samples += batches
+
+        avg_loss = (total_loss / total_samples)
+        wandb.log({f'{phase} {loss_type}': avg_loss})

 def save_model(save_path,state_dict):
    # Saving State Dict
Author	SHA1	Message	Date
Phil Wang	1924c7cc3d	fix issue with mixed precision and gradient clipping	2022-05-02 09:20:19 -07:00
Phil Wang	f7df3caaf3	address not calculating average eval / test loss when training diffusion prior https://github.com/lucidrains/DALLE2-pytorch/issues/49	2022-05-02 08:51:41 -07:00
Phil Wang	fc954ee788	fix calculation of adaptive weight for vit-vqgan, thanks to @CiaoHe	2022-05-02 07:58:14 -07:00
Phil Wang	c1db2753f5	todo	2022-05-01 18:02:30 -07:00