lower default initial learning rate to what Jonathan Ho had in his original repo

always use sandwich norm for attention layer
use post-attn-branch layernorm in attempt to stabilize cross attention conditioning in decoder
make sure lowres conditioning image is properly normalized to -1 to 1 for cascading ddpm
2026-02-12 19:44:26 +01:00 · 2022-05-14 13:22:43 -07:00 · 2022-05-14 12:13:41 -07:00 · 2022-05-14 11:58:09 -07:00 · 2022-05-14 01:23:54 -07:00
4 changed files with 23 additions and 14 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -1,7 +1,7 @@
 import math
 from tqdm import tqdm
 from inspect import isfunction
-from functools import partial
+from functools import partial, wraps
 from contextlib import contextmanager
 from collections import namedtuple
 from pathlib import Path
@@ -45,6 +45,14 @@ def exists(val):
 def identity(t, *args, **kwargs):
    return t

+def maybe(fn):
+    @wraps(fn)
+    def inner(x):
+        if not exists(x):
+            return x
+        return fn(x)
+    return inner
+
 def default(val, d):
    if exists(val):
        return val
@@ -606,7 +614,6 @@ class Attention(nn.Module):
        heads = 8,
        dropout = 0.,
        causal = False,
-        post_norm = False,
        rotary_emb = None
    ):
        super().__init__()
@@ -616,7 +623,6 @@ class Attention(nn.Module):

        self.causal = causal
        self.norm = LayerNorm(dim)
-        self.post_norm = LayerNorm(dim)     # sandwich norm from Coqview paper + Normformer
        self.dropout = nn.Dropout(dropout)

        self.null_kv = nn.Parameter(torch.randn(2, dim_head))
@@ -627,7 +633,7 @@ class Attention(nn.Module):

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim, bias = False),
-            LayerNorm(dim) if post_norm else nn.Identity()
+            LayerNorm(dim)
        )

    def forward(self, x, mask = None, attn_bias = None):
@@ -684,8 +690,7 @@ class Attention(nn.Module):
        out = einsum('b h i j, b j d -> b h i d', attn, v)

        out = rearrange(out, 'b h n d -> b n (h d)')
-        out = self.to_out(out)
-        return self.post_norm(out)
+        return self.to_out(out)

 class CausalTransformer(nn.Module):
    def __init__(
@@ -711,7 +716,7 @@ class CausalTransformer(nn.Module):
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
-                Attention(dim = dim, causal = True, dim_head = dim_head, heads = heads, dropout = attn_dropout, post_norm = normformer, rotary_emb = rotary_emb),
+                Attention(dim = dim, causal = True, dim_head = dim_head, heads = heads, dropout = attn_dropout, rotary_emb = rotary_emb),
                FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout, post_activation_norm = normformer)
            ]))

@@ -1173,7 +1178,11 @@ class CrossAttention(nn.Module):
        self.null_kv = nn.Parameter(torch.randn(2, dim_head))
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias = False)
-        self.to_out = nn.Linear(inner_dim, dim, bias = False)
+
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, dim, bias = False),
+            LayerNorm(dim)
+        )

    def forward(self, x, context, mask = None):
        b, n, device = *x.shape[:2], x.device
@@ -1844,6 +1853,8 @@ class Decoder(BaseGaussianDiffusion):
        b = shape[0]
        img = torch.randn(shape, device = device)

+        lowres_cond_img = maybe(normalize_neg_one_to_one)(lowres_cond_img)
+
        for i in tqdm(reversed(range(0, self.num_timesteps)), desc = 'sampling loop time step', total = self.num_timesteps):
            img = self.p_sample(
                unet,
@@ -1868,9 +1879,7 @@ class Decoder(BaseGaussianDiffusion):
        # normalize to [-1, 1]

        x_start = normalize_neg_one_to_one(x_start)
-
-        if exists(lowres_cond_img):
-            lowres_cond_img = normalize_neg_one_to_one(lowres_cond_img)
+        lowres_cond_img = maybe(normalize_neg_one_to_one)(lowres_cond_img)

        # get x_t

--- a/dalle2_pytorch/optimizer.py
+++ b/dalle2_pytorch/optimizer.py
@@ -7,7 +7,7 @@ def separate_weight_decayable_params(params):

 def get_optimizer(
    params,
-    lr = 3e-4,
+    lr = 2e-5,
    wd = 1e-2,
    betas = (0.9, 0.999),
    filter_by_requires_grad = False
--- a/dalle2_pytorch/train.py
+++ b/dalle2_pytorch/train.py
@@ -221,7 +221,7 @@ class DecoderTrainer(nn.Module):
        self,
        decoder,
        use_ema = True,
-        lr = 3e-4,
+        lr = 2e-5,
        wd = 1e-2,
        max_grad_norm = None,
        amp = False,
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
      'dream = dalle2_pytorch.cli:dream'
    ],
  },
-  version = '0.2.16',
+  version = '0.2.20',
  license='MIT',
  description = 'DALL-E 2',
  author = 'Phil Wang',
Author	SHA1	Message	Date
Phil Wang	591d37e266	lower default initial learning rate to what Jonathan Ho had in his original repo	2022-05-14 13:22:43 -07:00
Phil Wang	d1f02e8f49	always use sandwich norm for attention layer	2022-05-14 12:13:41 -07:00
Phil Wang	9faab59b23	use post-attn-branch layernorm in attempt to stabilize cross attention conditioning in decoder	2022-05-14 11:58:09 -07:00
Phil Wang	5d27029e98	make sure lowres conditioning image is properly normalized to -1 to 1 for cascading ddpm	2022-05-14 01:23:54 -07:00