add the ability to do warmup steps for each unet during training

2026-02-13 21:34:21 +01:00 · 2022-07-05 16:20:49 -07:00
3 changed files with 15 additions and 29 deletions
--- a/dalle2_pytorch/dalle2_pytorch.py
+++ b/dalle2_pytorch/dalle2_pytorch.py
@@ -335,10 +335,6 @@ def approx_standard_normal_cdf(x):
 def discretized_gaussian_log_likelihood(x, *, means, log_scales, thres = 0.999):
    assert x.shape == means.shape == log_scales.shape

-    # attempting to correct nan gradients when learned variance is turned on
-    # in the setting of deepspeed fp16
-    eps = 1e-12 if x.dtype == torch.float32 else 1e-5
-
    centered_x = x - means
    inv_stdv = torch.exp(-log_scales)
    plus_in = inv_stdv * (centered_x + 1. / 255.)
@@ -353,7 +349,7 @@ def discretized_gaussian_log_likelihood(x, *, means, log_scales, thres = 0.999):
        log_cdf_plus,
        torch.where(x > thres,
            log_one_minus_cdf_min,
-            log(cdf_delta, eps = eps)))
+            log(cdf_delta)))

    return log_probs

@@ -708,7 +704,7 @@ class Attention(nn.Module):
        sim = sim - sim.amax(dim = -1, keepdim = True).detach()
        sim = sim * self.pb_relax_alpha

-        attn = sim.softmax(dim = -1)
+        attn = sim.softmax(dim = -1, dtype = torch.float32)
        attn = self.dropout(attn)

        # aggregate values
@@ -1131,12 +1127,11 @@ class SinusoidalPosEmb(nn.Module):
        self.dim = dim

    def forward(self, x):
-        dtype, device = x.dtype, x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, device = device, dtype = dtype) * -emb)
+        emb = torch.exp(torch.arange(half_dim, device = x.device) * -emb)
        emb = rearrange(x, 'i -> i 1') * rearrange(emb, 'j -> 1 j')
-        return torch.cat((emb.sin(), emb.cos()), dim = -1).type(dtype)
+        return torch.cat((emb.sin(), emb.cos()), dim = -1)

 class Block(nn.Module):
    def __init__(
@@ -1277,7 +1272,7 @@ class CrossAttention(nn.Module):
        sim = sim - sim.amax(dim = -1, keepdim = True).detach()
        sim = sim * self.pb_relax_alpha

-        attn = sim.softmax(dim = -1)
+        attn = sim.softmax(dim = -1, dtype = torch.float32)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
@@ -1631,7 +1626,6 @@ class Unet(nn.Module):

        # time conditioning

-        time = time.type_as(x)
        time_hiddens = self.to_time_hiddens(time)

        time_tokens = self.to_time_tokens(time_hiddens)
--- a/dalle2_pytorch/trainer.py
+++ b/dalle2_pytorch/trainer.py
@@ -173,26 +173,14 @@ class DiffusionPriorTrainer(nn.Module):
        super().__init__()
        assert isinstance(diffusion_prior, DiffusionPrior)
        assert not exists(accelerator) or isinstance(accelerator, Accelerator)
+        assert exists(accelerator) or exists(device), "You must supply some method of obtaining a device."
        ema_kwargs, kwargs = groupby_prefix_and_trim('ema_', kwargs)

-        # verbosity
-
-        self.verbose = verbose
-
        # assign some helpful member vars
-
        self.accelerator = accelerator
+        self.device = accelerator.device if exists(accelerator) else device
        self.text_conditioned = diffusion_prior.condition_on_text_encodings

-        # setting the device
-
-        if not exists(accelerator) and not exists(device):
-            diffusion_prior_device = next(diffusion_prior.parameters()).device
-            self.print(f'accelerator not given, and device not specified: defaulting to device of diffusion prior parameters - {diffusion_prior_device}')
-            self.device = diffusion_prior_device
-        else:
-            self.device = accelerator.device if exists(accelerator) else device
-
        # save model

        self.diffusion_prior = diffusion_prior
@@ -226,9 +214,13 @@ class DiffusionPriorTrainer(nn.Module):

        self.max_grad_norm = max_grad_norm

+        # verbosity
+
+        self.verbose = verbose
+
        # track steps internally

-        self.register_buffer('step', torch.tensor([0], device = self.device))
+        self.register_buffer('step', torch.tensor([0]))

    # accelerator wrappers

@@ -473,7 +465,7 @@ class DecoderTrainer(nn.Module):

        lr, wd, eps, warmup_steps = map(partial(cast_tuple, length = self.num_unets), (lr, wd, eps, warmup_steps))

-        assert all([unet_lr <= 1e-2 for unet_lr in lr]), 'your learning rate is too high, recommend sticking with 1e-4, at most 5e-4'
+        assert all([unet_lr < 1e-3 for unet_lr in lr]), 'your learning rate is too high, recommend sticking with 1e-4, at most 5e-4'

        optimizers = []
        schedulers = []
@@ -558,7 +550,7 @@ class DecoderTrainer(nn.Module):
        if only_model:
            return loaded_obj

-        for ind, last_step in zip(range(0, self.num_unets), self.steps.tolist()):
+        for ind, last_step in zip(range(0, self.num_unets), self.steps.cpu().unbind()):

            optimizer_key = f'optim{ind}'
            optimizer = getattr(self, optimizer_key)
--- a/dalle2_pytorch/version.py
+++ b/dalle2_pytorch/version.py
@@ -1 +1 @@
-__version__ = '0.16.13'
+__version__ = '0.16.5'