Mirror of https://github.com/lucidrains/DALLE2-pytorch.git
Synced 2026-02-13 20:54:23 +01:00

Compare commits (3 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 1e4bb2bafb |  |
|  | ee75515c7d |  |
|  | ec68243479 |  |
@@ -704,7 +704,7 @@ class Attention(nn.Module):
         sim = sim - sim.amax(dim = -1, keepdim = True).detach()
         sim = sim * self.pb_relax_alpha
 
-        attn = sim.softmax(dim = -1, dtype = torch.float32)
+        attn = sim.softmax(dim = -1)
         attn = self.dropout(attn)
 
         # aggregate values
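For context, the lines around this change follow the PB-relax style stabilization: shift the attention logits by their (detached) row max, rescale by `pb_relax_alpha`, then take the softmax. Below is a minimal standalone sketch of that pattern, not the repo's full `Attention` module; the `alpha` default is a placeholder, since the module's actual value is not shown in the hunk.

```python
import torch

def stabilized_attention_weights(sim: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    # shift each row of logits by its (detached) max so exp() cannot overflow
    sim = sim - sim.amax(dim = -1, keepdim = True).detach()
    # rescale, as the hunk's pb_relax_alpha does
    sim = sim * alpha
    # after this commit the softmax runs in sim's own dtype
    # (previously it was forced to float32 via dtype = torch.float32)
    return sim.softmax(dim = -1)

sim = torch.randn(2, 8, 16, 16)   # (batch, heads, query, key) logits
attn = stabilized_attention_weights(sim)
print(attn.shape, attn.dtype)     # torch.Size([2, 8, 16, 16]) torch.float32
```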
@@ -1130,7 +1130,7 @@ class SinusoidalPosEmb(nn.Module):
         half_dim = self.dim // 2
         emb = math.log(10000) / (half_dim - 1)
         emb = torch.exp(torch.arange(half_dim, device = x.device) * -emb)
-        emb = rearrange(x, 'i -> i 1') * rearrange(emb, 'j -> 1 j')
+        emb = rearrange(x.type_as(emb), 'i -> i 1') * rearrange(emb, 'j -> 1 j')
         return torch.cat((emb.sin(), emb.cos()), dim = -1)
 
 class Block(nn.Module):
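The one-line change here casts the timestep tensor `x` to the dtype of the frequency table via `type_as`, so integer or half-precision timesteps broadcast cleanly against the float embedding. A self-contained sketch reconstructed from the lines in the hunk; the `__init__` is an assumption, since only the forward body appears above.

```python
import math
import torch
from torch import nn
from einops import rearrange

class SinusoidalPosEmb(nn.Module):
    # minimal reconstruction following the hunk; the module in the repo may differ
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device = x.device) * -emb)
        # the fix: cast x to emb's dtype so non-float timesteps multiply cleanly
        emb = rearrange(x.type_as(emb), 'i -> i 1') * rearrange(emb, 'j -> 1 j')
        return torch.cat((emb.sin(), emb.cos()), dim = -1)

pos_emb = SinusoidalPosEmb(16)
print(pos_emb(torch.arange(4)).shape)  # torch.Size([4, 16])
```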
@@ -1272,7 +1272,7 @@ class CrossAttention(nn.Module):
         sim = sim - sim.amax(dim = -1, keepdim = True).detach()
         sim = sim * self.pb_relax_alpha
 
-        attn = sim.softmax(dim = -1, dtype = torch.float32)
+        attn = sim.softmax(dim = -1)
 
         out = einsum('b h i j, b h j d -> b h i d', attn, v)
         out = rearrange(out, 'b h n d -> b n (h d)')
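Both this hunk and the `Attention` one above drop the explicit float32 upcast from the softmax call. The practical difference under half precision is the dtype of the attention weights that then feed the einsum with `v`; a quick illustration of the two call forms, independent of the repo's code:

```python
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
sim = torch.randn(2, 4, 4, device = device, dtype = torch.float16)

# old call: result is upcast to float32 regardless of the input dtype
print(sim.softmax(dim = -1, dtype = torch.float32).dtype)  # torch.float32

# new call: result stays in the input dtype
print(sim.softmax(dim = -1).dtype)                         # torch.float16
```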
@@ -550,7 +550,7 @@ class DecoderTrainer(nn.Module):
         if only_model:
             return loaded_obj
 
-        for ind, last_step in zip(range(0, self.num_unets), self.steps.cpu().unbind()):
+        for ind, last_step in zip(range(0, self.num_unets), self.steps.tolist()):
 
             optimizer_key = f'optim{ind}'
             optimizer = getattr(self, optimizer_key)
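This load path iterates over the per-unet step counters. The change swaps `self.steps.cpu().unbind()`, which yields a tuple of 0-dim tensors, for `self.steps.tolist()`, which yields plain Python ints and already implies a copy to CPU. A tiny illustration with a stand-in `steps` tensor:

```python
import torch

steps = torch.tensor([100, 250, 0])   # stand-in for self.steps, one counter per unet

print(steps.cpu().unbind())           # (tensor(100), tensor(250), tensor(0)) -- 0-dim tensors
print(steps.tolist())                 # [100, 250, 0] -- plain Python ints

for ind, last_step in zip(range(0, 3), steps.tolist()):
    print(ind, last_step)
```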
@@ -1 +1 @@
-__version__ = '0.16.5'
+__version__ = '0.16.8'