Compare commits


2 Commits

Author     SHA1        Message                                                                      Date
Phil Wang  1e4bb2bafb  cast long as float before deriving sinusoidal pos emb                       2022-07-05 18:01:22 -07:00
Phil Wang  ee75515c7d  remove forcing of softmax in f32, in case it is interfering with deepspeed  2022-07-05 16:53:58 -07:00
2 changed files with 4 additions and 4 deletions


@@ -704,7 +704,7 @@ class Attention(nn.Module):
sim = sim - sim.amax(dim = -1, keepdim = True).detach()
sim = sim * self.pb_relax_alpha
- attn = sim.softmax(dim = -1, dtype = torch.float32)
+ attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate values
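
Dropping the explicit dtype = torch.float32 argument means the attention weights now stay in whatever dtype the logits arrive in (for example fp16 under DeepSpeed mixed precision) rather than being upcast to fp32 inside the softmax. A minimal sketch of the behavioural difference, with illustrative tensor shapes that are not taken from the model (half-precision softmax may require a CUDA device on older PyTorch builds); the identical one-line change is applied to CrossAttention further down:

import torch

# attention logits as they would appear under fp16 mixed precision
sim = torch.randn(1, 8, 16, 16, dtype = torch.float16)

attn_old = sim.softmax(dim = -1, dtype = torch.float32)  # removed form: output is upcast to fp32
attn_new = sim.softmax(dim = -1)                         # new form: output keeps the input dtype

print(attn_old.dtype, attn_new.dtype)  # torch.float32 torch.float16
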
@@ -1130,7 +1130,7 @@ class SinusoidalPosEmb(nn.Module):
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, device = x.device) * -emb)
- emb = rearrange(x, 'i -> i 1') * rearrange(emb, 'j -> 1 j')
+ emb = rearrange(x.type_as(emb), 'i -> i 1') * rearrange(emb, 'j -> 1 j')
return torch.cat((emb.sin(), emb.cos()), dim = -1)
class Block(nn.Module):
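
The positional-embedding fix casts the incoming timestep tensor, which is typically integer-typed (long), to the same floating dtype as emb before the outer product, per the commit message "cast long as float before deriving sinusoidal pos emb". A minimal sketch of the patched module; only the forward body appears in the diff, so the constructor here is an assumption based on the usual shape of this module:

import math
import torch
from torch import nn
from einops import rearrange

class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        # assumed constructor: just stores the embedding dimension
        super().__init__()
        self.dim = dim

    def forward(self, x):
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device = x.device) * -emb)
        # cast the (possibly long) timesteps to emb's float dtype before the outer product
        emb = rearrange(x.type_as(emb), 'i -> i 1') * rearrange(emb, 'j -> 1 j')
        return torch.cat((emb.sin(), emb.cos()), dim = -1)

# e.g. integer diffusion timesteps in, (4, 32) float embeddings out
pos_emb = SinusoidalPosEmb(32)(torch.arange(4))
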
@@ -1272,7 +1272,7 @@ class CrossAttention(nn.Module):
sim = sim - sim.amax(dim = -1, keepdim = True).detach()
sim = sim * self.pb_relax_alpha
- attn = sim.softmax(dim = -1, dtype = torch.float32)
+ attn = sim.softmax(dim = -1)
out = einsum('b h i j, b h j d -> b h i d', attn, v)
out = rearrange(out, 'b h n d -> b n (h d)')


@@ -1 +1 @@
- __version__ = '0.16.6'
+ __version__ = '0.16.8'