Compare commits

...

5 Commits
1.8.1 ... 1.8.4

Author      SHA1        Message                                                                  Date
Phil Wang   083508ff8e  cast attention matrix back to original dtype pre-softmax in attention   2022-08-20 10:56:01 -07:00
Phil Wang   7762edd0ff  make it work for @ethancohen123                                          2022-08-19 11:28:58 -07:00
Phil Wang   de5e628773  cite einops                                                              2022-08-17 08:58:41 -07:00
Phil Wang   1b4046b039  gratitude                                                                2022-08-17 08:57:33 -07:00
Phil Wang   27f19ba7fa  make sure diffusion prior trainer can operate with no warmup            2022-08-15 14:27:40 -07:00
4 changed files with 26 additions and 5 deletions

View File

@@ -49,6 +49,7 @@ This library would not have gotten to this working state without the help of
 - <a href="https://github.com/crowsonkb">Katherine</a> for her advice
 - <a href="https://stability.ai/">Stability AI</a> for the generous sponsorship
 - <a href="https://huggingface.co">🤗 Huggingface</a> and in particular <a href="https://github.com/sgugger">Sylvain</a> for the <a href="https://github.com/huggingface/accelerate">Accelerate</a> library
+- <a href="https://github.com/arogozhnikov">Alex</a> for <a href="https://github.com/arogozhnikov/einops">einops</a>, indispensable tool for tensor manipulation
 ... and many others. Thank you! 🙏
@@ -1274,4 +1275,14 @@ For detailed information on training the diffusion prior, please refer to the [d
 }
 ```
+```bibtex
+@inproceedings{rogozhnikov2022einops,
+    title     = {Einops: Clear and Reliable Tensor Manipulations with Einstein-like Notation},
+    author    = {Alex Rogozhnikov},
+    booktitle = {International Conference on Learning Representations},
+    year      = {2022},
+    url       = {https://openreview.net/forum?id=oapKSVM2bcj}
+}
+```
 *Creating noise from data is easy; creating data from noise is generative modeling.* - <a href="https://arxiv.org/abs/2011.13456">Yang Song's paper</a>

View File

@@ -250,9 +250,15 @@ class XClipAdapter(BaseClipAdapter):
         text = text[..., :self.max_text_len]
         text_mask = text != 0
         encoder_output = self.clip.text_transformer(text)
-        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
+        encoder_output_is_cls = encoder_output.ndim == 3
+        text_cls, text_encodings = (encoder_output[:, 0], encoder_output[:, 1:]) if encoder_output_is_cls else (encoder_output, None)
         text_embed = self.clip.to_text_latent(text_cls)
-        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
+        if exists(text_encodings):
+            text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
         return EmbeddedText(l2norm(text_embed), text_encodings)
     @torch.no_grad()
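
This hunk lets `XClipAdapter.embed_text` accept a text transformer that returns either per-token encodings with a leading CLS token (a 3-dimensional tensor) or an already-pooled embedding (2-dimensional); in the pooled case there are no per-token encodings, so `text_encodings` becomes `None` and the mask fill is skipped. Below is a minimal standalone sketch of that branching; the `split_cls` helper name, the `exists` helper, and the dummy shapes are illustrative assumptions, not the library's code.

```python
import torch

def exists(val):
    return val is not None

def split_cls(encoder_output):
    # 3-dim output: CLS token at position 0 plus per-token encodings
    # 2-dim output: already pooled, nothing per-token to return
    if encoder_output.ndim == 3:
        return encoder_output[:, 0], encoder_output[:, 1:]
    return encoder_output, None

# per-token style output, shape (batch, seq, dim)
cls_a, enc_a = split_cls(torch.randn(2, 8, 512))
assert cls_a.shape == (2, 512) and enc_a.shape == (2, 7, 512)

# pooled style output, shape (batch, dim)
cls_b, enc_b = split_cls(torch.randn(2, 512))
assert not exists(enc_b)  # downstream masking would be skipped in this case
```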
@@ -873,6 +879,8 @@ class Attention(nn.Module):
         # attention
         attn = sim.softmax(dim = -1, dtype = torch.float32)
+        attn = attn.type(sim.dtype)
         attn = self.dropout(attn)
         # aggregate values
@@ -1631,6 +1639,7 @@ class CrossAttention(nn.Module):
             sim = sim.masked_fill(~mask, max_neg_value)
         attn = sim.softmax(dim = -1, dtype = torch.float32)
+        attn = attn.type(sim.dtype)
         out = einsum('b h i j, b h j d -> b h i d', attn, v)
         out = rearrange(out, 'b h n d -> b n (h d)')
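
Both attention hunks apply the same pattern: the softmax is computed in float32 for numerical stability, and the added `attn = attn.type(sim.dtype)` casts the attention matrix back to the original (possibly half-precision) dtype before the values are aggregated, so mixed-precision runs do not silently promote the rest of the attention block to float32. A minimal sketch of the pattern, with illustrative dummy shapes:

```python
import torch

# dummy attention logits in reduced precision, shape (batch, heads, query, key)
sim = torch.randn(1, 2, 4, 4, dtype = torch.float16)

attn = sim.softmax(dim = -1, dtype = torch.float32)  # softmax accumulated in fp32 for stability
assert attn.dtype == torch.float32

attn = attn.type(sim.dtype)                          # the added line: cast back to the original dtype
assert attn.dtype == torch.float16                   # value aggregation stays in half precision
```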

View File

@@ -181,7 +181,7 @@ class DiffusionPriorTrainer(nn.Module):
         eps = 1e-6,
         max_grad_norm = None,
         group_wd_params = True,
-        warmup_steps = 1,
+        warmup_steps = None,
         cosine_decay_max_steps = None,
         **kwargs
     ):
@@ -357,7 +357,8 @@
         # accelerator will ocassionally skip optimizer steps in a "dynamic loss scaling strategy"
         if not self.accelerator.optimizer_step_was_skipped:
-            with self.warmup_scheduler.dampening():
+            sched_context = self.warmup_scheduler.dampening if exists(self.warmup_scheduler) else nullcontext
+            with sched_context():
                 self.scheduler.step()
         if self.use_ema:
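
With `warmup_steps` now defaulting to `None`, the trainer may have no warmup scheduler at all, so the update step wraps `scheduler.step()` in `nullcontext` instead of the pytorch-warmup `dampening()` context. A minimal sketch of that optional-context pattern follows; the optimizer, scheduler, and hyperparameters are illustrative, not the trainer's actual configuration.

```python
from contextlib import nullcontext

import torch

def exists(val):
    return val is not None

optimizer = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr = 1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = 100)

# with warmup_steps = None, no warmup scheduler is ever constructed
warmup_scheduler = None

optimizer.step()

# use the warmup dampening context if one exists, otherwise a no-op context manager
sched_context = warmup_scheduler.dampening if exists(warmup_scheduler) else nullcontext
with sched_context():
    scheduler.step()
```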

View File

@@ -1 +1 @@
-__version__ = '1.8.1'
+__version__ = '1.8.4'