mirror of
https://github.com/lucidrains/DALLE2-pytorch.git
synced 2026-02-12 11:34:29 +01:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
083508ff8e | ||
|
|
7762edd0ff | ||
|
|
de5e628773 | ||
|
|
1b4046b039 | ||
|
|
27f19ba7fa | ||
|
|
8f38339c2b |
11
README.md
11
README.md
@@ -49,6 +49,7 @@ This library would not have gotten to this working state without the help of
|
||||
- <a href="https://github.com/crowsonkb">Katherine</a> for her advice
|
||||
- <a href="https://stability.ai/">Stability AI</a> for the generous sponsorship
|
||||
- <a href="https://huggingface.co">🤗 Huggingface</a> and in particular <a href="https://github.com/sgugger">Sylvain</a> for the <a href="https://github.com/huggingface/accelerate">Accelerate</a> library
|
||||
- <a href="https://github.com/arogozhnikov">Alex</a> for <a href="https://github.com/arogozhnikov/einops">einops</a>, indispensable tool for tensor manipulation
|
||||
|
||||
... and many others. Thank you! 🙏
|
||||
|
||||
@@ -1274,4 +1275,14 @@ For detailed information on training the diffusion prior, please refer to the [d
|
||||
}
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@inproceedings{rogozhnikov2022einops,
|
||||
title = {Einops: Clear and Reliable Tensor Manipulations with Einstein-like Notation},
|
||||
author = {Alex Rogozhnikov},
|
||||
booktitle = {International Conference on Learning Representations},
|
||||
year = {2022},
|
||||
url = {https://openreview.net/forum?id=oapKSVM2bcj}
|
||||
}
|
||||
```
|
||||
|
||||
*Creating noise from data is easy; creating data from noise is generative modeling.* - <a href="https://arxiv.org/abs/2011.13456">Yang Song's paper</a>
|
||||
|
||||
@@ -250,9 +250,15 @@ class XClipAdapter(BaseClipAdapter):
|
||||
text = text[..., :self.max_text_len]
|
||||
text_mask = text != 0
|
||||
encoder_output = self.clip.text_transformer(text)
|
||||
text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
|
||||
|
||||
encoder_output_is_cls = encoder_output.ndim == 3
|
||||
|
||||
text_cls, text_encodings = (encoder_output[:, 0], encoder_output[:, 1:]) if encoder_output_is_cls else (encoder_output, None)
|
||||
text_embed = self.clip.to_text_latent(text_cls)
|
||||
text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
|
||||
|
||||
if exists(text_encodings):
|
||||
text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
|
||||
|
||||
return EmbeddedText(l2norm(text_embed), text_encodings)
|
||||
|
||||
@torch.no_grad()
|
||||
@@ -873,6 +879,8 @@ class Attention(nn.Module):
|
||||
# attention
|
||||
|
||||
attn = sim.softmax(dim = -1, dtype = torch.float32)
|
||||
attn = attn.type(sim.dtype)
|
||||
|
||||
attn = self.dropout(attn)
|
||||
|
||||
# aggregate values
|
||||
@@ -1631,6 +1639,7 @@ class CrossAttention(nn.Module):
|
||||
sim = sim.masked_fill(~mask, max_neg_value)
|
||||
|
||||
attn = sim.softmax(dim = -1, dtype = torch.float32)
|
||||
attn = attn.type(sim.dtype)
|
||||
|
||||
out = einsum('b h i j, b h j d -> b h i d', attn, v)
|
||||
out = rearrange(out, 'b h n d -> b n (h d)')
|
||||
|
||||
@@ -181,7 +181,8 @@ class DiffusionPriorTrainer(nn.Module):
|
||||
eps = 1e-6,
|
||||
max_grad_norm = None,
|
||||
group_wd_params = True,
|
||||
warmup_steps = 1,
|
||||
warmup_steps = None,
|
||||
cosine_decay_max_steps = None,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__()
|
||||
@@ -233,8 +234,11 @@ class DiffusionPriorTrainer(nn.Module):
|
||||
**self.optim_kwargs,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
self.scheduler = LambdaLR(self.optimizer, lr_lambda = lambda _: 1.0)
|
||||
|
||||
if exists(cosine_decay_max_steps):
|
||||
self.scheduler = CosineAnnealingLR(optimizer, T_max = cosine_decay_max_steps)
|
||||
else:
|
||||
self.scheduler = LambdaLR(self.optimizer, lr_lambda = lambda _: 1.0)
|
||||
|
||||
self.warmup_scheduler = warmup.LinearWarmup(self.optimizer, warmup_period = warmup_steps) if exists(warmup_steps) else None
|
||||
|
||||
@@ -271,6 +275,7 @@ class DiffusionPriorTrainer(nn.Module):
|
||||
# FIXME: LambdaLR can't be saved due to pickling issues
|
||||
save_obj = dict(
|
||||
optimizer = self.optimizer.state_dict(),
|
||||
scheduler = self.scheduler.state_dict(),
|
||||
warmup_scheduler = self.warmup_scheduler,
|
||||
model = self.accelerator.unwrap_model(self.diffusion_prior).state_dict(),
|
||||
version = version.parse(__version__),
|
||||
@@ -317,7 +322,9 @@ class DiffusionPriorTrainer(nn.Module):
|
||||
# unwrap the model when loading from checkpoint
|
||||
self.accelerator.unwrap_model(self.diffusion_prior).load_state_dict(loaded_obj['model'], strict = strict)
|
||||
self.step.copy_(torch.ones_like(self.step, device=self.device) * loaded_obj['step'].to(self.device))
|
||||
|
||||
self.optimizer.load_state_dict(loaded_obj['optimizer'])
|
||||
self.scheduler.load_state_dict(loaded_obj['scheduler'])
|
||||
|
||||
# set warmupstep
|
||||
if exists(self.warmup_scheduler):
|
||||
@@ -350,7 +357,8 @@ class DiffusionPriorTrainer(nn.Module):
|
||||
|
||||
# accelerator will ocassionally skip optimizer steps in a "dynamic loss scaling strategy"
|
||||
if not self.accelerator.optimizer_step_was_skipped:
|
||||
with self.warmup_scheduler.dampening():
|
||||
sched_context = self.warmup_scheduler.dampening if exists(self.warmup_scheduler) else nullcontext
|
||||
with sched_context():
|
||||
self.scheduler.step()
|
||||
|
||||
if self.use_ema:
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = '1.8.0'
|
||||
__version__ = '1.8.4'
|
||||
|
||||
Reference in New Issue
Block a user