Mirror of https://github.com/lucidrains/DALLE2-pytorch.git (synced 2026-02-19 23:24:29 +01:00)

Compare commits (7 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 3df86acc8b |  |
|  | de5e628773 |  |
|  | 1b4046b039 |  |
|  | 27f19ba7fa |  |
|  | 8f38339c2b |  |
|  | 6b9b4b9e5e |  |
|  | 44e09d5a4d |  |
README.md (11 changes)
```diff
@@ -49,6 +49,7 @@ This library would not have gotten to this working state without the help of
 - <a href="https://github.com/crowsonkb">Katherine</a> for her advice
 - <a href="https://stability.ai/">Stability AI</a> for the generous sponsorship
 - <a href="https://huggingface.co">🤗 Huggingface</a> and in particular <a href="https://github.com/sgugger">Sylvain</a> for the <a href="https://github.com/huggingface/accelerate">Accelerate</a> library
+- <a href="https://github.com/arogozhnikov">Alex</a> for <a href="https://github.com/arogozhnikov/einops">einops</a>, indispensable tool for tensor manipulation
 
 ... and many others. Thank you! 🙏
 
```
````diff
@@ -1274,4 +1275,14 @@ For detailed information on training the diffusion prior, please refer to the [d
 }
 ```
 
+```bibtex
+@inproceedings{rogozhnikov2022einops,
+    title = {Einops: Clear and Reliable Tensor Manipulations with Einstein-like Notation},
+    author = {Alex Rogozhnikov},
+    booktitle = {International Conference on Learning Representations},
+    year = {2022},
+    url = {https://openreview.net/forum?id=oapKSVM2bcj}
+}
+```
+
 *Creating noise from data is easy; creating data from noise is generative modeling.* - <a href="https://arxiv.org/abs/2011.13456">Yang Song's paper</a>
````
```diff
@@ -250,9 +250,13 @@ class XClipAdapter(BaseClipAdapter):
         text = text[..., :self.max_text_len]
         text_mask = text != 0
         encoder_output = self.clip.text_transformer(text)
-        text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
+
+        text_cls, text_encodings = (encoder_output[:, 0], encoder_output[:, 1:]) if encoder_output.ndim == 3 else (encoder_output, None)
         text_embed = self.clip.to_text_latent(text_cls)
-        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
+
+        if exists(text_encodings):
+            text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
+
         return EmbeddedText(l2norm(text_embed), text_encodings)
 
     @torch.no_grad()
```
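The hunk above guards against text encoders that return an already-pooled embedding: a token-level transformer output has shape (batch, seq, dim), while a pooled output is (batch, dim) and carries no per-token encodings to mask. A minimal, self-contained sketch of that branching (the `split_encoder_output` helper and the dummy tensors are illustrative, not part of the library):

```python
import torch

def split_encoder_output(encoder_output: torch.Tensor):
    # 3-d output: first token acts as the CLS summary, the rest are per-token encodings
    # 2-d output: already pooled, so there are no encodings to mask downstream
    if encoder_output.ndim == 3:
        return encoder_output[:, 0], encoder_output[:, 1:]
    return encoder_output, None

tokens_out = torch.randn(2, 8, 16)   # per-token output: (batch, seq, dim)
pooled_out = torch.randn(2, 16)      # pooled output: (batch, dim)

cls_a, enc_a = split_encoder_output(tokens_out)   # enc_a has shape (2, 7, 16)
cls_b, enc_b = split_encoder_output(pooled_out)   # enc_b is None
print(cls_a.shape, enc_a.shape, cls_b.shape, enc_b)
```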
```diff
@@ -1456,10 +1460,6 @@ class WeightStandardizedConv2d(nn.Conv2d):
     https://arxiv.org/abs/1903.10520
     weight standardization purportedly works synergistically with group normalization
     """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
     def forward(self, x):
         eps = 1e-5 if x.dtype == torch.float32 else 1e-3
 
```
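The removed `__init__` only forwarded its arguments to `nn.Conv2d.__init__`, so dropping it changes nothing. For context, here is a minimal weight-standardized convolution in the spirit of https://arxiv.org/abs/1903.10520; this is a sketch, not the library's exact implementation, but it reuses the dtype-dependent `eps` kept in the hunk above:

```python
import torch
import torch.nn.functional as F
from torch import nn

class WSConv2d(nn.Conv2d):
    """Sketch: standardize the kernel (zero mean, unit variance per output channel)
    before running the usual convolution."""
    def forward(self, x):
        eps = 1e-5 if x.dtype == torch.float32 else 1e-3
        w = self.weight
        mean = w.mean(dim = (1, 2, 3), keepdim = True)
        var = w.var(dim = (1, 2, 3), unbiased = False, keepdim = True)
        w = (w - mean) * (var + eps).rsqrt()
        return F.conv2d(x, w, self.bias, self.stride, self.padding, self.dilation, self.groups)

conv = WSConv2d(3, 8, kernel_size = 3, padding = 1)
print(conv(torch.randn(1, 3, 32, 32)).shape)  # torch.Size([1, 8, 32, 32])
```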
```diff
@@ -9,7 +9,7 @@ from collections.abc import Iterable
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.optim.lr_scheduler import LambdaLR
+from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
 from torch.cuda.amp import autocast, GradScaler
 
 from dalle2_pytorch.dalle2_pytorch import Decoder, DiffusionPrior
```
```diff
@@ -181,7 +181,8 @@ class DiffusionPriorTrainer(nn.Module):
         eps = 1e-6,
         max_grad_norm = None,
         group_wd_params = True,
-        warmup_steps = 1,
+        warmup_steps = None,
+        cosine_decay_max_steps = None,
         **kwargs
     ):
         super().__init__()
```
```diff
@@ -233,8 +234,11 @@ class DiffusionPriorTrainer(nn.Module):
             **self.optim_kwargs,
             **kwargs
         )
 
-        self.scheduler = LambdaLR(self.optimizer, lr_lambda = lambda _: 1.0)
+        if exists(cosine_decay_max_steps):
+            self.scheduler = CosineAnnealingLR(self.optimizer, T_max = cosine_decay_max_steps)
+        else:
+            self.scheduler = LambdaLR(self.optimizer, lr_lambda = lambda _: 1.0)
 
         self.warmup_scheduler = warmup.LinearWarmup(self.optimizer, warmup_period = warmup_steps) if exists(warmup_steps) else None
 
```
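The same selection logic can be exercised outside the trainer on a bare optimizer: with a decay horizon the learning rate is cosine-annealed towards zero, without one a constant-factor `LambdaLR` leaves it untouched. A standalone sketch (the `build_scheduler` helper and the toy parameter are illustrative only):

```python
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR

def build_scheduler(optimizer, cosine_decay_max_steps = None):
    # with a horizon: anneal the LR to zero over that many scheduler steps
    # without one: a constant multiplier of 1.0, i.e. effectively no schedule
    if cosine_decay_max_steps is not None:
        return CosineAnnealingLR(optimizer, T_max = cosine_decay_max_steps)
    return LambdaLR(optimizer, lr_lambda = lambda _: 1.0)

param = torch.nn.Parameter(torch.zeros(1))
optim = Adam([param], lr = 1e-4)
sched = build_scheduler(optim, cosine_decay_max_steps = 100)

for _ in range(100):
    optim.step()
    sched.step()

print(sched.get_last_lr())  # reaches [0.0] at the end of the cosine cycle
```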
```diff
@@ -271,6 +275,7 @@ class DiffusionPriorTrainer(nn.Module):
         # FIXME: LambdaLR can't be saved due to pickling issues
         save_obj = dict(
             optimizer = self.optimizer.state_dict(),
+            scheduler = self.scheduler.state_dict(),
             warmup_scheduler = self.warmup_scheduler,
             model = self.accelerator.unwrap_model(self.diffusion_prior).state_dict(),
             version = version.parse(__version__),
```
```diff
@@ -317,7 +322,9 @@ class DiffusionPriorTrainer(nn.Module):
         # unwrap the model when loading from checkpoint
         self.accelerator.unwrap_model(self.diffusion_prior).load_state_dict(loaded_obj['model'], strict = strict)
         self.step.copy_(torch.ones_like(self.step, device=self.device) * loaded_obj['step'].to(self.device))
 
         self.optimizer.load_state_dict(loaded_obj['optimizer'])
+        self.scheduler.load_state_dict(loaded_obj['scheduler'])
+
         # set warmupstep
         if exists(self.warmup_scheduler):
```
```diff
@@ -350,7 +357,8 @@ class DiffusionPriorTrainer(nn.Module):
 
         # accelerator will ocassionally skip optimizer steps in a "dynamic loss scaling strategy"
         if not self.accelerator.optimizer_step_was_skipped:
-            with self.warmup_scheduler.dampening():
+            sched_context = self.warmup_scheduler.dampening if exists(self.warmup_scheduler) else nullcontext
+            with sched_context():
                 self.scheduler.step()
 
         if self.use_ema:
```
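With `warmup_steps = None` the trainer no longer has a warmup scheduler, so the `with` block needs a stand-in context manager; `contextlib.nullcontext` fills that role (the hunk assumes it is imported elsewhere in the module). A tiny sketch of the pattern, with a hypothetical warmup object in place of a real pytorch_warmup scheduler:

```python
from contextlib import nullcontext

class DummyWarmup:
    """Hypothetical stand-in for a warmup scheduler: dampening() returns a
    context manager that would adjust the LR while the block runs."""
    def dampening(self):
        return nullcontext()

def step_scheduler(scheduler_step, warmup_scheduler = None):
    # use the real dampening context when warmup exists, a no-op context otherwise
    sched_context = warmup_scheduler.dampening if warmup_scheduler is not None else nullcontext
    with sched_context():
        scheduler_step()

step_scheduler(lambda: print('stepped without warmup'))
step_scheduler(lambda: print('stepped with warmup'), warmup_scheduler = DummyWarmup())
```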
```diff
@@ -433,6 +441,7 @@ class DecoderTrainer(nn.Module):
         wd = 1e-2,
         eps = 1e-8,
         warmup_steps = None,
+        cosine_decay_max_steps = None,
         max_grad_norm = 0.5,
         amp = False,
         group_wd_params = True,
```
```diff
@@ -454,7 +463,7 @@ class DecoderTrainer(nn.Module):
         # be able to finely customize learning rate, weight decay
         # per unet
 
-        lr, wd, eps, warmup_steps = map(partial(cast_tuple, length = self.num_unets), (lr, wd, eps, warmup_steps))
+        lr, wd, eps, warmup_steps, cosine_decay_max_steps = map(partial(cast_tuple, length = self.num_unets), (lr, wd, eps, warmup_steps, cosine_decay_max_steps))
 
         assert all([unet_lr <= 1e-2 for unet_lr in lr]), 'your learning rate is too high, recommend sticking with 1e-4, at most 5e-4'
 
```
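`cast_tuple` broadcasts each hyperparameter to one value per unet, so `cosine_decay_max_steps` can be a single number, a per-unet tuple, or None, exactly like `lr`, `wd`, `eps` and `warmup_steps`. A sketch of the broadcasting with a local stand-in for the helper (the library defines its own `cast_tuple`):

```python
from functools import partial

def cast_tuple(val, length = 1):
    # scalars are repeated once per unet; explicit tuples pass through unchanged
    return val if isinstance(val, tuple) else ((val,) * length)

num_unets = 2
lr, warmup_steps, cosine_decay_max_steps = map(
    partial(cast_tuple, length = num_unets),
    (1e-4, (500, 1000), None)
)
print(lr, warmup_steps, cosine_decay_max_steps)
# ((0.0001, 0.0001), (500, 1000), (None, None))
```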
```diff
@@ -462,7 +471,7 @@ class DecoderTrainer(nn.Module):
         schedulers = []
         warmup_schedulers = []
 
-        for unet, unet_lr, unet_wd, unet_eps, unet_warmup_steps in zip(decoder.unets, lr, wd, eps, warmup_steps):
+        for unet, unet_lr, unet_wd, unet_eps, unet_warmup_steps, unet_cosine_decay_max_steps in zip(decoder.unets, lr, wd, eps, warmup_steps, cosine_decay_max_steps):
             if isinstance(unet, nn.Identity):
                 optimizers.append(None)
                 schedulers.append(None)
```
```diff
@@ -478,7 +487,11 @@ class DecoderTrainer(nn.Module):
                 )
 
                 optimizers.append(optimizer)
-                scheduler = LambdaLR(optimizer, lr_lambda = lambda step: 1.0)
+
+                if exists(unet_cosine_decay_max_steps):
+                    scheduler = CosineAnnealingLR(optimizer, T_max = unet_cosine_decay_max_steps)
+                else:
+                    scheduler = LambdaLR(optimizer, lr_lambda = lambda step: 1.0)
+
                 warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period = unet_warmup_steps) if exists(unet_warmup_steps) else None
                 warmup_schedulers.append(warmup_scheduler)
 
```
```diff
@@ -558,9 +571,15 @@ class DecoderTrainer(nn.Module):
 
         for ind in range(0, self.num_unets):
             optimizer_key = f'optim{ind}'
+            scheduler_key = f'sched{ind}'
+
             optimizer = getattr(self, optimizer_key)
-            state_dict = optimizer.state_dict() if optimizer is not None else None
-            save_obj = {**save_obj, optimizer_key: state_dict}
+            scheduler = getattr(self, scheduler_key)
+
+            optimizer_state_dict = optimizer.state_dict() if exists(optimizer) else None
+            scheduler_state_dict = scheduler.state_dict() if exists(scheduler) else None
+
+            save_obj = {**save_obj, optimizer_key: optimizer_state_dict, scheduler_key: scheduler_state_dict}
 
         if self.use_ema:
             save_obj = {**save_obj, 'ema': self.ema_unets.state_dict()}
```
```diff
@@ -581,10 +600,18 @@ class DecoderTrainer(nn.Module):
 
             optimizer_key = f'optim{ind}'
             optimizer = getattr(self, optimizer_key)
+
+            scheduler_key = f'sched{ind}'
+            scheduler = getattr(self, scheduler_key)
+
             warmup_scheduler = self.warmup_schedulers[ind]
-            if optimizer is not None:
+
+            if exists(optimizer):
                 optimizer.load_state_dict(loaded_obj[optimizer_key])
 
+            if exists(scheduler):
+                scheduler.load_state_dict(loaded_obj[scheduler_key])
+
             if exists(warmup_scheduler):
                 warmup_scheduler.last_step = last_step
 
```
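Taken together, the two hunks above persist one optimizer and one scheduler state dict per unet under index-based keys and restore each only when the corresponding object exists. A compact round-trip sketch outside the trainer (the `optim0` / `sched0` keys mirror the naming in the diff; everything else is illustrative):

```python
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = Adam([param], lr = 1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max = 10)

# save: one entry per unet index, None when that unet has no optimizer/scheduler
save_obj = {'optim0': optimizer.state_dict(), 'sched0': scheduler.state_dict()}

# load: rebuild the objects first, then restore state only where it exists
new_optimizer = Adam([param], lr = 1e-4)
new_scheduler = CosineAnnealingLR(new_optimizer, T_max = 10)
if save_obj['optim0'] is not None:
    new_optimizer.load_state_dict(save_obj['optim0'])
if save_obj['sched0'] is not None:
    new_scheduler.load_state_dict(save_obj['sched0'])

print(new_scheduler.state_dict()['T_max'])  # 10
```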
```diff
@@ -1 +1 @@
-__version__ = '1.7.0'
+__version__ = '1.8.3'
```