Mirror of https://github.com/lucidrains/DALLE2-pytorch.git, synced 2026-02-13 23:44:50 +01:00

Compare commits

8 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | bb3ff0ac67 |  |
|  | 1ec4dbe64f |  |
|  | e0835acca9 |  |
|  | e055793e5d |  |
|  | 1d9ef99288 |  |
|  | bdd62c24b3 |  |
|  | 1f1557c614 |  |
|  | 1a217e99e3 |  |
README.md

```diff
@@ -45,6 +45,7 @@ This library would not have gotten to this working state without the help of
 - <a href="https://github.com/rom1504">Romain</a> for the pull request reviews and project management
 - <a href="https://github.com/Ciaohe">He Cao</a> and <a href="https://github.com/xiankgx">xiankgx</a> for the Q&A and for identifying critical bugs
 - <a href="https://github.com/marunine">Marunine</a> for identifying issues with resizing of the low resolution conditioner, when training the upsampler, in addition to various other bug fixes
+- <a href="https://github.com/malumadev">MalumaDev</a> for proposing the use of the pixel shuffle upsampler for fixing checkerboard artifacts
 - <a href="https://github.com/crowsonkb">Katherine</a> for her advice
 - <a href="https://stability.ai/">Stability AI</a> for the generous sponsorship
 - <a href="https://huggingface.co">🤗 Huggingface</a> and in particular <a href="https://github.com/sgugger">Sylvain</a> for the <a href="https://github.com/huggingface/accelerate">Accelerate</a> library
```
dalle2_pytorch/dalle2_pytorch.py

```diff
@@ -77,6 +77,11 @@ def cast_tuple(val, length = None):
 def module_device(module):
     return next(module.parameters()).device
 
+def zero_init_(m):
+    nn.init.zeros_(m.weight)
+    if exists(m.bias):
+        nn.init.zeros_(m.bias)
+
 @contextmanager
 def null_context(*args, **kwargs):
     yield
```
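The new `zero_init_` helper is applied further down to the Unet's output convolution. A minimal, self-contained sketch of the pattern; the layer sizes below are made up for illustration:

```python
import torch
from torch import nn

def exists(val):
    return val is not None

def zero_init_(m):
    # zero both weight and bias so the layer outputs exactly zero at init
    nn.init.zeros_(m.weight)
    if exists(m.bias):
        nn.init.zeros_(m.bias)

# zeroing the final projection makes the network's initial prediction zero,
# giving a low-variance starting point for early training
to_out = nn.Conv2d(64, 3, 1)  # hypothetical channel counts
zero_init_(to_out)
x = torch.randn(1, 64, 8, 8)
assert torch.all(to_out(x) == 0)
```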
```diff
@@ -220,6 +225,7 @@ class XClipAdapter(BaseClipAdapter):
         encoder_output = self.clip.text_transformer(text)
         text_cls, text_encodings = encoder_output[:, 0], encoder_output[:, 1:]
         text_embed = self.clip.to_text_latent(text_cls)
+        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
         return EmbeddedText(l2norm(text_embed), text_encodings, text_mask)
 
     @torch.no_grad()
```

```diff
@@ -255,6 +261,7 @@ class CoCaAdapter(BaseClipAdapter):
         text = text[..., :self.max_text_len]
         text_mask = text != 0
         text_embed, text_encodings = self.clip.embed_text(text)
+        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
         return EmbeddedText(text_embed, text_encodings, text_mask)
 
     @torch.no_grad()
```

```diff
@@ -314,6 +321,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
 
         text_embed = self.clip.encode_text(text)
         text_encodings = self.text_encodings
+        text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
         del self.text_encodings
         return EmbeddedText(l2norm(text_embed.float()), text_encodings.float(), text_mask)
```
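All three adapter hunks add the same line, which zeroes the encodings at padded token positions. A small stand-alone sketch of what `masked_fill` does here, with toy tensors (shapes are illustrative only):

```python
import torch

# toy per-token text encodings: batch 2, sequence 4, feature dim 3
text_encodings = torch.randn(2, 4, 3)
# True marks real tokens, False marks padding
text_mask = torch.tensor([[True, True, False, False],
                          [True, True, True,  False]])

# broadcast the mask over the feature dim and zero the padded positions
text_encodings = text_encodings.masked_fill(~text_mask[..., None], 0.)
assert torch.all(text_encodings[0, 2:] == 0)
```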
```diff
@@ -863,8 +871,8 @@ class DiffusionPriorNetwork(nn.Module):
         if not exists(text_encodings):
             text_encodings = torch.empty((batch, 0, dim), device = device, dtype = dtype)
 
-        if not exists(mask):
-            mask = torch.ones((batch, text_encodings.shape[-2]), device = device, dtype = torch.bool)
+        if not exists(mask) or mask.numel() == 0:
+            mask = torch.any(text_encodings != 0., dim = -1)
 
         # classifier free guidance
```
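A sketch of the new fallback, which derives the mask from the encodings themselves rather than assuming every position is real; this relies on padded positions already being zeroed by the adapter changes above (tensor sizes are made up):

```python
import torch

batch, dim = 2, 3

# unconditional case: a zero-length sequence of encodings still yields
# a consistent (batch, 0) mask instead of an all-ones dummy
text_encodings = torch.empty((batch, 0, dim))
mask = torch.any(text_encodings != 0., dim = -1)
print(mask.shape)  # torch.Size([2, 0])

# with real encodings, positions that were zeroed out count as padding
enc = torch.randn(batch, 4, dim)
enc[:, 2:] = 0.
print(torch.any(enc != 0., dim = -1))  # True only for the first two positions
```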
```diff
@@ -1214,16 +1222,35 @@ class DiffusionPrior(nn.Module):
 
 # decoder
 
 def ConvTransposeUpsample(dim, dim_out = None):
     dim_out = default(dim_out, dim)
     return nn.ConvTranspose2d(dim, dim_out, 4, 2, 1)
 
-def NearestUpsample(dim, dim_out = None):
-    dim_out = default(dim_out, dim)
-    return nn.Sequential(
-        nn.Upsample(scale_factor = 2, mode = 'nearest'),
-        nn.Conv2d(dim, dim_out, 3, padding = 1)
-    )
+class PixelShuffleUpsample(nn.Module):
+    """
+    code shared by @MalumaDev at DALLE2-pytorch for addressing checkerboard artifacts
+    https://arxiv.org/ftp/arxiv/papers/1707/1707.02937.pdf
+    """
+    def __init__(self, dim, dim_out = None):
+        super().__init__()
+        dim_out = default(dim_out, dim)
+        conv = nn.Conv2d(dim, dim_out * 4, 1)
+
+        self.net = nn.Sequential(
+            conv,
+            nn.SiLU(),
+            nn.PixelShuffle(2)
+        )
+
+        self.init_conv_(conv)
+
+    def init_conv_(self, conv):
+        o, i, h, w = conv.weight.shape
+        conv_weight = torch.empty(o // 4, i, h, w)
+        nn.init.kaiming_uniform_(conv_weight)
+        conv_weight = repeat(conv_weight, 'o ... -> (o 4) ...')
+
+        conv.weight.data.copy_(conv_weight)
+        nn.init.zeros_(conv.bias.data)
+
+    def forward(self, x):
+        return self.net(x)
 
 def Downsample(dim, *, dim_out = None):
     dim_out = default(dim_out, dim)
```
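A sketch of why this initialization counters checkerboard artifacts: each filter is repeated four times, so the four subpixels that `nn.PixelShuffle(2)` produces for a given output position are identical at init, and the layer starts out as a smooth, nearest-neighbor-like upsampler. The channel counts below are made up; the init mirrors the diff's `init_conv_`:

```python
import torch
from torch import nn
from einops import repeat

dim, dim_out = 8, 8  # hypothetical channel counts
conv = nn.Conv2d(dim, dim_out * 4, 1)

# draw dim_out filters, then repeat each one 4 times
w = torch.empty(dim_out, dim, 1, 1)
nn.init.kaiming_uniform_(w)
conv.weight.data.copy_(repeat(w, 'o ... -> (o 4) ...'))
nn.init.zeros_(conv.bias.data)

net = nn.Sequential(conv, nn.SiLU(), nn.PixelShuffle(2))
x = torch.randn(1, dim, 16, 16)
y = net(x)
print(y.shape)  # torch.Size([1, 8, 32, 32]) -- spatial dims doubled

# at init, every 2x2 output block is constant: no checkerboard pattern
assert torch.allclose(y[..., ::2, ::2], y[..., 1::2, 1::2])
```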
```diff
@@ -1487,7 +1514,7 @@ class Unet(nn.Module):
         cross_embed_downsample_kernel_sizes = (2, 4),
         memory_efficient = False,
         scale_skip_connection = False,
-        nearest_upsample = False,
+        pixel_shuffle_upsample = True,
         final_conv_kernel_size = 1,
         **kwargs
     ):
```

```diff
@@ -1601,7 +1628,7 @@ class Unet(nn.Module):
 
         # upsample klass
 
-        upsample_klass = ConvTransposeUpsample if not nearest_upsample else NearestUpsample
+        upsample_klass = ConvTransposeUpsample if not pixel_shuffle_upsample else PixelShuffleUpsample
 
         # give memory efficient unet an initial resnet block
 
```
```diff
@@ -1665,6 +1692,8 @@ class Unet(nn.Module):
         self.final_resnet_block = ResnetBlock(dim * 2, dim, time_cond_dim = time_cond_dim, groups = top_level_resnet_group)
         self.to_out = nn.Conv2d(dim, self.channels_out, kernel_size = final_conv_kernel_size, padding = final_conv_kernel_size // 2)
 
+        zero_init_(self.to_out) # since both OpenAI and @crowsonkb are doing it
+
     # if the current settings for the unet are not correct
     # for cascading DDPM, then reinit the unet with the right settings
     def cast_model_parameters(
```
```diff
@@ -1789,21 +1818,25 @@ class Unet(nn.Module):
         if exists(text_encodings) and self.cond_on_text_encodings:
             assert self.text_embed_dim == text_encodings.shape[-1], f'the text encodings you are passing in have a dimension of {text_encodings.shape[-1]}, but the unet was created with text_embed_dim of {self.text_embed_dim}.'
 
+            if not exists(text_mask) or text_mask.numel() == 0:
+                text_mask = torch.any(text_encodings != 0., dim = -1)
+
             text_tokens = self.text_to_cond(text_encodings)
 
             text_tokens = text_tokens[:, :self.max_text_len]
+            text_mask = text_mask[:, :self.max_text_len]
 
             text_tokens_len = text_tokens.shape[1]
             remainder = self.max_text_len - text_tokens_len
 
             if remainder > 0:
                 text_tokens = F.pad(text_tokens, (0, 0, 0, remainder))
+                text_mask = F.pad(text_mask, (0, remainder), value = False)
 
-            if exists(text_mask):
-                if remainder > 0:
-                    text_mask = F.pad(text_mask, (0, remainder), value = False)
-
-                text_mask = rearrange(text_mask, 'b n -> b n 1')
-                text_keep_mask = text_mask & text_keep_mask
+            text_mask = rearrange(text_mask, 'b n -> b n 1')
+
+            assert text_mask.shape[0] == text_keep_mask.shape[0], f'text_mask has shape of {text_mask.shape} while text_keep_mask has shape {text_keep_mask.shape}. text encoding is of shape {text_encodings.shape}'
+            text_keep_mask = text_mask & text_keep_mask
 
         null_text_embed = self.null_text_embed.to(text_tokens.dtype) # for some reason pytorch AMP not working
```
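The mask is now truncated and padded in lockstep with the tokens instead of inside a separate `if exists(text_mask)` branch. A stand-alone sketch of the shape bookkeeping; batch size, lengths, and dims are made up:

```python
import torch
import torch.nn.functional as F
from einops import rearrange

max_text_len = 6
text_tokens = torch.randn(2, 4, 8)               # batch 2, 4 tokens, dim 8
text_mask = torch.ones(2, 4, dtype = torch.bool)

# truncate both to the model's fixed length, then right-pad the remainder
text_tokens = text_tokens[:, :max_text_len]
text_mask = text_mask[:, :max_text_len]

remainder = max_text_len - text_tokens.shape[1]
if remainder > 0:
    # F.pad takes (left, right) pairs starting from the last dim:
    # (0, 0) leaves the feature dim alone, (0, remainder) pads the sequence
    text_tokens = F.pad(text_tokens, (0, 0, 0, remainder))
    text_mask = F.pad(text_mask, (0, remainder), value = False)

# add a trailing singleton dim so the mask broadcasts over features
text_mask = rearrange(text_mask, 'b n -> b n 1')
print(text_tokens.shape, text_mask.shape)  # torch.Size([2, 6, 8]) torch.Size([2, 6, 1])
```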
dalle2_pytorch/version.py

```diff
@@ -1 +1 @@
-__version__ = '0.19.6'
+__version__ = '0.21.3'
```
train_decoder.py

```diff
@@ -557,7 +557,7 @@ def initialize_training(config: TrainDecoderConfig, config_path):
 
     # Create the decoder model and print basic info
     decoder = config.decoder.create()
-    num_parameters = sum(p.numel() for p in decoder.parameters())
+    get_num_parameters = lambda model, only_training=False: sum(p.numel() for p in model.parameters() if (p.requires_grad or not only_training))
 
     # Create and initialize the tracker if we are the master
     tracker = create_tracker(accelerator, config, config_path, dummy = rank!=0)
```
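A small sketch of what the new `get_num_parameters` lambda counts, using a hypothetical toy model with one frozen parameter:

```python
from torch import nn

# same counting rule as the diff: a parameter counts toward the "training"
# total only if it requires grad; everything counts toward the overall total
get_num_parameters = lambda model, only_training=False: sum(
    p.numel() for p in model.parameters() if (p.requires_grad or not only_training)
)

model = nn.Linear(10, 4)          # 40 weights + 4 biases = 44 parameters
model.bias.requires_grad_(False)  # freeze the bias

print(get_num_parameters(model))                        # 44
print(get_num_parameters(model, only_training = True))  # 40
```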
```diff
@@ -586,7 +586,10 @@ def initialize_training(config: TrainDecoderConfig, config_path):
     accelerator.print(print_ribbon("Loaded Config", repeat=40))
     accelerator.print(f"Running training with {accelerator.num_processes} processes and {accelerator.distributed_type} distributed training")
     accelerator.print(f"Training using {data_source_string}. {'conditioned on text' if conditioning_on_text else 'not conditioned on text'}")
-    accelerator.print(f"Number of parameters: {num_parameters}")
+    accelerator.print(f"Number of parameters: {get_num_parameters(decoder)} total; {get_num_parameters(decoder, only_training=True)} training")
+    for i, unet in enumerate(decoder.unets):
+        accelerator.print(f"Unet {i} has {get_num_parameters(unet)} total; {get_num_parameters(unet, only_training=True)} training")
 
     train(dataloaders, decoder, accelerator,
         tracker=tracker,
         inference_device=accelerator.device,
```