Compare commits

...

5 Commits

3 changed files with 24 additions and 7 deletions

View File

@@ -7,6 +7,7 @@ from contextlib import contextmanager
import torch
import torch.nn.functional as F
from torch import nn, einsum
import torchvision.transforms as T
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
@@ -646,9 +647,12 @@ class DiffusionPrior(BaseGaussianDiffusion):
)
if exists(clip):
assert isinstance(clip, CLIP)
if isinstance(clip, CLIP):
clip = XClipAdapter(clip)
assert isinstance(clip, BaseClipAdapter)
freeze_model_and_make_eval_(clip)
self.clip = XClipAdapter(clip)
self.clip = clip
else:
assert exists(image_embed_dim), 'latent dimension must be given, if training prior network without CLIP given'
self.clip = None
@@ -739,7 +743,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
text_cond = dict(text_embed = text_embed)
if self.condition_on_text_encodings:
text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text != 0}
image_embeds = self.p_sample_loop((batch_size, image_embed_dim), text_cond = text_cond)
text_embeds = text_cond['text_embed']
@@ -782,6 +786,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
text_cond = dict(text_embed = text_embed)
if self.condition_on_text_encodings:
assert exists(text_encodings), 'text encodings must be present for diffusion prior if specified'
text_cond = {**text_cond, 'text_encodings': text_encodings, 'mask': text_mask}
# timestep conditioning from ddpm
@@ -791,8 +796,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
# calculate forward loss
loss = self.p_losses(image_embed, times, text_cond = text_cond, *args, **kwargs)
return loss
return self.p_losses(image_embed, times, text_cond = text_cond, *args, **kwargs)
# decoder
@@ -1247,6 +1251,8 @@ class Decoder(BaseGaussianDiffusion):
clip = XClipAdapter(clip)
freeze_model_and_make_eval_(clip)
assert isinstance(clip, BaseClipAdapter)
self.clip = clip
self.clip_image_size = clip.image_size
self.channels = clip.image_channels
@@ -1417,6 +1423,7 @@ class Decoder(BaseGaussianDiffusion):
_, text_encodings = self.clip.embed_text(text)
assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'
img = None
@@ -1484,6 +1491,7 @@ class Decoder(BaseGaussianDiffusion):
_, text_encodings = self.clip.embed_text(text)
assert not (self.condition_on_text_encodings and not exists(text_encodings)), 'text or text encodings must be passed into decoder if specified'
assert not (not self.condition_on_text_encodings and exists(text_encodings)), 'decoder specified not to be conditioned on text, yet it is presented'
lowres_cond_img = self.to_lowres_cond(image, target_image_size = target_image_size, downsample_image_size = self.image_sizes[unet_index - 1]) if unet_number > 1 else None
image = resize_image_to(image, target_image_size)
@@ -1516,12 +1524,15 @@ class DALLE2(nn.Module):
self.prior_num_samples = prior_num_samples
self.decoder_need_text_cond = self.decoder.condition_on_text_encodings
self.to_pil = T.ToPILImage()
@torch.no_grad()
@eval_decorator
def forward(
self,
text,
cond_scale = 1.
cond_scale = 1.,
return_pil_images = False
):
device = next(self.parameters()).device
one_text = isinstance(text, str) or (not is_list_str(text) and text.shape[0] == 1)
@@ -1535,7 +1546,11 @@ class DALLE2(nn.Module):
text_cond = text if self.decoder_need_text_cond else None
images = self.decoder.sample(image_embed, text = text_cond, cond_scale = cond_scale)
if return_pil_images:
images = list(map(self.to_pil, images.unbind(dim = 0)))
if one_text:
return images[0]
return images

View File

@@ -545,6 +545,7 @@ class VQGanVAE(nn.Module):
l2_recon_loss = False,
use_hinge_loss = True,
vgg = None,
vq_codebook_dim = 256,
vq_codebook_size = 512,
vq_decay = 0.8,
vq_commitment_weight = 1.,
@@ -579,6 +580,7 @@ class VQGanVAE(nn.Module):
self.vq = VQ(
dim = self.enc_dec.encoded_dim,
codebook_dim = vq_codebook_dim,
codebook_size = vq_codebook_size,
decay = vq_decay,
commitment_weight = vq_commitment_weight,

View File

@@ -10,7 +10,7 @@ setup(
'dream = dalle2_pytorch.cli:dream'
],
},
version = '0.0.58',
version = '0.0.63',
license='MIT',
description = 'DALL-E 2',
author = 'Phil Wang',