@@ -360,7 +360,6 @@ class OpenAIClipAdapter(BaseClipAdapter):
         is_eos_id = (text == self.eos_id)
         text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
         text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
-        text_mask = text_mask & (text != 0)
         assert not self.cleared
 
         text_embed = self.clip.encode_text(text)
@@ -435,7 +434,6 @@ class OpenClipAdapter(BaseClipAdapter):
         is_eos_id = (text == self.eos_id)
         text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0
         text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)
-        text_mask = text_mask & (text != 0)
         assert not self.cleared
 
         text_embed = self.clip.encode_text(text)
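Both adapters build the text mask the same way, so one standalone sketch covers the retained lines (the token ids below are made up; 49407 is the EOS id of CLIP's BPE tokenizer):

```python
import torch
import torch.nn.functional as F

eos_id = 49407  # CLIP BPE end-of-text token; the other ids are illustrative
text = torch.tensor([[320, 1125, 539, 2368, eos_id, 0, 0]])

is_eos_id = (text == eos_id)
text_mask_excluding_eos = is_eos_id.cumsum(dim = -1) == 0          # True strictly before the first EOS
text_mask = F.pad(text_mask_excluding_eos, (1, -1), value = True)  # shift right by one: include the EOS itself

print(text_mask)  # tensor([[ True,  True,  True,  True,  True, False, False]])
```

Since everything after the first EOS is already masked out, the deleted `& (text != 0)` check on pad tokens added nothing here, which is presumably why it was dropped.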
@@ -631,7 +629,7 @@ class NoiseScheduler(nn.Module):
 
     def calculate_v(self, x_start, t, noise = None):
         return (
-            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * noise +
+            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * noise -
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * x_start
         )
 
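The v-objective of Salimans & Ho (arXiv 2202.00512) is defined as v = sqrt(a_bar_t) * eps - sqrt(1 - a_bar_t) * x_0, so the sign fix above matters: only with the minus sign can x_0 be recovered exactly from (x_t, v). A quick numeric check, with a made-up scalar standing in for the `extract(...)` schedule lookups:

```python
import torch

alphas_cumprod = torch.tensor(0.7)  # made-up schedule value
sqrt_ac, sqrt_omac = alphas_cumprod.sqrt(), (1 - alphas_cumprod).sqrt()

x_start, noise = torch.randn(4), torch.randn(4)

x_t = sqrt_ac * x_start + sqrt_omac * noise   # forward process q(x_t | x_0)
v = sqrt_ac * noise - sqrt_omac * x_start     # corrected objective above

# this identity only holds with the minus sign
assert torch.allclose(sqrt_ac * x_t - sqrt_omac * v, x_start, atol = 1e-6)
```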
@@ -1124,7 +1122,7 @@ class DiffusionPriorNetwork(nn.Module):
         learned_queries = repeat(self.learned_query, 'd -> b 1 d', b = batch)
 
         if self.self_cond:
-            learned_queries = torch.cat((image_embed, self_cond), dim = -2)
+            learned_queries = torch.cat((self_cond, learned_queries), dim = -2)
 
         tokens = torch.cat((
             text_encodings,
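The corrected line prepends the self-conditioning token (the previous step's image-embed estimate, in the spirit of the analog-bits self-conditioning of arXiv 2208.04202) while keeping the learned query as the final token of the sequence. A shape-level sketch with made-up dimensions:

```python
import torch
from einops import repeat

batch, dim = 2, 512                      # illustrative sizes
learned_query = torch.randn(dim)
self_cond = torch.randn(batch, 1, dim)   # previous prediction of the image embed

learned_queries = repeat(learned_query, 'd -> b 1 d', b = batch)
learned_queries = torch.cat((self_cond, learned_queries), dim = -2)

print(learned_queries.shape)  # torch.Size([2, 2, 512]) - the query token stays last
```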
@@ -1322,7 +1320,7 @@ class DiffusionPrior(nn.Module):
             elif self.predict_x_start:
                 x_start = pred
             else:
-                x_start = self.noise_scheduler.predict_start_from_noise(image_embed, t = time_cond, noise = pred_noise)
+                x_start = self.noise_scheduler.predict_start_from_noise(image_embed, t = time_cond, noise = pred)
 
             # clip x0 before maybe predicting noise
 
@@ -1334,7 +1332,10 @@ class DiffusionPrior(nn.Module):
 
             # predict noise
 
-            pred_noise = self.noise_scheduler.predict_noise_from_start(image_embed, t = time_cond, x0 = x_start)
+            if self.predict_x_start or self.predict_v:
+                pred_noise = self.noise_scheduler.predict_noise_from_start(image_embed, t = time_cond, x0 = x_start)
+            else:
+                pred_noise = pred
 
             if time_next < 0:
                 image_embed = x_start
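The new branch only recomputes the noise from x_0 when the network predicts x_0 or v; when the network already predicts noise, `pred` is reused directly, avoiding a needless round-trip through the (possibly clamped) x_0. The inversion behind `predict_noise_from_start` just solves the forward process for the noise term; a minimal standalone sketch (helper and values are illustrative, not the repo's API):

```python
import torch

def predict_noise_from_start(x_t, x0, sqrt_ac, sqrt_omac):
    # solve x_t = sqrt_ac * x0 + sqrt_omac * noise for noise
    return (x_t - sqrt_ac * x0) / sqrt_omac

sqrt_ac = torch.tensor(0.81).sqrt()       # made-up schedule values
sqrt_omac = torch.tensor(1 - 0.81).sqrt()

x0, noise = torch.randn(4), torch.randn(4)
x_t = sqrt_ac * x0 + sqrt_omac * noise

assert torch.allclose(predict_noise_from_start(x_t, x0, sqrt_ac, sqrt_omac), noise, atol = 1e-6)
```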
@@ -2493,7 +2494,7 @@ class Decoder(nn.Module):
         dynamic_thres_percentile = 0.95,
         p2_loss_weight_gamma = 0., # p2 loss weight, from https://arxiv.org/abs/2204.00227 - 0 is equivalent to weight of 1 across time - 1. is recommended
         p2_loss_weight_k = 1,
-        ddim_sampling_eta = 0. # can be set to 0. for deterministic sampling afaict
+        ddim_sampling_eta = 1. # can be set to 0. for deterministic sampling afaict
     ):
         super().__init__()
 
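`ddim_sampling_eta` is the eta of the DDIM paper (arXiv 2010.02502): it scales the stochastic noise injected at each sampling step, with eta = 0 giving fully deterministic updates and eta = 1 recovering DDPM-like variance. The `c1`/`c2` lines in the `@@ -2977` hunk further below implement exactly this; in miniature, with made-up cumulative products for two adjacent steps:

```python
import torch

alpha, alpha_next = torch.tensor(0.90), torch.tensor(0.95)  # made-up schedule values

for eta in (0., 1.):
    c1 = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
    c2 = ((1 - alpha_next) - torch.square(c1)).sqrt()
    print(eta, c1.item(), c2.item())  # eta = 0. gives c1 = 0: no fresh noise is injected
```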
@@ -2727,16 +2728,11 @@ class Decoder(nn.Module):
         if exists(unet_number):
             unet = self.get_unet(unet_number)
 
-        # devices
-
-        cuda, cpu = torch.device('cuda'), torch.device('cpu')
-
         self.cuda()
 
-        devices = [module_device(unet) for unet in self.unets]
 
-        self.unets.to(cpu)
-        unet.to(cuda)
+        self.unets.cpu()
+        unet.cuda()
 
         yield
 
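The simplified context manager just parks every unet on the CPU and moves the active one onto the GPU for the duration of the block. A minimal standalone version of the same pattern (a sketch, not the repo's class; it assumes CUDA is available):

```python
import torch
from torch import nn
from contextlib import contextmanager

@contextmanager
def one_module_on_gpu(modules: nn.ModuleList, index: int):
    modules.cpu()            # keep every module off the GPU...
    module = modules[index]
    module.cuda()            # ...except the one currently needed
    try:
        yield module
    finally:
        module.cpu()         # park it again afterwards

unets = nn.ModuleList(nn.Linear(8, 8) for _ in range(3))
if torch.cuda.is_available():
    with one_module_on_gpu(unets, 0) as unet:
        unet(torch.randn(1, 8, device = 'cuda'))
```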
@@ -2977,7 +2973,10 @@ class Decoder(nn.Module):
 
             # predict noise
 
-            pred_noise = noise_scheduler.predict_noise_from_start(img, t = time_cond, x0 = x_start)
+            if predict_x_start or predict_v:
+                pred_noise = noise_scheduler.predict_noise_from_start(img, t = time_cond, x0 = x_start)
+            else:
+                pred_noise = pred
 
             c1 = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
             c2 = ((1 - alpha_next) - torch.square(c1)).sqrt()
@@ -3119,8 +3118,7 @@ class Decoder(nn.Module):
         distributed = False,
         inpaint_image = None,
         inpaint_mask = None,
-        inpaint_resample_times = 5,
-        one_unet_in_gpu_at_time = True
+        inpaint_resample_times = 5
     ):
         assert self.unconditional or exists(image_embed), 'image embed must be present on sampling from decoder unless if trained unconditionally'
 
@@ -3143,7 +3141,6 @@ class Decoder(nn.Module):
             assert image.shape[0] == batch_size, 'image must have batch size of {} if starting at unet number > 1'.format(batch_size)
             prev_unet_output_size = self.image_sizes[start_at_unet_number - 2]
             img = resize_image_to(image, prev_unet_output_size, nearest = True)
-
         is_cuda = next(self.parameters()).is_cuda
 
         num_unets = self.num_unets
@@ -3153,7 +3150,7 @@ class Decoder(nn.Module):
             if unet_number < start_at_unet_number:
                 continue # It's the easiest way to do it
 
-            context = self.one_unet_in_gpu(unet = unet) if is_cuda and one_unet_in_gpu_at_time else null_context()
+            context = self.one_unet_in_gpu(unet = unet) if is_cuda else null_context()
 
             with context:
                 # prepare low resolution conditioning for upsamplers