Mirror of https://github.com/lucidrains/DALLE2-pytorch.git, synced 2026-02-22 01:24:24 +01:00
Compare commits (3 commits):

- 830afd3c15
- 8f93729d19
- cd5f2c1de4
@@ -706,7 +706,7 @@ class DiffusionPriorNetwork(nn.Module):
         **kwargs
     ):
         super().__init__()
-        self.time_embeddings = nn.Embedding(num_timesteps, dim) if exists(num_timesteps) else nn.Sequential(Rearrange('b -> b 1'), MLP(1, dim)) # also offer a continuous version of timestep embeddings, with a 2 layer MLP
+        self.time_embeddings = nn.Embedding(num_timesteps, dim) if exists(num_timesteps) else nn.Sequential(SinusoidalPosEmb(dim), MLP(dim, dim)) # also offer a continuous version of timestep embeddings, with a 2 layer MLP
         self.learned_query = nn.Parameter(torch.randn(dim))
         self.causal_transformer = CausalTransformer(dim = dim, **kwargs)
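The change in the continuous-time branch is worth spelling out: before, the raw scalar timestep was reshaped to `(b, 1)` and pushed straight through an MLP; now it is first expanded into sinusoidal features, the standard DDPM-style timestep encoding, which gives the MLP a much richer input. A minimal sketch of the two variants, using simplified stand-ins for the repo's `SinusoidalPosEmb` and `MLP` helpers (the actual definitions in dalle2_pytorch may differ):

```python
import math
import torch
from torch import nn
from einops.layers.torch import Rearrange

class SinusoidalPosEmb(nn.Module):
    # standard DDPM-style sinusoidal embedding of a scalar timestep
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        half_dim = self.dim // 2
        freqs = torch.exp(torch.arange(half_dim) * -(math.log(10000) / (half_dim - 1)))
        args = t.float()[:, None] * freqs[None, :]
        return torch.cat((args.sin(), args.cos()), dim = -1)

def MLP(dim_in, dim_out):
    # simplified two-layer stand-in for the repo's MLP helper
    return nn.Sequential(nn.Linear(dim_in, dim_out), nn.SiLU(), nn.Linear(dim_out, dim_out))

dim = 64
times = torch.rand(4) * 1000  # continuous timesteps, batch of 4

old_emb = nn.Sequential(Rearrange('b -> b 1'), MLP(1, dim))    # raw scalar into the MLP
new_emb = nn.Sequential(SinusoidalPosEmb(dim), MLP(dim, dim))  # sinusoidal features into the MLP
print(old_emb(times).shape, new_emb(times).shape)  # both torch.Size([4, 64])
```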
@@ -807,6 +807,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
         condition_on_text_encodings = True, # the paper suggests this is needed, but you can turn it off for your CLIP preprocessed text embed -> image embed training
         sampling_clamp_l2norm = False,
         training_clamp_l2norm = False,
+        init_image_embed_l2norm = False,
         image_embed_scale = None, # this is for scaling the l2-normed image embedding, so it is more suitable for gaussian diffusion, as outlined by Katherine (@crowsonkb) https://github.com/lucidrains/DALLE2-pytorch/issues/60#issue-1226116132
         clip_adapter_overrides = dict()
     ):
@@ -845,6 +846,7 @@ class DiffusionPrior(BaseGaussianDiffusion):
         # whether to force an l2norm, similar to clipping denoised, when sampling
         self.sampling_clamp_l2norm = sampling_clamp_l2norm
         self.training_clamp_l2norm = training_clamp_l2norm
+        self.init_image_embed_l2norm = init_image_embed_l2norm

     def p_mean_variance(self, x, t, text_cond, clip_denoised: bool):
         pred = self.net(x, t, **text_cond)
@@ -879,11 +881,16 @@ class DiffusionPrior(BaseGaussianDiffusion):
         device = self.betas.device

         b = shape[0]
-        img = torch.randn(shape, device=device)
+        image_embed = torch.randn(shape, device=device)
+
+        if self.init_image_embed_l2norm:
+            image_embed = l2norm(image_embed) * self.image_embed_scale

         for i in tqdm(reversed(range(0, self.num_timesteps)), desc='sampling loop time step', total=self.num_timesteps):
-            img = self.p_sample(img, torch.full((b,), i, device = device, dtype = torch.long), text_cond = text_cond)
-        return img
+            times = torch.full((b,), i, device = device, dtype = torch.long)
+            image_embed = self.p_sample(image_embed, times, text_cond = text_cond)
+
+        return image_embed

     def p_losses(self, image_embed, times, text_cond, noise = None):
         noise = default(noise, lambda: torch.randn_like(image_embed))
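The effect of the new `init_image_embed_l2norm` flag: instead of starting the reverse diffusion from raw gaussian noise, sampling starts from noise projected onto the same scaled l2 sphere that the image embeddings live on (see the `image_embed_scale` comment above). A minimal sketch of the branch, assuming `l2norm` is a thin wrapper around `F.normalize` and picking sqrt(dim) as a plausible scale; both are assumptions, not confirmed by this diff:

```python
import torch
import torch.nn.functional as F

def l2norm(t):
    # assumed helper: unit-normalize along the feature dimension
    return F.normalize(t, dim = -1)

dim = 512                          # hypothetical image embedding dimension
image_embed_scale = dim ** 0.5     # assumed scale, in the spirit of the linked issue
image_embed = torch.randn(4, dim)  # initial noise, as in p_sample_loop

# the new branch: begin sampling on the scaled unit sphere rather than at raw noise
image_embed = l2norm(image_embed) * image_embed_scale
print(image_embed.norm(dim = -1))  # each row now has norm sqrt(512) ~= 22.63
```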
setup.py (2 changes)
@@ -10,7 +10,7 @@ setup(
       'dream = dalle2_pytorch.cli:dream'
     ],
   },
-  version = '0.1.6',
+  version = '0.1.8',
   license='MIT',
   description = 'DALL-E 2',
   author = 'Phil Wang',
@@ -46,28 +46,60 @@ def save_model(save_path, state_dict):
     print("====================================== Saving checkpoint ======================================")
     torch.save(state_dict, save_path+'/'+str(time.time())+'_saved_model.pth')

-def report_cosine_sims(diffusion_prior,image_reader,text_reader,train_set_size,val_set_size,NUM_TEST_EMBEDDINGS,device):
+
+def report_cosine_sims(diffusion_prior, image_reader, text_reader, train_set_size, val_set_size, NUM_TEST_EMBEDDINGS, device):
     cos = nn.CosineSimilarity(dim=1, eps=1e-6)

     tstart = train_set_size+val_set_size
     tend = train_set_size+val_set_size+NUM_TEST_EMBEDDINGS

-    for embt, embi in zip(text_reader(batch_size = NUM_TEST_EMBEDDINGS, start=tstart, end = tend),image_reader(batch_size = NUM_TEST_EMBEDDINGS, start=tstart, end = tend)):
+    for embt, embi in zip(text_reader(batch_size=NUM_TEST_EMBEDDINGS, start=tstart, end=tend), image_reader(batch_size=NUM_TEST_EMBEDDINGS, start=tstart, end=tend)):
+        # make a copy of the text embeddings for shuffling
         text_embed = torch.tensor(embt[0]).to(device)
-        text_embed = text_embed / text_embed.norm(dim=1, keepdim=True)
-        test_text_cond = dict(text_embed = text_embed)
+        text_embed_shuffled = text_embed.clone()
+
+        # roll the text embeddings to simulate "unrelated" captions
+        rolled_idx = torch.roll(torch.arange(NUM_TEST_EMBEDDINGS), 1)
+        text_embed_shuffled = text_embed_shuffled[rolled_idx]
+        text_embed_shuffled = text_embed_shuffled / \
+            text_embed_shuffled.norm(dim=1, keepdim=True)
+        test_text_shuffled_cond = dict(text_embed=text_embed_shuffled)
+
+        # prepare the text embedding
+        text_embed = text_embed / text_embed.norm(dim=1, keepdim=True)
+        test_text_cond = dict(text_embed=text_embed)

+        # prepare image embeddings
         test_image_embeddings = torch.tensor(embi[0]).to(device)
-        test_image_embeddings = test_image_embeddings / test_image_embeddings.norm(dim=1, keepdim=True)
+        test_image_embeddings = test_image_embeddings / \
+            test_image_embeddings.norm(dim=1, keepdim=True)

-        predicted_image_embeddings = diffusion_prior.p_sample_loop((NUM_TEST_EMBEDDINGS, 768), text_cond = test_text_cond)
-        predicted_image_embeddings = predicted_image_embeddings / predicted_image_embeddings.norm(dim=1, keepdim=True)
+        # predict on the unshuffled text embeddings
+        predicted_image_embeddings = diffusion_prior.p_sample_loop(
+            (NUM_TEST_EMBEDDINGS, 768), text_cond=test_text_cond)
+        predicted_image_embeddings = predicted_image_embeddings / \
+            predicted_image_embeddings.norm(dim=1, keepdim=True)

-        original_similarity = cos(text_embed,test_image_embeddings).cpu().numpy()
-        predicted_similarity = cos(text_embed,predicted_image_embeddings).cpu().numpy()
+        # predict on the shuffled embeddings
+        predicted_unrelated_embeddings = diffusion_prior.p_sample_loop(
+            (NUM_TEST_EMBEDDINGS, 768), text_cond=test_text_shuffled_cond)
+        predicted_unrelated_embeddings = predicted_unrelated_embeddings / \
+            predicted_unrelated_embeddings.norm(dim=1, keepdim=True)
+
+        # calculate similarities
+        original_similarity = cos(
+            text_embed, test_image_embeddings).cpu().numpy()
+        predicted_similarity = cos(
+            text_embed, predicted_image_embeddings).cpu().numpy()
+        unrelated_similarity = cos(
+            text_embed, predicted_unrelated_embeddings).cpu().numpy()

-        wandb.log({"CosineSimilarity(text_embed,image_embed)": np.mean(original_similarity)})
-        wandb.log({"CosineSimilarity(text_embed,predicted_image_embed)":np.mean(predicted_similarity)})
+        wandb.log(
+            {"CosineSimilarity(text_embed,image_embed)": np.mean(original_similarity)})
+        wandb.log({"CosineSimilarity(text_embed,predicted_image_embed)": np.mean(
+            predicted_similarity)})
+        wandb.log({"CosineSimilarity(text_embed,predicted_unrelated_embed)": np.mean(
+            unrelated_similarity)})

         return np.mean(predicted_similarity - original_similarity)
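The substance of this rewrite is a negative control: the text embeddings are rolled one position so each image embedding is paired with another sample's caption, and the prior is sampled under both the matched and the mismatched conditioning. If the prior actually uses its text conditioning, CosineSimilarity(text_embed, predicted_unrelated_embed) should come out clearly below the matched version. A toy illustration of the roll trick (names are local to this example):

```python
import torch

NUM_TEST_EMBEDDINGS = 4  # tiny batch, just for illustration
text_embed = torch.randn(NUM_TEST_EMBEDDINGS, 8)

# shift every index right by one, wrapping the last around to the front
rolled_idx = torch.roll(torch.arange(NUM_TEST_EMBEDDINGS), 1)
print(rolled_idx)  # tensor([3, 0, 1, 2])

# every row is now some other sample's embedding, so no pair is "related"
text_embed_shuffled = text_embed[rolled_idx]
assert not torch.allclose(text_embed_shuffled, text_embed)
```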