diff --git a/README.md b/README.md index d7d731f..182b9e8 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Todo ## Todo - [x] finish off gaussian diffusion class for latent embedding - allow for prediction of epsilon -- [ ] add what was proposed in the paper, where DDPM objective for image latent embedding predicts x0 directly (reread vq-diffusion paper and get caught up on that line of work) +- [x] add what was proposed in the paper, where DDPM objective for image latent embedding predicts x0 directly (reread vq-diffusion paper and get caught up on that line of work) - [ ] make sure it works end to end to produce an output tensor, taking a single gradient step - [ ] augment unet so that it can also be conditioned on text encodings (although in paper they hinted this didn't make much a difference) - [ ] look into Jonathan Ho's cascading DDPM for the decoder, as that seems to be what they are using. get caught up on DDPM literature diff --git a/dalle2_pytorch/dalle2_pytorch.py b/dalle2_pytorch/dalle2_pytorch.py index 42be949..53eb9db 100644 --- a/dalle2_pytorch/dalle2_pytorch.py +++ b/dalle2_pytorch/dalle2_pytorch.py @@ -363,7 +363,12 @@ class DiffusionPrior(nn.Module): return posterior_mean, posterior_variance, posterior_log_variance_clipped def p_mean_variance(self, x, t, text_cond, clip_denoised: bool): - x_recon = self.predict_start_from_noise(x, t = t, noise = self.net(x, t, **text_cond)) + if self.predict_x0: + x_recon = self.net(x, t, **text_cond) + # NOTE: not 100% sure the line above is correct - if you know how to do this properly, please open a GitHub issue or a pull request + # I'll be rereading https://arxiv.org/abs/2111.14822, where I think a similar approach is taken + else: + x_recon = self.predict_start_from_noise(x, t = t, noise = self.net(x, t, **text_cond)) if clip_denoised: x_recon.clamp_(-1., 1.)