Mirror of https://github.com/lucidrains/DALLE2-pytorch.git, synced 2026-02-13 12:04:24 +01:00

Compare commits (3 commits):

- 3070610231
- 870aeeca62
- f28dc6dc01
.github/workflows/ci.yml (vendored, new file, 33 lines)

```yaml
name: Continuous integration

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.8]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install
        run: |
          python3 -m venv .env
          source .env/bin/activate
          make install
      - name: Tests
        run: |
          source .env/bin/activate
          make test
```
.gitignore (vendored, 2 lines added)

```diff
@@ -136,3 +136,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+.tracker_data
+*.pth
```
Makefile (new file, 6 lines)

```make
install:
	pip install -U pip
	pip install -e .

test:
	CUDA_VISIBLE_DEVICES= python train_decoder.py --config_file configs/train_decoder_config.test.json
```
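The `test` target clears `CUDA_VISIBLE_DEVICES` so the run cannot see any GPU, which matches the `"device": "cpu"` setting in the test config below. A minimal sketch (not from the repo) of what that environment variable does from PyTorch's point of view:

```python
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide all GPUs, as the Makefile's test target does

import torch

print(torch.cuda.is_available())  # False: the smoke test runs entirely on CPU
```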
configs/train_decoder_config.test.json (new file, 102 lines)

```json
{
    "decoder": {
        "unets": [
            {
                "dim": 16,
                "image_embed_dim": 768,
                "cond_dim": 16,
                "channels": 3,
                "dim_mults": [1, 2, 4, 8],
                "attn_dim_head": 16,
                "attn_heads": 4,
                "self_attn": [false, true, true, true]
            }
        ],
        "clip": {
            "make": "openai",
            "model": "ViT-L/14"
        },

        "timesteps": 10,
        "image_sizes": [64],
        "channels": 3,
        "loss_type": "l2",
        "beta_schedule": ["cosine"],
        "learned_variance": true
    },
    "data": {
        "webdataset_base_url": "test_data/{}.tar",
        "num_workers": 4,
        "batch_size": 4,
        "start_shard": 0,
        "end_shard": 9,
        "shard_width": 1,
        "index_width": 1,
        "splits": {
            "train": 0.75,
            "val": 0.15,
            "test": 0.1
        },
        "shuffle_train": false,
        "resample_train": true,
        "preprocessing": {
            "RandomResizedCrop": {
                "size": [64, 64],
                "scale": [0.75, 1.0],
                "ratio": [1.0, 1.0]
            },
            "ToTensor": true
        }
    },
    "train": {
        "epochs": 1,
        "lr": 1e-16,
        "wd": 0.01,
        "max_grad_norm": 0.5,
        "save_every_n_samples": 100,
        "n_sample_images": 1,
        "device": "cpu",
        "epoch_samples": 50,
        "validation_samples": 5,
        "use_ema": true,
        "ema_beta": 0.99,
        "amp": false,
        "save_all": false,
        "save_latest": true,
        "save_best": true,
        "unet_training_mask": [true]
    },
    "evaluate": {
        "n_evaluation_samples": 2,
        "FID": {
            "feature": 64
        },
        "IS": {
            "feature": 64,
            "splits": 10
        },
        "KID": {
            "feature": 64,
            "subset_size": 2
        },
        "LPIPS": {
            "net_type": "vgg",
            "reduction": "mean"
        }
    },
    "tracker": {
        "overwrite_data_path": true,

        "log": {
            "log_type": "console"
        },

        "load": {
            "load_from": null
        },

        "save": [{
            "save_to": "local"
        }]
    }
}
```
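A hedged sketch (helper code is illustrative, not from the repo) of how the `data` section of this test config maps onto the new shards under `test_data/`, assuming `shard_width` is the zero-padded width of the shard number substituted into `webdataset_base_url`, as the 0.tar to 9.tar test shards below suggest:

```python
import json

with open("configs/train_decoder_config.test.json") as f:
    config = json.load(f)

data = config["data"]
# build the shard list the dataloader is expected to read
shards = [
    data["webdataset_base_url"].format(str(i).zfill(data["shard_width"]))
    for i in range(data["start_shard"], data["end_shard"] + 1)
]
print(shards)  # ['test_data/0.tar', 'test_data/1.tar', ..., 'test_data/9.tar']
```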
Changes to the CLIP adapter classes:

```diff
@@ -169,6 +169,11 @@ class BaseClipAdapter(nn.Module):
         self.clip = clip
         self.overrides = kwargs
 
+    def validate_and_resize_image(self, image):
+        image_size = image.shape[-1]
+        assert image_size >= self.image_size, f'you are passing in an image of size {image_size} but CLIP requires the image size to be at least {self.image_size}'
+        return resize_image_to(image, self.image_size)
+
     @property
     def dim_latent(self):
         raise NotImplementedError
```
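A minimal standalone sketch of what the new `validate_and_resize_image` step does, assuming a CLIP input size of 224 and using `F.interpolate` as a stand-in for the repo's `resize_image_to` helper:

```python
import torch
import torch.nn.functional as F

def validate_and_resize_image(image, clip_image_size=224):
    # reject images smaller than what CLIP expects, then downsample to its input size
    image_size = image.shape[-1]
    assert image_size >= clip_image_size, \
        f'image size {image_size} is below the CLIP input size {clip_image_size}'
    return F.interpolate(image, size=(clip_image_size, clip_image_size),
                         mode='bilinear', align_corners=False)

x = torch.randn(1, 3, 256, 256)
print(validate_and_resize_image(x).shape)  # torch.Size([1, 3, 224, 224])
# a 64x64 input would now fail loudly instead of being silently upsampled
```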
```diff
@@ -219,7 +224,7 @@ class XClipAdapter(BaseClipAdapter):
 
     @torch.no_grad()
     def embed_image(self, image):
-        image = resize_image_to(image, self.image_size)
+        image = self.validate_and_resize_image(image)
         encoder_output = self.clip.visual_transformer(image)
         image_cls, image_encodings = encoder_output[:, 0], encoder_output[:, 1:]
         image_embed = self.clip.to_visual_latent(image_cls)
@@ -254,7 +259,7 @@ class CoCaAdapter(BaseClipAdapter):
 
     @torch.no_grad()
     def embed_image(self, image):
-        image = resize_image_to(image, self.image_size)
+        image = self.validate_and_resize_image(image)
         image_embed, image_encodings = self.clip.embed_image(image)
         return EmbeddedImage(image_embed, image_encodings)
@@ -315,7 +320,7 @@ class OpenAIClipAdapter(BaseClipAdapter):
     @torch.no_grad()
     def embed_image(self, image):
         assert not self.cleared
-        image = resize_image_to(image, self.image_size)
+        image = self.validate_and_resize_image(image)
         image = self.clip_normalize(image)
         image_embed = self.clip.encode_image(image)
         return EmbeddedImage(l2norm(image_embed.float()), None)
```
Version bump:

```diff
@@ -1 +1 @@
-__version__ = '0.17.0'
+__version__ = '0.17.1'
```
New binary test shards (contents not shown): test_data/0.tar through test_data/9.tar.
Changes to the sample-generation helpers:

```diff
@@ -132,7 +132,7 @@ def get_example_data(dataloader, device, n=5):
             break
     return list(zip(images[:n], img_embeddings[:n], text_embeddings[:n], captions[:n]))
 
-def generate_samples(trainer, example_data, condition_on_text_encodings=False, text_prepend=""):
+def generate_samples(trainer, example_data, condition_on_text_encodings=False, text_prepend="", match_image_size=True):
     """
     Takes example data and generates images from the embeddings
     Returns three lists: real images, generated images, and captions
@@ -160,6 +160,9 @@ def generate_samples(trainer, example_data, condition_on_text_encodings=False, t
     samples = trainer.sample(**sample_params)
     generated_images = list(samples)
     captions = [text_prepend + txt for txt in txts]
+    if match_image_size:
+        generated_image_size = generated_images[0].shape[-1]
+        real_images = [resize_image_to(image, generated_image_size, clamp_range=(0, 1)) for image in real_images]
     return real_images, generated_images, captions
 
 def generate_grid_samples(trainer, examples, condition_on_text_encodings=False, text_prepend=""):
@@ -167,14 +170,6 @@ def generate_grid_samples(trainer, examples, condition_on_text_encodings=False,
     Generates samples and uses torchvision to put them in a side by side grid for easy viewing
     """
     real_images, generated_images, captions = generate_samples(trainer, examples, condition_on_text_encodings, text_prepend)
-
-    real_image_size = real_images[0].shape[-1]
-    generated_image_size = generated_images[0].shape[-1]
-
-    # training images may be larger than the generated one
-    if real_image_size > generated_image_size:
-        real_images = [resize_image_to(image, generated_image_size) for image in real_images]
-
     grid_images = [torchvision.utils.make_grid([original_image, generated_image]) for original_image, generated_image in zip(real_images, generated_images)]
     return grid_images, captions
```
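A small illustrative sketch (assumed tensors and helpers, not the repo's code) of why `generate_samples` now resizes the real images itself: `torchvision.utils.make_grid` needs each real/generated pair to share a spatial size, and clamping to (0, 1) keeps the resized training image in a valid display range:

```python
import torch
import torch.nn.functional as F
import torchvision

real = torch.rand(3, 256, 256)     # a training image, larger than the decoder output
generated = torch.rand(3, 64, 64)  # decoder sample at image_sizes[0] = 64

# shrink the real image to the generated size, as match_image_size=True now does
size = generated.shape[-1]
real_small = F.interpolate(real.unsqueeze(0), size=(size, size),
                           mode='bilinear', align_corners=False).squeeze(0).clamp(0, 1)

grid = torchvision.utils.make_grid([real_small, generated])  # side-by-side pair
print(grid.shape)
```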