first commit

2025-12-17 06:14:22 +01:00 · 2024-12-11 18:46:36 +08:00
parent 9e65255d34
commit 27f2eb7dc3
847 changed files with 377076 additions and 2 deletions
--- a/configs/sd-turbo-sr-ldis.yaml
+++ b/configs/sd-turbo-sr-ldis.yaml
@@ -0,0 +1,264 @@
+trainer:
+  target: trainer.TrainerSDTurboSR
+
+sd_pipe:
+  target: diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline
+  num_train_steps: 1000
+  enable_grad_checkpoint: True
+  compile: False
+  vae_split: 8
+  params:
+    pretrained_model_name_or_path: stabilityai/sd-turbo
+    cache_dir: weights
+    use_safetensors: True
+    torch_dtype: torch.float16
+
+llpips:
+  target: latent_lpips.lpips.LPIPS
+  ckpt_path: weights/vgg16_sdturbo_lpips.pth
+  compile: False
+  params:
+    pretrained: False
+    net: vgg16
+    lpips: True
+    spatial: False
+    pnet_rand: False
+    pnet_tune: True
+    use_dropout: True
+    eval_mode: True
+    latent: True
+    in_chans: 4
+    verbose: True
+
+model:
+  target: diffusers.models.autoencoders.NoisePredictor
+  ckpt_start_path: ~     # only used for training the intermidiate model
+  ckpt_path: ~           # For initializing
+  compile: False
+  params:
+    in_channels: 3
+    down_block_types:
+      - AttnDownBlock2D
+      - AttnDownBlock2D
+    up_block_types:
+      - AttnUpBlock2D
+      - AttnUpBlock2D
+    block_out_channels:
+      - 256      # 192, 256
+      - 512      # 384, 512
+    layers_per_block: 
+      - 3
+      - 3
+    act_fn: silu
+    latent_channels: 4
+    norm_num_groups: 32
+    sample_size: 128
+    mid_block_add_attention: True
+    resnet_time_scale_shift: default
+    temb_channels: 512
+    attention_head_dim: 64 
+    freq_shift: 0
+    flip_sin_to_cos: True
+    double_z: True
+
+discriminator:
+  target: diffusers.models.unets.unet_2d_condition_discriminator.UNet2DConditionDiscriminator
+  enable_grad_checkpoint: True
+  compile: False
+  params:
+    sample_size: 64
+    in_channels: 4
+    center_input_sample: False
+    flip_sin_to_cos: True
+    freq_shift: 0
+    down_block_types:
+      - DownBlock2D
+      - CrossAttnDownBlock2D
+      - CrossAttnDownBlock2D
+    mid_block_type: UNetMidBlock2DCrossAttn
+    up_block_types:
+      - CrossAttnUpBlock2D
+      - CrossAttnUpBlock2D
+      - UpBlock2D
+    only_cross_attention: False
+    block_out_channels:
+      - 128
+      - 256
+      - 512
+    layers_per_block:
+      - 1
+      - 2
+      - 2
+    downsample_padding: 1
+    mid_block_scale_factor: 1
+    dropout: 0.0
+    act_fn: silu
+    norm_num_groups: 32
+    norm_eps: 1e-5
+    cross_attention_dim: 1024
+    transformer_layers_per_block: 1
+    reverse_transformer_layers_per_block: ~
+    encoder_hid_dim: ~
+    encoder_hid_dim_type: ~
+    attention_head_dim:
+      - 8 
+      - 16 
+      - 16 
+    num_attention_heads: ~
+    dual_cross_attention: False
+    use_linear_projection: False
+    class_embed_type: ~
+    addition_embed_type: text 
+    addition_time_embed_dim: 256
+    num_class_embeds: ~
+    upcast_attention: ~
+    resnet_time_scale_shift: default
+    resnet_skip_time_act: False
+    resnet_out_scale_factor: 1.0
+    time_embedding_type: positional
+    time_embedding_dim: ~
+    time_embedding_act_fn: ~
+    timestep_post_act: ~
+    time_cond_proj_dim: ~
+    conv_in_kernel: 3
+    conv_out_kernel: 3
+    projection_class_embeddings_input_dim: 2560
+    attention_type: default
+    class_embeddings_concat: False
+    mid_block_only_cross_attention: ~
+    cross_attention_norm: ~
+    addition_embed_type_num_heads: 64
+
+degradation:
+  sf: 4
+  # the first degradation process
+  resize_prob: [0.2, 0.7, 0.1]  # up, down, keep
+  resize_range: [0.15, 1.5]
+  gaussian_noise_prob: 0.5
+  noise_range: [1, 30]
+  poisson_scale_range: [0.05, 3.0]
+  gray_noise_prob: 0.4
+  jpeg_range: [30, 95]
+
+  # the second degradation process
+  second_order_prob: 0.5
+  second_blur_prob: 0.8
+  resize_prob2: [0.3, 0.4, 0.3]  # up, down, keep
+  resize_range2: [0.3, 1.2]
+  gaussian_noise_prob2: 0.5
+  noise_range2: [1, 25]
+  poisson_scale_range2: [0.05, 2.5]
+  gray_noise_prob2: 0.4
+  jpeg_range2: [30, 95]
+
+  gt_size: 512 
+  resize_back: False
+  use_sharp: False
+
+data:
+  train:
+    type: realesrgan
+    params:
+      data_source: 
+        source1:
+          root_path: /mnt/sfs-common/zsyue/database/FFHQ
+          image_path: images1024
+          moment_path: ~
+          text_path: ~
+          im_ext: png
+          length: 20000
+        source2:
+          root_path: /mnt/sfs-common/zsyue/database/LSDIR/train
+          image_path: images 
+          moment_path: ~
+          text_path: ~
+          im_ext: png
+      max_token_length: 77   # 77
+      io_backend:
+        type: disk
+      blur_kernel_size: 21
+      kernel_list: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
+      kernel_prob: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
+      sinc_prob: 0.1
+      blur_sigma: [0.2, 3.0]
+      betag_range: [0.5, 4.0]
+      betap_range: [1, 2.0]
+
+      blur_kernel_size2: 15
+      kernel_list2: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
+      kernel_prob2: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
+      sinc_prob2: 0.1
+      blur_sigma2: [0.2, 1.5]
+      betag_range2: [0.5, 4.0]
+      betap_range2: [1, 2.0]
+
+      final_sinc_prob: 0.8
+
+      gt_size: ${degradation.gt_size}
+      use_hflip: True
+      use_rot: False
+      random_crop: True
+  val:
+    type: base
+    params:
+      dir_path: /mnt/sfs-common/zsyue/projects/DifInv/SR/testingdata/imagenet512/lq
+      transform_type: default
+      transform_kwargs:
+          mean: 0.0
+          std: 1.0
+      extra_dir_path: /mnt/sfs-common/zsyue/projects/DifInv/SR/testingdata/imagenet512/gt
+      extra_transform_type: default
+      extra_transform_kwargs:
+          mean: 0.0
+          std: 1.0
+      im_exts: png
+      length: 16
+      recursive: False
+
+train:
+  # predict started inverser
+  start_mode: True
+  # learning rate
+  lr: 5e-5                      # learning rate 
+  lr_min: 5e-5                  # learning rate 
+  lr_schedule: ~
+  warmup_iterations: 2000
+  # discriminator
+  lr_dis: 5e-5                  # learning rate for dicriminator
+  weight_decay_dis: 1e-3        # weight decay for dicriminator
+  dis_init_iterations: 10000    # iterations used for updating the discriminator
+  dis_update_freq: 1            
+  # dataloader
+  batch: 64               
+  microbatch: 16 
+  num_workers: 4
+  prefetch_factor: 2            
+  use_text: True
+  # optimization settings
+  weight_decay: 0               
+  ema_rate: 0.999
+  iterations: 200000            # total iterations
+  # logging
+  save_freq: 5000
+  log_freq: [200, 5000]         # [training loss, training images, val images]
+  local_logging: True           # manually save images
+  tf_logging: False             # tensorboard logging
+  # loss 
+  loss_type: L2
+  loss_coef:
+    ldif: 1.0
+  timesteps: [200, 100]
+  num_inference_steps: 5
+  # mixed precision
+  use_amp: True                
+  use_fsdp: False                
+  # random seed 
+  seed: 123456                 
+  global_seeding: False
+  noise_detach: False
+
+validate:
+  batch: 2
+  use_ema: True            
+  log_freq: 4      # logging frequence
+  val_y_channel: True