Adds SV4D code

2026-01-06 23:24:23 +01:00 · 2024-07-23 20:17:16 +00:00
parent fbdc58cab9
commit abe9ed3d40
16 changed files with 3174 additions and 23 deletions
--- a/scripts/sampling/configs/sv4d.yaml
+++ b/scripts/sampling/configs/sv4d.yaml
@@ -0,0 +1,208 @@
+N_TIME: 5  # interpolated as n_cond_frames of the cond_frames_without_noise and cond_motion embedders
+N_VIEW: 8  # interpolated as n_cond_frames of the cond_view embedder
+N_FRAMES: 40  # interpolated for cond_frames and the guider's num_frames; equals N_TIME * N_VIEW (5 * 8) -- presumably must be kept in sync by hand, confirm
+
+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    ckpt_path: checkpoints/sv4d.safetensors  # relative path
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    en_and_decode_n_samples_a_time: 7
+
+    denoiser_config:  # selects the Denoiser class and the scaling class passed to it
+      target: sgm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:  # params below are grouped by concern rather than alphabetically
+      target: sgm.modules.diffusionmodules.video_model.SpatialUNetModelWithTime
+      params:
+        in_channels: 8  # -- channels and block layout --
+        out_channels: 4
+        model_channels: 320
+        channel_mult: [1, 2, 4, 4]
+        num_res_blocks: 2
+        attention_resolutions: [4, 2, 1]  # -- attention / transformer --
+        num_head_channels: 64
+        transformer_depth: 1
+        context_dim: 1024  # -- conditioning --
+        adm_in_channels: 1280  # NOTE(review): 1280 = 512 + 512 + 256, the outdim values of the ConcatTimestepEmbedderND entries in emb_models -- presumably related, confirm
+        num_classes: sequential
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        spatial_transformer_attn_type: softmax-xformers
+        use_spatial_context: True
+        use_motion_attention: True
+        extra_ff_mix_layer: True
+        time_kernel_size: [3, 1, 1]  # -- time-mixing options --
+        time_block_merge_strategy: learned_with_images
+        time_block_merge_factor: 0.0
+        time_mix_legacy: False
+        replicate_time_mix_bug: True
+        use_checkpoint: False  # -- remaining flags --
+        legacy: False
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:  # one list entry per conditioning input_key
+
+        - input_key: cond_frames_without_noise
+          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+          is_trainable: False
+          params:
+            open_clip_embedding_config:  # inner embedder wrapped by the prediction embedder above
+              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+              params:
+                freeze: True
+            n_copies: 1
+            n_cond_frames: ${N_TIME}
+
+        - input_key: cond_frames  # the only embedder entry that also sets sigma_cond_config
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          is_trainable: False
+          params:
+            is_ae: True
+            n_cond_frames: ${N_FRAMES}
+            n_copies: 1
+            sigma_cond_config:
+              target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+              params:
+                outdim: 256
+            sigma_sampler_config:
+              target: sgm.modules.diffusionmodules.sigma_sampling.ZeroSampler
+            encoder_config:  # same autoencoder settings as the cond_view and cond_motion entries
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                lossconfig:
+                  target: torch.nn.Identity
+                ddconfig:
+                  in_channels: 3
+                  out_ch: 3
+                  resolution: 256
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  z_channels: 4
+                  double_z: True
+                  attn_resolutions: []
+                  attn_type: vanilla-xformers
+                  dropout: 0.0
+
+        # - input_key: cond_aug
+        #   is_trainable: False
+        #   target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+        #   params:
+        #     outdim: 256
+
+        - input_key: polar_rad  # configured identically to azimuth_rad apart from the input_key
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          is_trainable: False
+          params:
+            outdim: 512
+
+        - input_key: azimuth_rad
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          is_trainable: False
+          params:
+            outdim: 512
+
+        - input_key: cond_view  # also listed in the guider's additional_cond_keys
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          is_trainable: False
+          params:
+            is_ae: True
+            n_cond_frames: ${N_VIEW}
+            n_copies: 1
+            sigma_sampler_config:
+              target: sgm.modules.diffusionmodules.sigma_sampling.ZeroSampler
+            encoder_config:  # same autoencoder settings as the cond_frames and cond_motion entries
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                lossconfig:
+                  target: torch.nn.Identity
+                ddconfig:
+                  in_channels: 3
+                  out_ch: 3
+                  resolution: 256
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  z_channels: 4
+                  double_z: True
+                  attn_resolutions: []
+                  attn_type: vanilla-xformers
+                  dropout: 0.0
+
+        - input_key: cond_motion  # also listed in the guider's additional_cond_keys
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          is_trainable: False
+          params:
+            is_ae: True
+            n_cond_frames: ${N_TIME}
+            n_copies: 1
+            sigma_sampler_config:
+              target: sgm.modules.diffusionmodules.sigma_sampling.ZeroSampler
+            encoder_config:  # same autoencoder settings as the cond_frames and cond_view entries
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                lossconfig:
+                  target: torch.nn.Identity
+                ddconfig:
+                  in_channels: 3
+                  out_ch: 3
+                  resolution: 256
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  z_channels: 4
+                  double_z: True
+                  attn_resolutions: []
+                  attn_type: vanilla-xformers
+                  dropout: 0.0
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencodingEngine
+      params:
+        encoder_config:  # Identity here; the AutoencoderKLModeOnly encoders sit under the emb_models entries
+          target: torch.nn.Identity
+        decoder_config:
+          target: sgm.modules.diffusionmodules.model.Decoder
+          params:  # same values as the ddconfig mappings of the emb_models encoders
+            in_channels: 3
+            out_ch: 3
+            resolution: 256
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            z_channels: 4
+            double_z: True
+            attn_resolutions: []
+            attn_type: vanilla-xformers
+            dropout: 0.0
+        regularizer_config:
+          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+        loss_config:
+          target: torch.nn.Identity
+
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            num_frames: ${N_FRAMES}  # same interpolation as the cond_frames embedder's n_cond_frames
+            max_scale: 2.5
+            additional_cond_keys: [cond_view, cond_motion]  # match two input_key values in emb_models
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 500.0