# Mirror of https://github.com/Stability-AI/generative-models.git
# (synced 2025-12-24 08:44:20 +01:00).
# Listing metadata for the mirrored file: 207 lines, 6.8 KiB, YAML, executable file.
# Shared constants, referenced further down in this file through ${...} interpolation.

# Used as n_cond_frames for the cond_frames_without_noise and cond_motion embedders.
N_TIME: 5
# Used as n_cond_frames for the cond_view embedder.
N_VIEW: 8
# Used as n_cond_frames for the cond_frames embedder and as num_frames for the guider.
# NOTE(review): 40 equals N_TIME * N_VIEW (5 * 8) — presumably these must be kept in sync; confirm.
N_FRAMES: 40
# Used as en_and_decode_n_samples_a_time for the cond_frames, cond_view and cond_motion embedders.
ENCODE_N_A_TIME: 8

# Model definition. Each component is described by a `target` (dotted Python path)
# and, where present, a sibling `params` mapping.
# NOTE(review): the nesting below was reconstructed from the target/params pairing,
# the `- input_key` list items and the blank-line grouping, because this copy of the
# file carried no indentation — verify against the upstream file before relying on it.
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    # NOTE(review): literal 7 here, while the embedders below use ${ENCODE_N_A_TIME} (8)
    # for the same key name — confirm the difference is intentional.
    en_and_decode_n_samples_a_time: 7
    disable_first_stage_autocast: True
    ckpt_path: checkpoints/sv4d.safetensors

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.SpatialUNetModelWithTime
      params:
        # NOTE(review): 1280 equals 256 + 512 + 512, the `outdim` values configured in
        # conditioner_config below — presumably these must agree; confirm.
        adm_in_channels: 1280
        attention_resolutions: [4, 2, 1]
        channel_mult: [1, 2, 4, 4]
        context_dim: 1024
        extra_ff_mix_layer: True
        in_channels: 8
        legacy: False
        model_channels: 320
        num_classes: sequential
        num_head_channels: 64
        num_res_blocks: 2
        out_channels: 4
        replicate_time_mix_bug: True
        spatial_transformer_attn_type: softmax-xformers
        time_block_merge_factor: 0.0
        time_block_merge_strategy: learned_with_images
        time_kernel_size: [3, 1, 1]
        time_mix_legacy: False
        transformer_depth: 1
        use_checkpoint: False
        use_linear_in_transformer: True
        use_spatial_context: True
        use_spatial_transformer: True
        use_motion_attention: True

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        # One list entry per conditioning input; every entry is marked is_trainable: False.
        emb_models:

          - input_key: cond_frames_without_noise
            target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
            is_trainable: False
            params:
              n_cond_frames: ${N_TIME}
              n_copies: 1
              open_clip_embedding_config:
                target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
                params:
                  freeze: True

          - input_key: cond_frames
            target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
            is_trainable: False
            params:
              is_ae: True
              n_cond_frames: ${N_FRAMES}
              n_copies: 1
              en_and_decode_n_samples_a_time: ${ENCODE_N_A_TIME}
              encoder_config:
                target: sgm.models.autoencoder.AutoencoderKLModeOnly
                params:
                  ddconfig:
                    attn_resolutions: []
                    attn_type: vanilla-xformers
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    double_z: True
                    dropout: 0.0
                    in_channels: 3
                    num_res_blocks: 2
                    out_ch: 3
                    resolution: 256
                    z_channels: 4
                  embed_dim: 4
                  lossconfig:
                    target: torch.nn.Identity
                  monitor: val/rec_loss
              sigma_cond_config:
                target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              sigma_sampler_config:
                target: sgm.modules.diffusionmodules.sigma_sampling.ZeroSampler

          - input_key: polar_rad
            is_trainable: False
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 512

          - input_key: azimuth_rad
            is_trainable: False
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 512

          - input_key: cond_view
            is_trainable: False
            target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
            params:
              encoder_config:
                target: sgm.models.autoencoder.AutoencoderKLModeOnly
                params:
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_resolutions: []
                    attn_type: vanilla-xformers
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    double_z: True
                    dropout: 0.0
                    in_channels: 3
                    num_res_blocks: 2
                    out_ch: 3
                    resolution: 256
                    z_channels: 4
                  lossconfig:
                    target: torch.nn.Identity
              is_ae: True
              n_cond_frames: ${N_VIEW}
              n_copies: 1
              en_and_decode_n_samples_a_time: ${ENCODE_N_A_TIME}
              sigma_sampler_config:
                target: sgm.modules.diffusionmodules.sigma_sampling.ZeroSampler

          - input_key: cond_motion
            is_trainable: False
            target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
            params:
              is_ae: True
              n_cond_frames: ${N_TIME}
              n_copies: 1
              en_and_decode_n_samples_a_time: ${ENCODE_N_A_TIME}
              encoder_config:
                target: sgm.models.autoencoder.AutoencoderKLModeOnly
                params:
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_resolutions: []
                    attn_type: vanilla-xformers
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    double_z: True
                    dropout: 0.0
                    in_channels: 3
                    num_res_blocks: 2
                    out_ch: 3
                    resolution: 256
                    z_channels: 4
                  lossconfig:
                    target: torch.nn.Identity
              sigma_sampler_config:
                target: sgm.modules.diffusionmodules.sigma_sampling.ZeroSampler

    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: torch.nn.Identity
        decoder_config:
          target: sgm.modules.diffusionmodules.model.Decoder
          params:
            attn_resolutions: []
            attn_type: vanilla-xformers
            ch: 128
            ch_mult: [1, 2, 4, 4]
            double_z: True
            dropout: 0.0
            in_channels: 3
            num_res_blocks: 2
            out_ch: 3
            resolution: 256
            z_channels: 4

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 500.0
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            max_scale: 2.5
            num_frames: ${N_FRAMES}
            # These names match the cond_view and cond_motion input_key entries above.
            additional_cond_keys: [cond_view, cond_motion]