sv4d: fix readme;

rename video example folder; add encoding_t as input parameter.
2026-01-09 08:34:25 +01:00 · 2024-08-02 17:19:03 +00:00
parent da40ebad4e
commit e90e953330
22 changed files with 43 additions and 29 deletions
--- a/scripts/demo/gradio_app_sv4d.py
+++ b/scripts/demo/gradio_app_sv4d.py
@@ -14,6 +14,7 @@ from huggingface_hub import hf_hub_download
 from typing import List, Optional, Union
 import torchvision

+from sgm.modules.encoders.modules import VideoPredictionEmbedderWithEncoder
 from scripts.demo.sv4d_helpers import (
    decode_latents,
    load_model,
@@ -138,6 +139,7 @@ sv3d_model = initial_model_load(sv3d_model)
 def sample_anchor(
    input_path: str = "assets/test_image.png",  # Can either be image file or folder with image files
    seed: Optional[int] = None,
+    encoding_t: int = 8,  # Number of frames encoded at a time! This eats most VRAM. Reduce if necessary.
    decoding_t: int = 4,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    num_steps: int = 20,
    sv3d_version: str = "sv3d_u",  # sv3d_u or sv3d_p
@@ -205,6 +207,10 @@ def sample_anchor(
    sv3d_file = os.path.join(output_folder, "t000.mp4")
    save_video(sv3d_file, images_t0.unsqueeze(1))
    
+    for emb in model.conditioner.embedders:
+        if isinstance(emb, VideoPredictionEmbedderWithEncoder):
+            emb.en_and_decode_n_samples_a_time = encoding_t
+    model.en_and_decode_n_samples_a_time = decoding_t
    # Initialize image matrix
    img_matrix = [[None] * n_views for _ in range(n_frames)]
    for i, v in enumerate(subsampled_views):
@@ -413,6 +419,13 @@ with gr.Blocks() as demo:
            maximum=100,
            step=1,
        )
+        encoding_t = gr.Slider(
+            label="Encode n frames at a time",
+            info="Number of frames encoded at a time! This eats most VRAM. Reduce if necessary.",
+            value=8,
+            minimum=1,
+            maximum=40,
+        )
        decoding_t = gr.Slider(
            label="Decode n frames at a time",
            info="Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.",
@@ -440,7 +453,7 @@ with gr.Blocks() as demo:

    generate_btn.click(
        fn=sample_anchor,
-        inputs=[input_video, seed, decoding_t, denoising_steps],
+        inputs=[input_video, seed, encoding_t, decoding_t, denoising_steps],
        outputs=[sv3d_video, anchor_video, anchor_frames],
        api_name="SV4D output (5 frames)",
    )
@@ -455,22 +468,22 @@ with gr.Blocks() as demo:
    examples = gr.Examples(
        fn=preprocess_video,
        examples=[
-            "./assets/sv4d_example_video/test_video1.mp4",
-            "./assets/sv4d_example_video/test_video2.mp4",
-            "./assets/sv4d_example_video/green_robot.mp4",
-            "./assets/sv4d_example_video/dolphin.mp4",
-            "./assets/sv4d_example_video/lucia_v000.mp4",
-            "./assets/sv4d_example_video/snowboard_v000.mp4",
-            "./assets/sv4d_example_video/stroller_v000.mp4",
-            "./assets/sv4d_example_video/human5.mp4",
-            "./assets/sv4d_example_video/bunnyman.mp4",
-            "./assets/sv4d_example_video/hiphop_parrot.mp4",
-            "./assets/sv4d_example_video/guppie_v0.mp4",
-            "./assets/sv4d_example_video/wave_hello.mp4",
-            "./assets/sv4d_example_video/pistol_v0.mp4",
-            "./assets/sv4d_example_video/human7.mp4",
-            "./assets/sv4d_example_video/monkey.mp4",
-            "./assets/sv4d_example_video/train_v0.mp4",
+            "./assets/sv4d_videos/test_video1.mp4",
+            "./assets/sv4d_videos/test_video2.mp4",
+            "./assets/sv4d_videos/green_robot.mp4",
+            "./assets/sv4d_videos/dolphin.mp4",
+            "./assets/sv4d_videos/lucia_v000.mp4",
+            "./assets/sv4d_videos/snowboard_v000.mp4",
+            "./assets/sv4d_videos/stroller_v000.mp4",
+            "./assets/sv4d_videos/human5.mp4",
+            "./assets/sv4d_videos/bunnyman.mp4",
+            "./assets/sv4d_videos/hiphop_parrot.mp4",
+            "./assets/sv4d_videos/guppie_v0.mp4",
+            "./assets/sv4d_videos/wave_hello.mp4",
+            "./assets/sv4d_videos/pistol_v0.mp4",
+            "./assets/sv4d_videos/human7.mp4",
+            "./assets/sv4d_videos/monkey.mp4",
+            "./assets/sv4d_videos/train_v0.mp4",
        ],
        inputs=[input_video],
        run_on_click=True,
--- a/scripts/demo/sv4d_helpers.py
+++ b/scripts/demo/sv4d_helpers.py
@@ -264,7 +264,7 @@ def preprocess_video(input_path, remove_bg=False, n_frames=21, W=576, H=576, out
        
        images_v0.append(image)
    
-    base_count = len(glob(os.path.join(output_folder, "*.mp4"))) // 10
+    base_count = len(glob(os.path.join(output_folder, "*.mp4"))) // 12
    processed_file = os.path.join(output_folder, f"{base_count:06d}_process_input.mp4")
    imageio.mimwrite(processed_file, images_v0, fps=10)
    return processed_file
@@ -892,7 +892,6 @@ def do_sample(
                unload_module_gpu(model.model)
                unload_module_gpu(model.denoiser)
                load_module_gpu(model.first_stage_model)
-                model.en_and_decode_n_samples_a_time = decoding_t
                if isinstance(model.first_stage_model.decoder, VideoDecoder):
                    samples_x = model.decode_first_stage(
                        samples_z, timesteps=default(decoding_t, T)
--- a/scripts/sampling/configs/sv4d.yaml
+++ b/scripts/sampling/configs/sv4d.yaml
@@ -1,7 +1,6 @@
 N_TIME: 5
 N_VIEW: 8
 N_FRAMES: 40
-ENCODE_N_A_TIME: 8

 model:
  target: sgm.models.diffusion.DiffusionEngine
@@ -68,7 +67,6 @@ model:
            is_ae: True
            n_cond_frames: ${N_FRAMES}
            n_copies: 1
-            en_and_decode_n_samples_a_time: ${ENCODE_N_A_TIME}
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
@@ -133,7 +131,6 @@ model:
            is_ae: True
            n_cond_frames: ${N_VIEW}
            n_copies: 1
-            en_and_decode_n_samples_a_time: ${ENCODE_N_A_TIME}
            sigma_sampler_config:
              target: sgm.modules.diffusionmodules.sigma_sampling.ZeroSampler

@@ -144,7 +141,6 @@ model:
            is_ae: True
            n_cond_frames: ${N_TIME}
            n_copies: 1
-            en_and_decode_n_samples_a_time: ${ENCODE_N_A_TIME}
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
--- a/scripts/sampling/simple_video_sample_4d.py
+++ b/scripts/sampling/simple_video_sample_4d.py
@@ -10,6 +10,7 @@ import numpy as np
 import torch
 from fire import Fire

+from sgm.modules.encoders.modules import VideoPredictionEmbedderWithEncoder
 from scripts.demo.sv4d_helpers import (
    decode_latents,
    load_model,
@@ -35,6 +36,7 @@ def sample(
    motion_bucket_id: int = 127,
    cond_aug: float = 1e-5,
    seed: int = 23,
+    encoding_t: int = 8,  # Number of frames encoded at a time! This eats most VRAM. Reduce if necessary.
    decoding_t: int = 4,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    elevations_deg: Optional[Union[float, List[float]]] = 10.0,
@@ -45,7 +47,7 @@ def sample(
 ):
    """
    Simple script to generate multiple novel-view videos conditioned on a video `input_path` or multiple frames, one for each
-    image file in folder `input_path`. If you run out of VRAM, try decreasing `decoding_t`.
+    image file in folder `input_path`. If you run out of VRAM, try decreasing `decoding_t` and `encoding_t`.
    """
    # Set model config
    T = 5  # number of frames per sample
@@ -162,6 +164,10 @@ def sample(
        verbose,
    )
    model = initial_model_load(model)
+    for emb in model.conditioner.embedders:
+        if isinstance(emb, VideoPredictionEmbedderWithEncoder):
+            emb.en_and_decode_n_samples_a_time = encoding_t
+    model.en_and_decode_n_samples_a_time = decoding_t

    # Interleaved sampling for anchor frames
    t0, v0 = 0, 0