Add audio sources

Refactor audio sources
Store the target audio source integer (one of the constants in android.media.MediaRecorder.AudioSource) in the AudioSource enum (or -1 if not relevant). This will simplify adding new audio sources.
2026-03-04 19:24:41 +01:00 · 2025-02-22 12:46:06 +01:00 · 2025-02-22 12:46:06 +01:00 · 2025-02-22 12:46:06 +01:00 · 2025-02-22 12:26:27 +01:00 · 2025-02-22 12:26:27 +01:00
12 changed files with 232 additions and 34 deletions
--- a/app/scrcpy.1
+++ b/app/scrcpy.1
@@ -67,13 +67,19 @@ The available encoders can be listed by \fB\-\-list\-encoders\fR.

 .TP
 .BI "\-\-audio\-source " source
-Select the audio source (output, mic or playback).
+Select the audio source. Possible values are:

-The "output" source forwards the whole audio output, and disables playback on the device.
-
-The "playback" source captures the audio playback (Android apps can opt-out, so the whole output is not necessarily captured).
-
-The "mic" source captures the microphone.
+ - "output": forwards the whole audio output, and disables playback on the device.
+ - "playback": captures the audio playback (Android apps can opt-out, so the whole output is not necessarily captured).
+ - "mic": captures the microphone.
+ - "mic-unprocessed": captures the microphone unprocessed (raw) sound.
+ - "mic-camcorder": captures the microphone tuned for video recording, with the same orientation as the camera if available.
+ - "mic-voice-recognition": captures the microphone tuned for voice recognition.
+ - "mic-voice-communication": captures the microphone tuned for voice communications (it will for instance take advantage of echo cancellation or automatic gain control if available).
+ - "voice-call": captures voice call.
+ - "voice-call-uplink": captures voice call uplink only.
+ - "voice-call-downlink": captures voice call downlink only.
+ - "voice-performance": captures audio meant to be processed for live performance (karaoke), includes both the microphone and the device playback.

 Default is output.

--- a/app/src/audio_regulator.c
+++ b/app/src/audio_regulator.c
@@ -76,8 +76,10 @@ sc_audio_regulator_pull(struct sc_audio_regulator *ar, uint8_t *out,
        // Wait until the buffer is filled up to at least target_buffering
        // before playing
        if (buffered_samples < ar->target_buffering) {
-            LOGV("[Audio] Inserting initial buffering silence: %" PRIu32
+#ifdef SC_AUDIO_REGULATOR_DEBUG
+            LOGD("[Audio] Inserting initial buffering silence: %" PRIu32
                 " samples", out_samples);
+#endif
            // Delay playback starting to reach the target buffering. Fill the
            // whole buffer with silence (len is small compared to the
            // arbitrary margin value).
@@ -98,8 +100,10 @@ sc_audio_regulator_pull(struct sc_audio_regulator *ar, uint8_t *out,
        // dropped to keep the latency minimal. However, this would cause very
        // audible glitches, so let the clock compensation restore the target
        // latency.
+#ifdef SC_AUDIO_REGULATOR_DEBUG
        LOGD("[Audio] Buffer underflow, inserting silence: %" PRIu32 " samples",
             silence);
+#endif
        memset(out + TO_BYTES(read), 0, TO_BYTES(silence));

        bool received = atomic_load_explicit(&ar->received,
@@ -137,6 +141,35 @@ bool
 sc_audio_regulator_push(struct sc_audio_regulator *ar, const AVFrame *frame) {
    SwrContext *swr_ctx = ar->swr_ctx;

+    uint32_t input_samples = frame->nb_samples;
+
+    assert(frame->pts >= 0);
+    int64_t pts = frame->pts;
+    if (ar->next_expected_pts && pts - ar->next_expected_pts > 100000) {
+        LOGV("[Audio] Discontinuity detected: %" PRIi64 "µs",
+             pts - ar->next_expected_pts);
+        // More than 100ms: consider it as a discontinuity
+        // (typically because silence packets were not captured)
+        uint32_t can_read = sc_audiobuf_can_read(&ar->buf);
+        if (input_samples + can_read < ar->target_buffering) {
+            // Adjust buffering to the target value directly
+            uint32_t silence = ar->target_buffering - can_read - input_samples;
+            sc_audiobuf_write_silence(&ar->buf, silence);
+        }
+
+        // Reset state
+        ar->avg_buffering.avg = ar->target_buffering;
+        int ret = swr_set_compensation(swr_ctx, 0, 0);
+        assert(!ret); // disabling compensation should never fail
+        ar->compensation_active = false;
+        ar->samples_since_resync = 0;
+        atomic_store_explicit(&ar->underflow, 0, memory_order_relaxed);
+    }
+
+    int64_t packet_duration = input_samples * INT64_C(1000000)
+                            / ar->sample_rate;
+    ar->next_expected_pts = pts + packet_duration;
+
    int64_t swr_delay = swr_get_delay(swr_ctx, ar->sample_rate);
    // No need to av_rescale_rnd(), input and output sample rates are the same.
    // Add more space (256) for clock compensation.
@@ -209,6 +242,7 @@ sc_audio_regulator_push(struct sc_audio_regulator *ar, const AVFrame *frame) {
    if (played) {
        underflow = atomic_exchange_explicit(&ar->underflow, 0,
                                             memory_order_relaxed);
+        ar->underflow_report += underflow;

        max_buffered_samples = ar->target_buffering * 11 / 10
                             + 60 * ar->sample_rate / 1000 /* 60 ms */;
@@ -255,7 +289,7 @@ sc_audio_regulator_push(struct sc_audio_regulator *ar, const AVFrame *frame) {
    }

    // Number of samples added (or removed, if negative) for compensation
-    int32_t instant_compensation = (int32_t) written - frame->nb_samples;
+    int32_t instant_compensation = (int32_t) written - input_samples;
    // Inserting silence instantly increases buffering
    int32_t inserted_silence = (int32_t) underflow;
    // Dropping input samples instantly decreases buffering
@@ -311,7 +345,9 @@ sc_audio_regulator_push(struct sc_audio_regulator *ar, const AVFrame *frame) {
        int abs_max_diff = distance / 50;
        diff = CLAMP(diff, -abs_max_diff, abs_max_diff);
        LOGV("[Audio] Buffering: target=%" PRIu32 " avg=%f cur=%" PRIu32
-             " compensation=%d", ar->target_buffering, avg, can_read, diff);
+             " compensation=%d (underflow=%" PRIu32 ")",
+             ar->target_buffering, avg, can_read, diff, ar->underflow_report);
+        ar->underflow_report = 0;

        int ret = swr_set_compensation(swr_ctx, diff, distance);
        if (ret < 0) {
@@ -394,7 +430,9 @@ sc_audio_regulator_init(struct sc_audio_regulator *ar, size_t sample_size,
    atomic_init(&ar->played, false);
    atomic_init(&ar->received, false);
    atomic_init(&ar->underflow, 0);
+    ar->underflow_report = 0;
    ar->compensation_active = false;
+    ar->next_expected_pts = 0;

    return true;

--- a/app/src/audio_regulator.h
+++ b/app/src/audio_regulator.h
@@ -46,6 +46,9 @@ struct sc_audio_regulator {
    // Number of silence samples inserted since the last received packet
    atomic_uint_least32_t underflow;

+    // Number of silence samples inserted since the last log
+    uint32_t underflow_report;
+
    // Non-zero compensation applied (only used by the receiver thread)
    bool compensation_active;

@@ -54,6 +57,9 @@ struct sc_audio_regulator {

    // Set to true the first time samples are pulled by the player
    atomic_bool played;
+
+    // PTS of the next expected packet (useful to detect discontinuities)
+    int64_t next_expected_pts;
 };

 bool
--- a/app/src/cli.c
+++ b/app/src/cli.c
@@ -217,13 +217,31 @@ static const struct sc_option options[] = {
        .longopt_id = OPT_AUDIO_SOURCE,
        .longopt = "audio-source",
        .argdesc = "source",
-        .text = "Select the audio source (output, mic or playback).\n"
-                "The \"output\" source forwards the whole audio output, and "
-                "disables playback on the device.\n"
-                "The \"playback\" source captures the audio playback (Android "
-                "apps can opt-out, so the whole output is not necessarily "
+        .text = "Select the audio source. Possible values are:\n"
+                " - \"output\": forwards the whole audio output, and disables "
+                "playback on the device.\n"
+                " - \"playback\": captures the audio playback (Android apps "
+                "can opt-out, so the whole output is not necessarily "
                "captured).\n"
-                "The \"mic\" source captures the microphone.\n"
+                " - \"mic\": captures the microphone.\n"
+                " - \"mic-unprocessed\": captures the microphone unprocessed "
+                "(raw) sound.\n"
+                " - \"mic-camcorder\": captures the microphone tuned for video "
+                "recording, with the same orientation as the camera if "
+                "available.\n"
+                " - \"mic-voice-recognition\": captures the microphone tuned "
+                "for voice recognition.\n"
+                " - \"mic-voice-communication\": captures the microphone tuned "
+                "for voice communications (it will for instance take advantage "
+                "of echo cancellation or automatic gain control if "
+                "available).\n"
+                " - \"voice-call\": captures voice call.\n"
+                " - \"voice-call-uplink\": captures voice call uplink only.\n"
+                " - \"voice-call-downlink\": captures voice call downlink "
+                "only.\n"
+                " - \"voice-performance\": captures audio meant to be "
+                "processed for live performance (karaoke), includes both the "
+                "microphone and the device playback.\n"
                "Default is output.",
    },
    {
@@ -2036,8 +2054,50 @@ parse_audio_source(const char *optarg, enum sc_audio_source *source) {
        return true;
    }

-    LOGE("Unsupported audio source: %s (expected output, mic or playback)",
-         optarg);
+    if (!strcmp(optarg, "mic-unprocessed")) {
+        *source = SC_AUDIO_SOURCE_MIC_UNPROCESSED;
+        return true;
+    }
+
+    if (!strcmp(optarg, "mic-camcorder")) {
+        *source = SC_AUDIO_SOURCE_MIC_CAMCORDER;
+        return true;
+    }
+
+    if (!strcmp(optarg, "mic-voice-recognition")) {
+        *source = SC_AUDIO_SOURCE_MIC_VOICE_RECOGNITION;
+        return true;
+    }
+
+    if (!strcmp(optarg, "mic-voice-communication")) {
+        *source = SC_AUDIO_SOURCE_MIC_VOICE_COMMUNICATION;
+        return true;
+    }
+
+    if (!strcmp(optarg, "voice-call")) {
+        *source = SC_AUDIO_SOURCE_VOICE_CALL;
+        return true;
+    }
+
+    if (!strcmp(optarg, "voice-call-uplink")) {
+        *source = SC_AUDIO_SOURCE_VOICE_CALL_UPLINK;
+        return true;
+    }
+
+    if (!strcmp(optarg, "voice-call-downlink")) {
+        *source = SC_AUDIO_SOURCE_VOICE_CALL_DOWNLINK;
+        return true;
+    }
+
+    if (!strcmp(optarg, "voice-performance")) {
+        *source = SC_AUDIO_SOURCE_VOICE_PERFORMANCE;
+        return true;
+    }
+
+    LOGE("Unsupported audio source: %s (expected output, mic, playback, "
+         "mic-unprocessed, mic-camcorder, mic-voice-recognition, "
+         "mic-voice-communication, voice-call, voice-call-uplink, "
+         "voice-call-downlink, voice-performance)", optarg);
    return false;
 }

--- a/app/src/options.h
+++ b/app/src/options.h
@@ -59,6 +59,14 @@ enum sc_audio_source {
    SC_AUDIO_SOURCE_OUTPUT,
    SC_AUDIO_SOURCE_MIC,
    SC_AUDIO_SOURCE_PLAYBACK,
+    SC_AUDIO_SOURCE_MIC_UNPROCESSED,
+    SC_AUDIO_SOURCE_MIC_CAMCORDER,
+    SC_AUDIO_SOURCE_MIC_VOICE_RECOGNITION,
+    SC_AUDIO_SOURCE_MIC_VOICE_COMMUNICATION,
+    SC_AUDIO_SOURCE_VOICE_CALL,
+    SC_AUDIO_SOURCE_VOICE_CALL_UPLINK,
+    SC_AUDIO_SOURCE_VOICE_CALL_DOWNLINK,
+    SC_AUDIO_SOURCE_VOICE_PERFORMANCE,
 };

 enum sc_camera_facing {
--- a/app/src/server.c
+++ b/app/src/server.c
@@ -149,6 +149,22 @@ sc_server_get_audio_source_name(enum sc_audio_source audio_source) {
            return "mic";
        case SC_AUDIO_SOURCE_PLAYBACK:
            return "playback";
+        case SC_AUDIO_SOURCE_MIC_UNPROCESSED:
+            return "mic-unprocessed";
+        case SC_AUDIO_SOURCE_MIC_CAMCORDER:
+            return "mic-camcorder";
+        case SC_AUDIO_SOURCE_MIC_VOICE_RECOGNITION:
+            return "mic-voice-recognition";
+        case SC_AUDIO_SOURCE_MIC_VOICE_COMMUNICATION:
+            return "mic-voice-communication";
+        case SC_AUDIO_SOURCE_VOICE_CALL:
+            return "voice-call";
+        case SC_AUDIO_SOURCE_VOICE_CALL_UPLINK:
+            return "voice-call-uplink";
+        case SC_AUDIO_SOURCE_VOICE_CALL_DOWNLINK:
+            return "voice-call-downlink";
+        case SC_AUDIO_SOURCE_VOICE_PERFORMANCE:
+            return "voice-performance";
        default:
            assert(!"unexpected audio source");
            return NULL;
--- a/app/src/util/audiobuf.c
+++ b/app/src/util/audiobuf.c
@@ -116,3 +116,38 @@ sc_audiobuf_write(struct sc_audiobuf *buf, const void *from_,

    return samples_count;
 }
+
+uint32_t // number of zeroed samples actually written (0 if no room)
+sc_audiobuf_write_silence(struct sc_audiobuf *buf, uint32_t samples_count) {
+    // Only the writer thread can write head, so memory_order_relaxed is
+    // sufficient
+    uint32_t head = atomic_load_explicit(&buf->head, memory_order_relaxed);
+
+    // The tail cursor is updated after the data is consumed by the reader
+    uint32_t tail = atomic_load_explicit(&buf->tail, memory_order_acquire);
+
+    uint32_t can_write = (buf->alloc_size + tail - head - 1) % buf->alloc_size;
+    if (!can_write) { // full: the -1 above always keeps one slot free
+        return 0;
+    }
+    if (samples_count > can_write) {
+        samples_count = can_write; // clamp the request to the free space
+    }
+
+    uint32_t right_count = buf->alloc_size - head; // slots up to array end
+    if (right_count > samples_count) {
+        right_count = samples_count; // everything fits before the end
+    }
+    memset(buf->data + (head * buf->sample_size), 0,
+           right_count * buf->sample_size); // zero-fill starting at head
+
+    if (samples_count > right_count) {
+        uint32_t left_count = samples_count - right_count; // wrapped part
+        memset(buf->data, 0, left_count * buf->sample_size);
+    }
+
+    uint32_t new_head = (head + samples_count) % buf->alloc_size; // wrap
+    atomic_store_explicit(&buf->head, new_head, memory_order_release);
+
+    return samples_count; // may be less than requested (clamped above)
+}
--- a/app/src/util/audiobuf.h
+++ b/app/src/util/audiobuf.h
@@ -50,6 +50,9 @@ uint32_t
 sc_audiobuf_write(struct sc_audiobuf *buf, const void *from,
                  uint32_t samples_count);

+uint32_t // number of silence samples actually written
+sc_audiobuf_write_silence(struct sc_audiobuf *buf, uint32_t samples_count);
+
 static inline uint32_t
 sc_audiobuf_capacity(struct sc_audiobuf *buf) {
    assert(buf->alloc_size);
--- a/app/tests/test_audiobuf.c
+++ b/app/tests/test_audiobuf.c
@@ -113,6 +113,14 @@ static void test_audiobuf_partial_read_write(void) {
    uint32_t expected2[] = {4, 5, 6, 1, 2, 3, 4, 1, 2, 3};
    assert(!memcmp(data, expected2, 12));

+    w = sc_audiobuf_write_silence(&buf, 4);
+    assert(w == 4);
+
+    r = sc_audiobuf_read(&buf, data, 4);
+    assert(r == 4);
+    uint32_t expected3[] = {0, 0, 0, 0};
+    assert(!memcmp(data, expected3, 4));
+
    sc_audiobuf_destroy(&buf);
 }

--- a/doc/audio.md
+++ b/doc/audio.md
@@ -66,6 +66,20 @@ the computer:
 scrcpy --audio-source=mic --no-video --no-playback --record=file.opus
 ```

+Many sources are available:
+
+ - `output` (default): forwards the whole audio output, and disables playback on the device (mapped to [`REMOTE_SUBMIX`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#REMOTE_SUBMIX)).
+ - `playback`: captures the audio playback (Android apps can opt-out, so the whole output is not necessarily captured).
+ - `mic`: captures the microphone (mapped to [`MIC`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#MIC)).
+ - `mic-unprocessed`: captures the microphone unprocessed (raw) sound (mapped to [`UNPROCESSED`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#UNPROCESSED)).
+ - `mic-camcorder`: captures the microphone tuned for video recording, with the same orientation as the camera if available (mapped to [`CAMCORDER`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#CAMCORDER)).
+ - `mic-voice-recognition`: captures the microphone tuned for voice recognition (mapped to [`VOICE_RECOGNITION`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_RECOGNITION)).
+ - `mic-voice-communication`: captures the microphone tuned for voice communications (it will for instance take advantage of echo cancellation or automatic gain control if available) (mapped to [`VOICE_COMMUNICATION`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_COMMUNICATION)).
+ - `voice-call`: captures voice call (mapped to [`VOICE_CALL`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_CALL)).
+ - `voice-call-uplink`: captures voice call uplink only (mapped to [`VOICE_UPLINK`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_UPLINK)).
+ - `voice-call-downlink`: captures voice call downlink only (mapped to [`VOICE_DOWNLINK`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_DOWNLINK)).
+ - `voice-performance`: captures audio meant to be processed for live performance (karaoke), includes both the microphone and the device playback (mapped to [`VOICE_PERFORMANCE`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_PERFORMANCE)).
+
 ### Duplication

 An alternative device audio capture method is also available (only for Android
--- a/server/src/main/java/com/genymobile/scrcpy/audio/AudioDirectCapture.java
+++ b/server/src/main/java/com/genymobile/scrcpy/audio/AudioDirectCapture.java
@@ -12,7 +12,6 @@ import android.content.ComponentName;
 import android.content.Intent;
 import android.media.AudioRecord;
 import android.media.MediaCodec;
-import android.media.MediaRecorder;
 import android.os.Build;
 import android.os.SystemClock;

@@ -32,18 +31,7 @@ public class AudioDirectCapture implements AudioCapture {
    private AudioRecordReader reader;

    public AudioDirectCapture(AudioSource audioSource) {
-        this.audioSource = getAudioSourceValue(audioSource);
-    }
-
-    private static int getAudioSourceValue(AudioSource audioSource) {
-        switch (audioSource) {
-            case OUTPUT:
-                return MediaRecorder.AudioSource.REMOTE_SUBMIX;
-            case MIC:
-                return MediaRecorder.AudioSource.MIC;
-            default:
-                throw new IllegalArgumentException("Unsupported audio source: " + audioSource);
-        }
+        this.audioSource = audioSource.getDirectAudioSource();
    }

    @TargetApi(AndroidVersions.API_23_ANDROID_6_0)
--- a/server/src/main/java/com/genymobile/scrcpy/audio/AudioSource.java
+++ b/server/src/main/java/com/genymobile/scrcpy/audio/AudioSource.java
@@ -1,20 +1,36 @@
 package com.genymobile.scrcpy.audio;

+import android.media.MediaRecorder;
+
 public enum AudioSource {
-    OUTPUT("output"),
-    MIC("mic"),
-    PLAYBACK("playback");
+    OUTPUT("output", MediaRecorder.AudioSource.REMOTE_SUBMIX),
+    MIC("mic", MediaRecorder.AudioSource.MIC),
+    PLAYBACK("playback", -1),
+    MIC_UNPROCESSED("mic-unprocessed", MediaRecorder.AudioSource.UNPROCESSED),
+    MIC_CAMCORDER("mic-camcorder", MediaRecorder.AudioSource.CAMCORDER),
+    MIC_VOICE_RECOGNITION("mic-voice-recognition", MediaRecorder.AudioSource.VOICE_RECOGNITION),
+    MIC_VOICE_COMMUNICATION("mic-voice-communication", MediaRecorder.AudioSource.VOICE_COMMUNICATION),
+    VOICE_CALL("voice-call", MediaRecorder.AudioSource.VOICE_CALL),
+    VOICE_CALL_UPLINK("voice-call-uplink", MediaRecorder.AudioSource.VOICE_UPLINK),
+    VOICE_CALL_DOWNLINK("voice-call-downlink", MediaRecorder.AudioSource.VOICE_DOWNLINK),
+    VOICE_PERFORMANCE("voice-performance", MediaRecorder.AudioSource.VOICE_PERFORMANCE);

    private final String name;
+    private final int directAudioSource;

-    AudioSource(String name) {
+    AudioSource(String name, int directAudioSource) {
        this.name = name;
+        this.directAudioSource = directAudioSource;
    }

    public boolean isDirect() {
        return this != PLAYBACK;
    }

+    public int getDirectAudioSource() {
+        return directAudioSource;
+    }
+
    public static AudioSource findByName(String name) {
        for (AudioSource audioSource : AudioSource.values()) {
            if (name.equals(audioSource.name)) {
Author	SHA1	Message	Date
Romain Vimont	1ebe2e2db6	Add audio sources	2025-02-22 12:46:06 +01:00
Romain Vimont	9fb7446b88	Refactor audio sources Store the target audio source integer (one of the constants in android.media.MediaRecorder.AudioSource) in the AudioSource enum (or -1 if not relevant). This will simplify adding new audio sources.	2025-02-22 12:46:06 +01:00
Romain Vimont	671025cb68	Handle audio stream discontinuities The audio regulator assumed a continuous audio stream. But some audio sources (like the "voice call" audio source) do not produce any packets on silence, breaking this assumption. Use PTS to detect such discontinuities. TODO: if PTS values are broken, the detection is also broken.	2025-02-22 12:46:06 +01:00
Romain Vimont	8925bdc8fd	Report underflow samples in verbose mode Report the number of silence samples inserted due to underflow every second, along with the other metrics.	2025-02-22 12:26:27 +01:00
Romain Vimont	ea4c076345	Disable audio regulator underflow logs Only enable them if SC_AUDIO_REGULATOR_DEBUG is set, as they may spam the output.	2025-02-22 12:26:27 +01:00