Compare commits

...

6 Commits

Author SHA1 Message Date
Romain Vimont
63d848fc55 Fix PTS produced by the default OPUS encoder
The default OPUS encoder on Android rewrites the PTS so that it exactly
matches the number of samples.

As a consequence:
 - audio clock drift is not compensated
 - hard silences are ignored

To fix this behavior, recreate the PTS based on the current time (after
encoding) and the packet duration.
2025-03-02 18:00:12 +01:00
Romain Vimont
b292e356de Add audio sources 2025-03-02 17:17:01 +01:00
Romain Vimont
9fb7446b88 Refactor audio sources
Store the target audio source integer (one of the constants in
android.media.MediaRecorder.AudioSource) in the AudioSource enum (or -1
if not relevant).

This will simplify adding new audio sources.
2025-02-22 12:46:06 +01:00
Romain Vimont
671025cb68 Handle audio stream discontinuities
The audio regulator assumed a continuous audio stream. But some audio
sources (like the "voice call" audio source) do not produce any packets
on silence, breaking this assumption.

Use PTS to detect such discontinuities.

TODO: if PTS values are broken, the detection is also broken.
2025-02-22 12:46:06 +01:00
Romain Vimont
8925bdc8fd Report underflow samples in verbose mode
Report the number of silence samples inserted due to underflow every
second, along with the other metrics.
2025-02-22 12:26:27 +01:00
Romain Vimont
ea4c076345 Disable audio regulator underflow logs
Only enable them if SC_AUDIO_REGULATOR_DEBUG is set, as they may spam
the output.
2025-02-22 12:26:27 +01:00
15 changed files with 265 additions and 36 deletions

View File

@@ -122,7 +122,7 @@ _scrcpy() {
return
;;
--audio-source)
COMPREPLY=($(compgen -W 'output mic playback' -- "$cur"))
COMPREPLY=($(compgen -W 'output playback mic mic-unprocessed mic-camcorder mic-voice-recognition mic-voice-communication voice-call voice-call-uplink voice-call-downlink voice-performance' -- "$cur"))
return
;;
--camera-facing)

View File

@@ -16,7 +16,7 @@ arguments=(
'--audio-codec-options=[Set a list of comma-separated key\:type=value options for the device audio encoder]'
'--audio-dup=[Duplicate audio]'
'--audio-encoder=[Use a specific MediaCodec audio encoder]'
'--audio-source=[Select the audio source]:source:(output mic playback)'
'--audio-source=[Select the audio source]:source:(output playback mic mic-unprocessed mic-camcorder mic-voice-recognition mic-voice-communication voice-call voice-call-uplink voice-call-downlink voice-performance)'
'--audio-output-buffer=[Configure the size of the SDL audio output buffer (in milliseconds)]'
{-b,--video-bit-rate=}'[Encode the video at the given bit-rate]'
'--camera-ar=[Select the camera size by its aspect ratio]'

View File

@@ -67,13 +67,19 @@ The available encoders can be listed by \fB\-\-list\-encoders\fR.
.TP
.BI "\-\-audio\-source " source
Select the audio source (output, mic or playback).
Select the audio source. Possible values are:
The "output" source forwards the whole audio output, and disables playback on the device.
The "playback" source captures the audio playback (Android apps can opt-out, so the whole output is not necessarily captured).
The "mic" source captures the microphone.
- "output": forwards the whole audio output, and disables playback on the device.
- "playback": captures the audio playback (Android apps can opt-out, so the whole output is not necessarily captured).
- "mic": captures the microphone.
- "mic-unprocessed": captures the microphone unprocessed (raw) sound.
- "mic-camcorder": captures the microphone tuned for video recording, with the same orientation as the camera if available.
- "mic-voice-recognition": captures the microphone tuned for voice recognition.
- "mic-voice-communication": captures the microphone tuned for voice communications (it will for instance take advantage of echo cancellation or automatic gain control if available).
- "voice-call": captures voice call.
- "voice-call-uplink": captures voice call uplink only.
- "voice-call-downlink": captures voice call downlink only.
- "voice-performance": captures audio meant to be processed for live performance (karaoke), includes both the microphone and the device playback.
Default is output.

View File

@@ -76,8 +76,10 @@ sc_audio_regulator_pull(struct sc_audio_regulator *ar, uint8_t *out,
// Wait until the buffer is filled up to at least target_buffering
// before playing
if (buffered_samples < ar->target_buffering) {
LOGV("[Audio] Inserting initial buffering silence: %" PRIu32
#ifdef SC_AUDIO_REGULATOR_DEBUG
LOGD("[Audio] Inserting initial buffering silence: %" PRIu32
" samples", out_samples);
#endif
// Delay playback starting to reach the target buffering. Fill the
// whole buffer with silence (len is small compared to the
// arbitrary margin value).
@@ -98,8 +100,10 @@ sc_audio_regulator_pull(struct sc_audio_regulator *ar, uint8_t *out,
// dropped to keep the latency minimal. However, this would cause very
// audible glitches, so let the clock compensation restore the target
// latency.
#ifdef SC_AUDIO_REGULATOR_DEBUG
LOGD("[Audio] Buffer underflow, inserting silence: %" PRIu32 " samples",
silence);
#endif
memset(out + TO_BYTES(read), 0, TO_BYTES(silence));
bool received = atomic_load_explicit(&ar->received,
@@ -137,6 +141,35 @@ bool
sc_audio_regulator_push(struct sc_audio_regulator *ar, const AVFrame *frame) {
SwrContext *swr_ctx = ar->swr_ctx;
uint32_t input_samples = frame->nb_samples;
assert(frame->pts >= 0);
int64_t pts = frame->pts;
if (ar->next_expected_pts && pts - ar->next_expected_pts > 100000) {
LOGV("[Audio] Discontinuity detected: %" PRIi64 "µs",
pts - ar->next_expected_pts);
// More than 100ms: consider it as a discontinuity
// (typically because silence packets were not captured)
uint32_t can_read = sc_audiobuf_can_read(&ar->buf);
if (input_samples + can_read < ar->target_buffering) {
// Adjust buffering to the target value directly
uint32_t silence = ar->target_buffering - can_read - input_samples;
sc_audiobuf_write_silence(&ar->buf, silence);
}
// Reset state
ar->avg_buffering.avg = ar->target_buffering;
int ret = swr_set_compensation(swr_ctx, 0, 0);
assert(!ret); // disabling compensation should never fail
ar->compensation_active = false;
ar->samples_since_resync = 0;
atomic_store_explicit(&ar->underflow, 0, memory_order_relaxed);
}
int64_t packet_duration = input_samples * INT64_C(1000000)
/ ar->sample_rate;
ar->next_expected_pts = pts + packet_duration;
int64_t swr_delay = swr_get_delay(swr_ctx, ar->sample_rate);
// No need to av_rescale_rnd(), input and output sample rates are the same.
// Add more space (256) for clock compensation.
@@ -209,6 +242,7 @@ sc_audio_regulator_push(struct sc_audio_regulator *ar, const AVFrame *frame) {
if (played) {
underflow = atomic_exchange_explicit(&ar->underflow, 0,
memory_order_relaxed);
ar->underflow_report += underflow;
max_buffered_samples = ar->target_buffering * 11 / 10
+ 60 * ar->sample_rate / 1000 /* 60 ms */;
@@ -255,7 +289,7 @@ sc_audio_regulator_push(struct sc_audio_regulator *ar, const AVFrame *frame) {
}
// Number of samples added (or removed, if negative) for compensation
int32_t instant_compensation = (int32_t) written - frame->nb_samples;
int32_t instant_compensation = (int32_t) written - input_samples;
// Inserting silence instantly increases buffering
int32_t inserted_silence = (int32_t) underflow;
// Dropping input samples instantly decreases buffering
@@ -311,7 +345,9 @@ sc_audio_regulator_push(struct sc_audio_regulator *ar, const AVFrame *frame) {
int abs_max_diff = distance / 50;
diff = CLAMP(diff, -abs_max_diff, abs_max_diff);
LOGV("[Audio] Buffering: target=%" PRIu32 " avg=%f cur=%" PRIu32
" compensation=%d", ar->target_buffering, avg, can_read, diff);
" compensation=%d (underflow=%" PRIu32 ")",
ar->target_buffering, avg, can_read, diff, ar->underflow_report);
ar->underflow_report = 0;
int ret = swr_set_compensation(swr_ctx, diff, distance);
if (ret < 0) {
@@ -394,7 +430,9 @@ sc_audio_regulator_init(struct sc_audio_regulator *ar, size_t sample_size,
atomic_init(&ar->played, false);
atomic_init(&ar->received, false);
atomic_init(&ar->underflow, 0);
ar->underflow_report = 0;
ar->compensation_active = false;
ar->next_expected_pts = 0;
return true;

View File

@@ -46,6 +46,9 @@ struct sc_audio_regulator {
// Number of silence samples inserted since the last received packet
atomic_uint_least32_t underflow;
// Number of silence samples inserted since the last log
uint32_t underflow_report;
// Non-zero compensation applied (only used by the receiver thread)
bool compensation_active;
@@ -54,6 +57,9 @@ struct sc_audio_regulator {
// Set to true the first time samples are pulled by the player
atomic_bool played;
// PTS of the next expected packet (useful to detect discontinuities)
int64_t next_expected_pts;
};
bool

View File

@@ -217,13 +217,31 @@ static const struct sc_option options[] = {
.longopt_id = OPT_AUDIO_SOURCE,
.longopt = "audio-source",
.argdesc = "source",
.text = "Select the audio source (output, mic or playback).\n"
"The \"output\" source forwards the whole audio output, and "
"disables playback on the device.\n"
"The \"playback\" source captures the audio playback (Android "
"apps can opt-out, so the whole output is not necessarily "
.text = "Select the audio source. Possible values are:\n"
" - \"output\": forwards the whole audio output, and disables "
"playback on the device.\n"
" - \"playback\": captures the audio playback (Android apps "
"can opt-out, so the whole output is not necessarily "
"captured).\n"
"The \"mic\" source captures the microphone.\n"
" - \"mic\": captures the microphone.\n"
" - \"mic-unprocessed\": captures the microphone unprocessed "
"(raw) sound.\n"
" - \"mic-camcorder\": captures the microphone tuned for video "
"recording, with the same orientation as the camera if "
"available.\n"
" - \"mic-voice-recognition\": captures the microphone tuned "
"for voice recognition.\n"
" - \"mic-voice-communication\": captures the microphone tuned "
"for voice communications (it will for instance take advantage "
"of echo cancellation or automatic gain control if "
"available).\n"
" - \"voice-call\": captures voice call.\n"
" - \"voice-call-uplink\": captures voice call uplink only.\n"
" - \"voice-call-downlink\": captures voice call downlink "
"only.\n"
" - \"voice-performance\": captures audio meant to be "
"processed for live performance (karaoke), includes both the "
"microphone and the device playback.\n"
"Default is output.",
},
{
@@ -2036,8 +2054,50 @@ parse_audio_source(const char *optarg, enum sc_audio_source *source) {
return true;
}
LOGE("Unsupported audio source: %s (expected output, mic or playback)",
optarg);
if (!strcmp(optarg, "mic-unprocessed")) {
*source = SC_AUDIO_SOURCE_MIC_UNPROCESSED;
return true;
}
if (!strcmp(optarg, "mic-camcorder")) {
*source = SC_AUDIO_SOURCE_MIC_CAMCORDER;
return true;
}
if (!strcmp(optarg, "mic-voice-recognition")) {
*source = SC_AUDIO_SOURCE_MIC_VOICE_RECOGNITION;
return true;
}
if (!strcmp(optarg, "mic-voice-communication")) {
*source = SC_AUDIO_SOURCE_MIC_VOICE_COMMUNICATION;
return true;
}
if (!strcmp(optarg, "voice-call")) {
*source = SC_AUDIO_SOURCE_VOICE_CALL;
return true;
}
if (!strcmp(optarg, "voice-call-uplink")) {
*source = SC_AUDIO_SOURCE_VOICE_CALL_UPLINK;
return true;
}
if (!strcmp(optarg, "voice-call-downlink")) {
*source = SC_AUDIO_SOURCE_VOICE_CALL_DOWNLINK;
return true;
}
if (!strcmp(optarg, "voice-performance")) {
*source = SC_AUDIO_SOURCE_VOICE_PERFORMANCE;
return true;
}
LOGE("Unsupported audio source: %s (expected output, mic, playback, "
"mic-unprocessed, mic-camcorder, mic-voice-recognition, "
"mic-voice-communication, voice-call, voice-call-uplink, "
"voice-call-downlink, voice-performance)", optarg);
return false;
}

View File

@@ -59,6 +59,14 @@ enum sc_audio_source {
SC_AUDIO_SOURCE_OUTPUT,
SC_AUDIO_SOURCE_MIC,
SC_AUDIO_SOURCE_PLAYBACK,
SC_AUDIO_SOURCE_MIC_UNPROCESSED,
SC_AUDIO_SOURCE_MIC_CAMCORDER,
SC_AUDIO_SOURCE_MIC_VOICE_RECOGNITION,
SC_AUDIO_SOURCE_MIC_VOICE_COMMUNICATION,
SC_AUDIO_SOURCE_VOICE_CALL,
SC_AUDIO_SOURCE_VOICE_CALL_UPLINK,
SC_AUDIO_SOURCE_VOICE_CALL_DOWNLINK,
SC_AUDIO_SOURCE_VOICE_PERFORMANCE,
};
enum sc_camera_facing {

View File

@@ -149,6 +149,22 @@ sc_server_get_audio_source_name(enum sc_audio_source audio_source) {
return "mic";
case SC_AUDIO_SOURCE_PLAYBACK:
return "playback";
case SC_AUDIO_SOURCE_MIC_UNPROCESSED:
return "mic-unprocessed";
case SC_AUDIO_SOURCE_MIC_CAMCORDER:
return "mic-camcorder";
case SC_AUDIO_SOURCE_MIC_VOICE_RECOGNITION:
return "mic-voice-recognition";
case SC_AUDIO_SOURCE_MIC_VOICE_COMMUNICATION:
return "mic-voice-communication";
case SC_AUDIO_SOURCE_VOICE_CALL:
return "voice-call";
case SC_AUDIO_SOURCE_VOICE_CALL_UPLINK:
return "voice-call-uplink";
case SC_AUDIO_SOURCE_VOICE_CALL_DOWNLINK:
return "voice-call-downlink";
case SC_AUDIO_SOURCE_VOICE_PERFORMANCE:
return "voice-performance";
default:
assert(!"unexpected audio source");
return NULL;

View File

@@ -116,3 +116,38 @@ sc_audiobuf_write(struct sc_audiobuf *buf, const void *from_,
return samples_count;
}
// Append up to samples_count zeroed samples (silence) at the head of the
// buffer.
//
// The request is clamped to the space currently available; at most
// alloc_size - 1 samples can ever be written (see the "- 1" in can_write).
//
// Returns the number of samples actually written, which may be 0 or less than
// samples_count.
uint32_t
sc_audiobuf_write_silence(struct sc_audiobuf *buf, uint32_t samples_count) {
// Only the writer thread can write head, so memory_order_relaxed is
// sufficient
uint32_t head = atomic_load_explicit(&buf->head, memory_order_relaxed);
// The tail cursor is updated after the data is consumed by the reader
uint32_t tail = atomic_load_explicit(&buf->tail, memory_order_acquire);
// Free space between head and tail, in samples, modulo the storage size
uint32_t can_write = (buf->alloc_size + tail - head - 1) % buf->alloc_size;
if (!can_write) {
return 0;
}
// Never write more than what is available
if (samples_count > can_write) {
samples_count = can_write;
}
// First chunk: from head up to the end of the storage (or fewer samples if
// the request is smaller)
uint32_t right_count = buf->alloc_size - head;
if (right_count > samples_count) {
right_count = samples_count;
}
memset(buf->data + (head * buf->sample_size), 0,
right_count * buf->sample_size);
// Second chunk: the write wrapped around, zero the remaining samples at
// the beginning of the storage
if (samples_count > right_count) {
uint32_t left_count = samples_count - right_count;
memset(buf->data, 0, left_count * buf->sample_size);
}
// Publish the new head only after the samples have been zeroed (release
// store; presumably paired with an acquire load of head on the reader
// side, which is not visible here)
uint32_t new_head = (head + samples_count) % buf->alloc_size;
atomic_store_explicit(&buf->head, new_head, memory_order_release);
return samples_count;
}

View File

@@ -50,6 +50,9 @@ uint32_t
sc_audiobuf_write(struct sc_audiobuf *buf, const void *from,
uint32_t samples_count);
uint32_t
sc_audiobuf_write_silence(struct sc_audiobuf *buf, uint32_t samples);
static inline uint32_t
sc_audiobuf_capacity(struct sc_audiobuf *buf) {
assert(buf->alloc_size);

View File

@@ -113,6 +113,14 @@ static void test_audiobuf_partial_read_write(void) {
uint32_t expected2[] = {4, 5, 6, 1, 2, 3, 4, 1, 2, 3};
assert(!memcmp(data, expected2, 12));
w = sc_audiobuf_write_silence(&buf, 4);
assert(w == 4);
r = sc_audiobuf_read(&buf, data, 4);
assert(r == 4);
uint32_t expected3[] = {0, 0, 0, 0};
assert(!memcmp(data, expected3, 4));
sc_audiobuf_destroy(&buf);
}

View File

@@ -66,6 +66,20 @@ the computer:
scrcpy --audio-source=mic --no-video --no-playback --record=file.opus
```
Many sources are available:
- `output` (default): forwards the whole audio output, and disables playback on the device (mapped to [`REMOTE_SUBMIX`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#REMOTE_SUBMIX)).
- `playback`: captures the audio playback (Android apps can opt-out, so the whole output is not necessarily captured).
- `mic`: captures the microphone (mapped to [`MIC`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#MIC)).
- `mic-unprocessed`: captures the microphone unprocessed (raw) sound (mapped to [`UNPROCESSED`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#UNPROCESSED)).
- `mic-camcorder`: captures the microphone tuned for video recording, with the same orientation as the camera if available (mapped to [`CAMCORDER`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#CAMCORDER)).
- `mic-voice-recognition`: captures the microphone tuned for voice recognition (mapped to [`VOICE_RECOGNITION`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_RECOGNITION)).
- `mic-voice-communication`: captures the microphone tuned for voice communications (it will for instance take advantage of echo cancellation or automatic gain control if available) (mapped to [`VOICE_COMMUNICATION`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_COMMUNICATION)).
- `voice-call`: captures voice call (mapped to [`VOICE_CALL`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_CALL)).
- `voice-call-uplink`: captures voice call uplink only (mapped to [`VOICE_UPLINK`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_UPLINK)).
- `voice-call-downlink`: captures voice call downlink only (mapped to [`VOICE_DOWNLINK`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_DOWNLINK)).
- `voice-performance`: captures audio meant to be processed for live performance (karaoke), includes both the microphone and the device playback (mapped to [`VOICE_PERFORMANCE`](https://developer.android.com/reference/android/media/MediaRecorder.AudioSource#VOICE_PERFORMANCE)).
### Duplication
An alternative device audio capture method is also available (only for Android

View File

@@ -12,7 +12,6 @@ import android.content.ComponentName;
import android.content.Intent;
import android.media.AudioRecord;
import android.media.MediaCodec;
import android.media.MediaRecorder;
import android.os.Build;
import android.os.SystemClock;
@@ -32,18 +31,7 @@ public class AudioDirectCapture implements AudioCapture {
private AudioRecordReader reader;
public AudioDirectCapture(AudioSource audioSource) {
this.audioSource = getAudioSourceValue(audioSource);
}
private static int getAudioSourceValue(AudioSource audioSource) {
switch (audioSource) {
case OUTPUT:
return MediaRecorder.AudioSource.REMOTE_SUBMIX;
case MIC:
return MediaRecorder.AudioSource.MIC;
default:
throw new IllegalArgumentException("Unsupported audio source: " + audioSource);
}
this.audioSource = audioSource.getDirectAudioSource();
}
@TargetApi(AndroidVersions.API_23_ANDROID_6_0)

View File

@@ -55,6 +55,9 @@ public final class AudioEncoder implements AsyncProcessor {
private final List<CodecOption> codecOptions;
private final String encoderName;
private boolean recreatePts;
private long previousPts;
// Capacity of 64 is in practice "infinite" (it is limited by the number of available MediaCodec buffers, typically 4).
// So many pending tasks would lead to an unacceptable delay anyway.
private final BlockingQueue<InputTask> inputTasks = new ArrayBlockingQueue<>(64);
@@ -118,6 +121,9 @@ public final class AudioEncoder implements AsyncProcessor {
OutputTask task = outputTasks.take();
ByteBuffer buffer = mediaCodec.getOutputBuffer(task.index);
try {
if (recreatePts) {
fixTimestamp(task.bufferInfo);
}
streamer.writePacket(buffer, task.bufferInfo);
} finally {
mediaCodec.releaseOutputBuffer(task.index, false);
@@ -125,6 +131,24 @@ public final class AudioEncoder implements AsyncProcessor {
}
}
/**
 * Rewrite the PTS of an encoded packet in place, based on the current time and the packet duration.
 * <p>
 * Config packets are left untouched. For any other packet, the duration is computed as the difference between the PTS set by the encoder
 * for this packet and for the previous one, and the new PTS is the current monotonic time (in microseconds) minus that duration.
 * <p>
 * Must only be called when {@code recreatePts} is set.
 *
 * @param bufferInfo the buffer info of the packet; its {@code presentationTimeUs} may be overwritten
 */
private void fixTimestamp(MediaCodec.BufferInfo bufferInfo) {
assert recreatePts;
if ((bufferInfo.flags & MediaCodec.BUFFER_FLAG_CODEC_CONFIG) != 0) {
// Config packet, nothing to fix
return;
}
// PTS as produced by the encoder (kept to compute the duration on the next call)
long pts = bufferInfo.presentationTimeUs;
// NOTE(review): 0 is used as "no previous packet" sentinel, so a packet following one whose encoder PTS is exactly 0 keeps its
// encoder PTS too -- confirm this is acceptable
if (previousPts != 0) {
// System.nanoTime() is in nanoseconds, PTS are in microseconds
long now = System.nanoTime() / 1000;
long duration = pts - previousPts;
bufferInfo.presentationTimeUs = now - duration;
}
// Store the original encoder PTS, not the rewritten one
previousPts = pts;
}
@Override
public void start(TerminationListener listener) {
thread = new Thread(() -> {
@@ -194,6 +218,11 @@ public final class AudioEncoder implements AsyncProcessor {
Codec codec = streamer.getCodec();
mediaCodec = createMediaCodec(codec, encoderName);
// The default OPUS encoder generates its own input PTS which matches the number of samples. This is not the behavior we want: it
// ignores any audio clock drift and hard silences (packets not produced on silence). To fix this behavior, regenerate PTS based on the
// current time and the packet duration.
recreatePts = "c2.android.opus.encoder".equals(mediaCodec.getName());
mediaCodecThread = new HandlerThread("media-codec");
mediaCodecThread.start();

View File

@@ -1,20 +1,38 @@
package com.genymobile.scrcpy.audio;
import android.annotation.SuppressLint;
import android.media.MediaRecorder;
@SuppressLint("InlinedApi")
public enum AudioSource {
OUTPUT("output"),
MIC("mic"),
PLAYBACK("playback");
OUTPUT("output", MediaRecorder.AudioSource.REMOTE_SUBMIX),
MIC("mic", MediaRecorder.AudioSource.MIC),
PLAYBACK("playback", -1),
MIC_UNPROCESSED("mic-unprocessed", MediaRecorder.AudioSource.UNPROCESSED),
MIC_CAMCORDER("mic-camcorder", MediaRecorder.AudioSource.CAMCORDER),
MIC_VOICE_RECOGNITION("mic-voice-recognition", MediaRecorder.AudioSource.VOICE_RECOGNITION),
MIC_VOICE_COMMUNICATION("mic-voice-communication", MediaRecorder.AudioSource.VOICE_COMMUNICATION),
VOICE_CALL("voice-call", MediaRecorder.AudioSource.VOICE_CALL),
// Uplink and downlink have their own MediaRecorder.AudioSource constants (VOICE_CALL is uplink + downlink)
VOICE_CALL_UPLINK("voice-call-uplink", MediaRecorder.AudioSource.VOICE_UPLINK),
VOICE_CALL_DOWNLINK("voice-call-downlink", MediaRecorder.AudioSource.VOICE_DOWNLINK),
VOICE_PERFORMANCE("voice-performance", MediaRecorder.AudioSource.VOICE_PERFORMANCE);
private final String name;
private final int directAudioSource;
AudioSource(String name) {
AudioSource(String name, int directAudioSource) {
this.name = name;
this.directAudioSource = directAudioSource;
}
/**
 * Indicate whether this source is a "direct" audio source.
 *
 * @return {@code true} for every source except {@code PLAYBACK}
 */
public boolean isDirect() {
return this != PLAYBACK;
}
/**
 * Return the integer value associated with this source at construction.
 *
 * @return one of the {@code MediaRecorder.AudioSource} constants, or -1 for {@code PLAYBACK} (which has no such constant)
 */
public int getDirectAudioSource() {
return directAudioSource;
}
public static AudioSource findByName(String name) {
for (AudioSource audioSource : AudioSource.values()) {
if (name.equals(audioSource.name)) {