Is there a specific Bitstream Filter required for VP9 in an MP4 container that I am missing, or is AVFMT_FLAG_AUTO_BSF actually required for VP9 but failing for another reason? Why would this stream copy work for H.264 but produce a black screen for VP9 on Android? Where could be the real problem, or is the code right but the problem is elsewhere?
I am using FFmpeg libraries (libavformat/libavcodec) so files via JNI in an Android app to merge separate video and audio tracks (remuxing/stream copying).
While H.264 and H.265 files merge perfectly, VP9 videos (specifically from Instagram/YouTube sources) result in a black screen or playback errors on some Android players after merging.
The Symptoms:
H.264/AAC merges and plays fine.
VP9/Opus or VP9/AAC results in a file that has the correct duration but no visible video or fails to initialize the decoder in some android devices.
I suspect the issue relates to Bitstream Filters (BSF) or timestamp handling.
What I've tried: I updated my code to include three specific fixes:
Skipping AVFMT_FLAG_AUTO_BSF for VP9: I noticed auto-bsf might be corrupting VP9 headers.
Setting avoid_negative_ts: To handle sources with negative start times.
Filtering out ATTACHED_PIC: To ensure I don't accidentally mux a thumbnail as the main video stream.
My JNI Implementation:
#include <jni.h>
#include <libavformat/avformat.h>
#include <android/log.h>
#ifdef NDEBUG // NDEBUG is automatically defined in release builds
// Release build - disable debug logs
#define LOGD(...)
#define LOGE(...)
#else
// Debug build - enable debug logs
#define TAG "Downloader"
#define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, TAG, __VA_ARGS__)
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
#endif
#include <libavutil/cpu.h>
JNIEXPORT jint JNICALL
Java_com_harrshbermann_SocialMate_google_logEvent(
JNIEnv *env, jobject thiz, jstring jVideoPath, jstring jAudioPath, jstring jOutPath) {
/*
// Inside your JNI function:
int cpu_flags = av_get_cpu_flags();
if (cpu_flags & AV_CPU_FLAG_NEON) {
LOGD("FFMpeg Check: NEON is ENABLED and active!");
} else {
LOGE("FFMpeg Check: NEON is NOT detected!");
}
if (cpu_flags & AV_CPU_FLAG_ARMV8) {
LOGD("FFMpeg Check: ARMv8 optimizations are active!");
}
*/
// ── Declare ALL variables at top (C89 compliance for NDK) ────────────────
AVFormatContext *ifmt_ctx_v = NULL, *ifmt_ctx_a = NULL, *ofmt_ctx = NULL;
AVStream *in_v, *in_a, *out_v, *out_a;
AVPacket *pkt = NULL;
int ret = 0;
int v_idx, a_idx;
int v_done = 0, a_done = 0;
int64_t v_dts = 0, a_dts = 0;
int file_opened = 0; // track if avio_open succeeded
const char *vPath = (*env)->GetStringUTFChars(env, jVideoPath, 0);
const char *aPath = (*env)->GetStringUTFChars(env, jAudioPath, 0);
const char *oPath = (*env)->GetStringUTFChars(env, jOutPath, 0);
// ── Allocate packet ───────────────────────────────────────────────────────
pkt = av_packet_alloc();
if (!pkt) { ret = -1; goto cleanup; }
// ── Open inputs ───────────────────────────────────────────────────────────
if (avformat_open_input(&ifmt_ctx_v, vPath, NULL, NULL) < 0) {
LOGE("Failed to open video");
ret = -2; goto cleanup;
}
if (avformat_open_input(&ifmt_ctx_a, aPath, NULL, NULL) < 0) {
LOGE("Failed to open audio");
ret = -3; goto cleanup;
}
if (avformat_find_stream_info(ifmt_ctx_v, NULL) < 0 ||
avformat_find_stream_info(ifmt_ctx_a, NULL) < 0) {
LOGE("Failed to find stream info");
ret = -4; goto cleanup;
}
// ── Find best streams ─────────────────────────────────────────────────────
v_idx = av_find_best_stream(ifmt_ctx_v, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
a_idx = av_find_best_stream(ifmt_ctx_a, AVMEDIA_TYPE_AUDIO, -1, -1, NULL, 0);
if (v_idx < 0 || a_idx < 0) {
LOGE("Could not find video/audio stream (v=%d, a=%d)", v_idx, a_idx);
ret = -5; goto cleanup;
}
in_v = ifmt_ctx_v->streams[v_idx];
in_a = ifmt_ctx_a->streams[a_idx];
// ── Allocate output context ───────────────────────────────────────────────
if (avformat_alloc_output_context2(&ofmt_ctx, NULL, NULL, oPath) < 0 || !ofmt_ctx) {
LOGE("Failed to alloc output context");
ret = -6; goto cleanup;
}
// ── Create output streams ─────────────────────────────────────────────────
out_v = avformat_new_stream(ofmt_ctx, NULL);
out_a = avformat_new_stream(ofmt_ctx, NULL);
if (!out_v || !out_a) {
LOGE("Failed to create output streams");
ret = -7; goto cleanup;
}
// avcodec_parameters_copy handles extradata internally — no manual malloc needed
avcodec_parameters_copy(out_v->codecpar, in_v->codecpar);
avcodec_parameters_copy(out_a->codecpar, in_a->codecpar);
out_v->codecpar->codec_tag = 0; // reset for container compatibility
out_a->codecpar->codec_tag = 0;
// ── Open output file ──────────────────────────────────────────────────────
if (!(ofmt_ctx->oformat->flags & AVFMT_NOFILE)) {
if (avio_open(&ofmt_ctx->pb, oPath, AVIO_FLAG_WRITE) < 0) {
LOGE("Failed to open output file");
ret = -8; goto cleanup;
}
file_opened = 1;
}
// Auto BSF fixes Annex-B vs. AVCC format mismatch for H.264/H.265 in MP4
ofmt_ctx->flags |= AVFMT_FLAG_AUTO_BSF;
if (avformat_write_header(ofmt_ctx, NULL) < 0) {
LOGE("Failed to write header");
ret = -9; goto cleanup;
}
// ── Interleaved muxing loop ───────────────────────────────────────────────
while (!v_done || !a_done) {
AVFormatContext *src;
AVStream *in_st, *out_st;
int target_idx, pick_video, got;
pick_video = !v_done && (a_done ||
av_compare_ts(v_dts, in_v->time_base, a_dts, in_a->time_base) <= 0);
src = pick_video ? ifmt_ctx_v : ifmt_ctx_a;
in_st = pick_video ? in_v : in_a;
out_st = pick_video ? out_v : out_a;
target_idx = pick_video ? v_idx : a_idx;
// Skip non-target packets (e.g. subtitles in same container)
got = 0;
while (av_read_frame(src, pkt) >= 0) {
if (pkt->stream_index == target_idx) { got = 1; break; }
av_packet_unref(pkt);
}
if (!got) {
if (pick_video) v_done = 1;
else a_done = 1;
continue;
}
// Update DTS tracker for next interleaving decision
if (pick_video) v_dts = pkt->dts;
else a_dts = pkt->dts;
// Rescale timestamps with proper rounding flags
pkt->pts = av_rescale_q_rnd(pkt->pts, in_st->time_base, out_st->time_base,
AV_ROUND_NEAR_INF | AV_ROUND_PASS_MINMAX);
pkt->dts = av_rescale_q_rnd(pkt->dts, in_st->time_base, out_st->time_base,
AV_ROUND_NEAR_INF | AV_ROUND_PASS_MINMAX);
pkt->duration = av_rescale_q(pkt->duration, in_st->time_base, out_st->time_base);
pkt->pos = -1;
pkt->stream_index = out_st->index; // use actual index, not hardcoded 0/1
if (av_interleaved_write_frame(ofmt_ctx, pkt) < 0) {
LOGE("Error writing %s packet", pick_video ? "video" : "audio");
}
av_packet_unref(pkt);
}
av_write_trailer(ofmt_ctx);
LOGD("mergeAV finished successfully");
cleanup:
if (pkt) av_packet_free(&pkt);
if (ifmt_ctx_v) avformat_close_input(&ifmt_ctx_v);
if (ifmt_ctx_a) avformat_close_input(&ifmt_ctx_a);
if (ofmt_ctx) {
if (file_opened) avio_closep(&ofmt_ctx->pb);
avformat_free_context(ofmt_ctx);
}
(*env)->ReleaseStringUTFChars(env, jVideoPath, vPath);
(*env)->ReleaseStringUTFChars(env, jAudioPath, aPath);
(*env)->ReleaseStringUTFChars(env, jOutPath, oPath);
return ret;
}
Changes I made:
JNIEXPORT jint JNICALL
Java_com_example_app_NativeMuxer_mergeAV(JNIEnv *env, jobject thiz, jstring jVideoPath, jstring jAudioPath, jstring jOutPath) {
AVFormatContext *ifmt_ctx_v = NULL, *ifmt_ctx_a = NULL, *ofmt_ctx = NULL;
AVPacket *pkt = av_packet_alloc();
int v_idx = -1, a_idx = -1, ret = 0;
// ... [Opening inputs and finding stream info] ...
// Fix 1: Explicitly skip thumbnail streams
for (int i = 0; i < (int)ifmt_ctx_v->nb_streams; i++) {
if (ifmt_ctx_v->streams[i]->disposition & AV_DISPOSITION_ATTACHED_PIC) continue;
if (ifmt_ctx_v->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
v_idx = i; break;
}
}
// Detect VP9
int is_vp = (ifmt_ctx_v->streams[v_idx]->codecpar->codec_id == AV_CODEC_ID_VP9);
avformat_alloc_output_context2(&ofmt_ctx, NULL, NULL, oPath);
// Create streams and copy parameters
// ... [avformat_new_stream & avcodec_parameters_copy] ...
out_v->codecpar->codec_tag = 0;
out_a->codecpar->codec_tag = 0;
// Fix 2: Skip AUTO_BSF for VP9 to avoid bitstream corruption
if (!is_vp) {
ofmt_ctx->flags |= AVFMT_FLAG_AUTO_BSF;
}
// Fix 3: Handle negative timestamps
ofmt_ctx->avoid_negative_ts = AVFMT_AVOID_NEG_TS_MAKE_NON_NEGATIVE;
avformat_write_header(ofmt_ctx, NULL);
// ... [Interleaved muxing loop with av_rescale_q_rnd] ...
av_write_trailer(ofmt_ctx);
// ... [Cleanup] ...
return ret;
}