opusenc: Encode exactly the number of samples we got as input and put correct timestamps on them

The first frame has lookahead fewer samples; the last frame might have some padding, or we might have to encode another frame of silence to get all of our input into the encoded data. This is because of a) the lookahead at the beginning of the encoding, which shifts all data by that number of samples, and b) the padding needed to fill the very last frame completely. Ideally we would use LPC to calculate something better than silence for the padding to make the encoding as smooth as possible. With this we get exactly the same number of samples back out of an opusenc ! opusdec pipeline. https://bugzilla.gnome.org/show_bug.cgi?id=757153
2024-12-20 23:36:38 +00:00 · 2015-10-30 20:57:37 +02:00 · 2015-10-30 20:57:37 +02:00 · 4df2ffaad6
commit 4df2ffaad6
parent 6ffb90e037
2 changed files with 57 additions and 12 deletions
--- a/ext/opus/gstopusenc.c
+++ b/ext/opus/gstopusenc.c
@ -412,6 +412,7 @@ gst_opus_enc_start (GstAudioEncoder * benc)

  GST_DEBUG_OBJECT (enc, "start");
  enc->encoded_samples = 0;
+  enc->consumed_samples = 0;

  return TRUE;
 }
@ -766,6 +767,7 @@ gst_opus_enc_setup (GstOpusEnc * enc)
      lookahead);

  /* lookahead is samples, the Opus header wants it in 48kHz samples */
+  enc->lookahead = enc->pending_lookahead = lookahead;
  lookahead = lookahead * 48000 / enc->sample_rate;

  gst_opus_header_create_caps (&caps, NULL, lookahead, enc->sample_rate,
@ -807,6 +809,7 @@ gst_opus_enc_sink_event (GstAudioEncoder * benc, GstEvent * event)
    }
    case GST_EVENT_SEGMENT:
      enc->encoded_samples = 0;
+      enc->consumed_samples = 0;
      break;

    default:
@ -899,13 +902,13 @@ gst_opus_enc_encode (GstOpusEnc * enc, GstBuffer * buf)
  GstClockTime duration;

  guint max_payload_size;
-  gint frame_samples;
+  gint frame_samples, input_samples, output_samples;

  g_mutex_lock (&enc->property_lock);

  bytes = enc->frame_samples * enc->n_channels * 2;
  max_payload_size = enc->max_payload_size;
-  frame_samples = enc->frame_samples;
+  frame_samples = input_samples = enc->frame_samples;

  g_mutex_unlock (&enc->property_lock);

@ -915,20 +918,23 @@ gst_opus_enc_encode (GstOpusEnc * enc, GstBuffer * buf)
    bsize = map.size;

    if (G_UNLIKELY (bsize % bytes)) {
+      gint64 diff;
+
      GST_DEBUG_OBJECT (enc, "draining; adding silence samples");
+      g_assert (bsize < bytes);

      /* If encoding part of a frame, and we have no set stop time on
       * the output segment, we update the segment stop time to reflect
       * the last sample. This will let oggmux set the last page's
       * granpos to tell a decoder the dummy samples should be clipped.
       */
+      input_samples = bsize / (enc->n_channels * 2);
      segment = &GST_AUDIO_ENCODER_OUTPUT_SEGMENT (enc);
      if (!GST_CLOCK_TIME_IS_VALID (segment->stop)) {
-        int input_samples = bsize / (enc->n_channels * 2);
        GST_DEBUG_OBJECT (enc,
            "No stop time and partial frame, updating segment");
        duration =
-            gst_util_uint64_scale (enc->encoded_samples + input_samples,
+            gst_util_uint64_scale_ceil (enc->consumed_samples + input_samples,
            GST_SECOND, enc->sample_rate);
        segment->stop = segment->start + duration;
        GST_DEBUG_OBJECT (enc, "new output segment %" GST_SEGMENT_FORMAT,
@ -937,6 +943,21 @@ gst_opus_enc_encode (GstOpusEnc * enc, GstBuffer * buf)
            gst_event_new_segment (segment));
      }

+      diff =
+          (enc->encoded_samples + frame_samples) - (enc->consumed_samples +
+          input_samples);
+      if (diff >= 0) {
+        GST_DEBUG_OBJECT (enc,
+            "%" G_GINT64_FORMAT " extra samples of padding in this frame",
+            diff);
+        output_samples = frame_samples - diff;
+      } else {
+        GST_DEBUG_OBJECT (enc,
+            "Need to add %" G_GINT64_FORMAT " extra samples in the next frame",
+            -diff);
+        output_samples = frame_samples;
+      }
+
      size = ((bsize / bytes) + 1) * bytes;
      mdata = g_malloc0 (size);
      memcpy (mdata, bdata, bsize);
@ -944,10 +965,34 @@ gst_opus_enc_encode (GstOpusEnc * enc, GstBuffer * buf)
    } else {
      data = bdata;
      size = bsize;
+
+      /* Adjust for lookahead here */
+      if (enc->pending_lookahead) {
+        if (input_samples > enc->pending_lookahead) {
+          output_samples = input_samples - enc->pending_lookahead;
+          enc->pending_lookahead = 0;
+        } else {
+          enc->pending_lookahead -= input_samples;
+          output_samples = 0;
        }
      } else {
+        output_samples = input_samples;
+      }
+    }
+  } else {
+    if (enc->encoded_samples < enc->consumed_samples) {
+      data = mdata = g_malloc0 (bytes);
+      size = bytes;
+      output_samples = enc->consumed_samples - enc->encoded_samples;
+      input_samples = 0;
+      GST_DEBUG_OBJECT (enc, "draining %d samples", output_samples);
+    } else if (enc->encoded_samples == enc->consumed_samples) {
      GST_DEBUG_OBJECT (enc, "nothing to drain");
      goto done;
+    } else {
+      g_assert_not_reached ();
+      goto done;
+    }
  }

  g_assert (size == bytes);
@ -963,9 +1008,6 @@ gst_opus_enc_encode (GstOpusEnc * enc, GstBuffer * buf)

  gst_buffer_map (outbuf, &omap, GST_MAP_WRITE);

-  GST_DEBUG_OBJECT (enc, "encoding %d samples (%d bytes)",
-      frame_samples, (int) bytes);
-
  outsize =
      opus_multistream_encode (enc->state, (const gint16 *) data,
      frame_samples, omap.data, max_payload_size * enc->n_channels);
@ -987,10 +1029,12 @@ gst_opus_enc_encode (GstOpusEnc * enc, GstBuffer * buf)
  GST_DEBUG_OBJECT (enc, "Output packet is %u bytes", outsize);
  gst_buffer_set_size (outbuf, outsize);

+
  ret =
      gst_audio_encoder_finish_frame (GST_AUDIO_ENCODER (enc), outbuf,
-      frame_samples);
-  enc->encoded_samples += frame_samples;
+      output_samples);
+  enc->encoded_samples += output_samples;
+  enc->consumed_samples += input_samples;

 done:

--- a/ext/opus/gstopusenc.h
+++ b/ext/opus/gstopusenc.h
@ -79,7 +79,8 @@ struct _GstOpusEnc {
  gint                  n_channels;
  gint                  sample_rate;

-  guint64               encoded_samples;
+  guint64               encoded_samples, consumed_samples;
+  guint16               lookahead, pending_lookahead;

  guint8                channel_mapping_family;
  guint8                encoding_channel_mapping[256];