From caca46e0e6dce823783f85174aaff9e49bd088a8 Mon Sep 17 00:00:00 2001 From: Mathieu Duponchelle Date: Fri, 20 Mar 2020 19:09:17 +0100 Subject: [PATCH] subparse: convert from pango-markup to utf8 .. when downstream requires it --- gst/subparse/gstsubparse.c | 124 +++++++++++++++++++++++++++++--- gst/subparse/gstsubparse.h | 1 + tests/check/elements/subparse.c | 29 ++++++++ 3 files changed, 146 insertions(+), 8 deletions(-) diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c index bb3435c0d0..1ab53e7a03 100644 --- a/gst/subparse/gstsubparse.c +++ b/gst/subparse/gstsubparse.c @@ -179,6 +179,7 @@ gst_sub_parse_init (GstSubParse * subparse) subparse->textbuf = g_string_new (NULL); subparse->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN; + subparse->strip_pango_markup = FALSE; subparse->flushing = FALSE; gst_segment_init (&subparse->segment, GST_FORMAT_TIME); subparse->need_segment = TRUE; @@ -1724,11 +1725,97 @@ feed_textbuf (GstSubParse * self, GstBuffer * buf) g_free (input); } + +static void +xml_text (GMarkupParseContext * context, + const gchar * text, gsize text_len, gpointer user_data, GError ** error) +{ + gchar **accum = (gchar **) user_data; + gchar *concat; + + if (*accum) { + concat = g_strconcat (*accum, text, NULL); + g_free (*accum); + *accum = concat; + } else { + *accum = g_strdup (text); + } +} + +static gchar * +strip_pango_markup (gchar * markup, GError ** error) +{ + GMarkupParser parser = { 0, }; + GMarkupParseContext *context; + gchar *accum = NULL; + + parser.text = xml_text; + context = g_markup_parse_context_new (&parser, 0, &accum, NULL); + + g_markup_parse_context_parse (context, "", 6, NULL); + g_markup_parse_context_parse (context, markup, strlen (markup), error); + g_markup_parse_context_parse (context, "", 7, NULL); + if (*error) + goto error; + + g_markup_parse_context_end_parse (context, error); + if (*error) + goto error; + +done: + g_markup_parse_context_free (context); + return accum; + +error: + g_free (accum); + accum = NULL; + goto done; +} + +static gboolean +gst_sub_parse_negotiate (GstSubParse * self, GstCaps * preferred) +{ + GstCaps *caps; + gboolean ret = FALSE; + const GstStructure *s1, *s2; + + caps = gst_pad_get_allowed_caps (self->srcpad); + + s1 = gst_caps_get_structure (preferred, 0); + + if (!g_strcmp0 (gst_structure_get_string (s1, "format"), "utf8")) { + GstCaps *intersected = gst_caps_intersect (caps, preferred); + gst_caps_unref (caps); + caps = intersected; + } + + caps = gst_caps_fixate (caps); + + if (gst_caps_is_empty (caps)) { + goto done; + } + + s2 = gst_caps_get_structure (caps, 0); + + self->strip_pango_markup = + !g_strcmp0 (gst_structure_get_string (s2, "format"), "utf8") + && !g_strcmp0 (gst_structure_get_string (s1, "format"), "pango-markup"); + + if (self->strip_pango_markup) { + GST_INFO_OBJECT (self, "We will convert from pango-markup to utf8"); + } + + ret = gst_pad_set_caps (self->srcpad, caps); + +done: + gst_caps_unref (caps); + return ret; +} + static GstFlowReturn handle_buffer (GstSubParse * self, GstBuffer * buf) { GstFlowReturn ret = GST_FLOW_OK; - GstCaps *caps = NULL; gchar *line, *subtitle; gboolean need_tags = FALSE; @@ -1747,14 +1834,19 @@ handle_buffer (GstSubParse * self, GstBuffer * buf) /* make sure we know the format */ if (G_UNLIKELY (self->parser_type == GST_SUB_PARSE_FORMAT_UNKNOWN)) { - if (!(caps = gst_sub_parse_format_autodetect (self))) { - return GST_FLOW_EOS; + GstCaps *preferred; + + if (!(preferred = gst_sub_parse_format_autodetect (self))) { + return GST_FLOW_NOT_NEGOTIATED; } - if (!gst_pad_set_caps (self->srcpad, caps)) { - gst_caps_unref (caps); - return GST_FLOW_EOS; + + if (!gst_sub_parse_negotiate (self, preferred)) { + gst_caps_unref (preferred); + return GST_FLOW_NOT_NEGOTIATED; } - gst_caps_unref (caps); + + gst_caps_unref (preferred); + need_tags = TRUE; } @@ -1790,7 +1882,22 @@ handle_buffer (GstSubParse * self, GstBuffer * buf) g_free (line); if (subtitle) { - guint subtitle_len = strlen (subtitle); + guint subtitle_len; + + if (self->strip_pango_markup) { + GError *error = NULL; + gchar *stripped; + + if ((stripped = strip_pango_markup (subtitle, &error))) { + g_free (subtitle); + subtitle = stripped; + } else { + GST_WARNING_OBJECT (self, "Failed to strip pango markup: %s", + error->message); + } + } + + subtitle_len = strlen (subtitle); /* +1 for terminating NUL character */ buf = gst_buffer_new_and_alloc (subtitle_len + 1); @@ -1947,6 +2054,7 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition) /* format detection will init the parser state */ self->offset = 0; self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN; + self->strip_pango_markup = FALSE; self->valid_utf8 = TRUE; self->first_buffer = TRUE; g_free (self->detected_encoding); diff --git a/gst/subparse/gstsubparse.h b/gst/subparse/gstsubparse.h index 41f73acf55..6e156fb928 100644 --- a/gst/subparse/gstsubparse.h +++ b/gst/subparse/gstsubparse.h @@ -99,6 +99,7 @@ struct _GstSubParse { gboolean valid_utf8; gchar *detected_encoding; gchar *encoding; + gboolean strip_pango_markup; gboolean first_buffer; diff --git a/tests/check/elements/subparse.c b/tests/check/elements/subparse.c index e6c75e0dd3..aa1387815b 100644 --- a/tests/check/elements/subparse.c +++ b/tests/check/elements/subparse.c @@ -22,6 +22,7 @@ #endif #include +#include #include @@ -1036,6 +1037,33 @@ GST_START_TEST (test_lrc) GST_END_TEST; +GST_START_TEST (test_raw_conversion) +{ + GstHarness *h; + GstBuffer *buffer; + GstMapInfo map; + + h = gst_harness_new ("subparse"); + + gst_harness_set_src_caps_str (h, "application/x-subtitle"); + gst_harness_set_sink_caps_str (h, "text/x-raw, format=utf8"); + + buffer = buffer_from_static_string (srt_input[5].in); + + buffer = gst_harness_push_and_pull (h, buffer); + + gst_buffer_map (buffer, &map, GST_MAP_READ); + fail_unless_equals_int (map.size, 3); + fail_unless_equals_string ((gchar *) map.data, "Six"); + gst_buffer_unmap (buffer, &map); + + gst_clear_buffer (&buffer); + + gst_harness_teardown (h); +} + +GST_END_TEST; + /* TODO: * - add/modify tests so that lines aren't dogfed to the parsers in complete * lines or sets of complete lines, but rather in random chunks @@ -1071,6 +1099,7 @@ subparse_suite (void) tcase_add_test (tc_chain, test_sami_bad_entities); tcase_add_test (tc_chain, test_sami_comment); tcase_add_test (tc_chain, test_lrc); + tcase_add_test (tc_chain, test_raw_conversion); return s; }