mpegts: Add function to encode UTF8 strings

This can be used to create descriptors with the appropriate character maps.
This commit is contained in:
Jesper Larsen 2013-10-30 10:43:21 +01:00
parent 465dea6f32
commit 4630dfda05
2 changed files with 155 additions and 0 deletions

View file

@ -32,6 +32,7 @@ GST_DEBUG_CATEGORY_EXTERN (gst_mpegts_debug);
G_GNUC_INTERNAL void __initialize_descriptors (void);
G_GNUC_INTERNAL guint32 _calc_crc32 (const guint8 *data, guint datalen);
G_GNUC_INTERNAL gchar *get_encoding_and_convert (const gchar *text, guint length);
/* Converts a UTF-8 string to EN 300 468 text; the byte length of the
 * returned (caller-owned) buffer is stored in out_size. */
G_GNUC_INTERNAL guint8 *dvb_text_from_utf8 (const gchar * text, gsize *out_size);
typedef gpointer (*GstMpegTsParseFunc) (GstMpegTsSection *section);
G_GNUC_INTERNAL gpointer __common_desc_checks (GstMpegTsSection *section,

View file

@ -259,6 +259,160 @@ _get_iconv (LocalIconvCode from, LocalIconvCode to)
return __iconvs[from][to];
}
/*
 * _encode_control_codes:
 * @text: buffer of already converted text, modified in place
 * @length: number of bytes in @text
 * @is_multibyte: TRUE to scan @text as big-endian 2-byte units,
 *   FALSE to scan it byte by byte
 *
 * Rewrites line feeds into the control code used by this encoder:
 * 0x0A becomes 0x8A in single-byte mode, and 0x000A becomes 0xE08A in
 * two-byte mode. The size of the text never changes.
 */
static void
_encode_control_codes (gchar * text, gsize length, gboolean is_multibyte)
{
  gsize pos;

  if (text == NULL)
    return;

  if (is_multibyte) {
    /* Only touch complete 2-byte units: with an odd @length the last byte
     * has no partner, and reading/writing text[pos + 1] would run past the
     * end of the buffer. */
    for (pos = 0; pos + 1 < length; pos += 2) {
      guint16 code = GST_READ_UINT16_BE (text + pos);

      if (code == 0x000A) {
        text[pos] = 0xE0;
        text[pos + 1] = 0x8A;
      }
    }
    return;
  }

  for (pos = 0; pos < length; pos++) {
    guint8 code = text[pos];

    if (code == 0x0A)
      text[pos] = 0x8A;
  }
}
/**
* dvb_text_from_utf8:
* @text: The text to convert. This should be in UTF-8 format
* @out_size: (out) the byte length of the new text
*
* Converts UTF-8 strings to text characters compliant with EN 300 468.
* The converted text can be used directly in DVB #GstMpegTsDescriptor
*
* The function will try different character maps until the string is
* completely converted.
*
* The function tries the default ISO 6937 character map first.
*
* If no character map that contains all characters could be found, the
* string is converted to ISO 6937 with unknown characters set to `?`.
*
* Returns: (transfer full) byte array of size @out_size
*/
guint8 *
dvb_text_from_utf8 (const gchar * text, gsize * out_size)
{
GError *error = NULL;
gchar *out_text;
guint8 *out_buffer;
guint encoding;
GIConv giconv = (GIConv) - 1;
/* We test character maps one-by-one. Start with the default */
encoding = _ICONV_ISO6937;
giconv = _get_iconv (_ICONV_UTF8, encoding);
out_text = g_convert_with_iconv (text, -1, giconv, NULL, out_size, &error);
if (out_text) {
GST_DEBUG ("Using default ISO6937 encoding");
goto out;
}
g_clear_error (&error);
for (encoding = _ICONV_ISO8859_1; encoding <= _ICONV_ISO10646_UTF8;
encoding++) {
giconv = _get_iconv (_ICONV_UTF8, encoding);
if (giconv == (GIConv) - 1)
continue;
out_text = g_convert_with_iconv (text, -1, giconv, NULL, out_size, &error);
if (out_text) {
GST_DEBUG ("Found suitable character map - %s", iconvtablename[encoding]);
goto out;
}
g_clear_error (&error);
}
out_text = g_convert_with_fallback (text, -1, iconvtablename[_ICONV_ISO6937],
iconvtablename[_ICONV_UTF8], "?", NULL, out_size, &error);
out:
if (error) {
GST_WARNING ("Could not convert from utf-8: %s", error->message);
g_error_free (error);
if (out_text)
g_free (out_text);
return NULL;
}
switch (encoding) {
case _ICONV_ISO6937:
/* Default encoding contains no selection bytes. */
_encode_control_codes (out_text, *out_size, FALSE);
return (guint8 *) out_text;
case _ICONV_ISO8859_1:
case _ICONV_ISO8859_2:
case _ICONV_ISO8859_3:
case _ICONV_ISO8859_4:
/* These character sets requires 3 selection bytes */
_encode_control_codes (out_text, *out_size, FALSE);
out_buffer = g_malloc (*out_size + 3);
out_buffer[0] = 0x10;
out_buffer[1] = 0x00;
out_buffer[2] = encoding - _ICONV_ISO8859_1 + 1;
memcpy (out_buffer + 3, out_text, *out_size);
*out_size += 3;
g_free (out_text);
return out_buffer;
case _ICONV_ISO8859_5:
case _ICONV_ISO8859_6:
case _ICONV_ISO8859_7:
case _ICONV_ISO8859_8:
case _ICONV_ISO8859_9:
case _ICONV_ISO8859_10:
case _ICONV_ISO8859_11:
case _ICONV_ISO8859_12:
case _ICONV_ISO8859_13:
case _ICONV_ISO8859_14:
case _ICONV_ISO8859_15:
/* These character sets requires 1 selection byte */
_encode_control_codes (out_text, *out_size, FALSE);
out_buffer = g_malloc (*out_size + 1);
out_buffer[0] = encoding - _ICONV_ISO8859_5 + 1;
memcpy (out_buffer + 1, out_text, *out_size);
*out_size += 1;
g_free (out_text);
return out_buffer;
case _ICONV_UCS_2BE:
case _ICONV_EUC_KR:
case _ICONV_UTF_16BE:
/* These character sets requires 1 selection byte */
_encode_control_codes (out_text, *out_size, TRUE);
out_buffer = g_malloc (*out_size + 1);
out_buffer[0] = encoding - _ICONV_UCS_2BE + 0x11;
memcpy (out_buffer + 1, out_text, *out_size);
*out_size += 1;
g_free (out_text);
return out_buffer;
case _ICONV_GB2312:
case _ICONV_ISO10646_UTF8:
/* These character sets requires 1 selection byte */
_encode_control_codes (out_text, *out_size, FALSE);
out_buffer = g_malloc (*out_size + 1);
out_buffer[0] = encoding - _ICONV_UCS_2BE + 0x11;
memcpy (out_buffer + 1, out_text, *out_size);
*out_size += 1;
g_free (out_text);
return out_buffer;
default:
g_free (out_text);
return NULL;
}
}
/*
* @text: The text to convert. It may include pango markup (<b> and </b>)
* @length: The length of the string -1 if it's nul-terminated