From 7ce74ccd5db8cc084ebe07da9e33366712064ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?= Date: Fri, 13 Jun 2008 15:46:03 +0000 Subject: gst/mpegtsparse/mpegtspacketizer.c: Handle character sets in strings coming from DVB SI according to the DVB SI spec. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Original commit message from CVS: patch by: Sebastian Pölsterl * gst/mpegtsparse/mpegtspacketizer.c: Handle character sets in strings coming from DVB SI according to the DVB SI spec. --- gst/mpegtsparse/mpegtspacketizer.c | 297 +++++++++++++++++++++++++++++++++---- 1 file changed, 272 insertions(+), 25 deletions(-) (limited to 'gst/mpegtsparse/mpegtspacketizer.c') diff --git a/gst/mpegtsparse/mpegtspacketizer.c b/gst/mpegtsparse/mpegtspacketizer.c index a2249fae..220bd49b 100644 --- a/gst/mpegtsparse/mpegtspacketizer.c +++ b/gst/mpegtsparse/mpegtspacketizer.c @@ -31,6 +31,11 @@ G_DEFINE_TYPE (MpegTSPacketizer, mpegts_packetizer, G_TYPE_OBJECT); static void mpegts_packetizer_dispose (GObject * object); static void mpegts_packetizer_finalize (GObject * object); +static gchar *convert_to_utf8 (const gchar * text, gint length, guint start, + const gchar * encoding, gboolean is_multibyte, GError ** error); +static gchar *get_encoding (const gchar * text, guint * start_text, + gboolean * is_multibyte); +static gchar *get_encoding_and_convert (const gchar * text, guint length); #define CONTINUITY_UNSET 255 #define MAX_CONTINUITY 15 @@ -606,11 +611,9 @@ mpegts_packetizer_parse_nit (MpegTSPacketizer * packetizer, DESC_DVB_NETWORK_NAME_length (networkname_descriptor); gchar *networkname = (gchar *) DESC_DVB_NETWORK_NAME_text (networkname_descriptor); - if (networkname[0] < 0x20) { - networkname_length -= 1; - networkname += 1; - } - networkname_tmp = g_strndup (networkname, networkname_length); + + networkname_tmp = + get_encoding_and_convert (networkname, networkname_length); gst_structure_set (nit, "network-name", G_TYPE_STRING, networkname_tmp, NULL); g_free (networkname_tmp); @@ -1213,17 +1216,13 @@ mpegts_packetizer_parse_sdt (MpegTSPacketizer * packetizer, (gchar *) DESC_DVB_SERVICE_name_text (service_descriptor); if (servicename_length + serviceprovider_name_length + 2 <= DESC_LENGTH (service_descriptor)) { - if (servicename[0] < 0x20) { - servicename_length -= 1; - servicename += 1; - } - if (serviceprovider_name[0] < 0x20) { - serviceprovider_name_length -= 1; - serviceprovider_name += 1; - } - servicename_tmp = g_strndup (servicename, servicename_length); + + servicename_tmp = + get_encoding_and_convert (servicename, servicename_length); serviceprovider_name_tmp = - g_strndup (serviceprovider_name, serviceprovider_name_length); + get_encoding_and_convert (serviceprovider_name, + serviceprovider_name_length); + gst_structure_set (service, "name", G_TYPE_STRING, servicename_tmp, NULL); gst_structure_set (service, "provider-name", G_TYPE_STRING, @@ -1444,17 +1443,12 @@ mpegts_packetizer_parse_eit (MpegTSPacketizer * packetizer, (gchar *) DESC_DVB_SHORT_EVENT_description_text (event_descriptor); if (eventname_length + eventdescription_length + 2 <= DESC_LENGTH (event_descriptor)) { - if (eventname[0] < 0x20) { - eventname_length -= 1; - eventname += 1; - } - if (eventdescription[0] < 0x20) { - eventdescription_length -= 1; - eventdescription += 1; - } - eventname_tmp = g_strndup (eventname, eventname_length), + + eventname_tmp = + get_encoding_and_convert (eventname, eventname_length), eventdescription_tmp = - g_strndup (eventdescription, eventdescription_length); + get_encoding_and_convert (eventdescription, + eventdescription_length); gst_structure_set (event, "name", G_TYPE_STRING, eventname_tmp, NULL); gst_structure_set (event, "description", G_TYPE_STRING, @@ -1933,3 +1927,256 @@ mpegts_packetizer_init_debug () GST_DEBUG_CATEGORY_INIT (mpegts_packetizer_debug, "mpegtspacketizer", 0, "MPEG transport stream parser"); } + +/** + * @text: The text you want to get the encoding from + * @start_text: Location where the beginning of the actual text is stored + * @is_multibyte: Location where information whether it's a multibyte encoding + * or not is stored + * @returns: Name of encoding or NULL of encoding could not be detected. + * + * The returned string should be freed with g_free () when no longer needed. + */ +static gchar * +get_encoding (const gchar * text, guint * start_text, gboolean * is_multibyte) +{ + gchar *encoding; + guint8 firstbyte; + + g_return_val_if_fail (text != NULL, NULL); + + firstbyte = (guint8) text[0]; + + if (firstbyte == 0x01) { + encoding = g_strdup ("iso8859-5"); + *start_text = 1; + *is_multibyte = FALSE; + } else if (firstbyte == 0x02) { + encoding = g_strdup ("iso8859-6"); + *start_text = 1; + *is_multibyte = FALSE; + } else if (firstbyte == 0x03) { + encoding = g_strdup ("iso8859-7"); + *start_text = 1; + *is_multibyte = FALSE; + } else if (firstbyte == 0x04) { + encoding = g_strdup ("iso8859-8"); + *start_text = 1; + *is_multibyte = FALSE; + } else if (firstbyte == 0x05) { + encoding = g_strdup ("iso8859-9"); + *start_text = 1; + *is_multibyte = FALSE; + } else if (firstbyte >= 0x20) { + encoding = g_strdup ("iso6937"); + *start_text = 0; + *is_multibyte = FALSE; + } else if (firstbyte == 0x10) { + guint16 table; + gchar table_str[6]; + + text++; + table = GST_READ_UINT16_BE (text); + g_snprintf (table_str, 6, "%d", table); + + encoding = g_strconcat ("iso8859-", table_str, NULL); + *start_text = 3; + *is_multibyte = FALSE; + } else if (firstbyte == 0x11) { + encoding = g_strdup ("ISO-10646/UCS2"); + *start_text = 1; + *is_multibyte = TRUE; + } else if (firstbyte == 0x12) { + // That's korean encoding. + // The spec says it's encoded in KSC 5601, but iconv only knows KSC 5636. + // Couldn't find any information about either of them. + encoding = NULL; + *start_text = 1; + *is_multibyte = TRUE; + } else { + // reserved + encoding = NULL; + } + + return encoding; +} + +/** + * @text: The text to convert. It may include pango markup ( and ) + * @length: The length of the string -1 if it's nul-terminated + * @start: Where to start converting in the text + * @encoding: The encoding of text + * @is_multibyte: Whether the encoding is a multibyte encoding + * @error: The location to store the error, or NULL to ignore errors + * @returns: UTF-8 encoded string + * + * Convert text to UTF-8. + */ +static gchar * +convert_to_utf8 (const gchar * text, gint length, guint start, + const gchar * encoding, gboolean is_multibyte, GError ** error) +{ + gchar *new_text; + GByteArray *sb; + gint i; + + g_return_val_if_fail (text != NULL, NULL); + g_return_val_if_fail (encoding != NULL, NULL); + + text += start; + + sb = g_byte_array_sized_new (length * 1.1); + + if (is_multibyte) { + if (length == -1) { + while (*text != '\0') { + guint16 code = GST_READ_UINT16_BE (text); + + switch (code) { + case 0xE086:{ + guint8 emph_on[] = { 0x3C, 0x00, // < + 0x62, 0x00, // b + 0x3E, 0x00 // > + }; + g_byte_array_append (sb, emph_on, 6); + break; + } + case 0xE087:{ + guint8 emph_on[] = { 0x3C, 0x00, // < + 0x2F, 0x00, // / + 0x62, 0x00, // b + 0x3E, 0x00 // > + }; + g_byte_array_append (sb, emph_on, 8); + break; + } + case 0xE08A:{ + guint8 nl[] = { 0x0A, 0x00 }; // new line + g_byte_array_append (sb, nl, 2); + break; + } + default: + g_byte_array_append (sb, (guint8 *) text, 2); + break; + } + + text += 2; + } + } else { + for (i = 0; i < length; i += 2) { + guint16 code = GST_READ_UINT16_BE (text); + + switch (code) { + case 0xE086:{ + guint8 emph_on[] = { 0x3C, 0x00, // < + 0x62, 0x00, // b + 0x3E, 0x00 // > + }; + g_byte_array_append (sb, emph_on, 6); + break; + } + case 0xE087:{ + guint8 emph_on[] = { 0x3C, 0x00, // < + 0x2F, 0x00, // / + 0x62, 0x00, // b + 0x3E, 0x00 // > + }; + g_byte_array_append (sb, emph_on, 8); + break; + } + case 0xE08A:{ + guint8 nl[] = { 0x0A, 0x00 }; // new line + g_byte_array_append (sb, nl, 2); + break; + } + default: + g_byte_array_append (sb, (guint8 *) text, 2); + break; + } + + text += 2; + } + } + } else { + if (length == -1) { + while (*text != '\0') { + guint8 code = (guint8) (*text); + + switch (code) { + case 0x86: + g_byte_array_append (sb, (guint8 *) "", 3); + break; + case 0x87: + g_byte_array_append (sb, (guint8 *) "", 4); + break; + case 0x8A: + g_byte_array_append (sb, (guint8 *) "\n", 1); + break; + default: + g_byte_array_append (sb, &code, 1); + break; + } + + text++; + } + } else { + for (i = 0; i < length; i++) { + guint8 code = (guint8) (*text); + + switch (code) { + case 0x86: + g_byte_array_append (sb, (guint8 *) "", 3); + break; + case 0x87: + g_byte_array_append (sb, (guint8 *) "", 4); + break; + case 0x8A: + g_byte_array_append (sb, (guint8 *) "\n", 1); + break; + default: + g_byte_array_append (sb, &code, 1); + break; + } + + text++; + } + } + } + + new_text = + g_convert ((gchar *) sb->data, sb->len, "utf-8", encoding, NULL, NULL, + error); + + g_byte_array_free (sb, TRUE); + + return new_text; +} + +static gchar * +get_encoding_and_convert (const gchar * text, guint length) +{ + GError *error = NULL; + gchar *converted_str; + gchar *encoding; + guint start_text = 0; + gboolean is_multibyte; + + encoding = get_encoding (text, &start_text, &is_multibyte); + + if (encoding == NULL) { + converted_str = g_strndup (text, length); + } else { + converted_str = convert_to_utf8 (text, length - start_text, start_text, + encoding, is_multibyte, &error); + if (error != NULL) { + g_critical ("Could not convert string: %s", error->message); + g_error_free (error); + text += start_text; + converted_str = g_strndup (text, length - start_text); + } + + g_free (encoding); + } + + return converted_str; +} -- cgit v1.2.1