gstreamer/subprojects/gst-plugins-bad/sys/dwrite/libcaption/utf8.c
2023-06-27 13:23:07 +00:00

262 lines
7 KiB
C

/**********************************************************************************************/
/* The MIT License */
/* */
/* Copyright 2016-2017 Twitch Interactive, Inc. or its affiliates. All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining a copy */
/* of this software and associated documentation files (the "Software"), to deal */
/* in the Software without restriction, including without limitation the rights */
/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
/* copies of the Software, and to permit persons to whom the Software is */
/* furnished to do so, subject to the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be included in */
/* all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN */
/* THE SOFTWARE. */
/**********************************************************************************************/
#include "utf8.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Returns a pointer to the character that follows c.
// Returns 0 when utf8_char_length() reports a zero length for c (c is NULL,
// c is at the null terminator, or c does not start a character).
const utf8_char_t *
utf8_char_next (const utf8_char_t * c)
{
  const size_t step = utf8_char_length (c);

  if (0 == step) {
    return 0;
  }

  return c + step;
}
// Returns the length in bytes of the character starting at c, judged from the
// lead byte only (the following bytes are not inspected).
// Returns 0 when c is NULL, when c is at the null terminator, or when the lead
// byte has the form 10xxxxxx or 11111xxx.
size_t
utf8_char_length (const utf8_char_t * c)
{
  // Sequence length indexed by the top five bits of the lead byte.
  static const size_t length_by_prefix[32] = {
    1, 1, 1, 1, 1, 1, 1, 1,     // 00000-00111: 0xxxxxxx, one byte
    1, 1, 1, 1, 1, 1, 1, 1,     // 01000-01111: 0xxxxxxx, one byte
    0, 0, 0, 0, 0, 0, 0, 0,     // 10000-10111: 10xxxxxx, not a lead byte
    2, 2, 2, 2,                 // 11000-11011: 110xxxxx, two bytes
    3, 3,                       // 11100-11101: 1110xxxx, three bytes
    4,                          // 11110:       11110xxx, four bytes
    0                           // 11111:       not a lead byte
  };

  if (!c) {
    return 0;
  }

  // the null terminator counts as zero size
  if (0x00 == c[0]) {
    return 0;
  }

  return length_by_prefix[(c[0] >> 3) & 0x1F];
}
// Returns 1 when c should be treated as whitespace, 0 otherwise.
// A NULL pointer, every byte from 0x00 up to and including ' ', DEL (0x7F) and
// the two byte no-break space (0xC2 0xA0) all count as whitespace.
int
utf8_char_whitespace (const utf8_char_t * c)
{
  if (!c) {
    return 1;
  }

  // control characters, the null terminator and the space itself
  if (c[0] >= 0 && c[0] <= ' ') {
    return 1;
  }

  // 0x7F is DEL
  if (c[0] == 0x7F) {
    return 1;
  }

  // EIA608_CHAR_NO_BREAK_SPACE TODO other utf8 spaces
  // c[1] is only read once c[0] is known to be 0xC2, so never past the terminator
  return 0xC2 == (unsigned char) c[0] && 0xA0 == (unsigned char) c[1];
}
// Returns the length in bytes of the first `size` characters of data.
// Pass 0 for size to measure up to the null terminator.
// Counting stops early at the first character whose length is reported as 0.
size_t
utf8_string_length (const utf8_char_t * data, utf8_size_t size)
{
  size_t total = 0;

  if (0 == size) {
    size = utf8_char_count (data, 0);
  }

  while (0 < size) {
    const size_t step = utf8_char_length (data);

    if (0 == step) {
      break;
    }

    total += step;
    data += step;
    --size;
  }

  return total;
}
// Copies the single character at src into dst and null terminates dst.
// dst must have room for the character plus the terminator.
// Returns the length of the character in bytes; when that length is 0 or dst
// is NULL nothing is written.
size_t
utf8_char_copy (utf8_char_t * dst, const utf8_char_t * src)
{
  const size_t n = utf8_char_length (src);

  if (0 == n || !dst) {
    return n;
  }

  memcpy (dst, src, n);
  dst[n] = '\0';
  return n;
}
// Returns the number of utf8 characters found in the first `size` bytes of data.
// Pass 0 for size to count up to the null terminator.
// Counting stops early at the first character whose length is reported as 0
// (null terminator or a byte that cannot start a character).
// A NULL data pointer yields 0.
utf8_size_t
utf8_char_count (const char *data, size_t size)
{
  size_t i, bytes = 0;
  utf8_size_t count = 0;

  // strlen (NULL) is undefined; utf8_char_length () already tolerates NULL,
  // so treat a missing string as an empty one here as well.
  if (!data) {
    return 0;
  }

  if (0 == size) {
    size = strlen (data);
  }

  // count is only bumped after a character with non-zero length was consumed
  for (i = 0; i < size; ++count, i += bytes) {
    if (0 == (bytes = utf8_char_length (&data[i]))) {
      break;
    }
  }

  return count;
}
// Returns the length in bytes of the line with trailing whitespace (as defined
// by utf8_char_whitespace) trimmed off.
// At most `charcters` characters are examined, stopping at the null terminator.
size_t
utf8_trimmed_length (const utf8_char_t * data, utf8_size_t charcters)
{
  size_t offset = 0;            // bytes consumed so far
  size_t keep = 0;              // offset just past the last non-whitespace character
  size_t seen = 0;              // characters examined so far

  while ((*data) && seen < charcters) {
    const size_t width = utf8_char_length (data);

    offset += width;
    if (!utf8_char_whitespace (data)) {
      keep = offset;
    }

    data += width;
    ++seen;
  }

  return keep;
}
// Returns the length in bytes of the line ending at the start of data
// (2 for CRLF or LFCR, 1 for a lone CR or LF), or 0 if data does not start
// with a line ending.
size_t
_utf8_newline (const utf8_char_t * data)
{
  switch (data[0]) {
    case '\r':
      // CRLF (windows) or a lone CR (mac)
      return ('\n' == data[1]) ? 2 : 1;
    case '\n':
      // LFCR (riscos) or a lone LF (unix)
      return ('\r' == data[1]) ? 2 : 1;
    default:
      return 0;
  }
}
// Returns the length in bytes of the first line in data, including the new
// line character(s). When no line ending is found, returns the number of bytes
// up to the null terminator. A NULL data pointer yields 0.
// Auto detects between windows (CRLF), unix (LF), mac (CR) and riscos (LFCR)
// line endings.
size_t
utf8_line_length (const utf8_char_t * data)
{
  size_t n, len;

  if (!data) {
    return 0;
  }

  // Scan by byte offset from a fixed base. Advancing the pointer and the
  // offset together would test the terminator at twice the intended position
  // and run past the end of the string. A byte-wise scan is safe for utf8
  // because CR and LF never occur inside a multi-byte sequence.
  for (len = 0; 0 != data[len]; ++len) {
    if (0 < (n = _utf8_newline (data + len))) {
      return len + n;
    }
  }

  return len;
}
// Returns the number of characters to include before splitting the line.
// Examines up to size + 1 characters: a line ending splits immediately at its
// position, otherwise the split falls on the last whitespace character seen.
// If no split point can be found, the size passed in is returned.
utf8_size_t
utf8_wrap_length (const utf8_char_t * data, utf8_size_t size)
{
  size_t best = size;
  size_t idx = 0;

  // The bound is inclusive so that the character right after a full line of
  // `size` characters is still checked for being a split point.
  while (idx <= size) {
    if (_utf8_newline (data)) {
      return idx;
    }

    if (utf8_char_whitespace (data)) {
      best = idx;
    }

    data += utf8_char_length (data);
    ++idx;
  }

  return best;
}
// Returns the number of lines in data, i.e. how many times utf8_line_length()
// reports a non-zero length while walking the string.
int
utf8_line_count (const utf8_char_t * data)
{
  int lines = 0;

  for (;;) {
    const size_t step = utf8_line_length (data);

    if (0 == step) {
      break;
    }

    data += step;
    ++lines;
  }

  return lines;
}
// Loads the file at path into a newly allocated, null terminated buffer.
// On entry *size is the largest acceptable file size in bytes (0 for no
// limit). On success the buffer is returned and *size is set to the number of
// bytes read; the buffer comes from malloc () and is released with free ().
// Returns NULL when path or size is NULL, the file cannot be opened or
// measured, the file is larger than the limit, or the allocation fails.
// *size is left untouched on failure, except on allocation failure where it
// has already been reset to 0.
utf8_char_t *
utf8_load_text_file (const char *path, size_t * size)
{
  utf8_char_t *data = NULL;
  FILE *file = NULL;
  long end = 0;
  size_t file_size = 0;

  if (!path || !size) {
    return NULL;
  }

  file = fopen (path, "r");
  if (!file) {
    return NULL;
  }

  // Measure the file; ftell () reports failure as -1, which must not be
  // turned into a huge size_t.
  if (0 != fseek (file, 0, SEEK_END) || (end = ftell (file)) < 0
      || 0 != fseek (file, 0, SEEK_SET)) {
    fclose (file);
    return NULL;
  }
  file_size = (size_t) end;

  if (0 == (*size) || file_size <= (*size)) {
    (*size) = 0;
    data = (utf8_char_t *) malloc (1 + file_size);

    if (data) {
      utf8_char_t *pos = data;
      size_t bytes_read = 0;

      memset (data, '\0', file_size);

      // fread () may return short counts; keep going until the expected size
      // was read or nothing more is available.
      while (0 < (bytes_read = fread (pos, 1, file_size - (*size), file))) {
        pos += bytes_read;
        (*size) += bytes_read;
      }

      // terminate only when there is a buffer to terminate
      data[*size] = 0;
    }
  }

  // close on every path, including when the file exceeds the limit
  fclose (file);
  return data;
}
#ifndef strnstr
// Finds the first occurrence of the null terminated string2 in string1,
// searching no more than len bytes of string1. Bytes that follow string1's
// null terminator are not searched.
// Returns a pointer to the match, string1 itself when string2 is empty, or
// NULL when there is no match.
char *
strnstr (const char *string1, const char *string2, size_t len)
{
  const size_t length2 = strlen (string2);

  if (!length2) {
    return (char *) string1;
  }

  // Stop at the terminator as well as at the length limit, and compare with
  // strncmp () so that a comparison never runs beyond the end of string1.
  for (; len >= length2 && '\0' != *string1; --len, ++string1) {
    if (0 == strncmp (string1, string2, length2)) {
      return (char *) string1;
    }
  }

  return NULL;
}
#endif