#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <parserutils/charset/utf8.h>
#include "charset/encodings/utf8impl.h"
Go to the source code of this file.
Functions | |
| parserutils_error | parserutils_charset_utf8_to_ucs4 (const uint8_t *s, size_t len, uint32_t *ucs4, size_t *clen) |
| Convert a UTF-8 multibyte sequence into a single UCS-4 character. | |
| parserutils_error | parserutils_charset_utf8_from_ucs4 (uint32_t ucs4, uint8_t **s, size_t *len) |
| Convert a single UCS-4 character into a UTF-8 multibyte sequence. | |
| parserutils_error | parserutils_charset_utf8_length (const uint8_t *s, size_t max, size_t *len) |
| Calculate the length (in characters) of a bounded UTF-8 string. | |
| parserutils_error | parserutils_charset_utf8_char_byte_length (const uint8_t *s, size_t *len) |
| Calculate the length (in bytes) of a UTF-8 character. | |
| parserutils_error | parserutils_charset_utf8_prev (const uint8_t *s, uint32_t off, uint32_t *prevoff) |
| Find previous legal UTF-8 char in string. | |
| parserutils_error | parserutils_charset_utf8_next (const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff) |
| Find next legal UTF-8 char in string. | |
| parserutils_error | parserutils_charset_utf8_next_paranoid (const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff) |
| Find next legal UTF-8 char in string, without assuming the string is valid. | |
Variables | |
| const uint8_t | numContinuations [256] |
| Number of continuation bytes for a given start byte. | |
Definition in file utf8.c.
parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s, size_t *len)
Calculate the length (in bytes) of a UTF-8 character.
Parameters:
  s: Pointer to start of character
  len: Pointer to location to receive length
Definition at line 107 of file utf8.c.
References UTF8_CHAR_BYTE_LENGTH.
Referenced by parserutils_inputstream_peek(), and parserutils_inputstream_peek_slow().
parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s, size_t *len)
Convert a single UCS-4 character into a UTF-8 multibyte sequence.
RFC 3629 removed the encoding of UCS values outside the range reachable by UTF-16 (above U+10FFFF). This function, however, conforms to the older RFC 2279.
Parameters:
  ucs4: The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
  s: Pointer to pointer to output buffer, updated on exit
  len: Pointer to length, in bytes, of output buffer, updated on exit
Definition at line 72 of file utf8.c.
References UTF8_FROM_UCS4.
parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max, size_t *len)
Calculate the length (in characters) of a bounded UTF-8 string.
Parameters:
  s: The string
  max: Maximum length
  len: Pointer to location to receive length of string
Definition at line 90 of file utf8.c.
References UTF8_LENGTH.
parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff)
Find next legal UTF-8 char in string.
Parameters:
  s: The string (assumed valid)
  len: Maximum offset in string
  off: Offset in the string to start at
  nextoff: Pointer to location to receive offset of first byte of next legal character
Definition at line 146 of file utf8.c.
References UTF8_NEXT.
parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff)
Find next legal UTF-8 char in string, without assuming the string is valid.
Parameters:
  s: The string (assumed to be of dubious validity)
  len: Maximum offset in string
  off: Offset in the string to start at
  nextoff: Pointer to location to receive offset of first byte of next legal character
Definition at line 166 of file utf8.c.
References UTF8_NEXT_PARANOID.
parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off, uint32_t *prevoff)
Find previous legal UTF-8 char in string.
Parameters:
  s: The string
  off: Offset in the string to start at
  prevoff: Pointer to location to receive offset of first byte of previous legal character
Definition at line 126 of file utf8.c.
References UTF8_PREV.
parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len, uint32_t *ucs4, size_t *clen)
Convert a UTF-8 multibyte sequence into a single UCS-4 character.
RFC 3629 removed the encoding of UCS values outside the range reachable by UTF-16 (above U+10FFFF). This function, however, conforms to the older RFC 2279.
Parameters:
  s: The sequence to process
  len: Length of sequence
  ucs4: Pointer to location to receive UCS-4 character (host endian)
  clen: Pointer to location to receive byte length of UTF-8 sequence
Definition at line 51 of file utf8.c.
References UTF8_TO_UCS4.
| const uint8_t numContinuations[256] |
Initial value:
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
}
1.5.6