utf16.c
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00012 #include <stdbool.h>
00013 #include <stdlib.h>
00014 #include <string.h>
00015
00016 #include <parserutils/charset/utf16.h>
00017
00027 parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
00028 size_t len, uint32_t *ucs4, size_t *clen)
00029 {
00030 const uint16_t *ss = (const uint16_t *) (const void *) s;
00031
00032 if (s == NULL || ucs4 == NULL || clen == NULL)
00033 return PARSERUTILS_BADPARM;
00034
00035 if (len < 2)
00036 return PARSERUTILS_NEEDDATA;
00037
00038 if (*ss < 0xD800 || *ss > 0xDFFF) {
00039 *ucs4 = *ss;
00040 *clen = 2;
00041 } else if (0xD800 <= *ss && *ss <= 0xDBFF) {
00042
00043 if (len < 4)
00044 return PARSERUTILS_NEEDDATA;
00045
00046 if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) {
00047
00048 *ucs4 = (((ss[0] & 0x3FF) << 10) | (ss[1] & 0x3FF))
00049 + (1<<16);
00050 *clen = 4;
00051 } else {
00052 return PARSERUTILS_INVALID;
00053 }
00054 } else {
00055
00056 return PARSERUTILS_INVALID;
00057 }
00058
00059 return PARSERUTILS_OK;
00060 }
00061
00070 parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
00071 size_t *len)
00072 {
00073 uint16_t *ss = (uint16_t *) (void *) s;
00074 uint32_t l = 0;
00075
00076 if (s == NULL || len == NULL)
00077 return PARSERUTILS_BADPARM;
00078 else if (ucs4 < 0x10000) {
00079 *ss = (uint16_t) ucs4;
00080 l = 2;
00081 } else if (ucs4 < 0x110000) {
00082 ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10);
00083 ss[1] = 0xDC00 | (ucs4 & 0x3ff);
00084 l = 4;
00085 } else {
00086 return PARSERUTILS_INVALID;
00087 }
00088
00089 *len = l;
00090
00091 return PARSERUTILS_OK;
00092 }
00093
00102 parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
00103 size_t *len)
00104 {
00105 const uint16_t *ss = (const uint16_t *) (const void *) s;
00106 const uint16_t *end = (const uint16_t *) (const void *) (s + max);
00107 int l = 0;
00108
00109 if (s == NULL || len == NULL)
00110 return PARSERUTILS_BADPARM;
00111
00112 while (ss < end) {
00113 if (*ss < 0xD800 || 0xDFFF < *ss)
00114 ss++;
00115 else
00116 ss += 2;
00117
00118 l++;
00119 }
00120
00121 *len = l;
00122
00123 return PARSERUTILS_OK;
00124 }
00125
00133 parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
00134 size_t *len)
00135 {
00136 const uint16_t *ss = (const uint16_t *) (const void *) s;
00137
00138 if (s == NULL || len == NULL)
00139 return PARSERUTILS_BADPARM;
00140
00141 if (*ss < 0xD800 || 0xDFFF < *ss)
00142 *len = 2;
00143 else
00144 *len = 4;
00145
00146 return PARSERUTILS_OK;
00147 }
00148
00158 parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
00159 uint32_t *prevoff)
00160 {
00161 const uint16_t *ss = (const uint16_t *) (const void *) s;
00162
00163 if (s == NULL || prevoff == NULL)
00164 return PARSERUTILS_BADPARM;
00165
00166 if (off < 2)
00167 *prevoff = 0;
00168 else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
00169 *prevoff = off - 2;
00170 else
00171 *prevoff = (off < 4) ? 0 : off - 4;
00172
00173 return PARSERUTILS_OK;
00174 }
00175
00186 parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
00187 uint32_t off, uint32_t *nextoff)
00188 {
00189 const uint16_t *ss = (const uint16_t *) (const void *) s;
00190
00191 if (s == NULL || off >= len || nextoff == NULL)
00192 return PARSERUTILS_BADPARM;
00193
00194 if (len - off < 4)
00195 *nextoff = len;
00196 else if (ss[1] < 0xD800 || ss[1] > 0xDBFF)
00197 *nextoff = off + 2;
00198 else
00199 *nextoff = (len - off < 6) ? len : off + 4;
00200
00201 return PARSERUTILS_OK;
00202 }
00203
00214 parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
00215 uint32_t len, uint32_t off, uint32_t *nextoff)
00216 {
00217 const uint16_t *ss = (const uint16_t *) (const void *) s;
00218
00219 if (s == NULL || off >= len || nextoff == NULL)
00220 return PARSERUTILS_BADPARM;
00221
00222 while (1) {
00223 if (len - off < 4) {
00224 return PARSERUTILS_NEEDDATA;
00225 } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
00226 *nextoff = off + 2;
00227 break;
00228 } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
00229 if (len - off < 6)
00230 return PARSERUTILS_NEEDDATA;
00231
00232 if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
00233 *nextoff = off + 4;
00234 break;
00235 } else {
00236 ss++;
00237 off += 2;
00238 }
00239 }
00240 }
00241
00242 return PARSERUTILS_OK;
00243 }
00244