utf8impl.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008 #ifndef parserutils_charset_encodings_utf8impl_h_
00009 #define parserutils_charset_encodings_utf8impl_h_
00010
00015 #include <stdbool.h>
00016 #include <stdlib.h>
00017 #include <string.h>
00018
/*
 * Lookup table with one entry for each of the 256 possible byte values; it is
 * only declared here and must be defined in another translation unit.
 * The macros in this header index it with the first byte of a sequence and
 * add 1 to obtain that sequence's length in bytes, so each entry is
 * presumably the number of continuation bytes expected after that lead
 * byte -- confirm against the table's definition.
 */
00020 extern const uint8_t numContinuations[256];
00021
/**
 * Decode the UTF-8 sequence at the start of a buffer into one code point.
 *
 * Sequences of 1 to 6 bytes are recognised from the lead byte. Overlong
 * encodings, surrogates (0xD800-0xDFFF), 0xFFFE and 0xFFFF are rejected.
 *
 * The macro's locals carry a trailing underscore so that identifiers used in
 * the arguments (for example a caller's own `c` or `n`) are not captured.
 * Arguments are expanded more than once, so they must be free of side
 * effects.
 *
 * \param s      Pointer to the bytes to decode
 * \param len    Number of bytes available at \a s
 * \param ucs4   Pointer to the location receiving the decoded code point
 * \param clen   Pointer to the location receiving the sequence length in bytes
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK       on success (\a ucs4 and \a clen written),
 *               PARSERUTILS_BADPARM  if \a s, \a ucs4 or \a clen is NULL,
 *               PARSERUTILS_NEEDDATA if \a len is 0 or shorter than the
 *                                    length announced by the lead byte,
 *               PARSERUTILS_INVALID  for a bad lead byte, a bad continuation
 *                                    byte or a rejected code point.
 *               \a ucs4 and \a clen are left untouched on any error.
 */
#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
do { \
	uint32_t cp_, floor_; \
	uint8_t nbytes_; \
	uint8_t idx_; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (ucs4) == NULL || (clen) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	if ((len) == 0) { \
		(error) = PARSERUTILS_NEEDDATA; \
		break; \
	} \
	\
	cp_ = (uint8_t) (s)[0]; \
	\
	/* Classify the lead byte: sequence length, payload bits and the */ \
	/* smallest code point that legitimately needs that length.      */ \
	if (cp_ < 0x80) { \
		nbytes_ = 1; \
		floor_ = 0; \
	} else if ((cp_ & 0xE0) == 0xC0) { \
		cp_ &= 0x1F; \
		nbytes_ = 2; \
		floor_ = 0x80; \
	} else if ((cp_ & 0xF0) == 0xE0) { \
		cp_ &= 0x0F; \
		nbytes_ = 3; \
		floor_ = 0x800; \
	} else if ((cp_ & 0xF8) == 0xF0) { \
		cp_ &= 0x07; \
		nbytes_ = 4; \
		floor_ = 0x10000; \
	} else if ((cp_ & 0xFC) == 0xF8) { \
		cp_ &= 0x03; \
		nbytes_ = 5; \
		floor_ = 0x200000; \
	} else if ((cp_ & 0xFE) == 0xFC) { \
		cp_ &= 0x01; \
		nbytes_ = 6; \
		floor_ = 0x4000000; \
	} else { \
		(error) = PARSERUTILS_INVALID; \
		break; \
	} \
	\
	if ((len) < nbytes_) { \
		(error) = PARSERUTILS_NEEDDATA; \
		break; \
	} \
	\
	for (idx_ = 1; idx_ < nbytes_; idx_++) { \
		uint32_t trail_ = (uint8_t) (s)[idx_]; \
		\
		if ((trail_ & 0xC0) != 0x80) { \
			/* This break only leaves the for loop; the */ \
			/* error test below skips the output stage. */ \
			(error) = PARSERUTILS_INVALID; \
			break; \
		} \
		\
		cp_ = (cp_ << 6) | (trail_ & 0x3F); \
	} \
	\
	if ((error) == PARSERUTILS_OK) { \
		/* Below floor_ means the sequence was longer than needed */ \
		if (cp_ < floor_ || (cp_ >= 0xD800 && cp_ <= 0xDFFF) || \
				cp_ == 0xFFFE || cp_ == 0xFFFF) { \
			(error) = PARSERUTILS_INVALID; \
			break; \
		} \
		\
		*(ucs4) = cp_; \
		*(clen) = nbytes_; \
	} \
} while(0)
00111
/**
 * Encode one code point as UTF-8 and advance the output cursor.
 *
 * Values up to 0x7FFFFFFF are encoded, using 1 to 6 bytes.
 *
 * The code point is copied into a local before being shifted, so the
 * caller's \a ucs4 expression is never modified and may be a constant.
 * A negative signed value converts to a value above 0x7FFFFFFF and is
 * therefore reported as PARSERUTILS_INVALID.
 * The remaining arguments are expanded more than once and must be free of
 * side effects.
 *
 * \param ucs4   Code point to encode (read once, never written)
 * \param s      Pointer to the output cursor; *s is advanced past the bytes
 *               written
 * \param len    Pointer to the number of bytes free at *s; decremented by the
 *               number of bytes written
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if \a s, *s or \a len is NULL,
 *               PARSERUTILS_INVALID if the value exceeds 0x7FFFFFFF,
 *               PARSERUTILS_NOMEM   if *len is too small for the encoding.
 *               Nothing is written and *s / *len are unchanged on error.
 */
#define UTF8_FROM_UCS4(ucs4, s, len, error) \
do { \
	uint32_t cp_ = (ucs4); \
	uint8_t *out_; \
	uint8_t nbytes_ = 0; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || *(s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	if (cp_ < 0x80) { \
		nbytes_ = 1; \
	} else if (cp_ < 0x800) { \
		nbytes_ = 2; \
	} else if (cp_ < 0x10000) { \
		nbytes_ = 3; \
	} else if (cp_ < 0x200000) { \
		nbytes_ = 4; \
	} else if (cp_ < 0x4000000) { \
		nbytes_ = 5; \
	} else if (cp_ <= 0x7FFFFFFF) { \
		nbytes_ = 6; \
	} else { \
		(error) = PARSERUTILS_INVALID; \
		break; \
	} \
	\
	if (nbytes_ > *(len)) { \
		(error) = PARSERUTILS_NOMEM; \
		break; \
	} \
	\
	out_ = *(s); \
	\
	if (nbytes_ == 1) { \
		out_[0] = (uint8_t) cp_; \
	} else { \
		uint8_t idx_; \
		/* Fill continuation bytes last-to-first, 6 bits each */ \
		for (idx_ = nbytes_; idx_ > 1; idx_--) { \
			out_[idx_ - 1] = (uint8_t) (0x80 | (cp_ & 0x3F)); \
			cp_ >>= 6; \
		} \
		/* Lead byte: nbytes_ high one-bits, then leftover bits */ \
		out_[0] = (uint8_t) (~((1u << (8 - nbytes_)) - 1) | cp_); \
	} \
	\
	*(s) += nbytes_; \
	*(len) -= nbytes_; \
} while(0)
00173
/**
 * Count the characters in a UTF-8 buffer.
 *
 * The buffer is walked from lead byte to lead byte, stepping by the length
 * each lead byte announces. Continuation bytes are not validated, and a final
 * sequence whose announced length runs past \a max is still counted.
 *
 * The walk uses a local index, so \a s is not advanced, no pointer beyond the
 * buffer is formed, and \a s is only used after the NULL check.
 * Arguments are expanded more than once and must be free of side effects.
 *
 * \param s      Pointer to the bytes to scan
 * \param max    Number of bytes to scan
 * \param len    Pointer to the location receiving the character count
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success (\a len written),
 *               PARSERUTILS_BADPARM if \a s or \a len is NULL,
 *               PARSERUTILS_INVALID if a byte in lead position is not a
 *                                   valid lead byte (\a len untouched).
 */
#define UTF8_LENGTH(s, max, len, error) \
do { \
	size_t pos_ = 0; \
	size_t limit_; \
	size_t count_ = 0; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	limit_ = (max); \
	\
	while (pos_ < limit_) { \
		uint32_t lead_ = (uint8_t) (s)[pos_]; \
		\
		if ((lead_ & 0x80) == 0x00) { \
			pos_ += 1; \
		} else if ((lead_ & 0xE0) == 0xC0) { \
			pos_ += 2; \
		} else if ((lead_ & 0xF0) == 0xE0) { \
			pos_ += 3; \
		} else if ((lead_ & 0xF8) == 0xF0) { \
			pos_ += 4; \
		} else if ((lead_ & 0xFC) == 0xF8) { \
			pos_ += 5; \
		} else if ((lead_ & 0xFE) == 0xFC) { \
			pos_ += 6; \
		} else { \
			/* Leaves the while loop only; count_ is discarded */ \
			(error) = PARSERUTILS_INVALID; \
			break; \
		} \
		\
		count_++; \
	} \
	\
	if ((error) == PARSERUTILS_OK) { \
		*(len) = count_; \
	} \
} while(0)
00220
/**
 * Report the byte length of the UTF-8 sequence starting at a lead byte.
 *
 * The length is numContinuations[first byte] + 1; only the first byte is
 * examined. The byte is converted to uint8_t before indexing so that a
 * pointer to plain (possibly signed) char cannot yield a negative index.
 * Arguments are expanded more than once and must be free of side effects.
 *
 * \param s      Pointer to the first byte of the sequence
 * \param len    Pointer to the location receiving the length in bytes
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if \a s or \a len is NULL.
 */
#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
do { \
	if ((s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	*(len) = numContinuations[(uint8_t) (s)[0]] + 1; \
	\
	(error) = PARSERUTILS_OK; \
} while(0)
00239
/**
 * Find the offset of the character that starts before a given offset.
 *
 * Steps backwards from \a off over continuation bytes (10xxxxxx) and stops at
 * the first byte that is not one, or at offset 0. An \a off of 0 yields 0.
 *
 * The walk uses a local cursor, so \a off itself is never modified and may be
 * a constant. Arguments are expanded more than once and must be free of side
 * effects.
 *
 * \param s        Pointer to the start of the buffer
 * \param off      Offset to step back from
 * \param prevoff  Pointer to the location receiving the previous offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK      on success,
 *                 PARSERUTILS_BADPARM if \a s or \a prevoff is NULL.
 */
#define UTF8_PREV(s, off, prevoff, error) \
do { \
	size_t pos_; \
	\
	if ((s) == NULL || (prevoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	pos_ = (off); \
	\
	/* Pre-decrement: the byte examined is the one before pos_ */ \
	while (pos_ != 0 && ((s)[--pos_] & 0xC0) == 0x80) \
		; \
	\
	*(prevoff) = pos_; \
	\
	(error) = PARSERUTILS_OK; \
} while(0)
00263
/**
 * Find the offset of the character that follows the one at a given offset.
 *
 * If the byte at \a off is ASCII or a lead byte it is stepped over; any
 * continuation bytes (10xxxxxx) that follow are then skipped, stopping at
 * \a len. Starting on a continuation byte therefore skips to the end of that
 * run. The sequence is not otherwise validated.
 *
 * The walk uses a local cursor, so \a off itself is never modified and may be
 * a constant. Arguments are expanded more than once and must be free of side
 * effects.
 *
 * \param s        Pointer to the start of the buffer
 * \param len      Length of the buffer in bytes
 * \param off      Offset of the current character; must be below \a len
 * \param nextoff  Pointer to the location receiving the next offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK      on success,
 *                 PARSERUTILS_BADPARM if \a s or \a nextoff is NULL, or
 *                                     \a off >= \a len.
 */
#define UTF8_NEXT(s, len, off, nextoff, error) \
do { \
	size_t pos_; \
	size_t limit_; \
	\
	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	pos_ = (off); \
	limit_ = (len); \
	\
	/* Step over an ASCII byte or the lead byte of a sequence */ \
	if ((uint8_t) (s)[pos_] < 0x80 || ((s)[pos_] & 0xC0) == 0xC0) { \
		pos_++; \
	} \
	\
	/* Then over whatever continuation bytes follow */ \
	while (pos_ < limit_ && ((s)[pos_] & 0xC0) == 0x80) { \
		pos_++; \
	} \
	\
	*(nextoff) = pos_; \
	\
	(error) = PARSERUTILS_OK; \
} while(0)
00292
/**
 * Find the offset of the next character, checking continuation bytes.
 *
 * If the byte at \a off is a continuation byte (10xxxxxx), the result is
 * \a off + 1. Otherwise numContinuations[] gives the number of continuation
 * bytes the lead byte announces; they are checked in turn and the result is
 * the offset just past the last valid one, i.e. the offset of the first
 * byte that is not a continuation byte if the sequence is malformed.
 *
 * The walk uses a local cursor, so \a off itself is never modified and may be
 * a constant. Arguments are expanded more than once and must be free of side
 * effects.
 *
 * \param s        Pointer to the start of the buffer
 * \param len      Length of the buffer in bytes
 * \param off      Offset of the current character; must be below \a len
 * \param nextoff  Pointer to the location receiving the next offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK       on success,
 *                 PARSERUTILS_BADPARM  if \a s or \a nextoff is NULL, or
 *                                      \a off >= \a len,
 *                 PARSERUTILS_NEEDDATA if the announced sequence does not end
 *                                      strictly before \a len (\a nextoff is
 *                                      left untouched).
 */
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
do { \
	uint8_t lead_; \
	size_t pos_; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	pos_ = (off); \
	lead_ = (uint8_t) (s)[pos_]; \
	\
	if (!(lead_ < 0x80 || (lead_ & 0xC0) == 0xC0)) { \
		/* On a continuation byte: move on by a single byte */ \
		pos_++; \
	} else { \
		uint32_t ncont_ = numContinuations[lead_]; \
		uint32_t skip_; \
		\
		/* NOTE(review): ">=" also refuses a sequence that exactly */ \
		/* fills the buffer (so the final character always gives   */ \
		/* NEEDDATA); looks like an off-by-one, but kept as-is in  */ \
		/* case callers rely on it -- confirm before changing.     */ \
		if (pos_ + ncont_ + 1 >= (len)) { \
			(error) = PARSERUTILS_NEEDDATA; \
			break; \
		} \
		\
		/* Stop at the first byte that is not a continuation byte */ \
		for (skip_ = 1; skip_ <= ncont_; skip_++) { \
			if (((s)[pos_ + skip_] & 0xC0) != 0x80) \
				break; \
		} \
		\
		/* skip_ counts the lead byte plus the valid continuations */ \
		pos_ += skip_; \
	} \
	\
	*(nextoff) = pos_; \
} while(0)
00340
00341 #endif