utf8impl.h

Go to the documentation of this file.
00001 /*
00002  * This file is part of LibParserUtils.
00003  * Licensed under the MIT License,
00004  *                http://www.opensource.org/licenses/mit-license.php
00005  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
00006  */
00007 
00008 #ifndef parserutils_charset_encodings_utf8impl_h_
00009 #define parserutils_charset_encodings_utf8impl_h_
00010 
00015 #include <stdbool.h>
00016 #include <stdlib.h>
00017 #include <string.h>
00018 
00020 extern const uint8_t numContinuations[256];
00021 
/**
 * Decode one UTF-8 sequence into a UCS-4 code point.
 *
 * \param s      Pointer to the first byte of the sequence
 * \param len    Number of bytes available at \a s
 * \param ucs4   Pointer to location to receive the decoded code point
 * \param clen   Pointer to location to receive the byte length of the
 *               sequence that was consumed
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK       on success,
 *               PARSERUTILS_BADPARM  if \a s, \a ucs4 or \a clen is NULL,
 *               PARSERUTILS_NEEDDATA if \a len is 0 or shorter than the
 *                                    sequence announced by the lead byte,
 *               PARSERUTILS_INVALID  for a bad lead or continuation byte,
 *                                    an overlong form, a surrogate
 *                                    (0xD800-0xDFFF), 0xFFFE or 0xFFFF.
 *
 * \a *ucs4 and \a *clen are written only on success. Lead bytes announcing
 * sequences of up to six bytes are accepted. Macro arguments may be
 * evaluated more than once, so they must be free of side effects.
 */
#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
do { \
        const uint8_t *utf8_src_; \
        uint32_t utf8_cp_, utf8_min_; \
        uint8_t utf8_n_; \
        uint8_t utf8_i_; \
 \
        (error) = PARSERUTILS_OK; \
 \
        if ((s) == NULL || (ucs4) == NULL || (clen) == NULL) { \
                (error) = PARSERUTILS_BADPARM; \
                break; \
        } \
 \
        if ((len) == 0) { \
                (error) = PARSERUTILS_NEEDDATA; \
                break; \
        } \
 \
        utf8_src_ = (const uint8_t *) (s); \
        utf8_cp_ = utf8_src_[0]; \
 \
        /* The lead byte fixes the sequence length, the payload bits it \
         * carries, and the smallest value that is not overlong. */ \
        if (utf8_cp_ < 0x80) { \
                utf8_n_ = 1; \
                utf8_min_ = 0; \
        } else if ((utf8_cp_ & 0xE0) == 0xC0) { \
                utf8_cp_ &= 0x1F; \
                utf8_n_ = 2; \
                utf8_min_ = 0x80; \
        } else if ((utf8_cp_ & 0xF0) == 0xE0) { \
                utf8_cp_ &= 0x0F; \
                utf8_n_ = 3; \
                utf8_min_ = 0x800; \
        } else if ((utf8_cp_ & 0xF8) == 0xF0) { \
                utf8_cp_ &= 0x07; \
                utf8_n_ = 4; \
                utf8_min_ = 0x10000; \
        } else if ((utf8_cp_ & 0xFC) == 0xF8) { \
                utf8_cp_ &= 0x03; \
                utf8_n_ = 5; \
                utf8_min_ = 0x200000; \
        } else if ((utf8_cp_ & 0xFE) == 0xFC) { \
                utf8_cp_ &= 0x01; \
                utf8_n_ = 6; \
                utf8_min_ = 0x4000000; \
        } else { \
                (error) = PARSERUTILS_INVALID; \
                break; \
        } \
 \
        if ((len) < utf8_n_) { \
                (error) = PARSERUTILS_NEEDDATA; \
                break; \
        } \
 \
        /* Fold in six payload bits from each continuation byte */ \
        for (utf8_i_ = 1; utf8_i_ < utf8_n_; utf8_i_++) { \
                uint32_t utf8_t_ = utf8_src_[utf8_i_]; \
 \
                if ((utf8_t_ & 0xC0) != 0x80) { \
                        (error) = PARSERUTILS_INVALID; \
                        break; /* leaves the for loop only */ \
                } \
 \
                utf8_cp_ = (utf8_cp_ << 6) | (utf8_t_ & 0x3F); \
        } \
 \
        if ((error) != PARSERUTILS_OK) \
                break; \
 \
        /* Detect overlong sequences, surrogates and fffe/ffff */ \
        if (utf8_cp_ < utf8_min_ || \
                        (utf8_cp_ >= 0xD800 && utf8_cp_ <= 0xDFFF) || \
                        utf8_cp_ == 0xFFFE || utf8_cp_ == 0xFFFF) { \
                (error) = PARSERUTILS_INVALID; \
                break; \
        } \
 \
        *(ucs4) = utf8_cp_; \
        *(clen) = utf8_n_; \
} while(0)
00111 
/**
 * Encode a UCS-4 code point as UTF-8 into a caller-supplied buffer.
 *
 * \param ucs4   The code point to encode (values up to 0x7FFFFFFF)
 * \param s      Pointer to the output cursor; \a *s is advanced past the
 *               bytes written
 * \param len    Pointer to the number of bytes free at \a *s; decremented
 *               by the number of bytes written
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if \a s, \a *s or \a len is NULL,
 *               PARSERUTILS_INVALID if \a ucs4 exceeds 0x7FFFFFFF,
 *               PARSERUTILS_NOMEM   if fewer than the required bytes are
 *                                   free in the buffer.
 *
 * Nothing is written and neither \a *s nor \a *len changes on failure.
 * \a ucs4 is not modified, so it may be a constant expression. Macro
 * arguments may be evaluated more than once.
 */
#define UTF8_FROM_UCS4(ucs4, s, len, error) \
do { \
        uint8_t *utf8_out_; \
        uint32_t utf8_cp_; \
        uint8_t utf8_l_ = 0; \
 \
        (error) = PARSERUTILS_OK; \
 \
        if ((s) == NULL || *(s) == NULL || (len) == NULL) { \
                (error) = PARSERUTILS_BADPARM; \
                break; \
        } \
 \
        /* Number of bytes needed for this code point */ \
        if ((ucs4) < 0x80) { \
                utf8_l_ = 1; \
        } else if ((ucs4) < 0x800) { \
                utf8_l_ = 2; \
        } else if ((ucs4) < 0x10000) { \
                utf8_l_ = 3; \
        } else if ((ucs4) < 0x200000) { \
                utf8_l_ = 4; \
        } else if ((ucs4) < 0x4000000) { \
                utf8_l_ = 5; \
        } else if ((ucs4) <= 0x7FFFFFFF) { \
                utf8_l_ = 6; \
        } else { \
                (error) = PARSERUTILS_INVALID; \
                break; \
        } \
 \
        if (utf8_l_ > *(len)) { \
                (error) = PARSERUTILS_NOMEM; \
                break; \
        } \
 \
        utf8_out_ = *(s); \
        /* Shift a private copy so the caller's value is left intact */ \
        utf8_cp_ = (uint32_t) (ucs4); \
 \
        if (utf8_l_ == 1) { \
                utf8_out_[0] = (uint8_t) utf8_cp_; \
        } else { \
                uint8_t utf8_i_; \
 \
                /* Continuation bytes are filled from the last one back, \
                 * six payload bits at a time */ \
                for (utf8_i_ = utf8_l_; utf8_i_ > 1; utf8_i_--) { \
                        utf8_out_[utf8_i_ - 1] = \
                                (uint8_t) (0x80 | (utf8_cp_ & 0x3F)); \
                        utf8_cp_ >>= 6; \
                } \
 \
                /* Lead byte: top utf8_l_ bits set, then the remaining \
                 * payload bits */ \
                utf8_out_[0] = (uint8_t) \
                        (~((1u << (8 - utf8_l_)) - 1) | utf8_cp_); \
        } \
 \
        *(s) += utf8_l_; \
        *(len) -= utf8_l_; \
} while(0)
00173 
/**
 * Count the characters in a UTF-8 byte range by inspecting lead bytes.
 *
 * \param s      Pointer to the first byte of the range
 * \param max    Number of bytes in the range
 * \param len    Pointer to location to receive the character count
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if \a s or \a len is NULL,
 *               PARSERUTILS_INVALID if a byte in lead position is not a
 *                                   valid lead byte.
 *
 * Only lead bytes are examined; continuation bytes are skipped without
 * being validated. A sequence cut short by the end of the range is still
 * counted as one character. \a *len is written only on success. \a s is
 * not modified, so it may be an array or any pointer expression. Macro
 * arguments may be evaluated more than once.
 */
#define UTF8_LENGTH(s, max, len, error) \
do { \
        const uint8_t *utf8_p_; \
        const uint8_t *utf8_end_; \
        size_t utf8_count_ = 0; \
 \
        (error) = PARSERUTILS_OK; \
 \
        if ((s) == NULL || (len) == NULL) { \
                (error) = PARSERUTILS_BADPARM; \
                break; \
        } \
 \
        /* Derive the bounds only after s is known to be non-NULL */ \
        utf8_p_ = (const uint8_t *) (s); \
        utf8_end_ = utf8_p_ + (max); \
 \
        while (utf8_p_ < utf8_end_) { \
                uint32_t utf8_c_ = utf8_p_[0]; \
                size_t utf8_step_; \
 \
                if ((utf8_c_ & 0x80) == 0x00) \
                        utf8_step_ = 1; \
                else if ((utf8_c_ & 0xE0) == 0xC0) \
                        utf8_step_ = 2; \
                else if ((utf8_c_ & 0xF0) == 0xE0) \
                        utf8_step_ = 3; \
                else if ((utf8_c_ & 0xF8) == 0xF0) \
                        utf8_step_ = 4; \
                else if ((utf8_c_ & 0xFC) == 0xF8) \
                        utf8_step_ = 5; \
                else if ((utf8_c_ & 0xFE) == 0xFC) \
                        utf8_step_ = 6; \
                else { \
                        (error) = PARSERUTILS_INVALID; \
                        break; /* leaves the while loop only */ \
                } \
 \
                /* Never step beyond the end of the range: a truncated \
                 * final sequence simply ends the scan */ \
                if (utf8_step_ > (size_t) (utf8_end_ - utf8_p_)) \
                        utf8_p_ = utf8_end_; \
                else \
                        utf8_p_ += utf8_step_; \
 \
                utf8_count_++; \
        } \
 \
        if ((error) == PARSERUTILS_OK) \
                *(len) = utf8_count_; \
} while(0)
00220 
/**
 * Report the byte length of the UTF-8 character starting at \a s.
 *
 * \param s      Pointer to the lead byte of the character
 * \param len    Pointer to location to receive the byte length
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if \a s or \a len is NULL.
 *
 * The length is the numContinuations[] entry for the lead byte plus one
 * for the lead byte itself; no validation of the sequence is performed.
 * Macro arguments may be evaluated more than once.
 */
#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
do { \
        if ((s) == NULL || (len) == NULL) { \
                (error) = PARSERUTILS_BADPARM; \
                break; \
        } \
 \
        /* Index with an unsigned byte value so that a plain (possibly \
         * signed) char lead byte cannot produce a negative index */ \
        *(len) = numContinuations[(uint8_t) (s)[0]] + 1 /* Start byte */; \
 \
        (error) = PARSERUTILS_OK; \
} while(0)
00239 
/**
 * Find the offset of the start of the character preceding \a off.
 *
 * \param s        Pointer to the string
 * \param off      Offset to search backwards from
 * \param prevoff  Pointer to location to receive the resulting offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK      on success,
 *                 PARSERUTILS_BADPARM if \a s or \a prevoff is NULL.
 *
 * Steps back at least one byte (unless \a off is 0) and keeps stepping
 * while the byte landed on is a continuation byte (10xxxxxx), stopping at
 * offset 0 at the latest. \a off is not modified, so it may be a constant
 * expression. Macro arguments may be evaluated more than once.
 */
#define UTF8_PREV(s, off, prevoff, error) \
do { \
        const uint8_t *utf8_s_; \
        size_t utf8_pos_; \
 \
        if ((s) == NULL || (prevoff) == NULL) { \
                (error) = PARSERUTILS_BADPARM; \
                break; \
        } \
 \
        utf8_s_ = (const uint8_t *) (s); \
        /* Walk a private cursor so the caller's offset is untouched */ \
        utf8_pos_ = (size_t) (off); \
 \
        while (utf8_pos_ != 0 && (utf8_s_[--utf8_pos_] & 0xC0) == 0x80) \
                /* do nothing */; \
 \
        *(prevoff) = utf8_pos_; \
 \
        (error) = PARSERUTILS_OK; \
} while(0)
00263 
/**
 * Find the offset of the start of the character following \a off.
 *
 * \param s        Pointer to the string
 * \param len      Length of the string, in bytes
 * \param off      Offset to search forwards from
 * \param nextoff  Pointer to location to receive the resulting offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK      on success,
 *                 PARSERUTILS_BADPARM if \a s or \a nextoff is NULL, or
 *                                     \a off is not less than \a len.
 *
 * If \a off addresses a start byte it is skipped, then any continuation
 * bytes (10xxxxxx) that follow are skipped too. The result may equal
 * \a len. Continuation bytes are not counted or otherwise validated.
 * \a off is not modified, so it may be a constant expression. Macro
 * arguments may be evaluated more than once.
 */
#define UTF8_NEXT(s, len, off, nextoff, error) \
do { \
        const uint8_t *utf8_s_; \
        size_t utf8_pos_; \
 \
        if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
                (error) = PARSERUTILS_BADPARM; \
                break; \
        } \
 \
        /* Inspect bytes as unsigned and walk a private cursor so the \
         * caller's offset is untouched */ \
        utf8_s_ = (const uint8_t *) (s); \
        utf8_pos_ = (size_t) (off); \
 \
        /* Skip current start byte (if present - may be mid-sequence) */ \
        if (utf8_s_[utf8_pos_] < 0x80 || \
                        (utf8_s_[utf8_pos_] & 0xC0) == 0xC0) \
                utf8_pos_++; \
 \
        while (utf8_pos_ < (size_t) (len) && \
                        (utf8_s_[utf8_pos_] & 0xC0) == 0x80) \
                utf8_pos_++; \
 \
        *(nextoff) = utf8_pos_; \
 \
        (error) = PARSERUTILS_OK; \
} while(0)
00292 
/**
 * Find the offset following the character at \a off, checking the
 * continuation bytes of that character rather than trusting them.
 *
 * \param s        Pointer to the string
 * \param len      Length of the string, in bytes
 * \param off      Offset to search forwards from
 * \param nextoff  Pointer to location to receive the resulting offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK       on success,
 *                 PARSERUTILS_BADPARM  if \a s or \a nextoff is NULL, or
 *                                      \a off is not less than \a len,
 *                 PARSERUTILS_NEEDDATA if the sequence announced by the
 *                                      start byte at \a off does not end
 *                                      strictly before \a len.
 *
 * If \a off addresses a continuation byte, the result is simply \a off + 1.
 * Otherwise the expected number of continuation bytes is read from
 * numContinuations[] and the result is the offset of the first byte that
 * is not a valid continuation, or the offset just past the sequence when
 * every continuation byte is valid. \a *nextoff is written only on
 * success. \a off is not modified, so it may be a constant expression.
 * Macro arguments may be evaluated more than once.
 */
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
do { \
        const uint8_t *utf8_s_; \
        size_t utf8_pos_; \
        uint8_t utf8_c_; \
 \
        (error) = PARSERUTILS_OK; \
 \
        if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
                (error) = PARSERUTILS_BADPARM; \
                break; \
        } \
 \
        /* Walk a private cursor so the caller's offset is untouched */ \
        utf8_s_ = (const uint8_t *) (s); \
        utf8_pos_ = (size_t) (off); \
 \
        utf8_c_ = utf8_s_[utf8_pos_]; \
 \
        /* If we're mid-sequence, simply advance to next byte */ \
        if (!(utf8_c_ < 0x80 || (utf8_c_ & 0xC0) == 0xC0)) { \
                utf8_pos_++; \
        } else { \
                uint32_t utf8_ncont_ = numContinuations[utf8_c_]; \
                uint32_t utf8_nskip_; \
 \
                /* NOTE(review): ">=" also reports NEEDDATA when the \
                 * sequence ends exactly at len; kept as found -- \
                 * confirm whether callers rely on this */ \
                if (utf8_pos_ + utf8_ncont_ + 1 >= (size_t) (len)) { \
                        (error) = PARSERUTILS_NEEDDATA; \
                        break; \
                } \
 \
                /* Verify continuation bytes, stopping at the first \
                 * byte that is not one */ \
                for (utf8_nskip_ = 1; utf8_nskip_ <= utf8_ncont_; \
                                utf8_nskip_++) { \
                        if ((utf8_s_[utf8_pos_ + utf8_nskip_] & 0xC0) \
                                        != 0x80) \
                                break; \
                } \
 \
                /* Skip over the valid bytes */ \
                utf8_pos_ += utf8_nskip_; \
        } \
 \
        *(nextoff) = utf8_pos_; \
} while(0)
00340 
00341 #endif

Generated on Wed Jul 29 11:59:21 2015 for Libparserutils by  doxygen 1.5.6