utf16.c

Go to the documentation of this file.
00001 /*
00002  * This file is part of LibParserUtils.
00003  * Licensed under the MIT License,
00004  *                http://www.opensource.org/licenses/mit-license.php
00005  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
00006  */
00007 
00012 #include <stdbool.h>
00013 #include <stdlib.h>
00014 #include <string.h>
00015 
00016 #include <parserutils/charset/utf16.h>
00017 
00027 parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, 
00028                 size_t len, uint32_t *ucs4, size_t *clen)
00029 {
00030         const uint16_t *ss = (const uint16_t *) (const void *) s;
00031 
00032         if (s == NULL || ucs4 == NULL || clen == NULL)
00033                 return PARSERUTILS_BADPARM;
00034 
00035         if (len < 2)
00036                 return PARSERUTILS_NEEDDATA;
00037 
00038         if (*ss < 0xD800 || *ss > 0xDFFF) {
00039                 *ucs4 = *ss;
00040                 *clen = 2;
00041         } else if (0xD800 <= *ss && *ss <= 0xDBFF) {
00042                 /* High-surrogate code unit.  */
00043                 if (len < 4)
00044                         return PARSERUTILS_NEEDDATA;
00045 
00046                 if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) {
00047                         /* We have a valid surrogate pair.  */
00048                         *ucs4 = (((ss[0] & 0x3FF) << 10) | (ss[1] & 0x3FF))
00049                                 + (1<<16);
00050                         *clen = 4;
00051                 } else {
00052                         return PARSERUTILS_INVALID;
00053                 }
00054         } else {
00055                 /* Low-surrogate code unit.  */
00056                 return PARSERUTILS_INVALID;
00057         }
00058 
00059         return PARSERUTILS_OK;
00060 }
00061 
00070 parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
00071                 size_t *len)
00072 {
00073         uint16_t *ss = (uint16_t *) (void *) s;
00074         uint32_t l = 0;
00075 
00076         if (s == NULL || len == NULL)
00077                 return PARSERUTILS_BADPARM;
00078         else if (ucs4 < 0x10000) {
00079                 *ss = (uint16_t) ucs4;
00080                 l = 2;
00081         } else if (ucs4 < 0x110000) {
00082                 ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10);
00083                 ss[1] = 0xDC00 | (ucs4 & 0x3ff);
00084                 l = 4;
00085         } else {
00086                 return PARSERUTILS_INVALID;
00087         }
00088 
00089         *len = l;
00090 
00091         return PARSERUTILS_OK;
00092 }
00093 
00102 parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
00103                 size_t *len)
00104 {
00105         const uint16_t *ss = (const uint16_t *) (const void *) s;
00106         const uint16_t *end = (const uint16_t *) (const void *) (s + max);
00107         int l = 0;
00108 
00109         if (s == NULL || len == NULL)
00110                 return PARSERUTILS_BADPARM;
00111 
00112         while (ss < end) {
00113                 if (*ss < 0xD800 || 0xDFFF < *ss)
00114                         ss++;
00115                 else
00116                         ss += 2;
00117 
00118                 l++;
00119         }
00120 
00121         *len = l;
00122 
00123         return PARSERUTILS_OK;
00124 }
00125 
00133 parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
00134                 size_t *len)
00135 {
00136         const uint16_t *ss = (const uint16_t *) (const void *) s;
00137 
00138         if (s == NULL || len == NULL)
00139                 return PARSERUTILS_BADPARM;
00140 
00141         if (*ss < 0xD800 || 0xDFFF < *ss)
00142                 *len = 2;
00143         else
00144                 *len = 4;
00145 
00146         return PARSERUTILS_OK;
00147 }
00148 
00158 parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
00159                 uint32_t *prevoff)
00160 {
00161         const uint16_t *ss = (const uint16_t *) (const void *) s;
00162 
00163         if (s == NULL || prevoff == NULL)
00164                 return PARSERUTILS_BADPARM;
00165 
00166         if (off < 2)
00167                 *prevoff = 0;
00168         else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
00169                 *prevoff = off - 2;
00170         else
00171                 *prevoff = (off < 4) ? 0 : off - 4;
00172 
00173         return PARSERUTILS_OK;
00174 }
00175 
00186 parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
00187                 uint32_t off, uint32_t *nextoff)
00188 {
00189         const uint16_t *ss = (const uint16_t *) (const void *) s;
00190 
00191         if (s == NULL || off >= len || nextoff == NULL)
00192                 return PARSERUTILS_BADPARM;
00193 
00194         if (len - off < 4)
00195                 *nextoff = len;
00196         else if (ss[1] < 0xD800 || ss[1] > 0xDBFF)
00197                 *nextoff = off + 2;
00198         else
00199                 *nextoff = (len - off < 6) ? len : off + 4;
00200 
00201         return PARSERUTILS_OK;
00202 }
00203 
00214 parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
00215                 uint32_t len, uint32_t off, uint32_t *nextoff)
00216 {
00217         const uint16_t *ss = (const uint16_t *) (const void *) s;
00218 
00219         if (s == NULL || off >= len || nextoff == NULL)
00220                 return PARSERUTILS_BADPARM;
00221 
00222         while (1) {
00223                 if (len - off < 4) {
00224                         return PARSERUTILS_NEEDDATA;
00225                 } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
00226                         *nextoff = off + 2;
00227                         break;
00228                 } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
00229                         if (len - off < 6)
00230                                 return PARSERUTILS_NEEDDATA;
00231 
00232                         if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
00233                                 *nextoff = off + 4;
00234                                 break;
00235                         } else {
00236                                 ss++;
00237                                 off += 2;
00238                         }
00239                 }
00240         }
00241 
00242         return PARSERUTILS_OK;
00243 }
00244 

Generated on Wed Jul 29 11:59:21 2015 for Libparserutils by  doxygen 1.5.6