codec_utf16.c

Go to the documentation of this file.
00001 /*
00002  * This file is part of LibParserUtils.
00003  * Licensed under the MIT License,
00004  *                http://www.opensource.org/licenses/mit-license.php
00005  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
00006  */
00007 
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011 
00012 #include <parserutils/charset/mibenum.h>
00013 #include <parserutils/charset/utf16.h>
00014 
00015 #include "charset/codecs/codec_impl.h"
00016 #include "utils/endian.h"
00017 #include "utils/utils.h"
00018 
00022 typedef struct charset_utf16_codec {
00023         parserutils_charset_codec base; 
00025 #define INVAL_BUFSIZE (32)
00026         uint8_t inval_buf[INVAL_BUFSIZE];       
00029         size_t inval_len;               /*< Byte length of inval_buf **/
00030 
00031 #define READ_BUFSIZE (8)
00032         uint32_t read_buf[READ_BUFSIZE];        
00035         size_t read_len;                
00037 #define WRITE_BUFSIZE (8)
00038         uint32_t write_buf[WRITE_BUFSIZE];      
00041         size_t write_len;               
00043 } charset_utf16_codec;
00044 
00045 static bool charset_utf16_codec_handles_charset(const char *charset);
00046 static parserutils_error charset_utf16_codec_create(const char *charset,
00047                 parserutils_charset_codec **codec);
00048 static parserutils_error charset_utf16_codec_destroy(
00049                 parserutils_charset_codec *codec);
00050 static parserutils_error charset_utf16_codec_encode(
00051                 parserutils_charset_codec *codec,
00052                 const uint8_t **source, size_t *sourcelen,
00053                 uint8_t **dest, size_t *destlen);
00054 static parserutils_error charset_utf16_codec_decode(
00055                 parserutils_charset_codec *codec,
00056                 const uint8_t **source, size_t *sourcelen,
00057                 uint8_t **dest, size_t *destlen);
00058 static parserutils_error charset_utf16_codec_reset(
00059                 parserutils_charset_codec *codec);
00060 static inline parserutils_error charset_utf16_codec_read_char(
00061                 charset_utf16_codec *c,
00062                 const uint8_t **source, size_t *sourcelen,
00063                 uint8_t **dest, size_t *destlen);
00064 static inline parserutils_error charset_utf16_codec_output_decoded_char(
00065                 charset_utf16_codec *c,
00066                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00067 
/**
 * Determine whether this codec handles a specific charset
 *
 * The name is resolved to a MIB enum and compared against the MIB enum
 * that the name "UTF-16" resolves to.
 *
 * \param charset  Charset name to test; must be a NUL-terminated string,
 *                 as its length is measured with strlen()
 * \return true if both names resolve to the same MIB enum, false otherwise
 */
bool charset_utf16_codec_handles_charset(const char *charset)
{
        if (parserutils_charset_mibenum_from_name(charset, strlen(charset)) !=
                        parserutils_charset_mibenum_from_name(
                                        "UTF-16", SLEN("UTF-16")))
                return false;

        return true;
}
00080 
00090 parserutils_error charset_utf16_codec_create(const char *charset,
00091                 parserutils_charset_codec **codec)
00092 {
00093         charset_utf16_codec *c;
00094 
00095         UNUSED(charset);
00096 
00097         c = malloc(sizeof(charset_utf16_codec));
00098         if (c == NULL)
00099                 return PARSERUTILS_NOMEM;
00100 
00101         c->inval_buf[0] = '\0';
00102         c->inval_len = 0;
00103 
00104         c->read_buf[0] = 0;
00105         c->read_len = 0;
00106 
00107         c->write_buf[0] = 0;
00108         c->write_len = 0;
00109 
00110         /* Finally, populate vtable */
00111         c->base.handler.destroy = charset_utf16_codec_destroy;
00112         c->base.handler.encode = charset_utf16_codec_encode;
00113         c->base.handler.decode = charset_utf16_codec_decode;
00114         c->base.handler.reset = charset_utf16_codec_reset;
00115 
00116         *codec = (parserutils_charset_codec *) c;
00117 
00118         return PARSERUTILS_OK;
00119 }
00120 
00127 parserutils_error charset_utf16_codec_destroy (parserutils_charset_codec *codec)
00128 {
00129         UNUSED(codec);
00130 
00131         return PARSERUTILS_OK;
00132 }
00133 
00161 parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec,
00162                 const uint8_t **source, size_t *sourcelen,
00163                 uint8_t **dest, size_t *destlen)
00164 {
00165         charset_utf16_codec *c = (charset_utf16_codec *) codec;
00166         uint32_t ucs4;
00167         uint32_t *towrite;
00168         size_t towritelen;
00169         parserutils_error error;
00170 
00171         /* Process any outstanding characters from the previous call */
00172         if (c->write_len > 0) {
00173                 uint32_t *pwrite = c->write_buf;
00174                 uint8_t buf[4];
00175                 size_t len;
00176 
00177                 while (c->write_len > 0) {
00178                         error = parserutils_charset_utf16_from_ucs4(
00179                                         pwrite[0], buf, &len);
00180                         assert(error == PARSERUTILS_OK);
00181 
00182                         if (*destlen < len) {
00183                                 /* Insufficient output buffer space */
00184                                 for (len = 0; len < c->write_len; len++)
00185                                         c->write_buf[len] = pwrite[len];
00186 
00187                                 return PARSERUTILS_NOMEM;
00188                         }
00189 
00190                         memcpy(*dest, buf, len);
00191 
00192                         *dest += len;
00193                         *destlen -= len;
00194 
00195                         pwrite++;
00196                         c->write_len--;
00197                 }
00198         }
00199 
00200         /* Now process the characters for this call */
00201         while (*sourcelen > 0) {
00202                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00203                 towrite = &ucs4;
00204                 towritelen = 1;
00205 
00206                 /* Output current characters */
00207                 while (towritelen > 0) {
00208                         uint8_t buf[4];
00209                         size_t len;
00210 
00211                         error = parserutils_charset_utf16_from_ucs4(
00212                                         towrite[0], buf, &len);
00213                         assert(error == PARSERUTILS_OK);
00214 
00215                         if (*destlen < len) {
00216                                 /* Insufficient output space */
00217                                 assert(towritelen < WRITE_BUFSIZE);
00218 
00219                                 c->write_len = towritelen;
00220 
00221                                 /* Copy pending chars to save area, for
00222                                  * processing next call. */
00223                                 for (len = 0; len < towritelen; len++)
00224                                         c->write_buf[len] = towrite[len];
00225 
00226                                 /* Claim character we've just buffered,
00227                                  * so it's not reprocessed */
00228                                 *source += 4;
00229                                 *sourcelen -= 4;
00230 
00231                                 return PARSERUTILS_NOMEM;
00232                         }
00233 
00234                         memcpy(*dest, buf, len);
00235 
00236                         *dest += len;
00237                         *destlen -= len;
00238 
00239                         towrite++;
00240                         towritelen--;
00241                 }
00242 
00243                 *source += 4;
00244                 *sourcelen -= 4;
00245         }
00246 
00247         (void) error;
00248 
00249         return PARSERUTILS_OK;
00250 }
00251 
00293 parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec,
00294                 const uint8_t **source, size_t *sourcelen,
00295                 uint8_t **dest, size_t *destlen)
00296 {
00297         charset_utf16_codec *c = (charset_utf16_codec *) codec;
00298         parserutils_error error;
00299 
00300         if (c->read_len > 0) {
00301                 /* Output left over from last decode */
00302                 uint32_t *pread = c->read_buf;
00303 
00304                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00305                         *((uint32_t *) (void *) *dest) =
00306                                         endian_host_to_big(pread[0]);
00307 
00308                         *dest += 4;
00309                         *destlen -= 4;
00310 
00311                         pread++;
00312                         c->read_len--;
00313                 }
00314 
00315                 if (*destlen < c->read_len * 4) {
00316                         /* Ran out of output buffer */
00317                         size_t i;
00318 
00319                         /* Shuffle remaining output down */
00320                         for (i = 0; i < c->read_len; i++)
00321                                 c->read_buf[i] = pread[i];
00322 
00323                         return PARSERUTILS_NOMEM;
00324                 }
00325         }
00326 
00327         if (c->inval_len > 0) {
00328                 /* The last decode ended in an incomplete sequence.
00329                  * Fill up inval_buf with data from the start of the
00330                  * new chunk and process it. */
00331                 uint8_t *in = c->inval_buf;
00332                 size_t ol = c->inval_len;
00333                 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
00334                 size_t orig_l = l;
00335 
00336                 memcpy(c->inval_buf + ol, *source, l);
00337 
00338                 l += c->inval_len;
00339 
00340                 error = charset_utf16_codec_read_char(c,
00341                                 (const uint8_t **) &in, &l, dest, destlen);
00342                 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
00343                         return error;
00344                 }
00345 
00346                 /* And now, fix up source pointers */
00347                 *source += max((signed) (orig_l - l), 0);
00348                 *sourcelen -= max((signed) (orig_l - l), 0);
00349 
00350                 /* Failed to resolve an incomplete character and
00351                  * ran out of buffer space. No recovery strategy
00352                  * possible, so explode everywhere. */
00353                 assert((orig_l + ol) - l != 0);
00354 
00355                 /* Report memory exhaustion case from above */
00356                 if (error != PARSERUTILS_OK)
00357                         return error;
00358         }
00359 
00360         /* Finally, the "normal" case; process all outstanding characters */
00361         while (*sourcelen > 0) {
00362                 error = charset_utf16_codec_read_char(c,
00363                                 source, sourcelen, dest, destlen);
00364                 if (error != PARSERUTILS_OK) {
00365                         return error;
00366                 }
00367         }
00368 
00369         return PARSERUTILS_OK;
00370 }
00371 
00378 parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
00379 {
00380         charset_utf16_codec *c = (charset_utf16_codec *) codec;
00381 
00382         c->inval_buf[0] = '\0';
00383         c->inval_len = 0;
00384 
00385         c->read_buf[0] = 0;
00386         c->read_len = 0;
00387 
00388         c->write_buf[0] = 0;
00389         c->write_len = 0;
00390 
00391         return PARSERUTILS_OK;
00392 }
00393 
00394 
00423 parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c,
00424                 const uint8_t **source, size_t *sourcelen,
00425                 uint8_t **dest, size_t *destlen)
00426 {
00427         uint32_t ucs4;
00428         size_t sucs4;
00429         parserutils_error error;
00430 
00431         /* Convert a single character */
00432         error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
00433                         &ucs4, &sucs4);
00434         if (error == PARSERUTILS_OK) {
00435                 /* Read a character */
00436                 error = charset_utf16_codec_output_decoded_char(c,
00437                                 ucs4, dest, destlen);
00438                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00439                         /* output succeeded; update source pointers */
00440                         *source += sucs4;
00441                         *sourcelen -= sucs4;
00442                 }
00443 
00444                 /* Clear inval buffer */
00445                 c->inval_buf[0] = '\0';
00446                 c->inval_len = 0;
00447 
00448                 return error;
00449         } else if (error == PARSERUTILS_NEEDDATA) {
00450                 /* Incomplete input sequence */
00451                 assert(*sourcelen < INVAL_BUFSIZE);
00452 
00453                 memmove(c->inval_buf, *source, *sourcelen);
00454                 c->inval_buf[*sourcelen] = '\0';
00455                 c->inval_len = *sourcelen;
00456 
00457                 *source += *sourcelen;
00458                 *sourcelen = 0;
00459 
00460                 return PARSERUTILS_OK;
00461         } else if (error == PARSERUTILS_INVALID) {
00462                 /* Illegal input sequence */
00463                 uint32_t nextchar;
00464 
00465                 /* Clear inval buffer */
00466                 c->inval_buf[0] = '\0';
00467                 c->inval_len = 0;
00468 
00469                 /* Strict errormode; simply flag invalid character */
00470                 if (c->base.errormode ==
00471                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00472                         return PARSERUTILS_INVALID;
00473                 }
00474 
00475                 /* Find next valid UTF-16 sequence.
00476                  * We're processing client-provided data, so let's
00477                  * be paranoid about its validity. */
00478                 error = parserutils_charset_utf16_next_paranoid(
00479                                 *source, *sourcelen, 0, &nextchar);
00480                 if (error != PARSERUTILS_OK) {
00481                         if (error == PARSERUTILS_NEEDDATA) {
00482                                 /* Need more data to be sure */
00483                                 assert(*sourcelen < INVAL_BUFSIZE);
00484 
00485                                 memmove(c->inval_buf, *source, *sourcelen);
00486                                 c->inval_buf[*sourcelen] = '\0';
00487                                 c->inval_len = *sourcelen;
00488 
00489                                 *source += *sourcelen;
00490                                 *sourcelen = 0;
00491 
00492                                 nextchar = 0;
00493                         } else {
00494                                 return error;
00495                         }
00496                 }
00497 
00498                 /* output U+FFFD and continue processing. */
00499                 error = charset_utf16_codec_output_decoded_char(c,
00500                                 0xFFFD, dest, destlen);
00501                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00502                         /* output succeeded; update source pointers */
00503                         *source += nextchar;
00504                         *sourcelen -= nextchar;
00505                 }
00506 
00507                 return error;
00508         }
00509 
00510         return PARSERUTILS_OK;
00511 }
00512 
00523 parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c,
00524                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00525 {
00526         if (*destlen < 4) {
00527                 /* Run out of output buffer */
00528                 c->read_len = 1;
00529                 c->read_buf[0] = ucs4;
00530 
00531                 return PARSERUTILS_NOMEM;
00532         }
00533 
00534         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00535         *dest += 4;
00536         *destlen -= 4;
00537 
00538         return PARSERUTILS_OK;
00539 }
00540 
00541 
00542 const parserutils_charset_handler charset_utf16_codec_handler = {
00543         charset_utf16_codec_handles_charset,
00544         charset_utf16_codec_create
00545 };

Generated on Wed Jul 29 11:59:20 2015 for Libparserutils by  doxygen 1.5.6