codec_utf8.c

Go to the documentation of this file.
00001 /*
00002  * This file is part of LibParserUtils.
00003  * Licensed under the MIT License,
00004  *                http://www.opensource.org/licenses/mit-license.php
00005  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
00006  */
00007 
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011 
00012 #include <parserutils/charset/mibenum.h>
00013 
00014 #include "charset/codecs/codec_impl.h"
00015 #include "charset/encodings/utf8impl.h"
00016 #include "utils/endian.h"
00017 #include "utils/utils.h"
00018 
00022 typedef struct charset_utf8_codec {
00023         parserutils_charset_codec base; 
00025 #define INVAL_BUFSIZE (32)
00026         uint8_t inval_buf[INVAL_BUFSIZE];       
00029         size_t inval_len;               /*< Byte length of inval_buf **/
00030 
00031 #define READ_BUFSIZE (8)
00032         uint32_t read_buf[READ_BUFSIZE];        
00035         size_t read_len;                
00037 #define WRITE_BUFSIZE (8)
00038         uint32_t write_buf[WRITE_BUFSIZE];      
00041         size_t write_len;               
00043 } charset_utf8_codec;
00044 
00045 static bool charset_utf8_codec_handles_charset(const char *charset);
00046 static parserutils_error charset_utf8_codec_create(const char *charset,
00047                 parserutils_charset_codec **codec);
00048 static parserutils_error charset_utf8_codec_destroy(
00049                 parserutils_charset_codec *codec);
00050 static parserutils_error charset_utf8_codec_encode(
00051                 parserutils_charset_codec *codec,
00052                 const uint8_t **source, size_t *sourcelen,
00053                 uint8_t **dest, size_t *destlen);
00054 static parserutils_error charset_utf8_codec_decode(
00055                 parserutils_charset_codec *codec,
00056                 const uint8_t **source, size_t *sourcelen,
00057                 uint8_t **dest, size_t *destlen);
00058 static parserutils_error charset_utf8_codec_reset(
00059                 parserutils_charset_codec *codec);
00060 static inline parserutils_error charset_utf8_codec_read_char(
00061                 charset_utf8_codec *c,
00062                 const uint8_t **source, size_t *sourcelen,
00063                 uint8_t **dest, size_t *destlen);
00064 static inline parserutils_error charset_utf8_codec_output_decoded_char(
00065                 charset_utf8_codec *c,
00066                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00067 
/**
 * Determine whether this codec handles a specific charset.
 *
 * The name is resolved to a MIB enum value and compared with the value
 * obtained for "UTF-8", so any name that resolves to the same value is
 * accepted.
 *
 * \param charset  NUL-terminated charset name to test (must not be NULL;
 *                 it is passed to strlen())
 * \return true if the name resolves to the same MIB enum as "UTF-8",
 *         false otherwise
 */
bool charset_utf8_codec_handles_charset(const char *charset)
{
        if (parserutils_charset_mibenum_from_name(charset,
                        strlen(charset)) !=
                        parserutils_charset_mibenum_from_name("UTF-8",
                        SLEN("UTF-8"))) {
                return false;
        }

        return true;
}
00081 
00091 parserutils_error charset_utf8_codec_create(const char *charset,
00092                 parserutils_charset_codec **codec)
00093 {
00094         charset_utf8_codec *c;
00095 
00096         UNUSED(charset);
00097 
00098         c = malloc(sizeof(charset_utf8_codec));
00099         if (c == NULL)
00100                 return PARSERUTILS_NOMEM;
00101 
00102         c->inval_buf[0] = '\0';
00103         c->inval_len = 0;
00104 
00105         c->read_buf[0] = 0;
00106         c->read_len = 0;
00107 
00108         c->write_buf[0] = 0;
00109         c->write_len = 0;
00110 
00111         /* Finally, populate vtable */
00112         c->base.handler.destroy = charset_utf8_codec_destroy;
00113         c->base.handler.encode = charset_utf8_codec_encode;
00114         c->base.handler.decode = charset_utf8_codec_decode;
00115         c->base.handler.reset = charset_utf8_codec_reset;
00116 
00117         *codec = (parserutils_charset_codec *) c;
00118 
00119         return PARSERUTILS_OK;
00120 }
00121 
00128 parserutils_error charset_utf8_codec_destroy (parserutils_charset_codec *codec)
00129 {
00130         UNUSED(codec);
00131 
00132         return PARSERUTILS_OK;
00133 }
00134 
00162 parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
00163                 const uint8_t **source, size_t *sourcelen,
00164                 uint8_t **dest, size_t *destlen)
00165 {
00166         charset_utf8_codec *c = (charset_utf8_codec *) codec;
00167         uint32_t ucs4;
00168         uint32_t *towrite;
00169         size_t towritelen;
00170         parserutils_error error;
00171 
00172         /* Process any outstanding characters from the previous call */
00173         if (c->write_len > 0) {
00174                 uint32_t *pwrite = c->write_buf;
00175 
00176                 while (c->write_len > 0) {
00177                         UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
00178                         if (error != PARSERUTILS_OK) {
00179                                 uint32_t len;
00180                                 assert(error == PARSERUTILS_NOMEM);
00181 
00182                                 /* Insufficient output buffer space */
00183                                 for (len = 0; len < c->write_len; len++) {
00184                                         c->write_buf[len] = pwrite[len];
00185                                 }
00186 
00187                                 return PARSERUTILS_NOMEM;
00188                         }
00189 
00190                         pwrite++;
00191                         c->write_len--;
00192                 }
00193         }
00194 
00195         /* Now process the characters for this call */
00196         while (*sourcelen > 0) {
00197                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00198                 towrite = &ucs4;
00199                 towritelen = 1;
00200 
00201                 /* Output current characters */
00202                 while (towritelen > 0) {
00203                         UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
00204                         if (error != PARSERUTILS_OK) {
00205                                 uint32_t len;
00206                                 assert(error == PARSERUTILS_NOMEM);
00207 
00208                                 /* Insufficient output space */
00209                                 assert(towritelen < WRITE_BUFSIZE);
00210 
00211                                 c->write_len = towritelen;
00212 
00213                                 /* Copy pending chars to save area, for
00214                                  * processing next call. */
00215                                 for (len = 0; len < towritelen; len++)
00216                                         c->write_buf[len] = towrite[len];
00217 
00218                                 /* Claim character we've just buffered,
00219                                  * so it's not reprocessed */
00220                                 *source += 4;
00221                                 *sourcelen -= 4;
00222 
00223                                 return PARSERUTILS_NOMEM;
00224                         }
00225 
00226                         towrite++;
00227                         towritelen--;
00228                 }
00229 
00230                 *source += 4;
00231                 *sourcelen -= 4;
00232         }
00233 
00234         return PARSERUTILS_OK;
00235 }
00236 
00278 parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
00279                 const uint8_t **source, size_t *sourcelen,
00280                 uint8_t **dest, size_t *destlen)
00281 {
00282         charset_utf8_codec *c = (charset_utf8_codec *) codec;
00283         parserutils_error error;
00284 
00285         if (c->read_len > 0) {
00286                 /* Output left over from last decode */
00287                 uint32_t *pread = c->read_buf;
00288 
00289                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00290                         *((uint32_t *) (void *) *dest) =
00291                                         endian_host_to_big(pread[0]);
00292 
00293                         *dest += 4;
00294                         *destlen -= 4;
00295 
00296                         pread++;
00297                         c->read_len--;
00298                 }
00299 
00300                 if (*destlen < c->read_len * 4) {
00301                         /* Ran out of output buffer */
00302                         size_t i;
00303 
00304                         /* Shuffle remaining output down */
00305                         for (i = 0; i < c->read_len; i++)
00306                                 c->read_buf[i] = pread[i];
00307 
00308                         return PARSERUTILS_NOMEM;
00309                 }
00310         }
00311 
00312         if (c->inval_len > 0) {
00313                 /* The last decode ended in an incomplete sequence.
00314                  * Fill up inval_buf with data from the start of the
00315                  * new chunk and process it. */
00316                 uint8_t *in = c->inval_buf;
00317                 size_t ol = c->inval_len;
00318                 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
00319                 size_t orig_l = l;
00320 
00321                 memcpy(c->inval_buf + ol, *source, l);
00322 
00323                 l += c->inval_len;
00324 
00325                 error = charset_utf8_codec_read_char(c,
00326                                 (const uint8_t **) &in, &l, dest, destlen);
00327                 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
00328                         return error;
00329                 }
00330 
00331                 /* And now, fix up source pointers */
00332                 *source += max((signed) (orig_l - l), 0);
00333                 *sourcelen -= max((signed) (orig_l - l), 0);
00334 
00335                 /* Failed to resolve an incomplete character and
00336                  * ran out of buffer space. No recovery strategy
00337                  * possible, so explode everywhere. */
00338                 assert((orig_l + ol) - l != 0);
00339 
00340                 /* Report memory exhaustion case from above */
00341                 if (error != PARSERUTILS_OK)
00342                         return error;
00343         }
00344 
00345         /* Finally, the "normal" case; process all outstanding characters */
00346         while (*sourcelen > 0) {
00347                 error = charset_utf8_codec_read_char(c,
00348                                 source, sourcelen, dest, destlen);
00349                 if (error != PARSERUTILS_OK) {
00350                         return error;
00351                 }
00352         }
00353 
00354         return PARSERUTILS_OK;
00355 }
00356 
00363 parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
00364 {
00365         charset_utf8_codec *c = (charset_utf8_codec *) codec;
00366 
00367         c->inval_buf[0] = '\0';
00368         c->inval_len = 0;
00369 
00370         c->read_buf[0] = 0;
00371         c->read_len = 0;
00372 
00373         c->write_buf[0] = 0;
00374         c->write_len = 0;
00375 
00376         return PARSERUTILS_OK;
00377 }
00378 
00379 
00408 parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
00409                 const uint8_t **source, size_t *sourcelen,
00410                 uint8_t **dest, size_t *destlen)
00411 {
00412         uint32_t ucs4;
00413         size_t sucs4;
00414         parserutils_error error;
00415 
00416         /* Convert a single character */
00417         {
00418                 const uint8_t *src = *source;
00419                 size_t srclen = *sourcelen;
00420                 uint32_t *uptr = &ucs4;
00421                 size_t *usptr = &sucs4;
00422                 UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
00423         }
00424         if (error == PARSERUTILS_OK) {
00425                 /* Read a character */
00426                 error = charset_utf8_codec_output_decoded_char(c,
00427                                 ucs4, dest, destlen);
00428                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00429                         /* output succeeded; update source pointers */
00430                         *source += sucs4;
00431                         *sourcelen -= sucs4;
00432                 }
00433 
00434                 /* Clear inval buffer */
00435                 c->inval_buf[0] = '\0';
00436                 c->inval_len = 0;
00437 
00438                 return error;
00439         } else if (error == PARSERUTILS_NEEDDATA) {
00440                 /* Incomplete input sequence */
00441                 assert(*sourcelen < INVAL_BUFSIZE);
00442 
00443                 memmove(c->inval_buf, *source, *sourcelen);
00444                 c->inval_buf[*sourcelen] = '\0';
00445                 c->inval_len = *sourcelen;
00446 
00447                 *source += *sourcelen;
00448                 *sourcelen = 0;
00449 
00450                 return PARSERUTILS_OK;
00451         } else if (error == PARSERUTILS_INVALID) {
00452                 /* Illegal input sequence */
00453                 uint32_t nextchar;
00454 
00455                 /* Strict errormode; simply flag invalid character */
00456                 if (c->base.errormode ==
00457                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00458                         /* Clear inval buffer */
00459                         c->inval_buf[0] = '\0';
00460                         c->inval_len = 0;
00461 
00462                         return PARSERUTILS_INVALID;
00463                 }
00464 
00465                 /* Find next valid UTF-8 sequence.
00466                  * We're processing client-provided data, so let's
00467                  * be paranoid about its validity. */
00468                 {
00469                         const uint8_t *src = *source;
00470                         size_t srclen = *sourcelen;
00471                         uint32_t off = 0;
00472                         uint32_t *ncptr = &nextchar;
00473 
00474                         UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
00475                 }
00476                 if (error != PARSERUTILS_OK) {
00477                         if (error == PARSERUTILS_NEEDDATA) {
00478                                 /* Need more data to be sure */
00479                                 assert(*sourcelen < INVAL_BUFSIZE);
00480 
00481                                 memmove(c->inval_buf, *source, *sourcelen);
00482                                 c->inval_buf[*sourcelen] = '\0';
00483                                 c->inval_len = *sourcelen;
00484 
00485                                 *source += *sourcelen;
00486                                 *sourcelen = 0;
00487 
00488                                 nextchar = 0;
00489                         } else {
00490                                 return error;
00491                         }
00492                 }
00493 
00494                 /* Clear inval buffer */
00495                 c->inval_buf[0] = '\0';
00496                 c->inval_len = 0;
00497 
00498                 /* output U+FFFD and continue processing. */
00499                 error = charset_utf8_codec_output_decoded_char(c,
00500                                 0xFFFD, dest, destlen);
00501                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00502                         /* output succeeded; update source pointers */
00503                         *source += nextchar;
00504                         *sourcelen -= nextchar;
00505                 }
00506 
00507                 return error;
00508         }
00509 
00510         return PARSERUTILS_OK;
00511 }
00512 
00523 parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
00524                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00525 {
00526         if (*destlen < 4) {
00527                 /* Run out of output buffer */
00528                 c->read_len = 1;
00529                 c->read_buf[0] = ucs4;
00530 
00531                 return PARSERUTILS_NOMEM;
00532         }
00533 
00534         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00535         *dest += 4;
00536         *destlen -= 4;
00537 
00538         return PARSERUTILS_OK;
00539 }
00540 
00541 
00542 const parserutils_charset_handler charset_utf8_codec_handler = {
00543         charset_utf8_codec_handles_charset,
00544         charset_utf8_codec_create
00545 };
00546 

Generated on Wed Jul 29 11:59:20 2015 for Libparserutils by  doxygen 1.5.6