codec_ascii.c

Go to the documentation of this file.
00001 /*
00002  * This file is part of LibParserUtils.
00003  * Licensed under the MIT License,
00004  *                http://www.opensource.org/licenses/mit-license.php
00005  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
00006  */
00007 
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011 
00012 #include <parserutils/charset/mibenum.h>
00013 
00014 #include "charset/codecs/codec_impl.h"
00015 #include "utils/endian.h"
00016 #include "utils/utils.h"
00017 
00021 typedef struct charset_ascii_codec {
00022         parserutils_charset_codec base; 
00024 #define READ_BUFSIZE (8)
00025         uint32_t read_buf[READ_BUFSIZE];        
00028         size_t read_len;                
00030 #define WRITE_BUFSIZE (8)
00031         uint32_t write_buf[WRITE_BUFSIZE];      
00034         size_t write_len;               
00036 } charset_ascii_codec;
00037 
00038 static bool charset_ascii_codec_handles_charset(const char *charset);
00039 static parserutils_error charset_ascii_codec_create(
00040                 const char *charset, parserutils_charset_codec **codec);
00041 static parserutils_error charset_ascii_codec_destroy(
00042                 parserutils_charset_codec *codec);
00043 static parserutils_error charset_ascii_codec_encode(
00044                 parserutils_charset_codec *codec,
00045                 const uint8_t **source, size_t *sourcelen,
00046                 uint8_t **dest, size_t *destlen);
00047 static parserutils_error charset_ascii_codec_decode(
00048                 parserutils_charset_codec *codec,
00049                 const uint8_t **source, size_t *sourcelen,
00050                 uint8_t **dest, size_t *destlen);
00051 static parserutils_error charset_ascii_codec_reset(
00052                 parserutils_charset_codec *codec);
00053 static inline parserutils_error charset_ascii_codec_read_char(
00054                 charset_ascii_codec *c,
00055                 const uint8_t **source, size_t *sourcelen,
00056                 uint8_t **dest, size_t *destlen);
00057 static inline parserutils_error charset_ascii_codec_output_decoded_char(
00058                 charset_ascii_codec *c,
00059                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00060 static inline parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c,
00061                 uint32_t ucs4, uint8_t **s, size_t *len);
00062 static inline parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c,
00063                 const uint8_t *s, size_t len, uint32_t *ucs4);
00064 
/**
 * Determine whether this codec handles a specific charset
 *
 * \param charset  NUL-terminated charset name to test
 * \return true if the name maps to the same MIB enum value as "US-ASCII",
 *         false otherwise (including when "US-ASCII" itself cannot be
 *         resolved)
 */
bool charset_ascii_codec_handles_charset(const char *charset)
{
        /* MIB enum value for US-ASCII; stays 0 until successfully resolved,
         * so the lookup is retried on each call until it succeeds */
        static uint16_t ascii;
        const uint16_t wanted = parserutils_charset_mibenum_from_name(
                        charset, strlen(charset));

        if (ascii == 0) {
                ascii = parserutils_charset_mibenum_from_name(
                                "US-ASCII", SLEN("US-ASCII"));
        }

        return (ascii != 0) && (wanted == ascii);
}
00087 
00097 parserutils_error charset_ascii_codec_create(const char *charset,
00098                 parserutils_charset_codec **codec)
00099 {
00100         charset_ascii_codec *c;
00101 
00102         UNUSED(charset);
00103 
00104         c = malloc(sizeof(charset_ascii_codec));
00105         if (c == NULL)
00106                 return PARSERUTILS_NOMEM;
00107 
00108         c->read_buf[0] = 0;
00109         c->read_len = 0;
00110 
00111         c->write_buf[0] = 0;
00112         c->write_len = 0;
00113 
00114         /* Finally, populate vtable */
00115         c->base.handler.destroy = charset_ascii_codec_destroy;
00116         c->base.handler.encode = charset_ascii_codec_encode;
00117         c->base.handler.decode = charset_ascii_codec_decode;
00118         c->base.handler.reset = charset_ascii_codec_reset;
00119 
00120         *codec = (parserutils_charset_codec *) c;
00121 
00122         return PARSERUTILS_OK;
00123 }
00124 
00131 parserutils_error charset_ascii_codec_destroy (parserutils_charset_codec *codec)
00132 {
00133         UNUSED(codec);
00134 
00135         return PARSERUTILS_OK;
00136 }
00137 
00165 parserutils_error charset_ascii_codec_encode(parserutils_charset_codec *codec,
00166                 const uint8_t **source, size_t *sourcelen,
00167                 uint8_t **dest, size_t *destlen)
00168 {
00169         charset_ascii_codec *c = (charset_ascii_codec *) codec;
00170         uint32_t ucs4;
00171         uint32_t *towrite;
00172         size_t towritelen;
00173         parserutils_error error;
00174 
00175         /* Process any outstanding characters from the previous call */
00176         if (c->write_len > 0) {
00177                 uint32_t *pwrite = c->write_buf;
00178 
00179                 while (c->write_len > 0) {
00180                         error = charset_ascii_from_ucs4(c, pwrite[0],
00181                                         dest, destlen);
00182                         if (error != PARSERUTILS_OK) {
00183                                 uint32_t len;
00184                                 assert(error == PARSERUTILS_NOMEM);
00185 
00186                                 for (len = 0; len < c->write_len; len++) {
00187                                         c->write_buf[len] = pwrite[len];
00188                                 }
00189 
00190                                 return error;
00191                         }
00192 
00193                         pwrite++;
00194                         c->write_len--;
00195                 }
00196         }
00197 
00198         /* Now process the characters for this call */
00199         while (*sourcelen > 0) {
00200                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00201                 towrite = &ucs4;
00202                 towritelen = 1;
00203 
00204                 /* Output current characters */
00205                 while (towritelen > 0) {
00206                         error = charset_ascii_from_ucs4(c, towrite[0], dest,
00207                                         destlen);
00208                         if (error != PARSERUTILS_OK) {
00209                                 uint32_t len;
00210                                 if (error != PARSERUTILS_NOMEM) {
00211                                         return error;
00212                                 }
00213 
00214                                 /* Insufficient output space */
00215                                 assert(towritelen < WRITE_BUFSIZE);
00216 
00217                                 c->write_len = towritelen;
00218 
00219                                 /* Copy pending chars to save area, for
00220                                  * processing next call. */
00221                                 for (len = 0; len < towritelen; len++)
00222                                         c->write_buf[len] = towrite[len];
00223 
00224                                 /* Claim character we've just buffered,
00225                                  * so it's not reprocessed */
00226                                 *source += 4;
00227                                 *sourcelen -= 4;
00228 
00229                                 return PARSERUTILS_NOMEM;
00230                         }
00231 
00232                         towrite++;
00233                         towritelen--;
00234                 }
00235 
00236                 *source += 4;
00237                 *sourcelen -= 4;
00238         }
00239 
00240         return PARSERUTILS_OK;
00241 }
00242 
00284 parserutils_error charset_ascii_codec_decode(parserutils_charset_codec *codec,
00285                 const uint8_t **source, size_t *sourcelen,
00286                 uint8_t **dest, size_t *destlen)
00287 {
00288         charset_ascii_codec *c = (charset_ascii_codec *) codec;
00289         parserutils_error error;
00290 
00291         if (c->read_len > 0) {
00292                 /* Output left over from last decode */
00293                 uint32_t *pread = c->read_buf;
00294 
00295                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00296                         *((uint32_t *) (void *) *dest) =
00297                                         endian_host_to_big(pread[0]);
00298 
00299                         *dest += 4;
00300                         *destlen -= 4;
00301 
00302                         pread++;
00303                         c->read_len--;
00304                 }
00305 
00306                 if (*destlen < c->read_len * 4) {
00307                         /* Ran out of output buffer */
00308                         size_t i;
00309 
00310                         /* Shuffle remaining output down */
00311                         for (i = 0; i < c->read_len; i++)
00312                                 c->read_buf[i] = pread[i];
00313 
00314                         return PARSERUTILS_NOMEM;
00315                 }
00316         }
00317 
00318         /* Finally, the "normal" case; process all outstanding characters */
00319         while (*sourcelen > 0) {
00320                 error = charset_ascii_codec_read_char(c,
00321                                 source, sourcelen, dest, destlen);
00322                 if (error != PARSERUTILS_OK) {
00323                         return error;
00324                 }
00325         }
00326 
00327         return PARSERUTILS_OK;
00328 }
00329 
00336 parserutils_error charset_ascii_codec_reset(parserutils_charset_codec *codec)
00337 {
00338         charset_ascii_codec *c = (charset_ascii_codec *) codec;
00339 
00340         c->read_buf[0] = 0;
00341         c->read_len = 0;
00342 
00343         c->write_buf[0] = 0;
00344         c->write_len = 0;
00345 
00346         return PARSERUTILS_OK;
00347 }
00348 
00349 
00378 parserutils_error charset_ascii_codec_read_char(charset_ascii_codec *c,
00379                 const uint8_t **source, size_t *sourcelen,
00380                 uint8_t **dest, size_t *destlen)
00381 {
00382         uint32_t ucs4;
00383         parserutils_error error;
00384 
00385         /* Convert a single character */
00386         error = charset_ascii_to_ucs4(c, *source, *sourcelen, &ucs4);
00387         if (error == PARSERUTILS_OK) {
00388                 /* Read a character */
00389                 error = charset_ascii_codec_output_decoded_char(c,
00390                                 ucs4, dest, destlen);
00391                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00392                         /* output succeeded; update source pointers */
00393                         *source += 1;
00394                         *sourcelen -= 1;
00395                 }
00396 
00397                 return error;
00398         } else if (error == PARSERUTILS_NEEDDATA) {
00399                 /* Can only happen if sourcelen == 0 */
00400                 return error;
00401         } else if (error == PARSERUTILS_INVALID) {
00402                 /* Illegal input sequence */
00403 
00404                 /* Strict errormode; simply flag invalid character */
00405                 if (c->base.errormode ==
00406                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00407                         return PARSERUTILS_INVALID;
00408                 }
00409 
00410                 /* output U+FFFD and continue processing. */
00411                 error = charset_ascii_codec_output_decoded_char(c,
00412                                 0xFFFD, dest, destlen);
00413                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00414                         /* output succeeded; update source pointers */
00415                         *source += 1;
00416                         *sourcelen -= 1;
00417                 }
00418 
00419                 return error;
00420         }
00421 
00422         return PARSERUTILS_OK;
00423 }
00424 
00435 parserutils_error charset_ascii_codec_output_decoded_char(
00436                 charset_ascii_codec *c,
00437                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00438 {
00439         if (*destlen < 4) {
00440                 /* Run out of output buffer */
00441                 c->read_len = 1;
00442                 c->read_buf[0] = ucs4;
00443 
00444                 return PARSERUTILS_NOMEM;
00445         }
00446 
00447         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00448         *dest += 4;
00449         *destlen -= 4;
00450 
00451         return PARSERUTILS_OK;
00452 }
00453 
00470 parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c,
00471                 uint32_t ucs4, uint8_t **s, size_t *len)
00472 {
00473         uint8_t out = 0;
00474 
00475         if (*len < 1)
00476                 return PARSERUTILS_NOMEM;
00477 
00478         if (ucs4 < 0x80) {
00479                 /* ASCII */
00480                 out = ucs4;
00481         } else {
00482                 if (c->base.errormode == PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
00483                         return PARSERUTILS_INVALID;
00484                 else
00485                         out = '?';
00486         }
00487 
00488         *(*s) = out;
00489         (*s)++;
00490         (*len)--;
00491 
00492         return PARSERUTILS_OK;
00493 }
00494 
00506 parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c,
00507                 const uint8_t *s, size_t len, uint32_t *ucs4)
00508 {
00509         uint32_t out;
00510 
00511         UNUSED(c);
00512 
00513         if (len < 1)
00514                 return PARSERUTILS_NEEDDATA;
00515 
00516         if (*s < 0x80) {
00517                 out = *s;
00518         } else {
00519                 return PARSERUTILS_INVALID;
00520         }
00521 
00522         *ucs4 = out;
00523 
00524         return PARSERUTILS_OK;
00525 }
00526 
00527 const parserutils_charset_handler charset_ascii_codec_handler = {
00528         charset_ascii_codec_handles_charset,
00529         charset_ascii_codec_create
00530 };
00531 

Generated on Wed Jul 29 11:59:20 2015 for Libparserutils by  doxygen 1.5.6