codec_8859.c

Go to the documentation of this file.
00001 /*
00002  * This file is part of LibParserUtils.
00003  * Licensed under the MIT License,
00004  *                http://www.opensource.org/licenses/mit-license.php
00005  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
00006  */
00007 
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011 
00012 #include <parserutils/charset/mibenum.h>
00013 
00014 #include "charset/codecs/codec_impl.h"
00015 #include "utils/endian.h"
00016 #include "utils/utils.h"
00017 
00018 #include "charset/codecs/8859_tables.h"
00019 
00020 static struct {
00021         uint16_t mib;
00022         const char *name;
00023         size_t len;
00024         uint32_t *table;
00025 } known_charsets[] = {
00026         { 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 },
00027         { 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 },
00028         { 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 },
00029         { 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 },
00030         { 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 },
00031         { 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 },
00032         { 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 },
00033         { 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 },
00034         { 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 },
00035         { 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 },
00036         { 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 },
00037         { 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 },
00038         { 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 },
00039         { 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 },
00040         { 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 }
00041 };
00042 
00046 typedef struct charset_8859_codec {
00047         parserutils_charset_codec base; 
00049         uint32_t *table;                
00051 #define READ_BUFSIZE (8)
00052         uint32_t read_buf[READ_BUFSIZE];        
00055         size_t read_len;                
00057 #define WRITE_BUFSIZE (8)
00058         uint32_t write_buf[WRITE_BUFSIZE];      
00061         size_t write_len;               
00063 } charset_8859_codec;
00064 
00065 static bool charset_8859_codec_handles_charset(const char *charset);
00066 static parserutils_error charset_8859_codec_create(const char *charset,
00067                 parserutils_charset_codec **codec);
00068 static parserutils_error charset_8859_codec_destroy(
00069                 parserutils_charset_codec *codec);
00070 static parserutils_error charset_8859_codec_encode(
00071                 parserutils_charset_codec *codec,
00072                 const uint8_t **source, size_t *sourcelen,
00073                 uint8_t **dest, size_t *destlen);
00074 static parserutils_error charset_8859_codec_decode(
00075                 parserutils_charset_codec *codec,
00076                 const uint8_t **source, size_t *sourcelen,
00077                 uint8_t **dest, size_t *destlen);
00078 static parserutils_error charset_8859_codec_reset(
00079                 parserutils_charset_codec *codec);
00080 static inline parserutils_error charset_8859_codec_read_char(
00081                 charset_8859_codec *c,
00082                 const uint8_t **source, size_t *sourcelen,
00083                 uint8_t **dest, size_t *destlen);
00084 static inline parserutils_error charset_8859_codec_output_decoded_char(
00085                 charset_8859_codec *c,
00086                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00087 static inline parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
00088                 uint32_t ucs4, uint8_t **s, size_t *len);
00089 static inline parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
00090                 const uint8_t *s, size_t len, uint32_t *ucs4);
00091 
00098 bool charset_8859_codec_handles_charset(const char *charset)
00099 {
00100         uint32_t i;
00101         uint16_t match = parserutils_charset_mibenum_from_name(charset,
00102                         strlen(charset));
00103 
00104         if (known_charsets[0].mib == 0) {
00105                 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00106                         known_charsets[i].mib =
00107                                 parserutils_charset_mibenum_from_name(
00108                                                 known_charsets[i].name,
00109                                                 known_charsets[i].len);
00110                 }
00111         }
00112 
00113         for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00114                 if (known_charsets[i].mib == match)
00115                         return true;
00116         }
00117 
00118         return false;
00119 }
00120 
00130 parserutils_error charset_8859_codec_create(const char *charset,
00131                 parserutils_charset_codec **codec)
00132 {
00133         uint32_t i;
00134         charset_8859_codec *c;
00135         uint16_t match = parserutils_charset_mibenum_from_name(
00136                         charset, strlen(charset));
00137         uint32_t *table = NULL;
00138 
00139         for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00140                 if (known_charsets[i].mib == match) {
00141                         table = known_charsets[i].table;
00142                         break;
00143                 }
00144         }
00145 
00146         assert(table != NULL);
00147 
00148         c = malloc(sizeof(charset_8859_codec));
00149         if (c == NULL)
00150                 return PARSERUTILS_NOMEM;
00151 
00152         c->table = table;
00153 
00154         c->read_buf[0] = 0;
00155         c->read_len = 0;
00156 
00157         c->write_buf[0] = 0;
00158         c->write_len = 0;
00159 
00160         /* Finally, populate vtable */
00161         c->base.handler.destroy = charset_8859_codec_destroy;
00162         c->base.handler.encode = charset_8859_codec_encode;
00163         c->base.handler.decode = charset_8859_codec_decode;
00164         c->base.handler.reset = charset_8859_codec_reset;
00165 
00166         *codec = (parserutils_charset_codec *) c;
00167 
00168         return PARSERUTILS_OK;
00169 }
00170 
00177 parserutils_error charset_8859_codec_destroy (parserutils_charset_codec *codec)
00178 {
00179         UNUSED(codec);
00180 
00181         return PARSERUTILS_OK;
00182 }
00183 
00211 parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec,
00212                 const uint8_t **source, size_t *sourcelen,
00213                 uint8_t **dest, size_t *destlen)
00214 {
00215         charset_8859_codec *c = (charset_8859_codec *) codec;
00216         uint32_t ucs4;
00217         uint32_t *towrite;
00218         size_t towritelen;
00219         parserutils_error error;
00220 
00221         /* Process any outstanding characters from the previous call */
00222         if (c->write_len > 0) {
00223                 uint32_t *pwrite = c->write_buf;
00224 
00225                 while (c->write_len > 0) {
00226                         error = charset_8859_from_ucs4(c, pwrite[0],
00227                                         dest, destlen);
00228                         if (error != PARSERUTILS_OK) {
00229                                 uint32_t len;
00230                                 assert(error == PARSERUTILS_NOMEM);
00231 
00232                                 for (len = 0; len < c->write_len; len++) {
00233                                         c->write_buf[len] = pwrite[len];
00234                                 }
00235 
00236                                 return error;
00237                         }
00238 
00239                         pwrite++;
00240                         c->write_len--;
00241                 }
00242         }
00243 
00244         /* Now process the characters for this call */
00245         while (*sourcelen > 0) {
00246                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00247                 towrite = &ucs4;
00248                 towritelen = 1;
00249 
00250                 /* Output current characters */
00251                 while (towritelen > 0) {
00252                         error = charset_8859_from_ucs4(c, towrite[0], dest,
00253                                         destlen);
00254                         if (error != PARSERUTILS_OK) {
00255                                 uint32_t len;
00256                                 if (error != PARSERUTILS_NOMEM) {
00257                                         return error;
00258                                 }
00259 
00260                                 /* Insufficient output space */
00261                                 assert(towritelen < WRITE_BUFSIZE);
00262 
00263                                 c->write_len = towritelen;
00264 
00265                                 /* Copy pending chars to save area, for
00266                                  * processing next call. */
00267                                 for (len = 0; len < towritelen; len++)
00268                                         c->write_buf[len] = towrite[len];
00269 
00270                                 /* Claim character we've just buffered,
00271                                  * so it's not reprocessed */
00272                                 *source += 4;
00273                                 *sourcelen -= 4;
00274 
00275                                 return PARSERUTILS_NOMEM;
00276                         }
00277 
00278                         towrite++;
00279                         towritelen--;
00280                 }
00281 
00282                 *source += 4;
00283                 *sourcelen -= 4;
00284         }
00285 
00286         return PARSERUTILS_OK;
00287 }
00288 
00330 parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec,
00331                 const uint8_t **source, size_t *sourcelen,
00332                 uint8_t **dest, size_t *destlen)
00333 {
00334         charset_8859_codec *c = (charset_8859_codec *) codec;
00335         parserutils_error error;
00336 
00337         if (c->read_len > 0) {
00338                 /* Output left over from last decode */
00339                 uint32_t *pread = c->read_buf;
00340 
00341                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00342                         *((uint32_t *) (void *) *dest) =
00343                                         endian_host_to_big(pread[0]);
00344 
00345                         *dest += 4;
00346                         *destlen -= 4;
00347 
00348                         pread++;
00349                         c->read_len--;
00350                 }
00351 
00352                 if (*destlen < c->read_len * 4) {
00353                         /* Ran out of output buffer */
00354                         size_t i;
00355 
00356                         /* Shuffle remaining output down */
00357                         for (i = 0; i < c->read_len; i++)
00358                                 c->read_buf[i] = pread[i];
00359 
00360                         return PARSERUTILS_NOMEM;
00361                 }
00362         }
00363 
00364         /* Finally, the "normal" case; process all outstanding characters */
00365         while (*sourcelen > 0) {
00366                 error = charset_8859_codec_read_char(c,
00367                                 source, sourcelen, dest, destlen);
00368                 if (error != PARSERUTILS_OK) {
00369                         return error;
00370                 }
00371         }
00372 
00373         return PARSERUTILS_OK;
00374 }
00375 
00382 parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec)
00383 {
00384         charset_8859_codec *c = (charset_8859_codec *) codec;
00385 
00386         c->read_buf[0] = 0;
00387         c->read_len = 0;
00388 
00389         c->write_buf[0] = 0;
00390         c->write_len = 0;
00391 
00392         return PARSERUTILS_OK;
00393 }
00394 
00395 
00424 parserutils_error charset_8859_codec_read_char(charset_8859_codec *c,
00425                 const uint8_t **source, size_t *sourcelen,
00426                 uint8_t **dest, size_t *destlen)
00427 {
00428         uint32_t ucs4;
00429         parserutils_error error;
00430 
00431         /* Convert a single character */
00432         error = charset_8859_to_ucs4(c, *source, *sourcelen, &ucs4);
00433         if (error == PARSERUTILS_OK) {
00434                 /* Read a character */
00435                 error = charset_8859_codec_output_decoded_char(c,
00436                                 ucs4, dest, destlen);
00437                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00438                         /* output succeeded; update source pointers */
00439                         *source += 1;
00440                         *sourcelen -= 1;
00441                 }
00442 
00443                 return error;
00444         } else if (error == PARSERUTILS_NEEDDATA) {
00445                 /* Can only happen if sourcelen == 0 */
00446                 return error;
00447         } else if (error == PARSERUTILS_INVALID) {
00448                 /* Illegal input sequence */
00449 
00450                 /* Strict errormode; simply flag invalid character */
00451                 if (c->base.errormode ==
00452                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00453                         return PARSERUTILS_INVALID;
00454                 }
00455 
00456                 /* output U+FFFD and continue processing. */
00457                 error = charset_8859_codec_output_decoded_char(c,
00458                                 0xFFFD, dest, destlen);
00459                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00460                         /* output succeeded; update source pointers */
00461                         *source += 1;
00462                         *sourcelen -= 1;
00463                 }
00464 
00465                 return error;
00466         }
00467 
00468         return PARSERUTILS_OK;
00469 }
00470 
00481 parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c,
00482                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00483 {
00484         if (*destlen < 4) {
00485                 /* Run out of output buffer */
00486                 c->read_len = 1;
00487                 c->read_buf[0] = ucs4;
00488 
00489                 return PARSERUTILS_NOMEM;
00490         }
00491 
00492         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00493         *dest += 4;
00494         *destlen -= 4;
00495 
00496         return PARSERUTILS_OK;
00497 }
00498 
00515 parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
00516                 uint32_t ucs4, uint8_t **s, size_t *len)
00517 {
00518         uint8_t out = 0;
00519 
00520         if (*len < 1)
00521                 return PARSERUTILS_NOMEM;
00522 
00523         if (ucs4 < 0x80) {
00524                 /* ASCII */
00525                 out = ucs4;
00526         } else {
00527                 uint32_t i;
00528 
00529                 for (i = 0; i < 96; i++) {
00530                         if (ucs4 == c->table[i])
00531                                 break;
00532                 }
00533 
00534                 if (i == 96) {
00535                         if (c->base.errormode ==
00536                                         PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
00537                                 return PARSERUTILS_INVALID;
00538                         else
00539                                 out = '?';
00540                 } else {
00541                         out = 0xA0 + i;
00542                 }
00543         }
00544 
00545         *(*s) = out;
00546         (*s)++;
00547         (*len)--;
00548 
00549         return PARSERUTILS_OK;
00550 }
00551 
00563 parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
00564                 const uint8_t *s, size_t len, uint32_t *ucs4)
00565 {
00566         uint32_t out;
00567 
00568         if (len < 1)
00569                 return PARSERUTILS_NEEDDATA;
00570 
00571         if (*s < 0x80) {
00572                 out = *s;
00573         } else if (*s >= 0xA0) {
00574                 if (c->table[*s - 0xA0] == 0xFFFF)
00575                         return PARSERUTILS_INVALID;
00576 
00577                 out = c->table[*s - 0xA0];
00578         } else {
00579                 return PARSERUTILS_INVALID;
00580         }
00581 
00582         *ucs4 = out;
00583 
00584         return PARSERUTILS_OK;
00585 }
00586 
00587 const parserutils_charset_handler charset_8859_codec_handler = {
00588         charset_8859_codec_handles_charset,
00589         charset_8859_codec_create
00590 };
00591 

Generated on Wed Jul 29 11:59:20 2015 for Libparserutils by  doxygen 1.5.6