codec_ext8.c

Go to the documentation of this file.
00001 /*
00002  * This file is part of LibParserUtils.
00003  * Licensed under the MIT License,
00004  *                http://www.opensource.org/licenses/mit-license.php
00005  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
00006  */
00007 
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011 
00012 #include <parserutils/charset/mibenum.h>
00013 
00014 #include "charset/codecs/codec_impl.h"
00015 #include "utils/endian.h"
00016 #include "utils/utils.h"
00017 
00018 #include "charset/codecs/ext8_tables.h"
00019 
00020 static struct {
00021         uint16_t mib;
00022         const char *name;
00023         size_t len;
00024         uint32_t *table;
00025 } known_charsets[] = {
00026         { 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
00027         { 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
00028         { 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
00029         { 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
00030         { 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
00031         { 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
00032         { 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
00033         { 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
00034         { 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
00035 };
00036 
00040 typedef struct charset_ext8_codec {
00041         parserutils_charset_codec base; 
00043         uint32_t *table;                
00045 #define READ_BUFSIZE (8)
00046         uint32_t read_buf[READ_BUFSIZE];        
00049         size_t read_len;                
00051 #define WRITE_BUFSIZE (8)
00052         uint32_t write_buf[WRITE_BUFSIZE];      
00055         size_t write_len;               
00057 } charset_ext8_codec;
00058 
00059 static bool charset_ext8_codec_handles_charset(const char *charset);
00060 static parserutils_error charset_ext8_codec_create(const char *charset,
00061                 parserutils_charset_codec **codec);
00062 static parserutils_error charset_ext8_codec_destroy(
00063                 parserutils_charset_codec *codec);
00064 static parserutils_error charset_ext8_codec_encode(
00065                 parserutils_charset_codec *codec,
00066                 const uint8_t **source, size_t *sourcelen,
00067                 uint8_t **dest, size_t *destlen);
00068 static parserutils_error charset_ext8_codec_decode(
00069                 parserutils_charset_codec *codec,
00070                 const uint8_t **source, size_t *sourcelen,
00071                 uint8_t **dest, size_t *destlen);
00072 static parserutils_error charset_ext8_codec_reset(
00073                 parserutils_charset_codec *codec);
00074 static inline parserutils_error charset_ext8_codec_read_char(
00075                 charset_ext8_codec *c,
00076                 const uint8_t **source, size_t *sourcelen,
00077                 uint8_t **dest, size_t *destlen);
00078 static inline parserutils_error charset_ext8_codec_output_decoded_char(
00079                 charset_ext8_codec *c,
00080                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00081 static inline parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
00082                 uint32_t ucs4, uint8_t **s, size_t *len);
00083 static inline parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
00084                 const uint8_t *s, size_t len, uint32_t *ucs4);
00085 
00092 bool charset_ext8_codec_handles_charset(const char *charset)
00093 {
00094         uint32_t i;
00095         uint16_t match = parserutils_charset_mibenum_from_name(charset,
00096                         strlen(charset));
00097 
00098         if (known_charsets[0].mib == 0) {
00099                 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00100                         known_charsets[i].mib =
00101                                 parserutils_charset_mibenum_from_name(
00102                                                 known_charsets[i].name,
00103                                                 known_charsets[i].len);
00104                 }
00105         }
00106 
00107         for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00108                 if (known_charsets[i].mib == match)
00109                         return true;
00110         }
00111 
00112         return false;
00113 }
00114 
00124 parserutils_error charset_ext8_codec_create(const char *charset,
00125                 parserutils_charset_codec **codec)
00126 {
00127         uint32_t i;
00128         charset_ext8_codec *c;
00129         uint16_t match = parserutils_charset_mibenum_from_name(
00130                         charset, strlen(charset));
00131         uint32_t *table = NULL;
00132 
00133         for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00134                 if (known_charsets[i].mib == match) {
00135                         table = known_charsets[i].table;
00136                         break;
00137                 }
00138         }
00139 
00140         assert(table != NULL);
00141 
00142         c = malloc(sizeof(charset_ext8_codec));
00143         if (c == NULL)
00144                 return PARSERUTILS_NOMEM;
00145 
00146         c->table = table;
00147 
00148         c->read_buf[0] = 0;
00149         c->read_len = 0;
00150 
00151         c->write_buf[0] = 0;
00152         c->write_len = 0;
00153 
00154         /* Finally, populate vtable */
00155         c->base.handler.destroy = charset_ext8_codec_destroy;
00156         c->base.handler.encode = charset_ext8_codec_encode;
00157         c->base.handler.decode = charset_ext8_codec_decode;
00158         c->base.handler.reset = charset_ext8_codec_reset;
00159 
00160         *codec = (parserutils_charset_codec *) c;
00161 
00162         return PARSERUTILS_OK;
00163 }
00164 
00171 parserutils_error charset_ext8_codec_destroy (parserutils_charset_codec *codec)
00172 {
00173         UNUSED(codec);
00174 
00175         return PARSERUTILS_OK;
00176 }
00177 
00205 parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec,
00206                 const uint8_t **source, size_t *sourcelen,
00207                 uint8_t **dest, size_t *destlen)
00208 {
00209         charset_ext8_codec *c = (charset_ext8_codec *) codec;
00210         uint32_t ucs4;
00211         uint32_t *towrite;
00212         size_t towritelen;
00213         parserutils_error error;
00214 
00215         /* Process any outstanding characters from the previous call */
00216         if (c->write_len > 0) {
00217                 uint32_t *pwrite = c->write_buf;
00218 
00219                 while (c->write_len > 0) {
00220                         error = charset_ext8_from_ucs4(c, pwrite[0],
00221                                         dest, destlen);
00222                         if (error != PARSERUTILS_OK) {
00223                                 uint32_t len;
00224                                 assert(error == PARSERUTILS_NOMEM);
00225 
00226                                 for (len = 0; len < c->write_len; len++) {
00227                                         c->write_buf[len] = pwrite[len];
00228                                 }
00229 
00230                                 return error;
00231                         }
00232 
00233                         pwrite++;
00234                         c->write_len--;
00235                 }
00236         }
00237 
00238         /* Now process the characters for this call */
00239         while (*sourcelen > 0) {
00240                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00241                 towrite = &ucs4;
00242                 towritelen = 1;
00243 
00244                 /* Output current characters */
00245                 while (towritelen > 0) {
00246                         error = charset_ext8_from_ucs4(c, towrite[0], dest,
00247                                         destlen);
00248                         if (error != PARSERUTILS_OK) {
00249                                 uint32_t len;
00250                                 if (error != PARSERUTILS_NOMEM) {
00251                                         return error;
00252                                 }
00253 
00254                                 /* Insufficient output space */
00255                                 assert(towritelen < WRITE_BUFSIZE);
00256 
00257                                 c->write_len = towritelen;
00258 
00259                                 /* Copy pending chars to save area, for
00260                                  * processing next call. */
00261                                 for (len = 0; len < towritelen; len++)
00262                                         c->write_buf[len] = towrite[len];
00263 
00264                                 /* Claim character we've just buffered,
00265                                  * so it's not reprocessed */
00266                                 *source += 4;
00267                                 *sourcelen -= 4;
00268 
00269                                 return PARSERUTILS_NOMEM;
00270                         }
00271 
00272                         towrite++;
00273                         towritelen--;
00274                 }
00275 
00276                 *source += 4;
00277                 *sourcelen -= 4;
00278         }
00279 
00280         return PARSERUTILS_OK;
00281 }
00282 
00324 parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec,
00325                 const uint8_t **source, size_t *sourcelen,
00326                 uint8_t **dest, size_t *destlen)
00327 {
00328         charset_ext8_codec *c = (charset_ext8_codec *) codec;
00329         parserutils_error error;
00330 
00331         if (c->read_len > 0) {
00332                 /* Output left over from last decode */
00333                 uint32_t *pread = c->read_buf;
00334 
00335                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00336                         *((uint32_t *) (void *) *dest) =
00337                                         endian_host_to_big(pread[0]);
00338 
00339                         *dest += 4;
00340                         *destlen -= 4;
00341 
00342                         pread++;
00343                         c->read_len--;
00344                 }
00345 
00346                 if (*destlen < c->read_len * 4) {
00347                         /* Ran out of output buffer */
00348                         size_t i;
00349 
00350                         /* Shuffle remaining output down */
00351                         for (i = 0; i < c->read_len; i++)
00352                                 c->read_buf[i] = pread[i];
00353 
00354                         return PARSERUTILS_NOMEM;
00355                 }
00356         }
00357 
00358         /* Finally, the "normal" case; process all outstanding characters */
00359         while (*sourcelen > 0) {
00360                 error = charset_ext8_codec_read_char(c,
00361                                 source, sourcelen, dest, destlen);
00362                 if (error != PARSERUTILS_OK) {
00363                         return error;
00364                 }
00365         }
00366 
00367         return PARSERUTILS_OK;
00368 }
00369 
00376 parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
00377 {
00378         charset_ext8_codec *c = (charset_ext8_codec *) codec;
00379 
00380         c->read_buf[0] = 0;
00381         c->read_len = 0;
00382 
00383         c->write_buf[0] = 0;
00384         c->write_len = 0;
00385 
00386         return PARSERUTILS_OK;
00387 }
00388 
00389 
00418 parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c,
00419                 const uint8_t **source, size_t *sourcelen,
00420                 uint8_t **dest, size_t *destlen)
00421 {
00422         uint32_t ucs4;
00423         parserutils_error error;
00424 
00425         /* Convert a single character */
00426         error = charset_ext8_to_ucs4(c, *source, *sourcelen, &ucs4);
00427         if (error == PARSERUTILS_OK) {
00428                 /* Read a character */
00429                 error = charset_ext8_codec_output_decoded_char(c,
00430                                 ucs4, dest, destlen);
00431                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00432                         /* output succeeded; update source pointers */
00433                         *source += 1;
00434                         *sourcelen -= 1;
00435                 }
00436 
00437                 return error;
00438         } else if (error == PARSERUTILS_NEEDDATA) {
00439                 /* Can only happen if sourcelen == 0 */
00440                 return error;
00441         } else if (error == PARSERUTILS_INVALID) {
00442                 /* Illegal input sequence */
00443 
00444                 /* Strict errormode; simply flag invalid character */
00445                 if (c->base.errormode ==
00446                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00447                         return PARSERUTILS_INVALID;
00448                 }
00449 
00450                 /* output U+FFFD and continue processing. */
00451                 error = charset_ext8_codec_output_decoded_char(c,
00452                                 0xFFFD, dest, destlen);
00453                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00454                         /* output succeeded; update source pointers */
00455                         *source += 1;
00456                         *sourcelen -= 1;
00457                 }
00458 
00459                 return error;
00460         }
00461 
00462         return PARSERUTILS_OK;
00463 }
00464 
00475 parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c,
00476                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00477 {
00478         if (*destlen < 4) {
00479                 /* Run out of output buffer */
00480                 c->read_len = 1;
00481                 c->read_buf[0] = ucs4;
00482 
00483                 return PARSERUTILS_NOMEM;
00484         }
00485 
00486         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00487         *dest += 4;
00488         *destlen -= 4;
00489 
00490         return PARSERUTILS_OK;
00491 }
00492 
00509 parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
00510                 uint32_t ucs4, uint8_t **s, size_t *len)
00511 {
00512         uint8_t out = 0;
00513 
00514         if (*len < 1)
00515                 return PARSERUTILS_NOMEM;
00516 
00517         if (ucs4 < 0x80) {
00518                 /* ASCII */
00519                 out = ucs4;
00520         } else {
00521                 uint32_t i;
00522 
00523                 for (i = 0; i < 128; i++) {
00524                         if (ucs4 == c->table[i])
00525                                 break;
00526                 }
00527 
00528                 if (i == 128) {
00529                         if (c->base.errormode ==
00530                                         PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
00531                                 return PARSERUTILS_INVALID;
00532                         else
00533                                 out = '?';
00534                 } else {
00535                         out = 0x80 + i;
00536                 }
00537         }
00538 
00539         *(*s) = out;
00540         (*s)++;
00541         (*len)--;
00542 
00543         return PARSERUTILS_OK;
00544 }
00545 
00557 parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
00558                 const uint8_t *s, size_t len, uint32_t *ucs4)
00559 {
00560         uint32_t out;
00561 
00562         if (len < 1)
00563                 return PARSERUTILS_NEEDDATA;
00564 
00565         if (*s < 0x80) {
00566                 out = *s;
00567         } else {
00568                 if (c->table[*s - 0x80] == 0xFFFF)
00569                         return PARSERUTILS_INVALID;
00570 
00571                 out = c->table[*s - 0x80];
00572         }
00573 
00574         *ucs4 = out;
00575 
00576         return PARSERUTILS_OK;
00577 }
00578 
00579 const parserutils_charset_handler charset_ext8_codec_handler = {
00580         charset_ext8_codec_handles_charset,
00581         charset_ext8_codec_create
00582 };
00583 

Generated on Wed Jul 29 11:59:20 2015 for Libparserutils by  doxygen 1.5.6