00001
00002
00003
00004
00005
00006
00007
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011
00012 #include <parserutils/charset/mibenum.h>
00013
00014 #include "charset/codecs/codec_impl.h"
00015 #include "utils/endian.h"
00016 #include "utils/utils.h"
00017
00018 #include "charset/codecs/8859_tables.h"
00019
00020 static struct {
00021 uint16_t mib;
00022 const char *name;
00023 size_t len;
00024 uint32_t *table;
00025 } known_charsets[] = {
00026 { 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 },
00027 { 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 },
00028 { 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 },
00029 { 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 },
00030 { 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 },
00031 { 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 },
00032 { 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 },
00033 { 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 },
00034 { 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 },
00035 { 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 },
00036 { 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 },
00037 { 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 },
00038 { 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 },
00039 { 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 },
00040 { 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 }
00041 };
00042
00046 typedef struct charset_8859_codec {
00047 parserutils_charset_codec base;
00049 uint32_t *table;
00051 #define READ_BUFSIZE (8)
00052 uint32_t read_buf[READ_BUFSIZE];
00055 size_t read_len;
00057 #define WRITE_BUFSIZE (8)
00058 uint32_t write_buf[WRITE_BUFSIZE];
00061 size_t write_len;
00063 } charset_8859_codec;
00064
00065 static bool charset_8859_codec_handles_charset(const char *charset);
00066 static parserutils_error charset_8859_codec_create(const char *charset,
00067 parserutils_charset_codec **codec);
00068 static parserutils_error charset_8859_codec_destroy(
00069 parserutils_charset_codec *codec);
00070 static parserutils_error charset_8859_codec_encode(
00071 parserutils_charset_codec *codec,
00072 const uint8_t **source, size_t *sourcelen,
00073 uint8_t **dest, size_t *destlen);
00074 static parserutils_error charset_8859_codec_decode(
00075 parserutils_charset_codec *codec,
00076 const uint8_t **source, size_t *sourcelen,
00077 uint8_t **dest, size_t *destlen);
00078 static parserutils_error charset_8859_codec_reset(
00079 parserutils_charset_codec *codec);
00080 static inline parserutils_error charset_8859_codec_read_char(
00081 charset_8859_codec *c,
00082 const uint8_t **source, size_t *sourcelen,
00083 uint8_t **dest, size_t *destlen);
00084 static inline parserutils_error charset_8859_codec_output_decoded_char(
00085 charset_8859_codec *c,
00086 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00087 static inline parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
00088 uint32_t ucs4, uint8_t **s, size_t *len);
00089 static inline parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
00090 const uint8_t *s, size_t len, uint32_t *ucs4);
00091
00098 bool charset_8859_codec_handles_charset(const char *charset)
00099 {
00100 uint32_t i;
00101 uint16_t match = parserutils_charset_mibenum_from_name(charset,
00102 strlen(charset));
00103
00104 if (known_charsets[0].mib == 0) {
00105 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00106 known_charsets[i].mib =
00107 parserutils_charset_mibenum_from_name(
00108 known_charsets[i].name,
00109 known_charsets[i].len);
00110 }
00111 }
00112
00113 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00114 if (known_charsets[i].mib == match)
00115 return true;
00116 }
00117
00118 return false;
00119 }
00120
00130 parserutils_error charset_8859_codec_create(const char *charset,
00131 parserutils_charset_codec **codec)
00132 {
00133 uint32_t i;
00134 charset_8859_codec *c;
00135 uint16_t match = parserutils_charset_mibenum_from_name(
00136 charset, strlen(charset));
00137 uint32_t *table = NULL;
00138
00139 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00140 if (known_charsets[i].mib == match) {
00141 table = known_charsets[i].table;
00142 break;
00143 }
00144 }
00145
00146 assert(table != NULL);
00147
00148 c = malloc(sizeof(charset_8859_codec));
00149 if (c == NULL)
00150 return PARSERUTILS_NOMEM;
00151
00152 c->table = table;
00153
00154 c->read_buf[0] = 0;
00155 c->read_len = 0;
00156
00157 c->write_buf[0] = 0;
00158 c->write_len = 0;
00159
00160
00161 c->base.handler.destroy = charset_8859_codec_destroy;
00162 c->base.handler.encode = charset_8859_codec_encode;
00163 c->base.handler.decode = charset_8859_codec_decode;
00164 c->base.handler.reset = charset_8859_codec_reset;
00165
00166 *codec = (parserutils_charset_codec *) c;
00167
00168 return PARSERUTILS_OK;
00169 }
00170
00177 parserutils_error charset_8859_codec_destroy (parserutils_charset_codec *codec)
00178 {
00179 UNUSED(codec);
00180
00181 return PARSERUTILS_OK;
00182 }
00183
00211 parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec,
00212 const uint8_t **source, size_t *sourcelen,
00213 uint8_t **dest, size_t *destlen)
00214 {
00215 charset_8859_codec *c = (charset_8859_codec *) codec;
00216 uint32_t ucs4;
00217 uint32_t *towrite;
00218 size_t towritelen;
00219 parserutils_error error;
00220
00221
00222 if (c->write_len > 0) {
00223 uint32_t *pwrite = c->write_buf;
00224
00225 while (c->write_len > 0) {
00226 error = charset_8859_from_ucs4(c, pwrite[0],
00227 dest, destlen);
00228 if (error != PARSERUTILS_OK) {
00229 uint32_t len;
00230 assert(error == PARSERUTILS_NOMEM);
00231
00232 for (len = 0; len < c->write_len; len++) {
00233 c->write_buf[len] = pwrite[len];
00234 }
00235
00236 return error;
00237 }
00238
00239 pwrite++;
00240 c->write_len--;
00241 }
00242 }
00243
00244
00245 while (*sourcelen > 0) {
00246 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00247 towrite = &ucs4;
00248 towritelen = 1;
00249
00250
00251 while (towritelen > 0) {
00252 error = charset_8859_from_ucs4(c, towrite[0], dest,
00253 destlen);
00254 if (error != PARSERUTILS_OK) {
00255 uint32_t len;
00256 if (error != PARSERUTILS_NOMEM) {
00257 return error;
00258 }
00259
00260
00261 assert(towritelen < WRITE_BUFSIZE);
00262
00263 c->write_len = towritelen;
00264
00265
00266
00267 for (len = 0; len < towritelen; len++)
00268 c->write_buf[len] = towrite[len];
00269
00270
00271
00272 *source += 4;
00273 *sourcelen -= 4;
00274
00275 return PARSERUTILS_NOMEM;
00276 }
00277
00278 towrite++;
00279 towritelen--;
00280 }
00281
00282 *source += 4;
00283 *sourcelen -= 4;
00284 }
00285
00286 return PARSERUTILS_OK;
00287 }
00288
00330 parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec,
00331 const uint8_t **source, size_t *sourcelen,
00332 uint8_t **dest, size_t *destlen)
00333 {
00334 charset_8859_codec *c = (charset_8859_codec *) codec;
00335 parserutils_error error;
00336
00337 if (c->read_len > 0) {
00338
00339 uint32_t *pread = c->read_buf;
00340
00341 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00342 *((uint32_t *) (void *) *dest) =
00343 endian_host_to_big(pread[0]);
00344
00345 *dest += 4;
00346 *destlen -= 4;
00347
00348 pread++;
00349 c->read_len--;
00350 }
00351
00352 if (*destlen < c->read_len * 4) {
00353
00354 size_t i;
00355
00356
00357 for (i = 0; i < c->read_len; i++)
00358 c->read_buf[i] = pread[i];
00359
00360 return PARSERUTILS_NOMEM;
00361 }
00362 }
00363
00364
00365 while (*sourcelen > 0) {
00366 error = charset_8859_codec_read_char(c,
00367 source, sourcelen, dest, destlen);
00368 if (error != PARSERUTILS_OK) {
00369 return error;
00370 }
00371 }
00372
00373 return PARSERUTILS_OK;
00374 }
00375
00382 parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec)
00383 {
00384 charset_8859_codec *c = (charset_8859_codec *) codec;
00385
00386 c->read_buf[0] = 0;
00387 c->read_len = 0;
00388
00389 c->write_buf[0] = 0;
00390 c->write_len = 0;
00391
00392 return PARSERUTILS_OK;
00393 }
00394
00395
00424 parserutils_error charset_8859_codec_read_char(charset_8859_codec *c,
00425 const uint8_t **source, size_t *sourcelen,
00426 uint8_t **dest, size_t *destlen)
00427 {
00428 uint32_t ucs4;
00429 parserutils_error error;
00430
00431
00432 error = charset_8859_to_ucs4(c, *source, *sourcelen, &ucs4);
00433 if (error == PARSERUTILS_OK) {
00434
00435 error = charset_8859_codec_output_decoded_char(c,
00436 ucs4, dest, destlen);
00437 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00438
00439 *source += 1;
00440 *sourcelen -= 1;
00441 }
00442
00443 return error;
00444 } else if (error == PARSERUTILS_NEEDDATA) {
00445
00446 return error;
00447 } else if (error == PARSERUTILS_INVALID) {
00448
00449
00450
00451 if (c->base.errormode ==
00452 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00453 return PARSERUTILS_INVALID;
00454 }
00455
00456
00457 error = charset_8859_codec_output_decoded_char(c,
00458 0xFFFD, dest, destlen);
00459 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00460
00461 *source += 1;
00462 *sourcelen -= 1;
00463 }
00464
00465 return error;
00466 }
00467
00468 return PARSERUTILS_OK;
00469 }
00470
00481 parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c,
00482 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00483 {
00484 if (*destlen < 4) {
00485
00486 c->read_len = 1;
00487 c->read_buf[0] = ucs4;
00488
00489 return PARSERUTILS_NOMEM;
00490 }
00491
00492 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00493 *dest += 4;
00494 *destlen -= 4;
00495
00496 return PARSERUTILS_OK;
00497 }
00498
00515 parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
00516 uint32_t ucs4, uint8_t **s, size_t *len)
00517 {
00518 uint8_t out = 0;
00519
00520 if (*len < 1)
00521 return PARSERUTILS_NOMEM;
00522
00523 if (ucs4 < 0x80) {
00524
00525 out = ucs4;
00526 } else {
00527 uint32_t i;
00528
00529 for (i = 0; i < 96; i++) {
00530 if (ucs4 == c->table[i])
00531 break;
00532 }
00533
00534 if (i == 96) {
00535 if (c->base.errormode ==
00536 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
00537 return PARSERUTILS_INVALID;
00538 else
00539 out = '?';
00540 } else {
00541 out = 0xA0 + i;
00542 }
00543 }
00544
00545 *(*s) = out;
00546 (*s)++;
00547 (*len)--;
00548
00549 return PARSERUTILS_OK;
00550 }
00551
00563 parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
00564 const uint8_t *s, size_t len, uint32_t *ucs4)
00565 {
00566 uint32_t out;
00567
00568 if (len < 1)
00569 return PARSERUTILS_NEEDDATA;
00570
00571 if (*s < 0x80) {
00572 out = *s;
00573 } else if (*s >= 0xA0) {
00574 if (c->table[*s - 0xA0] == 0xFFFF)
00575 return PARSERUTILS_INVALID;
00576
00577 out = c->table[*s - 0xA0];
00578 } else {
00579 return PARSERUTILS_INVALID;
00580 }
00581
00582 *ucs4 = out;
00583
00584 return PARSERUTILS_OK;
00585 }
00586
00587 const parserutils_charset_handler charset_8859_codec_handler = {
00588 charset_8859_codec_handles_charset,
00589 charset_8859_codec_create
00590 };
00591