00001
00002
00003
00004
00005
00006
00007
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011
00012 #include <parserutils/charset/mibenum.h>
00013
00014 #include "charset/codecs/codec_impl.h"
00015 #include "utils/endian.h"
00016 #include "utils/utils.h"
00017
00018 #include "charset/codecs/ext8_tables.h"
00019
00020 static struct {
00021 uint16_t mib;
00022 const char *name;
00023 size_t len;
00024 uint32_t *table;
00025 } known_charsets[] = {
00026 { 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
00027 { 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
00028 { 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
00029 { 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
00030 { 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
00031 { 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
00032 { 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
00033 { 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
00034 { 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
00035 };
00036
00040 typedef struct charset_ext8_codec {
00041 parserutils_charset_codec base;
00043 uint32_t *table;
00045 #define READ_BUFSIZE (8)
00046 uint32_t read_buf[READ_BUFSIZE];
00049 size_t read_len;
00051 #define WRITE_BUFSIZE (8)
00052 uint32_t write_buf[WRITE_BUFSIZE];
00055 size_t write_len;
00057 } charset_ext8_codec;
00058
00059 static bool charset_ext8_codec_handles_charset(const char *charset);
00060 static parserutils_error charset_ext8_codec_create(const char *charset,
00061 parserutils_charset_codec **codec);
00062 static parserutils_error charset_ext8_codec_destroy(
00063 parserutils_charset_codec *codec);
00064 static parserutils_error charset_ext8_codec_encode(
00065 parserutils_charset_codec *codec,
00066 const uint8_t **source, size_t *sourcelen,
00067 uint8_t **dest, size_t *destlen);
00068 static parserutils_error charset_ext8_codec_decode(
00069 parserutils_charset_codec *codec,
00070 const uint8_t **source, size_t *sourcelen,
00071 uint8_t **dest, size_t *destlen);
00072 static parserutils_error charset_ext8_codec_reset(
00073 parserutils_charset_codec *codec);
00074 static inline parserutils_error charset_ext8_codec_read_char(
00075 charset_ext8_codec *c,
00076 const uint8_t **source, size_t *sourcelen,
00077 uint8_t **dest, size_t *destlen);
00078 static inline parserutils_error charset_ext8_codec_output_decoded_char(
00079 charset_ext8_codec *c,
00080 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00081 static inline parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
00082 uint32_t ucs4, uint8_t **s, size_t *len);
00083 static inline parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
00084 const uint8_t *s, size_t len, uint32_t *ucs4);
00085
00092 bool charset_ext8_codec_handles_charset(const char *charset)
00093 {
00094 uint32_t i;
00095 uint16_t match = parserutils_charset_mibenum_from_name(charset,
00096 strlen(charset));
00097
00098 if (known_charsets[0].mib == 0) {
00099 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00100 known_charsets[i].mib =
00101 parserutils_charset_mibenum_from_name(
00102 known_charsets[i].name,
00103 known_charsets[i].len);
00104 }
00105 }
00106
00107 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00108 if (known_charsets[i].mib == match)
00109 return true;
00110 }
00111
00112 return false;
00113 }
00114
00124 parserutils_error charset_ext8_codec_create(const char *charset,
00125 parserutils_charset_codec **codec)
00126 {
00127 uint32_t i;
00128 charset_ext8_codec *c;
00129 uint16_t match = parserutils_charset_mibenum_from_name(
00130 charset, strlen(charset));
00131 uint32_t *table = NULL;
00132
00133 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
00134 if (known_charsets[i].mib == match) {
00135 table = known_charsets[i].table;
00136 break;
00137 }
00138 }
00139
00140 assert(table != NULL);
00141
00142 c = malloc(sizeof(charset_ext8_codec));
00143 if (c == NULL)
00144 return PARSERUTILS_NOMEM;
00145
00146 c->table = table;
00147
00148 c->read_buf[0] = 0;
00149 c->read_len = 0;
00150
00151 c->write_buf[0] = 0;
00152 c->write_len = 0;
00153
00154
00155 c->base.handler.destroy = charset_ext8_codec_destroy;
00156 c->base.handler.encode = charset_ext8_codec_encode;
00157 c->base.handler.decode = charset_ext8_codec_decode;
00158 c->base.handler.reset = charset_ext8_codec_reset;
00159
00160 *codec = (parserutils_charset_codec *) c;
00161
00162 return PARSERUTILS_OK;
00163 }
00164
00171 parserutils_error charset_ext8_codec_destroy (parserutils_charset_codec *codec)
00172 {
00173 UNUSED(codec);
00174
00175 return PARSERUTILS_OK;
00176 }
00177
00205 parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec,
00206 const uint8_t **source, size_t *sourcelen,
00207 uint8_t **dest, size_t *destlen)
00208 {
00209 charset_ext8_codec *c = (charset_ext8_codec *) codec;
00210 uint32_t ucs4;
00211 uint32_t *towrite;
00212 size_t towritelen;
00213 parserutils_error error;
00214
00215
00216 if (c->write_len > 0) {
00217 uint32_t *pwrite = c->write_buf;
00218
00219 while (c->write_len > 0) {
00220 error = charset_ext8_from_ucs4(c, pwrite[0],
00221 dest, destlen);
00222 if (error != PARSERUTILS_OK) {
00223 uint32_t len;
00224 assert(error == PARSERUTILS_NOMEM);
00225
00226 for (len = 0; len < c->write_len; len++) {
00227 c->write_buf[len] = pwrite[len];
00228 }
00229
00230 return error;
00231 }
00232
00233 pwrite++;
00234 c->write_len--;
00235 }
00236 }
00237
00238
00239 while (*sourcelen > 0) {
00240 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00241 towrite = &ucs4;
00242 towritelen = 1;
00243
00244
00245 while (towritelen > 0) {
00246 error = charset_ext8_from_ucs4(c, towrite[0], dest,
00247 destlen);
00248 if (error != PARSERUTILS_OK) {
00249 uint32_t len;
00250 if (error != PARSERUTILS_NOMEM) {
00251 return error;
00252 }
00253
00254
00255 assert(towritelen < WRITE_BUFSIZE);
00256
00257 c->write_len = towritelen;
00258
00259
00260
00261 for (len = 0; len < towritelen; len++)
00262 c->write_buf[len] = towrite[len];
00263
00264
00265
00266 *source += 4;
00267 *sourcelen -= 4;
00268
00269 return PARSERUTILS_NOMEM;
00270 }
00271
00272 towrite++;
00273 towritelen--;
00274 }
00275
00276 *source += 4;
00277 *sourcelen -= 4;
00278 }
00279
00280 return PARSERUTILS_OK;
00281 }
00282
00324 parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec,
00325 const uint8_t **source, size_t *sourcelen,
00326 uint8_t **dest, size_t *destlen)
00327 {
00328 charset_ext8_codec *c = (charset_ext8_codec *) codec;
00329 parserutils_error error;
00330
00331 if (c->read_len > 0) {
00332
00333 uint32_t *pread = c->read_buf;
00334
00335 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00336 *((uint32_t *) (void *) *dest) =
00337 endian_host_to_big(pread[0]);
00338
00339 *dest += 4;
00340 *destlen -= 4;
00341
00342 pread++;
00343 c->read_len--;
00344 }
00345
00346 if (*destlen < c->read_len * 4) {
00347
00348 size_t i;
00349
00350
00351 for (i = 0; i < c->read_len; i++)
00352 c->read_buf[i] = pread[i];
00353
00354 return PARSERUTILS_NOMEM;
00355 }
00356 }
00357
00358
00359 while (*sourcelen > 0) {
00360 error = charset_ext8_codec_read_char(c,
00361 source, sourcelen, dest, destlen);
00362 if (error != PARSERUTILS_OK) {
00363 return error;
00364 }
00365 }
00366
00367 return PARSERUTILS_OK;
00368 }
00369
00376 parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
00377 {
00378 charset_ext8_codec *c = (charset_ext8_codec *) codec;
00379
00380 c->read_buf[0] = 0;
00381 c->read_len = 0;
00382
00383 c->write_buf[0] = 0;
00384 c->write_len = 0;
00385
00386 return PARSERUTILS_OK;
00387 }
00388
00389
00418 parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c,
00419 const uint8_t **source, size_t *sourcelen,
00420 uint8_t **dest, size_t *destlen)
00421 {
00422 uint32_t ucs4;
00423 parserutils_error error;
00424
00425
00426 error = charset_ext8_to_ucs4(c, *source, *sourcelen, &ucs4);
00427 if (error == PARSERUTILS_OK) {
00428
00429 error = charset_ext8_codec_output_decoded_char(c,
00430 ucs4, dest, destlen);
00431 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00432
00433 *source += 1;
00434 *sourcelen -= 1;
00435 }
00436
00437 return error;
00438 } else if (error == PARSERUTILS_NEEDDATA) {
00439
00440 return error;
00441 } else if (error == PARSERUTILS_INVALID) {
00442
00443
00444
00445 if (c->base.errormode ==
00446 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00447 return PARSERUTILS_INVALID;
00448 }
00449
00450
00451 error = charset_ext8_codec_output_decoded_char(c,
00452 0xFFFD, dest, destlen);
00453 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00454
00455 *source += 1;
00456 *sourcelen -= 1;
00457 }
00458
00459 return error;
00460 }
00461
00462 return PARSERUTILS_OK;
00463 }
00464
00475 parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c,
00476 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00477 {
00478 if (*destlen < 4) {
00479
00480 c->read_len = 1;
00481 c->read_buf[0] = ucs4;
00482
00483 return PARSERUTILS_NOMEM;
00484 }
00485
00486 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00487 *dest += 4;
00488 *destlen -= 4;
00489
00490 return PARSERUTILS_OK;
00491 }
00492
00509 parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
00510 uint32_t ucs4, uint8_t **s, size_t *len)
00511 {
00512 uint8_t out = 0;
00513
00514 if (*len < 1)
00515 return PARSERUTILS_NOMEM;
00516
00517 if (ucs4 < 0x80) {
00518
00519 out = ucs4;
00520 } else {
00521 uint32_t i;
00522
00523 for (i = 0; i < 128; i++) {
00524 if (ucs4 == c->table[i])
00525 break;
00526 }
00527
00528 if (i == 128) {
00529 if (c->base.errormode ==
00530 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
00531 return PARSERUTILS_INVALID;
00532 else
00533 out = '?';
00534 } else {
00535 out = 0x80 + i;
00536 }
00537 }
00538
00539 *(*s) = out;
00540 (*s)++;
00541 (*len)--;
00542
00543 return PARSERUTILS_OK;
00544 }
00545
00557 parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
00558 const uint8_t *s, size_t len, uint32_t *ucs4)
00559 {
00560 uint32_t out;
00561
00562 if (len < 1)
00563 return PARSERUTILS_NEEDDATA;
00564
00565 if (*s < 0x80) {
00566 out = *s;
00567 } else {
00568 if (c->table[*s - 0x80] == 0xFFFF)
00569 return PARSERUTILS_INVALID;
00570
00571 out = c->table[*s - 0x80];
00572 }
00573
00574 *ucs4 = out;
00575
00576 return PARSERUTILS_OK;
00577 }
00578
00579 const parserutils_charset_handler charset_ext8_codec_handler = {
00580 charset_ext8_codec_handles_charset,
00581 charset_ext8_codec_create
00582 };
00583