00001
00002
00003
00004
00005
00006
00007
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011
00012 #include <parserutils/charset/mibenum.h>
00013
00014 #include "charset/codecs/codec_impl.h"
00015 #include "charset/encodings/utf8impl.h"
00016 #include "utils/endian.h"
00017 #include "utils/utils.h"
00018
00022 typedef struct charset_utf8_codec {
00023 parserutils_charset_codec base;
00025 #define INVAL_BUFSIZE (32)
00026 uint8_t inval_buf[INVAL_BUFSIZE];
00029 size_t inval_len;
00030
00031 #define READ_BUFSIZE (8)
00032 uint32_t read_buf[READ_BUFSIZE];
00035 size_t read_len;
00037 #define WRITE_BUFSIZE (8)
00038 uint32_t write_buf[WRITE_BUFSIZE];
00041 size_t write_len;
00043 } charset_utf8_codec;
00044
00045 static bool charset_utf8_codec_handles_charset(const char *charset);
00046 static parserutils_error charset_utf8_codec_create(const char *charset,
00047 parserutils_charset_codec **codec);
00048 static parserutils_error charset_utf8_codec_destroy(
00049 parserutils_charset_codec *codec);
00050 static parserutils_error charset_utf8_codec_encode(
00051 parserutils_charset_codec *codec,
00052 const uint8_t **source, size_t *sourcelen,
00053 uint8_t **dest, size_t *destlen);
00054 static parserutils_error charset_utf8_codec_decode(
00055 parserutils_charset_codec *codec,
00056 const uint8_t **source, size_t *sourcelen,
00057 uint8_t **dest, size_t *destlen);
00058 static parserutils_error charset_utf8_codec_reset(
00059 parserutils_charset_codec *codec);
00060 static inline parserutils_error charset_utf8_codec_read_char(
00061 charset_utf8_codec *c,
00062 const uint8_t **source, size_t *sourcelen,
00063 uint8_t **dest, size_t *destlen);
00064 static inline parserutils_error charset_utf8_codec_output_decoded_char(
00065 charset_utf8_codec *c,
00066 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00067
/**
 * Determine whether this codec handles a specific charset.
 *
 * \param charset  NUL-terminated charset name to test
 * \return true if the name maps to the same MIB enum value as "UTF-8",
 *         false otherwise
 */
bool charset_utf8_codec_handles_charset(const char *charset)
{
	/* Compare MIB enum values rather than strings, so that any name the
	 * lookup resolves to the same value as "UTF-8" is accepted. */
	if (parserutils_charset_mibenum_from_name(charset,
			strlen(charset)) !=
			parserutils_charset_mibenum_from_name("UTF-8",
			SLEN("UTF-8"))) {
		return false;
	}

	return true;
}
00081
00091 parserutils_error charset_utf8_codec_create(const char *charset,
00092 parserutils_charset_codec **codec)
00093 {
00094 charset_utf8_codec *c;
00095
00096 UNUSED(charset);
00097
00098 c = malloc(sizeof(charset_utf8_codec));
00099 if (c == NULL)
00100 return PARSERUTILS_NOMEM;
00101
00102 c->inval_buf[0] = '\0';
00103 c->inval_len = 0;
00104
00105 c->read_buf[0] = 0;
00106 c->read_len = 0;
00107
00108 c->write_buf[0] = 0;
00109 c->write_len = 0;
00110
00111
00112 c->base.handler.destroy = charset_utf8_codec_destroy;
00113 c->base.handler.encode = charset_utf8_codec_encode;
00114 c->base.handler.decode = charset_utf8_codec_decode;
00115 c->base.handler.reset = charset_utf8_codec_reset;
00116
00117 *codec = (parserutils_charset_codec *) c;
00118
00119 return PARSERUTILS_OK;
00120 }
00121
00128 parserutils_error charset_utf8_codec_destroy (parserutils_charset_codec *codec)
00129 {
00130 UNUSED(codec);
00131
00132 return PARSERUTILS_OK;
00133 }
00134
00162 parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
00163 const uint8_t **source, size_t *sourcelen,
00164 uint8_t **dest, size_t *destlen)
00165 {
00166 charset_utf8_codec *c = (charset_utf8_codec *) codec;
00167 uint32_t ucs4;
00168 uint32_t *towrite;
00169 size_t towritelen;
00170 parserutils_error error;
00171
00172
00173 if (c->write_len > 0) {
00174 uint32_t *pwrite = c->write_buf;
00175
00176 while (c->write_len > 0) {
00177 UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
00178 if (error != PARSERUTILS_OK) {
00179 uint32_t len;
00180 assert(error == PARSERUTILS_NOMEM);
00181
00182
00183 for (len = 0; len < c->write_len; len++) {
00184 c->write_buf[len] = pwrite[len];
00185 }
00186
00187 return PARSERUTILS_NOMEM;
00188 }
00189
00190 pwrite++;
00191 c->write_len--;
00192 }
00193 }
00194
00195
00196 while (*sourcelen > 0) {
00197 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00198 towrite = &ucs4;
00199 towritelen = 1;
00200
00201
00202 while (towritelen > 0) {
00203 UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
00204 if (error != PARSERUTILS_OK) {
00205 uint32_t len;
00206 assert(error == PARSERUTILS_NOMEM);
00207
00208
00209 assert(towritelen < WRITE_BUFSIZE);
00210
00211 c->write_len = towritelen;
00212
00213
00214
00215 for (len = 0; len < towritelen; len++)
00216 c->write_buf[len] = towrite[len];
00217
00218
00219
00220 *source += 4;
00221 *sourcelen -= 4;
00222
00223 return PARSERUTILS_NOMEM;
00224 }
00225
00226 towrite++;
00227 towritelen--;
00228 }
00229
00230 *source += 4;
00231 *sourcelen -= 4;
00232 }
00233
00234 return PARSERUTILS_OK;
00235 }
00236
00278 parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
00279 const uint8_t **source, size_t *sourcelen,
00280 uint8_t **dest, size_t *destlen)
00281 {
00282 charset_utf8_codec *c = (charset_utf8_codec *) codec;
00283 parserutils_error error;
00284
00285 if (c->read_len > 0) {
00286
00287 uint32_t *pread = c->read_buf;
00288
00289 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00290 *((uint32_t *) (void *) *dest) =
00291 endian_host_to_big(pread[0]);
00292
00293 *dest += 4;
00294 *destlen -= 4;
00295
00296 pread++;
00297 c->read_len--;
00298 }
00299
00300 if (*destlen < c->read_len * 4) {
00301
00302 size_t i;
00303
00304
00305 for (i = 0; i < c->read_len; i++)
00306 c->read_buf[i] = pread[i];
00307
00308 return PARSERUTILS_NOMEM;
00309 }
00310 }
00311
00312 if (c->inval_len > 0) {
00313
00314
00315
00316 uint8_t *in = c->inval_buf;
00317 size_t ol = c->inval_len;
00318 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
00319 size_t orig_l = l;
00320
00321 memcpy(c->inval_buf + ol, *source, l);
00322
00323 l += c->inval_len;
00324
00325 error = charset_utf8_codec_read_char(c,
00326 (const uint8_t **) &in, &l, dest, destlen);
00327 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
00328 return error;
00329 }
00330
00331
00332 *source += max((signed) (orig_l - l), 0);
00333 *sourcelen -= max((signed) (orig_l - l), 0);
00334
00335
00336
00337
00338 assert((orig_l + ol) - l != 0);
00339
00340
00341 if (error != PARSERUTILS_OK)
00342 return error;
00343 }
00344
00345
00346 while (*sourcelen > 0) {
00347 error = charset_utf8_codec_read_char(c,
00348 source, sourcelen, dest, destlen);
00349 if (error != PARSERUTILS_OK) {
00350 return error;
00351 }
00352 }
00353
00354 return PARSERUTILS_OK;
00355 }
00356
00363 parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
00364 {
00365 charset_utf8_codec *c = (charset_utf8_codec *) codec;
00366
00367 c->inval_buf[0] = '\0';
00368 c->inval_len = 0;
00369
00370 c->read_buf[0] = 0;
00371 c->read_len = 0;
00372
00373 c->write_buf[0] = 0;
00374 c->write_len = 0;
00375
00376 return PARSERUTILS_OK;
00377 }
00378
00379
00408 parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
00409 const uint8_t **source, size_t *sourcelen,
00410 uint8_t **dest, size_t *destlen)
00411 {
00412 uint32_t ucs4;
00413 size_t sucs4;
00414 parserutils_error error;
00415
00416
00417 {
00418 const uint8_t *src = *source;
00419 size_t srclen = *sourcelen;
00420 uint32_t *uptr = &ucs4;
00421 size_t *usptr = &sucs4;
00422 UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
00423 }
00424 if (error == PARSERUTILS_OK) {
00425
00426 error = charset_utf8_codec_output_decoded_char(c,
00427 ucs4, dest, destlen);
00428 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00429
00430 *source += sucs4;
00431 *sourcelen -= sucs4;
00432 }
00433
00434
00435 c->inval_buf[0] = '\0';
00436 c->inval_len = 0;
00437
00438 return error;
00439 } else if (error == PARSERUTILS_NEEDDATA) {
00440
00441 assert(*sourcelen < INVAL_BUFSIZE);
00442
00443 memmove(c->inval_buf, *source, *sourcelen);
00444 c->inval_buf[*sourcelen] = '\0';
00445 c->inval_len = *sourcelen;
00446
00447 *source += *sourcelen;
00448 *sourcelen = 0;
00449
00450 return PARSERUTILS_OK;
00451 } else if (error == PARSERUTILS_INVALID) {
00452
00453 uint32_t nextchar;
00454
00455
00456 if (c->base.errormode ==
00457 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00458
00459 c->inval_buf[0] = '\0';
00460 c->inval_len = 0;
00461
00462 return PARSERUTILS_INVALID;
00463 }
00464
00465
00466
00467
00468 {
00469 const uint8_t *src = *source;
00470 size_t srclen = *sourcelen;
00471 uint32_t off = 0;
00472 uint32_t *ncptr = &nextchar;
00473
00474 UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
00475 }
00476 if (error != PARSERUTILS_OK) {
00477 if (error == PARSERUTILS_NEEDDATA) {
00478
00479 assert(*sourcelen < INVAL_BUFSIZE);
00480
00481 memmove(c->inval_buf, *source, *sourcelen);
00482 c->inval_buf[*sourcelen] = '\0';
00483 c->inval_len = *sourcelen;
00484
00485 *source += *sourcelen;
00486 *sourcelen = 0;
00487
00488 nextchar = 0;
00489 } else {
00490 return error;
00491 }
00492 }
00493
00494
00495 c->inval_buf[0] = '\0';
00496 c->inval_len = 0;
00497
00498
00499 error = charset_utf8_codec_output_decoded_char(c,
00500 0xFFFD, dest, destlen);
00501 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00502
00503 *source += nextchar;
00504 *sourcelen -= nextchar;
00505 }
00506
00507 return error;
00508 }
00509
00510 return PARSERUTILS_OK;
00511 }
00512
00523 parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
00524 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00525 {
00526 if (*destlen < 4) {
00527
00528 c->read_len = 1;
00529 c->read_buf[0] = ucs4;
00530
00531 return PARSERUTILS_NOMEM;
00532 }
00533
00534 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00535 *dest += 4;
00536 *destlen -= 4;
00537
00538 return PARSERUTILS_OK;
00539 }
00540
00541
00542 const parserutils_charset_handler charset_utf8_codec_handler = {
00543 charset_utf8_codec_handles_charset,
00544 charset_utf8_codec_create
00545 };
00546