00001
00002
00003
00004
00005
00006
00007
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011
00012 #include <parserutils/charset/mibenum.h>
00013 #include <parserutils/charset/utf16.h>
00014
00015 #include "charset/codecs/codec_impl.h"
00016 #include "utils/endian.h"
00017 #include "utils/utils.h"
00018
00022 typedef struct charset_utf16_codec {
00023 parserutils_charset_codec base;
00025 #define INVAL_BUFSIZE (32)
00026 uint8_t inval_buf[INVAL_BUFSIZE];
00029 size_t inval_len;
00030
00031 #define READ_BUFSIZE (8)
00032 uint32_t read_buf[READ_BUFSIZE];
00035 size_t read_len;
00037 #define WRITE_BUFSIZE (8)
00038 uint32_t write_buf[WRITE_BUFSIZE];
00041 size_t write_len;
00043 } charset_utf16_codec;
00044
00045 static bool charset_utf16_codec_handles_charset(const char *charset);
00046 static parserutils_error charset_utf16_codec_create(const char *charset,
00047 parserutils_charset_codec **codec);
00048 static parserutils_error charset_utf16_codec_destroy(
00049 parserutils_charset_codec *codec);
00050 static parserutils_error charset_utf16_codec_encode(
00051 parserutils_charset_codec *codec,
00052 const uint8_t **source, size_t *sourcelen,
00053 uint8_t **dest, size_t *destlen);
00054 static parserutils_error charset_utf16_codec_decode(
00055 parserutils_charset_codec *codec,
00056 const uint8_t **source, size_t *sourcelen,
00057 uint8_t **dest, size_t *destlen);
00058 static parserutils_error charset_utf16_codec_reset(
00059 parserutils_charset_codec *codec);
00060 static inline parserutils_error charset_utf16_codec_read_char(
00061 charset_utf16_codec *c,
00062 const uint8_t **source, size_t *sourcelen,
00063 uint8_t **dest, size_t *destlen);
00064 static inline parserutils_error charset_utf16_codec_output_decoded_char(
00065 charset_utf16_codec *c,
00066 uint32_t ucs4, uint8_t **dest, size_t *destlen);
00067
/**
 * Determine whether this codec handles a given charset.
 *
 * The charset name is resolved to a MIB enum value and compared with the
 * value that the name "UTF-16" resolves to.
 *
 * \param charset  NUL-terminated charset name (must not be NULL: its
 *                 length is taken with strlen)
 * \return true if both names resolve to the same MIB enum, false otherwise
 */
bool charset_utf16_codec_handles_charset(const char *charset)
{
	const size_t name_len = strlen(charset);

	return parserutils_charset_mibenum_from_name(charset, name_len) ==
			parserutils_charset_mibenum_from_name(
					"UTF-16", SLEN("UTF-16"));
}
00080
00090 parserutils_error charset_utf16_codec_create(const char *charset,
00091 parserutils_charset_codec **codec)
00092 {
00093 charset_utf16_codec *c;
00094
00095 UNUSED(charset);
00096
00097 c = malloc(sizeof(charset_utf16_codec));
00098 if (c == NULL)
00099 return PARSERUTILS_NOMEM;
00100
00101 c->inval_buf[0] = '\0';
00102 c->inval_len = 0;
00103
00104 c->read_buf[0] = 0;
00105 c->read_len = 0;
00106
00107 c->write_buf[0] = 0;
00108 c->write_len = 0;
00109
00110
00111 c->base.handler.destroy = charset_utf16_codec_destroy;
00112 c->base.handler.encode = charset_utf16_codec_encode;
00113 c->base.handler.decode = charset_utf16_codec_decode;
00114 c->base.handler.reset = charset_utf16_codec_reset;
00115
00116 *codec = (parserutils_charset_codec *) c;
00117
00118 return PARSERUTILS_OK;
00119 }
00120
00127 parserutils_error charset_utf16_codec_destroy (parserutils_charset_codec *codec)
00128 {
00129 UNUSED(codec);
00130
00131 return PARSERUTILS_OK;
00132 }
00133
00161 parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec,
00162 const uint8_t **source, size_t *sourcelen,
00163 uint8_t **dest, size_t *destlen)
00164 {
00165 charset_utf16_codec *c = (charset_utf16_codec *) codec;
00166 uint32_t ucs4;
00167 uint32_t *towrite;
00168 size_t towritelen;
00169 parserutils_error error;
00170
00171
00172 if (c->write_len > 0) {
00173 uint32_t *pwrite = c->write_buf;
00174 uint8_t buf[4];
00175 size_t len;
00176
00177 while (c->write_len > 0) {
00178 error = parserutils_charset_utf16_from_ucs4(
00179 pwrite[0], buf, &len);
00180 assert(error == PARSERUTILS_OK);
00181
00182 if (*destlen < len) {
00183
00184 for (len = 0; len < c->write_len; len++)
00185 c->write_buf[len] = pwrite[len];
00186
00187 return PARSERUTILS_NOMEM;
00188 }
00189
00190 memcpy(*dest, buf, len);
00191
00192 *dest += len;
00193 *destlen -= len;
00194
00195 pwrite++;
00196 c->write_len--;
00197 }
00198 }
00199
00200
00201 while (*sourcelen > 0) {
00202 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
00203 towrite = &ucs4;
00204 towritelen = 1;
00205
00206
00207 while (towritelen > 0) {
00208 uint8_t buf[4];
00209 size_t len;
00210
00211 error = parserutils_charset_utf16_from_ucs4(
00212 towrite[0], buf, &len);
00213 assert(error == PARSERUTILS_OK);
00214
00215 if (*destlen < len) {
00216
00217 assert(towritelen < WRITE_BUFSIZE);
00218
00219 c->write_len = towritelen;
00220
00221
00222
00223 for (len = 0; len < towritelen; len++)
00224 c->write_buf[len] = towrite[len];
00225
00226
00227
00228 *source += 4;
00229 *sourcelen -= 4;
00230
00231 return PARSERUTILS_NOMEM;
00232 }
00233
00234 memcpy(*dest, buf, len);
00235
00236 *dest += len;
00237 *destlen -= len;
00238
00239 towrite++;
00240 towritelen--;
00241 }
00242
00243 *source += 4;
00244 *sourcelen -= 4;
00245 }
00246
00247 (void) error;
00248
00249 return PARSERUTILS_OK;
00250 }
00251
00293 parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec,
00294 const uint8_t **source, size_t *sourcelen,
00295 uint8_t **dest, size_t *destlen)
00296 {
00297 charset_utf16_codec *c = (charset_utf16_codec *) codec;
00298 parserutils_error error;
00299
00300 if (c->read_len > 0) {
00301
00302 uint32_t *pread = c->read_buf;
00303
00304 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
00305 *((uint32_t *) (void *) *dest) =
00306 endian_host_to_big(pread[0]);
00307
00308 *dest += 4;
00309 *destlen -= 4;
00310
00311 pread++;
00312 c->read_len--;
00313 }
00314
00315 if (*destlen < c->read_len * 4) {
00316
00317 size_t i;
00318
00319
00320 for (i = 0; i < c->read_len; i++)
00321 c->read_buf[i] = pread[i];
00322
00323 return PARSERUTILS_NOMEM;
00324 }
00325 }
00326
00327 if (c->inval_len > 0) {
00328
00329
00330
00331 uint8_t *in = c->inval_buf;
00332 size_t ol = c->inval_len;
00333 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
00334 size_t orig_l = l;
00335
00336 memcpy(c->inval_buf + ol, *source, l);
00337
00338 l += c->inval_len;
00339
00340 error = charset_utf16_codec_read_char(c,
00341 (const uint8_t **) &in, &l, dest, destlen);
00342 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
00343 return error;
00344 }
00345
00346
00347 *source += max((signed) (orig_l - l), 0);
00348 *sourcelen -= max((signed) (orig_l - l), 0);
00349
00350
00351
00352
00353 assert((orig_l + ol) - l != 0);
00354
00355
00356 if (error != PARSERUTILS_OK)
00357 return error;
00358 }
00359
00360
00361 while (*sourcelen > 0) {
00362 error = charset_utf16_codec_read_char(c,
00363 source, sourcelen, dest, destlen);
00364 if (error != PARSERUTILS_OK) {
00365 return error;
00366 }
00367 }
00368
00369 return PARSERUTILS_OK;
00370 }
00371
00378 parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
00379 {
00380 charset_utf16_codec *c = (charset_utf16_codec *) codec;
00381
00382 c->inval_buf[0] = '\0';
00383 c->inval_len = 0;
00384
00385 c->read_buf[0] = 0;
00386 c->read_len = 0;
00387
00388 c->write_buf[0] = 0;
00389 c->write_len = 0;
00390
00391 return PARSERUTILS_OK;
00392 }
00393
00394
00423 parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c,
00424 const uint8_t **source, size_t *sourcelen,
00425 uint8_t **dest, size_t *destlen)
00426 {
00427 uint32_t ucs4;
00428 size_t sucs4;
00429 parserutils_error error;
00430
00431
00432 error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
00433 &ucs4, &sucs4);
00434 if (error == PARSERUTILS_OK) {
00435
00436 error = charset_utf16_codec_output_decoded_char(c,
00437 ucs4, dest, destlen);
00438 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00439
00440 *source += sucs4;
00441 *sourcelen -= sucs4;
00442 }
00443
00444
00445 c->inval_buf[0] = '\0';
00446 c->inval_len = 0;
00447
00448 return error;
00449 } else if (error == PARSERUTILS_NEEDDATA) {
00450
00451 assert(*sourcelen < INVAL_BUFSIZE);
00452
00453 memmove(c->inval_buf, *source, *sourcelen);
00454 c->inval_buf[*sourcelen] = '\0';
00455 c->inval_len = *sourcelen;
00456
00457 *source += *sourcelen;
00458 *sourcelen = 0;
00459
00460 return PARSERUTILS_OK;
00461 } else if (error == PARSERUTILS_INVALID) {
00462
00463 uint32_t nextchar;
00464
00465
00466 c->inval_buf[0] = '\0';
00467 c->inval_len = 0;
00468
00469
00470 if (c->base.errormode ==
00471 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
00472 return PARSERUTILS_INVALID;
00473 }
00474
00475
00476
00477
00478 error = parserutils_charset_utf16_next_paranoid(
00479 *source, *sourcelen, 0, &nextchar);
00480 if (error != PARSERUTILS_OK) {
00481 if (error == PARSERUTILS_NEEDDATA) {
00482
00483 assert(*sourcelen < INVAL_BUFSIZE);
00484
00485 memmove(c->inval_buf, *source, *sourcelen);
00486 c->inval_buf[*sourcelen] = '\0';
00487 c->inval_len = *sourcelen;
00488
00489 *source += *sourcelen;
00490 *sourcelen = 0;
00491
00492 nextchar = 0;
00493 } else {
00494 return error;
00495 }
00496 }
00497
00498
00499 error = charset_utf16_codec_output_decoded_char(c,
00500 0xFFFD, dest, destlen);
00501 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
00502
00503 *source += nextchar;
00504 *sourcelen -= nextchar;
00505 }
00506
00507 return error;
00508 }
00509
00510 return PARSERUTILS_OK;
00511 }
00512
00523 parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c,
00524 uint32_t ucs4, uint8_t **dest, size_t *destlen)
00525 {
00526 if (*destlen < 4) {
00527
00528 c->read_len = 1;
00529 c->read_buf[0] = ucs4;
00530
00531 return PARSERUTILS_NOMEM;
00532 }
00533
00534 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
00535 *dest += 4;
00536 *destlen -= 4;
00537
00538 return PARSERUTILS_OK;
00539 }
00540
00541
00542 const parserutils_charset_handler charset_utf16_codec_handler = {
00543 charset_utf16_codec_handles_charset,
00544 charset_utf16_codec_create
00545 };