00001
00002
00003
00004
00005
00006
00007
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011
00012 #include <parserutils/charset/mibenum.h>
00013 #include <parserutils/charset/utf8.h>
00014 #include <parserutils/input/inputstream.h>
00015
00016 #include "input/filter.h"
00017 #include "utils/utils.h"
00018
00022 typedef struct parserutils_inputstream_private {
00023 parserutils_inputstream public;
00025 parserutils_buffer *raw;
00027 bool done_first_chunk;
00030 uint16_t mibenum;
00031 uint32_t encsrc;
00033 parserutils_filter *input;
00035 parserutils_charset_detect_func csdetect;
00036 } parserutils_inputstream_private;
00037
00038 static inline parserutils_error parserutils_inputstream_refill_buffer(
00039 parserutils_inputstream_private *stream);
00040 static inline parserutils_error parserutils_inputstream_strip_bom(
00041 uint16_t *mibenum, parserutils_buffer *buffer);
00042
00059 parserutils_error parserutils_inputstream_create(const char *enc,
00060 uint32_t encsrc, parserutils_charset_detect_func csdetect,
00061 parserutils_inputstream **stream)
00062 {
00063 parserutils_inputstream_private *s;
00064 parserutils_error error;
00065
00066 if (stream == NULL)
00067 return PARSERUTILS_BADPARM;
00068
00069 s = malloc(sizeof(parserutils_inputstream_private));
00070 if (s == NULL)
00071 return PARSERUTILS_NOMEM;
00072
00073 error = parserutils_buffer_create(&s->raw);
00074 if (error != PARSERUTILS_OK) {
00075 free(s);
00076 return error;
00077 }
00078
00079 error = parserutils_buffer_create(&s->public.utf8);
00080 if (error != PARSERUTILS_OK) {
00081 parserutils_buffer_destroy(s->raw);
00082 free(s);
00083 return error;
00084 }
00085
00086 s->public.cursor = 0;
00087 s->public.had_eof = false;
00088 s->done_first_chunk = false;
00089
00090 error = parserutils__filter_create("UTF-8", &s->input);
00091 if (error != PARSERUTILS_OK) {
00092 parserutils_buffer_destroy(s->public.utf8);
00093 parserutils_buffer_destroy(s->raw);
00094 free(s);
00095 return error;
00096 }
00097
00098 if (enc != NULL) {
00099 parserutils_filter_optparams params;
00100
00101 s->mibenum =
00102 parserutils_charset_mibenum_from_name(enc, strlen(enc));
00103
00104 if (s->mibenum == 0) {
00105 parserutils__filter_destroy(s->input);
00106 parserutils_buffer_destroy(s->public.utf8);
00107 parserutils_buffer_destroy(s->raw);
00108 free(s);
00109 return PARSERUTILS_BADENCODING;
00110 }
00111
00112 params.encoding.name = enc;
00113
00114 error = parserutils__filter_setopt(s->input,
00115 PARSERUTILS_FILTER_SET_ENCODING,
00116 ¶ms);
00117 if (error != PARSERUTILS_OK) {
00118 parserutils__filter_destroy(s->input);
00119 parserutils_buffer_destroy(s->public.utf8);
00120 parserutils_buffer_destroy(s->raw);
00121 free(s);
00122 return error;
00123 }
00124
00125 s->encsrc = encsrc;
00126 } else {
00127 s->mibenum = 0;
00128 s->encsrc = 0;
00129 }
00130
00131 s->csdetect = csdetect;
00132
00133 *stream = (parserutils_inputstream *) s;
00134
00135 return PARSERUTILS_OK;
00136 }
00137
00144 parserutils_error parserutils_inputstream_destroy(
00145 parserutils_inputstream *stream)
00146 {
00147 parserutils_inputstream_private *s =
00148 (parserutils_inputstream_private *) stream;
00149
00150 if (stream == NULL)
00151 return PARSERUTILS_BADPARM;
00152
00153 parserutils__filter_destroy(s->input);
00154 parserutils_buffer_destroy(s->public.utf8);
00155 parserutils_buffer_destroy(s->raw);
00156 free(s);
00157
00158 return PARSERUTILS_OK;
00159 }
00160
00169 parserutils_error parserutils_inputstream_append(
00170 parserutils_inputstream *stream,
00171 const uint8_t *data, size_t len)
00172 {
00173 parserutils_inputstream_private *s =
00174 (parserutils_inputstream_private *) stream;
00175
00176 if (stream == NULL)
00177 return PARSERUTILS_BADPARM;
00178
00179 if (data == NULL) {
00180 s->public.had_eof = true;
00181 return PARSERUTILS_OK;
00182 }
00183
00184 return parserutils_buffer_append(s->raw, data, len);
00185 }
00186
00195 parserutils_error parserutils_inputstream_insert(
00196 parserutils_inputstream *stream,
00197 const uint8_t *data, size_t len)
00198 {
00199 parserutils_inputstream_private *s =
00200 (parserutils_inputstream_private *) stream;
00201
00202 if (stream == NULL || data == NULL)
00203 return PARSERUTILS_BADPARM;
00204
00205 return parserutils_buffer_insert(s->public.utf8, s->public.cursor,
00206 data, len);
00207 }
00208
00209 #define IS_ASCII(x) (((x) & 0x80) == 0)
00210
00232 parserutils_error parserutils_inputstream_peek_slow(
00233 parserutils_inputstream *stream,
00234 size_t offset, const uint8_t **ptr, size_t *length)
00235 {
00236 parserutils_inputstream_private *s =
00237 (parserutils_inputstream_private *) stream;
00238 parserutils_error error = PARSERUTILS_OK;
00239 size_t len;
00240
00241 if (stream == NULL || ptr == NULL || length == NULL)
00242 return PARSERUTILS_BADPARM;
00243
00244
00245 if (s->raw->length == 0) {
00246
00247 return s->public.had_eof ? PARSERUTILS_EOF
00248 : PARSERUTILS_NEEDDATA;
00249 }
00250
00251
00252 error = parserutils_inputstream_refill_buffer(s);
00253 if (error != PARSERUTILS_OK)
00254 return error;
00255
00256
00257 if (s->public.cursor + offset == s->public.utf8->length)
00258 return PARSERUTILS_NEEDDATA;
00259
00260
00261 if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
00262 len = 1;
00263 } else {
00264 error = parserutils_charset_utf8_char_byte_length(
00265 s->public.utf8->data + s->public.cursor + offset,
00266 &len);
00267
00268 if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
00269 return error;
00270
00271 if (error == PARSERUTILS_NEEDDATA) {
00272 return s->public.had_eof ? PARSERUTILS_EOF
00273 : PARSERUTILS_NEEDDATA;
00274 }
00275 }
00276
00277 (*length) = len;
00278 (*ptr) = (s->public.utf8->data + s->public.cursor + offset);
00279
00280 return PARSERUTILS_OK;
00281 }
00282
00283 #undef IS_ASCII
00284
00292 const char *parserutils_inputstream_read_charset(
00293 parserutils_inputstream *stream, uint32_t *source)
00294 {
00295 parserutils_inputstream_private *s =
00296 (parserutils_inputstream_private *) stream;
00297
00298 if (stream == NULL || source == NULL)
00299 return NULL;
00300
00301 *source = s->encsrc;
00302
00303 if (s->encsrc == 0)
00304 return "UTF-8";
00305
00306 return parserutils_charset_mibenum_to_name(s->mibenum);
00307 }
00308
00321 parserutils_error parserutils_inputstream_change_charset(
00322 parserutils_inputstream *stream,
00323 const char *enc, uint32_t source)
00324 {
00325 parserutils_inputstream_private *s =
00326 (parserutils_inputstream_private *) stream;
00327 parserutils_filter_optparams params;
00328 uint16_t temp;
00329 parserutils_error error;
00330
00331 if (stream == NULL || enc == NULL)
00332 return PARSERUTILS_BADPARM;
00333
00334 if (s->done_first_chunk)
00335 return PARSERUTILS_INVALID;
00336
00337 temp = parserutils_charset_mibenum_from_name(enc, strlen(enc));
00338 if (temp == 0)
00339 return PARSERUTILS_BADENCODING;
00340
00341
00342 params.encoding.name = enc;
00343 error = parserutils__filter_setopt(s->input,
00344 PARSERUTILS_FILTER_SET_ENCODING,
00345 ¶ms);
00346 if (error != PARSERUTILS_OK)
00347 return error;
00348
00349
00350 s->mibenum = temp;
00351 s->encsrc = source;
00352
00353 return PARSERUTILS_OK;
00354 }
00355
00356
00357
00358
00365 parserutils_error parserutils_inputstream_refill_buffer(
00366 parserutils_inputstream_private *stream)
00367 {
00368 const uint8_t *raw;
00369 uint8_t *utf8;
00370 size_t raw_length, utf8_space;
00371 parserutils_error error;
00372
00373
00374
00375 if (stream->done_first_chunk == false) {
00376 parserutils_filter_optparams params;
00377
00378
00379
00380
00381 if (stream->csdetect != NULL) {
00382 error = stream->csdetect(stream->raw->data,
00383 stream->raw->length,
00384 &stream->mibenum, &stream->encsrc);
00385 if (error != PARSERUTILS_OK) {
00386 if (error != PARSERUTILS_NEEDDATA ||
00387 stream->public.had_eof == false)
00388 return error;
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398 }
00399 }
00400
00401
00402
00403
00404
00405
00406
00407
00408 if (stream->mibenum == 0) {
00409 stream->mibenum =
00410 parserutils_charset_mibenum_from_name("UTF-8",
00411 SLEN("UTF-8"));
00412 stream->encsrc = 0;
00413 }
00414
00415 assert(stream->mibenum != 0);
00416
00417
00418 error = parserutils_inputstream_strip_bom(&stream->mibenum,
00419 stream->raw);
00420 if (error != PARSERUTILS_OK)
00421 return error;
00422
00423
00424 params.encoding.name =
00425 parserutils_charset_mibenum_to_name(stream->mibenum);
00426
00427 error = parserutils__filter_setopt(stream->input,
00428 PARSERUTILS_FILTER_SET_ENCODING,
00429 ¶ms);
00430 if (error != PARSERUTILS_OK)
00431 return error;
00432
00433 stream->done_first_chunk = true;
00434 }
00435
00436
00437 if (stream->public.cursor == stream->public.utf8->length) {
00438
00439 utf8 = stream->public.utf8->data;
00440 utf8_space = stream->public.utf8->allocated;
00441 } else {
00442
00443
00444
00445 memmove(stream->public.utf8->data,
00446 stream->public.utf8->data + stream->public.cursor,
00447 stream->public.utf8->length - stream->public.cursor);
00448
00449 stream->public.utf8->length -= stream->public.cursor;
00450
00451 if (stream->public.utf8->length >
00452 stream->public.utf8->allocated / 2) {
00453 error = parserutils_buffer_grow(stream->public.utf8);
00454 if (error != PARSERUTILS_OK)
00455 return error;
00456 }
00457
00458 utf8 = stream->public.utf8->data + stream->public.utf8->length;
00459 utf8_space = stream->public.utf8->allocated -
00460 stream->public.utf8->length;
00461 }
00462
00463 raw = stream->raw->data;
00464 raw_length = stream->raw->length;
00465
00466
00467 error = parserutils__filter_process_chunk(stream->input,
00468 &raw, &raw_length, &utf8, &utf8_space);
00469
00470
00471 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
00472 return error;
00473
00474
00475 error = parserutils_buffer_discard(stream->raw, 0,
00476 stream->raw->length - raw_length);
00477 if (error != PARSERUTILS_OK)
00478 return error;
00479
00480
00481 stream->public.utf8->length =
00482 stream->public.utf8->allocated - utf8_space;
00483
00484
00485 stream->public.cursor = 0;
00486
00487 return PARSERUTILS_OK;
00488 }
00489
00496 parserutils_error parserutils_inputstream_strip_bom(uint16_t *mibenum,
00497 parserutils_buffer *buffer)
00498 {
00499 static uint16_t utf8;
00500 static uint16_t utf16;
00501 static uint16_t utf16be;
00502 static uint16_t utf16le;
00503 static uint16_t utf32;
00504 static uint16_t utf32be;
00505 static uint16_t utf32le;
00506
00507 if (utf8 == 0) {
00508 utf8 = parserutils_charset_mibenum_from_name("UTF-8",
00509 SLEN("UTF-8"));
00510 utf16 = parserutils_charset_mibenum_from_name("UTF-16",
00511 SLEN("UTF-16"));
00512 utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
00513 SLEN("UTF-16BE"));
00514 utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
00515 SLEN("UTF-16LE"));
00516 utf32 = parserutils_charset_mibenum_from_name("UTF-32",
00517 SLEN("UTF-32"));
00518 utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
00519 SLEN("UTF-32BE"));
00520 utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
00521 SLEN("UTF-32LE"));
00522 }
00523
00524 #define UTF32_BOM_LEN (4)
00525 #define UTF16_BOM_LEN (2)
00526 #define UTF8_BOM_LEN (3)
00527
00528 if (*mibenum == utf8) {
00529 if (buffer->length >= UTF8_BOM_LEN &&
00530 buffer->data[0] == 0xEF &&
00531 buffer->data[1] == 0xBB &&
00532 buffer->data[2] == 0xBF) {
00533 return parserutils_buffer_discard(
00534 buffer, 0, UTF8_BOM_LEN);
00535 }
00536 } else if (*mibenum == utf16be) {
00537 if (buffer->length >= UTF16_BOM_LEN &&
00538 buffer->data[0] == 0xFE &&
00539 buffer->data[1] == 0xFF) {
00540 return parserutils_buffer_discard(
00541 buffer, 0, UTF16_BOM_LEN);
00542 }
00543 } else if (*mibenum == utf16le) {
00544 if (buffer->length >= UTF16_BOM_LEN &&
00545 buffer->data[0] == 0xFF &&
00546 buffer->data[1] == 0xFE) {
00547 return parserutils_buffer_discard(
00548 buffer, 0, UTF16_BOM_LEN);
00549 }
00550 } else if (*mibenum == utf16) {
00551 *mibenum = utf16be;
00552
00553 if (buffer->length >= UTF16_BOM_LEN) {
00554 if (buffer->data[0] == 0xFE &&
00555 buffer->data[1] == 0xFF) {
00556 return parserutils_buffer_discard(
00557 buffer, 0, UTF16_BOM_LEN);
00558 } else if (buffer->data[0] == 0xFF &&
00559 buffer->data[1] == 0xFE) {
00560 *mibenum = utf16le;
00561 return parserutils_buffer_discard(
00562 buffer, 0, UTF16_BOM_LEN);
00563 }
00564 }
00565 } else if (*mibenum == utf32be) {
00566 if (buffer->length >= UTF32_BOM_LEN &&
00567 buffer->data[0] == 0x00 &&
00568 buffer->data[1] == 0x00 &&
00569 buffer->data[2] == 0xFE &&
00570 buffer->data[3] == 0xFF) {
00571 return parserutils_buffer_discard(
00572 buffer, 0, UTF32_BOM_LEN);
00573 }
00574 } else if (*mibenum == utf32le) {
00575 if (buffer->length >= UTF32_BOM_LEN &&
00576 buffer->data[0] == 0xFF &&
00577 buffer->data[1] == 0xFE &&
00578 buffer->data[2] == 0x00 &&
00579 buffer->data[3] == 0x00) {
00580 return parserutils_buffer_discard(
00581 buffer, 0, UTF32_BOM_LEN);
00582 }
00583 } else if (*mibenum == utf32) {
00584 *mibenum = utf32be;
00585
00586 if (buffer->length >= UTF32_BOM_LEN) {
00587 if (buffer->data[0] == 0x00 &&
00588 buffer->data[1] == 0x00 &&
00589 buffer->data[2] == 0xFE &&
00590 buffer->data[3] == 0xFF) {
00591 return parserutils_buffer_discard(
00592 buffer, 0, UTF32_BOM_LEN);
00593 } else if (buffer->data[0] == 0xFF &&
00594 buffer->data[1] == 0xFE &&
00595 buffer->data[2] == 0x00 &&
00596 buffer->data[3] == 0x00) {
00597 *mibenum = utf32le;
00598 return parserutils_buffer_discard(
00599 buffer, 0, UTF32_BOM_LEN);
00600 }
00601 }
00602 }
00603
00604 #undef UTF8_BOM_LEN
00605 #undef UTF16_BOM_LEN
00606 #undef UTF32_BOM_LEN
00607
00608 return PARSERUTILS_OK;
00609 }
00610