inputstream.c

Go to the documentation of this file.
00001 /*
00002  * This file is part of LibParserUtils.
00003  * Licensed under the MIT License,
00004  *                http://www.opensource.org/licenses/mit-license.php
00005  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
00006  */
00007 
00008 #include <assert.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011 
00012 #include <parserutils/charset/mibenum.h>
00013 #include <parserutils/charset/utf8.h>
00014 #include <parserutils/input/inputstream.h>
00015 
00016 #include "input/filter.h"
00017 #include "utils/utils.h"
00018 
00022 typedef struct parserutils_inputstream_private {
00023         parserutils_inputstream public; 
00025         parserutils_buffer *raw;        
00027         bool done_first_chunk;          
00030         uint16_t mibenum;               
00031         uint32_t encsrc;                
00033         parserutils_filter *input;      
00035         parserutils_charset_detect_func csdetect; 
00036 } parserutils_inputstream_private;
00037 
00038 static inline parserutils_error parserutils_inputstream_refill_buffer(
00039                 parserutils_inputstream_private *stream);
00040 static inline parserutils_error parserutils_inputstream_strip_bom(
00041                 uint16_t *mibenum, parserutils_buffer *buffer);
00042 
00059 parserutils_error parserutils_inputstream_create(const char *enc,
00060                 uint32_t encsrc, parserutils_charset_detect_func csdetect,
00061                 parserutils_inputstream **stream)
00062 {
00063         parserutils_inputstream_private *s;
00064         parserutils_error error;
00065 
00066         if (stream == NULL)
00067                 return PARSERUTILS_BADPARM;
00068 
00069         s = malloc(sizeof(parserutils_inputstream_private));
00070         if (s == NULL)
00071                 return PARSERUTILS_NOMEM;
00072 
00073         error = parserutils_buffer_create(&s->raw);
00074         if (error != PARSERUTILS_OK) {
00075                 free(s);
00076                 return error;
00077         }
00078 
00079         error = parserutils_buffer_create(&s->public.utf8);
00080         if (error != PARSERUTILS_OK) {
00081                 parserutils_buffer_destroy(s->raw);
00082                 free(s);
00083                 return error;
00084         }
00085 
00086         s->public.cursor = 0;
00087         s->public.had_eof = false;
00088         s->done_first_chunk = false;
00089 
00090         error = parserutils__filter_create("UTF-8", &s->input);
00091         if (error != PARSERUTILS_OK) {
00092                 parserutils_buffer_destroy(s->public.utf8);
00093                 parserutils_buffer_destroy(s->raw);
00094                 free(s);
00095                 return error;
00096         }
00097 
00098         if (enc != NULL) {
00099                 parserutils_filter_optparams params;
00100 
00101                 s->mibenum = 
00102                         parserutils_charset_mibenum_from_name(enc, strlen(enc));
00103 
00104                 if (s->mibenum == 0) {
00105                         parserutils__filter_destroy(s->input);
00106                         parserutils_buffer_destroy(s->public.utf8);
00107                         parserutils_buffer_destroy(s->raw);
00108                         free(s);
00109                         return PARSERUTILS_BADENCODING;
00110                 }
00111 
00112                 params.encoding.name = enc;
00113 
00114                 error = parserutils__filter_setopt(s->input,
00115                                 PARSERUTILS_FILTER_SET_ENCODING, 
00116                                 &params);
00117                 if (error != PARSERUTILS_OK) {
00118                         parserutils__filter_destroy(s->input);
00119                         parserutils_buffer_destroy(s->public.utf8);
00120                         parserutils_buffer_destroy(s->raw);
00121                         free(s);
00122                         return error;
00123                 }
00124 
00125                 s->encsrc = encsrc;
00126         } else {
00127                 s->mibenum = 0;
00128                 s->encsrc = 0;
00129         }
00130 
00131         s->csdetect = csdetect;
00132 
00133         *stream = (parserutils_inputstream *) s;
00134 
00135         return PARSERUTILS_OK;
00136 }
00137 
00144 parserutils_error parserutils_inputstream_destroy(
00145                 parserutils_inputstream *stream)
00146 {
00147         parserutils_inputstream_private *s = 
00148                         (parserutils_inputstream_private *) stream;
00149 
00150         if (stream == NULL)
00151                 return PARSERUTILS_BADPARM;
00152 
00153         parserutils__filter_destroy(s->input);
00154         parserutils_buffer_destroy(s->public.utf8);
00155         parserutils_buffer_destroy(s->raw);
00156         free(s);
00157 
00158         return PARSERUTILS_OK;
00159 }
00160 
00169 parserutils_error parserutils_inputstream_append(
00170                 parserutils_inputstream *stream, 
00171                 const uint8_t *data, size_t len)
00172 {
00173         parserutils_inputstream_private *s = 
00174                         (parserutils_inputstream_private *) stream;
00175 
00176         if (stream == NULL)
00177                 return PARSERUTILS_BADPARM;
00178 
00179         if (data == NULL) {
00180                 s->public.had_eof = true;
00181                 return PARSERUTILS_OK;
00182         }
00183 
00184         return parserutils_buffer_append(s->raw, data, len);
00185 }
00186 
00195 parserutils_error parserutils_inputstream_insert(
00196                 parserutils_inputstream *stream,
00197                 const uint8_t *data, size_t len)
00198 {
00199         parserutils_inputstream_private *s = 
00200                         (parserutils_inputstream_private *) stream;
00201 
00202         if (stream == NULL || data == NULL)
00203                 return PARSERUTILS_BADPARM;
00204 
00205         return parserutils_buffer_insert(s->public.utf8, s->public.cursor, 
00206                         data, len);
00207 }
00208 
00209 #define IS_ASCII(x) (((x) & 0x80) == 0)
00210 
00232 parserutils_error parserutils_inputstream_peek_slow(
00233                 parserutils_inputstream *stream, 
00234                 size_t offset, const uint8_t **ptr, size_t *length)
00235 {
00236         parserutils_inputstream_private *s = 
00237                         (parserutils_inputstream_private *) stream;
00238         parserutils_error error = PARSERUTILS_OK;
00239         size_t len;
00240 
00241         if (stream == NULL || ptr == NULL || length == NULL)
00242                 return PARSERUTILS_BADPARM;
00243 
00244         /* There's insufficient data in the buffer, so read some more */
00245         if (s->raw->length == 0) {
00246                 /* No more data to be had */
00247                 return s->public.had_eof ? PARSERUTILS_EOF
00248                                          : PARSERUTILS_NEEDDATA;
00249         }
00250 
00251         /* Refill utf8 buffer from raw buffer */
00252         error = parserutils_inputstream_refill_buffer(s);
00253         if (error != PARSERUTILS_OK)
00254                 return error;
00255 
00256         /* Refill may have succeeded, but not actually produced any new data */
00257         if (s->public.cursor + offset == s->public.utf8->length)
00258                 return PARSERUTILS_NEEDDATA;
00259 
00260         /* Now try the read */
00261         if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
00262                 len = 1;
00263         } else {
00264                 error = parserutils_charset_utf8_char_byte_length(
00265                         s->public.utf8->data + s->public.cursor + offset,
00266                         &len);
00267 
00268                 if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
00269                         return error;
00270 
00271                 if (error == PARSERUTILS_NEEDDATA) {
00272                         return s->public.had_eof ? PARSERUTILS_EOF
00273                                                  : PARSERUTILS_NEEDDATA;
00274                 }
00275         }
00276 
00277         (*length) = len;
00278         (*ptr) = (s->public.utf8->data + s->public.cursor + offset);
00279 
00280         return PARSERUTILS_OK;
00281 }
00282 
00283 #undef IS_ASCII
00284 
00292 const char *parserutils_inputstream_read_charset(
00293                 parserutils_inputstream *stream, uint32_t *source)
00294 {
00295         parserutils_inputstream_private *s = 
00296                         (parserutils_inputstream_private *) stream;
00297 
00298         if (stream == NULL || source == NULL)
00299                 return NULL;
00300 
00301         *source = s->encsrc;
00302 
00303         if (s->encsrc == 0)
00304                 return "UTF-8";
00305 
00306         return parserutils_charset_mibenum_to_name(s->mibenum);
00307 }
00308 
00321 parserutils_error parserutils_inputstream_change_charset(
00322                 parserutils_inputstream *stream, 
00323                 const char *enc, uint32_t source)
00324 {
00325         parserutils_inputstream_private *s =
00326                         (parserutils_inputstream_private *) stream;
00327         parserutils_filter_optparams params;
00328         uint16_t temp;
00329         parserutils_error error;
00330 
00331         if (stream == NULL || enc == NULL)
00332                 return PARSERUTILS_BADPARM;
00333 
00334         if (s->done_first_chunk)
00335                 return PARSERUTILS_INVALID;
00336 
00337         temp = parserutils_charset_mibenum_from_name(enc, strlen(enc));
00338         if (temp == 0)
00339                 return PARSERUTILS_BADENCODING;
00340 
00341         /* Ensure filter is using the correct encoding */
00342         params.encoding.name = enc;
00343         error = parserutils__filter_setopt(s->input,
00344                         PARSERUTILS_FILTER_SET_ENCODING, 
00345                         &params);
00346         if (error != PARSERUTILS_OK)
00347                 return error;
00348 
00349         /* Finally, replace the current settings */
00350         s->mibenum = temp;
00351         s->encsrc = source;
00352 
00353         return PARSERUTILS_OK;
00354 }
00355 
00356 /******************************************************************************
00357  ******************************************************************************/
00358 
00365 parserutils_error parserutils_inputstream_refill_buffer(
00366                 parserutils_inputstream_private *stream)
00367 {
00368         const uint8_t *raw;
00369         uint8_t *utf8;
00370         size_t raw_length, utf8_space;
00371         parserutils_error error;
00372 
00373         /* If this is the first chunk of data, we must detect the charset and
00374          * strip the BOM, if one exists */
00375         if (stream->done_first_chunk == false) {
00376                 parserutils_filter_optparams params;
00377 
00378                 /* If there is a charset detection routine, give it an 
00379                  * opportunity to override any charset specified when the
00380                  * inputstream was created */
00381                 if (stream->csdetect != NULL) {
00382                         error = stream->csdetect(stream->raw->data, 
00383                                 stream->raw->length,
00384                                 &stream->mibenum, &stream->encsrc);
00385                         if (error != PARSERUTILS_OK) {
00386                                 if (error != PARSERUTILS_NEEDDATA ||
00387                                                 stream->public.had_eof == false)
00388                                         return error;
00389 
00390                                 /* We don't have enough data to detect the 
00391                                  * input encoding, but we're not going to get 
00392                                  * any more as we've been notified of EOF. 
00393                                  * Therefore, leave the encoding alone
00394                                  * so that any charset specified when the
00395                                  * inputstream was created will be preserved.
00396                                  * If there was no charset specified, then
00397                                  * we'll default to UTF-8, below */
00398                         }
00399                 }
00400 
00401                 /* Default to UTF-8 if there is still no encoding information 
00402                  * We'll do this if there was no encoding specified up-front
00403                  * and:
00404                  *    1) there was no charset detection routine
00405                  * or 2) there was insufficient data for the charset 
00406                  *       detection routine to detect an encoding
00407                  */
00408                 if (stream->mibenum == 0) {
00409                         stream->mibenum = 
00410                                 parserutils_charset_mibenum_from_name("UTF-8", 
00411                                         SLEN("UTF-8"));
00412                         stream->encsrc = 0;
00413                 }
00414 
00415                 assert(stream->mibenum != 0);
00416 
00417                 /* Strip any BOM, and update encoding as appropriate */
00418                 error = parserutils_inputstream_strip_bom(&stream->mibenum, 
00419                                 stream->raw);
00420                 if (error != PARSERUTILS_OK)
00421                         return error;
00422 
00423                 /* Ensure filter is using the correct encoding */
00424                 params.encoding.name = 
00425                         parserutils_charset_mibenum_to_name(stream->mibenum);
00426 
00427                 error = parserutils__filter_setopt(stream->input,
00428                                 PARSERUTILS_FILTER_SET_ENCODING, 
00429                                 &params);
00430                 if (error != PARSERUTILS_OK)
00431                         return error;
00432 
00433                 stream->done_first_chunk = true;
00434         }
00435 
00436         /* Work out how to perform the buffer fill */
00437         if (stream->public.cursor == stream->public.utf8->length) {
00438                 /* Cursor's at the end, so simply reuse the entire buffer */
00439                 utf8 = stream->public.utf8->data;
00440                 utf8_space = stream->public.utf8->allocated;
00441         } else {
00442                 /* Cursor's not at the end, so shift data after cursor to the
00443                  * bottom of the buffer. If the buffer's still over half full, 
00444                  * extend it. */
00445                 memmove(stream->public.utf8->data,
00446                         stream->public.utf8->data + stream->public.cursor,
00447                         stream->public.utf8->length - stream->public.cursor);
00448 
00449                 stream->public.utf8->length -= stream->public.cursor;
00450 
00451                 if (stream->public.utf8->length > 
00452                                 stream->public.utf8->allocated / 2) {
00453                         error = parserutils_buffer_grow(stream->public.utf8);
00454                         if (error != PARSERUTILS_OK)
00455                                 return error;
00456                 }
00457 
00458                 utf8 = stream->public.utf8->data + stream->public.utf8->length;
00459                 utf8_space = stream->public.utf8->allocated - 
00460                                 stream->public.utf8->length;
00461         }
00462 
00463         raw = stream->raw->data;
00464         raw_length = stream->raw->length;
00465 
00466         /* Try to fill utf8 buffer from the raw data */
00467         error = parserutils__filter_process_chunk(stream->input, 
00468                         &raw, &raw_length, &utf8, &utf8_space);
00469         /* _NOMEM implies that there's more input to read than available space
00470          * in the utf8 buffer. That's fine, so we'll ignore that error. */
00471         if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
00472                 return error;
00473 
00474         /* Remove the raw data we've processed from the raw buffer */
00475         error = parserutils_buffer_discard(stream->raw, 0, 
00476                         stream->raw->length - raw_length);
00477         if (error != PARSERUTILS_OK)
00478                 return error;
00479 
00480         /* Fix up the utf8 buffer information */
00481         stream->public.utf8->length = 
00482                         stream->public.utf8->allocated - utf8_space;
00483 
00484         /* Finally, fix up the cursor */
00485         stream->public.cursor = 0;
00486 
00487         return PARSERUTILS_OK;
00488 }
00489 
00496 parserutils_error parserutils_inputstream_strip_bom(uint16_t *mibenum, 
00497                 parserutils_buffer *buffer)
00498 {
00499         static uint16_t utf8;
00500         static uint16_t utf16;
00501         static uint16_t utf16be;
00502         static uint16_t utf16le;
00503         static uint16_t utf32;
00504         static uint16_t utf32be;
00505         static uint16_t utf32le;
00506 
00507         if (utf8 == 0) {
00508                 utf8 = parserutils_charset_mibenum_from_name("UTF-8", 
00509                                 SLEN("UTF-8"));
00510                 utf16 = parserutils_charset_mibenum_from_name("UTF-16", 
00511                                 SLEN("UTF-16"));
00512                 utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
00513                                 SLEN("UTF-16BE"));
00514                 utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
00515                                 SLEN("UTF-16LE"));
00516                 utf32 = parserutils_charset_mibenum_from_name("UTF-32", 
00517                                 SLEN("UTF-32"));
00518                 utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
00519                                 SLEN("UTF-32BE"));
00520                 utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
00521                                 SLEN("UTF-32LE"));
00522         }
00523 
00524 #define UTF32_BOM_LEN (4)
00525 #define UTF16_BOM_LEN (2)
00526 #define UTF8_BOM_LEN  (3)
00527 
00528         if (*mibenum == utf8) {
00529                 if (buffer->length >= UTF8_BOM_LEN && 
00530                                 buffer->data[0] == 0xEF &&
00531                                 buffer->data[1] == 0xBB && 
00532                                 buffer->data[2] == 0xBF) {
00533                         return parserutils_buffer_discard(
00534                                         buffer, 0, UTF8_BOM_LEN);
00535                 }
00536         } else if (*mibenum == utf16be) {
00537                 if (buffer->length >= UTF16_BOM_LEN &&
00538                                 buffer->data[0] == 0xFE &&
00539                                 buffer->data[1] == 0xFF) {
00540                         return parserutils_buffer_discard(
00541                                         buffer, 0, UTF16_BOM_LEN);
00542                 }
00543         } else if (*mibenum == utf16le) {
00544                 if (buffer->length >= UTF16_BOM_LEN &&
00545                                 buffer->data[0] == 0xFF &&
00546                                 buffer->data[1] == 0xFE) {
00547                         return parserutils_buffer_discard(
00548                                         buffer, 0, UTF16_BOM_LEN);
00549                 }
00550         } else if (*mibenum == utf16) {
00551                 *mibenum = utf16be;
00552 
00553                 if (buffer->length >= UTF16_BOM_LEN) {
00554                         if (buffer->data[0] == 0xFE && 
00555                                         buffer->data[1] == 0xFF) {
00556                                 return parserutils_buffer_discard(
00557                                                 buffer, 0, UTF16_BOM_LEN);
00558                         } else if (buffer->data[0] == 0xFF && 
00559                                         buffer->data[1] == 0xFE) {
00560                                 *mibenum = utf16le;
00561                                 return parserutils_buffer_discard(
00562                                                 buffer, 0, UTF16_BOM_LEN);
00563                         }
00564                 }
00565         } else if (*mibenum == utf32be) {
00566                 if (buffer->length >= UTF32_BOM_LEN &&
00567                                 buffer->data[0] == 0x00 &&
00568                                 buffer->data[1] == 0x00 &&
00569                                 buffer->data[2] == 0xFE &&
00570                                 buffer->data[3] == 0xFF) {
00571                         return parserutils_buffer_discard(
00572                                         buffer, 0, UTF32_BOM_LEN);
00573                 }
00574         } else if (*mibenum == utf32le) {
00575                 if (buffer->length >= UTF32_BOM_LEN &&
00576                                 buffer->data[0] == 0xFF &&
00577                                 buffer->data[1] == 0xFE &&
00578                                 buffer->data[2] == 0x00 &&
00579                                 buffer->data[3] == 0x00) {
00580                         return parserutils_buffer_discard(
00581                                         buffer, 0, UTF32_BOM_LEN);
00582                 }
00583         } else if (*mibenum == utf32) {
00584                 *mibenum = utf32be;
00585 
00586                 if (buffer->length >= UTF32_BOM_LEN) {
00587                         if (buffer->data[0] == 0x00 && 
00588                                         buffer->data[1] == 0x00 &&
00589                                         buffer->data[2] == 0xFE &&
00590                                         buffer->data[3] == 0xFF) {
00591                                 return parserutils_buffer_discard(
00592                                                 buffer, 0, UTF32_BOM_LEN);
00593                         } else if (buffer->data[0] == 0xFF && 
00594                                         buffer->data[1] == 0xFE &&
00595                                         buffer->data[2] == 0x00 &&
00596                                         buffer->data[3] == 0x00) {
00597                                 *mibenum = utf32le;
00598                                 return parserutils_buffer_discard(
00599                                                 buffer, 0, UTF32_BOM_LEN);
00600                         }
00601                 }
00602         }
00603 
00604 #undef UTF8_BOM_LEN
00605 #undef UTF16_BOM_LEN
00606 #undef UTF32_BOM_LEN
00607 
00608         return PARSERUTILS_OK;
00609 }
00610 

Generated on Wed Jul 29 11:59:21 2015 for Libparserutils by  doxygen 1.5.6