00001
00002 #include "yaml_private.h"
00003
00004
00005
00006
00007
00008 static int
00009 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
00010 size_t offset, int value);
00011
00012 static int
00013 yaml_parser_update_raw_buffer(yaml_parser_t *parser);
00014
00015 static int
00016 yaml_parser_determine_encoding(yaml_parser_t *parser);
00017
00018 YAML_DECLARE(int)
00019 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
00020
00021
00022
00023
00024
00025 static int
00026 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
00027 size_t offset, int value)
00028 {
00029 parser->error = YAML_READER_ERROR;
00030 parser->problem = problem;
00031 parser->problem_offset = offset;
00032 parser->problem_value = value;
00033
00034 return 0;
00035 }
00036
00037
00038
00039
00040
/*
 * Byte order marks, as raw byte strings.  They are compared with memcmp()
 * against the first bytes of the input to select the encoding.
 */
#define BOM_UTF8    "\xef\xbb\xbf"  /* 3 bytes */
#define BOM_UTF16LE "\xff\xfe"      /* 2 bytes */
#define BOM_UTF16BE "\xfe\xff"      /* 2 bytes */
00044
00045
00046
00047
00048
00049
00050 static int
00051 yaml_parser_determine_encoding(yaml_parser_t *parser)
00052 {
00053
00054
00055 while (!parser->eof
00056 && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
00057 if (!yaml_parser_update_raw_buffer(parser)) {
00058 return 0;
00059 }
00060 }
00061
00062
00063
00064 if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
00065 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
00066 parser->encoding = YAML_UTF16LE_ENCODING;
00067 parser->raw_buffer.pointer += 2;
00068 parser->offset += 2;
00069 }
00070 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
00071 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
00072 parser->encoding = YAML_UTF16BE_ENCODING;
00073 parser->raw_buffer.pointer += 2;
00074 parser->offset += 2;
00075 }
00076 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
00077 && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
00078 parser->encoding = YAML_UTF8_ENCODING;
00079 parser->raw_buffer.pointer += 3;
00080 parser->offset += 3;
00081 }
00082 else {
00083 parser->encoding = YAML_UTF8_ENCODING;
00084 }
00085
00086 return 1;
00087 }
00088
00089
00090
00091
00092
00093 static int
00094 yaml_parser_update_raw_buffer(yaml_parser_t *parser)
00095 {
00096 size_t size_read = 0;
00097
00098
00099
00100 if (parser->raw_buffer.start == parser->raw_buffer.pointer
00101 && parser->raw_buffer.last == parser->raw_buffer.end)
00102 return 1;
00103
00104
00105
00106 if (parser->eof) return 1;
00107
00108
00109
00110 if (parser->raw_buffer.start < parser->raw_buffer.pointer
00111 && parser->raw_buffer.pointer < parser->raw_buffer.last) {
00112 memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
00113 parser->raw_buffer.last - parser->raw_buffer.pointer);
00114 }
00115 parser->raw_buffer.last -=
00116 parser->raw_buffer.pointer - parser->raw_buffer.start;
00117 parser->raw_buffer.pointer = parser->raw_buffer.start;
00118
00119
00120
00121 if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
00122 parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
00123 return yaml_parser_set_reader_error(parser, "input error",
00124 parser->offset, -1);
00125 }
00126 parser->raw_buffer.last += size_read;
00127 if (!size_read) {
00128 parser->eof = 1;
00129 }
00130
00131 return 1;
00132 }
00133
00134
00135
00136
00137
00138
00139
00140
00141 YAML_DECLARE(int)
00142 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
00143 {
00144 int first = 1;
00145
00146 assert(parser->read_handler);
00147
00148
00149
00150 if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
00151 return 1;
00152
00153
00154
00155 if (parser->unread >= length)
00156 return 1;
00157
00158
00159
00160 if (!parser->encoding) {
00161 if (!yaml_parser_determine_encoding(parser))
00162 return 0;
00163 }
00164
00165
00166
00167 if (parser->buffer.start < parser->buffer.pointer
00168 && parser->buffer.pointer < parser->buffer.last) {
00169 size_t size = parser->buffer.last - parser->buffer.pointer;
00170 memmove(parser->buffer.start, parser->buffer.pointer, size);
00171 parser->buffer.pointer = parser->buffer.start;
00172 parser->buffer.last = parser->buffer.start + size;
00173 }
00174 else if (parser->buffer.pointer == parser->buffer.last) {
00175 parser->buffer.pointer = parser->buffer.start;
00176 parser->buffer.last = parser->buffer.start;
00177 }
00178
00179
00180
00181 while (parser->unread < length)
00182 {
00183
00184
00185 if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) {
00186 if (!yaml_parser_update_raw_buffer(parser)) return 0;
00187 }
00188 first = 0;
00189
00190
00191
00192 while (parser->raw_buffer.pointer != parser->raw_buffer.last)
00193 {
00194 unsigned int value = 0, value2 = 0;
00195 int incomplete = 0;
00196 unsigned char octet;
00197 unsigned int width = 0;
00198 int low, high;
00199 size_t k;
00200 size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
00201
00202
00203
00204 switch (parser->encoding)
00205 {
00206 case YAML_UTF8_ENCODING:
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230 octet = parser->raw_buffer.pointer[0];
00231 width = (octet & 0x80) == 0x00 ? 1 :
00232 (octet & 0xE0) == 0xC0 ? 2 :
00233 (octet & 0xF0) == 0xE0 ? 3 :
00234 (octet & 0xF8) == 0xF0 ? 4 : 0;
00235
00236
00237
00238 if (!width)
00239 return yaml_parser_set_reader_error(parser,
00240 "invalid leading UTF-8 octet",
00241 parser->offset, octet);
00242
00243
00244
00245 if (width > raw_unread) {
00246 if (parser->eof) {
00247 return yaml_parser_set_reader_error(parser,
00248 "incomplete UTF-8 octet sequence",
00249 parser->offset, -1);
00250 }
00251 incomplete = 1;
00252 break;
00253 }
00254
00255
00256
00257 value = (octet & 0x80) == 0x00 ? octet & 0x7F :
00258 (octet & 0xE0) == 0xC0 ? octet & 0x1F :
00259 (octet & 0xF0) == 0xE0 ? octet & 0x0F :
00260 (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
00261
00262
00263
00264 for (k = 1; k < width; k ++)
00265 {
00266 octet = parser->raw_buffer.pointer[k];
00267
00268
00269
00270 if ((octet & 0xC0) != 0x80)
00271 return yaml_parser_set_reader_error(parser,
00272 "invalid trailing UTF-8 octet",
00273 parser->offset+k, octet);
00274
00275
00276
00277 value = (value << 6) + (octet & 0x3F);
00278 }
00279
00280
00281
00282 if (!((width == 1) ||
00283 (width == 2 && value >= 0x80) ||
00284 (width == 3 && value >= 0x800) ||
00285 (width == 4 && value >= 0x10000)))
00286 return yaml_parser_set_reader_error(parser,
00287 "invalid length of a UTF-8 sequence",
00288 parser->offset, -1);
00289
00290
00291
00292 if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF)
00293 return yaml_parser_set_reader_error(parser,
00294 "invalid Unicode character",
00295 parser->offset, value);
00296
00297 break;
00298
00299 case YAML_UTF16LE_ENCODING:
00300 case YAML_UTF16BE_ENCODING:
00301
00302 low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
00303 high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332
00333 if (raw_unread < 2) {
00334 if (parser->eof) {
00335 return yaml_parser_set_reader_error(parser,
00336 "incomplete UTF-16 character",
00337 parser->offset, -1);
00338 }
00339 incomplete = 1;
00340 break;
00341 }
00342
00343
00344
00345 value = parser->raw_buffer.pointer[low]
00346 + (parser->raw_buffer.pointer[high] << 8);
00347
00348
00349
00350 if ((value & 0xFC00) == 0xDC00)
00351 return yaml_parser_set_reader_error(parser,
00352 "unexpected low surrogate area",
00353 parser->offset, value);
00354
00355
00356
00357 if ((value & 0xFC00) == 0xD800) {
00358
00359 width = 4;
00360
00361
00362
00363 if (raw_unread < 4) {
00364 if (parser->eof) {
00365 return yaml_parser_set_reader_error(parser,
00366 "incomplete UTF-16 surrogate pair",
00367 parser->offset, -1);
00368 }
00369 incomplete = 1;
00370 break;
00371 }
00372
00373
00374
00375 value2 = parser->raw_buffer.pointer[low+2]
00376 + (parser->raw_buffer.pointer[high+2] << 8);
00377
00378
00379
00380 if ((value2 & 0xFC00) != 0xDC00)
00381 return yaml_parser_set_reader_error(parser,
00382 "expected low surrogate area",
00383 parser->offset+2, value2);
00384
00385
00386
00387 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
00388 }
00389
00390 else {
00391 width = 2;
00392 }
00393
00394 break;
00395
00396 default:
00397 assert(1);
00398 }
00399
00400
00401
00402 if (incomplete) break;
00403
00404
00405
00406
00407
00408
00409
00410
00411 if (! (value == 0x09 || value == 0x0A || value == 0x0D
00412 || (value >= 0x20 && value <= 0x7E)
00413 || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF)
00414 || (value >= 0xE000 && value <= 0xFFFD)
00415 || (value >= 0x10000 && value <= 0x10FFFF)))
00416 return yaml_parser_set_reader_error(parser,
00417 "control characters are not allowed",
00418 parser->offset, value);
00419
00420
00421
00422 parser->raw_buffer.pointer += width;
00423 parser->offset += width;
00424
00425
00426
00427
00428 if (value <= 0x7F) {
00429 *(parser->buffer.last++) = value;
00430 }
00431
00432 else if (value <= 0x7FF) {
00433 *(parser->buffer.last++) = 0xC0 + (value >> 6);
00434 *(parser->buffer.last++) = 0x80 + (value & 0x3F);
00435 }
00436
00437 else if (value <= 0xFFFF) {
00438 *(parser->buffer.last++) = 0xE0 + (value >> 12);
00439 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
00440 *(parser->buffer.last++) = 0x80 + (value & 0x3F);
00441 }
00442
00443 else {
00444 *(parser->buffer.last++) = 0xF0 + (value >> 18);
00445 *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
00446 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
00447 *(parser->buffer.last++) = 0x80 + (value & 0x3F);
00448 }
00449
00450 parser->unread ++;
00451 }
00452
00453
00454
00455 if (parser->eof) {
00456 *(parser->buffer.last++) = '\0';
00457 parser->unread ++;
00458 return 1;
00459 }
00460
00461 }
00462
00463 if (parser->offset >= PTRDIFF_MAX)
00464 return yaml_parser_set_reader_error(parser, "input is too long",
00465 PTRDIFF_MAX, -1);
00466
00467 return 1;
00468 }
00469
00470