00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "regparse.h"
00032
/* Size constant named for a warning buffer; its users are outside this view. */
00033 #define WARN_BUFSIZE 256
00034
/* Feature switch, defined with no value; code testing it is outside this view. */
00035 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
00036
00037
/*
 * Syntax description for the Ruby flavour.
 *
 * The initializer supplies four flag words followed by a brace-enclosed
 * list of six code points.
 * NOTE(review): presumably these map, in order, to the operator set,
 * the second operator set, the behaviour flags, the default options and
 * the meta-character table of OnigSyntaxType -- confirm against the
 * struct declaration in the public header.
 */
00038 const OnigSyntaxType OnigSyntaxRuby = {
/* 1st flag word: GNU regex operators plus the listed escapes, with
 * ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END masked out again. */
00039 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
00040 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
00041 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
00042 ONIG_SYN_OP_ESC_C_CONTROL )
00043 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
/* 2nd flag word: ONIG_SYN_OP2_* extensions. */
00044 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
00045 ONIG_SYN_OP2_OPTION_RUBY |
00046 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
00047 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
00048 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
00049 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
00050 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
00051 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
00052 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
00053 ONIG_SYN_OP2_ESC_H_XDIGIT |
00054 ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
00055 ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
00056 ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK |
00057 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP )
/* 3rd flag word: ONIG_SYN_* behaviour and warning flags. */
00058 , ( SYN_GNU_REGEX_BV |
00059 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
00060 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
00061 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
00062 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
00063 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
00064 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
00065 ONIG_SYN_WARN_CC_DUP |
00066 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
/* 4th flag word: ONIG_OPTION_* values. */
00067 , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE |
00068 ONIG_OPTION_WORD_BOUND_ALL_RANGE )
00069 ,
/* Six code points: a backslash followed by five "ineffective" slots. */
00070 {
00071 (OnigCodePoint )'\\'
00072 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00073 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00074 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00075 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00076 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00077 }
00078 };
00079
/* Global default-syntax pointer, initialised to ONIG_SYNTAX_RUBY. */
00080 const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
00081
00082 extern void onig_null_warn(const char* s ARG_UNUSED) { }
00083
/*
 * File-local warning handlers.  Each one starts as the build-time default
 * (DEFAULT_WARN_FUNCTION / DEFAULT_VERB_WARN_FUNCTION) when that macro is
 * defined, and as onig_null_warn otherwise.
 */
00084 #ifdef DEFAULT_WARN_FUNCTION
00085 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
00086 #else
00087 static OnigWarnFunc onig_warn = onig_null_warn;
00088 #endif
00089
00090 #ifdef DEFAULT_VERB_WARN_FUNCTION
00091 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
00092 #else
00093 static OnigWarnFunc onig_verb_warn = onig_null_warn;
00094 #endif
00095
00096 extern void onig_set_warn_func(OnigWarnFunc f)
00097 {
00098 onig_warn = f;
00099 }
00100
00101 extern void onig_set_verb_warn_func(OnigWarnFunc f)
00102 {
00103 onig_verb_warn = f;
00104 }
00105
/* Forward declaration: BITSET_SET_BIT_CHKDUP calls this before its definition. */
00106 static void CC_DUP_WARN(ScanEnv *env);
00107
00108 static void
00109 bbuf_free(BBuf* bbuf)
00110 {
00111 if (IS_NOT_NULL(bbuf)) {
00112 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
00113 xfree(bbuf);
00114 }
00115 }
00116
00117 static int
00118 bbuf_clone(BBuf** rto, BBuf* from)
00119 {
00120 int r;
00121 BBuf *to;
00122
00123 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
00124 CHECK_NULL_RETURN_MEMERR(to);
00125 r = BBUF_INIT(to, from->alloc);
00126 if (r != 0) return r;
00127 to->used = from->used;
00128 xmemcpy(to->p, from->p, from->used);
00129 return 0;
00130 }
00131
/* Convert a relative group number into an absolute one, using the
 * env's current num_mem as the base: num_mem + 1 + rel_no. */
00132 #define BACKREF_REL_TO_ABS(rel_no, env) \
00133 ((env)->num_mem + 1 + (rel_no))
00134
/* Clear bits f in v when `negative` is true, set them otherwise.
 * NOTE(review): the expansion is not wrapped in outer parentheses, so it
 * is only safe as a full expression statement. */
00135 #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f))
00136
/* First code point of the multi-byte range: 0 when the encoding's minimum
 * character length exceeds one byte, 0x80 otherwise. */
00137 #define MBCODE_START_POS(enc) \
00138 (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
00139
/* Add [MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT] to pbuf.
 * Relies on a variable named `env` being in scope at the call site. */
00140 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
00141 add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)
00142
/* For non-single-byte encodings, add the whole multi-byte range to mbuf.
 * Assigns to a caller-scope variable `r` and RETURNS from the enclosing
 * function when r is non-zero. */
00143 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
00144 if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
00145 r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
00146 }\
00147 }\
00148 } while (0)
00149
00150
/* Set bit `pos` in bs; if it was already set, call CC_DUP_WARN(env) first.
 * Relies on a variable named `env` being in scope at the call site. */
00151 #define BITSET_SET_BIT_CHKDUP(bs, pos) do { \
00152 if (BITSET_AT(bs, pos)) CC_DUP_WARN(env); \
00153 BS_ROOM(bs, pos) |= BS_BIT(pos); \
00154 } while (0)
00155
/* Store 1 in `empty` when every word of bs is zero, 0 otherwise.
 * Declares its own loop counter `i` inside the do-block. */
00156 #define BITSET_IS_EMPTY(bs,empty) do {\
00157 int i;\
00158 empty = 1;\
00159 for (i = 0; i < BITSET_SIZE; i++) {\
00160 if ((bs)[i] != 0) {\
00161 empty = 0; break;\
00162 }\
00163 }\
00164 } while (0)
00165
00166 static void
00167 bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to)
00168 {
00169 int i;
00170 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
00171 BITSET_SET_BIT_CHKDUP(bs, i);
00172 }
00173 }
00174
/* Compiled out by "#if 0": would set every bit of bs. */
00175 #if 0
00176 static void
00177 bitset_set_all(BitSetRef bs)
00178 {
00179 int i;
00180 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
00181 }
00182 #endif
00183
00184 static void
00185 bitset_invert(BitSetRef bs)
00186 {
00187 int i;
00188 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
00189 }
00190
00191 static void
00192 bitset_invert_to(BitSetRef from, BitSetRef to)
00193 {
00194 int i;
00195 for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); }
00196 }
00197
00198 static void
00199 bitset_and(BitSetRef dest, BitSetRef bs)
00200 {
00201 int i;
00202 for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; }
00203 }
00204
00205 static void
00206 bitset_or(BitSetRef dest, BitSetRef bs)
00207 {
00208 int i;
00209 for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; }
00210 }
00211
00212 static void
00213 bitset_copy(BitSetRef dest, BitSetRef bs)
00214 {
00215 int i;
00216 for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; }
00217 }
00218
00219 extern int
00220 onig_strncmp(const UChar* s1, const UChar* s2, int n)
00221 {
00222 int x;
00223
00224 while (n-- > 0) {
00225 x = *s2++ - *s1++;
00226 if (x) return x;
00227 }
00228 return 0;
00229 }
00230
00231 extern void
00232 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
00233 {
00234 ptrdiff_t len = end - src;
00235 if (len > 0) {
00236 xmemcpy(dest, src, len);
00237 dest[len] = (UChar )0;
00238 }
00239 }
00240
00241 #ifdef USE_NAMED_GROUP
00242 static UChar*
00243 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
00244 {
00245 ptrdiff_t slen;
00246 int term_len, i;
00247 UChar *r;
00248
00249 slen = end - s;
00250 term_len = ONIGENC_MBC_MINLEN(enc);
00251
00252 r = (UChar* )xmalloc(slen + term_len);
00253 CHECK_NULL_RETURN(r);
00254 xmemcpy(r, s, slen);
00255
00256 for (i = 0; i < term_len; i++)
00257 r[slen + i] = (UChar )0;
00258
00259 return r;
00260 }
00261 #endif
00262
00263
/*
 * Pattern-cursor helper macros.  All of them operate on variables that must
 * exist in the calling function: `p` (cursor), `end` (end of pattern),
 * `enc` (encoding) and `pfetch_prev` (declared by PFETCH_READY).
 */
/* Value PPEEK yields when the cursor is at the end. */
00264 #define PEND_VALUE 0
00265
00266 #ifdef __GNUC__
00267
/* Declares pfetch_prev; the GNU variant also initialises it and marks it
 * used via a (void) cast. */
00268 #define PFETCH_READY UChar* pfetch_prev = NULL; (void)pfetch_prev
00269 #else
00270 #define PFETCH_READY UChar* pfetch_prev
00271 #endif
/* 1 when the cursor has reached `end`, 0 otherwise. */
00272 #define PEND (p < end ? 0 : 1)
/* Step back to the position saved by the last PINC / PFETCH. */
00273 #define PUNFETCH p = pfetch_prev
/* Advance over one character, remembering the old position. */
00274 #define PINC do { \
00275 pfetch_prev = p; \
00276 p += enclen(enc, p, end); \
00277 } while (0)
/* Read the code point at the cursor into c, then advance as PINC does.
 * Single-byte encodings (max_enc_len == 1) read the byte directly. */
00278 #define PFETCH(c) do { \
00279 c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
00280 pfetch_prev = p; \
00281 p += enclen(enc, p, end); \
00282 } while (0)
00283
/* Code point at the cursor without advancing; PEND_VALUE at the end. */
00284 #define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* True when the next code point equals c. */
00285 #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c)
00286
00287 static UChar*
00288 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
00289 size_t capa)
00290 {
00291 UChar* r;
00292
00293 if (dest)
00294 r = (UChar* )xrealloc(dest, capa + 1);
00295 else
00296 r = (UChar* )xmalloc(capa + 1);
00297
00298 CHECK_NULL_RETURN(r);
00299 onig_strcpy(r + (dest_end - dest), src, src_end);
00300 return r;
00301 }
00302
00303
00304 static UChar*
00305 strcat_capa_from_static(UChar* dest, UChar* dest_end,
00306 const UChar* src, const UChar* src_end, size_t capa)
00307 {
00308 UChar* r;
00309
00310 r = (UChar* )xmalloc(capa + 1);
00311 CHECK_NULL_RETURN(r);
00312 onig_strcpy(r, dest, dest_end);
00313 onig_strcpy(r + (dest_end - dest), src, src_end);
00314 return r;
00315 }
00316
00317
00318 #ifdef USE_ST_LIBRARY
00319
00320 #include "ruby/st.h"
00321
/* Hash key describing a byte string by its bounds: [s, end). */
00322 typedef struct {
00323 const UChar* s;
00324 const UChar* end;
00325 } st_str_end_key;
00326
00327 static int
00328 str_end_cmp(st_data_t xp, st_data_t yp)
00329 {
00330 const st_str_end_key *x, *y;
00331 const UChar *p, *q;
00332 int c;
00333
00334 x = (const st_str_end_key *)xp;
00335 y = (const st_str_end_key *)yp;
00336 if ((x->end - x->s) != (y->end - y->s))
00337 return 1;
00338
00339 p = x->s;
00340 q = y->s;
00341 while (p < x->end) {
00342 c = (int )*p - (int )*q;
00343 if (c != 0) return c;
00344
00345 p++; q++;
00346 }
00347
00348 return 0;
00349 }
00350
00351 static st_index_t
00352 str_end_hash(st_data_t xp)
00353 {
00354 const st_str_end_key *x = (const st_str_end_key *)xp;
00355 const UChar *p;
00356 st_index_t val = 0;
00357
00358 p = x->s;
00359 while (p < x->end) {
00360 val = val * 997 + (int )*p++;
00361 }
00362
00363 return val + (val >> 5);
00364 }
00365
00366 extern hash_table_type*
00367 onig_st_init_strend_table_with_size(st_index_t size)
00368 {
00369 static const struct st_hash_type hashType = {
00370 str_end_cmp,
00371 str_end_hash,
00372 };
00373
00374 return (hash_table_type* )
00375 onig_st_init_table_with_size(&hashType, size);
00376 }
00377
00378 extern int
00379 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
00380 const UChar* end_key, hash_data_type *value)
00381 {
00382 st_str_end_key key;
00383
00384 key.s = (UChar* )str_key;
00385 key.end = (UChar* )end_key;
00386
00387 return onig_st_lookup(table, (st_data_t )(&key), value);
00388 }
00389
00390 extern int
00391 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
00392 const UChar* end_key, hash_data_type value)
00393 {
00394 st_str_end_key* key;
00395 int result;
00396
00397 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
00398 key->s = (UChar* )str_key;
00399 key->end = (UChar* )end_key;
00400 result = onig_st_insert(table, (st_data_t )key, value);
00401 if (result) {
00402 xfree(key);
00403 }
00404 return result;
00405 }
00406
00407 #endif
00408
00409
00410 #ifdef USE_NAMED_GROUP
00411
/* Initial capacity of NameEntry.back_refs when a second group is added. */
00412 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
00413
/*
 * One group name and the capture group numbers registered under it.
 * With exactly one group the number lives in back_ref1; with more than
 * one, all numbers live in the heap array back_refs.
 */
00414 typedef struct {
00415 UChar* name;
00416 size_t name_len;
00417 int back_num;
00418 int back_alloc;
00419 int back_ref1;
00420 int* back_refs;
00421 } NameEntry;
00422
00423 #ifdef USE_ST_LIBRARY
00424
/* With USE_ST_LIBRARY the name table is an st hash table. */
00425 typedef st_table NameTable;
00426 typedef st_data_t HashDataType;
00427
00428 #ifdef ONIG_DEBUG
00429 static int
00430 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
00431 {
00432 int i;
00433 FILE* fp = (FILE* )arg;
00434
00435 fprintf(fp, "%s: ", e->name);
00436 if (e->back_num == 0)
00437 fputs("-", fp);
00438 else if (e->back_num == 1)
00439 fprintf(fp, "%d", e->back_ref1);
00440 else {
00441 for (i = 0; i < e->back_num; i++) {
00442 if (i > 0) fprintf(fp, ", ");
00443 fprintf(fp, "%d", e->back_refs[i]);
00444 }
00445 }
00446 fputs("\n", fp);
00447 return ST_CONTINUE;
00448 }
00449
00450 extern int
00451 onig_print_names(FILE* fp, regex_t* reg)
00452 {
00453 NameTable* t = (NameTable* )reg->name_table;
00454
00455 if (IS_NOT_NULL(t)) {
00456 fprintf(fp, "name table\n");
00457 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
00458 fputs("\n", fp);
00459 }
00460 return 0;
00461 }
00462 #endif
00463
00464 static int
00465 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
00466 {
00467 xfree(e->name);
00468 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
00469 xfree(key);
00470 xfree(e);
00471 return ST_DELETE;
00472 }
00473
00474 static int
00475 names_clear(regex_t* reg)
00476 {
00477 NameTable* t = (NameTable* )reg->name_table;
00478
00479 if (IS_NOT_NULL(t)) {
00480 onig_st_foreach(t, i_free_name_entry, 0);
00481 }
00482 return 0;
00483 }
00484
00485 extern int
00486 onig_names_free(regex_t* reg)
00487 {
00488 int r;
00489 NameTable* t;
00490
00491 r = names_clear(reg);
00492 if (r) return r;
00493
00494 t = (NameTable* )reg->name_table;
00495 if (IS_NOT_NULL(t)) onig_st_free_table(t);
00496 reg->name_table = (void* )NULL;
00497 return 0;
00498 }
00499
00500 static NameEntry*
00501 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
00502 {
00503 NameEntry* e;
00504 NameTable* t = (NameTable* )reg->name_table;
00505
00506 e = (NameEntry* )NULL;
00507 if (IS_NOT_NULL(t)) {
00508 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
00509 }
00510 return e;
00511 }
00512
/* Context handed to i_names while iterating the name table. */
00513 typedef struct {
00514 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); /* user callback */
00515 regex_t* reg;      /* forwarded to func */
00516 void* arg;         /* forwarded to func */
00517 int ret;           /* first non-zero result of func, else 0 */
00518 OnigEncoding enc;  /* set from reg->enc by onig_foreach_name */
00519 } INamesArg;
00520
00521 static int
00522 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
00523 {
00524 int r = (*(arg->func))(e->name,
00525 e->name + e->name_len,
00526 e->back_num,
00527 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
00528 arg->reg, arg->arg);
00529 if (r != 0) {
00530 arg->ret = r;
00531 return ST_STOP;
00532 }
00533 return ST_CONTINUE;
00534 }
00535
00536 extern int
00537 onig_foreach_name(regex_t* reg,
00538 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00539 {
00540 INamesArg narg;
00541 NameTable* t = (NameTable* )reg->name_table;
00542
00543 narg.ret = 0;
00544 if (IS_NOT_NULL(t)) {
00545 narg.func = func;
00546 narg.reg = reg;
00547 narg.arg = arg;
00548 narg.enc = reg->enc;
00549 onig_st_foreach(t, i_names, (HashDataType )&narg);
00550 }
00551 return narg.ret;
00552 }
00553
00554 static int
00555 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
00556 {
00557 int i;
00558
00559 if (e->back_num > 1) {
00560 for (i = 0; i < e->back_num; i++) {
00561 e->back_refs[i] = map[e->back_refs[i]].new_val;
00562 }
00563 }
00564 else if (e->back_num == 1) {
00565 e->back_ref1 = map[e->back_ref1].new_val;
00566 }
00567
00568 return ST_CONTINUE;
00569 }
00570
00571 extern int
00572 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
00573 {
00574 NameTable* t = (NameTable* )reg->name_table;
00575
00576 if (IS_NOT_NULL(t)) {
00577 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
00578 }
00579 return 0;
00580 }
00581
00582
00583 extern int
00584 onig_number_of_names(regex_t* reg)
00585 {
00586 NameTable* t = (NameTable* )reg->name_table;
00587
00588 if (IS_NOT_NULL(t))
00589 return (int )t->num_entries;
00590 else
00591 return 0;
00592 }
00593
00594 #else
00595
/* Initial capacity of the entry array when st is not used. */
00596 #define INIT_NAMES_ALLOC_NUM 8
00597
/* Name table used without USE_ST_LIBRARY: a growable array of entries. */
00598 typedef struct {
00599 NameEntry* e;   /* entry array */
00600 int num;        /* entries in use */
00601 int alloc;      /* entries allocated */
00602 } NameTable;
00603
00604 #ifdef ONIG_DEBUG
00605 extern int
00606 onig_print_names(FILE* fp, regex_t* reg)
00607 {
00608 int i, j;
00609 NameEntry* e;
00610 NameTable* t = (NameTable* )reg->name_table;
00611
00612 if (IS_NOT_NULL(t) && t->num > 0) {
00613 fprintf(fp, "name table\n");
00614 for (i = 0; i < t->num; i++) {
00615 e = &(t->e[i]);
00616 fprintf(fp, "%s: ", e->name);
00617 if (e->back_num == 0) {
00618 fputs("-", fp);
00619 }
00620 else if (e->back_num == 1) {
00621 fprintf(fp, "%d", e->back_ref1);
00622 }
00623 else {
00624 for (j = 0; j < e->back_num; j++) {
00625 if (j > 0) fprintf(fp, ", ");
00626 fprintf(fp, "%d", e->back_refs[j]);
00627 }
00628 }
00629 fputs("\n", fp);
00630 }
00631 fputs("\n", fp);
00632 }
00633 return 0;
00634 }
00635 #endif
00636
00637 static int
00638 names_clear(regex_t* reg)
00639 {
00640 int i;
00641 NameEntry* e;
00642 NameTable* t = (NameTable* )reg->name_table;
00643
00644 if (IS_NOT_NULL(t)) {
00645 for (i = 0; i < t->num; i++) {
00646 e = &(t->e[i]);
00647 if (IS_NOT_NULL(e->name)) {
00648 xfree(e->name);
00649 e->name = NULL;
00650 e->name_len = 0;
00651 e->back_num = 0;
00652 e->back_alloc = 0;
00653 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
00654 e->back_refs = (int* )NULL;
00655 }
00656 }
00657 if (IS_NOT_NULL(t->e)) {
00658 xfree(t->e);
00659 t->e = NULL;
00660 }
00661 t->num = 0;
00662 }
00663 return 0;
00664 }
00665
00666 extern int
00667 onig_names_free(regex_t* reg)
00668 {
00669 int r;
00670 NameTable* t;
00671
00672 r = names_clear(reg);
00673 if (r) return r;
00674
00675 t = (NameTable* )reg->name_table;
00676 if (IS_NOT_NULL(t)) xfree(t);
00677 reg->name_table = NULL;
00678 return 0;
00679 }
00680
00681 static NameEntry*
00682 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
00683 {
00684 int i, len;
00685 NameEntry* e;
00686 NameTable* t = (NameTable* )reg->name_table;
00687
00688 if (IS_NOT_NULL(t)) {
00689 len = name_end - name;
00690 for (i = 0; i < t->num; i++) {
00691 e = &(t->e[i]);
00692 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
00693 return e;
00694 }
00695 }
00696 return (NameEntry* )NULL;
00697 }
00698
00699 extern int
00700 onig_foreach_name(regex_t* reg,
00701 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00702 {
00703 int i, r;
00704 NameEntry* e;
00705 NameTable* t = (NameTable* )reg->name_table;
00706
00707 if (IS_NOT_NULL(t)) {
00708 for (i = 0; i < t->num; i++) {
00709 e = &(t->e[i]);
00710 r = (*func)(e->name, e->name + e->name_len, e->back_num,
00711 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
00712 reg, arg);
00713 if (r != 0) return r;
00714 }
00715 }
00716 return 0;
00717 }
00718
00719 extern int
00720 onig_number_of_names(regex_t* reg)
00721 {
00722 NameTable* t = (NameTable* )reg->name_table;
00723
00724 if (IS_NOT_NULL(t))
00725 return t->num;
00726 else
00727 return 0;
00728 }
00729
00730 #endif
00731
00732 static int
00733 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
00734 {
00735 int alloc;
00736 NameEntry* e;
00737 NameTable* t = (NameTable* )reg->name_table;
00738
00739 if (name_end - name <= 0)
00740 return ONIGERR_EMPTY_GROUP_NAME;
00741
00742 e = name_find(reg, name, name_end);
00743 if (IS_NULL(e)) {
00744 #ifdef USE_ST_LIBRARY
00745 if (IS_NULL(t)) {
00746 t = onig_st_init_strend_table_with_size(5);
00747 reg->name_table = (void* )t;
00748 }
00749 e = (NameEntry* )xmalloc(sizeof(NameEntry));
00750 CHECK_NULL_RETURN_MEMERR(e);
00751
00752 e->name = strdup_with_null(reg->enc, name, name_end);
00753 if (IS_NULL(e->name)) {
00754 xfree(e);
00755 return ONIGERR_MEMORY;
00756 }
00757 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
00758 (HashDataType )e);
00759
00760 e->name_len = name_end - name;
00761 e->back_num = 0;
00762 e->back_alloc = 0;
00763 e->back_refs = (int* )NULL;
00764
00765 #else
00766
00767 if (IS_NULL(t)) {
00768 alloc = INIT_NAMES_ALLOC_NUM;
00769 t = (NameTable* )xmalloc(sizeof(NameTable));
00770 CHECK_NULL_RETURN_MEMERR(t);
00771 t->e = NULL;
00772 t->alloc = 0;
00773 t->num = 0;
00774
00775 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
00776 if (IS_NULL(t->e)) {
00777 xfree(t);
00778 return ONIGERR_MEMORY;
00779 }
00780 t->alloc = alloc;
00781 reg->name_table = t;
00782 goto clear;
00783 }
00784 else if (t->num == t->alloc) {
00785 int i;
00786 NameEntry* p;
00787
00788 alloc = t->alloc * 2;
00789 p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
00790 CHECK_NULL_RETURN_MEMERR(p);
00791 t->e = p;
00792 t->alloc = alloc;
00793
00794 clear:
00795 for (i = t->num; i < t->alloc; i++) {
00796 t->e[i].name = NULL;
00797 t->e[i].name_len = 0;
00798 t->e[i].back_num = 0;
00799 t->e[i].back_alloc = 0;
00800 t->e[i].back_refs = (int* )NULL;
00801 }
00802 }
00803 e = &(t->e[t->num]);
00804 t->num++;
00805 e->name = strdup_with_null(reg->enc, name, name_end);
00806 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
00807 e->name_len = name_end - name;
00808 #endif
00809 }
00810
00811 if (e->back_num >= 1 &&
00812 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
00813 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
00814 name, name_end);
00815 return ONIGERR_MULTIPLEX_DEFINED_NAME;
00816 }
00817
00818 e->back_num++;
00819 if (e->back_num == 1) {
00820 e->back_ref1 = backref;
00821 }
00822 else {
00823 if (e->back_num == 2) {
00824 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
00825 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
00826 CHECK_NULL_RETURN_MEMERR(e->back_refs);
00827 e->back_alloc = alloc;
00828 e->back_refs[0] = e->back_ref1;
00829 e->back_refs[1] = backref;
00830 }
00831 else {
00832 if (e->back_num > e->back_alloc) {
00833 int* p;
00834 alloc = e->back_alloc * 2;
00835 p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
00836 CHECK_NULL_RETURN_MEMERR(p);
00837 e->back_refs = p;
00838 e->back_alloc = alloc;
00839 }
00840 e->back_refs[e->back_num - 1] = backref;
00841 }
00842 }
00843
00844 return 0;
00845 }
00846
00847 extern int
00848 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
00849 const UChar* name_end, int** nums)
00850 {
00851 NameEntry* e = name_find(reg, name, name_end);
00852
00853 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
00854
00855 switch (e->back_num) {
00856 case 0:
00857 *nums = 0;
00858 break;
00859 case 1:
00860 *nums = &(e->back_ref1);
00861 break;
00862 default:
00863 *nums = e->back_refs;
00864 break;
00865 }
00866 return e->back_num;
00867 }
00868
00869 extern int
00870 onig_name_to_backref_number(regex_t* reg, const UChar* name,
00871 const UChar* name_end, OnigRegion *region)
00872 {
00873 int i, n, *nums;
00874
00875 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
00876 if (n < 0)
00877 return n;
00878 else if (n == 0)
00879 return ONIGERR_PARSER_BUG;
00880 else if (n == 1)
00881 return nums[0];
00882 else {
00883 if (IS_NOT_NULL(region)) {
00884 for (i = n - 1; i >= 0; i--) {
00885 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
00886 return nums[i];
00887 }
00888 }
00889 return nums[n - 1];
00890 }
00891 }
00892
00893 #else
00894
00895 extern int
00896 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
00897 const UChar* name_end, int** nums)
00898 {
00899 return ONIG_NO_SUPPORT_CONFIG;
00900 }
00901
00902 extern int
00903 onig_name_to_backref_number(regex_t* reg, const UChar* name,
00904 const UChar* name_end, OnigRegion* region)
00905 {
00906 return ONIG_NO_SUPPORT_CONFIG;
00907 }
00908
00909 extern int
00910 onig_foreach_name(regex_t* reg,
00911 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00912 {
00913 return ONIG_NO_SUPPORT_CONFIG;
00914 }
00915
00916 extern int
00917 onig_number_of_names(regex_t* reg)
00918 {
00919 return 0;
00920 }
00921 #endif
00922
00923 extern int
00924 onig_noname_group_capture_is_active(regex_t* reg)
00925 {
00926 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
00927 return 0;
00928
00929 #ifdef USE_NAMED_GROUP
00930 if (onig_number_of_names(reg) > 0 &&
00931 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
00932 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
00933 return 0;
00934 }
00935 #endif
00936
00937 return 1;
00938 }
00939
00940
/* Capacity of the first heap allocation made by scan_env_add_mem_entry. */
00941 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
00942
00943 static void
00944 scan_env_clear(ScanEnv* env)
00945 {
00946 int i;
00947
00948 BIT_STATUS_CLEAR(env->capture_history);
00949 BIT_STATUS_CLEAR(env->bt_mem_start);
00950 BIT_STATUS_CLEAR(env->bt_mem_end);
00951 BIT_STATUS_CLEAR(env->backrefed_mem);
00952 env->error = (UChar* )NULL;
00953 env->error_end = (UChar* )NULL;
00954 env->num_call = 0;
00955 env->num_mem = 0;
00956 #ifdef USE_NAMED_GROUP
00957 env->num_named = 0;
00958 #endif
00959 env->mem_alloc = 0;
00960 env->mem_nodes_dynamic = (Node** )NULL;
00961
00962 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
00963 env->mem_nodes_static[i] = NULL_NODE;
00964
00965 #ifdef USE_COMBINATION_EXPLOSION_CHECK
00966 env->num_comb_exp_check = 0;
00967 env->comb_exp_max_regnum = 0;
00968 env->curr_max_regnum = 0;
00969 env->has_recursion = 0;
00970 #endif
00971 env->warnings_flag = 0;
00972 }
00973
00974 static int
00975 scan_env_add_mem_entry(ScanEnv* env)
00976 {
00977 int i, need, alloc;
00978 Node** p;
00979
00980 need = env->num_mem + 1;
00981 if (need > ONIG_MAX_CAPTURE_GROUP_NUM)
00982 return ONIGERR_TOO_MANY_CAPTURE_GROUPS;
00983 if (need >= SCANENV_MEMNODES_SIZE) {
00984 if (env->mem_alloc <= need) {
00985 if (IS_NULL(env->mem_nodes_dynamic)) {
00986 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
00987 p = (Node** )xmalloc(sizeof(Node*) * alloc);
00988 xmemcpy(p, env->mem_nodes_static,
00989 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
00990 }
00991 else {
00992 alloc = env->mem_alloc * 2;
00993 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
00994 }
00995 CHECK_NULL_RETURN_MEMERR(p);
00996
00997 for (i = env->num_mem + 1; i < alloc; i++)
00998 p[i] = NULL_NODE;
00999
01000 env->mem_nodes_dynamic = p;
01001 env->mem_alloc = alloc;
01002 }
01003 }
01004
01005 env->num_mem++;
01006 return env->num_mem;
01007 }
01008
01009 static int
01010 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
01011 {
01012 if (env->num_mem >= num)
01013 SCANENV_MEM_NODES(env)[num] = node;
01014 else
01015 return ONIGERR_PARSER_BUG;
01016 return 0;
01017 }
01018
01019
/*
 * Optional node recycling: freed nodes are reinterpreted as FreeNode and
 * chained into the global singly linked list FreeNodeList, from which
 * node_new takes them back.
 */
01020 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01021 typedef struct _FreeNode {
01022 struct _FreeNode* next;
01023 } FreeNode;
01024
01025 static FreeNode* FreeNodeList = (FreeNode* )NULL;
01026 #endif
01027
/*
 * Free a parse-tree node together with everything it owns.
 *
 * Children are released according to the node type, then the node itself
 * is either pushed onto FreeNodeList (USE_PARSE_TREE_NODE_RECYCLE) or
 * passed to xfree.  A NULL node is accepted and ignored.
 */
01028 extern void
01029 onig_node_free(Node* node)
01030 {
/* re-entry point used to walk list/alternative chains without recursion */
01031 start:
01032 if (IS_NULL(node)) return ;
01033
01034 switch (NTYPE(node)) {
01035 case NT_STR:
/* free the string bytes only when they are heap storage: capa is
 * non-zero and s does not point at the node's embedded buf */
01036 if (NSTR(node)->capa != 0 &&
01037 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
01038 xfree(NSTR(node)->s);
01039 }
01040 break;
01041
01042 case NT_LIST:
01043 case NT_ALT:
/* free the element recursively, release this cell, then continue with
 * the next cell through the goto below */
01044 onig_node_free(NCAR(node));
01045 {
/* read the successor before the cell is released */
01046 Node* next_node = NCDR(node);
01047
01048 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01049 {
01050 FreeNode* n = (FreeNode* )node;
01051
01052 THREAD_ATOMIC_START;
01053 n->next = FreeNodeList;
01054 FreeNodeList = n;
01055 THREAD_ATOMIC_END;
01056 }
01057 #else
01058 xfree(node);
01059 #endif
01060 node = next_node;
01061 goto start;
01062 }
/* not reached: the goto above always leaves the switch */
01063 break;
01064
01065 case NT_CCLASS:
01066 {
01067 CClassNode* cc = NCCLASS(node);
01068
/* a shared character class is left completely alone: neither its
 * buffer nor the node itself is freed */
01069 if (IS_NCCLASS_SHARE(cc)) return ;
01070 if (cc->mbuf)
01071 bbuf_free(cc->mbuf);
01072 }
01073 break;
01074
01075 case NT_QTFR:
01076 if (NQTFR(node)->target)
01077 onig_node_free(NQTFR(node)->target);
01078 break;
01079
01080 case NT_ENCLOSE:
01081 if (NENCLOSE(node)->target)
01082 onig_node_free(NENCLOSE(node)->target);
01083 break;
01084
01085 case NT_BREF:
/* only the heap array is owned; back_static is part of the node */
01086 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
01087 xfree(NBREF(node)->back_dynamic);
01088 break;
01089
01090 case NT_ANCHOR:
01091 if (NANCHOR(node)->target)
01092 onig_node_free(NANCHOR(node)->target);
01093 break;
01094 }
01095
/* finally release the node itself (all types that reach this point) */
01096 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01097 {
01098 FreeNode* n = (FreeNode* )node;
01099
01100 THREAD_ATOMIC_START;
01101 n->next = FreeNodeList;
01102 FreeNodeList = n;
01103 THREAD_ATOMIC_END;
01104 }
01105 #else
01106 xfree(node);
01107 #endif
01108 }
01109
01110 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01111 extern int
01112 onig_free_node_list(void)
01113 {
01114 FreeNode* n;
01115
01116
01117 while (IS_NOT_NULL(FreeNodeList)) {
01118 n = FreeNodeList;
01119 FreeNodeList = FreeNodeList->next;
01120 xfree(n);
01121 }
01122
01123 return 0;
01124 }
01125 #endif
01126
01127 static Node*
01128 node_new(void)
01129 {
01130 Node* node;
01131
01132 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01133 THREAD_ATOMIC_START;
01134 if (IS_NOT_NULL(FreeNodeList)) {
01135 node = (Node* )FreeNodeList;
01136 FreeNodeList = FreeNodeList->next;
01137 THREAD_ATOMIC_END;
01138 return node;
01139 }
01140 THREAD_ATOMIC_END;
01141 #endif
01142
01143 node = (Node* )xmalloc(sizeof(Node));
01144
01145 return node;
01146 }
01147
01148
01149 static void
01150 initialize_cclass(CClassNode* cc)
01151 {
01152 BITSET_CLEAR(cc->bs);
01153
01154 cc->flags = 0;
01155 cc->mbuf = NULL;
01156 }
01157
01158 static Node*
01159 node_new_cclass(void)
01160 {
01161 Node* node = node_new();
01162 CHECK_NULL_RETURN(node);
01163
01164 SET_NTYPE(node, NT_CCLASS);
01165 initialize_cclass(NCCLASS(node));
01166 return node;
01167 }
01168
01169 static Node*
01170 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
01171 const OnigCodePoint ranges[])
01172 {
01173 int n, i;
01174 CClassNode* cc;
01175 OnigCodePoint j;
01176
01177 Node* node = node_new_cclass();
01178 CHECK_NULL_RETURN(node);
01179
01180 cc = NCCLASS(node);
01181 if (not != 0) NCCLASS_SET_NOT(cc);
01182
01183 BITSET_CLEAR(cc->bs);
01184 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
01185 n = ONIGENC_CODE_RANGE_NUM(ranges);
01186 for (i = 0; i < n; i++) {
01187 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
01188 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
01189 if (j >= sb_out) goto sb_end;
01190
01191 BITSET_SET_BIT(cc->bs, j);
01192 }
01193 }
01194 }
01195
01196 sb_end:
01197 if (IS_NULL(ranges)) {
01198 is_null:
01199 cc->mbuf = NULL;
01200 }
01201 else {
01202 BBuf* bbuf;
01203
01204 n = ONIGENC_CODE_RANGE_NUM(ranges);
01205 if (n == 0) goto is_null;
01206
01207 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
01208 CHECK_NULL_RETURN(bbuf);
01209 bbuf->alloc = n + 1;
01210 bbuf->used = n + 1;
01211 bbuf->p = (UChar* )((void* )ranges);
01212
01213 cc->mbuf = bbuf;
01214 }
01215
01216 return node;
01217 }
01218
01219 static Node*
01220 node_new_ctype(int type, int not, int ascii_range)
01221 {
01222 Node* node = node_new();
01223 CHECK_NULL_RETURN(node);
01224
01225 SET_NTYPE(node, NT_CTYPE);
01226 NCTYPE(node)->ctype = type;
01227 NCTYPE(node)->not = not;
01228 NCTYPE(node)->ascii_range = ascii_range;
01229 return node;
01230 }
01231
01232 static Node*
01233 node_new_anychar(void)
01234 {
01235 Node* node = node_new();
01236 CHECK_NULL_RETURN(node);
01237
01238 SET_NTYPE(node, NT_CANY);
01239 return node;
01240 }
01241
01242 static Node*
01243 node_new_list(Node* left, Node* right)
01244 {
01245 Node* node = node_new();
01246 CHECK_NULL_RETURN(node);
01247
01248 SET_NTYPE(node, NT_LIST);
01249 NCAR(node) = left;
01250 NCDR(node) = right;
01251 return node;
01252 }
01253
01254 extern Node*
01255 onig_node_new_list(Node* left, Node* right)
01256 {
01257 return node_new_list(left, right);
01258 }
01259
01260 extern Node*
01261 onig_node_list_add(Node* list, Node* x)
01262 {
01263 Node *n;
01264
01265 n = onig_node_new_list(x, NULL);
01266 if (IS_NULL(n)) return NULL_NODE;
01267
01268 if (IS_NOT_NULL(list)) {
01269 while (IS_NOT_NULL(NCDR(list)))
01270 list = NCDR(list);
01271
01272 NCDR(list) = n;
01273 }
01274
01275 return n;
01276 }
01277
01278 extern Node*
01279 onig_node_new_alt(Node* left, Node* right)
01280 {
01281 Node* node = node_new();
01282 CHECK_NULL_RETURN(node);
01283
01284 SET_NTYPE(node, NT_ALT);
01285 NCAR(node) = left;
01286 NCDR(node) = right;
01287 return node;
01288 }
01289
01290 extern Node*
01291 onig_node_new_anchor(int type)
01292 {
01293 Node* node = node_new();
01294 CHECK_NULL_RETURN(node);
01295
01296 SET_NTYPE(node, NT_ANCHOR);
01297 NANCHOR(node)->type = type;
01298 NANCHOR(node)->target = NULL;
01299 NANCHOR(node)->char_len = -1;
01300 NANCHOR(node)->ascii_range = 0;
01301 return node;
01302 }
01303
01304 static Node*
01305 node_new_backref(int back_num, int* backrefs, int by_name,
01306 #ifdef USE_BACKREF_WITH_LEVEL
01307 int exist_level, int nest_level,
01308 #endif
01309 ScanEnv* env)
01310 {
01311 int i;
01312 Node* node = node_new();
01313
01314 CHECK_NULL_RETURN(node);
01315
01316 SET_NTYPE(node, NT_BREF);
01317 NBREF(node)->state = 0;
01318 NBREF(node)->back_num = back_num;
01319 NBREF(node)->back_dynamic = (int* )NULL;
01320 if (by_name != 0)
01321 NBREF(node)->state |= NST_NAME_REF;
01322
01323 #ifdef USE_BACKREF_WITH_LEVEL
01324 if (exist_level != 0) {
01325 NBREF(node)->state |= NST_NEST_LEVEL;
01326 NBREF(node)->nest_level = nest_level;
01327 }
01328 #endif
01329
01330 for (i = 0; i < back_num; i++) {
01331 if (backrefs[i] <= env->num_mem &&
01332 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
01333 NBREF(node)->state |= NST_RECURSION;
01334 break;
01335 }
01336 }
01337
01338 if (back_num <= NODE_BACKREFS_SIZE) {
01339 for (i = 0; i < back_num; i++)
01340 NBREF(node)->back_static[i] = backrefs[i];
01341 }
01342 else {
01343 int* p = (int* )xmalloc(sizeof(int) * back_num);
01344 if (IS_NULL(p)) {
01345 onig_node_free(node);
01346 return NULL;
01347 }
01348 NBREF(node)->back_dynamic = p;
01349 for (i = 0; i < back_num; i++)
01350 p[i] = backrefs[i];
01351 }
01352 return node;
01353 }
01354
#ifdef USE_SUBEXP_CALL
/*
 * Create an NT_CALL node for group number `gnum`.
 * The [name, name_end) pointers are stored as given (no copy is made);
 * the call target starts out as NULL_NODE.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  CHECK_NULL_RETURN(call);

  SET_NTYPE(call, NT_CALL);
  NCALL(call)->name      = name;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->group_num = gnum;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;

  return call;
}
#endif
01371
01372 static Node*
01373 node_new_quantifier(int lower, int upper, int by_number)
01374 {
01375 Node* node = node_new();
01376 CHECK_NULL_RETURN(node);
01377
01378 SET_NTYPE(node, NT_QTFR);
01379 NQTFR(node)->state = 0;
01380 NQTFR(node)->target = NULL;
01381 NQTFR(node)->lower = lower;
01382 NQTFR(node)->upper = upper;
01383 NQTFR(node)->greedy = 1;
01384 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
01385 NQTFR(node)->head_exact = NULL_NODE;
01386 NQTFR(node)->next_head_exact = NULL_NODE;
01387 NQTFR(node)->is_refered = 0;
01388 if (by_number != 0)
01389 NQTFR(node)->state |= NST_BY_NUMBER;
01390
01391 #ifdef USE_COMBINATION_EXPLOSION_CHECK
01392 NQTFR(node)->comb_exp_check_num = 0;
01393 #endif
01394
01395 return node;
01396 }
01397
01398 static Node*
01399 node_new_enclose(int type)
01400 {
01401 Node* node = node_new();
01402 CHECK_NULL_RETURN(node);
01403
01404 SET_NTYPE(node, NT_ENCLOSE);
01405 NENCLOSE(node)->type = type;
01406 NENCLOSE(node)->state = 0;
01407 NENCLOSE(node)->regnum = 0;
01408 NENCLOSE(node)->option = 0;
01409 NENCLOSE(node)->target = NULL;
01410 NENCLOSE(node)->call_addr = -1;
01411 NENCLOSE(node)->opt_count = 0;
01412 return node;
01413 }
01414
01415 extern Node*
01416 onig_node_new_enclose(int type)
01417 {
01418 return node_new_enclose(type);
01419 }
01420
01421 static Node*
01422 node_new_enclose_memory(OnigOptionType option, int is_named)
01423 {
01424 Node* node = node_new_enclose(ENCLOSE_MEMORY);
01425 CHECK_NULL_RETURN(node);
01426 if (is_named != 0)
01427 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
01428
01429 #ifdef USE_SUBEXP_CALL
01430 NENCLOSE(node)->option = option;
01431 #endif
01432 return node;
01433 }
01434
01435 static Node*
01436 node_new_option(OnigOptionType option)
01437 {
01438 Node* node = node_new_enclose(ENCLOSE_OPTION);
01439 CHECK_NULL_RETURN(node);
01440 NENCLOSE(node)->option = option;
01441 return node;
01442 }
01443
01444 extern int
01445 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
01446 {
01447 ptrdiff_t addlen = end - s;
01448
01449 if (addlen > 0) {
01450 ptrdiff_t len = NSTR(node)->end - NSTR(node)->s;
01451
01452 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
01453 UChar* p;
01454 ptrdiff_t capa = len + addlen + NODE_STR_MARGIN;
01455
01456 if (capa <= NSTR(node)->capa) {
01457 onig_strcpy(NSTR(node)->s + len, s, end);
01458 }
01459 else {
01460 if (NSTR(node)->s == NSTR(node)->buf)
01461 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
01462 s, end, capa);
01463 else
01464 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
01465
01466 CHECK_NULL_RETURN_MEMERR(p);
01467 NSTR(node)->s = p;
01468 NSTR(node)->capa = (int )capa;
01469 }
01470 }
01471 else {
01472 onig_strcpy(NSTR(node)->s + len, s, end);
01473 }
01474 NSTR(node)->end = NSTR(node)->s + len + addlen;
01475 }
01476
01477 return 0;
01478 }
01479
01480 extern int
01481 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
01482 {
01483 onig_node_str_clear(node);
01484 return onig_node_str_cat(node, s, end);
01485 }
01486
01487 static int
01488 node_str_cat_char(Node* node, UChar c)
01489 {
01490 UChar s[1];
01491
01492 s[0] = c;
01493 return onig_node_str_cat(node, s, s + 1);
01494 }
01495
01496 static int
01497 node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c)
01498 {
01499 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
01500 int num = ONIGENC_CODE_TO_MBC(enc, c, buf);
01501 if (num < 0) return num;
01502 return onig_node_str_cat(node, buf, buf + num);
01503 }
01504
01505 extern void
01506 onig_node_conv_to_str_node(Node* node, int flag)
01507 {
01508 SET_NTYPE(node, NT_STR);
01509 NSTR(node)->flag = flag;
01510 NSTR(node)->capa = 0;
01511 NSTR(node)->s = NSTR(node)->buf;
01512 NSTR(node)->end = NSTR(node)->buf;
01513 }
01514
01515 extern void
01516 onig_node_str_clear(Node* node)
01517 {
01518 if (NSTR(node)->capa != 0 &&
01519 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
01520 xfree(NSTR(node)->s);
01521 }
01522
01523 NSTR(node)->capa = 0;
01524 NSTR(node)->flag = 0;
01525 NSTR(node)->s = NSTR(node)->buf;
01526 NSTR(node)->end = NSTR(node)->buf;
01527 }
01528
01529 static Node*
01530 node_new_str(const UChar* s, const UChar* end)
01531 {
01532 Node* node = node_new();
01533 CHECK_NULL_RETURN(node);
01534
01535 SET_NTYPE(node, NT_STR);
01536 NSTR(node)->capa = 0;
01537 NSTR(node)->flag = 0;
01538 NSTR(node)->s = NSTR(node)->buf;
01539 NSTR(node)->end = NSTR(node)->buf;
01540 if (onig_node_str_cat(node, s, end)) {
01541 onig_node_free(node);
01542 return NULL;
01543 }
01544 return node;
01545 }
01546
01547 extern Node*
01548 onig_node_new_str(const UChar* s, const UChar* end)
01549 {
01550 return node_new_str(s, end);
01551 }
01552
01553 static Node*
01554 node_new_str_raw(UChar* s, UChar* end)
01555 {
01556 Node* node = node_new_str(s, end);
01557 if (IS_NOT_NULL(node))
01558 NSTRING_SET_RAW(node);
01559 return node;
01560 }
01561
01562 static Node*
01563 node_new_empty(void)
01564 {
01565 return node_new_str(NULL, NULL);
01566 }
01567
01568 static Node*
01569 node_new_str_raw_char(UChar c)
01570 {
01571 UChar p[1];
01572
01573 p[0] = c;
01574 return node_new_str_raw(p, p + 1);
01575 }
01576
01577 static Node*
01578 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
01579 {
01580 const UChar *p;
01581 Node* n = NULL_NODE;
01582
01583 if (sn->end > sn->s) {
01584 p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end);
01585 if (p && p > sn->s) {
01586 n = node_new_str(p, sn->end);
01587 if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0)
01588 NSTRING_SET_RAW(n);
01589 sn->end = (UChar* )p;
01590 }
01591 }
01592 return n;
01593 }
01594
01595 static int
01596 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
01597 {
01598 if (sn->end > sn->s) {
01599 return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0);
01600 }
01601 return 0;
01602 }
01603
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Insert `num` bytes of value `val` in front of the string held by `sn`.
 *
 * The current content is first copied to a local scratch buffer, written
 * back `num` bytes further right, and the freed-up head is filled with
 * `val`.
 *
 * Returns 0. (The function is declared `int` but previously fell off its
 * end without returning a value, which is undefined behaviour as soon as a
 * caller looks at the result.)
 *
 * NOTE(review): there is no bounds check. The content must fit in
 * NODE_STR_BUF_SIZE bytes (size of the scratch buffer) and sn->s must have
 * room for `num` extra bytes -- confirm against the callers.
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = (int )(sn->end - sn->s);
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }

  return 0;
}
#endif
01621
01622 extern int
01623 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
01624 {
01625 unsigned int num, val;
01626 OnigCodePoint c;
01627 UChar* p = *src;
01628 PFETCH_READY;
01629
01630 num = 0;
01631 while (!PEND) {
01632 PFETCH(c);
01633 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
01634 val = (unsigned int )DIGITVAL(c);
01635 if ((INT_MAX_LIMIT - val) / 10UL < num)
01636 return -1;
01637
01638 num = num * 10 + val;
01639 }
01640 else {
01641 PUNFETCH;
01642 break;
01643 }
01644 }
01645 *src = p;
01646 return num;
01647 }
01648
01649 static int
01650 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen,
01651 int maxlen, OnigEncoding enc)
01652 {
01653 OnigCodePoint c;
01654 unsigned int num, val;
01655 int restlen;
01656 UChar* p = *src;
01657 PFETCH_READY;
01658
01659 restlen = maxlen - minlen;
01660 num = 0;
01661 while (!PEND && maxlen-- != 0) {
01662 PFETCH(c);
01663 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
01664 val = (unsigned int )XDIGITVAL(enc,c);
01665 if ((INT_MAX_LIMIT - val) / 16UL < num)
01666 return -1;
01667
01668 num = (num << 4) + XDIGITVAL(enc,c);
01669 }
01670 else {
01671 PUNFETCH;
01672 break;
01673 }
01674 }
01675 if (maxlen > restlen)
01676 return -2;
01677 *src = p;
01678 return num;
01679 }
01680
01681 static int
01682 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
01683 OnigEncoding enc)
01684 {
01685 OnigCodePoint c;
01686 unsigned int num, val;
01687 UChar* p = *src;
01688 PFETCH_READY;
01689
01690 num = 0;
01691 while (!PEND && maxlen-- != 0) {
01692 PFETCH(c);
01693 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
01694 val = ODIGITVAL(c);
01695 if ((INT_MAX_LIMIT - val) / 8UL < num)
01696 return -1;
01697
01698 num = (num << 3) + val;
01699 }
01700 else {
01701 PUNFETCH;
01702 break;
01703 }
01704 }
01705 *src = p;
01706 return num;
01707 }
01708
01709
/* Write the SIZE_CODE_POINT bytes of `code` at byte offset `pos` of `bbuf`
 * through BBUF_WRITE. `code` must be an lvalue: its address is taken. */
01710 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
01711 BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
01712
01713
01714
01715
01716
01717 static int
01718 new_code_range(BBuf** pbuf)
01719 {
01720 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
01721 int r;
01722 OnigCodePoint n;
01723 BBuf* bbuf;
01724
01725 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
01726 CHECK_NULL_RETURN_MEMERR(*pbuf);
01727 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
01728 if (r) return r;
01729
01730 n = 0;
01731 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
01732 return 0;
01733 }
01734
01735 static int
01736 add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to,
01737 int checkdup)
01738 {
01739 int r, inc_n, pos;
01740 OnigCodePoint low, high, bound, x;
01741 OnigCodePoint n, *data;
01742 BBuf* bbuf;
01743
01744 if (from > to) {
01745 n = from; from = to; to = n;
01746 }
01747
01748 if (IS_NULL(*pbuf)) {
01749 r = new_code_range(pbuf);
01750 if (r) return r;
01751 bbuf = *pbuf;
01752 n = 0;
01753 }
01754 else {
01755 bbuf = *pbuf;
01756 GET_CODE_POINT(n, bbuf->p);
01757 }
01758 data = (OnigCodePoint* )(bbuf->p);
01759 data++;
01760
01761 bound = (from == 0) ? 0 : n;
01762 for (low = 0; low < bound; ) {
01763 x = (low + bound) >> 1;
01764 if (from - 1 > data[x*2 + 1])
01765 low = x + 1;
01766 else
01767 bound = x;
01768 }
01769
01770 high = (to == ONIG_LAST_CODE_POINT) ? n : low;
01771 for (bound = n; high < bound; ) {
01772 x = (high + bound) >> 1;
01773 if (to + 1 >= data[x*2])
01774 high = x + 1;
01775 else
01776 bound = x;
01777 }
01778
01779
01780
01781
01782 inc_n = low + 1 - high;
01783 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
01784 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
01785
01786 if (inc_n != 1) {
01787 if (checkdup && from <= data[low*2+1]
01788 && (data[low*2] <= from || data[low*2+1] <= to))
01789 CC_DUP_WARN(env);
01790 if (from > data[low*2])
01791 from = data[low*2];
01792 if (to < data[(high - 1)*2 + 1])
01793 to = data[(high - 1)*2 + 1];
01794 }
01795
01796 if (inc_n != 0) {
01797 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
01798 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
01799
01800 if (inc_n > 0) {
01801 if (high < n) {
01802 int size = (n - high) * 2 * SIZE_CODE_POINT;
01803 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
01804 }
01805 }
01806 else {
01807 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
01808 }
01809 }
01810
01811 pos = SIZE_CODE_POINT * (1 + low * 2);
01812 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
01813 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
01814 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
01815 n += inc_n;
01816 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
01817
01818 return 0;
01819 }
01820
01821 static int
01822 add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
01823 {
01824 return add_code_range_to_buf0(pbuf, env, from, to, 1);
01825 }
01826
01827 static int
01828 add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup)
01829 {
01830 if (from > to) {
01831 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
01832 return 0;
01833 else
01834 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
01835 }
01836
01837 return add_code_range_to_buf0(pbuf, env, from, to, checkdup);
01838 }
01839
01840 static int
01841 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
01842 {
01843 return add_code_range0(pbuf, env, from, to, 1);
01844 }
01845
01846 static int
01847 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env)
01848 {
01849 int r, i, n;
01850 OnigCodePoint pre, from, *data, to = 0;
01851
01852 *pbuf = (BBuf* )NULL;
01853 if (IS_NULL(bbuf)) {
01854 set_all:
01855 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01856 }
01857
01858 data = (OnigCodePoint* )(bbuf->p);
01859 GET_CODE_POINT(n, data);
01860 data++;
01861 if (n <= 0) goto set_all;
01862
01863 r = 0;
01864 pre = MBCODE_START_POS(enc);
01865 for (i = 0; i < n; i++) {
01866 from = data[i*2];
01867 to = data[i*2+1];
01868 if (pre <= from - 1) {
01869 r = add_code_range_to_buf(pbuf, env, pre, from - 1);
01870 if (r != 0) return r;
01871 }
01872 if (to == ONIG_LAST_CODE_POINT) break;
01873 pre = to + 1;
01874 }
01875 if (to < ONIG_LAST_CODE_POINT) {
01876 r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT);
01877 }
01878 return r;
01879 }
01880
/* Exchange two (range buffer, negation flag) pairs in place.
 * All four arguments are assigned to, so each must be a modifiable lvalue. */
01881 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
01882 BBuf *tbuf; \
01883 int tnot; \
01884 tnot = not1; not1 = not2; not2 = tnot; \
01885 tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
01886 } while (0)
01887
01888 static int
01889 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
01890 BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
01891 {
01892 int r;
01893 OnigCodePoint i, n1, *data1;
01894 OnigCodePoint from, to;
01895
01896 *pbuf = (BBuf* )NULL;
01897 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
01898 if (not1 != 0 || not2 != 0)
01899 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01900 return 0;
01901 }
01902
01903 r = 0;
01904 if (IS_NULL(bbuf2))
01905 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
01906
01907 if (IS_NULL(bbuf1)) {
01908 if (not1 != 0) {
01909 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01910 }
01911 else {
01912 if (not2 == 0) {
01913 return bbuf_clone(pbuf, bbuf2);
01914 }
01915 else {
01916 return not_code_range_buf(enc, bbuf2, pbuf, env);
01917 }
01918 }
01919 }
01920
01921 if (not1 != 0)
01922 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
01923
01924 data1 = (OnigCodePoint* )(bbuf1->p);
01925 GET_CODE_POINT(n1, data1);
01926 data1++;
01927
01928 if (not2 == 0 && not1 == 0) {
01929 r = bbuf_clone(pbuf, bbuf2);
01930 }
01931 else if (not1 == 0) {
01932 r = not_code_range_buf(enc, bbuf2, pbuf, env);
01933 }
01934 if (r != 0) return r;
01935
01936 for (i = 0; i < n1; i++) {
01937 from = data1[i*2];
01938 to = data1[i*2+1];
01939 r = add_code_range_to_buf(pbuf, env, from, to);
01940 if (r != 0) return r;
01941 }
01942 return 0;
01943 }
01944
01945 static int
01946 and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1,
01947 OnigCodePoint* data, int n)
01948 {
01949 int i, r;
01950 OnigCodePoint from2, to2;
01951
01952 for (i = 0; i < n; i++) {
01953 from2 = data[i*2];
01954 to2 = data[i*2+1];
01955 if (from2 < from1) {
01956 if (to2 < from1) continue;
01957 else {
01958 from1 = to2 + 1;
01959 }
01960 }
01961 else if (from2 <= to1) {
01962 if (to2 < to1) {
01963 if (from1 <= from2 - 1) {
01964 r = add_code_range_to_buf(pbuf, env, from1, from2-1);
01965 if (r != 0) return r;
01966 }
01967 from1 = to2 + 1;
01968 }
01969 else {
01970 to1 = from2 - 1;
01971 }
01972 }
01973 else {
01974 from1 = from2;
01975 }
01976 if (from1 > to1) break;
01977 }
01978 if (from1 <= to1) {
01979 r = add_code_range_to_buf(pbuf, env, from1, to1);
01980 if (r != 0) return r;
01981 }
01982 return 0;
01983 }
01984
01985 static int
01986 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
01987 {
01988 int r;
01989 OnigCodePoint i, j, n1, n2, *data1, *data2;
01990 OnigCodePoint from, to, from1, to1, from2, to2;
01991
01992 *pbuf = (BBuf* )NULL;
01993 if (IS_NULL(bbuf1)) {
01994 if (not1 != 0 && IS_NOT_NULL(bbuf2))
01995 return bbuf_clone(pbuf, bbuf2);
01996 return 0;
01997 }
01998 else if (IS_NULL(bbuf2)) {
01999 if (not2 != 0)
02000 return bbuf_clone(pbuf, bbuf1);
02001 return 0;
02002 }
02003
02004 if (not1 != 0)
02005 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
02006
02007 data1 = (OnigCodePoint* )(bbuf1->p);
02008 data2 = (OnigCodePoint* )(bbuf2->p);
02009 GET_CODE_POINT(n1, data1);
02010 GET_CODE_POINT(n2, data2);
02011 data1++;
02012 data2++;
02013
02014 if (not2 == 0 && not1 == 0) {
02015 for (i = 0; i < n1; i++) {
02016 from1 = data1[i*2];
02017 to1 = data1[i*2+1];
02018 for (j = 0; j < n2; j++) {
02019 from2 = data2[j*2];
02020 to2 = data2[j*2+1];
02021 if (from2 > to1) break;
02022 if (to2 < from1) continue;
02023 from = MAX(from1, from2);
02024 to = MIN(to1, to2);
02025 r = add_code_range_to_buf(pbuf, env, from, to);
02026 if (r != 0) return r;
02027 }
02028 }
02029 }
02030 else if (not1 == 0) {
02031 for (i = 0; i < n1; i++) {
02032 from1 = data1[i*2];
02033 to1 = data1[i*2+1];
02034 r = and_code_range1(pbuf, env, from1, to1, data2, n2);
02035 if (r != 0) return r;
02036 }
02037 }
02038
02039 return 0;
02040 }
02041
02042 static int
02043 and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
02044 {
02045 OnigEncoding enc = env->enc;
02046 int r, not1, not2;
02047 BBuf *buf1, *buf2, *pbuf = 0;
02048 BitSetRef bsr1, bsr2;
02049 BitSet bs1, bs2;
02050
02051 not1 = IS_NCCLASS_NOT(dest);
02052 bsr1 = dest->bs;
02053 buf1 = dest->mbuf;
02054 not2 = IS_NCCLASS_NOT(cc);
02055 bsr2 = cc->bs;
02056 buf2 = cc->mbuf;
02057
02058 if (not1 != 0) {
02059 bitset_invert_to(bsr1, bs1);
02060 bsr1 = bs1;
02061 }
02062 if (not2 != 0) {
02063 bitset_invert_to(bsr2, bs2);
02064 bsr2 = bs2;
02065 }
02066 bitset_and(bsr1, bsr2);
02067 if (bsr1 != dest->bs) {
02068 bitset_copy(dest->bs, bsr1);
02069 bsr1 = dest->bs;
02070 }
02071 if (not1 != 0) {
02072 bitset_invert(dest->bs);
02073 }
02074
02075 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
02076 if (not1 != 0 && not2 != 0) {
02077 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env);
02078 }
02079 else {
02080 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env);
02081 if (r == 0 && not1 != 0) {
02082 BBuf *tbuf = 0;
02083 r = not_code_range_buf(enc, pbuf, &tbuf, env);
02084 bbuf_free(pbuf);
02085 pbuf = tbuf;
02086 }
02087 }
02088 if (r != 0) {
02089 bbuf_free(pbuf);
02090 return r;
02091 }
02092
02093 dest->mbuf = pbuf;
02094 bbuf_free(buf1);
02095 return r;
02096 }
02097 return 0;
02098 }
02099
02100 static int
02101 or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
02102 {
02103 OnigEncoding enc = env->enc;
02104 int r, not1, not2;
02105 BBuf *buf1, *buf2, *pbuf = 0;
02106 BitSetRef bsr1, bsr2;
02107 BitSet bs1, bs2;
02108
02109 not1 = IS_NCCLASS_NOT(dest);
02110 bsr1 = dest->bs;
02111 buf1 = dest->mbuf;
02112 not2 = IS_NCCLASS_NOT(cc);
02113 bsr2 = cc->bs;
02114 buf2 = cc->mbuf;
02115
02116 if (not1 != 0) {
02117 bitset_invert_to(bsr1, bs1);
02118 bsr1 = bs1;
02119 }
02120 if (not2 != 0) {
02121 bitset_invert_to(bsr2, bs2);
02122 bsr2 = bs2;
02123 }
02124 bitset_or(bsr1, bsr2);
02125 if (bsr1 != dest->bs) {
02126 bitset_copy(dest->bs, bsr1);
02127 bsr1 = dest->bs;
02128 }
02129 if (not1 != 0) {
02130 bitset_invert(dest->bs);
02131 }
02132
02133 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
02134 if (not1 != 0 && not2 != 0) {
02135 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env);
02136 }
02137 else {
02138 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env);
02139 if (r == 0 && not1 != 0) {
02140 BBuf *tbuf = 0;
02141 r = not_code_range_buf(enc, pbuf, &tbuf, env);
02142 bbuf_free(pbuf);
02143 pbuf = tbuf;
02144 }
02145 }
02146 if (r != 0) {
02147 bbuf_free(pbuf);
02148 return r;
02149 }
02150
02151 dest->mbuf = pbuf;
02152 bbuf_free(buf1);
02153 return r;
02154 }
02155 else
02156 return 0;
02157 }
02158
02159 static void UNKNOWN_ESC_WARN(ScanEnv *env, int c);
02160
02161 static int
02162 conv_backslash_value(int c, ScanEnv* env)
02163 {
02164 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
02165 switch (c) {
02166 case 'n': return '\n';
02167 case 't': return '\t';
02168 case 'r': return '\r';
02169 case 'f': return '\f';
02170 case 'a': return '\007';
02171 case 'b': return '\010';
02172 case 'e': return '\033';
02173 case 'v':
02174 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
02175 return '\v';
02176 break;
02177
02178 default:
02179 if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
02180 UNKNOWN_ESC_WARN(env, c);
02181 break;
02182 }
02183 }
02184 return c;
02185 }
02186
02187 #ifdef USE_NO_INVALID_QUANTIFIER
02188 #define is_invalid_quantifier_target(node) 0
02189 #else
02190 static int
02191 is_invalid_quantifier_target(Node* node)
02192 {
02193 switch (NTYPE(node)) {
02194 case NT_ANCHOR:
02195 return 1;
02196 break;
02197
02198 case NT_ENCLOSE:
02199
02200
02201 break;
02202
02203 case NT_LIST:
02204 do {
02205 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
02206 } while (IS_NOT_NULL(node = NCDR(node)));
02207 return 0;
02208 break;
02209
02210 case NT_ALT:
02211 do {
02212 if (is_invalid_quantifier_target(NCAR(node))) return 1;
02213 } while (IS_NOT_NULL(node = NCDR(node)));
02214 break;
02215
02216 default:
02217 break;
02218 }
02219 return 0;
02220 }
02221 #endif
02222
02223
02224 static int
02225 popular_quantifier_num(QtfrNode* q)
02226 {
02227 if (q->greedy) {
02228 if (q->lower == 0) {
02229 if (q->upper == 1) return 0;
02230 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
02231 }
02232 else if (q->lower == 1) {
02233 if (IS_REPEAT_INFINITE(q->upper)) return 2;
02234 }
02235 }
02236 else {
02237 if (q->lower == 0) {
02238 if (q->upper == 1) return 3;
02239 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
02240 }
02241 else if (q->lower == 1) {
02242 if (IS_REPEAT_INFINITE(q->upper)) return 5;
02243 }
02244 }
02245 return -1;
02246 }
02247
02248
/* Ways onig_reduce_nested_quantifier() can simplify a quantifier (child)
 * that is the direct target of another quantifier (parent). */
02249 enum ReduceType {
02250 RQ_ASIS = 0, /* no simplification: parent just targets the child */
02251 RQ_DEL = 1, /* parent node is overwritten with a copy of the child */
02252 RQ_A, /* parent becomes greedy {0,inf} on the child's target */
02253 RQ_AQ, /* parent becomes non-greedy {0,inf} on the child's target */
02254 RQ_QQ, /* parent becomes non-greedy {0,1} on the child's target */
02255 RQ_P_QQ, /* child: greedy {1,inf}; parent: non-greedy {0,1} around it */
02256 RQ_PQ_Q /* child: non-greedy {1,inf}; parent: greedy {0,1} around it */
02257 };
02258
/* Reduction to apply for each pair of common quantifier forms.
 * Indexed as ReduceTypeTable[child][parent], where both indices are values
 * returned by popular_quantifier_num() (0..5). */
02259 static enum ReduceType const ReduceTypeTable[6][6] = {
02260 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* child form 0 */
02261 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* child form 1 */
02262 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* child form 2 */
02263 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* child form 3 */
02264 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* child form 4 */
02265 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* child form 5 */
02266 };
02267
02268 extern void
02269 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
02270 {
02271 int pnum, cnum;
02272 QtfrNode *p, *c;
02273
02274 p = NQTFR(pnode);
02275 c = NQTFR(cnode);
02276 pnum = popular_quantifier_num(p);
02277 cnum = popular_quantifier_num(c);
02278 if (pnum < 0 || cnum < 0) return ;
02279
02280 switch (ReduceTypeTable[cnum][pnum]) {
02281 case RQ_DEL:
02282 *pnode = *cnode;
02283 break;
02284 case RQ_A:
02285 p->target = c->target;
02286 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
02287 break;
02288 case RQ_AQ:
02289 p->target = c->target;
02290 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
02291 break;
02292 case RQ_QQ:
02293 p->target = c->target;
02294 p->lower = 0; p->upper = 1; p->greedy = 0;
02295 break;
02296 case RQ_P_QQ:
02297 p->target = cnode;
02298 p->lower = 0; p->upper = 1; p->greedy = 0;
02299 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
02300 return ;
02301 break;
02302 case RQ_PQ_Q:
02303 p->target = cnode;
02304 p->lower = 0; p->upper = 1; p->greedy = 1;
02305 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
02306 return ;
02307 break;
02308 case RQ_ASIS:
02309 p->target = cnode;
02310 return ;
02311 break;
02312 }
02313
02314 c->target = NULL_NODE;
02315 onig_node_free(cnode);
02316 }
02317
02318
/* Kinds of token stored in OnigToken.type. */
02319 enum TokenSyms {
02320 TK_EOT = 0,
02321 TK_RAW_BYTE = 1,
02322 TK_CHAR,
02323 TK_STRING,
02324 TK_CODE_POINT,
02325 TK_ANYCHAR,
02326 TK_CHAR_TYPE,
02327 TK_BACKREF,
02328 TK_CALL,
02329 TK_ANCHOR,
02330 TK_OP_REPEAT,
02331 TK_INTERVAL, /* set by fetch_range_quantifier() for a {low,up} interval */
02332 TK_ANYCHAR_ANYTIME,
02333 TK_ALT,
02334 TK_SUBEXP_OPEN,
02335 TK_SUBEXP_CLOSE,
02336 TK_CC_OPEN,
02337 TK_QUOTE_OPEN,
02338 TK_CHAR_PROPERTY,
02339 TK_LINEBREAK,
02340 TK_EXTENDED_GRAPHEME_CLUSTER,
02341 TK_KEEP,
02342 /* NOTE(review): the remaining kinds presumably occur only inside a character class -- confirm */
02343 TK_CC_CLOSE,
02344 TK_CC_RANGE,
02345 TK_POSIX_BRACKET_OPEN,
02346 TK_CC_AND,
02347 TK_CC_CC_OPEN
02348 };
02349
/* One token of the pattern: its kind (`type`) plus a payload in the union
 * `u`; which member of `u` is meaningful presumably depends on `type`. */
02350 typedef struct {
02351 enum TokenSyms type;
02352 int escaped;
02353 int base;
02354 UChar* backp;
02355 union {
02356 UChar* s;
02357 int c;
02358 OnigCodePoint code;
02359 struct {
02360 int subtype;
02361 int ascii_range;
02362 } anchor;
02363 struct { /* lower/upper are filled in by fetch_range_quantifier() */
02364 int lower;
02365 int upper;
02366 int greedy;
02367 int possessive;
02368 } repeat;
02369 struct {
02370 int num;
02371 int ref1;
02372 int* refs;
02373 int by_name;
02374 #ifdef USE_BACKREF_WITH_LEVEL
02375 int exist_level;
02376 int level;
02377 #endif
02378 } backref;
02379 struct {
02380 UChar* name;
02381 UChar* name_end;
02382 int gnum;
02383 int rel;
02384 } call;
02385 struct {
02386 int ctype;
02387 int not;
02388 } prop;
02389 } u;
02390 } OnigToken;
02391
02392
02393 static int
02394 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
02395 {
02396 int low, up, syn_allow, non_low = 0;
02397 int r = 0;
02398 OnigCodePoint c;
02399 OnigEncoding enc = env->enc;
02400 UChar* p = *src;
02401 PFETCH_READY;
02402
02403 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
02404
02405 if (PEND) {
02406 if (syn_allow)
02407 return 1;
02408 else
02409 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
02410 }
02411
02412 if (! syn_allow) {
02413 c = PPEEK;
02414 if (c == ')' || c == '(' || c == '|') {
02415 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
02416 }
02417 }
02418
02419 low = onig_scan_unsigned_number(&p, end, env->enc);
02420 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02421 if (low > ONIG_MAX_REPEAT_NUM)
02422 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02423
02424 if (p == *src) {
02425 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
02426
02427 low = 0;
02428 non_low = 1;
02429 }
02430 else
02431 goto invalid;
02432 }
02433
02434 if (PEND) goto invalid;
02435 PFETCH(c);
02436 if (c == ',') {
02437 UChar* prev = p;
02438 up = onig_scan_unsigned_number(&p, end, env->enc);
02439 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02440 if (up > ONIG_MAX_REPEAT_NUM)
02441 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02442
02443 if (p == prev) {
02444 if (non_low != 0)
02445 goto invalid;
02446 up = REPEAT_INFINITE;
02447 }
02448 }
02449 else {
02450 if (non_low != 0)
02451 goto invalid;
02452
02453 PUNFETCH;
02454 up = low;
02455 r = 2;
02456 }
02457
02458 if (PEND) goto invalid;
02459 PFETCH(c);
02460 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
02461 if (c != MC_ESC(env->syntax)) goto invalid;
02462 PFETCH(c);
02463 }
02464 if (c != '}') goto invalid;
02465
02466 if (!IS_REPEAT_INFINITE(up) && low > up) {
02467 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
02468 }
02469
02470 tok->type = TK_INTERVAL;
02471 tok->u.repeat.lower = low;
02472 tok->u.repeat.upper = up;
02473 *src = p;
02474 return r;
02475
02476 invalid:
02477 if (syn_allow)
02478 return 1;
02479 else
02480 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
02481 }
02482
02483
02484 static int
02485 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
02486 {
02487 int v;
02488 OnigCodePoint c;
02489 OnigEncoding enc = env->enc;
02490 UChar* p = *src;
02491 PFETCH_READY;
02492
02493 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
02494
02495 PFETCH(c);
02496 switch (c) {
02497 case 'M':
02498 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
02499 if (PEND) return ONIGERR_END_PATTERN_AT_META;
02500 PFETCH(c);
02501 if (c != '-') return ONIGERR_META_CODE_SYNTAX;
02502 if (PEND) return ONIGERR_END_PATTERN_AT_META;
02503 PFETCH(c);
02504 if (c == MC_ESC(env->syntax)) {
02505 v = fetch_escaped_value(&p, end, env);
02506 if (v < 0) return v;
02507 c = (OnigCodePoint )v;
02508 }
02509 c = ((c & 0xff) | 0x80);
02510 }
02511 else
02512 goto backslash;
02513 break;
02514
02515 case 'C':
02516 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
02517 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
02518 PFETCH(c);
02519 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
02520 goto control;
02521 }
02522 else
02523 goto backslash;
02524
02525 case 'c':
02526 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
02527 control:
02528 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
02529 PFETCH(c);
02530 if (c == '?') {
02531 c = 0177;
02532 }
02533 else {
02534 if (c == MC_ESC(env->syntax)) {
02535 v = fetch_escaped_value(&p, end, env);
02536 if (v < 0) return v;
02537 c = (OnigCodePoint )v;
02538 }
02539 c &= 0x9f;
02540 }
02541 break;
02542 }
02543
02544
02545 default:
02546 {
02547 backslash:
02548 c = conv_backslash_value(c, env);
02549 }
02550 break;
02551 }
02552
02553 *src = p;
02554 return c;
02555 }
02556
/* Forward declaration; being static, fetch_token() is defined elsewhere in this file. */
02557 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
02558
02559 static OnigCodePoint
02560 get_name_end_code_point(OnigCodePoint start)
02561 {
02562 switch (start) {
02563 case '<': return (OnigCodePoint )'>'; break;
02564 case '\'': return (OnigCodePoint )'\''; break;
02565 case '(': return (OnigCodePoint )')'; break;
02566 case '{': return (OnigCodePoint )'}'; break;
02567 default:
02568 break;
02569 }
02570
02571 return (OnigCodePoint )0;
02572 }
02573
02574 #ifdef USE_NAMED_GROUP
#ifdef USE_BACKREF_WITH_LEVEL
/*
 * Parse a group name or number with an optional nest level, as in
 *   name+n, name-n, num+n, num-n, -num+n, -num-n
 * starting at *src and ending at the delimiter that matches `start_code`.
 *
 * Outputs:
 *   *rname_end - end of the name/number part (before any level suffix)
 *   *rback_num - the signed group number when a number was given, else 0
 *   *rlevel    - the signed level, written only when a level is present
 *   *src       - advanced past the closing delimiter on success
 *
 * Returns 1 when a level was present, 0 when not, and an ONIGERR_* code on
 * failure. For errors detected on the common exit path the offending text
 * is recorded with onig_scan_env_set_error_string().
 *
 * Fix: the character after the level sign and the character after the level
 * digits were fetched without checking for the end of the pattern, reading
 * past `end` for input that stops at the sign or right after the digits.
 * Both cases now end up as ONIGERR_INVALID_GROUP_NAME.
 */
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel)
{
  int r, sign, is_num, exist_level;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  is_num = exist_level = 0;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  name_end = end;
  r = 0;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      is_num = 1;
    }
    else if (c == '-') {
      /* is_num == 2: a minus sign was seen but no digit yet */
      is_num = 2;
      sign = -1;
      pnum_head = p;
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (is_num != 0) {
      if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
        is_num = 1;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) goto err;  /* the sign is the last character of the pattern */
      PFETCH(c);
      if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
      PUNFETCH;
      level = onig_scan_unsigned_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      if (!PEND) {  /* the closing delimiter must follow the level */
        PFETCH(c);
        if (c == end_code)
          goto end;
      }
    }

  err:
    r = ONIGERR_INVALID_GROUP_NAME;
    name_end = end;
  }

 end:
  if (r == 0) {
    if (is_num != 0) {
      *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) goto err;  /* group number 0 is rejected */

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
#endif /* USE_BACKREF_WITH_LEVEL */
02690
02691
02692
02693
02694
02695 static int
02696 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
02697 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
02698 {
02699 int r, is_num, sign;
02700 OnigCodePoint end_code;
02701 OnigCodePoint c = 0;
02702 OnigEncoding enc = env->enc;
02703 UChar *name_end;
02704 UChar *pnum_head;
02705 UChar *p = *src;
02706 PFETCH_READY;
02707
02708 *rback_num = 0;
02709
02710 end_code = get_name_end_code_point(start_code);
02711
02712 name_end = end;
02713 pnum_head = *src;
02714 r = 0;
02715 is_num = 0;
02716 sign = 1;
02717 if (PEND) {
02718 return ONIGERR_EMPTY_GROUP_NAME;
02719 }
02720 else {
02721 PFETCH(c);
02722 if (c == end_code)
02723 return ONIGERR_EMPTY_GROUP_NAME;
02724
02725 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02726 if (ref == 1)
02727 is_num = 1;
02728 else {
02729 r = ONIGERR_INVALID_GROUP_NAME;
02730 is_num = 0;
02731 }
02732 }
02733 else if (c == '-') {
02734 if (ref == 1) {
02735 is_num = 2;
02736 sign = -1;
02737 pnum_head = p;
02738 }
02739 else {
02740 r = ONIGERR_INVALID_GROUP_NAME;
02741 is_num = 0;
02742 }
02743 }
02744 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
02745 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02746 }
02747 }
02748
02749 if (r == 0) {
02750 while (!PEND) {
02751 name_end = p;
02752 PFETCH(c);
02753 if (c == end_code || c == ')') {
02754 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
02755 break;
02756 }
02757
02758 if (is_num != 0) {
02759 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02760 is_num = 1;
02761 }
02762 else {
02763 if (!ONIGENC_IS_CODE_WORD(enc, c))
02764 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02765 else
02766 r = ONIGERR_INVALID_GROUP_NAME;
02767
02768 is_num = 0;
02769 }
02770 }
02771 else {
02772 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
02773 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02774 }
02775 }
02776 }
02777
02778 if (c != end_code) {
02779 r = ONIGERR_INVALID_GROUP_NAME;
02780 name_end = end;
02781 }
02782
02783 if (is_num != 0) {
02784 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
02785 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
02786 else if (*rback_num == 0) {
02787 r = ONIGERR_INVALID_GROUP_NAME;
02788 goto err;
02789 }
02790
02791 *rback_num *= sign;
02792 }
02793
02794 *rname_end = name_end;
02795 *src = p;
02796 return 0;
02797 }
02798 else {
02799 while (!PEND) {
02800 name_end = p;
02801 PFETCH(c);
02802 if (c == end_code || c == ')')
02803 break;
02804 }
02805 if (PEND)
02806 name_end = end;
02807
02808 err:
02809 onig_scan_env_set_error_string(env, r, *src, name_end);
02810 return r;
02811 }
02812 }
02813 #else
02814 static int
02815 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
02816 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
02817 {
02818 int r, is_num, sign;
02819 OnigCodePoint end_code;
02820 OnigCodePoint c = 0;
02821 UChar *name_end;
02822 OnigEncoding enc = env->enc;
02823 UChar *pnum_head;
02824 UChar *p = *src;
02825 PFETCH_READY;
02826
02827 *rback_num = 0;
02828
02829 end_code = get_name_end_code_point(start_code);
02830
02831 *rname_end = name_end = end;
02832 r = 0;
02833 pnum_head = *src;
02834 is_num = 0;
02835 sign = 1;
02836
02837 if (PEND) {
02838 return ONIGERR_EMPTY_GROUP_NAME;
02839 }
02840 else {
02841 PFETCH(c);
02842 if (c == end_code)
02843 return ONIGERR_EMPTY_GROUP_NAME;
02844
02845 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02846 is_num = 1;
02847 }
02848 else if (c == '-') {
02849 is_num = 2;
02850 sign = -1;
02851 pnum_head = p;
02852 }
02853 else {
02854 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02855 }
02856 }
02857
02858 while (!PEND) {
02859 name_end = p;
02860
02861 PFETCH(c);
02862 if (c == end_code || c == ')') break;
02863 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
02864 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02865 }
02866 if (r == 0 && c != end_code) {
02867 r = ONIGERR_INVALID_GROUP_NAME;
02868 name_end = end;
02869 }
02870
02871 if (r == 0) {
02872 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
02873 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
02874 else if (*rback_num == 0) {
02875 r = ONIGERR_INVALID_GROUP_NAME;
02876 goto err;
02877 }
02878 *rback_num *= sign;
02879
02880 *rname_end = name_end;
02881 *src = p;
02882 return 0;
02883 }
02884 else {
02885 err:
02886 onig_scan_env_set_error_string(env, r, *src, name_end);
02887 return r;
02888 }
02889 }
02890 #endif
02891
02892 void onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc,
02893 UChar* pat, UChar* pat_end, const UChar *fmt, va_list args);
02894
02895 static void
02896 onig_syntax_warn(ScanEnv *env, const char *fmt, ...)
02897 {
02898 va_list args;
02899 UChar buf[WARN_BUFSIZE];
02900 va_start(args, fmt);
02901 onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
02902 env->pattern, env->pattern_end,
02903 (const UChar *)fmt, args);
02904 va_end(args);
02905 if (env->sourcefile == NULL)
02906 rb_warn("%s", (char *)buf);
02907 else
02908 rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf);
02909 }
02910
02911 static void
02912 CC_ESC_WARN(ScanEnv *env, UChar *c)
02913 {
02914 if (onig_warn == onig_null_warn) return ;
02915
02916 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
02917 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
02918 onig_syntax_warn(env, "character class has '%s' without escape", c);
02919 }
02920 }
02921
02922 static void
02923 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
02924 {
02925 if (onig_warn == onig_null_warn) return ;
02926
02927 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
02928 onig_syntax_warn(env, "regular expression has '%s' without escape", c);
02929 }
02930 }
02931
02932 static void
02933 CC_DUP_WARN(ScanEnv *env)
02934 {
02935 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
02936
02937 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) &&
02938 !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) {
02939 env->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
02940 onig_syntax_warn(env, "character class has duplicated range");
02941 }
02942 }
02943
02944 static void
02945 UNKNOWN_ESC_WARN(ScanEnv *env, int c)
02946 {
02947 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
02948 onig_syntax_warn(env, "Unknown escape \\%c is ignored", c);
02949 }
02950
02951 static UChar*
02952 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
02953 UChar **next, OnigEncoding enc)
02954 {
02955 int i;
02956 OnigCodePoint x;
02957 UChar *q;
02958 UChar *p = from;
02959
02960 while (p < to) {
02961 x = ONIGENC_MBC_TO_CODE(enc, p, to);
02962 q = p + enclen(enc, p, to);
02963 if (x == s[0]) {
02964 for (i = 1; i < n && q < to; i++) {
02965 x = ONIGENC_MBC_TO_CODE(enc, q, to);
02966 if (x != s[i]) break;
02967 q += enclen(enc, q, to);
02968 }
02969 if (i >= n) {
02970 if (IS_NOT_NULL(next))
02971 *next = q;
02972 return p;
02973 }
02974 }
02975 p = q;
02976 }
02977 return NULL_UCHARP;
02978 }
02979
02980 static int
02981 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
02982 OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn)
02983 {
02984 int i, in_esc;
02985 OnigCodePoint x;
02986 UChar *q;
02987 UChar *p = from;
02988
02989 in_esc = 0;
02990 while (p < to) {
02991 if (in_esc) {
02992 in_esc = 0;
02993 p += enclen(enc, p, to);
02994 }
02995 else {
02996 x = ONIGENC_MBC_TO_CODE(enc, p, to);
02997 q = p + enclen(enc, p, to);
02998 if (x == s[0]) {
02999 for (i = 1; i < n && q < to; i++) {
03000 x = ONIGENC_MBC_TO_CODE(enc, q, to);
03001 if (x != s[i]) break;
03002 q += enclen(enc, q, to);
03003 }
03004 if (i >= n) return 1;
03005 p += enclen(enc, p, to);
03006 }
03007 else {
03008 x = ONIGENC_MBC_TO_CODE(enc, p, to);
03009 if (x == bad) return 0;
03010 else if (x == MC_ESC(syn)) in_esc = 1;
03011 p = q;
03012 }
03013 }
03014 }
03015 return 0;
03016 }
03017
03018 static int
03019 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
03020 {
03021 int num;
03022 OnigCodePoint c, c2;
03023 const OnigSyntaxType* syn = env->syntax;
03024 OnigEncoding enc = env->enc;
03025 UChar* prev;
03026 UChar* p = *src;
03027 PFETCH_READY;
03028
03029 if (PEND) {
03030 tok->type = TK_EOT;
03031 return tok->type;
03032 }
03033
03034 PFETCH(c);
03035 tok->type = TK_CHAR;
03036 tok->base = 0;
03037 tok->u.c = c;
03038 tok->escaped = 0;
03039
03040 if (c == ']') {
03041 tok->type = TK_CC_CLOSE;
03042 }
03043 else if (c == '-') {
03044 tok->type = TK_CC_RANGE;
03045 }
03046 else if (c == MC_ESC(syn)) {
03047 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
03048 goto end;
03049
03050 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
03051
03052 PFETCH(c);
03053 tok->escaped = 1;
03054 tok->u.c = c;
03055 switch (c) {
03056 case 'w':
03057 tok->type = TK_CHAR_TYPE;
03058 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
03059 tok->u.prop.not = 0;
03060 break;
03061 case 'W':
03062 tok->type = TK_CHAR_TYPE;
03063 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
03064 tok->u.prop.not = 1;
03065 break;
03066 case 'd':
03067 tok->type = TK_CHAR_TYPE;
03068 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
03069 tok->u.prop.not = 0;
03070 break;
03071 case 'D':
03072 tok->type = TK_CHAR_TYPE;
03073 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
03074 tok->u.prop.not = 1;
03075 break;
03076 case 's':
03077 tok->type = TK_CHAR_TYPE;
03078 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
03079 tok->u.prop.not = 0;
03080 break;
03081 case 'S':
03082 tok->type = TK_CHAR_TYPE;
03083 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
03084 tok->u.prop.not = 1;
03085 break;
03086 case 'h':
03087 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03088 tok->type = TK_CHAR_TYPE;
03089 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03090 tok->u.prop.not = 0;
03091 break;
03092 case 'H':
03093 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03094 tok->type = TK_CHAR_TYPE;
03095 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03096 tok->u.prop.not = 1;
03097 break;
03098
03099 case 'p':
03100 case 'P':
03101 c2 = PPEEK;
03102 if (c2 == '{' &&
03103 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
03104 PINC;
03105 tok->type = TK_CHAR_PROPERTY;
03106 tok->u.prop.not = (c == 'P' ? 1 : 0);
03107
03108 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
03109 PFETCH(c2);
03110 if (c2 == '^') {
03111 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
03112 }
03113 else
03114 PUNFETCH;
03115 }
03116 }
03117 else {
03118 onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
03119 }
03120 break;
03121
03122 case 'x':
03123 if (PEND) break;
03124
03125 prev = p;
03126 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
03127 PINC;
03128 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
03129 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
03130 if (!PEND) {
03131 c2 = PPEEK;
03132 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
03133 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
03134 }
03135
03136 if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
03137 PINC;
03138 tok->type = TK_CODE_POINT;
03139 tok->base = 16;
03140 tok->u.code = (OnigCodePoint )num;
03141 }
03142 else {
03143
03144 p = prev;
03145 }
03146 }
03147 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
03148 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
03149 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03150 if (p == prev) {
03151 num = 0;
03152 }
03153 tok->type = TK_RAW_BYTE;
03154 tok->base = 16;
03155 tok->u.c = num;
03156 }
03157 break;
03158
03159 case 'u':
03160 if (PEND) break;
03161
03162 prev = p;
03163 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
03164 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
03165 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
03166 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03167 if (p == prev) {
03168 num = 0;
03169 }
03170 tok->type = TK_CODE_POINT;
03171 tok->base = 16;
03172 tok->u.code = (OnigCodePoint )num;
03173 }
03174 break;
03175
03176 case '0':
03177 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
03178 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
03179 PUNFETCH;
03180 prev = p;
03181 num = scan_unsigned_octal_number(&p, end, 3, enc);
03182 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03183 if (p == prev) {
03184 num = 0;
03185 }
03186 tok->type = TK_RAW_BYTE;
03187 tok->base = 8;
03188 tok->u.c = num;
03189 }
03190 break;
03191
03192 default:
03193 PUNFETCH;
03194 num = fetch_escaped_value(&p, end, env);
03195 if (num < 0) return num;
03196 if (tok->u.c != num) {
03197 tok->u.code = (OnigCodePoint )num;
03198 tok->type = TK_CODE_POINT;
03199 }
03200 break;
03201 }
03202 }
03203 else if (c == '[') {
03204 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
03205 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
03206 tok->backp = p;
03207 PINC;
03208 if (str_exist_check_with_esc(send, 2, p, end,
03209 (OnigCodePoint )']', enc, syn)) {
03210 tok->type = TK_POSIX_BRACKET_OPEN;
03211 }
03212 else {
03213 PUNFETCH;
03214 goto cc_in_cc;
03215 }
03216 }
03217 else {
03218 cc_in_cc:
03219 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
03220 tok->type = TK_CC_CC_OPEN;
03221 }
03222 else {
03223 CC_ESC_WARN(env, (UChar* )"[");
03224 }
03225 }
03226 }
03227 else if (c == '&') {
03228 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
03229 !PEND && (PPEEK_IS('&'))) {
03230 PINC;
03231 tok->type = TK_CC_AND;
03232 }
03233 }
03234
03235 end:
03236 *src = p;
03237 return tok->type;
03238 }
03239
03240 #ifdef USE_NAMED_GROUP
03241 static int
03242 fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src,
03243 UChar* end, ScanEnv* env)
03244 {
03245 int r, num;
03246 const OnigSyntaxType* syn = env->syntax;
03247 UChar* prev;
03248 UChar* p = *src;
03249 UChar* name_end;
03250 int* backs;
03251 int back_num;
03252
03253 prev = p;
03254
03255 #ifdef USE_BACKREF_WITH_LEVEL
03256 name_end = NULL_UCHARP;
03257 r = fetch_name_with_level(c, &p, end, &name_end,
03258 env, &back_num, &tok->u.backref.level);
03259 if (r == 1) tok->u.backref.exist_level = 1;
03260 else tok->u.backref.exist_level = 0;
03261 #else
03262 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
03263 #endif
03264 if (r < 0) return r;
03265
03266 if (back_num != 0) {
03267 if (back_num < 0) {
03268 back_num = BACKREF_REL_TO_ABS(back_num, env);
03269 if (back_num <= 0)
03270 return ONIGERR_INVALID_BACKREF;
03271 }
03272
03273 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03274 if (back_num > env->num_mem ||
03275 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
03276 return ONIGERR_INVALID_BACKREF;
03277 }
03278 tok->type = TK_BACKREF;
03279 tok->u.backref.by_name = 0;
03280 tok->u.backref.num = 1;
03281 tok->u.backref.ref1 = back_num;
03282 }
03283 else {
03284 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
03285 if (num <= 0) {
03286 onig_scan_env_set_error_string(env,
03287 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
03288 return ONIGERR_UNDEFINED_NAME_REFERENCE;
03289 }
03290 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03291 int i;
03292 for (i = 0; i < num; i++) {
03293 if (backs[i] > env->num_mem ||
03294 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
03295 return ONIGERR_INVALID_BACKREF;
03296 }
03297 }
03298
03299 tok->type = TK_BACKREF;
03300 tok->u.backref.by_name = 1;
03301 if (num == 1) {
03302 tok->u.backref.num = 1;
03303 tok->u.backref.ref1 = backs[0];
03304 }
03305 else {
03306 tok->u.backref.num = num;
03307 tok->u.backref.refs = backs;
03308 }
03309 }
03310 *src = p;
03311 return 0;
03312 }
03313 #endif
03314
03315 static int
03316 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
03317 {
03318 int r, num;
03319 OnigCodePoint c;
03320 OnigEncoding enc = env->enc;
03321 const OnigSyntaxType* syn = env->syntax;
03322 UChar* prev;
03323 UChar* p = *src;
03324 PFETCH_READY;
03325
03326 start:
03327 if (PEND) {
03328 tok->type = TK_EOT;
03329 return tok->type;
03330 }
03331
03332 tok->type = TK_STRING;
03333 tok->base = 0;
03334 tok->backp = p;
03335
03336 PFETCH(c);
03337 if (IS_MC_ESC_CODE(c, syn)) {
03338 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
03339
03340 tok->backp = p;
03341 PFETCH(c);
03342
03343 tok->u.c = c;
03344 tok->escaped = 1;
03345 switch (c) {
03346 case '*':
03347 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
03348 tok->type = TK_OP_REPEAT;
03349 tok->u.repeat.lower = 0;
03350 tok->u.repeat.upper = REPEAT_INFINITE;
03351 goto greedy_check;
03352 break;
03353
03354 case '+':
03355 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
03356 tok->type = TK_OP_REPEAT;
03357 tok->u.repeat.lower = 1;
03358 tok->u.repeat.upper = REPEAT_INFINITE;
03359 goto greedy_check;
03360 break;
03361
03362 case '?':
03363 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
03364 tok->type = TK_OP_REPEAT;
03365 tok->u.repeat.lower = 0;
03366 tok->u.repeat.upper = 1;
03367 greedy_check:
03368 if (!PEND && PPEEK_IS('?') &&
03369 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
03370 PFETCH(c);
03371 tok->u.repeat.greedy = 0;
03372 tok->u.repeat.possessive = 0;
03373 }
03374 else {
03375 possessive_check:
03376 if (!PEND && PPEEK_IS('+') &&
03377 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
03378 tok->type != TK_INTERVAL) ||
03379 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
03380 tok->type == TK_INTERVAL))) {
03381 PFETCH(c);
03382 tok->u.repeat.greedy = 1;
03383 tok->u.repeat.possessive = 1;
03384 }
03385 else {
03386 tok->u.repeat.greedy = 1;
03387 tok->u.repeat.possessive = 0;
03388 }
03389 }
03390 break;
03391
03392 case '{':
03393 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
03394 r = fetch_range_quantifier(&p, end, tok, env);
03395 if (r < 0) return r;
03396 if (r == 0) goto greedy_check;
03397 else if (r == 2) {
03398 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
03399 goto possessive_check;
03400
03401 goto greedy_check;
03402 }
03403
03404 break;
03405
03406 case '|':
03407 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
03408 tok->type = TK_ALT;
03409 break;
03410
03411 case '(':
03412 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
03413 tok->type = TK_SUBEXP_OPEN;
03414 break;
03415
03416 case ')':
03417 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
03418 tok->type = TK_SUBEXP_CLOSE;
03419 break;
03420
03421 case 'w':
03422 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
03423 tok->type = TK_CHAR_TYPE;
03424 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
03425 tok->u.prop.not = 0;
03426 break;
03427
03428 case 'W':
03429 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
03430 tok->type = TK_CHAR_TYPE;
03431 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
03432 tok->u.prop.not = 1;
03433 break;
03434
03435 case 'b':
03436 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
03437 tok->type = TK_ANCHOR;
03438 tok->u.anchor.subtype = ANCHOR_WORD_BOUND;
03439 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
03440 && ! IS_WORD_BOUND_ALL_RANGE(env->option);
03441 break;
03442
03443 case 'B':
03444 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
03445 tok->type = TK_ANCHOR;
03446 tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND;
03447 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
03448 && ! IS_WORD_BOUND_ALL_RANGE(env->option);
03449 break;
03450
03451 #ifdef USE_WORD_BEGIN_END
03452 case '<':
03453 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
03454 tok->type = TK_ANCHOR;
03455 tok->u.anchor.subtype = ANCHOR_WORD_BEGIN;
03456 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
03457 break;
03458
03459 case '>':
03460 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
03461 tok->type = TK_ANCHOR;
03462 tok->u.anchor.subtype = ANCHOR_WORD_END;
03463 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
03464 break;
03465 #endif
03466
03467 case 's':
03468 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
03469 tok->type = TK_CHAR_TYPE;
03470 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
03471 tok->u.prop.not = 0;
03472 break;
03473
03474 case 'S':
03475 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
03476 tok->type = TK_CHAR_TYPE;
03477 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
03478 tok->u.prop.not = 1;
03479 break;
03480
03481 case 'd':
03482 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
03483 tok->type = TK_CHAR_TYPE;
03484 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
03485 tok->u.prop.not = 0;
03486 break;
03487
03488 case 'D':
03489 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
03490 tok->type = TK_CHAR_TYPE;
03491 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
03492 tok->u.prop.not = 1;
03493 break;
03494
03495 case 'h':
03496 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03497 tok->type = TK_CHAR_TYPE;
03498 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03499 tok->u.prop.not = 0;
03500 break;
03501
03502 case 'H':
03503 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03504 tok->type = TK_CHAR_TYPE;
03505 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03506 tok->u.prop.not = 1;
03507 break;
03508
03509 case 'A':
03510 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03511 begin_buf:
03512 tok->type = TK_ANCHOR;
03513 tok->u.anchor.subtype = ANCHOR_BEGIN_BUF;
03514 break;
03515
03516 case 'Z':
03517 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03518 tok->type = TK_ANCHOR;
03519 tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF;
03520 break;
03521
03522 case 'z':
03523 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03524 end_buf:
03525 tok->type = TK_ANCHOR;
03526 tok->u.anchor.subtype = ANCHOR_END_BUF;
03527 break;
03528
03529 case 'G':
03530 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
03531 tok->type = TK_ANCHOR;
03532 tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION;
03533 break;
03534
03535 case '`':
03536 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
03537 goto begin_buf;
03538 break;
03539
03540 case '\'':
03541 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
03542 goto end_buf;
03543 break;
03544
03545 case 'x':
03546 if (PEND) break;
03547
03548 prev = p;
03549 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
03550 PINC;
03551 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
03552 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
03553 if (!PEND) {
03554 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
03555 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
03556 }
03557
03558 if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) {
03559 PINC;
03560 tok->type = TK_CODE_POINT;
03561 tok->u.code = (OnigCodePoint )num;
03562 }
03563 else {
03564
03565 p = prev;
03566 }
03567 }
03568 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
03569 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
03570 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03571 if (p == prev) {
03572 num = 0;
03573 }
03574 tok->type = TK_RAW_BYTE;
03575 tok->base = 16;
03576 tok->u.c = num;
03577 }
03578 break;
03579
03580 case 'u':
03581 if (PEND) break;
03582
03583 prev = p;
03584 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
03585 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
03586 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
03587 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03588 if (p == prev) {
03589 num = 0;
03590 }
03591 tok->type = TK_CODE_POINT;
03592 tok->base = 16;
03593 tok->u.code = (OnigCodePoint )num;
03594 }
03595 break;
03596
03597 case '1': case '2': case '3': case '4':
03598 case '5': case '6': case '7': case '8': case '9':
03599 PUNFETCH;
03600 prev = p;
03601 num = onig_scan_unsigned_number(&p, end, enc);
03602 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
03603 goto skip_backref;
03604 }
03605
03606 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
03607 (num <= env->num_mem || num <= 9)) {
03608 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03609 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
03610 return ONIGERR_INVALID_BACKREF;
03611 }
03612
03613 tok->type = TK_BACKREF;
03614 tok->u.backref.num = 1;
03615 tok->u.backref.ref1 = num;
03616 tok->u.backref.by_name = 0;
03617 #ifdef USE_BACKREF_WITH_LEVEL
03618 tok->u.backref.exist_level = 0;
03619 #endif
03620 break;
03621 }
03622
03623 skip_backref:
03624 if (c == '8' || c == '9') {
03625
03626 p = prev; PINC;
03627 break;
03628 }
03629
03630 p = prev;
03631
03632 case '0':
03633 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
03634 prev = p;
03635 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
03636 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03637 if (p == prev) {
03638 num = 0;
03639 }
03640 tok->type = TK_RAW_BYTE;
03641 tok->base = 8;
03642 tok->u.c = num;
03643 }
03644 else if (c != '0') {
03645 PINC;
03646 }
03647 break;
03648
03649 #ifdef USE_NAMED_GROUP
03650 case 'k':
03651 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
03652 PFETCH(c);
03653 if (c == '<' || c == '\'') {
03654 r = fetch_named_backref_token(c, tok, &p, end, env);
03655 if (r < 0) return r;
03656 }
03657 else {
03658 PUNFETCH;
03659 onig_syntax_warn(env, "invalid back reference");
03660 }
03661 }
03662 break;
03663 #endif
03664
03665 #if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP)
03666 case 'g':
03667 #ifdef USE_NAMED_GROUP
03668 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) {
03669 PFETCH(c);
03670 if (c == '{') {
03671 r = fetch_named_backref_token(c, tok, &p, end, env);
03672 if (r < 0) return r;
03673 }
03674 else
03675 PUNFETCH;
03676 }
03677 #endif
03678 #ifdef USE_SUBEXP_CALL
03679 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
03680 PFETCH(c);
03681 if (c == '<' || c == '\'') {
03682 int gnum = -1, rel = 0;
03683 UChar* name_end;
03684 OnigCodePoint cnext;
03685
03686 cnext = PPEEK;
03687 if (cnext == '0') {
03688 PINC;
03689 if (PPEEK_IS(get_name_end_code_point(c))) {
03690 PINC;
03691 name_end = p;
03692 gnum = 0;
03693 }
03694 }
03695 else if (cnext == '+') {
03696 PINC;
03697 rel = 1;
03698 }
03699 prev = p;
03700 if (gnum < 0) {
03701 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
03702 if (r < 0) return r;
03703 }
03704
03705 tok->type = TK_CALL;
03706 tok->u.call.name = prev;
03707 tok->u.call.name_end = name_end;
03708 tok->u.call.gnum = gnum;
03709 tok->u.call.rel = rel;
03710 }
03711 else {
03712 onig_syntax_warn(env, "invalid subexp call");
03713 PUNFETCH;
03714 }
03715 }
03716 #endif
03717 break;
03718 #endif
03719
03720 case 'Q':
03721 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
03722 tok->type = TK_QUOTE_OPEN;
03723 }
03724 break;
03725
03726 case 'p':
03727 case 'P':
03728 if (PPEEK_IS('{') &&
03729 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
03730 PINC;
03731 tok->type = TK_CHAR_PROPERTY;
03732 tok->u.prop.not = (c == 'P' ? 1 : 0);
03733
03734 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
03735 PFETCH(c);
03736 if (c == '^') {
03737 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
03738 }
03739 else
03740 PUNFETCH;
03741 }
03742 }
03743 else {
03744 onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
03745 }
03746 break;
03747
03748 case 'R':
03749 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) {
03750 tok->type = TK_LINEBREAK;
03751 }
03752 break;
03753
03754 case 'X':
03755 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) {
03756 tok->type = TK_EXTENDED_GRAPHEME_CLUSTER;
03757 }
03758 break;
03759
03760 case 'K':
03761 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) {
03762 tok->type = TK_KEEP;
03763 }
03764 break;
03765
03766 default:
03767 PUNFETCH;
03768 num = fetch_escaped_value(&p, end, env);
03769 if (num < 0) return num;
03770
03771 if (tok->u.c != num) {
03772 tok->type = TK_CODE_POINT;
03773 tok->u.code = (OnigCodePoint )num;
03774 }
03775 else {
03776 p = tok->backp + enclen(enc, tok->backp, end);
03777 }
03778 break;
03779 }
03780 }
03781 else {
03782 tok->u.c = c;
03783 tok->escaped = 0;
03784
03785 #ifdef USE_VARIABLE_META_CHARS
03786 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
03787 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
03788 if (c == MC_ANYCHAR(syn))
03789 goto any_char;
03790 else if (c == MC_ANYTIME(syn))
03791 goto anytime;
03792 else if (c == MC_ZERO_OR_ONE_TIME(syn))
03793 goto zero_or_one_time;
03794 else if (c == MC_ONE_OR_MORE_TIME(syn))
03795 goto one_or_more_time;
03796 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
03797 tok->type = TK_ANYCHAR_ANYTIME;
03798 goto out;
03799 }
03800 }
03801 #endif
03802
03803 switch (c) {
03804 case '.':
03805 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
03806 #ifdef USE_VARIABLE_META_CHARS
03807 any_char:
03808 #endif
03809 tok->type = TK_ANYCHAR;
03810 break;
03811
03812 case '*':
03813 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
03814 #ifdef USE_VARIABLE_META_CHARS
03815 anytime:
03816 #endif
03817 tok->type = TK_OP_REPEAT;
03818 tok->u.repeat.lower = 0;
03819 tok->u.repeat.upper = REPEAT_INFINITE;
03820 goto greedy_check;
03821 break;
03822
03823 case '+':
03824 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
03825 #ifdef USE_VARIABLE_META_CHARS
03826 one_or_more_time:
03827 #endif
03828 tok->type = TK_OP_REPEAT;
03829 tok->u.repeat.lower = 1;
03830 tok->u.repeat.upper = REPEAT_INFINITE;
03831 goto greedy_check;
03832 break;
03833
03834 case '?':
03835 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
03836 #ifdef USE_VARIABLE_META_CHARS
03837 zero_or_one_time:
03838 #endif
03839 tok->type = TK_OP_REPEAT;
03840 tok->u.repeat.lower = 0;
03841 tok->u.repeat.upper = 1;
03842 goto greedy_check;
03843 break;
03844
03845 case '{':
03846 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
03847 r = fetch_range_quantifier(&p, end, tok, env);
03848 if (r < 0) return r;
03849 if (r == 0) goto greedy_check;
03850 else if (r == 2) {
03851 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
03852 goto possessive_check;
03853
03854 goto greedy_check;
03855 }
03856
03857 break;
03858
03859 case '|':
03860 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
03861 tok->type = TK_ALT;
03862 break;
03863
03864 case '(':
03865 if (PPEEK_IS('?') &&
03866 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
03867 PINC;
03868 if (PPEEK_IS('#')) {
03869 PFETCH(c);
03870 while (1) {
03871 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
03872 PFETCH(c);
03873 if (c == MC_ESC(syn)) {
03874 if (!PEND) PFETCH(c);
03875 }
03876 else {
03877 if (c == ')') break;
03878 }
03879 }
03880 goto start;
03881 }
03882 #ifdef USE_PERL_SUBEXP_CALL
03883
03884 c = PPEEK;
03885 if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) &&
03886 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
03887
03888 int gnum;
03889 UChar *name;
03890 UChar *name_end;
03891
03892 if (c == 'R' || c == '0') {
03893 PINC;
03894 if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME;
03895 PINC;
03896 name_end = name = p;
03897 gnum = 0;
03898 }
03899 else {
03900 int numref = 1;
03901 if (c == '&') {
03902 PINC;
03903 numref = 0;
03904 }
03905 name = p;
03906 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref);
03907 if (r < 0) return r;
03908 }
03909
03910 tok->type = TK_CALL;
03911 tok->u.call.name = name;
03912 tok->u.call.name_end = name_end;
03913 tok->u.call.gnum = gnum;
03914 tok->u.call.rel = 0;
03915 break;
03916 }
03917 else if ((c == '-' || c == '+') &&
03918 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
03919
03920 int gnum;
03921 UChar *name;
03922 UChar *name_end;
03923 OnigCodePoint cnext;
03924 PFETCH_READY;
03925
03926 PINC;
03927 cnext = PPEEK;
03928 if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) {
03929 if (c == '-') PUNFETCH;
03930 name = p;
03931 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1);
03932 if (r < 0) return r;
03933
03934 tok->type = TK_CALL;
03935 tok->u.call.name = name;
03936 tok->u.call.name_end = name_end;
03937 tok->u.call.gnum = gnum;
03938 tok->u.call.rel = 1;
03939 break;
03940 }
03941 }
03942 #endif
03943 #ifdef USE_CAPITAL_P_NAMED_GROUP
03944 if (PPEEK_IS('P') &&
03945 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
03946 int gnum;
03947 UChar *name;
03948 UChar *name_end;
03949 PFETCH_READY;
03950
03951 PINC;
03952 PFETCH(c);
03953 if (c == '=') {
03954 r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env);
03955 if (r < 0) return r;
03956 break;
03957 }
03958 else if (c == '>') {
03959 name = p;
03960 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0);
03961 if (r < 0) return r;
03962
03963 tok->type = TK_CALL;
03964 tok->u.call.name = name;
03965 tok->u.call.name_end = name_end;
03966 tok->u.call.gnum = gnum;
03967 tok->u.call.rel = 0;
03968 break;
03969 }
03970 PUNFETCH;
03971 }
03972 #endif
03973 PUNFETCH;
03974 }
03975
03976 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
03977 tok->type = TK_SUBEXP_OPEN;
03978 break;
03979
03980 case ')':
03981 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
03982 tok->type = TK_SUBEXP_CLOSE;
03983 break;
03984
03985 case '^':
03986 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
03987 tok->type = TK_ANCHOR;
03988 tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
03989 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
03990 break;
03991
03992 case '$':
03993 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
03994 tok->type = TK_ANCHOR;
03995 tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
03996 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
03997 break;
03998
03999 case '[':
04000 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
04001 tok->type = TK_CC_OPEN;
04002 break;
04003
04004 case ']':
04005 if (*src > env->pattern)
04006 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
04007 break;
04008
04009 case '#':
04010 if (IS_EXTEND(env->option)) {
04011 while (!PEND) {
04012 PFETCH(c);
04013 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
04014 break;
04015 }
04016 goto start;
04017 break;
04018 }
04019 break;
04020
04021 case ' ': case '\t': case '\n': case '\r': case '\f':
04022 if (IS_EXTEND(env->option))
04023 goto start;
04024 break;
04025
04026 default:
04027
04028 break;
04029 }
04030 }
04031
04032 #ifdef USE_VARIABLE_META_CHARS
04033 out:
04034 #endif
04035 *src = p;
04036 return tok->type;
04037 }
04038
04039 static int
04040 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
04041 ScanEnv* env,
04042 OnigCodePoint sb_out, const OnigCodePoint mbr[])
04043 {
04044 int i, r;
04045 OnigCodePoint j;
04046
04047 int n = ONIGENC_CODE_RANGE_NUM(mbr);
04048
04049 if (not == 0) {
04050 for (i = 0; i < n; i++) {
04051 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
04052 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
04053 if (j >= sb_out) {
04054 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
04055 r = add_code_range_to_buf(&(cc->mbuf), env, j,
04056 ONIGENC_CODE_RANGE_TO(mbr, i));
04057 if (r != 0) return r;
04058 i++;
04059 }
04060
04061 goto sb_end;
04062 }
04063 BITSET_SET_BIT_CHKDUP(cc->bs, j);
04064 }
04065 }
04066
04067 sb_end:
04068 for ( ; i < n; i++) {
04069 r = add_code_range_to_buf(&(cc->mbuf), env,
04070 ONIGENC_CODE_RANGE_FROM(mbr, i),
04071 ONIGENC_CODE_RANGE_TO(mbr, i));
04072 if (r != 0) return r;
04073 }
04074 }
04075 else {
04076 OnigCodePoint prev = 0;
04077
04078 for (i = 0; i < n; i++) {
04079 for (j = prev;
04080 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
04081 if (j >= sb_out) {
04082 goto sb_end2;
04083 }
04084 BITSET_SET_BIT_CHKDUP(cc->bs, j);
04085 }
04086 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
04087 }
04088 for (j = prev; j < sb_out; j++) {
04089 BITSET_SET_BIT_CHKDUP(cc->bs, j);
04090 }
04091
04092 sb_end2:
04093 prev = sb_out;
04094
04095 for (i = 0; i < n; i++) {
04096 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
04097 r = add_code_range_to_buf(&(cc->mbuf), env, prev,
04098 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
04099 if (r != 0) return r;
04100 }
04101 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
04102 }
04103 if (prev < 0x7fffffff) {
04104 r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff);
04105 if (r != 0) return r;
04106 }
04107 }
04108
04109 return 0;
04110 }
04111
04112 static int
04113 add_ctype_to_cc(CClassNode* cc, int ctype, int not, int char_prop, ScanEnv* env)
04114 {
04115 int maxcode, ascii_range;
04116 int c, r;
04117 const OnigCodePoint *ranges;
04118 OnigCodePoint sb_out;
04119 OnigEncoding enc = env->enc;
04120 OnigOptionType option = env->option;
04121
04122 ascii_range = IS_ASCII_RANGE(option) && (char_prop == 0);
04123
04124 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
04125 if (r == 0) {
04126 if (ascii_range) {
04127 CClassNode ccwork;
04128 initialize_cclass(&ccwork);
04129 r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env, sb_out,
04130 ranges);
04131 if (r == 0) {
04132 if (not) {
04133 r = add_code_range_to_buf0(&(ccwork.mbuf), env, 0x80, ONIG_LAST_CODE_POINT, FALSE);
04134 }
04135 else {
04136 CClassNode ccascii;
04137 initialize_cclass(&ccascii);
04138 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
04139 add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F);
04140 }
04141 else {
04142 bitset_set_range(env, ccascii.bs, 0x00, 0x7F);
04143 }
04144 r = and_cclass(&ccwork, &ccascii, env);
04145 if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf);
04146 }
04147 if (r == 0) {
04148 r = or_cclass(cc, &ccwork, env);
04149 }
04150 if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf);
04151 }
04152 }
04153 else {
04154 r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
04155 }
04156 return r;
04157 }
04158 else if (r != ONIG_NO_SUPPORT_CONFIG) {
04159 return r;
04160 }
04161
04162 maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
04163 r = 0;
04164 switch (ctype) {
04165 case ONIGENC_CTYPE_ALPHA:
04166 case ONIGENC_CTYPE_BLANK:
04167 case ONIGENC_CTYPE_CNTRL:
04168 case ONIGENC_CTYPE_DIGIT:
04169 case ONIGENC_CTYPE_LOWER:
04170 case ONIGENC_CTYPE_PUNCT:
04171 case ONIGENC_CTYPE_SPACE:
04172 case ONIGENC_CTYPE_UPPER:
04173 case ONIGENC_CTYPE_XDIGIT:
04174 case ONIGENC_CTYPE_ASCII:
04175 case ONIGENC_CTYPE_ALNUM:
04176 if (not != 0) {
04177 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04178 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
04179 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04180 }
04181 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04182 }
04183 else {
04184 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04185 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
04186 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04187 }
04188 }
04189 break;
04190
04191 case ONIGENC_CTYPE_GRAPH:
04192 case ONIGENC_CTYPE_PRINT:
04193 if (not != 0) {
04194 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04195 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)
04196 || c >= maxcode)
04197 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04198 }
04199 if (ascii_range)
04200 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04201 }
04202 else {
04203 for (c = 0; c < maxcode; c++) {
04204 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
04205 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04206 }
04207 if (! ascii_range)
04208 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04209 }
04210 break;
04211
04212 case ONIGENC_CTYPE_WORD:
04213 if (not == 0) {
04214 for (c = 0; c < maxcode; c++) {
04215 if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
04216 }
04217 if (! ascii_range)
04218 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04219 }
04220 else {
04221 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04222 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0)
04223 && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode))
04224 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04225 }
04226 if (ascii_range)
04227 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04228 }
04229 break;
04230
04231 default:
04232 return ONIGERR_PARSER_BUG;
04233 break;
04234 }
04235
04236 return r;
04237 }
04238
04239 static int
04240 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
04241 {
04242 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
04243 #define POSIX_BRACKET_NAME_MIN_LEN 4
04244
04245 static const PosixBracketEntryType PBS[] = {
04246 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
04247 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
04248 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
04249 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
04250 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
04251 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
04252 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
04253 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
04254 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
04255 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
04256 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
04257 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
04258 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
04259 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
04260 { (UChar* )NULL, -1, 0 }
04261 };
04262
04263 const PosixBracketEntryType *pb;
04264 int not, i, r;
04265 OnigCodePoint c;
04266 OnigEncoding enc = env->enc;
04267 UChar *p = *src;
04268 PFETCH_READY;
04269
04270 if (PPEEK_IS('^')) {
04271 PINC;
04272 not = 1;
04273 }
04274 else
04275 not = 0;
04276
04277 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
04278 goto not_posix_bracket;
04279
04280 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
04281 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
04282 p = (UChar* )onigenc_step(enc, p, end, pb->len);
04283 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
04284 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
04285
04286 r = add_ctype_to_cc(cc, pb->ctype, not,
04287 IS_POSIX_BRACKET_ALL_RANGE(env->option),
04288 env);
04289 if (r != 0) return r;
04290
04291 PINC; PINC;
04292 *src = p;
04293 return 0;
04294 }
04295 }
04296
04297 not_posix_bracket:
04298 c = 0;
04299 i = 0;
04300 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
04301 PINC;
04302 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
04303 }
04304 if (c == ':' && ! PEND) {
04305 PINC;
04306 if (! PEND) {
04307 PFETCH(c);
04308 if (c == ']')
04309 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
04310 }
04311 }
04312
04313 return 1;
04314 }
04315
04316 static int
04317 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
04318 {
04319 int r;
04320 OnigCodePoint c;
04321 OnigEncoding enc = env->enc;
04322 UChar *prev, *start, *p = *src;
04323 PFETCH_READY;
04324
04325 r = 0;
04326 start = prev = p;
04327
04328 while (!PEND) {
04329 prev = p;
04330 PFETCH(c);
04331 if (c == '}') {
04332 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
04333 if (r < 0) break;
04334
04335 *src = p;
04336 return r;
04337 }
04338 else if (c == '(' || c == ')' || c == '{' || c == '|') {
04339 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
04340 break;
04341 }
04342 }
04343
04344 onig_scan_env_set_error_string(env, r, *src, prev);
04345 return r;
04346 }
04347
04348 static int
04349 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
04350 ScanEnv* env)
04351 {
04352 int r, ctype;
04353 CClassNode* cc;
04354
04355 ctype = fetch_char_property_to_ctype(src, end, env);
04356 if (ctype < 0) return ctype;
04357
04358 *np = node_new_cclass();
04359 CHECK_NULL_RETURN_MEMERR(*np);
04360 cc = NCCLASS(*np);
04361 r = add_ctype_to_cc(cc, ctype, 0, 1, env);
04362 if (r != 0) return r;
04363 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
04364
04365 return 0;
04366 }
04367
04368
/* Scanner state used while the items of a character class are parsed. */
enum CCSTATE {
  CCS_VALUE,     /* a value is held back and has not been stored in the class yet */
  CCS_RANGE,     /* a range operator was accepted; the next value ends the range */
  CCS_COMPLETE,  /* a range has just been stored */
  CCS_START      /* nothing pending: initial state, also set again at TK_CC_AND */
};
04375
/* Kind of the most recent character-class item. */
enum CCVALTYPE {
  CCV_SB,          /* value that is stored in the class bitset */
  CCV_CODE_POINT,  /* value that is stored in the class code-range buffer */
  CCV_CLASS        /* the item was a whole class (set by next_state_class) */
};
04381
04382 static int
04383 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
04384 enum CCSTATE* state, ScanEnv* env)
04385 {
04386 int r;
04387
04388 if (*state == CCS_RANGE)
04389 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
04390
04391 if (*state == CCS_VALUE && *type != CCV_CLASS) {
04392 if (*type == CCV_SB)
04393 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
04394 else if (*type == CCV_CODE_POINT) {
04395 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
04396 if (r < 0) return r;
04397 }
04398 }
04399
04400 *state = CCS_VALUE;
04401 *type = CCV_CLASS;
04402 return 0;
04403 }
04404
04405 static int
04406 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
04407 int* vs_israw, int v_israw,
04408 enum CCVALTYPE intype, enum CCVALTYPE* type,
04409 enum CCSTATE* state, ScanEnv* env)
04410 {
04411 int r;
04412
04413 switch (*state) {
04414 case CCS_VALUE:
04415 if (*type == CCV_SB)
04416 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
04417 else if (*type == CCV_CODE_POINT) {
04418 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
04419 if (r < 0) return r;
04420 }
04421 break;
04422
04423 case CCS_RANGE:
04424 if (intype == *type) {
04425 if (intype == CCV_SB) {
04426 if (*vs > 0xff || v > 0xff)
04427 return ONIGERR_INVALID_CODE_POINT_VALUE;
04428
04429 if (*vs > v) {
04430 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
04431 goto ccs_range_end;
04432 else
04433 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
04434 }
04435 bitset_set_range(env, cc->bs, (int )*vs, (int )v);
04436 }
04437 else {
04438 r = add_code_range(&(cc->mbuf), env, *vs, v);
04439 if (r < 0) return r;
04440 }
04441 }
04442 else {
04443 #if 0
04444 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
04445 #endif
04446 if (*vs > v) {
04447 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
04448 goto ccs_range_end;
04449 else
04450 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
04451 }
04452 bitset_set_range(env, cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
04453 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
04454 if (r < 0) return r;
04455 #if 0
04456 }
04457 else
04458 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
04459 #endif
04460 }
04461 ccs_range_end:
04462 *state = CCS_COMPLETE;
04463 break;
04464
04465 case CCS_COMPLETE:
04466 case CCS_START:
04467 *state = CCS_VALUE;
04468 break;
04469
04470 default:
04471 break;
04472 }
04473
04474 *vs_israw = v_israw;
04475 *vs = v;
04476 *type = intype;
04477 return 0;
04478 }
04479
04480 static int
04481 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
04482 ScanEnv* env)
04483 {
04484 int in_esc;
04485 OnigCodePoint code;
04486 OnigEncoding enc = env->enc;
04487 UChar* p = from;
04488 PFETCH_READY;
04489
04490 in_esc = 0;
04491 while (! PEND) {
04492 if (ignore_escaped && in_esc) {
04493 in_esc = 0;
04494 }
04495 else {
04496 PFETCH(code);
04497 if (code == c) return 1;
04498 if (code == MC_ESC(env->syntax)) in_esc = 1;
04499 }
04500 }
04501 return 0;
04502 }
04503
04504 static int
04505 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
04506 ScanEnv* env)
04507 {
04508 int r, neg, len, fetched, and_start;
04509 OnigCodePoint v, vs;
04510 UChar *p;
04511 Node* node;
04512 CClassNode *cc, *prev_cc;
04513 CClassNode work_cc;
04514
04515 enum CCSTATE state;
04516 enum CCVALTYPE val_type, in_type;
04517 int val_israw, in_israw;
04518
04519 prev_cc = (CClassNode* )NULL;
04520 *np = NULL_NODE;
04521 r = fetch_token_in_cc(tok, src, end, env);
04522 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
04523 neg = 1;
04524 r = fetch_token_in_cc(tok, src, end, env);
04525 }
04526 else {
04527 neg = 0;
04528 }
04529
04530 if (r < 0) return r;
04531 if (r == TK_CC_CLOSE) {
04532 if (! code_exist_check((OnigCodePoint )']',
04533 *src, env->pattern_end, 1, env))
04534 return ONIGERR_EMPTY_CHAR_CLASS;
04535
04536 CC_ESC_WARN(env, (UChar* )"]");
04537 r = tok->type = TK_CHAR;
04538 }
04539
04540 *np = node = node_new_cclass();
04541 CHECK_NULL_RETURN_MEMERR(node);
04542 cc = NCCLASS(node);
04543
04544 and_start = 0;
04545 state = CCS_START;
04546 p = *src;
04547 while (r != TK_CC_CLOSE) {
04548 fetched = 0;
04549 switch (r) {
04550 case TK_CHAR:
04551 if ((tok->u.code >= SINGLE_BYTE_SIZE) ||
04552 (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) {
04553 in_type = CCV_CODE_POINT;
04554 }
04555 else if (len < 0) {
04556 r = len;
04557 goto err;
04558 }
04559 else {
04560 sb_char:
04561 in_type = CCV_SB;
04562 }
04563 v = (OnigCodePoint )tok->u.c;
04564 in_israw = 0;
04565 goto val_entry2;
04566 break;
04567
04568 case TK_RAW_BYTE:
04569
04570 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
04571 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
04572 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
04573 UChar* psave = p;
04574 int i, base = tok->base;
04575
04576 buf[0] = (UChar )tok->u.c;
04577 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
04578 r = fetch_token_in_cc(tok, &p, end, env);
04579 if (r < 0) goto err;
04580 if (r != TK_RAW_BYTE || tok->base != base) {
04581 fetched = 1;
04582 break;
04583 }
04584 buf[i] = (UChar )tok->u.c;
04585 }
04586
04587 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
04588 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
04589 goto err;
04590 }
04591
04592 len = enclen(env->enc, buf, buf+i);
04593 if (i < len) {
04594 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
04595 goto err;
04596 }
04597 else if (i > len) {
04598 p = psave;
04599 for (i = 1; i < len; i++) {
04600 r = fetch_token_in_cc(tok, &p, end, env);
04601 }
04602 fetched = 0;
04603 }
04604
04605 if (i == 1) {
04606 v = (OnigCodePoint )buf[0];
04607 goto raw_single;
04608 }
04609 else {
04610 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
04611 in_type = CCV_CODE_POINT;
04612 }
04613 }
04614 else {
04615 v = (OnigCodePoint )tok->u.c;
04616 raw_single:
04617 in_type = CCV_SB;
04618 }
04619 in_israw = 1;
04620 goto val_entry2;
04621 break;
04622
04623 case TK_CODE_POINT:
04624 v = tok->u.code;
04625 in_israw = 1;
04626 val_entry:
04627 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
04628 if (len < 0) {
04629 r = len;
04630 goto err;
04631 }
04632 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
04633 val_entry2:
04634 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
04635 &state, env);
04636 if (r != 0) goto err;
04637 break;
04638
04639 case TK_POSIX_BRACKET_OPEN:
04640 r = parse_posix_bracket(cc, &p, end, env);
04641 if (r < 0) goto err;
04642 if (r == 1) {
04643 CC_ESC_WARN(env, (UChar* )"[");
04644 p = tok->backp;
04645 v = (OnigCodePoint )tok->u.c;
04646 in_israw = 0;
04647 goto val_entry;
04648 }
04649 goto next_class;
04650 break;
04651
04652 case TK_CHAR_TYPE:
04653 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, 0, env);
04654 if (r != 0) return r;
04655
04656 next_class:
04657 r = next_state_class(cc, &vs, &val_type, &state, env);
04658 if (r != 0) goto err;
04659 break;
04660
04661 case TK_CHAR_PROPERTY:
04662 {
04663 int ctype;
04664
04665 ctype = fetch_char_property_to_ctype(&p, end, env);
04666 if (ctype < 0) return ctype;
04667 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 1, env);
04668 if (r != 0) return r;
04669 goto next_class;
04670 }
04671 break;
04672
04673 case TK_CC_RANGE:
04674 if (state == CCS_VALUE) {
04675 r = fetch_token_in_cc(tok, &p, end, env);
04676 if (r < 0) goto err;
04677 fetched = 1;
04678 if (r == TK_CC_CLOSE) {
04679 range_end_val:
04680 v = (OnigCodePoint )'-';
04681 in_israw = 0;
04682 goto val_entry;
04683 }
04684 else if (r == TK_CC_AND) {
04685 CC_ESC_WARN(env, (UChar* )"-");
04686 goto range_end_val;
04687 }
04688 state = CCS_RANGE;
04689 }
04690 else if (state == CCS_START) {
04691
04692 v = (OnigCodePoint )tok->u.c;
04693 in_israw = 0;
04694
04695 r = fetch_token_in_cc(tok, &p, end, env);
04696 if (r < 0) goto err;
04697 fetched = 1;
04698
04699 if (r == TK_CC_RANGE || and_start != 0)
04700 CC_ESC_WARN(env, (UChar* )"-");
04701
04702 goto val_entry;
04703 }
04704 else if (state == CCS_RANGE) {
04705 CC_ESC_WARN(env, (UChar* )"-");
04706 goto sb_char;
04707 }
04708 else {
04709 r = fetch_token_in_cc(tok, &p, end, env);
04710 if (r < 0) goto err;
04711 fetched = 1;
04712 if (r == TK_CC_CLOSE) goto range_end_val;
04713 else if (r == TK_CC_AND) {
04714 CC_ESC_WARN(env, (UChar* )"-");
04715 goto range_end_val;
04716 }
04717
04718 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
04719 CC_ESC_WARN(env, (UChar* )"-");
04720 goto range_end_val;
04721 }
04722 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
04723 goto err;
04724 }
04725 break;
04726
04727 case TK_CC_CC_OPEN:
04728 {
04729 Node *anode;
04730 CClassNode* acc;
04731
04732 r = parse_char_class(&anode, tok, &p, end, env);
04733 if (r == 0) {
04734 acc = NCCLASS(anode);
04735 r = or_cclass(cc, acc, env);
04736 }
04737 onig_node_free(anode);
04738 if (r != 0) goto err;
04739 }
04740 break;
04741
04742 case TK_CC_AND:
04743 {
04744 if (state == CCS_VALUE) {
04745 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
04746 &val_type, &state, env);
04747 if (r != 0) goto err;
04748 }
04749
04750 and_start = 1;
04751 state = CCS_START;
04752
04753 if (IS_NOT_NULL(prev_cc)) {
04754 r = and_cclass(prev_cc, cc, env);
04755 if (r != 0) goto err;
04756 bbuf_free(cc->mbuf);
04757 }
04758 else {
04759 prev_cc = cc;
04760 cc = &work_cc;
04761 }
04762 initialize_cclass(cc);
04763 }
04764 break;
04765
04766 case TK_EOT:
04767 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
04768 goto err;
04769 break;
04770 default:
04771 r = ONIGERR_PARSER_BUG;
04772 goto err;
04773 break;
04774 }
04775
04776 if (fetched)
04777 r = tok->type;
04778 else {
04779 r = fetch_token_in_cc(tok, &p, end, env);
04780 if (r < 0) goto err;
04781 }
04782 }
04783
04784 if (state == CCS_VALUE) {
04785 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
04786 &val_type, &state, env);
04787 if (r != 0) goto err;
04788 }
04789
04790 if (IS_NOT_NULL(prev_cc)) {
04791 r = and_cclass(prev_cc, cc, env);
04792 if (r != 0) goto err;
04793 bbuf_free(cc->mbuf);
04794 cc = prev_cc;
04795 }
04796
04797 if (neg != 0)
04798 NCCLASS_SET_NOT(cc);
04799 else
04800 NCCLASS_CLEAR_NOT(cc);
04801 if (IS_NCCLASS_NOT(cc) &&
04802 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
04803 int is_empty;
04804
04805 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
04806 if (is_empty != 0)
04807 BITSET_IS_EMPTY(cc->bs, is_empty);
04808
04809 if (is_empty == 0) {
04810 #define NEWLINE_CODE 0x0a
04811
04812 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
04813 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
04814 BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE);
04815 else {
04816 r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
04817 if (r < 0) goto err;
04818 }
04819 }
04820 }
04821 }
04822 *src = p;
04823 return 0;
04824
04825 err:
04826 if (cc != NCCLASS(*np))
04827 bbuf_free(cc->mbuf);
04828 return r;
04829 }
04830
/* Forward declaration: parse_enclose() below calls parse_subexp(). */
static int parse_subexp(Node** top, OnigToken* tok, int term,
                        UChar** src, UChar* end, ScanEnv* env);
04833
04834 static int
04835 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
04836 ScanEnv* env)
04837 {
04838 int r = 0, num;
04839 Node *target, *work1 = NULL, *work2 = NULL;
04840 OnigOptionType option;
04841 OnigCodePoint c;
04842 OnigEncoding enc = env->enc;
04843
04844 #ifdef USE_NAMED_GROUP
04845 int list_capture;
04846 #endif
04847
04848 UChar* p = *src;
04849 PFETCH_READY;
04850
04851 *np = NULL;
04852 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
04853
04854 option = env->option;
04855 if (PPEEK_IS('?') &&
04856 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
04857 PINC;
04858 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
04859
04860 PFETCH(c);
04861 switch (c) {
04862 case ':':
04863 group:
04864 r = fetch_token(tok, &p, end, env);
04865 if (r < 0) return r;
04866 r = parse_subexp(np, tok, term, &p, end, env);
04867 if (r < 0) return r;
04868 *src = p;
04869 return 1;
04870 break;
04871
04872 case '=':
04873 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
04874 break;
04875 case '!':
04876 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
04877 break;
04878 case '>':
04879 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
04880 break;
04881
04882 #ifdef USE_NAMED_GROUP
04883 case '\'':
04884 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04885 goto named_group1;
04886 }
04887 else
04888 return ONIGERR_UNDEFINED_GROUP_OPTION;
04889 break;
04890
04891 #ifdef USE_CAPITAL_P_NAMED_GROUP
04892 case 'P':
04893 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
04894 PFETCH(c);
04895 if (c == '<') goto named_group1;
04896 }
04897 return ONIGERR_UNDEFINED_GROUP_OPTION;
04898 break;
04899 #endif
04900 #endif
04901
04902 case '<':
04903 PFETCH(c);
04904 if (c == '=')
04905 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
04906 else if (c == '!')
04907 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
04908 #ifdef USE_NAMED_GROUP
04909 else {
04910 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04911 UChar *name;
04912 UChar *name_end;
04913
04914 PUNFETCH;
04915 c = '<';
04916
04917 named_group1:
04918 list_capture = 0;
04919
04920 named_group2:
04921 name = p;
04922 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
04923 if (r < 0) return r;
04924
04925 num = scan_env_add_mem_entry(env);
04926 if (num < 0) return num;
04927 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
04928 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
04929
04930 r = name_add(env->reg, name, name_end, num, env);
04931 if (r != 0) return r;
04932 *np = node_new_enclose_memory(env->option, 1);
04933 CHECK_NULL_RETURN_MEMERR(*np);
04934 NENCLOSE(*np)->regnum = num;
04935 if (list_capture != 0)
04936 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
04937 env->num_named++;
04938 }
04939 else {
04940 return ONIGERR_UNDEFINED_GROUP_OPTION;
04941 }
04942 }
04943 #else
04944 else {
04945 return ONIGERR_UNDEFINED_GROUP_OPTION;
04946 }
04947 #endif
04948 break;
04949
04950 case '@':
04951 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
04952 #ifdef USE_NAMED_GROUP
04953 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04954 PFETCH(c);
04955 if (c == '<' || c == '\'') {
04956 list_capture = 1;
04957 goto named_group2;
04958 }
04959 PUNFETCH;
04960 }
04961 #endif
04962 *np = node_new_enclose_memory(env->option, 0);
04963 CHECK_NULL_RETURN_MEMERR(*np);
04964 num = scan_env_add_mem_entry(env);
04965 if (num < 0) {
04966 onig_node_free(*np);
04967 return num;
04968 }
04969 else if (num >= (int )BIT_STATUS_BITS_NUM) {
04970 onig_node_free(*np);
04971 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
04972 }
04973 NENCLOSE(*np)->regnum = num;
04974 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
04975 }
04976 else {
04977 return ONIGERR_UNDEFINED_GROUP_OPTION;
04978 }
04979 break;
04980
04981 case '(':
04982 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) {
04983 UChar *name = NULL;
04984 UChar *name_end;
04985 PFETCH(c);
04986 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
04987 PUNFETCH;
04988 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1);
04989 if (r < 0) return r;
04990 if (num < 0) {
04991 num = BACKREF_REL_TO_ABS(num, env);
04992 if (num <= 0)
04993 return ONIGERR_INVALID_BACKREF;
04994 }
04995 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
04996 if (num > env->num_mem ||
04997 IS_NULL(SCANENV_MEM_NODES(env)[num]))
04998 return ONIGERR_INVALID_BACKREF;
04999 }
05000 }
05001 #ifdef USE_NAMED_GROUP
05002 else if (c == '<' || c == '\'') {
05003 int nums;
05004 int *backs;
05005
05006 name = p;
05007 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
05008 if (r < 0) return r;
05009 PFETCH(c);
05010 if (c != ')') return ONIGERR_UNDEFINED_GROUP_OPTION;
05011
05012 nums = onig_name_to_group_numbers(env->reg, name, name_end, &backs);
05013 if (nums <= 0) {
05014 onig_scan_env_set_error_string(env,
05015 ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
05016 return ONIGERR_UNDEFINED_NAME_REFERENCE;
05017 }
05018 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
05019 int i;
05020 for (i = 0; i < nums; i++) {
05021 if (backs[i] > env->num_mem ||
05022 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
05023 return ONIGERR_INVALID_BACKREF;
05024 }
05025 }
05026 num = backs[0];
05027 }
05028 #endif
05029 else
05030 return ONIGERR_INVALID_CONDITION_PATTERN;
05031 *np = node_new_enclose(ENCLOSE_CONDITION);
05032 CHECK_NULL_RETURN_MEMERR(*np);
05033 NENCLOSE(*np)->regnum = num;
05034 if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF;
05035 }
05036 else
05037 return ONIGERR_UNDEFINED_GROUP_OPTION;
05038 break;
05039
05040 #if 0
05041 case '|':
05042 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) {
05043
05044 }
05045 else
05046 return ONIGERR_UNDEFINED_GROUP_OPTION;
05047 break;
05048 #endif
05049
05050 case '^':
05051 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
05052
05053 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
05054 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
05055 ONOFF(option, ONIG_OPTION_SINGLELINE, 0);
05056 ONOFF(option, ONIG_OPTION_MULTILINE, 1);
05057 ONOFF(option, ONIG_OPTION_EXTEND, 1);
05058 PFETCH(c);
05059 }
05060 #if 0
05061 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
05062
05063 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
05064 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
05065 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
05066 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
05067 ONOFF(option, ONIG_OPTION_MULTILINE, 1);
05068 ONOFF(option, ONIG_OPTION_EXTEND, 1);
05069 PFETCH(c);
05070 }
05071 #endif
05072 else {
05073 return ONIGERR_UNDEFINED_GROUP_OPTION;
05074 }
05075
05076 #ifdef USE_POSIXLINE_OPTION
05077 case 'p':
05078 #endif
05079 case '-': case 'i': case 'm': case 's': case 'x':
05080 case 'a': case 'd': case 'l': case 'u':
05081 {
05082 int neg = 0;
05083
05084 while (1) {
05085 switch (c) {
05086 case ':':
05087 case ')':
05088 break;
05089
05090 case '-': neg = 1; break;
05091 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
05092 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
05093 case 's':
05094 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
05095 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
05096 }
05097 else
05098 return ONIGERR_UNDEFINED_GROUP_OPTION;
05099 break;
05100
05101 case 'm':
05102 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
05103 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
05104 }
05105 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
05106 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
05107 }
05108 else
05109 return ONIGERR_UNDEFINED_GROUP_OPTION;
05110 break;
05111 #ifdef USE_POSIXLINE_OPTION
05112 case 'p':
05113 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
05114 break;
05115 #endif
05116
05117 case 'a':
05118 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
05119 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
05120 (neg == 0)) {
05121 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
05122 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
05123 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
05124 }
05125 else
05126 return ONIGERR_UNDEFINED_GROUP_OPTION;
05127 break;
05128
05129 case 'u':
05130 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
05131 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
05132 (neg == 0)) {
05133 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
05134 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
05135 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
05136 }
05137 else
05138 return ONIGERR_UNDEFINED_GROUP_OPTION;
05139 break;
05140
05141 case 'd':
05142 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) &&
05143 (neg == 0)) {
05144 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
05145 }
05146 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) &&
05147 (neg == 0)) {
05148 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
05149 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
05150 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
05151 }
05152 else
05153 return ONIGERR_UNDEFINED_GROUP_OPTION;
05154 break;
05155
05156 case 'l':
05157 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) {
05158 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
05159 }
05160 else
05161 return ONIGERR_UNDEFINED_GROUP_OPTION;
05162 break;
05163
05164 default:
05165 return ONIGERR_UNDEFINED_GROUP_OPTION;
05166 }
05167
05168 if (c == ')') {
05169 *np = node_new_option(option);
05170 CHECK_NULL_RETURN_MEMERR(*np);
05171 *src = p;
05172 return 2;
05173 }
05174 else if (c == ':') {
05175 OnigOptionType prev = env->option;
05176
05177 env->option = option;
05178 r = fetch_token(tok, &p, end, env);
05179 if (r < 0) return r;
05180 r = parse_subexp(&target, tok, term, &p, end, env);
05181 env->option = prev;
05182 if (r < 0) return r;
05183 *np = node_new_option(option);
05184 CHECK_NULL_RETURN_MEMERR(*np);
05185 NENCLOSE(*np)->target = target;
05186 *src = p;
05187 return 0;
05188 }
05189
05190 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
05191 PFETCH(c);
05192 }
05193 }
05194 break;
05195
05196 default:
05197 return ONIGERR_UNDEFINED_GROUP_OPTION;
05198 }
05199 }
05200 else {
05201 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
05202 goto group;
05203
05204 *np = node_new_enclose_memory(env->option, 0);
05205 CHECK_NULL_RETURN_MEMERR(*np);
05206 num = scan_env_add_mem_entry(env);
05207 if (num < 0) return num;
05208 NENCLOSE(*np)->regnum = num;
05209 }
05210
05211 CHECK_NULL_RETURN_MEMERR(*np);
05212 r = fetch_token(tok, &p, end, env);
05213 if (r < 0) return r;
05214 r = parse_subexp(&target, tok, term, &p, end, env);
05215 if (r < 0) {
05216 onig_node_free(target);
05217 return r;
05218 }
05219
05220 if (NTYPE(*np) == NT_ANCHOR)
05221 NANCHOR(*np)->target = target;
05222 else {
05223 NENCLOSE(*np)->target = target;
05224 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
05225
05226 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
05227 if (r != 0) return r;
05228 }
05229 else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) {
05230 if (NTYPE(target) != NT_ALT) {
05231
05232 work1 = node_new_empty();
05233 if (IS_NULL(work1)) goto err;
05234 work2 = onig_node_new_alt(work1, NULL_NODE);
05235 if (IS_NULL(work2)) goto err;
05236 work1 = onig_node_new_alt(target, work2);
05237 if (IS_NULL(work1)) goto err;
05238 NENCLOSE(*np)->target = work1;
05239 }
05240 }
05241 }
05242
05243 *src = p;
05244 return 0;
05245
05246 err:
05247 onig_node_free(work1);
05248 onig_node_free(work2);
05249 onig_node_free(*np);
05250 *np = NULL;
05251 return ONIGERR_MEMORY;
05252 }
05253
/* Source spelling of each "popular" quantifier.  set_quantifier() indexes
 * this table with the value returned by popular_quantifier_num() when it
 * formats a nested-repeat warning. */
static const char* const PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};
05257
/* Text describing what a pair of nested quantifiers was replaced with.
 * set_quantifier() indexes it with a ReduceTypeTable entry; the first two
 * slots are empty because set_quantifier() handles RQ_ASIS and RQ_DEL
 * before this table is consulted (presumably those are values 0 and 1 --
 * confirm against the ReduceType enum). */
static const char* const ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
05261
05262 static int
05263 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
05264 {
05265 QtfrNode* qn;
05266
05267 qn = NQTFR(qnode);
05268 if (qn->lower == 1 && qn->upper == 1) {
05269 return 1;
05270 }
05271
05272 switch (NTYPE(target)) {
05273 case NT_STR:
05274 if (! group) {
05275 StrNode* sn = NSTR(target);
05276 if (str_node_can_be_split(sn, env->enc)) {
05277 Node* n = str_node_split_last_char(sn, env->enc);
05278 if (IS_NOT_NULL(n)) {
05279 qn->target = n;
05280 return 2;
05281 }
05282 }
05283 }
05284 break;
05285
05286 case NT_QTFR:
05287 {
05288
05289 QtfrNode* qnt = NQTFR(target);
05290 int nestq_num = popular_quantifier_num(qn);
05291 int targetq_num = popular_quantifier_num(qnt);
05292
05293 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
05294 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
05295 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
05296 switch (ReduceTypeTable[targetq_num][nestq_num]) {
05297 case RQ_ASIS:
05298 break;
05299
05300 case RQ_DEL:
05301 if (onig_warn != onig_null_warn) {
05302 onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'",
05303 PopularQStr[targetq_num]);
05304 }
05305 goto warn_exit;
05306 break;
05307
05308 default:
05309 if (onig_warn != onig_null_warn) {
05310 onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression",
05311 PopularQStr[targetq_num], PopularQStr[nestq_num],
05312 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
05313 }
05314 goto warn_exit;
05315 break;
05316 }
05317 }
05318
05319 warn_exit:
05320 #endif
05321 if (targetq_num >= 0) {
05322 if (nestq_num >= 0) {
05323 onig_reduce_nested_quantifier(qnode, target);
05324 goto q_exit;
05325 }
05326 else if (targetq_num == 1 || targetq_num == 2) {
05327
05328 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
05329 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
05330 }
05331 }
05332 }
05333 }
05334 break;
05335
05336 default:
05337 break;
05338 }
05339
05340 qn->target = target;
05341 q_exit:
05342 return 0;
05343 }
05344
05345
05346 #ifdef USE_SHARED_CCLASS_TABLE
05347
05348 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
05349
05350
05351
05352 typedef struct {
05353 OnigEncoding enc;
05354 int not;
05355 int type;
05356 } type_cclass_key;
05357
05358 static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
05359 {
05360 if (x->type != y->type) return 1;
05361 if (x->enc != y->enc) return 1;
05362 if (x->not != y->not) return 1;
05363 return 0;
05364 }
05365
05366 static st_index_t type_cclass_hash(type_cclass_key* key)
05367 {
05368 int i, val;
05369 UChar *p;
05370
05371 val = 0;
05372
05373 p = (UChar* )&(key->enc);
05374 for (i = 0; i < (int )sizeof(key->enc); i++) {
05375 val = val * 997 + (int )*p++;
05376 }
05377
05378 p = (UChar* )(&key->type);
05379 for (i = 0; i < (int )sizeof(key->type); i++) {
05380 val = val * 997 + (int )*p++;
05381 }
05382
05383 val += key->not;
05384 return val + (val >> 5);
05385 }
05386
05387 static const struct st_hash_type type_type_cclass_hash = {
05388 type_cclass_cmp,
05389 type_cclass_hash,
05390 };
05391
05392 static st_table* OnigTypeCClassTable;
05393
05394
05395 static int
05396 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
05397 {
05398 if (IS_NOT_NULL(node)) {
05399 CClassNode* cc = NCCLASS(node);
05400 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
05401 xfree(node);
05402 }
05403
05404 if (IS_NOT_NULL(key)) xfree(key);
05405 return ST_DELETE;
05406 }
05407
05408 extern int
05409 onig_free_shared_cclass_table(void)
05410 {
05411 THREAD_ATOMIC_START;
05412 if (IS_NOT_NULL(OnigTypeCClassTable)) {
05413 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
05414 onig_st_free_table(OnigTypeCClassTable);
05415 OnigTypeCClassTable = NULL;
05416 }
05417 THREAD_ATOMIC_END;
05418
05419 return 0;
05420 }
05421
05422 #endif
05423
05424
05425 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05426 static int
05427 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
05428 {
05429 BBuf *tbuf;
05430 int r;
05431
05432 if (IS_NCCLASS_NOT(cc)) {
05433 bitset_invert(cc->bs);
05434
05435 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
05436 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
05437 if (r != 0) return r;
05438
05439 bbuf_free(cc->mbuf);
05440 cc->mbuf = tbuf;
05441 }
05442
05443 NCCLASS_CLEAR_NOT(cc);
05444 }
05445
05446 return 0;
05447 }
05448 #endif
05449
05450 typedef struct {
05451 ScanEnv* env;
05452 CClassNode* cc;
05453 Node* alt_root;
05454 Node** ptail;
05455 } IApplyCaseFoldArg;
05456
05457 static int
05458 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
05459 int to_len, void* arg)
05460 {
05461 IApplyCaseFoldArg* iarg;
05462 ScanEnv* env;
05463 CClassNode* cc;
05464 BitSetRef bs;
05465
05466 iarg = (IApplyCaseFoldArg* )arg;
05467 env = iarg->env;
05468 cc = iarg->cc;
05469 bs = cc->bs;
05470
05471 if (to_len == 1) {
05472 int is_in = onig_is_code_in_cc(env->enc, from, cc);
05473 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05474 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
05475 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
05476 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
05477 add_code_range0(&(cc->mbuf), env, *to, *to, 0);
05478 }
05479 else {
05480 BITSET_SET_BIT(bs, *to);
05481 }
05482 }
05483 #else
05484 if (is_in != 0) {
05485 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
05486 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
05487 add_code_range0(&(cc->mbuf), env, *to, *to, 0);
05488 }
05489 else {
05490 if (IS_NCCLASS_NOT(cc)) {
05491 BITSET_CLEAR_BIT(bs, *to);
05492 }
05493 else
05494 BITSET_SET_BIT(bs, *to);
05495 }
05496 }
05497 #endif
05498 }
05499 else {
05500 int r, i, len;
05501 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
05502 Node *snode = NULL_NODE;
05503
05504 if (onig_is_code_in_cc(env->enc, from, cc)
05505 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05506 && !IS_NCCLASS_NOT(cc)
05507 #endif
05508 ) {
05509 for (i = 0; i < to_len; i++) {
05510 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
05511 if (i == 0) {
05512 snode = onig_node_new_str(buf, buf + len);
05513 CHECK_NULL_RETURN_MEMERR(snode);
05514
05515
05516
05517 NSTRING_SET_AMBIG(snode);
05518 }
05519 else {
05520 r = onig_node_str_cat(snode, buf, buf + len);
05521 if (r < 0) {
05522 onig_node_free(snode);
05523 return r;
05524 }
05525 }
05526 }
05527
05528 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
05529 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
05530 iarg->ptail = &(NCDR((*(iarg->ptail))));
05531 }
05532 }
05533
05534 return 0;
05535 }
05536
05537 static int
05538 node_linebreak(Node** np, ScanEnv* env)
05539 {
05540
05541 Node* left = NULL;
05542 Node* right = NULL;
05543 Node* target1 = NULL;
05544 Node* target2 = NULL;
05545 CClassNode* cc;
05546 int num1, num2;
05547 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
05548
05549
05550 num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
05551 if (num1 < 0) return num1;
05552 num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
05553 if (num2 < 0) return num2;
05554 left = node_new_str_raw(buf, buf + num1 + num2);
05555 if (IS_NULL(left)) goto err;
05556
05557
05558 right = node_new_cclass();
05559 if (IS_NULL(right)) goto err;
05560 cc = NCCLASS(right);
05561 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
05562 add_code_range(&(cc->mbuf), env, 0x0A, 0x0D);
05563 }
05564 else {
05565 bitset_set_range(env, cc->bs, 0x0A, 0x0D);
05566 }
05567
05568
05569 if (ONIGENC_IS_UNICODE(env->enc)) {
05570
05571 add_code_range(&(cc->mbuf), env, 0x85, 0x85);
05572 add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
05573 }
05574
05575
05576 target1 = onig_node_new_alt(right, NULL_NODE);
05577 if (IS_NULL(target1)) goto err;
05578 right = NULL;
05579 target2 = onig_node_new_alt(left, target1);
05580 if (IS_NULL(target2)) goto err;
05581 left = NULL;
05582 target1 = NULL;
05583
05584
05585 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
05586 if (IS_NULL(*np)) goto err;
05587 NENCLOSE(*np)->target = target2;
05588 return ONIG_NORMAL;
05589
05590 err:
05591 onig_node_free(left);
05592 onig_node_free(right);
05593 onig_node_free(target1);
05594 onig_node_free(target2);
05595 return ONIGERR_MEMORY;
05596 }
05597
05598 static int
05599 node_extended_grapheme_cluster(Node** np, ScanEnv* env)
05600 {
05601
05602 Node* np1 = NULL;
05603 Node* np2 = NULL;
05604 Node* qn = NULL;
05605 Node* list1 = NULL;
05606 Node* list2 = NULL;
05607 int r = 0;
05608
05609 #ifdef USE_UNICODE_PROPERTIES
05610 if (ONIGENC_IS_UNICODE(env->enc)) {
05611
05612 CClassNode* cc1;
05613 CClassNode* cc2;
05614 UChar* propname = (UChar* )"M";
05615 int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII,
05616 propname, propname + 1);
05617 if (ctype >= 0) {
05618
05619 np1 = node_new_cclass();
05620 if (IS_NULL(np1)) goto err;
05621 cc1 = NCCLASS(np1);
05622 r = add_ctype_to_cc(cc1, ctype, 0, 1, env);
05623 if (r != 0) goto err;
05624 NCCLASS_SET_NOT(cc1);
05625
05626
05627 np2 = node_new_cclass();
05628 if (IS_NULL(np2)) goto err;
05629 cc2 = NCCLASS(np2);
05630 r = add_ctype_to_cc(cc2, ctype, 0, 1, env);
05631 if (r != 0) goto err;
05632
05633 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
05634 if (IS_NULL(qn)) goto err;
05635 NQTFR(qn)->target = np2;
05636 np2 = NULL;
05637
05638
05639 list2 = node_new_list(qn, NULL_NODE);
05640 if (IS_NULL(list2)) goto err;
05641 qn = NULL;
05642 list1 = node_new_list(np1, list2);
05643 if (IS_NULL(list1)) goto err;
05644 np1 = NULL;
05645 list2 = NULL;
05646
05647
05648 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
05649 if (IS_NULL(*np)) goto err;
05650 NENCLOSE(*np)->target = list1;
05651 return ONIG_NORMAL;
05652 }
05653 }
05654 #endif
05655 if (IS_NULL(*np)) {
05656
05657 OnigOptionType option;
05658 np1 = node_new_anychar();
05659 if (IS_NULL(np1)) goto err;
05660
05661 option = env->option;
05662 ONOFF(option, ONIG_OPTION_MULTILINE, 0);
05663 *np = node_new_option(option);
05664 if (IS_NULL(*np)) goto err;
05665 NENCLOSE(*np)->target = np1;
05666 }
05667 return ONIG_NORMAL;
05668
05669 err:
05670 onig_node_free(np1);
05671 onig_node_free(np2);
05672 onig_node_free(qn);
05673 onig_node_free(list1);
05674 onig_node_free(list2);
05675 return (r == 0) ? ONIGERR_MEMORY : r;
05676 }
05677
/* Count the set bits among the low 32 bits of `bits`.
 * Adjacent bit fields are summed in parallel: the field width doubles on
 * every round (1, 2, 4, 8, 16) until a single field holds the total. */
static int
countbits(unsigned int bits)
{
  static const unsigned int field_mask[5] = {
    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
  };
  int round;

  for (round = 0; round < 5; round++) {
    unsigned int width = 1u << round;
    bits = (bits & field_mask[round]) + ((bits >> width) & field_mask[round]);
  }
  return bits;
}
05687
05688 static int
05689 is_onechar_cclass(CClassNode* cc, OnigCodePoint* code)
05690 {
05691 const OnigCodePoint not_found = ONIG_LAST_CODE_POINT;
05692 OnigCodePoint c = not_found;
05693 int i;
05694 BBuf *bbuf = cc->mbuf;
05695
05696 if (IS_NCCLASS_NOT(cc)) return 0;
05697
05698
05699 if (IS_NOT_NULL(bbuf)) {
05700 OnigCodePoint n, *data;
05701 GET_CODE_POINT(n, bbuf->p);
05702 data = (OnigCodePoint* )(bbuf->p) + 1;
05703 if ((n == 1) && (data[0] == data[1])) {
05704
05705 c = data[0];
05706 if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) {
05707
05708 c = not_found;
05709 }
05710 }
05711 else {
05712 return 0;
05713 }
05714 }
05715
05716
05717 for (i = 0; i < BITSET_SIZE; i++) {
05718 Bits b1 = cc->bs[i];
05719 if (b1 != 0) {
05720 if (((b1 & (b1 - 1)) == 0) && (c == not_found)) {
05721 c = BITS_IN_ROOM * i + countbits(b1 - 1);
05722 } else {
05723 return 0;
05724 }
05725 }
05726 }
05727
05728 if (c != not_found) {
05729 *code = c;
05730 return 1;
05731 }
05732
05733
05734 return 0;
05735 }
05736
05737
05738 static int
05739 parse_exp(Node** np, OnigToken* tok, int term,
05740 UChar** src, UChar* end, ScanEnv* env)
05741 {
05742 int r, len, group = 0;
05743 Node* qn;
05744 Node** targetp;
05745
05746 *np = NULL;
05747 if (tok->type == (enum TokenSyms )term)
05748 goto end_of_token;
05749
05750 switch (tok->type) {
05751 case TK_ALT:
05752 case TK_EOT:
05753 end_of_token:
05754 *np = node_new_empty();
05755 return tok->type;
05756 break;
05757
05758 case TK_SUBEXP_OPEN:
05759 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
05760 if (r < 0) return r;
05761 if (r == 1) group = 1;
05762 else if (r == 2) {
05763 Node* target;
05764 OnigOptionType prev = env->option;
05765
05766 env->option = NENCLOSE(*np)->option;
05767 r = fetch_token(tok, src, end, env);
05768 if (r < 0) return r;
05769 r = parse_subexp(&target, tok, term, src, end, env);
05770 env->option = prev;
05771 if (r < 0) {
05772 onig_node_free(target);
05773 return r;
05774 }
05775 NENCLOSE(*np)->target = target;
05776 return tok->type;
05777 }
05778 break;
05779
05780 case TK_SUBEXP_CLOSE:
05781 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
05782 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
05783
05784 if (tok->escaped) goto tk_raw_byte;
05785 else goto tk_byte;
05786 break;
05787
05788 case TK_LINEBREAK:
05789 r = node_linebreak(np, env);
05790 if (r < 0) return r;
05791 break;
05792
05793 case TK_EXTENDED_GRAPHEME_CLUSTER:
05794 r = node_extended_grapheme_cluster(np, env);
05795 if (r < 0) return r;
05796 break;
05797
05798 case TK_KEEP:
05799 *np = onig_node_new_anchor(ANCHOR_KEEP);
05800 CHECK_NULL_RETURN_MEMERR(*np);
05801 break;
05802
05803 case TK_STRING:
05804 tk_byte:
05805 {
05806 *np = node_new_str(tok->backp, *src);
05807 CHECK_NULL_RETURN_MEMERR(*np);
05808
05809 string_loop:
05810 while (1) {
05811 r = fetch_token(tok, src, end, env);
05812 if (r < 0) return r;
05813 if (r == TK_STRING) {
05814 r = onig_node_str_cat(*np, tok->backp, *src);
05815 }
05816 #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
05817 else if (r == TK_CODE_POINT) {
05818 r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
05819 }
05820 #endif
05821 else {
05822 break;
05823 }
05824 if (r < 0) return r;
05825 }
05826
05827 string_end:
05828 targetp = np;
05829 goto repeat;
05830 }
05831 break;
05832
05833 case TK_RAW_BYTE:
05834 tk_raw_byte:
05835 {
05836 *np = node_new_str_raw_char((UChar )tok->u.c);
05837 CHECK_NULL_RETURN_MEMERR(*np);
05838 len = 1;
05839 while (1) {
05840 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
05841 if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) {
05842 r = fetch_token(tok, src, end, env);
05843 NSTRING_CLEAR_RAW(*np);
05844 goto string_end;
05845 }
05846 }
05847
05848 r = fetch_token(tok, src, end, env);
05849 if (r < 0) return r;
05850 if (r != TK_RAW_BYTE) {
05851
05852 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
05853 int rem;
05854 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
05855 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
05856 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
05857 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
05858 NSTRING_CLEAR_RAW(*np);
05859 goto string_end;
05860 }
05861 }
05862 #endif
05863 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
05864 }
05865
05866 r = node_str_cat_char(*np, (UChar )tok->u.c);
05867 if (r < 0) return r;
05868
05869 len++;
05870 }
05871 }
05872 break;
05873
05874 case TK_CODE_POINT:
05875 {
05876 *np = node_new_empty();
05877 CHECK_NULL_RETURN_MEMERR(*np);
05878 r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
05879 if (r != 0) return r;
05880 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
05881 NSTRING_SET_RAW(*np);
05882 #else
05883 goto string_loop;
05884 #endif
05885 }
05886 break;
05887
05888 case TK_QUOTE_OPEN:
05889 {
05890 OnigCodePoint end_op[2];
05891 UChar *qstart, *qend, *nextp;
05892
05893 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
05894 end_op[1] = (OnigCodePoint )'E';
05895 qstart = *src;
05896 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
05897 if (IS_NULL(qend)) {
05898 nextp = qend = end;
05899 }
05900 *np = node_new_str(qstart, qend);
05901 CHECK_NULL_RETURN_MEMERR(*np);
05902 *src = nextp;
05903 }
05904 break;
05905
05906 case TK_CHAR_TYPE:
05907 {
05908 switch (tok->u.prop.ctype) {
05909 case ONIGENC_CTYPE_WORD:
05910 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not,
05911 IS_ASCII_RANGE(env->option));
05912 CHECK_NULL_RETURN_MEMERR(*np);
05913 break;
05914
05915 case ONIGENC_CTYPE_SPACE:
05916 case ONIGENC_CTYPE_DIGIT:
05917 case ONIGENC_CTYPE_XDIGIT:
05918 {
05919 CClassNode* cc;
05920
05921 #ifdef USE_SHARED_CCLASS_TABLE
05922 const OnigCodePoint *mbr;
05923 OnigCodePoint sb_out;
05924
05925 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
05926 &sb_out, &mbr);
05927 if (r == 0 &&
05928 ! IS_ASCII_RANGE(env->option) &&
05929 ONIGENC_CODE_RANGE_NUM(mbr)
05930 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
05931 type_cclass_key key;
05932 type_cclass_key* new_key;
05933
05934 key.enc = env->enc;
05935 key.not = tok->u.prop.not;
05936 key.type = tok->u.prop.ctype;
05937
05938 THREAD_ATOMIC_START;
05939
05940 if (IS_NULL(OnigTypeCClassTable)) {
05941 OnigTypeCClassTable
05942 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
05943 if (IS_NULL(OnigTypeCClassTable)) {
05944 THREAD_ATOMIC_END;
05945 return ONIGERR_MEMORY;
05946 }
05947 }
05948 else {
05949 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
05950 (st_data_t* )np)) {
05951 THREAD_ATOMIC_END;
05952 break;
05953 }
05954 }
05955
05956 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
05957 sb_out, mbr);
05958 if (IS_NULL(*np)) {
05959 THREAD_ATOMIC_END;
05960 return ONIGERR_MEMORY;
05961 }
05962
05963 cc = NCCLASS(*np);
05964 NCCLASS_SET_SHARE(cc);
05965 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
05966 xmemcpy(new_key, &key, sizeof(type_cclass_key));
05967 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
05968 (st_data_t )*np);
05969
05970 THREAD_ATOMIC_END;
05971 }
05972 else {
05973 #endif
05974 *np = node_new_cclass();
05975 CHECK_NULL_RETURN_MEMERR(*np);
05976 cc = NCCLASS(*np);
05977 r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0, 0, env);
05978 if (r != 0) return r;
05979 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
05980 #ifdef USE_SHARED_CCLASS_TABLE
05981 }
05982 #endif
05983 }
05984 break;
05985
05986 default:
05987 return ONIGERR_PARSER_BUG;
05988 break;
05989 }
05990 }
05991 break;
05992
05993 case TK_CHAR_PROPERTY:
05994 r = parse_char_property(np, tok, src, end, env);
05995 if (r != 0) return r;
05996 break;
05997
05998 case TK_CC_OPEN:
05999 {
06000 CClassNode* cc;
06001 OnigCodePoint code;
06002
06003 r = parse_char_class(np, tok, src, end, env);
06004 if (r != 0) return r;
06005
06006 cc = NCCLASS(*np);
06007 if (is_onechar_cclass(cc, &code)) {
06008 onig_node_free(*np);
06009 *np = node_new_empty();
06010 CHECK_NULL_RETURN_MEMERR(*np);
06011 r = node_str_cat_codepoint(*np, env->enc, code);
06012 if (r != 0) return r;
06013 goto string_loop;
06014 }
06015 if (IS_IGNORECASE(env->option)) {
06016 IApplyCaseFoldArg iarg;
06017
06018 iarg.env = env;
06019 iarg.cc = cc;
06020 iarg.alt_root = NULL_NODE;
06021 iarg.ptail = &(iarg.alt_root);
06022
06023 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
06024 i_apply_case_fold, &iarg);
06025 if (r != 0) {
06026 onig_node_free(iarg.alt_root);
06027 return r;
06028 }
06029 if (IS_NOT_NULL(iarg.alt_root)) {
06030 Node* work = onig_node_new_alt(*np, iarg.alt_root);
06031 if (IS_NULL(work)) {
06032 onig_node_free(iarg.alt_root);
06033 return ONIGERR_MEMORY;
06034 }
06035 *np = work;
06036 }
06037 }
06038 }
06039 break;
06040
06041 case TK_ANYCHAR:
06042 *np = node_new_anychar();
06043 CHECK_NULL_RETURN_MEMERR(*np);
06044 break;
06045
06046 case TK_ANYCHAR_ANYTIME:
06047 *np = node_new_anychar();
06048 CHECK_NULL_RETURN_MEMERR(*np);
06049 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
06050 CHECK_NULL_RETURN_MEMERR(qn);
06051 NQTFR(qn)->target = *np;
06052 *np = qn;
06053 break;
06054
06055 case TK_BACKREF:
06056 len = tok->u.backref.num;
06057 *np = node_new_backref(len,
06058 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
06059 tok->u.backref.by_name,
06060 #ifdef USE_BACKREF_WITH_LEVEL
06061 tok->u.backref.exist_level,
06062 tok->u.backref.level,
06063 #endif
06064 env);
06065 CHECK_NULL_RETURN_MEMERR(*np);
06066 break;
06067
06068 #ifdef USE_SUBEXP_CALL
06069 case TK_CALL:
06070 {
06071 int gnum = tok->u.call.gnum;
06072
06073 if (gnum < 0 || tok->u.call.rel != 0) {
06074 if (gnum > 0) gnum--;
06075 gnum = BACKREF_REL_TO_ABS(gnum, env);
06076 if (gnum <= 0)
06077 return ONIGERR_INVALID_BACKREF;
06078 }
06079 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
06080 CHECK_NULL_RETURN_MEMERR(*np);
06081 env->num_call++;
06082 }
06083 break;
06084 #endif
06085
06086 case TK_ANCHOR:
06087 *np = onig_node_new_anchor(tok->u.anchor.subtype);
06088 CHECK_NULL_RETURN_MEMERR(*np);
06089 NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range;
06090 break;
06091
06092 case TK_OP_REPEAT:
06093 case TK_INTERVAL:
06094 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
06095 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
06096 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
06097 else
06098 *np = node_new_empty();
06099 }
06100 else {
06101 goto tk_byte;
06102 }
06103 break;
06104
06105 default:
06106 return ONIGERR_PARSER_BUG;
06107 break;
06108 }
06109
06110 {
06111 targetp = np;
06112
06113 re_entry:
06114 r = fetch_token(tok, src, end, env);
06115 if (r < 0) return r;
06116
06117 repeat:
06118 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
06119 if (is_invalid_quantifier_target(*targetp))
06120 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
06121
06122 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
06123 (r == TK_INTERVAL ? 1 : 0));
06124 CHECK_NULL_RETURN_MEMERR(qn);
06125 NQTFR(qn)->greedy = tok->u.repeat.greedy;
06126 r = set_quantifier(qn, *targetp, group, env);
06127 if (r < 0) {
06128 onig_node_free(qn);
06129 return r;
06130 }
06131
06132 if (tok->u.repeat.possessive != 0) {
06133 Node* en;
06134 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
06135 if (IS_NULL(en)) {
06136 onig_node_free(qn);
06137 return ONIGERR_MEMORY;
06138 }
06139 NENCLOSE(en)->target = qn;
06140 qn = en;
06141 }
06142
06143 if (r == 0) {
06144 *targetp = qn;
06145 }
06146 else if (r == 1) {
06147 onig_node_free(qn);
06148 }
06149 else if (r == 2) {
06150 Node *tmp;
06151
06152 *targetp = node_new_list(*targetp, NULL);
06153 if (IS_NULL(*targetp)) {
06154 onig_node_free(qn);
06155 return ONIGERR_MEMORY;
06156 }
06157 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
06158 if (IS_NULL(tmp)) {
06159 onig_node_free(qn);
06160 return ONIGERR_MEMORY;
06161 }
06162 targetp = &(NCAR(tmp));
06163 }
06164 goto re_entry;
06165 }
06166 }
06167
06168 return r;
06169 }
06170
06171 static int
06172 parse_branch(Node** top, OnigToken* tok, int term,
06173 UChar** src, UChar* end, ScanEnv* env)
06174 {
06175 int r;
06176 Node *node, **headp;
06177
06178 *top = NULL;
06179 r = parse_exp(&node, tok, term, src, end, env);
06180 if (r < 0) {
06181 onig_node_free(node);
06182 return r;
06183 }
06184
06185 if (r == TK_EOT || r == term || r == TK_ALT) {
06186 *top = node;
06187 }
06188 else {
06189 *top = node_new_list(node, NULL);
06190 headp = &(NCDR(*top));
06191 while (r != TK_EOT && r != term && r != TK_ALT) {
06192 r = parse_exp(&node, tok, term, src, end, env);
06193 if (r < 0) {
06194 onig_node_free(node);
06195 return r;
06196 }
06197
06198 if (NTYPE(node) == NT_LIST) {
06199 *headp = node;
06200 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
06201 headp = &(NCDR(node));
06202 }
06203 else {
06204 *headp = node_new_list(node, NULL);
06205 headp = &(NCDR(*headp));
06206 }
06207 }
06208 }
06209
06210 return r;
06211 }
06212
06213
06214 static int
06215 parse_subexp(Node** top, OnigToken* tok, int term,
06216 UChar** src, UChar* end, ScanEnv* env)
06217 {
06218 int r;
06219 Node *node, **headp;
06220
06221 *top = NULL;
06222 r = parse_branch(&node, tok, term, src, end, env);
06223 if (r < 0) {
06224 onig_node_free(node);
06225 return r;
06226 }
06227
06228 if (r == term) {
06229 *top = node;
06230 }
06231 else if (r == TK_ALT) {
06232 *top = onig_node_new_alt(node, NULL);
06233 headp = &(NCDR(*top));
06234 while (r == TK_ALT) {
06235 r = fetch_token(tok, src, end, env);
06236 if (r < 0) return r;
06237 r = parse_branch(&node, tok, term, src, end, env);
06238 if (r < 0) {
06239 onig_node_free(node);
06240 return r;
06241 }
06242
06243 *headp = onig_node_new_alt(node, NULL);
06244 headp = &(NCDR(*headp));
06245 }
06246
06247 if (tok->type != (enum TokenSyms )term)
06248 goto err;
06249 }
06250 else {
06251 onig_node_free(node);
06252 err:
06253 if (term == TK_SUBEXP_CLOSE)
06254 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
06255 else
06256 return ONIGERR_PARSER_BUG;
06257 }
06258
06259 return r;
06260 }
06261
06262 static int
06263 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
06264 {
06265 int r;
06266 OnigToken tok;
06267
06268 r = fetch_token(&tok, src, end, env);
06269 if (r < 0) return r;
06270 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
06271 if (r < 0) return r;
06272
06273 #ifdef USE_SUBEXP_CALL
06274 if (env->num_call > 0) {
06275
06276 const int num = 0;
06277 Node* np;
06278 np = node_new_enclose_memory(env->option, 0);
06279 CHECK_NULL_RETURN_MEMERR(np);
06280 NENCLOSE(np)->regnum = num;
06281 NENCLOSE(np)->target = *top;
06282 r = scan_env_set_mem_node(env, num, np);
06283 if (r != 0) return r;
06284 *top = np;
06285 }
06286 #endif
06287 return 0;
06288 }
06289
06290 extern int
06291 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
06292 regex_t* reg, ScanEnv* env)
06293 {
06294 int r;
06295 UChar* p;
06296
06297 #ifdef USE_NAMED_GROUP
06298 names_clear(reg);
06299 #endif
06300
06301 scan_env_clear(env);
06302 env->option = reg->options;
06303 env->case_fold_flag = reg->case_fold_flag;
06304 env->enc = reg->enc;
06305 env->syntax = reg->syntax;
06306 env->pattern = (UChar* )pattern;
06307 env->pattern_end = (UChar* )end;
06308 env->reg = reg;
06309
06310 *root = NULL;
06311 p = (UChar* )pattern;
06312 r = parse_regexp(root, &p, (UChar* )end, env);
06313 reg->num_mem = env->num_mem;
06314 return r;
06315 }
06316
06317 extern void
06318 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
06319 UChar* arg, UChar* arg_end)
06320 {
06321 env->error = arg;
06322 env->error_end = arg_end;
06323 }
06324