00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "regparse.h"
00032
/* Buffer size for warning messages -- presumably in bytes; the code that
 * uses it lies outside this section (TODO confirm). */
#define WARN_BUFSIZE    256

/* Feature switch: defined without a value, so it can only be tested with
 * #ifdef / defined(). */
#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
00036
00037
00038 const OnigSyntaxType OnigSyntaxRuby = {
00039 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
00040 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
00041 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
00042 ONIG_SYN_OP_ESC_C_CONTROL )
00043 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
00044 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
00045 ONIG_SYN_OP2_OPTION_RUBY |
00046 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
00047 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
00048 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
00049 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
00050 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
00051 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
00052 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
00053 ONIG_SYN_OP2_ESC_H_XDIGIT |
00054 ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
00055 ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
00056 ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK |
00057 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP )
00058 , ( SYN_GNU_REGEX_BV |
00059 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
00060 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
00061 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
00062 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
00063 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
00064 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
00065 ONIG_SYN_WARN_CC_DUP |
00066 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
00067 , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE |
00068 ONIG_OPTION_WORD_BOUND_ALL_RANGE )
00069 ,
00070 {
00071 (OnigCodePoint )'\\'
00072 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00073 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00074 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00075 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00076 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00077 }
00078 };
00079
00080 const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
00081
00082 extern void onig_null_warn(const char* s ARG_UNUSED) { }
00083
00084 #ifdef DEFAULT_WARN_FUNCTION
00085 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
00086 #else
00087 static OnigWarnFunc onig_warn = onig_null_warn;
00088 #endif
00089
00090 #ifdef DEFAULT_VERB_WARN_FUNCTION
00091 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
00092 #else
00093 static OnigWarnFunc onig_verb_warn = onig_null_warn;
00094 #endif
00095
00096 extern void onig_set_warn_func(OnigWarnFunc f)
00097 {
00098 onig_warn = f;
00099 }
00100
00101 extern void onig_set_verb_warn_func(OnigWarnFunc f)
00102 {
00103 onig_verb_warn = f;
00104 }
00105
00106 static void CC_DUP_WARN(ScanEnv *env);
00107
00108 static void
00109 bbuf_free(BBuf* bbuf)
00110 {
00111 if (IS_NOT_NULL(bbuf)) {
00112 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
00113 xfree(bbuf);
00114 }
00115 }
00116
00117 static int
00118 bbuf_clone(BBuf** rto, BBuf* from)
00119 {
00120 int r;
00121 BBuf *to;
00122
00123 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
00124 CHECK_NULL_RETURN_MEMERR(to);
00125 r = BBUF_INIT(to, from->alloc);
00126 if (r != 0) return r;
00127 to->used = from->used;
00128 xmemcpy(to->p, from->p, from->used);
00129 return 0;
00130 }
00131
/* Turn a relative back-reference number into an absolute group number,
 * counted from one past the number of groups seen so far (env->num_mem). */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  (((env)->num_mem + 1) + (rel_no))
00134
/* Clear the bits `f` in `v` when `negative` is non-zero, otherwise set them.
 * The whole expansion is parenthesised so the macro stays a single operand
 * when used inside a larger expression (e.g. `x + ONOFF(...)`). */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
00136
/* First code point of the "multi-byte" range for `enc`: 0 when the
 * encoding's minimum character length exceeds one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )((ONIGENC_MBC_MINLEN(enc) <= 1) ? 0x80 : 0)

/* Add [MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT] to the range buffer.
 * NOTE: expands to a use of a variable named `env` from the calling scope. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)

/* For encodings that are not single-byte, add the whole multi-byte range to
 * `mbuf`.  NOTE: assigns to a variable `r` of the calling scope and makes
 * the calling function return `r` when it is non-zero. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r != 0) return r;\
  }\
} while (0)
00149
00150
/* Set bit `pos` in bit set `bs`; if the bit was already set, call
 * CC_DUP_WARN first.  NOTE: uses a variable `env` from the calling scope. */
#define BITSET_SET_BIT_CHKDUP(bs, pos) do { \
  if (BITSET_AT(bs, pos)) { \
    CC_DUP_WARN(env); \
  } \
  BS_ROOM(bs, pos) |= BS_BIT(pos); \
} while (0)
00155
/* Store 1 in `empty` when all BITSET_SIZE words of `bs` are zero, else 0. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  for (i = 0; i < BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) break;\
  }\
  empty = (i < BITSET_SIZE) ? 0 : 1;\
} while (0)
00165
00166 static void
00167 bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to)
00168 {
00169 int i;
00170 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
00171 BITSET_SET_BIT_CHKDUP(bs, i);
00172 }
00173 }
00174
#if 0
/* Disabled code: set every bit of `bs`. */
static void
bitset_set_all(BitSetRef bs)
{
  int room;

  for (room = 0; room < BITSET_SIZE; room++)
    bs[room] = ~((Bits )0);
}
#endif
00183
00184 static void
00185 bitset_invert(BitSetRef bs)
00186 {
00187 int i;
00188 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
00189 }
00190
00191 static void
00192 bitset_invert_to(BitSetRef from, BitSetRef to)
00193 {
00194 int i;
00195 for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); }
00196 }
00197
00198 static void
00199 bitset_and(BitSetRef dest, BitSetRef bs)
00200 {
00201 int i;
00202 for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; }
00203 }
00204
00205 static void
00206 bitset_or(BitSetRef dest, BitSetRef bs)
00207 {
00208 int i;
00209 for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; }
00210 }
00211
00212 static void
00213 bitset_copy(BitSetRef dest, BitSetRef bs)
00214 {
00215 int i;
00216 for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; }
00217 }
00218
00219 extern int
00220 onig_strncmp(const UChar* s1, const UChar* s2, int n)
00221 {
00222 int x;
00223
00224 while (n-- > 0) {
00225 x = *s2++ - *s1++;
00226 if (x) return x;
00227 }
00228 return 0;
00229 }
00230
00231 extern void
00232 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
00233 {
00234 ptrdiff_t len = end - src;
00235 if (len > 0) {
00236 xmemcpy(dest, src, len);
00237 dest[len] = (UChar )0;
00238 }
00239 }
00240
00241 #ifdef USE_NAMED_GROUP
00242 static UChar*
00243 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
00244 {
00245 ptrdiff_t slen;
00246 int term_len, i;
00247 UChar *r;
00248
00249 slen = end - s;
00250 term_len = ONIGENC_MBC_MINLEN(enc);
00251
00252 r = (UChar* )xmalloc(slen + term_len);
00253 CHECK_NULL_RETURN(r);
00254 xmemcpy(r, s, slen);
00255
00256 for (i = 0; i < term_len; i++)
00257 r[slen + i] = (UChar )0;
00258
00259 return r;
00260 }
00261 #endif
00262
00263
/* Value PPEEK yields when the cursor has reached the end of input. */
#define PEND_VALUE   0

/*
 * Cursor helpers.  They all operate on variables of the calling scope:
 * `p` (cursor), `end` (end of input), `enc` (encoding) and `pfetch_prev`
 * (position before the most recent PINC/PFETCH).
 *
 * PFETCH_READY declares `pfetch_prev`.  The GCC variant also initialises
 * it and references it once through a (void) cast.
 */
#ifndef __GNUC__
#define PFETCH_READY     UChar* pfetch_prev
#else

#define PFETCH_READY     UChar* pfetch_prev = NULL; (void)pfetch_prev
#endif

/* 1 when the cursor is at or past `end`, otherwise 0. */
#define PEND         (p < end ?  0 : 1)

/* Step back to the position saved by the last PINC/PFETCH. */
#define PUNFETCH     p = pfetch_prev

/* Save the cursor, then advance it by the length of the current character. */
#define PINC       do { \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)

/* Read the current character into `c` (a plain byte load when the encoding's
 * max_enc_len is 1), then advance exactly as PINC does. */
#define PFETCH(c)  do { \
  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
  PINC; \
} while (0)

/* Decode the current character without advancing; PEND_VALUE at the end. */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
00286
00287 static UChar*
00288 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
00289 size_t capa)
00290 {
00291 UChar* r;
00292
00293 if (dest)
00294 r = (UChar* )xrealloc(dest, capa + 1);
00295 else
00296 r = (UChar* )xmalloc(capa + 1);
00297
00298 CHECK_NULL_RETURN(r);
00299 onig_strcpy(r + (dest_end - dest), src, src_end);
00300 return r;
00301 }
00302
00303
00304 static UChar*
00305 strcat_capa_from_static(UChar* dest, UChar* dest_end,
00306 const UChar* src, const UChar* src_end, size_t capa)
00307 {
00308 UChar* r;
00309
00310 r = (UChar* )xmalloc(capa + 1);
00311 CHECK_NULL_RETURN(r);
00312 onig_strcpy(r, dest, dest_end);
00313 onig_strcpy(r + (dest_end - dest), src, src_end);
00314 return r;
00315 }
00316
00317
00318 #ifdef USE_ST_LIBRARY
00319
00320 #include "ruby/st.h"
00321
00322 typedef struct {
00323 const UChar* s;
00324 const UChar* end;
00325 } st_str_end_key;
00326
00327 static int
00328 str_end_cmp(st_data_t xp, st_data_t yp)
00329 {
00330 const st_str_end_key *x, *y;
00331 const UChar *p, *q;
00332 int c;
00333
00334 x = (const st_str_end_key *)xp;
00335 y = (const st_str_end_key *)yp;
00336 if ((x->end - x->s) != (y->end - y->s))
00337 return 1;
00338
00339 p = x->s;
00340 q = y->s;
00341 while (p < x->end) {
00342 c = (int )*p - (int )*q;
00343 if (c != 0) return c;
00344
00345 p++; q++;
00346 }
00347
00348 return 0;
00349 }
00350
00351 static st_index_t
00352 str_end_hash(st_data_t xp)
00353 {
00354 const st_str_end_key *x = (const st_str_end_key *)xp;
00355 const UChar *p;
00356 st_index_t val = 0;
00357
00358 p = x->s;
00359 while (p < x->end) {
00360 val = val * 997 + (int )*p++;
00361 }
00362
00363 return val + (val >> 5);
00364 }
00365
00366 extern hash_table_type*
00367 onig_st_init_strend_table_with_size(st_index_t size)
00368 {
00369 static const struct st_hash_type hashType = {
00370 str_end_cmp,
00371 str_end_hash,
00372 };
00373
00374 return (hash_table_type* )
00375 onig_st_init_table_with_size(&hashType, size);
00376 }
00377
00378 extern int
00379 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
00380 const UChar* end_key, hash_data_type *value)
00381 {
00382 st_str_end_key key;
00383
00384 key.s = (UChar* )str_key;
00385 key.end = (UChar* )end_key;
00386
00387 return onig_st_lookup(table, (st_data_t )(&key), value);
00388 }
00389
00390 extern int
00391 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
00392 const UChar* end_key, hash_data_type value)
00393 {
00394 st_str_end_key* key;
00395 int result;
00396
00397 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
00398 key->s = (UChar* )str_key;
00399 key->end = (UChar* )end_key;
00400 result = onig_st_insert(table, (st_data_t )key, value);
00401 if (result) {
00402 xfree(key);
00403 }
00404 return result;
00405 }
00406
00407 #endif
00408
00409
00410 #ifdef USE_NAMED_GROUP
00411
00412 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
00413
00414 typedef struct {
00415 UChar* name;
00416 size_t name_len;
00417 int back_num;
00418 int back_alloc;
00419 int back_ref1;
00420 int* back_refs;
00421 } NameEntry;
00422
00423 #ifdef USE_ST_LIBRARY
00424
00425 typedef st_table NameTable;
00426 typedef st_data_t HashDataType;
00427
00428 #ifdef ONIG_DEBUG
00429 static int
00430 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
00431 {
00432 int i;
00433 FILE* fp = (FILE* )arg;
00434
00435 fprintf(fp, "%s: ", e->name);
00436 if (e->back_num == 0)
00437 fputs("-", fp);
00438 else if (e->back_num == 1)
00439 fprintf(fp, "%d", e->back_ref1);
00440 else {
00441 for (i = 0; i < e->back_num; i++) {
00442 if (i > 0) fprintf(fp, ", ");
00443 fprintf(fp, "%d", e->back_refs[i]);
00444 }
00445 }
00446 fputs("\n", fp);
00447 return ST_CONTINUE;
00448 }
00449
00450 extern int
00451 onig_print_names(FILE* fp, regex_t* reg)
00452 {
00453 NameTable* t = (NameTable* )reg->name_table;
00454
00455 if (IS_NOT_NULL(t)) {
00456 fprintf(fp, "name table\n");
00457 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
00458 fputs("\n", fp);
00459 }
00460 return 0;
00461 }
00462 #endif
00463
00464 static int
00465 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
00466 {
00467 xfree(e->name);
00468 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
00469 xfree(key);
00470 xfree(e);
00471 return ST_DELETE;
00472 }
00473
00474 static int
00475 names_clear(regex_t* reg)
00476 {
00477 NameTable* t = (NameTable* )reg->name_table;
00478
00479 if (IS_NOT_NULL(t)) {
00480 onig_st_foreach(t, i_free_name_entry, 0);
00481 }
00482 return 0;
00483 }
00484
00485 extern int
00486 onig_names_free(regex_t* reg)
00487 {
00488 int r;
00489 NameTable* t;
00490
00491 r = names_clear(reg);
00492 if (r) return r;
00493
00494 t = (NameTable* )reg->name_table;
00495 if (IS_NOT_NULL(t)) onig_st_free_table(t);
00496 reg->name_table = (void* )NULL;
00497 return 0;
00498 }
00499
00500 static NameEntry*
00501 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
00502 {
00503 NameEntry* e;
00504 NameTable* t = (NameTable* )reg->name_table;
00505
00506 e = (NameEntry* )NULL;
00507 if (IS_NOT_NULL(t)) {
00508 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
00509 }
00510 return e;
00511 }
00512
00513 typedef struct {
00514 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
00515 regex_t* reg;
00516 void* arg;
00517 int ret;
00518 OnigEncoding enc;
00519 } INamesArg;
00520
00521 static int
00522 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
00523 {
00524 int r = (*(arg->func))(e->name,
00525 e->name + e->name_len,
00526 e->back_num,
00527 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
00528 arg->reg, arg->arg);
00529 if (r != 0) {
00530 arg->ret = r;
00531 return ST_STOP;
00532 }
00533 return ST_CONTINUE;
00534 }
00535
00536 extern int
00537 onig_foreach_name(regex_t* reg,
00538 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00539 {
00540 INamesArg narg;
00541 NameTable* t = (NameTable* )reg->name_table;
00542
00543 narg.ret = 0;
00544 if (IS_NOT_NULL(t)) {
00545 narg.func = func;
00546 narg.reg = reg;
00547 narg.arg = arg;
00548 narg.enc = reg->enc;
00549 onig_st_foreach(t, i_names, (HashDataType )&narg);
00550 }
00551 return narg.ret;
00552 }
00553
00554 static int
00555 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
00556 {
00557 int i;
00558
00559 if (e->back_num > 1) {
00560 for (i = 0; i < e->back_num; i++) {
00561 e->back_refs[i] = map[e->back_refs[i]].new_val;
00562 }
00563 }
00564 else if (e->back_num == 1) {
00565 e->back_ref1 = map[e->back_ref1].new_val;
00566 }
00567
00568 return ST_CONTINUE;
00569 }
00570
00571 extern int
00572 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
00573 {
00574 NameTable* t = (NameTable* )reg->name_table;
00575
00576 if (IS_NOT_NULL(t)) {
00577 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
00578 }
00579 return 0;
00580 }
00581
00582
00583 extern int
00584 onig_number_of_names(regex_t* reg)
00585 {
00586 NameTable* t = (NameTable* )reg->name_table;
00587
00588 if (IS_NOT_NULL(t))
00589 return (int )t->num_entries;
00590 else
00591 return 0;
00592 }
00593
00594 #else
00595
00596 #define INIT_NAMES_ALLOC_NUM 8
00597
00598 typedef struct {
00599 NameEntry* e;
00600 int num;
00601 int alloc;
00602 } NameTable;
00603
00604 #ifdef ONIG_DEBUG
00605 extern int
00606 onig_print_names(FILE* fp, regex_t* reg)
00607 {
00608 int i, j;
00609 NameEntry* e;
00610 NameTable* t = (NameTable* )reg->name_table;
00611
00612 if (IS_NOT_NULL(t) && t->num > 0) {
00613 fprintf(fp, "name table\n");
00614 for (i = 0; i < t->num; i++) {
00615 e = &(t->e[i]);
00616 fprintf(fp, "%s: ", e->name);
00617 if (e->back_num == 0) {
00618 fputs("-", fp);
00619 }
00620 else if (e->back_num == 1) {
00621 fprintf(fp, "%d", e->back_ref1);
00622 }
00623 else {
00624 for (j = 0; j < e->back_num; j++) {
00625 if (j > 0) fprintf(fp, ", ");
00626 fprintf(fp, "%d", e->back_refs[j]);
00627 }
00628 }
00629 fputs("\n", fp);
00630 }
00631 fputs("\n", fp);
00632 }
00633 return 0;
00634 }
00635 #endif
00636
00637 static int
00638 names_clear(regex_t* reg)
00639 {
00640 int i;
00641 NameEntry* e;
00642 NameTable* t = (NameTable* )reg->name_table;
00643
00644 if (IS_NOT_NULL(t)) {
00645 for (i = 0; i < t->num; i++) {
00646 e = &(t->e[i]);
00647 if (IS_NOT_NULL(e->name)) {
00648 xfree(e->name);
00649 e->name = NULL;
00650 e->name_len = 0;
00651 e->back_num = 0;
00652 e->back_alloc = 0;
00653 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
00654 e->back_refs = (int* )NULL;
00655 }
00656 }
00657 if (IS_NOT_NULL(t->e)) {
00658 xfree(t->e);
00659 t->e = NULL;
00660 }
00661 t->num = 0;
00662 }
00663 return 0;
00664 }
00665
00666 extern int
00667 onig_names_free(regex_t* reg)
00668 {
00669 int r;
00670 NameTable* t;
00671
00672 r = names_clear(reg);
00673 if (r) return r;
00674
00675 t = (NameTable* )reg->name_table;
00676 if (IS_NOT_NULL(t)) xfree(t);
00677 reg->name_table = NULL;
00678 return 0;
00679 }
00680
00681 static NameEntry*
00682 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
00683 {
00684 int i, len;
00685 NameEntry* e;
00686 NameTable* t = (NameTable* )reg->name_table;
00687
00688 if (IS_NOT_NULL(t)) {
00689 len = name_end - name;
00690 for (i = 0; i < t->num; i++) {
00691 e = &(t->e[i]);
00692 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
00693 return e;
00694 }
00695 }
00696 return (NameEntry* )NULL;
00697 }
00698
00699 extern int
00700 onig_foreach_name(regex_t* reg,
00701 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00702 {
00703 int i, r;
00704 NameEntry* e;
00705 NameTable* t = (NameTable* )reg->name_table;
00706
00707 if (IS_NOT_NULL(t)) {
00708 for (i = 0; i < t->num; i++) {
00709 e = &(t->e[i]);
00710 r = (*func)(e->name, e->name + e->name_len, e->back_num,
00711 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
00712 reg, arg);
00713 if (r != 0) return r;
00714 }
00715 }
00716 return 0;
00717 }
00718
00719 extern int
00720 onig_number_of_names(regex_t* reg)
00721 {
00722 NameTable* t = (NameTable* )reg->name_table;
00723
00724 if (IS_NOT_NULL(t))
00725 return t->num;
00726 else
00727 return 0;
00728 }
00729
00730 #endif
00731
00732 static int
00733 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
00734 {
00735 int alloc;
00736 NameEntry* e;
00737 NameTable* t = (NameTable* )reg->name_table;
00738
00739 if (name_end - name <= 0)
00740 return ONIGERR_EMPTY_GROUP_NAME;
00741
00742 e = name_find(reg, name, name_end);
00743 if (IS_NULL(e)) {
00744 #ifdef USE_ST_LIBRARY
00745 if (IS_NULL(t)) {
00746 t = onig_st_init_strend_table_with_size(5);
00747 reg->name_table = (void* )t;
00748 }
00749 e = (NameEntry* )xmalloc(sizeof(NameEntry));
00750 CHECK_NULL_RETURN_MEMERR(e);
00751
00752 e->name = strdup_with_null(reg->enc, name, name_end);
00753 if (IS_NULL(e->name)) {
00754 xfree(e);
00755 return ONIGERR_MEMORY;
00756 }
00757 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
00758 (HashDataType )e);
00759
00760 e->name_len = name_end - name;
00761 e->back_num = 0;
00762 e->back_alloc = 0;
00763 e->back_refs = (int* )NULL;
00764
00765 #else
00766
00767 if (IS_NULL(t)) {
00768 alloc = INIT_NAMES_ALLOC_NUM;
00769 t = (NameTable* )xmalloc(sizeof(NameTable));
00770 CHECK_NULL_RETURN_MEMERR(t);
00771 t->e = NULL;
00772 t->alloc = 0;
00773 t->num = 0;
00774
00775 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
00776 if (IS_NULL(t->e)) {
00777 xfree(t);
00778 return ONIGERR_MEMORY;
00779 }
00780 t->alloc = alloc;
00781 reg->name_table = t;
00782 goto clear;
00783 }
00784 else if (t->num == t->alloc) {
00785 int i;
00786 NameEntry* p;
00787
00788 alloc = t->alloc * 2;
00789 p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
00790 CHECK_NULL_RETURN_MEMERR(p);
00791 t->e = p;
00792 t->alloc = alloc;
00793
00794 clear:
00795 for (i = t->num; i < t->alloc; i++) {
00796 t->e[i].name = NULL;
00797 t->e[i].name_len = 0;
00798 t->e[i].back_num = 0;
00799 t->e[i].back_alloc = 0;
00800 t->e[i].back_refs = (int* )NULL;
00801 }
00802 }
00803 e = &(t->e[t->num]);
00804 t->num++;
00805 e->name = strdup_with_null(reg->enc, name, name_end);
00806 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
00807 e->name_len = name_end - name;
00808 #endif
00809 }
00810
00811 if (e->back_num >= 1 &&
00812 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
00813 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
00814 name, name_end);
00815 return ONIGERR_MULTIPLEX_DEFINED_NAME;
00816 }
00817
00818 e->back_num++;
00819 if (e->back_num == 1) {
00820 e->back_ref1 = backref;
00821 }
00822 else {
00823 if (e->back_num == 2) {
00824 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
00825 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
00826 CHECK_NULL_RETURN_MEMERR(e->back_refs);
00827 e->back_alloc = alloc;
00828 e->back_refs[0] = e->back_ref1;
00829 e->back_refs[1] = backref;
00830 }
00831 else {
00832 if (e->back_num > e->back_alloc) {
00833 int* p;
00834 alloc = e->back_alloc * 2;
00835 p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
00836 CHECK_NULL_RETURN_MEMERR(p);
00837 e->back_refs = p;
00838 e->back_alloc = alloc;
00839 }
00840 e->back_refs[e->back_num - 1] = backref;
00841 }
00842 }
00843
00844 return 0;
00845 }
00846
00847 extern int
00848 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
00849 const UChar* name_end, int** nums)
00850 {
00851 NameEntry* e = name_find(reg, name, name_end);
00852
00853 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
00854
00855 switch (e->back_num) {
00856 case 0:
00857 *nums = 0;
00858 break;
00859 case 1:
00860 *nums = &(e->back_ref1);
00861 break;
00862 default:
00863 *nums = e->back_refs;
00864 break;
00865 }
00866 return e->back_num;
00867 }
00868
00869 extern int
00870 onig_name_to_backref_number(regex_t* reg, const UChar* name,
00871 const UChar* name_end, OnigRegion *region)
00872 {
00873 int i, n, *nums;
00874
00875 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
00876 if (n < 0)
00877 return n;
00878 else if (n == 0)
00879 return ONIGERR_PARSER_BUG;
00880 else if (n == 1)
00881 return nums[0];
00882 else {
00883 if (IS_NOT_NULL(region)) {
00884 for (i = n - 1; i >= 0; i--) {
00885 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
00886 return nums[i];
00887 }
00888 }
00889 return nums[n - 1];
00890 }
00891 }
00892
00893 #else
00894
00895 extern int
00896 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
00897 const UChar* name_end, int** nums)
00898 {
00899 return ONIG_NO_SUPPORT_CONFIG;
00900 }
00901
00902 extern int
00903 onig_name_to_backref_number(regex_t* reg, const UChar* name,
00904 const UChar* name_end, OnigRegion* region)
00905 {
00906 return ONIG_NO_SUPPORT_CONFIG;
00907 }
00908
00909 extern int
00910 onig_foreach_name(regex_t* reg,
00911 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00912 {
00913 return ONIG_NO_SUPPORT_CONFIG;
00914 }
00915
00916 extern int
00917 onig_number_of_names(regex_t* reg)
00918 {
00919 return 0;
00920 }
00921 #endif
00922
00923 extern int
00924 onig_noname_group_capture_is_active(regex_t* reg)
00925 {
00926 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
00927 return 0;
00928
00929 #ifdef USE_NAMED_GROUP
00930 if (onig_number_of_names(reg) > 0 &&
00931 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
00932 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
00933 return 0;
00934 }
00935 #endif
00936
00937 return 1;
00938 }
00939
00940
/* Initial capacity, in Node* slots, of a ScanEnv's heap-allocated group
 * node array (see scan_env_add_mem_entry). */
#define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
00942
00943 static void
00944 scan_env_clear(ScanEnv* env)
00945 {
00946 int i;
00947
00948 BIT_STATUS_CLEAR(env->capture_history);
00949 BIT_STATUS_CLEAR(env->bt_mem_start);
00950 BIT_STATUS_CLEAR(env->bt_mem_end);
00951 BIT_STATUS_CLEAR(env->backrefed_mem);
00952 env->error = (UChar* )NULL;
00953 env->error_end = (UChar* )NULL;
00954 env->num_call = 0;
00955 env->num_mem = 0;
00956 #ifdef USE_NAMED_GROUP
00957 env->num_named = 0;
00958 #endif
00959 env->mem_alloc = 0;
00960 env->mem_nodes_dynamic = (Node** )NULL;
00961
00962 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
00963 env->mem_nodes_static[i] = NULL_NODE;
00964
00965 #ifdef USE_COMBINATION_EXPLOSION_CHECK
00966 env->num_comb_exp_check = 0;
00967 env->comb_exp_max_regnum = 0;
00968 env->curr_max_regnum = 0;
00969 env->has_recursion = 0;
00970 #endif
00971 env->warnings_flag = 0;
00972 }
00973
00974 static int
00975 scan_env_add_mem_entry(ScanEnv* env)
00976 {
00977 int i, need, alloc;
00978 Node** p;
00979
00980 need = env->num_mem + 1;
00981 if (need > ONIG_MAX_CAPTURE_GROUP_NUM)
00982 return ONIGERR_TOO_MANY_CAPTURE_GROUPS;
00983 if (need >= SCANENV_MEMNODES_SIZE) {
00984 if (env->mem_alloc <= need) {
00985 if (IS_NULL(env->mem_nodes_dynamic)) {
00986 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
00987 p = (Node** )xmalloc(sizeof(Node*) * alloc);
00988 xmemcpy(p, env->mem_nodes_static,
00989 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
00990 }
00991 else {
00992 alloc = env->mem_alloc * 2;
00993 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
00994 }
00995 CHECK_NULL_RETURN_MEMERR(p);
00996
00997 for (i = env->num_mem + 1; i < alloc; i++)
00998 p[i] = NULL_NODE;
00999
01000 env->mem_nodes_dynamic = p;
01001 env->mem_alloc = alloc;
01002 }
01003 }
01004
01005 env->num_mem++;
01006 return env->num_mem;
01007 }
01008
01009 static int
01010 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
01011 {
01012 if (env->num_mem >= num)
01013 SCANENV_MEM_NODES(env)[num] = node;
01014 else
01015 return ONIGERR_PARSER_BUG;
01016 return 0;
01017 }
01018
01019
#ifdef USE_PARSE_TREE_NODE_RECYCLE
/* Link laid over the memory of a released Node so that it can sit on the
 * recycle list. */
typedef struct _FreeNode {
  struct _FreeNode* next;
} FreeNode;

/* Head of the singly linked list of released nodes awaiting reuse. */
static FreeNode* FreeNodeList = (FreeNode* )NULL;
#endif
01027
01028 extern void
01029 onig_node_free(Node* node)
01030 {
01031 start:
01032 if (IS_NULL(node)) return ;
01033
01034 switch (NTYPE(node)) {
01035 case NT_STR:
01036 if (NSTR(node)->capa != 0 &&
01037 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
01038 xfree(NSTR(node)->s);
01039 }
01040 break;
01041
01042 case NT_LIST:
01043 case NT_ALT:
01044 onig_node_free(NCAR(node));
01045 {
01046 Node* next_node = NCDR(node);
01047
01048 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01049 {
01050 FreeNode* n = (FreeNode* )node;
01051
01052 THREAD_ATOMIC_START;
01053 n->next = FreeNodeList;
01054 FreeNodeList = n;
01055 THREAD_ATOMIC_END;
01056 }
01057 #else
01058 xfree(node);
01059 #endif
01060 node = next_node;
01061 goto start;
01062 }
01063 break;
01064
01065 case NT_CCLASS:
01066 {
01067 CClassNode* cc = NCCLASS(node);
01068
01069 if (IS_NCCLASS_SHARE(cc)) return ;
01070 if (cc->mbuf)
01071 bbuf_free(cc->mbuf);
01072 }
01073 break;
01074
01075 case NT_QTFR:
01076 if (NQTFR(node)->target)
01077 onig_node_free(NQTFR(node)->target);
01078 break;
01079
01080 case NT_ENCLOSE:
01081 if (NENCLOSE(node)->target)
01082 onig_node_free(NENCLOSE(node)->target);
01083 break;
01084
01085 case NT_BREF:
01086 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
01087 xfree(NBREF(node)->back_dynamic);
01088 break;
01089
01090 case NT_ANCHOR:
01091 if (NANCHOR(node)->target)
01092 onig_node_free(NANCHOR(node)->target);
01093 break;
01094 }
01095
01096 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01097 {
01098 FreeNode* n = (FreeNode* )node;
01099
01100 THREAD_ATOMIC_START;
01101 n->next = FreeNodeList;
01102 FreeNodeList = n;
01103 THREAD_ATOMIC_END;
01104 }
01105 #else
01106 xfree(node);
01107 #endif
01108 }
01109
#ifdef USE_PARSE_TREE_NODE_RECYCLE
/* Free every node parked on the recycle list, leaving the list empty.
 * Always returns 0.  No THREAD_ATOMIC_* guard is used in this function. */
extern int
onig_free_node_list(void)
{
  FreeNode* head;

  for (head = FreeNodeList; IS_NOT_NULL(head); head = FreeNodeList) {
    FreeNodeList = head->next;
    xfree(head);
  }

  return 0;
}
#endif
01126
01127 static Node*
01128 node_new(void)
01129 {
01130 Node* node;
01131
01132 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01133 THREAD_ATOMIC_START;
01134 if (IS_NOT_NULL(FreeNodeList)) {
01135 node = (Node* )FreeNodeList;
01136 FreeNodeList = FreeNodeList->next;
01137 THREAD_ATOMIC_END;
01138 return node;
01139 }
01140 THREAD_ATOMIC_END;
01141 #endif
01142
01143 node = (Node* )xmalloc(sizeof(Node));
01144
01145 return node;
01146 }
01147
01148
01149 static void
01150 initialize_cclass(CClassNode* cc)
01151 {
01152 BITSET_CLEAR(cc->bs);
01153
01154 cc->flags = 0;
01155 cc->mbuf = NULL;
01156 }
01157
01158 static Node*
01159 node_new_cclass(void)
01160 {
01161 Node* node = node_new();
01162 CHECK_NULL_RETURN(node);
01163
01164 SET_NTYPE(node, NT_CCLASS);
01165 initialize_cclass(NCCLASS(node));
01166 return node;
01167 }
01168
01169 static Node*
01170 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
01171 const OnigCodePoint ranges[])
01172 {
01173 int n, i;
01174 CClassNode* cc;
01175 OnigCodePoint j;
01176
01177 Node* node = node_new_cclass();
01178 CHECK_NULL_RETURN(node);
01179
01180 cc = NCCLASS(node);
01181 if (not != 0) NCCLASS_SET_NOT(cc);
01182
01183 BITSET_CLEAR(cc->bs);
01184 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
01185 n = ONIGENC_CODE_RANGE_NUM(ranges);
01186 for (i = 0; i < n; i++) {
01187 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
01188 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
01189 if (j >= sb_out) goto sb_end;
01190
01191 BITSET_SET_BIT(cc->bs, j);
01192 }
01193 }
01194 }
01195
01196 sb_end:
01197 if (IS_NULL(ranges)) {
01198 is_null:
01199 cc->mbuf = NULL;
01200 }
01201 else {
01202 BBuf* bbuf;
01203
01204 n = ONIGENC_CODE_RANGE_NUM(ranges);
01205 if (n == 0) goto is_null;
01206
01207 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
01208 CHECK_NULL_RETURN(bbuf);
01209 bbuf->alloc = n + 1;
01210 bbuf->used = n + 1;
01211 bbuf->p = (UChar* )((void* )ranges);
01212
01213 cc->mbuf = bbuf;
01214 }
01215
01216 return node;
01217 }
01218
01219 static Node*
01220 node_new_ctype(int type, int not, int ascii_range)
01221 {
01222 Node* node = node_new();
01223 CHECK_NULL_RETURN(node);
01224
01225 SET_NTYPE(node, NT_CTYPE);
01226 NCTYPE(node)->ctype = type;
01227 NCTYPE(node)->not = not;
01228 NCTYPE(node)->ascii_range = ascii_range;
01229 return node;
01230 }
01231
01232 static Node*
01233 node_new_anychar(void)
01234 {
01235 Node* node = node_new();
01236 CHECK_NULL_RETURN(node);
01237
01238 SET_NTYPE(node, NT_CANY);
01239 return node;
01240 }
01241
01242 static Node*
01243 node_new_list(Node* left, Node* right)
01244 {
01245 Node* node = node_new();
01246 CHECK_NULL_RETURN(node);
01247
01248 SET_NTYPE(node, NT_LIST);
01249 NCAR(node) = left;
01250 NCDR(node) = right;
01251 return node;
01252 }
01253
01254 extern Node*
01255 onig_node_new_list(Node* left, Node* right)
01256 {
01257 return node_new_list(left, right);
01258 }
01259
01260 extern Node*
01261 onig_node_list_add(Node* list, Node* x)
01262 {
01263 Node *n;
01264
01265 n = onig_node_new_list(x, NULL);
01266 if (IS_NULL(n)) return NULL_NODE;
01267
01268 if (IS_NOT_NULL(list)) {
01269 while (IS_NOT_NULL(NCDR(list)))
01270 list = NCDR(list);
01271
01272 NCDR(list) = n;
01273 }
01274
01275 return n;
01276 }
01277
01278 extern Node*
01279 onig_node_new_alt(Node* left, Node* right)
01280 {
01281 Node* node = node_new();
01282 CHECK_NULL_RETURN(node);
01283
01284 SET_NTYPE(node, NT_ALT);
01285 NCAR(node) = left;
01286 NCDR(node) = right;
01287 return node;
01288 }
01289
01290 extern Node*
01291 onig_node_new_anchor(int type)
01292 {
01293 Node* node = node_new();
01294 CHECK_NULL_RETURN(node);
01295
01296 SET_NTYPE(node, NT_ANCHOR);
01297 NANCHOR(node)->type = type;
01298 NANCHOR(node)->target = NULL;
01299 NANCHOR(node)->char_len = -1;
01300 NANCHOR(node)->ascii_range = 0;
01301 return node;
01302 }
01303
01304 static Node*
01305 node_new_backref(int back_num, int* backrefs, int by_name,
01306 #ifdef USE_BACKREF_WITH_LEVEL
01307 int exist_level, int nest_level,
01308 #endif
01309 ScanEnv* env)
01310 {
01311 int i;
01312 Node* node = node_new();
01313
01314 CHECK_NULL_RETURN(node);
01315
01316 SET_NTYPE(node, NT_BREF);
01317 NBREF(node)->state = 0;
01318 NBREF(node)->back_num = back_num;
01319 NBREF(node)->back_dynamic = (int* )NULL;
01320 if (by_name != 0)
01321 NBREF(node)->state |= NST_NAME_REF;
01322
01323 #ifdef USE_BACKREF_WITH_LEVEL
01324 if (exist_level != 0) {
01325 NBREF(node)->state |= NST_NEST_LEVEL;
01326 NBREF(node)->nest_level = nest_level;
01327 }
01328 #endif
01329
01330 for (i = 0; i < back_num; i++) {
01331 if (backrefs[i] <= env->num_mem &&
01332 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
01333 NBREF(node)->state |= NST_RECURSION;
01334 break;
01335 }
01336 }
01337
01338 if (back_num <= NODE_BACKREFS_SIZE) {
01339 for (i = 0; i < back_num; i++)
01340 NBREF(node)->back_static[i] = backrefs[i];
01341 }
01342 else {
01343 int* p = (int* )xmalloc(sizeof(int) * back_num);
01344 if (IS_NULL(p)) {
01345 onig_node_free(node);
01346 return NULL;
01347 }
01348 NBREF(node)->back_dynamic = p;
01349 for (i = 0; i < back_num; i++)
01350 p[i] = backrefs[i];
01351 }
01352 return node;
01353 }
01354
#ifdef USE_SUBEXP_CALL
/* Allocate an NT_CALL node for a subexpression call.
 * The name span [name, name_end) is stored by pointer (not copied), `gnum`
 * is stored as the group number, and the target starts out as NULL_NODE. */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  CHECK_NULL_RETURN(call);
  SET_NTYPE(call, NT_CALL);

  NCALL(call)->group_num = gnum;
  NCALL(call)->name      = name;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
01371
01372 static Node*
01373 node_new_quantifier(int lower, int upper, int by_number)
01374 {
01375 Node* node = node_new();
01376 CHECK_NULL_RETURN(node);
01377
01378 SET_NTYPE(node, NT_QTFR);
01379 NQTFR(node)->state = 0;
01380 NQTFR(node)->target = NULL;
01381 NQTFR(node)->lower = lower;
01382 NQTFR(node)->upper = upper;
01383 NQTFR(node)->greedy = 1;
01384 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
01385 NQTFR(node)->head_exact = NULL_NODE;
01386 NQTFR(node)->next_head_exact = NULL_NODE;
01387 NQTFR(node)->is_refered = 0;
01388 if (by_number != 0)
01389 NQTFR(node)->state |= NST_BY_NUMBER;
01390
01391 #ifdef USE_COMBINATION_EXPLOSION_CHECK
01392 NQTFR(node)->comb_exp_check_num = 0;
01393 #endif
01394
01395 return node;
01396 }
01397
01398 static Node*
01399 node_new_enclose(int type)
01400 {
01401 Node* node = node_new();
01402 CHECK_NULL_RETURN(node);
01403
01404 SET_NTYPE(node, NT_ENCLOSE);
01405 NENCLOSE(node)->type = type;
01406 NENCLOSE(node)->state = 0;
01407 NENCLOSE(node)->regnum = 0;
01408 NENCLOSE(node)->option = 0;
01409 NENCLOSE(node)->target = NULL;
01410 NENCLOSE(node)->call_addr = -1;
01411 NENCLOSE(node)->opt_count = 0;
01412 return node;
01413 }
01414
01415 extern Node*
01416 onig_node_new_enclose(int type)
01417 {
01418 return node_new_enclose(type);
01419 }
01420
01421 static Node*
01422 node_new_enclose_memory(OnigOptionType option, int is_named)
01423 {
01424 Node* node = node_new_enclose(ENCLOSE_MEMORY);
01425 CHECK_NULL_RETURN(node);
01426 if (is_named != 0)
01427 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
01428
01429 #ifdef USE_SUBEXP_CALL
01430 NENCLOSE(node)->option = option;
01431 #endif
01432 return node;
01433 }
01434
01435 static Node*
01436 node_new_option(OnigOptionType option)
01437 {
01438 Node* node = node_new_enclose(ENCLOSE_OPTION);
01439 CHECK_NULL_RETURN(node);
01440 NENCLOSE(node)->option = option;
01441 return node;
01442 }
01443
01444 extern int
01445 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
01446 {
01447 ptrdiff_t addlen = end - s;
01448
01449 if (addlen > 0) {
01450 ptrdiff_t len = NSTR(node)->end - NSTR(node)->s;
01451
01452 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
01453 UChar* p;
01454 ptrdiff_t capa = len + addlen + NODE_STR_MARGIN;
01455
01456 if (capa <= NSTR(node)->capa) {
01457 onig_strcpy(NSTR(node)->s + len, s, end);
01458 }
01459 else {
01460 if (NSTR(node)->s == NSTR(node)->buf)
01461 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
01462 s, end, capa);
01463 else
01464 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
01465
01466 CHECK_NULL_RETURN_MEMERR(p);
01467 NSTR(node)->s = p;
01468 NSTR(node)->capa = (int )capa;
01469 }
01470 }
01471 else {
01472 onig_strcpy(NSTR(node)->s + len, s, end);
01473 }
01474 NSTR(node)->end = NSTR(node)->s + len + addlen;
01475 }
01476
01477 return 0;
01478 }
01479
01480 extern int
01481 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
01482 {
01483 onig_node_str_clear(node);
01484 return onig_node_str_cat(node, s, end);
01485 }
01486
01487 static int
01488 node_str_cat_char(Node* node, UChar c)
01489 {
01490 UChar s[1];
01491
01492 s[0] = c;
01493 return onig_node_str_cat(node, s, s + 1);
01494 }
01495
01496 static int
01497 node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c)
01498 {
01499 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
01500 int num = ONIGENC_CODE_TO_MBC(enc, c, buf);
01501 if (num < 0) return num;
01502 return onig_node_str_cat(node, buf, buf + num);
01503 }
01504
01505 extern void
01506 onig_node_conv_to_str_node(Node* node, int flag)
01507 {
01508 SET_NTYPE(node, NT_STR);
01509 NSTR(node)->flag = flag;
01510 NSTR(node)->capa = 0;
01511 NSTR(node)->s = NSTR(node)->buf;
01512 NSTR(node)->end = NSTR(node)->buf;
01513 }
01514
01515 extern void
01516 onig_node_str_clear(Node* node)
01517 {
01518 if (NSTR(node)->capa != 0 &&
01519 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
01520 xfree(NSTR(node)->s);
01521 }
01522
01523 NSTR(node)->capa = 0;
01524 NSTR(node)->flag = 0;
01525 NSTR(node)->s = NSTR(node)->buf;
01526 NSTR(node)->end = NSTR(node)->buf;
01527 }
01528
01529 static Node*
01530 node_new_str(const UChar* s, const UChar* end)
01531 {
01532 Node* node = node_new();
01533 CHECK_NULL_RETURN(node);
01534
01535 SET_NTYPE(node, NT_STR);
01536 NSTR(node)->capa = 0;
01537 NSTR(node)->flag = 0;
01538 NSTR(node)->s = NSTR(node)->buf;
01539 NSTR(node)->end = NSTR(node)->buf;
01540 if (onig_node_str_cat(node, s, end)) {
01541 onig_node_free(node);
01542 return NULL;
01543 }
01544 return node;
01545 }
01546
01547 extern Node*
01548 onig_node_new_str(const UChar* s, const UChar* end)
01549 {
01550 return node_new_str(s, end);
01551 }
01552
01553 static Node*
01554 node_new_str_raw(UChar* s, UChar* end)
01555 {
01556 Node* node = node_new_str(s, end);
01557 if (IS_NOT_NULL(node))
01558 NSTRING_SET_RAW(node);
01559 return node;
01560 }
01561
01562 static Node*
01563 node_new_empty(void)
01564 {
01565 return node_new_str(NULL, NULL);
01566 }
01567
01568 static Node*
01569 node_new_str_raw_char(UChar c)
01570 {
01571 UChar p[1];
01572
01573 p[0] = c;
01574 return node_new_str_raw(p, p + 1);
01575 }
01576
01577 static Node*
01578 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
01579 {
01580 const UChar *p;
01581 Node* n = NULL_NODE;
01582
01583 if (sn->end > sn->s) {
01584 p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end);
01585 if (p && p > sn->s) {
01586 n = node_new_str(p, sn->end);
01587 if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0)
01588 NSTRING_SET_RAW(n);
01589 sn->end = (UChar* )p;
01590 }
01591 }
01592 return n;
01593 }
01594
01595 static int
01596 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
01597 {
01598 if (sn->end > sn->s) {
01599 return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0);
01600 }
01601 return 0;
01602 }
01603
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/* Insert `num` copies of `val` in front of the string held by `sn`.
 *
 * The current contents are copied to a scratch buffer, written back `num`
 * bytes further on, and the gap at the head is then filled with `val`.
 * NOTE(review): no bounds check is made here; this assumes the current
 * string fits in NODE_STR_BUF_SIZE bytes and that sn->s has room for
 * `num` more bytes -- confirm at the call sites.
 *
 * Always returns 0.  (The function is declared `int`; it previously fell
 * off the end without returning a value.) */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = (int )(sn->end - sn->s);
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
01621
01622 extern int
01623 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
01624 {
01625 unsigned int num, val;
01626 OnigCodePoint c;
01627 UChar* p = *src;
01628 PFETCH_READY;
01629
01630 num = 0;
01631 while (!PEND) {
01632 PFETCH(c);
01633 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
01634 val = (unsigned int )DIGITVAL(c);
01635 if ((INT_MAX_LIMIT - val) / 10UL < num)
01636 return -1;
01637
01638 num = num * 10 + val;
01639 }
01640 else {
01641 PUNFETCH;
01642 break;
01643 }
01644 }
01645 *src = p;
01646 return num;
01647 }
01648
01649 static int
01650 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen,
01651 int maxlen, OnigEncoding enc)
01652 {
01653 OnigCodePoint c;
01654 unsigned int num, val;
01655 int restlen;
01656 UChar* p = *src;
01657 PFETCH_READY;
01658
01659 restlen = maxlen - minlen;
01660 num = 0;
01661 while (!PEND && maxlen-- != 0) {
01662 PFETCH(c);
01663 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
01664 val = (unsigned int )XDIGITVAL(enc,c);
01665 if ((INT_MAX_LIMIT - val) / 16UL < num)
01666 return -1;
01667
01668 num = (num << 4) + XDIGITVAL(enc,c);
01669 }
01670 else {
01671 PUNFETCH;
01672 break;
01673 }
01674 }
01675 if (maxlen > restlen)
01676 return -2;
01677 *src = p;
01678 return num;
01679 }
01680
01681 static int
01682 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
01683 OnigEncoding enc)
01684 {
01685 OnigCodePoint c;
01686 unsigned int num, val;
01687 UChar* p = *src;
01688 PFETCH_READY;
01689
01690 num = 0;
01691 while (!PEND && maxlen-- != 0) {
01692 PFETCH(c);
01693 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
01694 val = ODIGITVAL(c);
01695 if ((INT_MAX_LIMIT - val) / 8UL < num)
01696 return -1;
01697
01698 num = (num << 3) + val;
01699 }
01700 else {
01701 PUNFETCH;
01702 break;
01703 }
01704 }
01705 *src = p;
01706 return num;
01707 }
01708
01709
01710 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
01711 BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
01712
01713
01714
01715
01716
01717 static int
01718 new_code_range(BBuf** pbuf)
01719 {
01720 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
01721 int r;
01722 OnigCodePoint n;
01723 BBuf* bbuf;
01724
01725 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
01726 CHECK_NULL_RETURN_MEMERR(*pbuf);
01727 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
01728 if (r) return r;
01729
01730 n = 0;
01731 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
01732 return 0;
01733 }
01734
01735 static int
01736 add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to,
01737 int checkdup)
01738 {
01739 int r, inc_n, pos;
01740 OnigCodePoint low, high, bound, x;
01741 OnigCodePoint n, *data;
01742 BBuf* bbuf;
01743
01744 if (from > to) {
01745 n = from; from = to; to = n;
01746 }
01747
01748 if (IS_NULL(*pbuf)) {
01749 r = new_code_range(pbuf);
01750 if (r) return r;
01751 bbuf = *pbuf;
01752 n = 0;
01753 }
01754 else {
01755 bbuf = *pbuf;
01756 GET_CODE_POINT(n, bbuf->p);
01757 }
01758 data = (OnigCodePoint* )(bbuf->p);
01759 data++;
01760
01761 bound = (from == 0) ? 0 : n;
01762 for (low = 0; low < bound; ) {
01763 x = (low + bound) >> 1;
01764 if (from - 1 > data[x*2 + 1])
01765 low = x + 1;
01766 else
01767 bound = x;
01768 }
01769
01770 high = (to == ONIG_LAST_CODE_POINT) ? n : low;
01771 for (bound = n; high < bound; ) {
01772 x = (high + bound) >> 1;
01773 if (to + 1 >= data[x*2])
01774 high = x + 1;
01775 else
01776 bound = x;
01777 }
01778
01779
01780
01781
01782 inc_n = low + 1 - high;
01783 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
01784 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
01785
01786 if (inc_n != 1) {
01787 if (checkdup && from <= data[low*2+1]
01788 && (data[low*2] <= from || data[low*2+1] <= to))
01789 CC_DUP_WARN(env);
01790 if (from > data[low*2])
01791 from = data[low*2];
01792 if (to < data[(high - 1)*2 + 1])
01793 to = data[(high - 1)*2 + 1];
01794 }
01795
01796 if (inc_n != 0) {
01797 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
01798 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
01799
01800 if (inc_n > 0) {
01801 if (high < n) {
01802 int size = (n - high) * 2 * SIZE_CODE_POINT;
01803 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
01804 }
01805 }
01806 else {
01807 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
01808 }
01809 }
01810
01811 pos = SIZE_CODE_POINT * (1 + low * 2);
01812 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
01813 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
01814 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
01815 n += inc_n;
01816 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
01817
01818 return 0;
01819 }
01820
01821 static int
01822 add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
01823 {
01824 return add_code_range_to_buf0(pbuf, env, from, to, 1);
01825 }
01826
01827 static int
01828 add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup)
01829 {
01830 if (from > to) {
01831 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
01832 return 0;
01833 else
01834 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
01835 }
01836
01837 return add_code_range_to_buf0(pbuf, env, from, to, checkdup);
01838 }
01839
01840 static int
01841 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
01842 {
01843 return add_code_range0(pbuf, env, from, to, 1);
01844 }
01845
01846 static int
01847 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env)
01848 {
01849 int r, i, n;
01850 OnigCodePoint pre, from, *data, to = 0;
01851
01852 *pbuf = (BBuf* )NULL;
01853 if (IS_NULL(bbuf)) {
01854 set_all:
01855 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01856 }
01857
01858 data = (OnigCodePoint* )(bbuf->p);
01859 GET_CODE_POINT(n, data);
01860 data++;
01861 if (n <= 0) goto set_all;
01862
01863 r = 0;
01864 pre = MBCODE_START_POS(enc);
01865 for (i = 0; i < n; i++) {
01866 from = data[i*2];
01867 to = data[i*2+1];
01868 if (pre <= from - 1) {
01869 r = add_code_range_to_buf(pbuf, env, pre, from - 1);
01870 if (r != 0) return r;
01871 }
01872 if (to == ONIG_LAST_CODE_POINT) break;
01873 pre = to + 1;
01874 }
01875 if (to < ONIG_LAST_CODE_POINT) {
01876 r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT);
01877 }
01878 return r;
01879 }
01880
01881 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
01882 BBuf *tbuf; \
01883 int tnot; \
01884 tnot = not1; not1 = not2; not2 = tnot; \
01885 tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
01886 } while (0)
01887
01888 static int
01889 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
01890 BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
01891 {
01892 int r;
01893 OnigCodePoint i, n1, *data1;
01894 OnigCodePoint from, to;
01895
01896 *pbuf = (BBuf* )NULL;
01897 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
01898 if (not1 != 0 || not2 != 0)
01899 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01900 return 0;
01901 }
01902
01903 r = 0;
01904 if (IS_NULL(bbuf2))
01905 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
01906
01907 if (IS_NULL(bbuf1)) {
01908 if (not1 != 0) {
01909 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01910 }
01911 else {
01912 if (not2 == 0) {
01913 return bbuf_clone(pbuf, bbuf2);
01914 }
01915 else {
01916 return not_code_range_buf(enc, bbuf2, pbuf, env);
01917 }
01918 }
01919 }
01920
01921 if (not1 != 0)
01922 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
01923
01924 data1 = (OnigCodePoint* )(bbuf1->p);
01925 GET_CODE_POINT(n1, data1);
01926 data1++;
01927
01928 if (not2 == 0 && not1 == 0) {
01929 r = bbuf_clone(pbuf, bbuf2);
01930 }
01931 else if (not1 == 0) {
01932 r = not_code_range_buf(enc, bbuf2, pbuf, env);
01933 }
01934 if (r != 0) return r;
01935
01936 for (i = 0; i < n1; i++) {
01937 from = data1[i*2];
01938 to = data1[i*2+1];
01939 r = add_code_range_to_buf(pbuf, env, from, to);
01940 if (r != 0) return r;
01941 }
01942 return 0;
01943 }
01944
01945 static int
01946 and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1,
01947 OnigCodePoint* data, int n)
01948 {
01949 int i, r;
01950 OnigCodePoint from2, to2;
01951
01952 for (i = 0; i < n; i++) {
01953 from2 = data[i*2];
01954 to2 = data[i*2+1];
01955 if (from2 < from1) {
01956 if (to2 < from1) continue;
01957 else {
01958 from1 = to2 + 1;
01959 }
01960 }
01961 else if (from2 <= to1) {
01962 if (to2 < to1) {
01963 if (from1 <= from2 - 1) {
01964 r = add_code_range_to_buf(pbuf, env, from1, from2-1);
01965 if (r != 0) return r;
01966 }
01967 from1 = to2 + 1;
01968 }
01969 else {
01970 to1 = from2 - 1;
01971 }
01972 }
01973 else {
01974 from1 = from2;
01975 }
01976 if (from1 > to1) break;
01977 }
01978 if (from1 <= to1) {
01979 r = add_code_range_to_buf(pbuf, env, from1, to1);
01980 if (r != 0) return r;
01981 }
01982 return 0;
01983 }
01984
01985 static int
01986 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
01987 {
01988 int r;
01989 OnigCodePoint i, j, n1, n2, *data1, *data2;
01990 OnigCodePoint from, to, from1, to1, from2, to2;
01991
01992 *pbuf = (BBuf* )NULL;
01993 if (IS_NULL(bbuf1)) {
01994 if (not1 != 0 && IS_NOT_NULL(bbuf2))
01995 return bbuf_clone(pbuf, bbuf2);
01996 return 0;
01997 }
01998 else if (IS_NULL(bbuf2)) {
01999 if (not2 != 0)
02000 return bbuf_clone(pbuf, bbuf1);
02001 return 0;
02002 }
02003
02004 if (not1 != 0)
02005 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
02006
02007 data1 = (OnigCodePoint* )(bbuf1->p);
02008 data2 = (OnigCodePoint* )(bbuf2->p);
02009 GET_CODE_POINT(n1, data1);
02010 GET_CODE_POINT(n2, data2);
02011 data1++;
02012 data2++;
02013
02014 if (not2 == 0 && not1 == 0) {
02015 for (i = 0; i < n1; i++) {
02016 from1 = data1[i*2];
02017 to1 = data1[i*2+1];
02018 for (j = 0; j < n2; j++) {
02019 from2 = data2[j*2];
02020 to2 = data2[j*2+1];
02021 if (from2 > to1) break;
02022 if (to2 < from1) continue;
02023 from = MAX(from1, from2);
02024 to = MIN(to1, to2);
02025 r = add_code_range_to_buf(pbuf, env, from, to);
02026 if (r != 0) return r;
02027 }
02028 }
02029 }
02030 else if (not1 == 0) {
02031 for (i = 0; i < n1; i++) {
02032 from1 = data1[i*2];
02033 to1 = data1[i*2+1];
02034 r = and_code_range1(pbuf, env, from1, to1, data2, n2);
02035 if (r != 0) return r;
02036 }
02037 }
02038
02039 return 0;
02040 }
02041
02042 static int
02043 and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
02044 {
02045 OnigEncoding enc = env->enc;
02046 int r, not1, not2;
02047 BBuf *buf1, *buf2, *pbuf = 0;
02048 BitSetRef bsr1, bsr2;
02049 BitSet bs1, bs2;
02050
02051 not1 = IS_NCCLASS_NOT(dest);
02052 bsr1 = dest->bs;
02053 buf1 = dest->mbuf;
02054 not2 = IS_NCCLASS_NOT(cc);
02055 bsr2 = cc->bs;
02056 buf2 = cc->mbuf;
02057
02058 if (not1 != 0) {
02059 bitset_invert_to(bsr1, bs1);
02060 bsr1 = bs1;
02061 }
02062 if (not2 != 0) {
02063 bitset_invert_to(bsr2, bs2);
02064 bsr2 = bs2;
02065 }
02066 bitset_and(bsr1, bsr2);
02067 if (bsr1 != dest->bs) {
02068 bitset_copy(dest->bs, bsr1);
02069 bsr1 = dest->bs;
02070 }
02071 if (not1 != 0) {
02072 bitset_invert(dest->bs);
02073 }
02074
02075 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
02076 if (not1 != 0 && not2 != 0) {
02077 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env);
02078 }
02079 else {
02080 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env);
02081 if (r == 0 && not1 != 0) {
02082 BBuf *tbuf = 0;
02083 r = not_code_range_buf(enc, pbuf, &tbuf, env);
02084 bbuf_free(pbuf);
02085 pbuf = tbuf;
02086 }
02087 }
02088 if (r != 0) {
02089 bbuf_free(pbuf);
02090 return r;
02091 }
02092
02093 dest->mbuf = pbuf;
02094 bbuf_free(buf1);
02095 return r;
02096 }
02097 return 0;
02098 }
02099
02100 static int
02101 or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
02102 {
02103 OnigEncoding enc = env->enc;
02104 int r, not1, not2;
02105 BBuf *buf1, *buf2, *pbuf = 0;
02106 BitSetRef bsr1, bsr2;
02107 BitSet bs1, bs2;
02108
02109 not1 = IS_NCCLASS_NOT(dest);
02110 bsr1 = dest->bs;
02111 buf1 = dest->mbuf;
02112 not2 = IS_NCCLASS_NOT(cc);
02113 bsr2 = cc->bs;
02114 buf2 = cc->mbuf;
02115
02116 if (not1 != 0) {
02117 bitset_invert_to(bsr1, bs1);
02118 bsr1 = bs1;
02119 }
02120 if (not2 != 0) {
02121 bitset_invert_to(bsr2, bs2);
02122 bsr2 = bs2;
02123 }
02124 bitset_or(bsr1, bsr2);
02125 if (bsr1 != dest->bs) {
02126 bitset_copy(dest->bs, bsr1);
02127 bsr1 = dest->bs;
02128 }
02129 if (not1 != 0) {
02130 bitset_invert(dest->bs);
02131 }
02132
02133 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
02134 if (not1 != 0 && not2 != 0) {
02135 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env);
02136 }
02137 else {
02138 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env);
02139 if (r == 0 && not1 != 0) {
02140 BBuf *tbuf = 0;
02141 r = not_code_range_buf(enc, pbuf, &tbuf, env);
02142 bbuf_free(pbuf);
02143 pbuf = tbuf;
02144 }
02145 }
02146 if (r != 0) {
02147 bbuf_free(pbuf);
02148 return r;
02149 }
02150
02151 dest->mbuf = pbuf;
02152 bbuf_free(buf1);
02153 return r;
02154 }
02155 else
02156 return 0;
02157 }
02158
02159 static void UNKNOWN_ESC_WARN(ScanEnv *env, int c);
02160
02161 static int
02162 conv_backslash_value(int c, ScanEnv* env)
02163 {
02164 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
02165 switch (c) {
02166 case 'n': return '\n';
02167 case 't': return '\t';
02168 case 'r': return '\r';
02169 case 'f': return '\f';
02170 case 'a': return '\007';
02171 case 'b': return '\010';
02172 case 'e': return '\033';
02173 case 'v':
02174 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
02175 return '\v';
02176 break;
02177
02178 default:
02179 if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
02180 UNKNOWN_ESC_WARN(env, c);
02181 break;
02182 }
02183 }
02184 return c;
02185 }
02186
02187 #ifdef USE_NO_INVALID_QUANTIFIER
02188 #define is_invalid_quantifier_target(node) 0
02189 #else
02190 static int
02191 is_invalid_quantifier_target(Node* node)
02192 {
02193 switch (NTYPE(node)) {
02194 case NT_ANCHOR:
02195 return 1;
02196 break;
02197
02198 case NT_ENCLOSE:
02199
02200
02201 break;
02202
02203 case NT_LIST:
02204 do {
02205 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
02206 } while (IS_NOT_NULL(node = NCDR(node)));
02207 return 0;
02208 break;
02209
02210 case NT_ALT:
02211 do {
02212 if (is_invalid_quantifier_target(NCAR(node))) return 1;
02213 } while (IS_NOT_NULL(node = NCDR(node)));
02214 break;
02215
02216 default:
02217 break;
02218 }
02219 return 0;
02220 }
02221 #endif
02222
02223
02224 static int
02225 popular_quantifier_num(QtfrNode* q)
02226 {
02227 if (q->greedy) {
02228 if (q->lower == 0) {
02229 if (q->upper == 1) return 0;
02230 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
02231 }
02232 else if (q->lower == 1) {
02233 if (IS_REPEAT_INFINITE(q->upper)) return 2;
02234 }
02235 }
02236 else {
02237 if (q->lower == 0) {
02238 if (q->upper == 1) return 3;
02239 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
02240 }
02241 else if (q->lower == 1) {
02242 if (IS_REPEAT_INFINITE(q->upper)) return 5;
02243 }
02244 }
02245 return -1;
02246 }
02247
02248
/* How a quantifier nested directly inside another quantifier is
 * simplified. */
enum ReduceType {
  RQ_ASIS = 0,  /* keep both quantifiers as they are                      */
  RQ_DEL  = 1,  /* drop the outer quantifier, keep the inner one          */
  RQ_A    = 2,  /* collapse to a single greedy {0,inf}                    */
  RQ_AQ   = 3,  /* collapse to a single lazy {0,inf}                      */
  RQ_QQ   = 4,  /* collapse to a single lazy {0,1}                        */
  RQ_P_QQ = 5,  /* inner becomes greedy {1,inf}, outer becomes lazy {0,1} */
  RQ_PQ_Q = 6   /* inner becomes lazy {1,inf}, outer becomes greedy {0,1} */
};

/* Reduction to apply, indexed as [inner][outer] by the numbers returned
 * from popular_quantifier_num():
 *   0 = ?,  1 = *,  2 = +,  3 = ??,  4 = *?,  5 = +?               */
static enum ReduceType const ReduceTypeTable[6][6] = {
  /* inner ?  */ { RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS },
  /* inner *  */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL  },
  /* inner +  */ { RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL  },
  /* inner ?? */ { RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ   },
  /* inner *? */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL  },
  /* inner +? */ { RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL  }
};
02267
02268 extern void
02269 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
02270 {
02271 int pnum, cnum;
02272 QtfrNode *p, *c;
02273
02274 p = NQTFR(pnode);
02275 c = NQTFR(cnode);
02276 pnum = popular_quantifier_num(p);
02277 cnum = popular_quantifier_num(c);
02278 if (pnum < 0 || cnum < 0) return ;
02279
02280 switch (ReduceTypeTable[cnum][pnum]) {
02281 case RQ_DEL:
02282 *pnode = *cnode;
02283 break;
02284 case RQ_A:
02285 p->target = c->target;
02286 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
02287 break;
02288 case RQ_AQ:
02289 p->target = c->target;
02290 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
02291 break;
02292 case RQ_QQ:
02293 p->target = c->target;
02294 p->lower = 0; p->upper = 1; p->greedy = 0;
02295 break;
02296 case RQ_P_QQ:
02297 p->target = cnode;
02298 p->lower = 0; p->upper = 1; p->greedy = 0;
02299 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
02300 return ;
02301 break;
02302 case RQ_PQ_Q:
02303 p->target = cnode;
02304 p->lower = 0; p->upper = 1; p->greedy = 1;
02305 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
02306 return ;
02307 break;
02308 case RQ_ASIS:
02309 p->target = cnode;
02310 return ;
02311 break;
02312 }
02313
02314 c->target = NULL_NODE;
02315 onig_node_free(cnode);
02316 }
02317
02318
/* Token kinds stored in OnigToken.type.  Values are consecutive, starting
 * at 0. */
enum TokenSyms {
  TK_EOT                       = 0,
  TK_RAW_BYTE                  = 1,
  TK_CHAR                      = 2,
  TK_STRING                    = 3,
  TK_CODE_POINT                = 4,
  TK_ANYCHAR                   = 5,
  TK_CHAR_TYPE                 = 6,
  TK_BACKREF                   = 7,
  TK_CALL                      = 8,
  TK_ANCHOR                    = 9,
  TK_OP_REPEAT                 = 10,
  TK_INTERVAL                  = 11,
  TK_ANYCHAR_ANYTIME           = 12,
  TK_ALT                       = 13,
  TK_SUBEXP_OPEN               = 14,
  TK_SUBEXP_CLOSE              = 15,
  TK_CC_OPEN                   = 16,
  TK_QUOTE_OPEN                = 17,
  TK_CHAR_PROPERTY             = 18,
  TK_LINEBREAK                 = 19,
  TK_EXTENDED_GRAPHEME_CLUSTER = 20,
  TK_KEEP                      = 21,

  /* the TK_CC_* / POSIX bracket group */
  TK_CC_CLOSE                  = 22,
  TK_CC_RANGE                  = 23,
  TK_POSIX_BRACKET_OPEN        = 24,
  TK_CC_AND                    = 25,
  TK_CC_CC_OPEN                = 26
};
02349
02350 typedef struct {
02351 enum TokenSyms type;
02352 int escaped;
02353 int base;
02354 UChar* backp;
02355 union {
02356 UChar* s;
02357 int c;
02358 OnigCodePoint code;
02359 struct {
02360 int subtype;
02361 int ascii_range;
02362 } anchor;
02363 struct {
02364 int lower;
02365 int upper;
02366 int greedy;
02367 int possessive;
02368 } repeat;
02369 struct {
02370 int num;
02371 int ref1;
02372 int* refs;
02373 int by_name;
02374 #ifdef USE_BACKREF_WITH_LEVEL
02375 int exist_level;
02376 int level;
02377 #endif
02378 } backref;
02379 struct {
02380 UChar* name;
02381 UChar* name_end;
02382 int gnum;
02383 int rel;
02384 } call;
02385 struct {
02386 int ctype;
02387 int not;
02388 } prop;
02389 } u;
02390 } OnigToken;
02391
02392
02393 static int
02394 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
02395 {
02396 int low, up, syn_allow, non_low = 0;
02397 int r = 0;
02398 OnigCodePoint c;
02399 OnigEncoding enc = env->enc;
02400 UChar* p = *src;
02401 PFETCH_READY;
02402
02403 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
02404
02405 if (PEND) {
02406 if (syn_allow)
02407 return 1;
02408 else
02409 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
02410 }
02411
02412 if (! syn_allow) {
02413 c = PPEEK;
02414 if (c == ')' || c == '(' || c == '|') {
02415 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
02416 }
02417 }
02418
02419 low = onig_scan_unsigned_number(&p, end, env->enc);
02420 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02421 if (low > ONIG_MAX_REPEAT_NUM)
02422 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02423
02424 if (p == *src) {
02425 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
02426
02427 low = 0;
02428 non_low = 1;
02429 }
02430 else
02431 goto invalid;
02432 }
02433
02434 if (PEND) goto invalid;
02435 PFETCH(c);
02436 if (c == ',') {
02437 UChar* prev = p;
02438 up = onig_scan_unsigned_number(&p, end, env->enc);
02439 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02440 if (up > ONIG_MAX_REPEAT_NUM)
02441 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02442
02443 if (p == prev) {
02444 if (non_low != 0)
02445 goto invalid;
02446 up = REPEAT_INFINITE;
02447 }
02448 }
02449 else {
02450 if (non_low != 0)
02451 goto invalid;
02452
02453 PUNFETCH;
02454 up = low;
02455 r = 2;
02456 }
02457
02458 if (PEND) goto invalid;
02459 PFETCH(c);
02460 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
02461 if (c != MC_ESC(env->syntax)) goto invalid;
02462 PFETCH(c);
02463 }
02464 if (c != '}') goto invalid;
02465
02466 if (!IS_REPEAT_INFINITE(up) && low > up) {
02467 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
02468 }
02469
02470 tok->type = TK_INTERVAL;
02471 tok->u.repeat.lower = low;
02472 tok->u.repeat.upper = up;
02473 *src = p;
02474 return r;
02475
02476 invalid:
02477 if (syn_allow)
02478 return 1;
02479 else
02480 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
02481 }
02482
02483
02484 static int
02485 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
02486 {
02487 int v;
02488 OnigCodePoint c;
02489 OnigEncoding enc = env->enc;
02490 UChar* p = *src;
02491 PFETCH_READY;
02492
02493 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
02494
02495 PFETCH(c);
02496 switch (c) {
02497 case 'M':
02498 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
02499 if (PEND) return ONIGERR_END_PATTERN_AT_META;
02500 PFETCH(c);
02501 if (c != '-') return ONIGERR_META_CODE_SYNTAX;
02502 if (PEND) return ONIGERR_END_PATTERN_AT_META;
02503 PFETCH(c);
02504 if (c == MC_ESC(env->syntax)) {
02505 v = fetch_escaped_value(&p, end, env);
02506 if (v < 0) return v;
02507 c = (OnigCodePoint )v;
02508 }
02509 c = ((c & 0xff) | 0x80);
02510 }
02511 else
02512 goto backslash;
02513 break;
02514
02515 case 'C':
02516 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
02517 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
02518 PFETCH(c);
02519 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
02520 goto control;
02521 }
02522 else
02523 goto backslash;
02524
02525 case 'c':
02526 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
02527 control:
02528 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
02529 PFETCH(c);
02530 if (c == '?') {
02531 c = 0177;
02532 }
02533 else {
02534 if (c == MC_ESC(env->syntax)) {
02535 v = fetch_escaped_value(&p, end, env);
02536 if (v < 0) return v;
02537 c = (OnigCodePoint )v;
02538 }
02539 c &= 0x9f;
02540 }
02541 break;
02542 }
02543
02544
02545 default:
02546 {
02547 backslash:
02548 c = conv_backslash_value(c, env);
02549 }
02550 break;
02551 }
02552
02553 *src = p;
02554 return c;
02555 }
02556
02557 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
02558
02559 static OnigCodePoint
02560 get_name_end_code_point(OnigCodePoint start)
02561 {
02562 switch (start) {
02563 case '<': return (OnigCodePoint )'>'; break;
02564 case '\'': return (OnigCodePoint )'\''; break;
02565 case '(': return (OnigCodePoint )')'; break;
02566 case '{': return (OnigCodePoint )'}'; break;
02567 default:
02568 break;
02569 }
02570
02571 return (OnigCodePoint )0;
02572 }
02573
02574 #ifdef USE_NAMED_GROUP
#ifdef USE_BACKREF_WITH_LEVEL
/*
 * Parse a back-reference name or number that may carry a nest level:
 *   name, name+n, name-n, num, num+n, num-n, -num, -num+n, -num-n
 * starting at *src and ending at the closing character that matches
 * `start_code` (see get_name_end_code_point()).
 *
 * Outputs on success:
 *   *rname_end   end of the name/number part
 *   *rback_num   signed group number when the reference is numeric,
 *                0 when it is a name
 *   *rlevel      signed nest level (only written when a level is present)
 *   *src         advanced past the closing character
 *
 * Returns 1 when a level was given and 0 when not.  On failure a negative
 * ONIGERR_* code is returned; for the name-syntax errors the offending
 * span is also recorded via onig_scan_env_set_error_string().
 *
 * Fix: the character after the level sign and the character after the
 * level digits are now fetched only when the pattern has not ended; a
 * pattern that stops there is reported as ONIGERR_INVALID_GROUP_NAME
 * instead of reading past `end`.
 */
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel)
{
  int r, sign, is_num, exist_level;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  is_num = exist_level = 0;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  name_end = end;
  r = 0;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    /* the first character decides between number, negative number, name */
    if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      is_num = 1;
    }
    else if (c == '-') {
      is_num = 2;      /* sign seen, no digit yet */
      sign = -1;
      pnum_head = p;   /* the digits start after the sign */
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* consume the rest of the name/number up to a terminator or level sign */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (is_num != 0) {
      if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
        is_num = 1;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) goto err;  /* nothing follows the level sign */
      PFETCH(c);
      if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
      PUNFETCH;
      level = onig_scan_unsigned_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      if (PEND) goto err;  /* closing character is missing */
      PFETCH(c);
      if (c == end_code)
        goto end;
    }

  err:
    r = ONIGERR_INVALID_GROUP_NAME;
    name_end = end;
  }

 end:
  if (r == 0) {
    if (is_num != 0) {
      *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) goto err;  /* group 0 cannot be referenced */

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
#endif /* USE_BACKREF_WITH_LEVEL */
02690
02691
02692
02693
02694
02695 static int
02696 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
02697 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
02698 {
02699 int r, is_num, sign;
02700 OnigCodePoint end_code;
02701 OnigCodePoint c = 0;
02702 OnigEncoding enc = env->enc;
02703 UChar *name_end;
02704 UChar *pnum_head;
02705 UChar *p = *src;
02706 PFETCH_READY;
02707
02708 *rback_num = 0;
02709
02710 end_code = get_name_end_code_point(start_code);
02711
02712 name_end = end;
02713 pnum_head = *src;
02714 r = 0;
02715 is_num = 0;
02716 sign = 1;
02717 if (PEND) {
02718 return ONIGERR_EMPTY_GROUP_NAME;
02719 }
02720 else {
02721 PFETCH(c);
02722 if (c == end_code)
02723 return ONIGERR_EMPTY_GROUP_NAME;
02724
02725 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02726 if (ref == 1)
02727 is_num = 1;
02728 else {
02729 r = ONIGERR_INVALID_GROUP_NAME;
02730 is_num = 0;
02731 }
02732 }
02733 else if (c == '-') {
02734 if (ref == 1) {
02735 is_num = 2;
02736 sign = -1;
02737 pnum_head = p;
02738 }
02739 else {
02740 r = ONIGERR_INVALID_GROUP_NAME;
02741 is_num = 0;
02742 }
02743 }
02744 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
02745 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02746 }
02747 }
02748
02749 if (r == 0) {
02750 while (!PEND) {
02751 name_end = p;
02752 PFETCH(c);
02753 if (c == end_code || c == ')') {
02754 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
02755 break;
02756 }
02757
02758 if (is_num != 0) {
02759 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02760 is_num = 1;
02761 }
02762 else {
02763 if (!ONIGENC_IS_CODE_WORD(enc, c))
02764 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02765 else
02766 r = ONIGERR_INVALID_GROUP_NAME;
02767
02768 is_num = 0;
02769 }
02770 }
02771 else {
02772 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
02773 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02774 }
02775 }
02776 }
02777
02778 if (c != end_code) {
02779 r = ONIGERR_INVALID_GROUP_NAME;
02780 name_end = end;
02781 }
02782
02783 if (is_num != 0) {
02784 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
02785 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
02786 else if (*rback_num == 0) {
02787 r = ONIGERR_INVALID_GROUP_NAME;
02788 goto err;
02789 }
02790
02791 *rback_num *= sign;
02792 }
02793
02794 *rname_end = name_end;
02795 *src = p;
02796 return 0;
02797 }
02798 else {
02799 while (!PEND) {
02800 name_end = p;
02801 PFETCH(c);
02802 if (c == end_code || c == ')')
02803 break;
02804 }
02805 if (PEND)
02806 name_end = end;
02807
02808 err:
02809 onig_scan_env_set_error_string(env, r, *src, name_end);
02810 return r;
02811 }
02812 }
02813 #else
02814 static int
02815 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
02816 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
02817 {
02818 int r, is_num, sign;
02819 OnigCodePoint end_code;
02820 OnigCodePoint c = 0;
02821 UChar *name_end;
02822 OnigEncoding enc = env->enc;
02823 UChar *pnum_head;
02824 UChar *p = *src;
02825 PFETCH_READY;
02826
02827 *rback_num = 0;
02828
02829 end_code = get_name_end_code_point(start_code);
02830
02831 *rname_end = name_end = end;
02832 r = 0;
02833 pnum_head = *src;
02834 is_num = 0;
02835 sign = 1;
02836
02837 if (PEND) {
02838 return ONIGERR_EMPTY_GROUP_NAME;
02839 }
02840 else {
02841 PFETCH(c);
02842 if (c == end_code)
02843 return ONIGERR_EMPTY_GROUP_NAME;
02844
02845 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02846 is_num = 1;
02847 }
02848 else if (c == '-') {
02849 is_num = 2;
02850 sign = -1;
02851 pnum_head = p;
02852 }
02853 else {
02854 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02855 }
02856 }
02857
02858 while (!PEND) {
02859 name_end = p;
02860
02861 PFETCH(c);
02862 if (c == end_code || c == ')') break;
02863 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
02864 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02865 }
02866 if (r == 0 && c != end_code) {
02867 r = ONIGERR_INVALID_GROUP_NAME;
02868 name_end = end;
02869 }
02870
02871 if (r == 0) {
02872 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
02873 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
02874 else if (*rback_num == 0) {
02875 r = ONIGERR_INVALID_GROUP_NAME;
02876 goto err;
02877 }
02878 *rback_num *= sign;
02879
02880 *rname_end = name_end;
02881 *src = p;
02882 return 0;
02883 }
02884 else {
02885 err:
02886 onig_scan_env_set_error_string(env, r, *src, name_end);
02887 return r;
02888 }
02889 }
02890 #endif
02891
02892 void onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc,
02893 UChar* pat, UChar* pat_end, const UChar *fmt, va_list args);
02894
02895 static void
02896 onig_syntax_warn(ScanEnv *env, const char *fmt, ...)
02897 {
02898 va_list args;
02899 UChar buf[WARN_BUFSIZE];
02900 va_start(args, fmt);
02901 onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
02902 env->pattern, env->pattern_end,
02903 (const UChar *)fmt, args);
02904 va_end(args);
02905 if (env->sourcefile == NULL)
02906 rb_warn("%s", (char *)buf);
02907 else
02908 rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf);
02909 }
02910
02911 static void
02912 CC_ESC_WARN(ScanEnv *env, UChar *c)
02913 {
02914 if (onig_warn == onig_null_warn) return ;
02915
02916 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
02917 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
02918 onig_syntax_warn(env, "character class has '%s' without escape", c);
02919 }
02920 }
02921
02922 static void
02923 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
02924 {
02925 if (onig_warn == onig_null_warn) return ;
02926
02927 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
02928 onig_syntax_warn(env, "regular expression has '%s' without escape", c);
02929 }
02930 }
02931
02932 static void
02933 CC_DUP_WARN(ScanEnv *env)
02934 {
02935 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
02936
02937 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) &&
02938 !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) {
02939 env->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
02940 onig_syntax_warn(env, "character class has duplicated range");
02941 }
02942 }
02943
02944 static void
02945 UNKNOWN_ESC_WARN(ScanEnv *env, int c)
02946 {
02947 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
02948 onig_syntax_warn(env, "Unknown escape \\%c is ignored", c);
02949 }
02950
02951 static UChar*
02952 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
02953 UChar **next, OnigEncoding enc)
02954 {
02955 int i;
02956 OnigCodePoint x;
02957 UChar *q;
02958 UChar *p = from;
02959
02960 while (p < to) {
02961 x = ONIGENC_MBC_TO_CODE(enc, p, to);
02962 q = p + enclen(enc, p, to);
02963 if (x == s[0]) {
02964 for (i = 1; i < n && q < to; i++) {
02965 x = ONIGENC_MBC_TO_CODE(enc, q, to);
02966 if (x != s[i]) break;
02967 q += enclen(enc, q, to);
02968 }
02969 if (i >= n) {
02970 if (IS_NOT_NULL(next))
02971 *next = q;
02972 return p;
02973 }
02974 }
02975 p = q;
02976 }
02977 return NULL_UCHARP;
02978 }
02979
02980 static int
02981 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
02982 OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn)
02983 {
02984 int i, in_esc;
02985 OnigCodePoint x;
02986 UChar *q;
02987 UChar *p = from;
02988
02989 in_esc = 0;
02990 while (p < to) {
02991 if (in_esc) {
02992 in_esc = 0;
02993 p += enclen(enc, p, to);
02994 }
02995 else {
02996 x = ONIGENC_MBC_TO_CODE(enc, p, to);
02997 q = p + enclen(enc, p, to);
02998 if (x == s[0]) {
02999 for (i = 1; i < n && q < to; i++) {
03000 x = ONIGENC_MBC_TO_CODE(enc, q, to);
03001 if (x != s[i]) break;
03002 q += enclen(enc, q, to);
03003 }
03004 if (i >= n) return 1;
03005 p += enclen(enc, p, to);
03006 }
03007 else {
03008 x = ONIGENC_MBC_TO_CODE(enc, p, to);
03009 if (x == bad) return 0;
03010 else if (x == MC_ESC(syn)) in_esc = 1;
03011 p = q;
03012 }
03013 }
03014 }
03015 return 0;
03016 }
03017
03018 static int
03019 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
03020 {
03021 int num;
03022 OnigCodePoint c, c2;
03023 const OnigSyntaxType* syn = env->syntax;
03024 OnigEncoding enc = env->enc;
03025 UChar* prev;
03026 UChar* p = *src;
03027 PFETCH_READY;
03028
03029 if (PEND) {
03030 tok->type = TK_EOT;
03031 return tok->type;
03032 }
03033
03034 PFETCH(c);
03035 tok->type = TK_CHAR;
03036 tok->base = 0;
03037 tok->u.c = c;
03038 tok->escaped = 0;
03039
03040 if (c == ']') {
03041 tok->type = TK_CC_CLOSE;
03042 }
03043 else if (c == '-') {
03044 tok->type = TK_CC_RANGE;
03045 }
03046 else if (c == MC_ESC(syn)) {
03047 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
03048 goto end;
03049
03050 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
03051
03052 PFETCH(c);
03053 tok->escaped = 1;
03054 tok->u.c = c;
03055 switch (c) {
03056 case 'w':
03057 tok->type = TK_CHAR_TYPE;
03058 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
03059 tok->u.prop.not = 0;
03060 break;
03061 case 'W':
03062 tok->type = TK_CHAR_TYPE;
03063 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
03064 tok->u.prop.not = 1;
03065 break;
03066 case 'd':
03067 tok->type = TK_CHAR_TYPE;
03068 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
03069 tok->u.prop.not = 0;
03070 break;
03071 case 'D':
03072 tok->type = TK_CHAR_TYPE;
03073 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
03074 tok->u.prop.not = 1;
03075 break;
03076 case 's':
03077 tok->type = TK_CHAR_TYPE;
03078 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
03079 tok->u.prop.not = 0;
03080 break;
03081 case 'S':
03082 tok->type = TK_CHAR_TYPE;
03083 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
03084 tok->u.prop.not = 1;
03085 break;
03086 case 'h':
03087 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03088 tok->type = TK_CHAR_TYPE;
03089 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03090 tok->u.prop.not = 0;
03091 break;
03092 case 'H':
03093 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03094 tok->type = TK_CHAR_TYPE;
03095 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03096 tok->u.prop.not = 1;
03097 break;
03098
03099 case 'p':
03100 case 'P':
03101 c2 = PPEEK;
03102 if (c2 == '{' &&
03103 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
03104 PINC;
03105 tok->type = TK_CHAR_PROPERTY;
03106 tok->u.prop.not = (c == 'P' ? 1 : 0);
03107
03108 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
03109 PFETCH(c2);
03110 if (c2 == '^') {
03111 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
03112 }
03113 else
03114 PUNFETCH;
03115 }
03116 }
03117 else {
03118 onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
03119 }
03120 break;
03121
03122 case 'x':
03123 if (PEND) break;
03124
03125 prev = p;
03126 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
03127 PINC;
03128 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
03129 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
03130 if (!PEND) {
03131 c2 = PPEEK;
03132 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
03133 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
03134 }
03135
03136 if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
03137 PINC;
03138 tok->type = TK_CODE_POINT;
03139 tok->base = 16;
03140 tok->u.code = (OnigCodePoint )num;
03141 }
03142 else {
03143
03144 p = prev;
03145 }
03146 }
03147 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
03148 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
03149 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03150 if (p == prev) {
03151 num = 0;
03152 }
03153 tok->type = TK_RAW_BYTE;
03154 tok->base = 16;
03155 tok->u.c = num;
03156 }
03157 break;
03158
03159 case 'u':
03160 if (PEND) break;
03161
03162 prev = p;
03163 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
03164 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
03165 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
03166 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03167 if (p == prev) {
03168 num = 0;
03169 }
03170 tok->type = TK_CODE_POINT;
03171 tok->base = 16;
03172 tok->u.code = (OnigCodePoint )num;
03173 }
03174 break;
03175
03176 case '0':
03177 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
03178 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
03179 PUNFETCH;
03180 prev = p;
03181 num = scan_unsigned_octal_number(&p, end, 3, enc);
03182 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03183 if (p == prev) {
03184 num = 0;
03185 }
03186 tok->type = TK_RAW_BYTE;
03187 tok->base = 8;
03188 tok->u.c = num;
03189 }
03190 break;
03191
03192 default:
03193 PUNFETCH;
03194 num = fetch_escaped_value(&p, end, env);
03195 if (num < 0) return num;
03196 if (tok->u.c != num) {
03197 tok->u.code = (OnigCodePoint )num;
03198 tok->type = TK_CODE_POINT;
03199 }
03200 break;
03201 }
03202 }
03203 else if (c == '[') {
03204 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
03205 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
03206 tok->backp = p;
03207 PINC;
03208 if (str_exist_check_with_esc(send, 2, p, end,
03209 (OnigCodePoint )']', enc, syn)) {
03210 tok->type = TK_POSIX_BRACKET_OPEN;
03211 }
03212 else {
03213 PUNFETCH;
03214 goto cc_in_cc;
03215 }
03216 }
03217 else {
03218 cc_in_cc:
03219 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
03220 tok->type = TK_CC_CC_OPEN;
03221 }
03222 else {
03223 CC_ESC_WARN(env, (UChar* )"[");
03224 }
03225 }
03226 }
03227 else if (c == '&') {
03228 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
03229 !PEND && (PPEEK_IS('&'))) {
03230 PINC;
03231 tok->type = TK_CC_AND;
03232 }
03233 }
03234
03235 end:
03236 *src = p;
03237 return tok->type;
03238 }
03239
03240 #ifdef USE_NAMED_GROUP
03241 static int
03242 fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src,
03243 UChar* end, ScanEnv* env)
03244 {
03245 int r, num;
03246 const OnigSyntaxType* syn = env->syntax;
03247 UChar* prev;
03248 UChar* p = *src;
03249 UChar* name_end;
03250 int* backs;
03251 int back_num;
03252
03253 prev = p;
03254
03255 #ifdef USE_BACKREF_WITH_LEVEL
03256 name_end = NULL_UCHARP;
03257 r = fetch_name_with_level(c, &p, end, &name_end,
03258 env, &back_num, &tok->u.backref.level);
03259 if (r == 1) tok->u.backref.exist_level = 1;
03260 else tok->u.backref.exist_level = 0;
03261 #else
03262 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
03263 #endif
03264 if (r < 0) return r;
03265
03266 if (back_num != 0) {
03267 if (back_num < 0) {
03268 back_num = BACKREF_REL_TO_ABS(back_num, env);
03269 if (back_num <= 0)
03270 return ONIGERR_INVALID_BACKREF;
03271 }
03272
03273 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03274 if (back_num > env->num_mem ||
03275 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
03276 return ONIGERR_INVALID_BACKREF;
03277 }
03278 tok->type = TK_BACKREF;
03279 tok->u.backref.by_name = 0;
03280 tok->u.backref.num = 1;
03281 tok->u.backref.ref1 = back_num;
03282 }
03283 else {
03284 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
03285 if (num <= 0) {
03286 onig_scan_env_set_error_string(env,
03287 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
03288 return ONIGERR_UNDEFINED_NAME_REFERENCE;
03289 }
03290 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03291 int i;
03292 for (i = 0; i < num; i++) {
03293 if (backs[i] > env->num_mem ||
03294 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
03295 return ONIGERR_INVALID_BACKREF;
03296 }
03297 }
03298
03299 tok->type = TK_BACKREF;
03300 tok->u.backref.by_name = 1;
03301 if (num == 1) {
03302 tok->u.backref.num = 1;
03303 tok->u.backref.ref1 = backs[0];
03304 }
03305 else {
03306 tok->u.backref.num = num;
03307 tok->u.backref.refs = backs;
03308 }
03309 }
03310 *src = p;
03311 return 0;
03312 }
03313 #endif
03314
03315 static int
03316 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
03317 {
03318 int r, num;
03319 OnigCodePoint c;
03320 OnigEncoding enc = env->enc;
03321 const OnigSyntaxType* syn = env->syntax;
03322 UChar* prev;
03323 UChar* p = *src;
03324 PFETCH_READY;
03325
03326 start:
03327 if (PEND) {
03328 tok->type = TK_EOT;
03329 return tok->type;
03330 }
03331
03332 tok->type = TK_STRING;
03333 tok->base = 0;
03334 tok->backp = p;
03335
03336 PFETCH(c);
03337 if (IS_MC_ESC_CODE(c, syn)) {
03338 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
03339
03340 tok->backp = p;
03341 PFETCH(c);
03342
03343 tok->u.c = c;
03344 tok->escaped = 1;
03345 switch (c) {
03346 case '*':
03347 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
03348 tok->type = TK_OP_REPEAT;
03349 tok->u.repeat.lower = 0;
03350 tok->u.repeat.upper = REPEAT_INFINITE;
03351 goto greedy_check;
03352 break;
03353
03354 case '+':
03355 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
03356 tok->type = TK_OP_REPEAT;
03357 tok->u.repeat.lower = 1;
03358 tok->u.repeat.upper = REPEAT_INFINITE;
03359 goto greedy_check;
03360 break;
03361
03362 case '?':
03363 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
03364 tok->type = TK_OP_REPEAT;
03365 tok->u.repeat.lower = 0;
03366 tok->u.repeat.upper = 1;
03367 greedy_check:
03368 if (!PEND && PPEEK_IS('?') &&
03369 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
03370 PFETCH(c);
03371 tok->u.repeat.greedy = 0;
03372 tok->u.repeat.possessive = 0;
03373 }
03374 else {
03375 possessive_check:
03376 if (!PEND && PPEEK_IS('+') &&
03377 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
03378 tok->type != TK_INTERVAL) ||
03379 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
03380 tok->type == TK_INTERVAL))) {
03381 PFETCH(c);
03382 tok->u.repeat.greedy = 1;
03383 tok->u.repeat.possessive = 1;
03384 }
03385 else {
03386 tok->u.repeat.greedy = 1;
03387 tok->u.repeat.possessive = 0;
03388 }
03389 }
03390 break;
03391
03392 case '{':
03393 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
03394 r = fetch_range_quantifier(&p, end, tok, env);
03395 if (r < 0) return r;
03396 if (r == 0) goto greedy_check;
03397 else if (r == 2) {
03398 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
03399 goto possessive_check;
03400
03401 goto greedy_check;
03402 }
03403
03404 break;
03405
03406 case '|':
03407 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
03408 tok->type = TK_ALT;
03409 break;
03410
03411 case '(':
03412 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
03413 tok->type = TK_SUBEXP_OPEN;
03414 break;
03415
03416 case ')':
03417 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
03418 tok->type = TK_SUBEXP_CLOSE;
03419 break;
03420
03421 case 'w':
03422 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
03423 tok->type = TK_CHAR_TYPE;
03424 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
03425 tok->u.prop.not = 0;
03426 break;
03427
03428 case 'W':
03429 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
03430 tok->type = TK_CHAR_TYPE;
03431 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
03432 tok->u.prop.not = 1;
03433 break;
03434
03435 case 'b':
03436 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
03437 tok->type = TK_ANCHOR;
03438 tok->u.anchor.subtype = ANCHOR_WORD_BOUND;
03439 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
03440 && ! IS_WORD_BOUND_ALL_RANGE(env->option);
03441 break;
03442
03443 case 'B':
03444 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
03445 tok->type = TK_ANCHOR;
03446 tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND;
03447 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
03448 && ! IS_WORD_BOUND_ALL_RANGE(env->option);
03449 break;
03450
03451 #ifdef USE_WORD_BEGIN_END
03452 case '<':
03453 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
03454 tok->type = TK_ANCHOR;
03455 tok->u.anchor.subtype = ANCHOR_WORD_BEGIN;
03456 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
03457 break;
03458
03459 case '>':
03460 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
03461 tok->type = TK_ANCHOR;
03462 tok->u.anchor.subtype = ANCHOR_WORD_END;
03463 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
03464 break;
03465 #endif
03466
03467 case 's':
03468 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
03469 tok->type = TK_CHAR_TYPE;
03470 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
03471 tok->u.prop.not = 0;
03472 break;
03473
03474 case 'S':
03475 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
03476 tok->type = TK_CHAR_TYPE;
03477 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
03478 tok->u.prop.not = 1;
03479 break;
03480
03481 case 'd':
03482 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
03483 tok->type = TK_CHAR_TYPE;
03484 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
03485 tok->u.prop.not = 0;
03486 break;
03487
03488 case 'D':
03489 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
03490 tok->type = TK_CHAR_TYPE;
03491 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
03492 tok->u.prop.not = 1;
03493 break;
03494
03495 case 'h':
03496 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03497 tok->type = TK_CHAR_TYPE;
03498 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03499 tok->u.prop.not = 0;
03500 break;
03501
03502 case 'H':
03503 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03504 tok->type = TK_CHAR_TYPE;
03505 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03506 tok->u.prop.not = 1;
03507 break;
03508
03509 case 'A':
03510 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03511 begin_buf:
03512 tok->type = TK_ANCHOR;
03513 tok->u.anchor.subtype = ANCHOR_BEGIN_BUF;
03514 break;
03515
03516 case 'Z':
03517 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03518 tok->type = TK_ANCHOR;
03519 tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF;
03520 break;
03521
03522 case 'z':
03523 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03524 end_buf:
03525 tok->type = TK_ANCHOR;
03526 tok->u.anchor.subtype = ANCHOR_END_BUF;
03527 break;
03528
03529 case 'G':
03530 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
03531 tok->type = TK_ANCHOR;
03532 tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION;
03533 break;
03534
03535 case '`':
03536 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
03537 goto begin_buf;
03538 break;
03539
03540 case '\'':
03541 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
03542 goto end_buf;
03543 break;
03544
03545 case 'x':
03546 if (PEND) break;
03547
03548 prev = p;
03549 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
03550 PINC;
03551 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
03552 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
03553 if (!PEND) {
03554 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
03555 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
03556 }
03557
03558 if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) {
03559 PINC;
03560 tok->type = TK_CODE_POINT;
03561 tok->u.code = (OnigCodePoint )num;
03562 }
03563 else {
03564
03565 p = prev;
03566 }
03567 }
03568 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
03569 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
03570 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03571 if (p == prev) {
03572 num = 0;
03573 }
03574 tok->type = TK_RAW_BYTE;
03575 tok->base = 16;
03576 tok->u.c = num;
03577 }
03578 break;
03579
03580 case 'u':
03581 if (PEND) break;
03582
03583 prev = p;
03584 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
03585 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
03586 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
03587 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03588 if (p == prev) {
03589 num = 0;
03590 }
03591 tok->type = TK_CODE_POINT;
03592 tok->base = 16;
03593 tok->u.code = (OnigCodePoint )num;
03594 }
03595 break;
03596
03597 case '1': case '2': case '3': case '4':
03598 case '5': case '6': case '7': case '8': case '9':
03599 PUNFETCH;
03600 prev = p;
03601 num = onig_scan_unsigned_number(&p, end, enc);
03602 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
03603 goto skip_backref;
03604 }
03605
03606 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
03607 (num <= env->num_mem || num <= 9)) {
03608 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03609 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
03610 return ONIGERR_INVALID_BACKREF;
03611 }
03612
03613 tok->type = TK_BACKREF;
03614 tok->u.backref.num = 1;
03615 tok->u.backref.ref1 = num;
03616 tok->u.backref.by_name = 0;
03617 #ifdef USE_BACKREF_WITH_LEVEL
03618 tok->u.backref.exist_level = 0;
03619 #endif
03620 break;
03621 }
03622
03623 skip_backref:
03624 if (c == '8' || c == '9') {
03625
03626 p = prev; PINC;
03627 break;
03628 }
03629
03630 p = prev;
03631
03632 case '0':
03633 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
03634 prev = p;
03635 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
03636 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03637 if (p == prev) {
03638 num = 0;
03639 }
03640 tok->type = TK_RAW_BYTE;
03641 tok->base = 8;
03642 tok->u.c = num;
03643 }
03644 else if (c != '0') {
03645 PINC;
03646 }
03647 break;
03648
03649 #ifdef USE_NAMED_GROUP
03650 case 'k':
03651 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
03652 PFETCH(c);
03653 if (c == '<' || c == '\'') {
03654 r = fetch_named_backref_token(c, tok, &p, end, env);
03655 if (r < 0) return r;
03656 }
03657 else {
03658 PUNFETCH;
03659 onig_syntax_warn(env, "invalid back reference");
03660 }
03661 }
03662 break;
03663 #endif
03664
03665 #if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP)
03666 case 'g':
03667 #ifdef USE_NAMED_GROUP
03668 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) {
03669 PFETCH(c);
03670 if (c == '{') {
03671 r = fetch_named_backref_token(c, tok, &p, end, env);
03672 if (r < 0) return r;
03673 }
03674 else
03675 PUNFETCH;
03676 }
03677 #endif
03678 #ifdef USE_SUBEXP_CALL
03679 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
03680 PFETCH(c);
03681 if (c == '<' || c == '\'') {
03682 int gnum = -1, rel = 0;
03683 UChar* name_end;
03684 OnigCodePoint cnext;
03685
03686 cnext = PPEEK;
03687 if (cnext == '0') {
03688 PINC;
03689 if (PPEEK_IS(get_name_end_code_point(c))) {
03690 PINC;
03691 name_end = p;
03692 gnum = 0;
03693 }
03694 }
03695 else if (cnext == '+') {
03696 PINC;
03697 rel = 1;
03698 }
03699 prev = p;
03700 if (gnum < 0) {
03701 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
03702 if (r < 0) return r;
03703 }
03704
03705 tok->type = TK_CALL;
03706 tok->u.call.name = prev;
03707 tok->u.call.name_end = name_end;
03708 tok->u.call.gnum = gnum;
03709 tok->u.call.rel = rel;
03710 }
03711 else {
03712 onig_syntax_warn(env, "invalid subexp call");
03713 PUNFETCH;
03714 }
03715 }
03716 #endif
03717 break;
03718 #endif
03719
03720 case 'Q':
03721 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
03722 tok->type = TK_QUOTE_OPEN;
03723 }
03724 break;
03725
03726 case 'p':
03727 case 'P':
03728 if (PPEEK_IS('{') &&
03729 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
03730 PINC;
03731 tok->type = TK_CHAR_PROPERTY;
03732 tok->u.prop.not = (c == 'P' ? 1 : 0);
03733
03734 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
03735 PFETCH(c);
03736 if (c == '^') {
03737 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
03738 }
03739 else
03740 PUNFETCH;
03741 }
03742 }
03743 else {
03744 onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
03745 }
03746 break;
03747
03748 case 'R':
03749 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) {
03750 tok->type = TK_LINEBREAK;
03751 }
03752 break;
03753
03754 case 'X':
03755 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) {
03756 tok->type = TK_EXTENDED_GRAPHEME_CLUSTER;
03757 }
03758 break;
03759
03760 case 'K':
03761 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) {
03762 tok->type = TK_KEEP;
03763 }
03764 break;
03765
03766 default:
03767 PUNFETCH;
03768 num = fetch_escaped_value(&p, end, env);
03769 if (num < 0) return num;
03770
03771 if (tok->u.c != num) {
03772 tok->type = TK_CODE_POINT;
03773 tok->u.code = (OnigCodePoint )num;
03774 }
03775 else {
03776 p = tok->backp + enclen(enc, tok->backp, end);
03777 }
03778 break;
03779 }
03780 }
03781 else {
03782 tok->u.c = c;
03783 tok->escaped = 0;
03784
03785 #ifdef USE_VARIABLE_META_CHARS
03786 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
03787 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
03788 if (c == MC_ANYCHAR(syn))
03789 goto any_char;
03790 else if (c == MC_ANYTIME(syn))
03791 goto anytime;
03792 else if (c == MC_ZERO_OR_ONE_TIME(syn))
03793 goto zero_or_one_time;
03794 else if (c == MC_ONE_OR_MORE_TIME(syn))
03795 goto one_or_more_time;
03796 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
03797 tok->type = TK_ANYCHAR_ANYTIME;
03798 goto out;
03799 }
03800 }
03801 #endif
03802
03803 switch (c) {
03804 case '.':
03805 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
03806 #ifdef USE_VARIABLE_META_CHARS
03807 any_char:
03808 #endif
03809 tok->type = TK_ANYCHAR;
03810 break;
03811
03812 case '*':
03813 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
03814 #ifdef USE_VARIABLE_META_CHARS
03815 anytime:
03816 #endif
03817 tok->type = TK_OP_REPEAT;
03818 tok->u.repeat.lower = 0;
03819 tok->u.repeat.upper = REPEAT_INFINITE;
03820 goto greedy_check;
03821 break;
03822
03823 case '+':
03824 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
03825 #ifdef USE_VARIABLE_META_CHARS
03826 one_or_more_time:
03827 #endif
03828 tok->type = TK_OP_REPEAT;
03829 tok->u.repeat.lower = 1;
03830 tok->u.repeat.upper = REPEAT_INFINITE;
03831 goto greedy_check;
03832 break;
03833
03834 case '?':
03835 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
03836 #ifdef USE_VARIABLE_META_CHARS
03837 zero_or_one_time:
03838 #endif
03839 tok->type = TK_OP_REPEAT;
03840 tok->u.repeat.lower = 0;
03841 tok->u.repeat.upper = 1;
03842 goto greedy_check;
03843 break;
03844
03845 case '{':
03846 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
03847 r = fetch_range_quantifier(&p, end, tok, env);
03848 if (r < 0) return r;
03849 if (r == 0) goto greedy_check;
03850 else if (r == 2) {
03851 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
03852 goto possessive_check;
03853
03854 goto greedy_check;
03855 }
03856
03857 break;
03858
03859 case '|':
03860 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
03861 tok->type = TK_ALT;
03862 break;
03863
03864 case '(':
03865 if (PPEEK_IS('?') &&
03866 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
03867 PINC;
03868 if (PPEEK_IS('#')) {
03869 PFETCH(c);
03870 while (1) {
03871 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
03872 PFETCH(c);
03873 if (c == MC_ESC(syn)) {
03874 if (!PEND) PFETCH(c);
03875 }
03876 else {
03877 if (c == ')') break;
03878 }
03879 }
03880 goto start;
03881 }
03882 #ifdef USE_PERL_SUBEXP_CALL
03883
03884 c = PPEEK;
03885 if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) &&
03886 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
03887
03888 int gnum;
03889 UChar *name;
03890 UChar *name_end;
03891
03892 if (c == 'R' || c == '0') {
03893 PINC;
03894 if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME;
03895 PINC;
03896 name_end = name = p;
03897 gnum = 0;
03898 }
03899 else {
03900 int numref = 1;
03901 if (c == '&') {
03902 PINC;
03903 numref = 0;
03904 }
03905 name = p;
03906 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref);
03907 if (r < 0) return r;
03908 }
03909
03910 tok->type = TK_CALL;
03911 tok->u.call.name = name;
03912 tok->u.call.name_end = name_end;
03913 tok->u.call.gnum = gnum;
03914 tok->u.call.rel = 0;
03915 break;
03916 }
03917 else if ((c == '-' || c == '+') &&
03918 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
03919
03920 int gnum;
03921 UChar *name;
03922 UChar *name_end;
03923 OnigCodePoint cnext;
03924 PFETCH_READY;
03925
03926 PINC;
03927 cnext = PPEEK;
03928 if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) {
03929 if (c == '-') PUNFETCH;
03930 name = p;
03931 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1);
03932 if (r < 0) return r;
03933
03934 tok->type = TK_CALL;
03935 tok->u.call.name = name;
03936 tok->u.call.name_end = name_end;
03937 tok->u.call.gnum = gnum;
03938 tok->u.call.rel = 1;
03939 break;
03940 }
03941 }
03942 #endif
03943 #ifdef USE_CAPITAL_P_NAMED_GROUP
03944 if (PPEEK_IS('P') &&
03945 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
03946 int gnum;
03947 UChar *name;
03948 UChar *name_end;
03949 PFETCH_READY;
03950
03951 PINC;
03952 PFETCH(c);
03953 if (c == '=') {
03954 r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env);
03955 if (r < 0) return r;
03956 break;
03957 }
03958 else if (c == '>') {
03959 name = p;
03960 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0);
03961 if (r < 0) return r;
03962
03963 tok->type = TK_CALL;
03964 tok->u.call.name = name;
03965 tok->u.call.name_end = name_end;
03966 tok->u.call.gnum = gnum;
03967 tok->u.call.rel = 0;
03968 break;
03969 }
03970 PUNFETCH;
03971 }
03972 #endif
03973 PUNFETCH;
03974 }
03975
03976 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
03977 tok->type = TK_SUBEXP_OPEN;
03978 break;
03979
03980 case ')':
03981 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
03982 tok->type = TK_SUBEXP_CLOSE;
03983 break;
03984
03985 case '^':
03986 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
03987 tok->type = TK_ANCHOR;
03988 tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
03989 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
03990 break;
03991
03992 case '$':
03993 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
03994 tok->type = TK_ANCHOR;
03995 tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
03996 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
03997 break;
03998
03999 case '[':
04000 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
04001 tok->type = TK_CC_OPEN;
04002 break;
04003
04004 case ']':
04005 if (*src > env->pattern)
04006 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
04007 break;
04008
04009 case '#':
04010 if (IS_EXTEND(env->option)) {
04011 while (!PEND) {
04012 PFETCH(c);
04013 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
04014 break;
04015 }
04016 goto start;
04017 break;
04018 }
04019 break;
04020
04021 case ' ': case '\t': case '\n': case '\r': case '\f':
04022 if (IS_EXTEND(env->option))
04023 goto start;
04024 break;
04025
04026 default:
04027
04028 break;
04029 }
04030 }
04031
04032 #ifdef USE_VARIABLE_META_CHARS
04033 out:
04034 #endif
04035 *src = p;
04036 return tok->type;
04037 }
04038
04039 static int
04040 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
04041 ScanEnv* env,
04042 OnigCodePoint sb_out, const OnigCodePoint mbr[])
04043 {
04044 int i, r;
04045 OnigCodePoint j;
04046
04047 int n = ONIGENC_CODE_RANGE_NUM(mbr);
04048
04049 if (not == 0) {
04050 for (i = 0; i < n; i++) {
04051 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
04052 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
04053 if (j >= sb_out) {
04054 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
04055 r = add_code_range_to_buf(&(cc->mbuf), env, j,
04056 ONIGENC_CODE_RANGE_TO(mbr, i));
04057 if (r != 0) return r;
04058 i++;
04059 }
04060
04061 goto sb_end;
04062 }
04063 BITSET_SET_BIT_CHKDUP(cc->bs, j);
04064 }
04065 }
04066
04067 sb_end:
04068 for ( ; i < n; i++) {
04069 r = add_code_range_to_buf(&(cc->mbuf), env,
04070 ONIGENC_CODE_RANGE_FROM(mbr, i),
04071 ONIGENC_CODE_RANGE_TO(mbr, i));
04072 if (r != 0) return r;
04073 }
04074 }
04075 else {
04076 OnigCodePoint prev = 0;
04077
04078 for (i = 0; i < n; i++) {
04079 for (j = prev;
04080 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
04081 if (j >= sb_out) {
04082 goto sb_end2;
04083 }
04084 BITSET_SET_BIT_CHKDUP(cc->bs, j);
04085 }
04086 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
04087 }
04088 for (j = prev; j < sb_out; j++) {
04089 BITSET_SET_BIT_CHKDUP(cc->bs, j);
04090 }
04091
04092 sb_end2:
04093 prev = sb_out;
04094
04095 for (i = 0; i < n; i++) {
04096 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
04097 r = add_code_range_to_buf(&(cc->mbuf), env, prev,
04098 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
04099 if (r != 0) return r;
04100 }
04101 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
04102 }
04103 if (prev < 0x7fffffff) {
04104 r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff);
04105 if (r != 0) return r;
04106 }
04107 }
04108
04109 return 0;
04110 }
04111
04112 static int
04113 add_ctype_to_cc(CClassNode* cc, int ctype, int not, int char_prop, ScanEnv* env)
04114 {
04115 int maxcode, ascii_range;
04116 int c, r;
04117 const OnigCodePoint *ranges;
04118 OnigCodePoint sb_out;
04119 OnigEncoding enc = env->enc;
04120 OnigOptionType option = env->option;
04121
04122 ascii_range = IS_ASCII_RANGE(option) && (char_prop == 0);
04123
04124 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
04125 if (r == 0) {
04126 if (ascii_range) {
04127 CClassNode ccwork;
04128 initialize_cclass(&ccwork);
04129 r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env, sb_out,
04130 ranges);
04131 if (r == 0) {
04132 if (not) {
04133 r = add_code_range_to_buf0(&(ccwork.mbuf), env, 0x80, ONIG_LAST_CODE_POINT, FALSE);
04134 }
04135 else {
04136 CClassNode ccascii;
04137 initialize_cclass(&ccascii);
04138 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
04139 add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F);
04140 }
04141 else {
04142 bitset_set_range(env, ccascii.bs, 0x00, 0x7F);
04143 }
04144 r = and_cclass(&ccwork, &ccascii, env);
04145 if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf);
04146 }
04147 if (r == 0) {
04148 r = or_cclass(cc, &ccwork, env);
04149 }
04150 if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf);
04151 }
04152 }
04153 else {
04154 r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
04155 }
04156 return r;
04157 }
04158 else if (r != ONIG_NO_SUPPORT_CONFIG) {
04159 return r;
04160 }
04161
04162 maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
04163 r = 0;
04164 switch (ctype) {
04165 case ONIGENC_CTYPE_ALPHA:
04166 case ONIGENC_CTYPE_BLANK:
04167 case ONIGENC_CTYPE_CNTRL:
04168 case ONIGENC_CTYPE_DIGIT:
04169 case ONIGENC_CTYPE_LOWER:
04170 case ONIGENC_CTYPE_PUNCT:
04171 case ONIGENC_CTYPE_SPACE:
04172 case ONIGENC_CTYPE_UPPER:
04173 case ONIGENC_CTYPE_XDIGIT:
04174 case ONIGENC_CTYPE_ASCII:
04175 case ONIGENC_CTYPE_ALNUM:
04176 if (not != 0) {
04177 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04178 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
04179 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04180 }
04181 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04182 }
04183 else {
04184 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04185 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
04186 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04187 }
04188 }
04189 break;
04190
04191 case ONIGENC_CTYPE_GRAPH:
04192 case ONIGENC_CTYPE_PRINT:
04193 if (not != 0) {
04194 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04195 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)
04196 || c >= maxcode)
04197 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04198 }
04199 if (ascii_range)
04200 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04201 }
04202 else {
04203 for (c = 0; c < maxcode; c++) {
04204 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
04205 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04206 }
04207 if (! ascii_range)
04208 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04209 }
04210 break;
04211
04212 case ONIGENC_CTYPE_WORD:
04213 if (not == 0) {
04214 for (c = 0; c < maxcode; c++) {
04215 if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
04216 }
04217 if (! ascii_range)
04218 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04219 }
04220 else {
04221 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04222 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0)
04223 && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode))
04224 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04225 }
04226 if (ascii_range)
04227 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
04228 }
04229 break;
04230
04231 default:
04232 return ONIGERR_PARSER_BUG;
04233 break;
04234 }
04235
04236 return r;
04237 }
04238
04239 static int
04240 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
04241 {
04242 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
04243 #define POSIX_BRACKET_NAME_MIN_LEN 4
04244
04245 static const PosixBracketEntryType PBS[] = {
04246 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
04247 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
04248 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
04249 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
04250 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
04251 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
04252 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
04253 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
04254 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
04255 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
04256 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
04257 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
04258 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
04259 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
04260 { (UChar* )NULL, -1, 0 }
04261 };
04262
04263 const PosixBracketEntryType *pb;
04264 int not, i, r;
04265 OnigCodePoint c;
04266 OnigEncoding enc = env->enc;
04267 UChar *p = *src;
04268 PFETCH_READY;
04269
04270 if (PPEEK_IS('^')) {
04271 PINC;
04272 not = 1;
04273 }
04274 else
04275 not = 0;
04276
04277 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
04278 goto not_posix_bracket;
04279
04280 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
04281 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
04282 p = (UChar* )onigenc_step(enc, p, end, pb->len);
04283 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
04284 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
04285
04286 r = add_ctype_to_cc(cc, pb->ctype, not,
04287 IS_POSIX_BRACKET_ALL_RANGE(env->option),
04288 env);
04289 if (r != 0) return r;
04290
04291 PINC; PINC;
04292 *src = p;
04293 return 0;
04294 }
04295 }
04296
04297 not_posix_bracket:
04298 c = 0;
04299 i = 0;
04300 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
04301 PINC;
04302 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
04303 }
04304 if (c == ':' && ! PEND) {
04305 PINC;
04306 if (! PEND) {
04307 PFETCH(c);
04308 if (c == ']')
04309 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
04310 }
04311 }
04312
04313 return 1;
04314 }
04315
04316 static int
04317 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
04318 {
04319 int r;
04320 OnigCodePoint c;
04321 OnigEncoding enc = env->enc;
04322 UChar *prev, *start, *p = *src;
04323 PFETCH_READY;
04324
04325 r = 0;
04326 start = prev = p;
04327
04328 while (!PEND) {
04329 prev = p;
04330 PFETCH(c);
04331 if (c == '}') {
04332 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
04333 if (r < 0) break;
04334
04335 *src = p;
04336 return r;
04337 }
04338 else if (c == '(' || c == ')' || c == '{' || c == '|') {
04339 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
04340 break;
04341 }
04342 }
04343
04344 onig_scan_env_set_error_string(env, r, *src, prev);
04345 return r;
04346 }
04347
04348 static int
04349 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
04350 ScanEnv* env)
04351 {
04352 int r, ctype;
04353 CClassNode* cc;
04354
04355 ctype = fetch_char_property_to_ctype(src, end, env);
04356 if (ctype < 0) return ctype;
04357
04358 *np = node_new_cclass();
04359 CHECK_NULL_RETURN_MEMERR(*np);
04360 cc = NCCLASS(*np);
04361 r = add_ctype_to_cc(cc, ctype, 0, 1, env);
04362 if (r != 0) return r;
04363 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
04364
04365 return 0;
04366 }
04367
04368
/* Parser state while scanning the inside of a character class. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value has been read and is pending */
  CCS_RANGE    = 1,  /* "value-" has been read; the range end is expected */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* at the start of the class, or just after "&&" */
};
04375
/* Kind of the most recently read character-class element. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single value stored in the bitset */
  CCV_CODE_POINT = 1,  /* single value stored as a code range in mbuf */
  CCV_CLASS      = 2   /* a whole class (char type, property, POSIX bracket) */
};
04381
04382 static int
04383 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
04384 enum CCSTATE* state, ScanEnv* env)
04385 {
04386 int r;
04387
04388 if (*state == CCS_RANGE)
04389 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
04390
04391 if (*state == CCS_VALUE && *type != CCV_CLASS) {
04392 if (*type == CCV_SB)
04393 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
04394 else if (*type == CCV_CODE_POINT) {
04395 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
04396 if (r < 0) return r;
04397 }
04398 }
04399
04400 *state = CCS_VALUE;
04401 *type = CCV_CLASS;
04402 return 0;
04403 }
04404
04405 static int
04406 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
04407 int* vs_israw, int v_israw,
04408 enum CCVALTYPE intype, enum CCVALTYPE* type,
04409 enum CCSTATE* state, ScanEnv* env)
04410 {
04411 int r;
04412
04413 switch (*state) {
04414 case CCS_VALUE:
04415 if (*type == CCV_SB)
04416 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
04417 else if (*type == CCV_CODE_POINT) {
04418 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
04419 if (r < 0) return r;
04420 }
04421 break;
04422
04423 case CCS_RANGE:
04424 if (intype == *type) {
04425 if (intype == CCV_SB) {
04426 if (*vs > 0xff || v > 0xff)
04427 return ONIGERR_INVALID_CODE_POINT_VALUE;
04428
04429 if (*vs > v) {
04430 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
04431 goto ccs_range_end;
04432 else
04433 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
04434 }
04435 bitset_set_range(env, cc->bs, (int )*vs, (int )v);
04436 }
04437 else {
04438 r = add_code_range(&(cc->mbuf), env, *vs, v);
04439 if (r < 0) return r;
04440 }
04441 }
04442 else {
04443 #if 0
04444 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
04445 #endif
04446 if (*vs > v) {
04447 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
04448 goto ccs_range_end;
04449 else
04450 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
04451 }
04452 bitset_set_range(env, cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
04453 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
04454 if (r < 0) return r;
04455 #if 0
04456 }
04457 else
04458 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
04459 #endif
04460 }
04461 ccs_range_end:
04462 *state = CCS_COMPLETE;
04463 break;
04464
04465 case CCS_COMPLETE:
04466 case CCS_START:
04467 *state = CCS_VALUE;
04468 break;
04469
04470 default:
04471 break;
04472 }
04473
04474 *vs_israw = v_israw;
04475 *vs = v;
04476 *type = intype;
04477 return 0;
04478 }
04479
04480 static int
04481 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
04482 ScanEnv* env)
04483 {
04484 int in_esc;
04485 OnigCodePoint code;
04486 OnigEncoding enc = env->enc;
04487 UChar* p = from;
04488 PFETCH_READY;
04489
04490 in_esc = 0;
04491 while (! PEND) {
04492 if (ignore_escaped && in_esc) {
04493 in_esc = 0;
04494 }
04495 else {
04496 PFETCH(code);
04497 if (code == c) return 1;
04498 if (code == MC_ESC(env->syntax)) in_esc = 1;
04499 }
04500 }
04501 return 0;
04502 }
04503
04504 static int
04505 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
04506 ScanEnv* env)
04507 {
04508 int r, neg, len, fetched, and_start;
04509 OnigCodePoint v, vs;
04510 UChar *p;
04511 Node* node;
04512 CClassNode *cc, *prev_cc;
04513 CClassNode work_cc;
04514
04515 enum CCSTATE state;
04516 enum CCVALTYPE val_type, in_type;
04517 int val_israw, in_israw;
04518
04519 prev_cc = (CClassNode* )NULL;
04520 *np = NULL_NODE;
04521 r = fetch_token_in_cc(tok, src, end, env);
04522 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
04523 neg = 1;
04524 r = fetch_token_in_cc(tok, src, end, env);
04525 }
04526 else {
04527 neg = 0;
04528 }
04529
04530 if (r < 0) return r;
04531 if (r == TK_CC_CLOSE) {
04532 if (! code_exist_check((OnigCodePoint )']',
04533 *src, env->pattern_end, 1, env))
04534 return ONIGERR_EMPTY_CHAR_CLASS;
04535
04536 CC_ESC_WARN(env, (UChar* )"]");
04537 r = tok->type = TK_CHAR;
04538 }
04539
04540 *np = node = node_new_cclass();
04541 CHECK_NULL_RETURN_MEMERR(node);
04542 cc = NCCLASS(node);
04543
04544 and_start = 0;
04545 state = CCS_START;
04546 p = *src;
04547 while (r != TK_CC_CLOSE) {
04548 fetched = 0;
04549 switch (r) {
04550 case TK_CHAR:
04551 if ((tok->u.code >= SINGLE_BYTE_SIZE) ||
04552 (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) {
04553 in_type = CCV_CODE_POINT;
04554 }
04555 else if (len < 0) {
04556 r = len;
04557 goto err;
04558 }
04559 else {
04560 sb_char:
04561 in_type = CCV_SB;
04562 }
04563 v = (OnigCodePoint )tok->u.c;
04564 in_israw = 0;
04565 goto val_entry2;
04566 break;
04567
04568 case TK_RAW_BYTE:
04569
04570 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
04571 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
04572 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
04573 UChar* psave = p;
04574 int i, base = tok->base;
04575
04576 buf[0] = (UChar )tok->u.c;
04577 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
04578 r = fetch_token_in_cc(tok, &p, end, env);
04579 if (r < 0) goto err;
04580 if (r != TK_RAW_BYTE || tok->base != base) {
04581 fetched = 1;
04582 break;
04583 }
04584 buf[i] = (UChar )tok->u.c;
04585 }
04586
04587 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
04588 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
04589 goto err;
04590 }
04591
04592 len = enclen(env->enc, buf, buf+i);
04593 if (i < len) {
04594 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
04595 goto err;
04596 }
04597 else if (i > len) {
04598 p = psave;
04599 for (i = 1; i < len; i++) {
04600 r = fetch_token_in_cc(tok, &p, end, env);
04601 }
04602 fetched = 0;
04603 }
04604
04605 if (i == 1) {
04606 v = (OnigCodePoint )buf[0];
04607 goto raw_single;
04608 }
04609 else {
04610 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
04611 in_type = CCV_CODE_POINT;
04612 }
04613 }
04614 else {
04615 v = (OnigCodePoint )tok->u.c;
04616 raw_single:
04617 in_type = CCV_SB;
04618 }
04619 in_israw = 1;
04620 goto val_entry2;
04621 break;
04622
04623 case TK_CODE_POINT:
04624 v = tok->u.code;
04625 in_israw = 1;
04626 val_entry:
04627 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
04628 if (len < 0) {
04629 r = len;
04630 goto err;
04631 }
04632 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
04633 val_entry2:
04634 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
04635 &state, env);
04636 if (r != 0) goto err;
04637 break;
04638
04639 case TK_POSIX_BRACKET_OPEN:
04640 r = parse_posix_bracket(cc, &p, end, env);
04641 if (r < 0) goto err;
04642 if (r == 1) {
04643 CC_ESC_WARN(env, (UChar* )"[");
04644 p = tok->backp;
04645 v = (OnigCodePoint )tok->u.c;
04646 in_israw = 0;
04647 goto val_entry;
04648 }
04649 goto next_class;
04650 break;
04651
04652 case TK_CHAR_TYPE:
04653 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, 0, env);
04654 if (r != 0) return r;
04655
04656 next_class:
04657 r = next_state_class(cc, &vs, &val_type, &state, env);
04658 if (r != 0) goto err;
04659 break;
04660
04661 case TK_CHAR_PROPERTY:
04662 {
04663 int ctype;
04664
04665 ctype = fetch_char_property_to_ctype(&p, end, env);
04666 if (ctype < 0) return ctype;
04667 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 1, env);
04668 if (r != 0) return r;
04669 goto next_class;
04670 }
04671 break;
04672
04673 case TK_CC_RANGE:
04674 if (state == CCS_VALUE) {
04675 r = fetch_token_in_cc(tok, &p, end, env);
04676 if (r < 0) goto err;
04677 fetched = 1;
04678 if (r == TK_CC_CLOSE) {
04679 range_end_val:
04680 v = (OnigCodePoint )'-';
04681 in_israw = 0;
04682 goto val_entry;
04683 }
04684 else if (r == TK_CC_AND) {
04685 CC_ESC_WARN(env, (UChar* )"-");
04686 goto range_end_val;
04687 }
04688 state = CCS_RANGE;
04689 }
04690 else if (state == CCS_START) {
04691
04692 v = (OnigCodePoint )tok->u.c;
04693 in_israw = 0;
04694
04695 r = fetch_token_in_cc(tok, &p, end, env);
04696 if (r < 0) goto err;
04697 fetched = 1;
04698
04699 if (r == TK_CC_RANGE || and_start != 0)
04700 CC_ESC_WARN(env, (UChar* )"-");
04701
04702 goto val_entry;
04703 }
04704 else if (state == CCS_RANGE) {
04705 CC_ESC_WARN(env, (UChar* )"-");
04706 goto sb_char;
04707 }
04708 else {
04709 r = fetch_token_in_cc(tok, &p, end, env);
04710 if (r < 0) goto err;
04711 fetched = 1;
04712 if (r == TK_CC_CLOSE) goto range_end_val;
04713 else if (r == TK_CC_AND) {
04714 CC_ESC_WARN(env, (UChar* )"-");
04715 goto range_end_val;
04716 }
04717
04718 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
04719 CC_ESC_WARN(env, (UChar* )"-");
04720 goto range_end_val;
04721 }
04722 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
04723 goto err;
04724 }
04725 break;
04726
04727 case TK_CC_CC_OPEN:
04728 {
04729 Node *anode;
04730 CClassNode* acc;
04731
04732 r = parse_char_class(&anode, tok, &p, end, env);
04733 if (r == 0) {
04734 acc = NCCLASS(anode);
04735 r = or_cclass(cc, acc, env);
04736 }
04737 onig_node_free(anode);
04738 if (r != 0) goto err;
04739 }
04740 break;
04741
04742 case TK_CC_AND:
04743 {
04744 if (state == CCS_VALUE) {
04745 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
04746 &val_type, &state, env);
04747 if (r != 0) goto err;
04748 }
04749
04750 and_start = 1;
04751 state = CCS_START;
04752
04753 if (IS_NOT_NULL(prev_cc)) {
04754 r = and_cclass(prev_cc, cc, env);
04755 if (r != 0) goto err;
04756 bbuf_free(cc->mbuf);
04757 }
04758 else {
04759 prev_cc = cc;
04760 cc = &work_cc;
04761 }
04762 initialize_cclass(cc);
04763 }
04764 break;
04765
04766 case TK_EOT:
04767 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
04768 goto err;
04769 break;
04770 default:
04771 r = ONIGERR_PARSER_BUG;
04772 goto err;
04773 break;
04774 }
04775
04776 if (fetched)
04777 r = tok->type;
04778 else {
04779 r = fetch_token_in_cc(tok, &p, end, env);
04780 if (r < 0) goto err;
04781 }
04782 }
04783
04784 if (state == CCS_VALUE) {
04785 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
04786 &val_type, &state, env);
04787 if (r != 0) goto err;
04788 }
04789
04790 if (IS_NOT_NULL(prev_cc)) {
04791 r = and_cclass(prev_cc, cc, env);
04792 if (r != 0) goto err;
04793 bbuf_free(cc->mbuf);
04794 cc = prev_cc;
04795 }
04796
04797 if (neg != 0)
04798 NCCLASS_SET_NOT(cc);
04799 else
04800 NCCLASS_CLEAR_NOT(cc);
04801 if (IS_NCCLASS_NOT(cc) &&
04802 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
04803 int is_empty;
04804
04805 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
04806 if (is_empty != 0)
04807 BITSET_IS_EMPTY(cc->bs, is_empty);
04808
04809 if (is_empty == 0) {
04810 #define NEWLINE_CODE 0x0a
04811
04812 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
04813 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
04814 BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE);
04815 else {
04816 r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
04817 if (r < 0) goto err;
04818 }
04819 }
04820 }
04821 }
04822 *src = p;
04823 return 0;
04824
04825 err:
04826 if (cc != NCCLASS(*np))
04827 bbuf_free(cc->mbuf);
04828 return r;
04829 }
04830
04831 static int parse_subexp(Node** top, OnigToken* tok, int term,
04832 UChar** src, UChar* end, ScanEnv* env);
04833
04834 static int
04835 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
04836 ScanEnv* env)
04837 {
04838 int r = 0, num;
04839 Node *target, *work1 = NULL, *work2 = NULL;
04840 OnigOptionType option;
04841 OnigCodePoint c;
04842 OnigEncoding enc = env->enc;
04843
04844 #ifdef USE_NAMED_GROUP
04845 int list_capture;
04846 #endif
04847
04848 UChar* p = *src;
04849 PFETCH_READY;
04850
04851 *np = NULL;
04852 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
04853
04854 option = env->option;
04855 if (PPEEK_IS('?') &&
04856 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
04857 PINC;
04858 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
04859
04860 PFETCH(c);
04861 switch (c) {
04862 case ':':
04863 group:
04864 r = fetch_token(tok, &p, end, env);
04865 if (r < 0) return r;
04866 r = parse_subexp(np, tok, term, &p, end, env);
04867 if (r < 0) return r;
04868 *src = p;
04869 return 1;
04870 break;
04871
04872 case '=':
04873 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
04874 break;
04875 case '!':
04876 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
04877 break;
04878 case '>':
04879 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
04880 break;
04881
04882 #ifdef USE_NAMED_GROUP
04883 case '\'':
04884 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04885 goto named_group1;
04886 }
04887 else
04888 return ONIGERR_UNDEFINED_GROUP_OPTION;
04889 break;
04890
04891 #ifdef USE_CAPITAL_P_NAMED_GROUP
04892 case 'P':
04893 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
04894 PFETCH(c);
04895 if (c == '<') goto named_group1;
04896 }
04897 return ONIGERR_UNDEFINED_GROUP_OPTION;
04898 break;
04899 #endif
04900 #endif
04901
04902 case '<':
04903 PFETCH(c);
04904 if (c == '=')
04905 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
04906 else if (c == '!')
04907 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
04908 #ifdef USE_NAMED_GROUP
04909 else {
04910 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04911 UChar *name;
04912 UChar *name_end;
04913
04914 PUNFETCH;
04915 c = '<';
04916
04917 named_group1:
04918 list_capture = 0;
04919
04920 named_group2:
04921 name = p;
04922 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
04923 if (r < 0) return r;
04924
04925 num = scan_env_add_mem_entry(env);
04926 if (num < 0) return num;
04927 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
04928 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
04929
04930 r = name_add(env->reg, name, name_end, num, env);
04931 if (r != 0) return r;
04932 *np = node_new_enclose_memory(env->option, 1);
04933 CHECK_NULL_RETURN_MEMERR(*np);
04934 NENCLOSE(*np)->regnum = num;
04935 if (list_capture != 0)
04936 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
04937 env->num_named++;
04938 }
04939 else {
04940 return ONIGERR_UNDEFINED_GROUP_OPTION;
04941 }
04942 }
04943 #else
04944 else {
04945 return ONIGERR_UNDEFINED_GROUP_OPTION;
04946 }
04947 #endif
04948 break;
04949
04950 case '@':
04951 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
04952 #ifdef USE_NAMED_GROUP
04953 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04954 PFETCH(c);
04955 if (c == '<' || c == '\'') {
04956 list_capture = 1;
04957 goto named_group2;
04958 }
04959 PUNFETCH;
04960 }
04961 #endif
04962 *np = node_new_enclose_memory(env->option, 0);
04963 CHECK_NULL_RETURN_MEMERR(*np);
04964 num = scan_env_add_mem_entry(env);
04965 if (num < 0) return num;
04966 if (num >= (int )BIT_STATUS_BITS_NUM)
04967 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
04968 NENCLOSE(*np)->regnum = num;
04969 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
04970 }
04971 else {
04972 return ONIGERR_UNDEFINED_GROUP_OPTION;
04973 }
04974 break;
04975
04976 case '(':
04977 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) {
04978 UChar *name = NULL;
04979 UChar *name_end;
04980 PFETCH(c);
04981 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
04982 PUNFETCH;
04983 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1);
04984 if (r < 0) return r;
04985 if (num < 0) {
04986 num = BACKREF_REL_TO_ABS(num, env);
04987 if (num <= 0)
04988 return ONIGERR_INVALID_BACKREF;
04989 }
04990 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
04991 if (num > env->num_mem ||
04992 IS_NULL(SCANENV_MEM_NODES(env)[num]))
04993 return ONIGERR_INVALID_BACKREF;
04994 }
04995 }
04996 #ifdef USE_NAMED_GROUP
04997 else if (c == '<' || c == '\'') {
04998 int nums;
04999 int *backs;
05000
05001 name = p;
05002 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
05003 if (r < 0) return r;
05004 PFETCH(c);
05005 if (c != ')') return ONIGERR_UNDEFINED_GROUP_OPTION;
05006
05007 nums = onig_name_to_group_numbers(env->reg, name, name_end, &backs);
05008 if (nums <= 0) {
05009 onig_scan_env_set_error_string(env,
05010 ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
05011 return ONIGERR_UNDEFINED_NAME_REFERENCE;
05012 }
05013 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
05014 int i;
05015 for (i = 0; i < nums; i++) {
05016 if (backs[i] > env->num_mem ||
05017 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
05018 return ONIGERR_INVALID_BACKREF;
05019 }
05020 }
05021 num = backs[0];
05022 }
05023 #endif
05024 else
05025 return ONIGERR_INVALID_CONDITION_PATTERN;
05026 *np = node_new_enclose(ENCLOSE_CONDITION);
05027 CHECK_NULL_RETURN_MEMERR(*np);
05028 NENCLOSE(*np)->regnum = num;
05029 if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF;
05030 }
05031 else
05032 return ONIGERR_UNDEFINED_GROUP_OPTION;
05033 break;
05034
05035 #if 0
05036 case '|':
05037 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) {
05038
05039 }
05040 else
05041 return ONIGERR_UNDEFINED_GROUP_OPTION;
05042 break;
05043 #endif
05044
05045 case '^':
05046 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
05047
05048 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
05049 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
05050 ONOFF(option, ONIG_OPTION_SINGLELINE, 0);
05051 ONOFF(option, ONIG_OPTION_MULTILINE, 1);
05052 ONOFF(option, ONIG_OPTION_EXTEND, 1);
05053 PFETCH(c);
05054 }
05055 #if 0
05056 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
05057
05058 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
05059 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
05060 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
05061 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
05062 ONOFF(option, ONIG_OPTION_MULTILINE, 1);
05063 ONOFF(option, ONIG_OPTION_EXTEND, 1);
05064 PFETCH(c);
05065 }
05066 #endif
05067 else {
05068 return ONIGERR_UNDEFINED_GROUP_OPTION;
05069 }
05070
05071 #ifdef USE_POSIXLINE_OPTION
05072 case 'p':
05073 #endif
05074 case '-': case 'i': case 'm': case 's': case 'x':
05075 case 'a': case 'd': case 'l': case 'u':
05076 {
05077 int neg = 0;
05078
05079 while (1) {
05080 switch (c) {
05081 case ':':
05082 case ')':
05083 break;
05084
05085 case '-': neg = 1; break;
05086 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
05087 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
05088 case 's':
05089 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
05090 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
05091 }
05092 else
05093 return ONIGERR_UNDEFINED_GROUP_OPTION;
05094 break;
05095
05096 case 'm':
05097 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
05098 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
05099 }
05100 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
05101 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
05102 }
05103 else
05104 return ONIGERR_UNDEFINED_GROUP_OPTION;
05105 break;
05106 #ifdef USE_POSIXLINE_OPTION
05107 case 'p':
05108 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
05109 break;
05110 #endif
05111
05112 case 'a':
05113 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
05114 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
05115 (neg == 0)) {
05116 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
05117 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
05118 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
05119 }
05120 else
05121 return ONIGERR_UNDEFINED_GROUP_OPTION;
05122 break;
05123
05124 case 'u':
05125 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
05126 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
05127 (neg == 0)) {
05128 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
05129 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
05130 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
05131 }
05132 else
05133 return ONIGERR_UNDEFINED_GROUP_OPTION;
05134 break;
05135
05136 case 'd':
05137 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) &&
05138 (neg == 0)) {
05139 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
05140 }
05141 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) &&
05142 (neg == 0)) {
05143 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
05144 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
05145 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
05146 }
05147 else
05148 return ONIGERR_UNDEFINED_GROUP_OPTION;
05149 break;
05150
05151 case 'l':
05152 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) {
05153 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
05154 }
05155 else
05156 return ONIGERR_UNDEFINED_GROUP_OPTION;
05157 break;
05158
05159 default:
05160 return ONIGERR_UNDEFINED_GROUP_OPTION;
05161 }
05162
05163 if (c == ')') {
05164 *np = node_new_option(option);
05165 CHECK_NULL_RETURN_MEMERR(*np);
05166 *src = p;
05167 return 2;
05168 }
05169 else if (c == ':') {
05170 OnigOptionType prev = env->option;
05171
05172 env->option = option;
05173 r = fetch_token(tok, &p, end, env);
05174 if (r < 0) return r;
05175 r = parse_subexp(&target, tok, term, &p, end, env);
05176 env->option = prev;
05177 if (r < 0) return r;
05178 *np = node_new_option(option);
05179 CHECK_NULL_RETURN_MEMERR(*np);
05180 NENCLOSE(*np)->target = target;
05181 *src = p;
05182 return 0;
05183 }
05184
05185 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
05186 PFETCH(c);
05187 }
05188 }
05189 break;
05190
05191 default:
05192 return ONIGERR_UNDEFINED_GROUP_OPTION;
05193 }
05194 }
05195 else {
05196 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
05197 goto group;
05198
05199 *np = node_new_enclose_memory(env->option, 0);
05200 CHECK_NULL_RETURN_MEMERR(*np);
05201 num = scan_env_add_mem_entry(env);
05202 if (num < 0) return num;
05203 NENCLOSE(*np)->regnum = num;
05204 }
05205
05206 CHECK_NULL_RETURN_MEMERR(*np);
05207 r = fetch_token(tok, &p, end, env);
05208 if (r < 0) return r;
05209 r = parse_subexp(&target, tok, term, &p, end, env);
05210 if (r < 0) {
05211 onig_node_free(target);
05212 return r;
05213 }
05214
05215 if (NTYPE(*np) == NT_ANCHOR)
05216 NANCHOR(*np)->target = target;
05217 else {
05218 NENCLOSE(*np)->target = target;
05219 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
05220
05221 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
05222 if (r != 0) return r;
05223 }
05224 else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) {
05225 if (NTYPE(target) != NT_ALT) {
05226
05227 work1 = node_new_empty();
05228 if (IS_NULL(work1)) goto err;
05229 work2 = onig_node_new_alt(work1, NULL_NODE);
05230 if (IS_NULL(work2)) goto err;
05231 work1 = onig_node_new_alt(target, work2);
05232 if (IS_NULL(work1)) goto err;
05233 NENCLOSE(*np)->target = work1;
05234 }
05235 }
05236 }
05237
05238 *src = p;
05239 return 0;
05240
05241 err:
05242 onig_node_free(work1);
05243 onig_node_free(work2);
05244 onig_node_free(*np);
05245 *np = NULL;
05246 return ONIGERR_MEMORY;
05247 }
05248
/* Printable spellings of the six "popular" quantifiers.  set_quantifier()
 * indexes this table with the value returned by popular_quantifier_num()
 * when it formats nested-repeat warnings. */
static const char* const PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};
05252
/* Replacement text shown in the "was replaced with" warning emitted by
 * set_quantifier(); indexed by a ReduceTypeTable entry.  The first two
 * slots are empty placeholders: set_quantifier() handles RQ_ASIS and RQ_DEL
 * in their own switch cases and never prints this table for them
 * (presumably those are the values 0 and 1 -- confirm against the enum). */
static const char* const ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
05256
05257 static int
05258 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
05259 {
05260 QtfrNode* qn;
05261
05262 qn = NQTFR(qnode);
05263 if (qn->lower == 1 && qn->upper == 1) {
05264 return 1;
05265 }
05266
05267 switch (NTYPE(target)) {
05268 case NT_STR:
05269 if (! group) {
05270 StrNode* sn = NSTR(target);
05271 if (str_node_can_be_split(sn, env->enc)) {
05272 Node* n = str_node_split_last_char(sn, env->enc);
05273 if (IS_NOT_NULL(n)) {
05274 qn->target = n;
05275 return 2;
05276 }
05277 }
05278 }
05279 break;
05280
05281 case NT_QTFR:
05282 {
05283
05284 QtfrNode* qnt = NQTFR(target);
05285 int nestq_num = popular_quantifier_num(qn);
05286 int targetq_num = popular_quantifier_num(qnt);
05287
05288 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
05289 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
05290 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
05291 switch (ReduceTypeTable[targetq_num][nestq_num]) {
05292 case RQ_ASIS:
05293 break;
05294
05295 case RQ_DEL:
05296 if (onig_warn != onig_null_warn) {
05297 onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'",
05298 PopularQStr[targetq_num]);
05299 }
05300 goto warn_exit;
05301 break;
05302
05303 default:
05304 if (onig_warn != onig_null_warn) {
05305 onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression",
05306 PopularQStr[targetq_num], PopularQStr[nestq_num],
05307 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
05308 }
05309 goto warn_exit;
05310 break;
05311 }
05312 }
05313
05314 warn_exit:
05315 #endif
05316 if (targetq_num >= 0) {
05317 if (nestq_num >= 0) {
05318 onig_reduce_nested_quantifier(qnode, target);
05319 goto q_exit;
05320 }
05321 else if (targetq_num == 1 || targetq_num == 2) {
05322
05323 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
05324 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
05325 }
05326 }
05327 }
05328 }
05329 break;
05330
05331 default:
05332 break;
05333 }
05334
05335 qn->target = target;
05336 q_exit:
05337 return 0;
05338 }
05339
05340
05341 #ifdef USE_SHARED_CCLASS_TABLE
05342
05343 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
05344
05345
05346
05347 typedef struct {
05348 OnigEncoding enc;
05349 int not;
05350 int type;
05351 } type_cclass_key;
05352
05353 static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
05354 {
05355 if (x->type != y->type) return 1;
05356 if (x->enc != y->enc) return 1;
05357 if (x->not != y->not) return 1;
05358 return 0;
05359 }
05360
05361 static st_index_t type_cclass_hash(type_cclass_key* key)
05362 {
05363 int i, val;
05364 UChar *p;
05365
05366 val = 0;
05367
05368 p = (UChar* )&(key->enc);
05369 for (i = 0; i < (int )sizeof(key->enc); i++) {
05370 val = val * 997 + (int )*p++;
05371 }
05372
05373 p = (UChar* )(&key->type);
05374 for (i = 0; i < (int )sizeof(key->type); i++) {
05375 val = val * 997 + (int )*p++;
05376 }
05377
05378 val += key->not;
05379 return val + (val >> 5);
05380 }
05381
05382 static const struct st_hash_type type_type_cclass_hash = {
05383 type_cclass_cmp,
05384 type_cclass_hash,
05385 };
05386
05387 static st_table* OnigTypeCClassTable;
05388
05389
05390 static int
05391 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
05392 {
05393 if (IS_NOT_NULL(node)) {
05394 CClassNode* cc = NCCLASS(node);
05395 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
05396 xfree(node);
05397 }
05398
05399 if (IS_NOT_NULL(key)) xfree(key);
05400 return ST_DELETE;
05401 }
05402
05403 extern int
05404 onig_free_shared_cclass_table(void)
05405 {
05406 THREAD_ATOMIC_START;
05407 if (IS_NOT_NULL(OnigTypeCClassTable)) {
05408 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
05409 onig_st_free_table(OnigTypeCClassTable);
05410 OnigTypeCClassTable = NULL;
05411 }
05412 THREAD_ATOMIC_END;
05413
05414 return 0;
05415 }
05416
05417 #endif
05418
05419
05420 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05421 static int
05422 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
05423 {
05424 BBuf *tbuf;
05425 int r;
05426
05427 if (IS_NCCLASS_NOT(cc)) {
05428 bitset_invert(cc->bs);
05429
05430 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
05431 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
05432 if (r != 0) return r;
05433
05434 bbuf_free(cc->mbuf);
05435 cc->mbuf = tbuf;
05436 }
05437
05438 NCCLASS_CLEAR_NOT(cc);
05439 }
05440
05441 return 0;
05442 }
05443 #endif
05444
05445 typedef struct {
05446 ScanEnv* env;
05447 CClassNode* cc;
05448 Node* alt_root;
05449 Node** ptail;
05450 } IApplyCaseFoldArg;
05451
05452 static int
05453 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
05454 int to_len, void* arg)
05455 {
05456 IApplyCaseFoldArg* iarg;
05457 ScanEnv* env;
05458 CClassNode* cc;
05459 BitSetRef bs;
05460
05461 iarg = (IApplyCaseFoldArg* )arg;
05462 env = iarg->env;
05463 cc = iarg->cc;
05464 bs = cc->bs;
05465
05466 if (to_len == 1) {
05467 int is_in = onig_is_code_in_cc(env->enc, from, cc);
05468 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05469 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
05470 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
05471 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
05472 add_code_range0(&(cc->mbuf), env, *to, *to, 0);
05473 }
05474 else {
05475 BITSET_SET_BIT(bs, *to);
05476 }
05477 }
05478 #else
05479 if (is_in != 0) {
05480 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
05481 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
05482 add_code_range0(&(cc->mbuf), env, *to, *to, 0);
05483 }
05484 else {
05485 if (IS_NCCLASS_NOT(cc)) {
05486 BITSET_CLEAR_BIT(bs, *to);
05487 }
05488 else
05489 BITSET_SET_BIT(bs, *to);
05490 }
05491 }
05492 #endif
05493 }
05494 else {
05495 int r, i, len;
05496 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
05497 Node *snode = NULL_NODE;
05498
05499 if (onig_is_code_in_cc(env->enc, from, cc)
05500 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05501 && !IS_NCCLASS_NOT(cc)
05502 #endif
05503 ) {
05504 for (i = 0; i < to_len; i++) {
05505 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
05506 if (i == 0) {
05507 snode = onig_node_new_str(buf, buf + len);
05508 CHECK_NULL_RETURN_MEMERR(snode);
05509
05510
05511
05512 NSTRING_SET_AMBIG(snode);
05513 }
05514 else {
05515 r = onig_node_str_cat(snode, buf, buf + len);
05516 if (r < 0) {
05517 onig_node_free(snode);
05518 return r;
05519 }
05520 }
05521 }
05522
05523 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
05524 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
05525 iarg->ptail = &(NCDR((*(iarg->ptail))));
05526 }
05527 }
05528
05529 return 0;
05530 }
05531
05532 static int
05533 node_linebreak(Node** np, ScanEnv* env)
05534 {
05535
05536 Node* left = NULL;
05537 Node* right = NULL;
05538 Node* target1 = NULL;
05539 Node* target2 = NULL;
05540 CClassNode* cc;
05541 int num1, num2;
05542 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
05543
05544
05545 num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
05546 if (num1 < 0) return num1;
05547 num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
05548 if (num2 < 0) return num2;
05549 left = node_new_str_raw(buf, buf + num1 + num2);
05550 if (IS_NULL(left)) goto err;
05551
05552
05553 right = node_new_cclass();
05554 if (IS_NULL(right)) goto err;
05555 cc = NCCLASS(right);
05556 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
05557 add_code_range(&(cc->mbuf), env, 0x0A, 0x0D);
05558 }
05559 else {
05560 bitset_set_range(env, cc->bs, 0x0A, 0x0D);
05561 }
05562
05563
05564 if (ONIGENC_IS_UNICODE(env->enc)) {
05565
05566 add_code_range(&(cc->mbuf), env, 0x85, 0x85);
05567 add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
05568 }
05569
05570
05571 target1 = onig_node_new_alt(right, NULL_NODE);
05572 if (IS_NULL(target1)) goto err;
05573 right = NULL;
05574 target2 = onig_node_new_alt(left, target1);
05575 if (IS_NULL(target2)) goto err;
05576 left = NULL;
05577 target1 = NULL;
05578
05579
05580 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
05581 if (IS_NULL(*np)) goto err;
05582 NENCLOSE(*np)->target = target2;
05583 return ONIG_NORMAL;
05584
05585 err:
05586 onig_node_free(left);
05587 onig_node_free(right);
05588 onig_node_free(target1);
05589 onig_node_free(target2);
05590 return ONIGERR_MEMORY;
05591 }
05592
05593 static int
05594 node_extended_grapheme_cluster(Node** np, ScanEnv* env)
05595 {
05596
05597 Node* np1 = NULL;
05598 Node* np2 = NULL;
05599 Node* qn = NULL;
05600 Node* list1 = NULL;
05601 Node* list2 = NULL;
05602 int r = 0;
05603
05604 #ifdef USE_UNICODE_PROPERTIES
05605 if (ONIGENC_IS_UNICODE(env->enc)) {
05606
05607 CClassNode* cc1;
05608 CClassNode* cc2;
05609 UChar* propname = (UChar* )"M";
05610 int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII,
05611 propname, propname + 1);
05612 if (ctype >= 0) {
05613
05614 np1 = node_new_cclass();
05615 if (IS_NULL(np1)) goto err;
05616 cc1 = NCCLASS(np1);
05617 r = add_ctype_to_cc(cc1, ctype, 0, 1, env);
05618 if (r != 0) goto err;
05619 NCCLASS_SET_NOT(cc1);
05620
05621
05622 np2 = node_new_cclass();
05623 if (IS_NULL(np2)) goto err;
05624 cc2 = NCCLASS(np2);
05625 r = add_ctype_to_cc(cc2, ctype, 0, 1, env);
05626 if (r != 0) goto err;
05627
05628 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
05629 if (IS_NULL(qn)) goto err;
05630 NQTFR(qn)->target = np2;
05631 np2 = NULL;
05632
05633
05634 list2 = node_new_list(qn, NULL_NODE);
05635 if (IS_NULL(list2)) goto err;
05636 qn = NULL;
05637 list1 = node_new_list(np1, list2);
05638 if (IS_NULL(list1)) goto err;
05639 np1 = NULL;
05640 list2 = NULL;
05641
05642
05643 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
05644 if (IS_NULL(*np)) goto err;
05645 NENCLOSE(*np)->target = list1;
05646 return ONIG_NORMAL;
05647 }
05648 }
05649 #endif
05650 if (IS_NULL(*np)) {
05651
05652 OnigOptionType option;
05653 np1 = node_new_anychar();
05654 if (IS_NULL(np1)) goto err;
05655
05656 option = env->option;
05657 ONOFF(option, ONIG_OPTION_MULTILINE, 0);
05658 *np = node_new_option(option);
05659 if (IS_NULL(*np)) goto err;
05660 NENCLOSE(*np)->target = np1;
05661 }
05662 return ONIG_NORMAL;
05663
05664 err:
05665 onig_node_free(np1);
05666 onig_node_free(np2);
05667 onig_node_free(qn);
05668 onig_node_free(list1);
05669 onig_node_free(list2);
05670 return (r == 0) ? ONIGERR_MEMORY : r;
05671 }
05672
/* Count the set bits in the low 32 bits of `bits`.
 * Adjacent bit fields are summed pairwise, doubling the field width at each
 * step (1, 2, 4, 8, 16), until one field holds the total. */
static int
countbits(unsigned int bits)
{
  static const unsigned int field_mask[] = {
    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
  };
  int step;

  for (step = 0; step < 5; step++) {
    unsigned int m = field_mask[step];
    int width = 1 << step;

    bits = (bits & m) + ((bits >> width) & m);
  }
  return bits;
}
05682
05683 static int
05684 is_onechar_cclass(CClassNode* cc, OnigCodePoint* code)
05685 {
05686 const OnigCodePoint not_found = ONIG_LAST_CODE_POINT;
05687 OnigCodePoint c = not_found;
05688 int i;
05689 BBuf *bbuf = cc->mbuf;
05690
05691 if (IS_NCCLASS_NOT(cc)) return 0;
05692
05693
05694 if (IS_NOT_NULL(bbuf)) {
05695 OnigCodePoint n, *data;
05696 GET_CODE_POINT(n, bbuf->p);
05697 data = (OnigCodePoint* )(bbuf->p) + 1;
05698 if ((n == 1) && (data[0] == data[1])) {
05699
05700 c = data[0];
05701 if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) {
05702
05703 c = not_found;
05704 }
05705 }
05706 else {
05707 return 0;
05708 }
05709 }
05710
05711
05712 for (i = 0; i < BITSET_SIZE; i++) {
05713 Bits b1 = cc->bs[i];
05714 if (b1 != 0) {
05715 if (((b1 & (b1 - 1)) == 0) && (c == not_found)) {
05716 c = BITS_IN_ROOM * i + countbits(b1 - 1);
05717 } else {
05718 return 0;
05719 }
05720 }
05721 }
05722
05723 if (c != not_found) {
05724 *code = c;
05725 return 1;
05726 }
05727
05728
05729 return 0;
05730 }
05731
05732
05733 static int
05734 parse_exp(Node** np, OnigToken* tok, int term,
05735 UChar** src, UChar* end, ScanEnv* env)
05736 {
05737 int r, len, group = 0;
05738 Node* qn;
05739 Node** targetp;
05740
05741 *np = NULL;
05742 if (tok->type == (enum TokenSyms )term)
05743 goto end_of_token;
05744
05745 switch (tok->type) {
05746 case TK_ALT:
05747 case TK_EOT:
05748 end_of_token:
05749 *np = node_new_empty();
05750 return tok->type;
05751 break;
05752
05753 case TK_SUBEXP_OPEN:
05754 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
05755 if (r < 0) return r;
05756 if (r == 1) group = 1;
05757 else if (r == 2) {
05758 Node* target;
05759 OnigOptionType prev = env->option;
05760
05761 env->option = NENCLOSE(*np)->option;
05762 r = fetch_token(tok, src, end, env);
05763 if (r < 0) return r;
05764 r = parse_subexp(&target, tok, term, src, end, env);
05765 env->option = prev;
05766 if (r < 0) {
05767 onig_node_free(target);
05768 return r;
05769 }
05770 NENCLOSE(*np)->target = target;
05771 return tok->type;
05772 }
05773 break;
05774
05775 case TK_SUBEXP_CLOSE:
05776 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
05777 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
05778
05779 if (tok->escaped) goto tk_raw_byte;
05780 else goto tk_byte;
05781 break;
05782
05783 case TK_LINEBREAK:
05784 r = node_linebreak(np, env);
05785 if (r < 0) return r;
05786 break;
05787
05788 case TK_EXTENDED_GRAPHEME_CLUSTER:
05789 r = node_extended_grapheme_cluster(np, env);
05790 if (r < 0) return r;
05791 break;
05792
05793 case TK_KEEP:
05794 *np = onig_node_new_anchor(ANCHOR_KEEP);
05795 CHECK_NULL_RETURN_MEMERR(*np);
05796 break;
05797
05798 case TK_STRING:
05799 tk_byte:
05800 {
05801 *np = node_new_str(tok->backp, *src);
05802 CHECK_NULL_RETURN_MEMERR(*np);
05803
05804 string_loop:
05805 while (1) {
05806 r = fetch_token(tok, src, end, env);
05807 if (r < 0) return r;
05808 if (r == TK_STRING) {
05809 r = onig_node_str_cat(*np, tok->backp, *src);
05810 }
05811 #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
05812 else if (r == TK_CODE_POINT) {
05813 r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
05814 }
05815 #endif
05816 else {
05817 break;
05818 }
05819 if (r < 0) return r;
05820 }
05821
05822 string_end:
05823 targetp = np;
05824 goto repeat;
05825 }
05826 break;
05827
05828 case TK_RAW_BYTE:
05829 tk_raw_byte:
05830 {
05831 *np = node_new_str_raw_char((UChar )tok->u.c);
05832 CHECK_NULL_RETURN_MEMERR(*np);
05833 len = 1;
05834 while (1) {
05835 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
05836 if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) {
05837 r = fetch_token(tok, src, end, env);
05838 NSTRING_CLEAR_RAW(*np);
05839 goto string_end;
05840 }
05841 }
05842
05843 r = fetch_token(tok, src, end, env);
05844 if (r < 0) return r;
05845 if (r != TK_RAW_BYTE) {
05846
05847 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
05848 int rem;
05849 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
05850 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
05851 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
05852 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
05853 NSTRING_CLEAR_RAW(*np);
05854 goto string_end;
05855 }
05856 }
05857 #endif
05858 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
05859 }
05860
05861 r = node_str_cat_char(*np, (UChar )tok->u.c);
05862 if (r < 0) return r;
05863
05864 len++;
05865 }
05866 }
05867 break;
05868
05869 case TK_CODE_POINT:
05870 {
05871 *np = node_new_empty();
05872 CHECK_NULL_RETURN_MEMERR(*np);
05873 r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
05874 if (r != 0) return r;
05875 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
05876 NSTRING_SET_RAW(*np);
05877 #else
05878 goto string_loop;
05879 #endif
05880 }
05881 break;
05882
05883 case TK_QUOTE_OPEN:
05884 {
05885 OnigCodePoint end_op[2];
05886 UChar *qstart, *qend, *nextp;
05887
05888 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
05889 end_op[1] = (OnigCodePoint )'E';
05890 qstart = *src;
05891 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
05892 if (IS_NULL(qend)) {
05893 nextp = qend = end;
05894 }
05895 *np = node_new_str(qstart, qend);
05896 CHECK_NULL_RETURN_MEMERR(*np);
05897 *src = nextp;
05898 }
05899 break;
05900
05901 case TK_CHAR_TYPE:
05902 {
05903 switch (tok->u.prop.ctype) {
05904 case ONIGENC_CTYPE_WORD:
05905 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not,
05906 IS_ASCII_RANGE(env->option));
05907 CHECK_NULL_RETURN_MEMERR(*np);
05908 break;
05909
05910 case ONIGENC_CTYPE_SPACE:
05911 case ONIGENC_CTYPE_DIGIT:
05912 case ONIGENC_CTYPE_XDIGIT:
05913 {
05914 CClassNode* cc;
05915
05916 #ifdef USE_SHARED_CCLASS_TABLE
05917 const OnigCodePoint *mbr;
05918 OnigCodePoint sb_out;
05919
05920 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
05921 &sb_out, &mbr);
05922 if (r == 0 &&
05923 ! IS_ASCII_RANGE(env->option) &&
05924 ONIGENC_CODE_RANGE_NUM(mbr)
05925 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
05926 type_cclass_key key;
05927 type_cclass_key* new_key;
05928
05929 key.enc = env->enc;
05930 key.not = tok->u.prop.not;
05931 key.type = tok->u.prop.ctype;
05932
05933 THREAD_ATOMIC_START;
05934
05935 if (IS_NULL(OnigTypeCClassTable)) {
05936 OnigTypeCClassTable
05937 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
05938 if (IS_NULL(OnigTypeCClassTable)) {
05939 THREAD_ATOMIC_END;
05940 return ONIGERR_MEMORY;
05941 }
05942 }
05943 else {
05944 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
05945 (st_data_t* )np)) {
05946 THREAD_ATOMIC_END;
05947 break;
05948 }
05949 }
05950
05951 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
05952 sb_out, mbr);
05953 if (IS_NULL(*np)) {
05954 THREAD_ATOMIC_END;
05955 return ONIGERR_MEMORY;
05956 }
05957
05958 cc = NCCLASS(*np);
05959 NCCLASS_SET_SHARE(cc);
05960 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
05961 xmemcpy(new_key, &key, sizeof(type_cclass_key));
05962 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
05963 (st_data_t )*np);
05964
05965 THREAD_ATOMIC_END;
05966 }
05967 else {
05968 #endif
05969 *np = node_new_cclass();
05970 CHECK_NULL_RETURN_MEMERR(*np);
05971 cc = NCCLASS(*np);
05972 r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0, 0, env);
05973 if (r != 0) return r;
05974 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
05975 #ifdef USE_SHARED_CCLASS_TABLE
05976 }
05977 #endif
05978 }
05979 break;
05980
05981 default:
05982 return ONIGERR_PARSER_BUG;
05983 break;
05984 }
05985 }
05986 break;
05987
05988 case TK_CHAR_PROPERTY:
05989 r = parse_char_property(np, tok, src, end, env);
05990 if (r != 0) return r;
05991 break;
05992
05993 case TK_CC_OPEN:
05994 {
05995 CClassNode* cc;
05996 OnigCodePoint code;
05997
05998 r = parse_char_class(np, tok, src, end, env);
05999 if (r != 0) return r;
06000
06001 cc = NCCLASS(*np);
06002 if (is_onechar_cclass(cc, &code)) {
06003 onig_node_free(*np);
06004 *np = node_new_empty();
06005 CHECK_NULL_RETURN_MEMERR(*np);
06006 r = node_str_cat_codepoint(*np, env->enc, code);
06007 if (r != 0) return r;
06008 goto string_loop;
06009 }
06010 if (IS_IGNORECASE(env->option)) {
06011 IApplyCaseFoldArg iarg;
06012
06013 iarg.env = env;
06014 iarg.cc = cc;
06015 iarg.alt_root = NULL_NODE;
06016 iarg.ptail = &(iarg.alt_root);
06017
06018 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
06019 i_apply_case_fold, &iarg);
06020 if (r != 0) {
06021 onig_node_free(iarg.alt_root);
06022 return r;
06023 }
06024 if (IS_NOT_NULL(iarg.alt_root)) {
06025 Node* work = onig_node_new_alt(*np, iarg.alt_root);
06026 if (IS_NULL(work)) {
06027 onig_node_free(iarg.alt_root);
06028 return ONIGERR_MEMORY;
06029 }
06030 *np = work;
06031 }
06032 }
06033 }
06034 break;
06035
06036 case TK_ANYCHAR:
06037 *np = node_new_anychar();
06038 CHECK_NULL_RETURN_MEMERR(*np);
06039 break;
06040
06041 case TK_ANYCHAR_ANYTIME:
06042 *np = node_new_anychar();
06043 CHECK_NULL_RETURN_MEMERR(*np);
06044 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
06045 CHECK_NULL_RETURN_MEMERR(qn);
06046 NQTFR(qn)->target = *np;
06047 *np = qn;
06048 break;
06049
06050 case TK_BACKREF:
06051 len = tok->u.backref.num;
06052 *np = node_new_backref(len,
06053 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
06054 tok->u.backref.by_name,
06055 #ifdef USE_BACKREF_WITH_LEVEL
06056 tok->u.backref.exist_level,
06057 tok->u.backref.level,
06058 #endif
06059 env);
06060 CHECK_NULL_RETURN_MEMERR(*np);
06061 break;
06062
06063 #ifdef USE_SUBEXP_CALL
06064 case TK_CALL:
06065 {
06066 int gnum = tok->u.call.gnum;
06067
06068 if (gnum < 0 || tok->u.call.rel != 0) {
06069 if (gnum > 0) gnum--;
06070 gnum = BACKREF_REL_TO_ABS(gnum, env);
06071 if (gnum <= 0)
06072 return ONIGERR_INVALID_BACKREF;
06073 }
06074 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
06075 CHECK_NULL_RETURN_MEMERR(*np);
06076 env->num_call++;
06077 }
06078 break;
06079 #endif
06080
06081 case TK_ANCHOR:
06082 *np = onig_node_new_anchor(tok->u.anchor.subtype);
06083 CHECK_NULL_RETURN_MEMERR(*np);
06084 NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range;
06085 break;
06086
06087 case TK_OP_REPEAT:
06088 case TK_INTERVAL:
06089 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
06090 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
06091 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
06092 else
06093 *np = node_new_empty();
06094 }
06095 else {
06096 goto tk_byte;
06097 }
06098 break;
06099
06100 default:
06101 return ONIGERR_PARSER_BUG;
06102 break;
06103 }
06104
06105 {
06106 targetp = np;
06107
06108 re_entry:
06109 r = fetch_token(tok, src, end, env);
06110 if (r < 0) return r;
06111
06112 repeat:
06113 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
06114 if (is_invalid_quantifier_target(*targetp))
06115 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
06116
06117 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
06118 (r == TK_INTERVAL ? 1 : 0));
06119 CHECK_NULL_RETURN_MEMERR(qn);
06120 NQTFR(qn)->greedy = tok->u.repeat.greedy;
06121 r = set_quantifier(qn, *targetp, group, env);
06122 if (r < 0) {
06123 onig_node_free(qn);
06124 return r;
06125 }
06126
06127 if (tok->u.repeat.possessive != 0) {
06128 Node* en;
06129 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
06130 if (IS_NULL(en)) {
06131 onig_node_free(qn);
06132 return ONIGERR_MEMORY;
06133 }
06134 NENCLOSE(en)->target = qn;
06135 qn = en;
06136 }
06137
06138 if (r == 0) {
06139 *targetp = qn;
06140 }
06141 else if (r == 1) {
06142 onig_node_free(qn);
06143 }
06144 else if (r == 2) {
06145 Node *tmp;
06146
06147 *targetp = node_new_list(*targetp, NULL);
06148 if (IS_NULL(*targetp)) {
06149 onig_node_free(qn);
06150 return ONIGERR_MEMORY;
06151 }
06152 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
06153 if (IS_NULL(tmp)) {
06154 onig_node_free(qn);
06155 return ONIGERR_MEMORY;
06156 }
06157 targetp = &(NCAR(tmp));
06158 }
06159 goto re_entry;
06160 }
06161 }
06162
06163 return r;
06164 }
06165
06166 static int
06167 parse_branch(Node** top, OnigToken* tok, int term,
06168 UChar** src, UChar* end, ScanEnv* env)
06169 {
06170 int r;
06171 Node *node, **headp;
06172
06173 *top = NULL;
06174 r = parse_exp(&node, tok, term, src, end, env);
06175 if (r < 0) {
06176 onig_node_free(node);
06177 return r;
06178 }
06179
06180 if (r == TK_EOT || r == term || r == TK_ALT) {
06181 *top = node;
06182 }
06183 else {
06184 *top = node_new_list(node, NULL);
06185 headp = &(NCDR(*top));
06186 while (r != TK_EOT && r != term && r != TK_ALT) {
06187 r = parse_exp(&node, tok, term, src, end, env);
06188 if (r < 0) {
06189 onig_node_free(node);
06190 return r;
06191 }
06192
06193 if (NTYPE(node) == NT_LIST) {
06194 *headp = node;
06195 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
06196 headp = &(NCDR(node));
06197 }
06198 else {
06199 *headp = node_new_list(node, NULL);
06200 headp = &(NCDR(*headp));
06201 }
06202 }
06203 }
06204
06205 return r;
06206 }
06207
06208
06209 static int
06210 parse_subexp(Node** top, OnigToken* tok, int term,
06211 UChar** src, UChar* end, ScanEnv* env)
06212 {
06213 int r;
06214 Node *node, **headp;
06215
06216 *top = NULL;
06217 r = parse_branch(&node, tok, term, src, end, env);
06218 if (r < 0) {
06219 onig_node_free(node);
06220 return r;
06221 }
06222
06223 if (r == term) {
06224 *top = node;
06225 }
06226 else if (r == TK_ALT) {
06227 *top = onig_node_new_alt(node, NULL);
06228 headp = &(NCDR(*top));
06229 while (r == TK_ALT) {
06230 r = fetch_token(tok, src, end, env);
06231 if (r < 0) return r;
06232 r = parse_branch(&node, tok, term, src, end, env);
06233 if (r < 0) {
06234 onig_node_free(node);
06235 return r;
06236 }
06237
06238 *headp = onig_node_new_alt(node, NULL);
06239 headp = &(NCDR(*headp));
06240 }
06241
06242 if (tok->type != (enum TokenSyms )term)
06243 goto err;
06244 }
06245 else {
06246 onig_node_free(node);
06247 err:
06248 if (term == TK_SUBEXP_CLOSE)
06249 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
06250 else
06251 return ONIGERR_PARSER_BUG;
06252 }
06253
06254 return r;
06255 }
06256
06257 static int
06258 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
06259 {
06260 int r;
06261 OnigToken tok;
06262
06263 r = fetch_token(&tok, src, end, env);
06264 if (r < 0) return r;
06265 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
06266 if (r < 0) return r;
06267
06268 #ifdef USE_SUBEXP_CALL
06269 if (env->num_call > 0) {
06270
06271 const int num = 0;
06272 Node* np;
06273 np = node_new_enclose_memory(env->option, 0);
06274 CHECK_NULL_RETURN_MEMERR(np);
06275 NENCLOSE(np)->regnum = num;
06276 NENCLOSE(np)->target = *top;
06277 r = scan_env_set_mem_node(env, num, np);
06278 if (r != 0) return r;
06279 *top = np;
06280 }
06281 #endif
06282 return 0;
06283 }
06284
06285 extern int
06286 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
06287 regex_t* reg, ScanEnv* env)
06288 {
06289 int r;
06290 UChar* p;
06291
06292 #ifdef USE_NAMED_GROUP
06293 names_clear(reg);
06294 #endif
06295
06296 scan_env_clear(env);
06297 env->option = reg->options;
06298 env->case_fold_flag = reg->case_fold_flag;
06299 env->enc = reg->enc;
06300 env->syntax = reg->syntax;
06301 env->pattern = (UChar* )pattern;
06302 env->pattern_end = (UChar* )end;
06303 env->reg = reg;
06304
06305 *root = NULL;
06306 p = (UChar* )pattern;
06307 r = parse_regexp(root, &p, (UChar* )end, env);
06308 reg->num_mem = env->num_mem;
06309 return r;
06310 }
06311
06312 extern void
06313 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
06314 UChar* arg, UChar* arg_end)
06315 {
06316 env->error = arg;
06317 env->error_end = arg_end;
06318 }
06319