00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "internal.h"
00017 #include "regint.h"
00018 #include <ctype.h>
00019
/* Exception class raised for regexp errors (see rb_reg_raise and
 * rb_enc_reg_error_desc); it is assigned outside this part of the file. */
VALUE rb_eRegexpError;

/* Fixed-size character buffer that receives an Oniguruma error message. */
typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
/* Copy msg into an error buffer, bounded by the buffer size. */
#define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)

/* Start / end entry of capture group `no`.  Both macros expand to an access
 * through a variable named `regs`, which must be in scope at the use site. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
00027
/*
 * Byte-indexed case folding table used by rb_memcicmp(): entries 0101-0132
 * ('A'-'Z' in ASCII) hold the corresponding lower-case letter and every other
 * entry holds its own index.  The table is only compiled when the execution
 * character set encodes 'a' as 97.
 */
#if 'a' == 97
static const char casetable[] = {
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',

'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',

'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',

'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',

'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',

/* 0100 stays as is; 0101-0132 (upper-case letters) map to 0141-0172 */
'\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',

'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',

'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',

'\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',

/* 0140-0177: identity (lower-case letters already folded) */
'\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',

'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',

'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',

'\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
/* 0200-0377: identity */
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#else
/* No table is provided for other character sets: fail the build. */
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif
00078
00079 int
00080 rb_memcicmp(const void *x, const void *y, long len)
00081 {
00082 const unsigned char *p1 = x, *p2 = y;
00083 int tmp;
00084
00085 while (len--) {
00086 if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
00087 return tmp;
00088 }
00089 return 0;
00090 }
00091
/* Drop any macro named rb_memcmp so a real function can be defined below. */
#undef rb_memcmp

/*
 * Function form of rb_memcmp: compares the first len bytes of p1 and p2 with
 * memcmp() and returns its result unchanged.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, len);

    return order;
}
00099
00100 #ifdef HAVE_MEMMEM
00101 static inline long
00102 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00103 {
00104 const unsigned char *y;
00105
00106 if (y = memmem(ys, n, xs, m))
00107 return y - ys;
00108 else
00109 return -1;
00110 }
00111 #else
00112 static inline long
00113 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00114 {
00115 const unsigned char *x = xs, *xe = xs + m;
00116 const unsigned char *y = ys, *ye = ys + n;
00117 #ifndef VALUE_MAX
00118 # if SIZEOF_VALUE == 8
00119 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00120 # elif SIZEOF_VALUE == 4
00121 # define VALUE_MAX 0xFFFFFFFFUL
00122 # endif
00123 #endif
00124 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00125
00126 if (m > SIZEOF_VALUE)
00127 rb_bug("!!too long pattern string!!");
00128
00129 if (!(y = memchr(y, *x, n - m + 1)))
00130 return -1;
00131
00132
00133 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00134 hx <<= CHAR_BIT;
00135 hy <<= CHAR_BIT;
00136 hx |= *x;
00137 hy |= *y;
00138 }
00139
00140 while (hx != hy) {
00141 if (y == ye)
00142 return -1;
00143 hy <<= CHAR_BIT;
00144 hy |= *y;
00145 hy &= mask;
00146 y++;
00147 }
00148 return y - ys - m;
00149 }
00150 #endif
00151
/*
 * Search for the m-byte pattern xs inside the n-byte buffer ys using a
 * 256-entry shift table keyed by the byte that follows the current window.
 * Returns the byte offset of the first occurrence, or -1 when there is none.
 *
 * The byte after the window is only read when it lies inside ys; when the
 * last possible window fails there is nothing left to try, so the search
 * stops without touching ys[n].
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys, *ye = ys + n;
    long qstable[256];
    int i;

    /* Default shift: skip the whole window plus the byte after it. */
    for (i = 0; i < 256; ++i)
        qstable[i] = m + 1;
    /* For bytes that occur in the pattern, shift so that their last
     * occurrence lines up with the byte after the window. */
    for (; x < xe; ++x)
        qstable[*x] = xe - x;

    while (ye - y >= m) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        if (ye - y == m)
            break;              /* window ends at the buffer end */
        y += qstable[y[m]];
    }
    return -1;
}
00171
/*
 * Map the byte sequence starting at x to a shift-table index.
 *
 * A first byte below 0xC0 or at/above 0xF5 yields that byte plus 256.
 * Otherwise the first byte selects how many following bytes (1, 2 or 3) are
 * mixed in with the multiplier 8353, and the low 8 bits of the mix are
 * returned.  The function reads x[1]..x[3] as selected by the first byte and
 * does not check that they exist.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = x[0];
    int trailing, k;

    if (h < 0xC0 || h >= 0xF5)
        return h + 256;

    trailing = (h < 0xE0) ? 1 : (h < 0xF0) ? 2 : 3;
    for (k = 1; k <= trailing; k++)
        h = h * mix + x[k];

    return (unsigned char)h;
}
00203
/*
 * Variant of rb_memsearch_qs() whose 512-entry shift table is indexed by
 * rb_memsearch_qs_utf8_hash() of the bytes that follow the current window.
 * Returns the byte offset of the first occurrence of xs in ys, or -1.
 *
 * When the last possible window fails the search stops without hashing the
 * position ys + n, which lies outside the buffer.
 *
 * NOTE(review): rb_memsearch_qs_utf8_hash() may read up to three bytes after
 * the position it is given, so a truncated multibyte sequence at the very end
 * of xs or ys can still be read past.  Confirm that callers only pass buffers
 * for which that is acceptable.
 */
static inline long
rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys, *ye = ys + n;
    long qstable[512];
    int i;

    /* Default shift: skip the whole window plus the byte after it. */
    for (i = 0; i < 512; ++i) {
        qstable[i] = m + 1;
    }
    for (; x < xe; ++x) {
        qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
    }

    while (ye - y >= m) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        if (ye - y == m)
            break;              /* window ends at the buffer end */
        y += qstable[rb_memsearch_qs_utf8_hash(y + m)];
    }
    return -1;
}
00225
00226 long
00227 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00228 {
00229 const unsigned char *x = x0, *y = y0;
00230
00231 if (m > n) return -1;
00232 else if (m == n) {
00233 return memcmp(x0, y0, m) == 0 ? 0 : -1;
00234 }
00235 else if (m < 1) {
00236 return 0;
00237 }
00238 else if (m == 1) {
00239 const unsigned char *ys;
00240
00241 if (ys = memchr(y, *x, n))
00242 return ys - y;
00243 else
00244 return -1;
00245 }
00246 else if (m <= SIZEOF_VALUE) {
00247 return rb_memsearch_ss(x0, m, y0, n);
00248 }
00249 else if (enc == rb_utf8_encoding()){
00250 return rb_memsearch_qs_utf8(x0, m, y0, n);
00251 }
00252 else {
00253 return rb_memsearch_qs(x0, m, y0, n);
00254 }
00255 }
00256
/* Per-object flag bits.  REG_ENCODING_NONE is tested on a Regexp's
 * RBASIC flags (it adds the "n" suffix in rb_reg_desc and enables the
 * warning in rb_reg_prepare_enc). */
#define REG_LITERAL FL_USER5
#define REG_ENCODING_NONE FL_USER6

/* Flag tested by rb_reg_fixed_encoding_p(). */
#define KCODE_FIXED FL_USER4

/* The three Oniguruma option bits that have a letter form (i, m, x). */
#define ARG_REG_OPTION_MASK \
    (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/* Values stored in *option by rb_char_to_option_kcode(). */
#define ARG_ENCODING_FIXED 16
#define ARG_ENCODING_NONE 32
00266
00267 static int
00268 char_to_option(int c)
00269 {
00270 int val;
00271
00272 switch (c) {
00273 case 'i':
00274 val = ONIG_OPTION_IGNORECASE;
00275 break;
00276 case 'x':
00277 val = ONIG_OPTION_EXTEND;
00278 break;
00279 case 'm':
00280 val = ONIG_OPTION_MULTILINE;
00281 break;
00282 default:
00283 val = 0;
00284 break;
00285 }
00286 return val;
00287 }
00288
00289 static char *
00290 option_to_str(char str[4], int options)
00291 {
00292 char *p = str;
00293 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00294 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00295 if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00296 *p = 0;
00297 return str;
00298 }
00299
00300 extern int
00301 rb_char_to_option_kcode(int c, int *option, int *kcode)
00302 {
00303 *option = 0;
00304
00305 switch (c) {
00306 case 'n':
00307 *kcode = rb_ascii8bit_encindex();
00308 return (*option = ARG_ENCODING_NONE);
00309 case 'e':
00310 *kcode = rb_enc_find_index("EUC-JP");
00311 break;
00312 case 's':
00313 *kcode = rb_enc_find_index("Windows-31J");
00314 break;
00315 case 'u':
00316 *kcode = rb_utf8_encindex();
00317 break;
00318 default:
00319 *kcode = -1;
00320 return (*option = char_to_option(c));
00321 }
00322 *option = ARG_ENCODING_FIXED;
00323 return 1;
00324 }
00325
00326 static void
00327 rb_reg_check(VALUE re)
00328 {
00329 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00330 rb_raise(rb_eTypeError, "uninitialized Regexp");
00331 }
00332 }
00333
00334 static void
00335 rb_reg_expr_str(VALUE str, const char *s, long len,
00336 rb_encoding *enc, rb_encoding *resenc)
00337 {
00338 const char *p, *pend;
00339 int cr = ENC_CODERANGE_UNKNOWN;
00340 int need_escape = 0;
00341 int c, clen;
00342
00343 p = s; pend = p + len;
00344 rb_str_coderange_scan_restartable(p, pend, enc, &cr);
00345 if (rb_enc_asciicompat(enc) &&
00346 (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) {
00347 while (p < pend) {
00348 c = rb_enc_ascget(p, pend, &clen, enc);
00349 if (c == -1) {
00350 if (enc == resenc) {
00351 p += mbclen(p, pend, enc);
00352 }
00353 else {
00354 need_escape = 1;
00355 break;
00356 }
00357 }
00358 else if (c != '/' && rb_enc_isprint(c, enc)) {
00359 p += clen;
00360 }
00361 else {
00362 need_escape = 1;
00363 break;
00364 }
00365 }
00366 }
00367 else {
00368 need_escape = 1;
00369 }
00370
00371 if (!need_escape) {
00372 rb_str_buf_cat(str, s, len);
00373 }
00374 else {
00375 int unicode_p = rb_enc_unicode_p(enc);
00376 p = s;
00377 while (p<pend) {
00378 c = rb_enc_ascget(p, pend, &clen, enc);
00379 if (c == '\\' && p+clen < pend) {
00380 int n = clen + mbclen(p+clen, pend, enc);
00381 rb_str_buf_cat(str, p, n);
00382 p += n;
00383 continue;
00384 }
00385 else if (c == '/') {
00386 char c = '\\';
00387 rb_str_buf_cat(str, &c, 1);
00388 rb_str_buf_cat(str, p, clen);
00389 }
00390 else if (c == -1) {
00391 clen = rb_enc_precise_mbclen(p, pend, enc);
00392 if (!MBCLEN_CHARFOUND_P(clen)) {
00393 c = (unsigned char)*p;
00394 clen = 1;
00395 goto hex;
00396 }
00397 if (resenc) {
00398 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
00399 rb_str_buf_cat_escaped_char(str, c, unicode_p);
00400 }
00401 else {
00402 clen = MBCLEN_CHARFOUND_LEN(clen);
00403 rb_str_buf_cat(str, p, clen);
00404 }
00405 }
00406 else if (rb_enc_isprint(c, enc)) {
00407 rb_str_buf_cat(str, p, clen);
00408 }
00409 else if (!rb_enc_isspace(c, enc)) {
00410 char b[8];
00411
00412 hex:
00413 snprintf(b, sizeof(b), "\\x%02X", c);
00414 rb_str_buf_cat(str, b, 4);
00415 }
00416 else {
00417 rb_str_buf_cat(str, p, clen);
00418 }
00419 p += clen;
00420 }
00421 }
00422 }
00423
00424 static VALUE
00425 rb_reg_desc(const char *s, long len, VALUE re)
00426 {
00427 rb_encoding *enc = rb_enc_get(re);
00428 VALUE str = rb_str_buf_new2("/");
00429 rb_encoding *resenc = rb_default_internal_encoding();
00430 if (resenc == NULL) resenc = rb_default_external_encoding();
00431
00432 if (re && rb_enc_asciicompat(enc)) {
00433 rb_enc_copy(str, re);
00434 }
00435 else {
00436 rb_enc_associate(str, rb_usascii_encoding());
00437 }
00438 rb_reg_expr_str(str, s, len, enc, resenc);
00439 rb_str_buf_cat2(str, "/");
00440 if (re) {
00441 char opts[4];
00442 rb_reg_check(re);
00443 if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00444 rb_str_buf_cat2(str, opts);
00445 if (RBASIC(re)->flags & REG_ENCODING_NONE)
00446 rb_str_buf_cat2(str, "n");
00447 }
00448 OBJ_INFECT(str, re);
00449 return str;
00450 }
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467 static VALUE
00468 rb_reg_source(VALUE re)
00469 {
00470 VALUE str;
00471
00472 rb_reg_check(re);
00473 str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00474 if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00475 return str;
00476 }
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490 static VALUE
00491 rb_reg_inspect(VALUE re)
00492 {
00493 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00494 return rb_any_to_s(re);
00495 }
00496 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00497 }
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
/*
 * Build the "(?<on>-<off>:<source>)" string form of re.
 *
 * If the source itself starts with an option group, the options written
 * there are folded into the emitted option letters: leading "(?opts)" groups
 * are skipped, and a source of the form "(?opts:...)" is unwrapped when the
 * inner part compiles on its own.  If that fails, the original source and
 * options are used unchanged.
 *
 * Raises TypeError for an uninitialized regexp (via rb_reg_check).
 */
static VALUE
rb_reg_to_s(VALUE re)
{
    int options, opt;
    const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
    long len;
    const UChar* ptr;
    VALUE str = rb_str_buf_new2("(?");
    char optbuf[5]; /* '-' plus up to three option letters plus NUL */
    rb_encoding *enc = rb_enc_get(re);

    rb_reg_check(re);

    rb_enc_copy(str, re);
    options = RREGEXP(re)->ptr->options;
    ptr = (UChar*)RREGEXP_SRC_PTR(re);
    len = RREGEXP_SRC_LEN(re);
  again:
    if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
        int err = 1; /* stays non-zero unless the unwrapped source compiles */
        ptr += 2;
        /* option letters to switch on */
        if ((len -= 2) > 0) {
            do {
                opt = char_to_option((int )*ptr);
                if (opt != 0) {
                    options |= opt;
                }
                else {
                    break;
                }
                ++ptr;
            } while (--len > 0);
        }
        /* option letters to switch off, introduced by '-' */
        if (len > 1 && *ptr == '-') {
            ++ptr;
            --len;
            do {
                opt = char_to_option((int )*ptr);
                if (opt != 0) {
                    options &= ~opt;
                }
                else {
                    break;
                }
                ++ptr;
            } while (--len > 0);
        }
        /* "(?opts)": skip the group and look for another prefix */
        if (*ptr == ')') {
            --len;
            ++ptr;
            goto again;
        }
        /* "(?opts:...)": try compiling the part between ':' and the final ')' */
        if (*ptr == ':' && ptr[len-1] == ')') {
            Regexp *rp;

            ++ptr;
            len -= 2;
            err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
                           enc, OnigDefaultSyntax, NULL);
            onig_free(rp);
        }
        /* could not unwrap: restore the original source and options */
        if (err) {
            options = RREGEXP(re)->ptr->options;
            ptr = (UChar*)RREGEXP_SRC_PTR(re);
            len = RREGEXP_SRC_LEN(re);
        }
    }

    /* letters of the options that are set */
    if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);

    /* unless all three are set, list the unset ones after '-' */
    if ((options & embeddable) != embeddable) {
        optbuf[0] = '-';
        option_to_str(optbuf + 1, ~options);
        rb_str_buf_cat2(str, optbuf);
    }

    rb_str_buf_cat2(str, ":");
    rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
    rb_str_buf_cat2(str, ")");
    rb_enc_copy(str, re);

    OBJ_INFECT(str, re);
    return str;
}
00604
00605 static void
00606 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00607 {
00608 volatile VALUE desc = rb_reg_desc(s, len, re);
00609
00610 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
00611 }
00612
00613 static VALUE
00614 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00615 {
00616 char opts[6];
00617 VALUE desc = rb_str_buf_new2(err);
00618 rb_encoding *resenc = rb_default_internal_encoding();
00619 if (resenc == NULL) resenc = rb_default_external_encoding();
00620
00621 rb_enc_associate(desc, enc);
00622 rb_str_buf_cat2(desc, ": /");
00623 rb_reg_expr_str(desc, s, len, enc, resenc);
00624 opts[0] = '/';
00625 option_to_str(opts + 1, options);
00626 rb_str_buf_cat2(desc, opts);
00627 return rb_exc_new3(rb_eRegexpError, desc);
00628 }
00629
00630 static void
00631 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00632 {
00633 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00634 }
00635
00636 static VALUE
00637 rb_reg_error_desc(VALUE str, int options, const char *err)
00638 {
00639 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00640 rb_enc_get(str), options, err);
00641 }
00642
00643 static void
00644 rb_reg_raise_str(VALUE str, int options, const char *err)
00645 {
00646 rb_exc_raise(rb_reg_error_desc(str, options, err));
00647 }
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661 static VALUE
00662 rb_reg_casefold_p(VALUE re)
00663 {
00664 rb_reg_check(re);
00665 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00666 return Qfalse;
00667 }
00668
00669
00670
00671
00672
00673
00674
00675
00676
00677
00678
00679
00680
00681
00682
00683
00684
00685
00686
00687
00688
00689
00690
00691
00692
00693 static VALUE
00694 rb_reg_options_m(VALUE re)
00695 {
00696 int options = rb_reg_options(re);
00697 return INT2NUM(options);
00698 }
00699
00700 static int
00701 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00702 int back_num, int *back_refs, OnigRegex regex, void *arg)
00703 {
00704 VALUE ary = (VALUE)arg;
00705 rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
00706 return 0;
00707 }
00708
00709
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725 static VALUE
00726 rb_reg_names(VALUE re)
00727 {
00728 VALUE ary = rb_ary_new();
00729 rb_reg_check(re);
00730 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00731 return ary;
00732 }
00733
00734 static int
00735 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00736 int back_num, int *back_refs, OnigRegex regex, void *arg)
00737 {
00738 VALUE hash = (VALUE)arg;
00739 VALUE ary = rb_ary_new2(back_num);
00740 int i;
00741
00742 for (i = 0; i < back_num; i++)
00743 rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00744
00745 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00746
00747 return 0;
00748 }
00749
00750
00751
00752
00753
00754
00755
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765
00766
00767
00768
00769
00770
00771
00772 static VALUE
00773 rb_reg_named_captures(VALUE re)
00774 {
00775 VALUE hash = rb_hash_new();
00776 rb_reg_check(re);
00777 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00778 return hash;
00779 }
00780
00781 static int
00782 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00783 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00784 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00785 {
00786 int r;
00787
00788 *reg = (regex_t* )malloc(sizeof(regex_t));
00789 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00790
00791 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00792 if (r) goto err;
00793
00794 r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00795 if (r) {
00796 err:
00797 onig_free(*reg);
00798 *reg = NULL;
00799 }
00800 return r;
00801 }
00802
00803 static Regexp*
00804 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00805 const char *sourcefile, int sourceline)
00806 {
00807 Regexp *rp;
00808 int r;
00809 OnigErrorInfo einfo;
00810
00811
00812
00813
00814
00815
00816
00817
00818 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00819 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00820 if (r) {
00821 onig_error_code_to_str((UChar*)err, r, &einfo);
00822 return 0;
00823 }
00824 return rp;
00825 }
00826
00827
00828
00829
00830
00831
00832
00833
00834
00835
00836
00837
00838
00839
/* Class handed to match_alloc() when rb_reg_search() creates a new match
 * object; it is assigned outside this part of the file. */
VALUE rb_cMatch;
00841
00842 static VALUE
00843 match_alloc(VALUE klass)
00844 {
00845 NEWOBJ_OF(match, struct RMatch, klass, T_MATCH);
00846
00847 match->str = 0;
00848 match->rmatch = 0;
00849 match->regexp = 0;
00850 match->rmatch = ALLOC(struct rmatch);
00851 MEMZERO(match->rmatch, struct rmatch, 1);
00852
00853 return (VALUE)match;
00854 }
00855
/* A byte offset together with the character offset computed for it. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator that orders pair_t entries by byte_pos.
 * Where long is wider than int the difference is reduced to -1, 0 or 1;
 * otherwise the difference itself is returned.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const pair_t *lhs = pair1;
    const pair_t *rhs = pair2;
    long diff = lhs->byte_pos - rhs->byte_pos;
#if SIZEOF_LONG > SIZEOF_INT
    if (diff > 0)
        return 1;
    if (diff < 0)
        return -1;
    return 0;
#else
    return (int)diff;
#endif
}
00871
00872 static void
00873 update_char_offset(VALUE match)
00874 {
00875 struct rmatch *rm = RMATCH(match)->rmatch;
00876 struct re_registers *regs;
00877 int i, num_regs, num_pos;
00878 long c;
00879 char *s, *p, *q;
00880 rb_encoding *enc;
00881 pair_t *pairs;
00882
00883 if (rm->char_offset_updated)
00884 return;
00885
00886 regs = &rm->regs;
00887 num_regs = rm->regs.num_regs;
00888
00889 if (rm->char_offset_num_allocated < num_regs) {
00890 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00891 rm->char_offset_num_allocated = num_regs;
00892 }
00893
00894 enc = rb_enc_get(RMATCH(match)->str);
00895 if (rb_enc_mbmaxlen(enc) == 1) {
00896 for (i = 0; i < num_regs; i++) {
00897 rm->char_offset[i].beg = BEG(i);
00898 rm->char_offset[i].end = END(i);
00899 }
00900 rm->char_offset_updated = 1;
00901 return;
00902 }
00903
00904 pairs = ALLOCA_N(pair_t, num_regs*2);
00905 num_pos = 0;
00906 for (i = 0; i < num_regs; i++) {
00907 if (BEG(i) < 0)
00908 continue;
00909 pairs[num_pos++].byte_pos = BEG(i);
00910 pairs[num_pos++].byte_pos = END(i);
00911 }
00912 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00913
00914 s = p = RSTRING_PTR(RMATCH(match)->str);
00915 c = 0;
00916 for (i = 0; i < num_pos; i++) {
00917 q = s + pairs[i].byte_pos;
00918 c += rb_enc_strlen(p, q, enc);
00919 pairs[i].char_pos = c;
00920 p = q;
00921 }
00922
00923 for (i = 0; i < num_regs; i++) {
00924 pair_t key, *found;
00925 if (BEG(i) < 0) {
00926 rm->char_offset[i].beg = -1;
00927 rm->char_offset[i].end = -1;
00928 continue;
00929 }
00930
00931 key.byte_pos = BEG(i);
00932 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00933 rm->char_offset[i].beg = found->char_pos;
00934
00935 key.byte_pos = END(i);
00936 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00937 rm->char_offset[i].end = found->char_pos;
00938 }
00939
00940 rm->char_offset_updated = 1;
00941 }
00942
00943 static void
00944 match_check(VALUE match)
00945 {
00946 if (!RMATCH(match)->regexp) {
00947 rb_raise(rb_eTypeError, "uninitialized Match");
00948 }
00949 }
00950
00951
00952 static VALUE
00953 match_init_copy(VALUE obj, VALUE orig)
00954 {
00955 struct rmatch *rm;
00956
00957 if (!OBJ_INIT_COPY(obj, orig)) return obj;
00958
00959 RMATCH(obj)->str = RMATCH(orig)->str;
00960 RMATCH(obj)->regexp = RMATCH(orig)->regexp;
00961
00962 rm = RMATCH(obj)->rmatch;
00963 onig_region_copy(&rm->regs, RMATCH_REGS(orig));
00964
00965 if (!RMATCH(orig)->rmatch->char_offset_updated) {
00966 rm->char_offset_updated = 0;
00967 }
00968 else {
00969 if (rm->char_offset_num_allocated < rm->regs.num_regs) {
00970 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
00971 rm->char_offset_num_allocated = rm->regs.num_regs;
00972 }
00973 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
00974 struct rmatch_offset, rm->regs.num_regs);
00975 rm->char_offset_updated = 1;
00976 }
00977
00978 return obj;
00979 }
00980
00981
00982
00983
00984
00985
00986
00987
00988
00989
00990
00991
00992 static VALUE
00993 match_regexp(VALUE match)
00994 {
00995 match_check(match);
00996 return RMATCH(match)->regexp;
00997 }
00998
00999
01000
01001
01002
01003
01004
01005
01006
01007
01008
01009
01010
01011
01012
01013 static VALUE
01014 match_names(VALUE match)
01015 {
01016 match_check(match);
01017 return rb_reg_names(RMATCH(match)->regexp);
01018 }
01019
01020
01021
01022
01023
01024
01025
01026
01027
01028
01029
01030
01031
01032 static VALUE
01033 match_size(VALUE match)
01034 {
01035 match_check(match);
01036 return INT2FIX(RMATCH_REGS(match)->num_regs);
01037 }
01038
01039 static int
01040 match_backref_number(VALUE match, VALUE backref)
01041 {
01042 const char *name;
01043 int num;
01044
01045 struct re_registers *regs = RMATCH_REGS(match);
01046 VALUE regexp = RMATCH(match)->regexp;
01047
01048 match_check(match);
01049 switch (TYPE(backref)) {
01050 default:
01051 return NUM2INT(backref);
01052
01053 case T_SYMBOL:
01054 name = rb_id2name(SYM2ID(backref));
01055 break;
01056
01057 case T_STRING:
01058 name = StringValueCStr(backref);
01059 break;
01060 }
01061
01062 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01063 (const unsigned char*)name,
01064 (const unsigned char*)name + strlen(name),
01065 regs);
01066
01067 if (num < 1) {
01068 rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01069 }
01070
01071 return num;
01072 }
01073
01074 int
01075 rb_reg_backref_number(VALUE match, VALUE backref)
01076 {
01077 return match_backref_number(match, backref);
01078 }
01079
01080
01081
01082
01083
01084
01085
01086
01087
01088
01089
01090
01091
01092
01093
01094
01095
01096
01097
01098 static VALUE
01099 match_offset(VALUE match, VALUE n)
01100 {
01101 int i = match_backref_number(match, n);
01102 struct re_registers *regs = RMATCH_REGS(match);
01103
01104 match_check(match);
01105 if (i < 0 || regs->num_regs <= i)
01106 rb_raise(rb_eIndexError, "index %d out of matches", i);
01107
01108 if (BEG(i) < 0)
01109 return rb_assoc_new(Qnil, Qnil);
01110
01111 update_char_offset(match);
01112 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01113 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01114 }
01115
01116
01117
01118
01119
01120
01121
01122
01123
01124
01125
01126
01127
01128
01129
01130
01131
01132
01133
01134 static VALUE
01135 match_begin(VALUE match, VALUE n)
01136 {
01137 int i = match_backref_number(match, n);
01138 struct re_registers *regs = RMATCH_REGS(match);
01139
01140 match_check(match);
01141 if (i < 0 || regs->num_regs <= i)
01142 rb_raise(rb_eIndexError, "index %d out of matches", i);
01143
01144 if (BEG(i) < 0)
01145 return Qnil;
01146
01147 update_char_offset(match);
01148 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01149 }
01150
01151
01152
01153
01154
01155
01156
01157
01158
01159
01160
01161
01162
01163
01164
01165
01166
01167
01168
01169 static VALUE
01170 match_end(VALUE match, VALUE n)
01171 {
01172 int i = match_backref_number(match, n);
01173 struct re_registers *regs = RMATCH_REGS(match);
01174
01175 match_check(match);
01176 if (i < 0 || regs->num_regs <= i)
01177 rb_raise(rb_eIndexError, "index %d out of matches", i);
01178
01179 if (BEG(i) < 0)
01180 return Qnil;
01181
01182 update_char_offset(match);
01183 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01184 }
01185
/* Flag bit set on a match object by rb_match_busy().  rb_reg_search() does
 * not reuse a backref match that carries this flag. */
#define MATCH_BUSY FL_USER2

/* Set the MATCH_BUSY flag on match. */
void
rb_match_busy(VALUE match)
{
    FL_SET(match, MATCH_BUSY);
}
01193
01194
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217
01218
01219
01220
01221
01222
01223 static VALUE
01224 rb_reg_fixed_encoding_p(VALUE re)
01225 {
01226 if (FL_TEST(re, KCODE_FIXED))
01227 return Qtrue;
01228 else
01229 return Qfalse;
01230 }
01231
/*
 * Forward declaration; the definition is not in this part of the file.
 * rb_reg_prepare_re() passes the regexp source, the target encoding, a slot
 * for a fixed encoding and an error buffer, and treats a Qnil result as
 * failure with the message left in err.
 */
static VALUE
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
                  rb_encoding **fixed_enc, onig_errmsg_buffer err);
01235
01236
01237 static void
01238 reg_enc_error(VALUE re, VALUE str)
01239 {
01240 rb_raise(rb_eEncCompatError,
01241 "incompatible encoding regexp match (%s regexp with %s string)",
01242 rb_enc_name(rb_enc_get(re)),
01243 rb_enc_name(rb_enc_get(str)));
01244 }
01245
01246 static rb_encoding*
01247 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01248 {
01249 rb_encoding *enc = 0;
01250
01251 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01252 rb_raise(rb_eArgError,
01253 "invalid byte sequence in %s",
01254 rb_enc_name(rb_enc_get(str)));
01255 }
01256
01257 rb_reg_check(re);
01258 enc = rb_enc_get(str);
01259 if (!rb_enc_str_asciicompat_p(str)) {
01260 if (RREGEXP(re)->ptr->enc != enc) {
01261 reg_enc_error(re, str);
01262 }
01263 }
01264 else if (rb_reg_fixed_encoding_p(re)) {
01265 if (RREGEXP(re)->ptr->enc != enc &&
01266 (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01267 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01268 reg_enc_error(re, str);
01269 }
01270 enc = RREGEXP(re)->ptr->enc;
01271 }
01272 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01273 enc != rb_ascii8bit_encoding() &&
01274 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01275 rb_warn("regexp match /.../n against to %s string",
01276 rb_enc_name(enc));
01277 }
01278 return enc;
01279 }
01280
01281 regex_t *
01282 rb_reg_prepare_re(VALUE re, VALUE str)
01283 {
01284 regex_t *reg = RREGEXP(re)->ptr;
01285 onig_errmsg_buffer err = "";
01286 int r;
01287 OnigErrorInfo einfo;
01288 const char *pattern;
01289 VALUE unescaped;
01290 rb_encoding *fixed_enc = 0;
01291 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01292
01293 if (reg->enc == enc) return reg;
01294
01295 rb_reg_check(re);
01296 reg = RREGEXP(re)->ptr;
01297 pattern = RREGEXP_SRC_PTR(re);
01298
01299 unescaped = rb_reg_preprocess(
01300 pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01301 &fixed_enc, err);
01302
01303 if (unescaped == Qnil) {
01304 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01305 }
01306
01307 r = onig_new(®, (UChar* )RSTRING_PTR(unescaped),
01308 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01309 reg->options, enc,
01310 OnigDefaultSyntax, &einfo);
01311 if (r) {
01312 onig_error_code_to_str((UChar*)err, r, &einfo);
01313 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01314 }
01315
01316 RB_GC_GUARD(unescaped);
01317 return reg;
01318 }
01319
01320 long
01321 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01322 {
01323 long range;
01324 rb_encoding *enc;
01325 UChar *p, *string;
01326
01327 enc = rb_reg_prepare_enc(re, str, 0);
01328
01329 if (reverse) {
01330 range = -pos;
01331 }
01332 else {
01333 range = RSTRING_LEN(str) - pos;
01334 }
01335
01336 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01337 string = (UChar*)RSTRING_PTR(str);
01338
01339 if (range > 0) {
01340 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01341 }
01342 else {
01343 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01344 }
01345 return p - string;
01346 }
01347
01348 return pos;
01349 }
01350
01351 long
01352 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01353 {
01354 long result;
01355 VALUE match;
01356 struct re_registers regi, *regs = ®i;
01357 char *range = RSTRING_PTR(str);
01358 regex_t *reg;
01359 int tmpreg;
01360
01361 if (pos > RSTRING_LEN(str) || pos < 0) {
01362 rb_backref_set(Qnil);
01363 return -1;
01364 }
01365
01366 reg = rb_reg_prepare_re(re, str);
01367 tmpreg = reg != RREGEXP(re)->ptr;
01368 if (!tmpreg) RREGEXP(re)->usecnt++;
01369
01370 match = rb_backref_get();
01371 if (!NIL_P(match)) {
01372 if (FL_TEST(match, MATCH_BUSY)) {
01373 match = Qnil;
01374 }
01375 else {
01376 regs = RMATCH_REGS(match);
01377 }
01378 }
01379 if (NIL_P(match)) {
01380 MEMZERO(regs, struct re_registers, 1);
01381 }
01382 if (!reverse) {
01383 range += RSTRING_LEN(str);
01384 }
01385 result = onig_search(reg,
01386 (UChar*)(RSTRING_PTR(str)),
01387 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01388 ((UChar*)(RSTRING_PTR(str)) + pos),
01389 ((UChar*)range),
01390 regs, ONIG_OPTION_NONE);
01391 if (!tmpreg) RREGEXP(re)->usecnt--;
01392 if (tmpreg) {
01393 if (RREGEXP(re)->usecnt) {
01394 onig_free(reg);
01395 }
01396 else {
01397 onig_free(RREGEXP(re)->ptr);
01398 RREGEXP(re)->ptr = reg;
01399 }
01400 }
01401 if (result < 0) {
01402 if (regs == ®i)
01403 onig_region_free(regs, 0);
01404 if (result == ONIG_MISMATCH) {
01405 rb_backref_set(Qnil);
01406 return result;
01407 }
01408 else {
01409 onig_errmsg_buffer err = "";
01410 onig_error_code_to_str((UChar*)err, (int)result);
01411 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
01412 }
01413 }
01414
01415 if (NIL_P(match)) {
01416 match = match_alloc(rb_cMatch);
01417 onig_region_copy(RMATCH_REGS(match), regs);
01418 onig_region_free(regs, 0);
01419 }
01420 else {
01421 if (rb_safe_level() >= 3)
01422 OBJ_TAINT(match);
01423 else
01424 FL_UNSET(match, FL_TAINT);
01425 }
01426
01427 RMATCH(match)->str = rb_str_new4(str);
01428 RMATCH(match)->regexp = re;
01429 RMATCH(match)->rmatch->char_offset_updated = 0;
01430 rb_backref_set(match);
01431
01432 OBJ_INFECT(match, re);
01433 OBJ_INFECT(match, str);
01434
01435 return result;
01436 }
01437
01438 VALUE
01439 rb_reg_nth_defined(int nth, VALUE match)
01440 {
01441 struct re_registers *regs;
01442 if (NIL_P(match)) return Qnil;
01443 match_check(match);
01444 regs = RMATCH_REGS(match);
01445 if (nth >= regs->num_regs) {
01446 return Qnil;
01447 }
01448 if (nth < 0) {
01449 nth += regs->num_regs;
01450 if (nth <= 0) return Qnil;
01451 }
01452 if (BEG(nth) == -1) return Qfalse;
01453 return Qtrue;
01454 }
01455
01456 VALUE
01457 rb_reg_nth_match(int nth, VALUE match)
01458 {
01459 VALUE str;
01460 long start, end, len;
01461 struct re_registers *regs;
01462
01463 if (NIL_P(match)) return Qnil;
01464 match_check(match);
01465 regs = RMATCH_REGS(match);
01466 if (nth >= regs->num_regs) {
01467 return Qnil;
01468 }
01469 if (nth < 0) {
01470 nth += regs->num_regs;
01471 if (nth <= 0) return Qnil;
01472 }
01473 start = BEG(nth);
01474 if (start == -1) return Qnil;
01475 end = END(nth);
01476 len = end - start;
01477 str = rb_str_subseq(RMATCH(match)->str, start, len);
01478 OBJ_INFECT(str, match);
01479 return str;
01480 }
01481
01482 VALUE
01483 rb_reg_last_match(VALUE match)
01484 {
01485 return rb_reg_nth_match(0, match);
01486 }
01487
01488
01489
01490
01491
01492
01493
01494
01495
01496
01497
01498
01499
01500 VALUE
01501 rb_reg_match_pre(VALUE match)
01502 {
01503 VALUE str;
01504 struct re_registers *regs;
01505
01506 if (NIL_P(match)) return Qnil;
01507 match_check(match);
01508 regs = RMATCH_REGS(match);
01509 if (BEG(0) == -1) return Qnil;
01510 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01511 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01512 return str;
01513 }
01514
01515
01516
01517
01518
01519
01520
01521
01522
01523
01524
01525
01526
01527 VALUE
01528 rb_reg_match_post(VALUE match)
01529 {
01530 VALUE str;
01531 long pos;
01532 struct re_registers *regs;
01533
01534 if (NIL_P(match)) return Qnil;
01535 match_check(match);
01536 regs = RMATCH_REGS(match);
01537 if (BEG(0) == -1) return Qnil;
01538 str = RMATCH(match)->str;
01539 pos = END(0);
01540 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01541 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01542 return str;
01543 }
01544
01545 VALUE
01546 rb_reg_match_last(VALUE match)
01547 {
01548 int i;
01549 struct re_registers *regs;
01550
01551 if (NIL_P(match)) return Qnil;
01552 match_check(match);
01553 regs = RMATCH_REGS(match);
01554 if (BEG(0) == -1) return Qnil;
01555
01556 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01557 ;
01558 if (i == 0) return Qnil;
01559 return rb_reg_nth_match(i, match);
01560 }
01561
01562 static VALUE
01563 last_match_getter(void)
01564 {
01565 return rb_reg_last_match(rb_backref_get());
01566 }
01567
01568 static VALUE
01569 prematch_getter(void)
01570 {
01571 return rb_reg_match_pre(rb_backref_get());
01572 }
01573
01574 static VALUE
01575 postmatch_getter(void)
01576 {
01577 return rb_reg_match_post(rb_backref_get());
01578 }
01579
01580 static VALUE
01581 last_paren_match_getter(void)
01582 {
01583 return rb_reg_match_last(rb_backref_get());
01584 }
01585
01586 static VALUE
01587 match_array(VALUE match, int start)
01588 {
01589 struct re_registers *regs;
01590 VALUE ary;
01591 VALUE target;
01592 int i;
01593 int taint = OBJ_TAINTED(match);
01594
01595 match_check(match);
01596 regs = RMATCH_REGS(match);
01597 ary = rb_ary_new2(regs->num_regs);
01598 target = RMATCH(match)->str;
01599
01600 for (i=start; i<regs->num_regs; i++) {
01601 if (regs->beg[i] == -1) {
01602 rb_ary_push(ary, Qnil);
01603 }
01604 else {
01605 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01606 if (taint) OBJ_TAINT(str);
01607 rb_ary_push(ary, str);
01608 }
01609 }
01610 return ary;
01611 }
01612
01613
01614
01615
01616
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627
01628
01629
01630
01631
01632
01633
01634
01635
01636
01637
01638
01639
01640 static VALUE
01641 match_to_a(VALUE match)
01642 {
01643 return match_array(match, 0);
01644 }
01645
01646
01647
01648
01649
01650
01651
01652
01653
01654
01655
01656
01657
01658
01659 static VALUE
01660 match_captures(VALUE match)
01661 {
01662 return match_array(match, 1);
01663 }
01664
01665 static int
01666 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01667 {
01668 return onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01669 (const unsigned char* )name, (const unsigned char* )name_end, regs);
01670 }
01671
01672 NORETURN(static void name_to_backref_error(VALUE name));
01673 static void
01674 name_to_backref_error(VALUE name)
01675 {
01676 rb_raise(rb_eIndexError, "undefined group name reference: % "PRIsVALUE,
01677 name);
01678 }
01679
01680
01681
01682
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695
01696
01697
01698
01699
01700
01701
01702
01703
01704
01705
01706
01707 static VALUE
01708 match_aref(int argc, VALUE *argv, VALUE match)
01709 {
01710 VALUE idx, rest;
01711
01712 match_check(match);
01713 rb_scan_args(argc, argv, "11", &idx, &rest);
01714
01715 if (NIL_P(rest)) {
01716 if (FIXNUM_P(idx)) {
01717 if (FIX2INT(idx) >= 0) {
01718 return rb_reg_nth_match(FIX2INT(idx), match);
01719 }
01720 }
01721 else {
01722 const char *p;
01723 int num;
01724
01725 switch (TYPE(idx)) {
01726 case T_SYMBOL:
01727 idx = rb_id2str(SYM2ID(idx));
01728
01729 case T_STRING:
01730 p = StringValuePtr(idx);
01731 if (!rb_enc_compatible(RREGEXP(RMATCH(match)->regexp)->src, idx) ||
01732 (num = name_to_backref_number(RMATCH_REGS(match), RMATCH(match)->regexp,
01733 p, p + RSTRING_LEN(idx))) < 1) {
01734 name_to_backref_error(idx);
01735 }
01736 return rb_reg_nth_match(num, match);
01737
01738 default:
01739 break;
01740 }
01741 }
01742 }
01743
01744 return rb_ary_aref(argc, argv, match_to_a(match));
01745 }
01746
01747 static VALUE
01748 match_entry(VALUE match, long n)
01749 {
01750
01751 return rb_reg_nth_match((int)n, match);
01752 }
01753
01754
01755
01756
01757
01758
01759
01760
01761
01762
01763
01764
01765
01766
01767
01768 static VALUE
01769 match_values_at(int argc, VALUE *argv, VALUE match)
01770 {
01771 struct re_registers *regs;
01772
01773 match_check(match);
01774 regs = RMATCH_REGS(match);
01775 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01776 }
01777
01778
01779
01780
01781
01782
01783
01784
01785
01786
01787
01788
01789 static VALUE
01790 match_to_s(VALUE match)
01791 {
01792 VALUE str = rb_reg_last_match(match);
01793
01794 match_check(match);
01795 if (NIL_P(str)) str = rb_str_new(0,0);
01796 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01797 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01798 return str;
01799 }
01800
01801
01802
01803
01804
01805
01806
01807
01808
01809
01810
01811
01812 static VALUE
01813 match_string(VALUE match)
01814 {
01815 match_check(match);
01816 return RMATCH(match)->str;
01817 }
01818
/*
 * One slot per register, filled in by match_inspect_name_iter() and read
 * by match_inspect(): the name attached to a group, if any.
 */
struct backref_name_tag {
    const UChar *name;  /* start of the group name; NULL (zeroed) when unnamed */
    long len;           /* length of the name in bytes (name_end - name) */
};
01823
01824 static int
01825 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01826 int back_num, int *back_refs, OnigRegex regex, void *arg0)
01827 {
01828 struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01829 int i;
01830
01831 for (i = 0; i < back_num; i++) {
01832 arg[back_refs[i]].name = name;
01833 arg[back_refs[i]].len = name_end - name;
01834 }
01835 return 0;
01836 }
01837
01838
01839
01840
01841
01842
01843
01844
01845
01846
01847
01848
01849
01850
01851
01852
01853
01854
01855
01856
01857
01858 static VALUE
01859 match_inspect(VALUE match)
01860 {
01861 const char *cname = rb_obj_classname(match);
01862 VALUE str;
01863 int i;
01864 struct re_registers *regs = RMATCH_REGS(match);
01865 int num_regs = regs->num_regs;
01866 struct backref_name_tag *names;
01867 VALUE regexp = RMATCH(match)->regexp;
01868
01869 if (regexp == 0) {
01870 return rb_sprintf("#<%s:%p>", cname, (void*)match);
01871 }
01872
01873 names = ALLOCA_N(struct backref_name_tag, num_regs);
01874 MEMZERO(names, struct backref_name_tag, num_regs);
01875
01876 onig_foreach_name(RREGEXP(regexp)->ptr,
01877 match_inspect_name_iter, names);
01878
01879 str = rb_str_buf_new2("#<");
01880 rb_str_buf_cat2(str, cname);
01881
01882 for (i = 0; i < num_regs; i++) {
01883 VALUE v;
01884 rb_str_buf_cat2(str, " ");
01885 if (0 < i) {
01886 if (names[i].name)
01887 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01888 else {
01889 rb_str_catf(str, "%d", i);
01890 }
01891 rb_str_buf_cat2(str, ":");
01892 }
01893 v = rb_reg_nth_match(i, match);
01894 if (v == Qnil)
01895 rb_str_buf_cat2(str, "nil");
01896 else
01897 rb_str_buf_append(str, rb_str_inspect(v));
01898 }
01899 rb_str_buf_cat2(str, ">");
01900
01901 return str;
01902 }
01903
/* Class object that rb_reg_alloc() passes to rb_reg_s_alloc() for new regexps. */
VALUE rb_cRegexp;
01905
/*
 * Decodes one backslash escape at *pp into a single byte value.
 *
 * pp:  in/out cursor; must point at a backslash.  Advanced past the
 *      escape only on success.
 * end: one past the last readable byte.
 * err: receives an error message on failure.
 *
 * Handles the simple escapes (\\ \n \t \r \f \v \a \e), up to three octal
 * digits, \x with one or two hex digits, and the \M-, \C- and \c prefixes,
 * which may be combined with each other and with a further escape.
 *
 * Returns the byte value (0..0xff), or -1 with +err+ filled in.
 */
static int
read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
{
    const char *p = *pp;
    int code;
    int meta_prefix = 0, ctrl_prefix = 0;
    size_t len;

    if (p == end || *p++ != '\\') {
        errcpy(err, "too short escaped multibyte character");
        return -1;
    }

  again:
    /* re-entered after a \M-, \C- or \c prefix that is followed by '\\' */
    if (p == end) {
        errcpy(err, "too short escape sequence");
        return -1;
    }
    switch (*p++) {
      case '\\': code = '\\'; break;
      case 'n': code = '\n'; break;
      case 't': code = '\t'; break;
      case 'r': code = '\r'; break;
      case 'f': code = '\f'; break;
      case 'v': code = '\013'; break;
      case 'a': code = '\007'; break;
      case 'e': code = '\033'; break;

      /* octal: step back so the first digit is scanned too; at most 3 digits */
      case '0': case '1': case '2': case '3':
      case '4': case '5': case '6': case '7':
        p--;
        code = scan_oct(p, end < p+3 ? end-p : 3, &len);
        p += len;
        break;

      case 'x': /* hex: one or two digits */
        code = scan_hex(p, end < p+2 ? end-p : 2, &len);
        if (len < 1) {
            errcpy(err, "invalid hex escape");
            return -1;
        }
        p += len;
        break;

      case 'M': /* meta prefix: needs '-' and then a byte with the high bit clear */
        if (meta_prefix) {
            errcpy(err, "duplicate meta escape");
            return -1;
        }
        meta_prefix = 1;
        if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
            if (*p == '\\') {
                p++;
                goto again;
            }
            else {
                code = *p++;
                break;
            }
        }
        errcpy(err, "too short meta escape");
        return -1;

      case 'C': /* long control prefix: needs '-', then continues as 'c' */
        if (p == end || *p++ != '-') {
            errcpy(err, "too short control escape");
            return -1;
        }
        /* fall through */
      case 'c': /* control prefix: followed by a byte with the high bit clear */
        if (ctrl_prefix) {
            errcpy(err, "duplicate control escape");
            return -1;
        }
        ctrl_prefix = 1;
        if (p < end && (*p & 0x80) == 0) {
            if (*p == '\\') {
                p++;
                goto again;
            }
            else {
                code = *p++;
                break;
            }
        }
        errcpy(err, "too short control escape");
        return -1;

      default:
        errcpy(err, "unexpected escape sequence");
        return -1;
    }
    /* e.g. a three-digit octal escape can exceed one byte */
    if (code < 0 || 0xff < code) {
        errcpy(err, "invalid escape code");
        return -1;
    }

    /* apply the prefixes collected on the way: control masks, meta sets bit 7 */
    if (ctrl_prefix)
        code &= 0x1f;
    if (meta_prefix)
        code |= 0x80;

    *pp = p;
    return code;
}
02011
02012 static int
02013 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02014 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02015 {
02016 const char *p = *pp;
02017 int chmaxlen = rb_enc_mbmaxlen(enc);
02018 char *chbuf = ALLOCA_N(char, chmaxlen);
02019 int chlen = 0;
02020 int byte;
02021 int l;
02022
02023 memset(chbuf, 0, chmaxlen);
02024
02025 byte = read_escaped_byte(&p, end, err);
02026 if (byte == -1) {
02027 return -1;
02028 }
02029
02030 chbuf[chlen++] = byte;
02031 while (chlen < chmaxlen &&
02032 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02033 byte = read_escaped_byte(&p, end, err);
02034 if (byte == -1) {
02035 return -1;
02036 }
02037 chbuf[chlen++] = byte;
02038 }
02039
02040 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02041 if (MBCLEN_INVALID_P(l)) {
02042 errcpy(err, "invalid multibyte escape");
02043 return -1;
02044 }
02045 if (1 < chlen || (chbuf[0] & 0x80)) {
02046 rb_str_buf_cat(buf, chbuf, chlen);
02047
02048 if (*encp == 0)
02049 *encp = enc;
02050 else if (*encp != enc) {
02051 errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02052 return -1;
02053 }
02054 }
02055 else {
02056 char escbuf[5];
02057 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02058 rb_str_buf_cat(buf, escbuf, 4);
02059 }
02060 *pp = p;
02061 return 0;
02062 }
02063
02064 static int
02065 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02066 {
02067 if ((0xd800 <= code && code <= 0xdfff) ||
02068 0x10ffff < code) {
02069 errcpy(err, "invalid Unicode range");
02070 return -1;
02071 }
02072 return 0;
02073 }
02074
02075 static int
02076 append_utf8(unsigned long uv,
02077 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02078 {
02079 if (check_unicode_range(uv, err) != 0)
02080 return -1;
02081 if (uv < 0x80) {
02082 char escbuf[5];
02083 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02084 rb_str_buf_cat(buf, escbuf, 4);
02085 }
02086 else {
02087 int len;
02088 char utf8buf[6];
02089 len = rb_uv_to_utf8(utf8buf, uv);
02090 rb_str_buf_cat(buf, utf8buf, len);
02091
02092 if (*encp == 0)
02093 *encp = rb_utf8_encoding();
02094 else if (*encp != rb_utf8_encoding()) {
02095 errcpy(err, "UTF-8 character in non UTF-8 regexp");
02096 return -1;
02097 }
02098 }
02099 return 0;
02100 }
02101
02102 static int
02103 unescape_unicode_list(const char **pp, const char *end,
02104 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02105 {
02106 const char *p = *pp;
02107 int has_unicode = 0;
02108 unsigned long code;
02109 size_t len;
02110
02111 while (p < end && ISSPACE(*p)) p++;
02112
02113 while (1) {
02114 code = ruby_scan_hex(p, end-p, &len);
02115 if (len == 0)
02116 break;
02117 if (6 < len) {
02118 errcpy(err, "invalid Unicode range");
02119 return -1;
02120 }
02121 p += len;
02122 if (append_utf8(code, buf, encp, err) != 0)
02123 return -1;
02124 has_unicode = 1;
02125
02126 while (p < end && ISSPACE(*p)) p++;
02127 }
02128
02129 if (has_unicode == 0) {
02130 errcpy(err, "invalid Unicode list");
02131 return -1;
02132 }
02133
02134 *pp = p;
02135
02136 return 0;
02137 }
02138
02139 static int
02140 unescape_unicode_bmp(const char **pp, const char *end,
02141 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02142 {
02143 const char *p = *pp;
02144 size_t len;
02145 unsigned long code;
02146
02147 if (end < p+4) {
02148 errcpy(err, "invalid Unicode escape");
02149 return -1;
02150 }
02151 code = ruby_scan_hex(p, 4, &len);
02152 if (len != 4) {
02153 errcpy(err, "invalid Unicode escape");
02154 return -1;
02155 }
02156 if (append_utf8(code, buf, encp, err) != 0)
02157 return -1;
02158 *pp = p + 4;
02159 return 0;
02160 }
02161
/*
 * Copies the regexp source [p, end) into +buf+, rewriting escapes that
 * denote non-ASCII bytes or Unicode code points into their literal form.
 *
 * enc:          encoding used to split the source into characters.
 * encp:         in/out; set to the encoding the content forces (+enc+ for
 *               raw or byte-escaped non-ASCII, UTF-8 for \u escapes).  A
 *               conflict with an already fixed encoding is an error.
 * has_property: set to 1 when \p or \P is seen while *encp is still unset.
 *
 * Returns 0 on success, -1 with +err+ filled in on failure.
 */
static int
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
        VALUE buf, rb_encoding **encp, int *has_property,
        onig_errmsg_buffer err)
{
    char c;
    char smallbuf[2];

    while (p < end) {
        int chlen = rb_enc_precise_mbclen(p, end, enc);
        if (!MBCLEN_CHARFOUND_P(chlen)) {
            errcpy(err, "invalid multibyte character");
            return -1;
        }
        chlen = MBCLEN_CHARFOUND_LEN(chlen);
        /* multibyte or high-bit character: copy verbatim and fix the encoding */
        if (1 < chlen || (*p & 0x80)) {
            rb_str_buf_cat(buf, p, chlen);
            p += chlen;
            if (*encp == 0)
                *encp = enc;
            else if (*encp != enc) {
                errcpy(err, "non ASCII character in UTF-8 regexp");
                return -1;
            }
            continue;
        }

        switch (c = *p++) {
          case '\\':
            if (p == end) {
                errcpy(err, "too short escape sequence");
                return -1;
            }
            switch (c = *p++) {
              case '1': case '2': case '3':
              case '4': case '5': case '6': case '7':
                {
                    size_t octlen;
                    if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
                        /* octal value fits in 7 bits: leave the escape */
                        /* untouched for the regexp engine to interpret */
                        goto escape_asis;
                    }
                }
                /* larger octal value: fall through and decode it as a byte */

              case '0':

              case 'x':
              case 'c':
              case 'C':
              case 'M':
                /* rewind to the backslash; the helper re-reads the whole escape */
                p = p-2;
                if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
                    return -1;
                break;

              case 'u':
                if (p == end) {
                    errcpy(err, "too short escape sequence");
                    return -1;
                }
                if (*p == '{') {
                    /* braced form: a list of code points closed by '}' */
                    p++;
                    if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
                        return -1;
                    if (p == end || *p++ != '}') {
                        errcpy(err, "invalid Unicode list");
                        return -1;
                    }
                    break;
                }
                else {
                    /* unbraced form: exactly four hex digits */
                    if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
                        return -1;
                    break;
                }

              case 'p':
              case 'P':
                if (!*encp) {
                    *has_property = 1;
                }
                goto escape_asis;

              default:
escape_asis:
                /* copy the backslash and the escape character unchanged */
                smallbuf[0] = '\\';
                smallbuf[1] = c;
                rb_str_buf_cat(buf, smallbuf, 2);
                break;
            }
            break;

          default:
            rb_str_buf_cat(buf, &c, 1);
            break;
        }
    }

    return 0;
}
02267
02268 static VALUE
02269 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02270 rb_encoding **fixed_enc, onig_errmsg_buffer err)
02271 {
02272 VALUE buf;
02273 int has_property = 0;
02274
02275 buf = rb_str_buf_new(0);
02276
02277 if (rb_enc_asciicompat(enc))
02278 *fixed_enc = 0;
02279 else {
02280 *fixed_enc = enc;
02281 rb_enc_associate(buf, enc);
02282 }
02283
02284 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02285 return Qnil;
02286
02287 if (has_property && !*fixed_enc) {
02288 *fixed_enc = enc;
02289 }
02290
02291 if (*fixed_enc) {
02292 rb_enc_associate(buf, *fixed_enc);
02293 }
02294
02295 return buf;
02296 }
02297
02298 VALUE
02299 rb_reg_check_preprocess(VALUE str)
02300 {
02301 rb_encoding *fixed_enc = 0;
02302 onig_errmsg_buffer err = "";
02303 VALUE buf;
02304 char *p, *end;
02305 rb_encoding *enc;
02306
02307 StringValue(str);
02308 p = RSTRING_PTR(str);
02309 end = p + RSTRING_LEN(str);
02310 enc = rb_enc_get(str);
02311
02312 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02313 RB_GC_GUARD(str);
02314
02315 if (buf == Qnil) {
02316 return rb_reg_error_desc(str, 0, err);
02317 }
02318 return Qnil;
02319 }
02320
02321 static VALUE
02322 rb_reg_preprocess_dregexp(VALUE ary, int options)
02323 {
02324 rb_encoding *fixed_enc = 0;
02325 rb_encoding *regexp_enc = 0;
02326 onig_errmsg_buffer err = "";
02327 int i;
02328 VALUE result = 0;
02329 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02330
02331 if (RARRAY_LEN(ary) == 0) {
02332 rb_raise(rb_eArgError, "no arguments given");
02333 }
02334
02335 for (i = 0; i < RARRAY_LEN(ary); i++) {
02336 VALUE str = RARRAY_PTR(ary)[i];
02337 VALUE buf;
02338 char *p, *end;
02339 rb_encoding *src_enc;
02340
02341 src_enc = rb_enc_get(str);
02342 if (options & ARG_ENCODING_NONE &&
02343 src_enc != ascii8bit) {
02344 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02345 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02346 else
02347 src_enc = ascii8bit;
02348 }
02349
02350 StringValue(str);
02351 p = RSTRING_PTR(str);
02352 end = p + RSTRING_LEN(str);
02353
02354 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02355
02356 if (buf == Qnil)
02357 rb_raise(rb_eArgError, "%s", err);
02358
02359 if (fixed_enc != 0) {
02360 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02361 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02362 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02363 }
02364 regexp_enc = fixed_enc;
02365 }
02366
02367 if (!result)
02368 result = rb_str_new3(str);
02369 else
02370 rb_str_buf_append(result, str);
02371 }
02372 if (regexp_enc) {
02373 rb_enc_associate(result, regexp_enc);
02374 }
02375
02376 return result;
02377 }
02378
02379 static int
02380 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02381 int options, onig_errmsg_buffer err,
02382 const char *sourcefile, int sourceline)
02383 {
02384 struct RRegexp *re = RREGEXP(obj);
02385 VALUE unescaped;
02386 rb_encoding *fixed_enc = 0;
02387 rb_encoding *a_enc = rb_ascii8bit_encoding();
02388
02389 if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
02390 rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
02391 rb_check_frozen(obj);
02392 if (FL_TEST(obj, REG_LITERAL))
02393 rb_raise(rb_eSecurityError, "can't modify literal regexp");
02394 if (re->ptr)
02395 rb_raise(rb_eTypeError, "already initialized regexp");
02396 re->ptr = 0;
02397
02398 if (rb_enc_dummy_p(enc)) {
02399 errcpy(err, "can't make regexp with dummy encoding");
02400 return -1;
02401 }
02402
02403 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02404 if (unescaped == Qnil)
02405 return -1;
02406
02407 if (fixed_enc) {
02408 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02409 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02410 errcpy(err, "incompatible character encoding");
02411 return -1;
02412 }
02413 if (fixed_enc != a_enc) {
02414 options |= ARG_ENCODING_FIXED;
02415 enc = fixed_enc;
02416 }
02417 }
02418 else if (!(options & ARG_ENCODING_FIXED)) {
02419 enc = rb_usascii_encoding();
02420 }
02421
02422 rb_enc_associate((VALUE)re, enc);
02423 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02424 re->basic.flags |= KCODE_FIXED;
02425 }
02426 if (options & ARG_ENCODING_NONE) {
02427 re->basic.flags |= REG_ENCODING_NONE;
02428 }
02429
02430 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02431 options & ARG_REG_OPTION_MASK, err,
02432 sourcefile, sourceline);
02433 if (!re->ptr) return -1;
02434 re->src = rb_enc_str_new(s, len, enc);
02435 OBJ_FREEZE(re->src);
02436 RB_GC_GUARD(unescaped);
02437 return 0;
02438 }
02439
02440 static int
02441 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02442 const char *sourcefile, int sourceline)
02443 {
02444 int ret;
02445 rb_encoding *enc = rb_enc_get(str);
02446 if (options & ARG_ENCODING_NONE) {
02447 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02448 if (enc != ascii8bit) {
02449 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02450 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02451 return -1;
02452 }
02453 enc = ascii8bit;
02454 }
02455 }
02456 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02457 options, err, sourcefile, sourceline);
02458 OBJ_INFECT(obj, str);
02459 RB_GC_GUARD(str);
02460 return ret;
02461 }
02462
02463 static VALUE
02464 rb_reg_s_alloc(VALUE klass)
02465 {
02466 NEWOBJ_OF(re, struct RRegexp, klass, T_REGEXP);
02467
02468 re->ptr = 0;
02469 re->src = 0;
02470 re->usecnt = 0;
02471
02472 return (VALUE)re;
02473 }
02474
02475 VALUE
02476 rb_reg_alloc(void)
02477 {
02478 return rb_reg_s_alloc(rb_cRegexp);
02479 }
02480
02481 VALUE
02482 rb_reg_new_str(VALUE s, int options)
02483 {
02484 return rb_reg_init_str(rb_reg_alloc(), s, options);
02485 }
02486
02487 VALUE
02488 rb_reg_init_str(VALUE re, VALUE s, int options)
02489 {
02490 onig_errmsg_buffer err = "";
02491
02492 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02493 rb_reg_raise_str(s, options, err);
02494 }
02495
02496 return re;
02497 }
02498
02499 VALUE
02500 rb_reg_new_ary(VALUE ary, int opt)
02501 {
02502 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02503 }
02504
02505 VALUE
02506 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02507 {
02508 VALUE re = rb_reg_alloc();
02509 onig_errmsg_buffer err = "";
02510
02511 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02512 rb_enc_reg_raise(s, len, enc, options, err);
02513 }
02514
02515 return re;
02516 }
02517
02518 VALUE
02519 rb_reg_new(const char *s, long len, int options)
02520 {
02521 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02522 }
02523
02524 VALUE
02525 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02526 {
02527 VALUE re = rb_reg_alloc();
02528 onig_errmsg_buffer err = "";
02529
02530 if (!str) str = rb_str_new(0,0);
02531 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02532 rb_set_errinfo(rb_reg_error_desc(str, options, err));
02533 return Qnil;
02534 }
02535 FL_SET(re, REG_LITERAL);
02536 return re;
02537 }
02538
02539 static VALUE reg_cache;
02540
02541 VALUE
02542 rb_reg_regcomp(VALUE str)
02543 {
02544 volatile VALUE save_str = str;
02545 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02546 && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02547 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02548 return reg_cache;
02549
02550 return reg_cache = rb_reg_new_str(save_str, 0);
02551 }
02552
02553 static st_index_t reg_hash(VALUE re);
02554
02555
02556
02557
02558
02559
02560
02561 static VALUE
02562 rb_reg_hash(VALUE re)
02563 {
02564 st_index_t hashval = reg_hash(re);
02565 return LONG2FIX(hashval);
02566 }
02567
02568 static st_index_t
02569 reg_hash(VALUE re)
02570 {
02571 st_index_t hashval;
02572
02573 rb_reg_check(re);
02574 hashval = RREGEXP(re)->ptr->options;
02575 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02576 return rb_hash_end(hashval);
02577 }
02578
02579
02580
02581
02582
02583
02584
02585
02586
02587
02588
02589
02590
02591
02592
02593
02594
02595 static VALUE
02596 rb_reg_equal(VALUE re1, VALUE re2)
02597 {
02598 if (re1 == re2) return Qtrue;
02599 if (!RB_TYPE_P(re2, T_REGEXP)) return Qfalse;
02600 rb_reg_check(re1); rb_reg_check(re2);
02601 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02602 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02603 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02604 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02605 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02606 return Qtrue;
02607 }
02608 return Qfalse;
02609 }
02610
02611
02612
02613
02614
02615
02616
02617
02618
02619 static VALUE
02620 match_hash(VALUE match)
02621 {
02622 const struct re_registers *regs;
02623 st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02624
02625 rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02626 regs = RMATCH_REGS(match);
02627 hashval = rb_hash_uint(hashval, regs->num_regs);
02628 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02629 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02630 hashval = rb_hash_end(hashval);
02631 return LONG2FIX(hashval);
02632 }
02633
02634
02635
02636
02637
02638
02639
02640
02641
02642 static VALUE
02643 match_equal(VALUE match1, VALUE match2)
02644 {
02645 const struct re_registers *regs1, *regs2;
02646 if (match1 == match2) return Qtrue;
02647 if (!RB_TYPE_P(match2, T_MATCH)) return Qfalse;
02648 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02649 if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02650 regs1 = RMATCH_REGS(match1);
02651 regs2 = RMATCH_REGS(match2);
02652 if (regs1->num_regs != regs2->num_regs) return Qfalse;
02653 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02654 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02655 return Qtrue;
02656 }
02657
02658 static VALUE
02659 reg_operand(VALUE s, int check)
02660 {
02661 if (SYMBOL_P(s)) {
02662 return rb_sym_to_s(s);
02663 }
02664 else {
02665 return (check ? rb_str_to_str : rb_check_string_type)(s);
02666 }
02667 }
02668
02669 static long
02670 reg_match_pos(VALUE re, VALUE *strp, long pos)
02671 {
02672 VALUE str = *strp;
02673
02674 if (NIL_P(str)) {
02675 rb_backref_set(Qnil);
02676 return -1;
02677 }
02678 *strp = str = reg_operand(str, TRUE);
02679 if (pos != 0) {
02680 if (pos < 0) {
02681 VALUE l = rb_str_length(str);
02682 pos += NUM2INT(l);
02683 if (pos < 0) {
02684 return pos;
02685 }
02686 }
02687 pos = rb_str_offset(str, pos);
02688 }
02689 return rb_reg_search(re, str, pos, 0);
02690 }
02691
02692
02693
02694
02695
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713
02714
02715
02716
02717
02718
02719
02720
02721
02722
02723
02724
02725
02726
02727
02728
02729
02730
02731
02732
02733
02734
02735
02736
02737
02738
02739
02740 VALUE
02741 rb_reg_match(VALUE re, VALUE str)
02742 {
02743 long pos = reg_match_pos(re, &str, 0);
02744 if (pos < 0) return Qnil;
02745 pos = rb_str_sublen(str, pos);
02746 return LONG2FIX(pos);
02747 }
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764
02765
02766
02767
02768
02769
02770 VALUE
02771 rb_reg_eqq(VALUE re, VALUE str)
02772 {
02773 long start;
02774
02775 str = reg_operand(str, FALSE);
02776 if (NIL_P(str)) {
02777 rb_backref_set(Qnil);
02778 return Qfalse;
02779 }
02780 start = rb_reg_search(re, str, 0, 0);
02781 if (start < 0) {
02782 return Qfalse;
02783 }
02784 return Qtrue;
02785 }
02786
02787
02788
02789
02790
02791
02792
02793
02794
02795
02796
02797
02798
02799 VALUE
02800 rb_reg_match2(VALUE re)
02801 {
02802 long start;
02803 VALUE line = rb_lastline_get();
02804
02805 if (!RB_TYPE_P(line, T_STRING)) {
02806 rb_backref_set(Qnil);
02807 return Qnil;
02808 }
02809
02810 start = rb_reg_search(re, line, 0, 0);
02811 if (start < 0) {
02812 return Qnil;
02813 }
02814 start = rb_str_sublen(line, start);
02815 return LONG2FIX(start);
02816 }
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826
02827
02828
02829
02830
02831
02832
02833
02834
02835
02836
02837
02838
02839
02840
02841
02842
02843
02844
02845
02846
02847 static VALUE
02848 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02849 {
02850 VALUE result, str, initpos;
02851 long pos;
02852
02853 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02854 pos = NUM2LONG(initpos);
02855 }
02856 else {
02857 pos = 0;
02858 }
02859
02860 pos = reg_match_pos(re, &str, pos);
02861 if (pos < 0) {
02862 rb_backref_set(Qnil);
02863 return Qnil;
02864 }
02865 result = rb_backref_get();
02866 rb_match_busy(result);
02867 if (!NIL_P(result) && rb_block_given_p()) {
02868 return rb_yield(result);
02869 }
02870 return result;
02871 }
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896
02897
02898
02899
02900
02901
02902
02903
02904
02905 static VALUE
02906 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02907 {
02908 onig_errmsg_buffer err = "";
02909 int flags = 0;
02910 VALUE str;
02911 rb_encoding *enc;
02912 const char *ptr;
02913 long len;
02914
02915 rb_check_arity(argc, 1, 3);
02916 if (RB_TYPE_P(argv[0], T_REGEXP)) {
02917 VALUE re = argv[0];
02918
02919 if (argc > 1) {
02920 rb_warn("flags ignored");
02921 }
02922 rb_reg_check(re);
02923 flags = rb_reg_options(re);
02924 ptr = RREGEXP_SRC_PTR(re);
02925 len = RREGEXP_SRC_LEN(re);
02926 enc = rb_enc_get(re);
02927 if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02928 str = rb_enc_str_new(ptr, len, enc);
02929 rb_reg_raise_str(str, flags, err);
02930 }
02931 }
02932 else {
02933 if (argc >= 2) {
02934 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02935 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02936 }
02937 enc = 0;
02938 if (argc == 3 && !NIL_P(argv[2])) {
02939 char *kcode = StringValuePtr(argv[2]);
02940 if (kcode[0] == 'n' || kcode[0] == 'N') {
02941 enc = rb_ascii8bit_encoding();
02942 flags |= ARG_ENCODING_NONE;
02943 }
02944 else {
02945 rb_warn("encoding option is ignored - %s", kcode);
02946 }
02947 }
02948 str = argv[0];
02949 ptr = StringValuePtr(str);
02950 if (enc
02951 ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
02952 : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
02953 rb_reg_raise_str(str, flags, err);
02954 }
02955 }
02956 return self;
02957 }
02958
02959 VALUE
02960 rb_reg_quote(VALUE str)
02961 {
02962 rb_encoding *enc = rb_enc_get(str);
02963 char *s, *send, *t;
02964 VALUE tmp;
02965 int c, clen;
02966 int ascii_only = rb_enc_str_asciionly_p(str);
02967
02968 s = RSTRING_PTR(str);
02969 send = s + RSTRING_LEN(str);
02970 while (s < send) {
02971 c = rb_enc_ascget(s, send, &clen, enc);
02972 if (c == -1) {
02973 s += mbclen(s, send, enc);
02974 continue;
02975 }
02976 switch (c) {
02977 case '[': case ']': case '{': case '}':
02978 case '(': case ')': case '|': case '-':
02979 case '*': case '.': case '\\':
02980 case '?': case '+': case '^': case '$':
02981 case ' ': case '#':
02982 case '\t': case '\f': case '\v': case '\n': case '\r':
02983 goto meta_found;
02984 }
02985 s += clen;
02986 }
02987 tmp = rb_str_new3(str);
02988 if (ascii_only) {
02989 rb_enc_associate(tmp, rb_usascii_encoding());
02990 }
02991 return tmp;
02992
02993 meta_found:
02994 tmp = rb_str_new(0, RSTRING_LEN(str)*2);
02995 if (ascii_only) {
02996 rb_enc_associate(tmp, rb_usascii_encoding());
02997 }
02998 else {
02999 rb_enc_copy(tmp, str);
03000 }
03001 t = RSTRING_PTR(tmp);
03002
03003 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
03004 t += s - RSTRING_PTR(str);
03005
03006 while (s < send) {
03007 c = rb_enc_ascget(s, send, &clen, enc);
03008 if (c == -1) {
03009 int n = mbclen(s, send, enc);
03010
03011 while (n--)
03012 *t++ = *s++;
03013 continue;
03014 }
03015 s += clen;
03016 switch (c) {
03017 case '[': case ']': case '{': case '}':
03018 case '(': case ')': case '|': case '-':
03019 case '*': case '.': case '\\':
03020 case '?': case '+': case '^': case '$':
03021 case '#':
03022 t += rb_enc_mbcput('\\', t, enc);
03023 break;
03024 case ' ':
03025 t += rb_enc_mbcput('\\', t, enc);
03026 t += rb_enc_mbcput(' ', t, enc);
03027 continue;
03028 case '\t':
03029 t += rb_enc_mbcput('\\', t, enc);
03030 t += rb_enc_mbcput('t', t, enc);
03031 continue;
03032 case '\n':
03033 t += rb_enc_mbcput('\\', t, enc);
03034 t += rb_enc_mbcput('n', t, enc);
03035 continue;
03036 case '\r':
03037 t += rb_enc_mbcput('\\', t, enc);
03038 t += rb_enc_mbcput('r', t, enc);
03039 continue;
03040 case '\f':
03041 t += rb_enc_mbcput('\\', t, enc);
03042 t += rb_enc_mbcput('f', t, enc);
03043 continue;
03044 case '\v':
03045 t += rb_enc_mbcput('\\', t, enc);
03046 t += rb_enc_mbcput('v', t, enc);
03047 continue;
03048 }
03049 t += rb_enc_mbcput(c, t, enc);
03050 }
03051 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03052 OBJ_INFECT(tmp, str);
03053 return tmp;
03054 }
03055
03056
03057
03058
03059
03060
03061
03062
03063
03064
03065
03066
03067
03068
03069
03070
03071 static VALUE
03072 rb_reg_s_quote(VALUE c, VALUE str)
03073 {
03074 return rb_reg_quote(reg_operand(str, TRUE));
03075 }
03076
03077 int
03078 rb_reg_options(VALUE re)
03079 {
03080 int options;
03081
03082 rb_reg_check(re);
03083 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03084 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03085 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03086 return options;
03087 }
03088
03089 VALUE
03090 rb_check_regexp_type(VALUE re)
03091 {
03092 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03093 }
03094
03095
03096
03097
03098
03099
03100
03101
03102
03103
03104
03105
03106
03107
03108
03109
03110
03111
03112 static VALUE
03113 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03114 {
03115 return rb_check_regexp_type(re);
03116 }
03117
03118 static VALUE
03119 rb_reg_s_union(VALUE self, VALUE args0)
03120 {
03121 long argc = RARRAY_LEN(args0);
03122
03123 if (argc == 0) {
03124 VALUE args[1];
03125 args[0] = rb_str_new2("(?!)");
03126 return rb_class_new_instance(1, args, rb_cRegexp);
03127 }
03128 else if (argc == 1) {
03129 VALUE arg = rb_ary_entry(args0, 0);
03130 VALUE re = rb_check_regexp_type(arg);
03131 if (!NIL_P(re))
03132 return re;
03133 else {
03134 VALUE quoted;
03135 quoted = rb_reg_s_quote(Qnil, arg);
03136 return rb_reg_new_str(quoted, 0);
03137 }
03138 }
03139 else {
03140 int i;
03141 VALUE source = rb_str_buf_new(0);
03142 rb_encoding *result_enc;
03143
03144 int has_asciionly = 0;
03145 rb_encoding *has_ascii_compat_fixed = 0;
03146 rb_encoding *has_ascii_incompat = 0;
03147
03148 for (i = 0; i < argc; i++) {
03149 volatile VALUE v;
03150 VALUE e = rb_ary_entry(args0, i);
03151
03152 if (0 < i)
03153 rb_str_buf_cat_ascii(source, "|");
03154
03155 v = rb_check_regexp_type(e);
03156 if (!NIL_P(v)) {
03157 rb_encoding *enc = rb_enc_get(v);
03158 if (!rb_enc_asciicompat(enc)) {
03159 if (!has_ascii_incompat)
03160 has_ascii_incompat = enc;
03161 else if (has_ascii_incompat != enc)
03162 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03163 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03164 }
03165 else if (rb_reg_fixed_encoding_p(v)) {
03166 if (!has_ascii_compat_fixed)
03167 has_ascii_compat_fixed = enc;
03168 else if (has_ascii_compat_fixed != enc)
03169 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03170 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03171 }
03172 else {
03173 has_asciionly = 1;
03174 }
03175 v = rb_reg_to_s(v);
03176 }
03177 else {
03178 rb_encoding *enc;
03179 StringValue(e);
03180 enc = rb_enc_get(e);
03181 if (!rb_enc_str_asciicompat_p(e)) {
03182 if (!has_ascii_incompat)
03183 has_ascii_incompat = enc;
03184 else if (has_ascii_incompat != enc)
03185 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03186 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03187 }
03188 else if (rb_enc_str_asciionly_p(e)) {
03189 has_asciionly = 1;
03190 }
03191 else {
03192 if (!has_ascii_compat_fixed)
03193 has_ascii_compat_fixed = enc;
03194 else if (has_ascii_compat_fixed != enc)
03195 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03196 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03197 }
03198 v = rb_reg_s_quote(Qnil, e);
03199 }
03200 if (has_ascii_incompat) {
03201 if (has_asciionly) {
03202 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03203 rb_enc_name(has_ascii_incompat));
03204 }
03205 if (has_ascii_compat_fixed) {
03206 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03207 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03208 }
03209 }
03210
03211 if (i == 0) {
03212 rb_enc_copy(source, v);
03213 }
03214 rb_str_append(source, v);
03215 }
03216
03217 if (has_ascii_incompat) {
03218 result_enc = has_ascii_incompat;
03219 }
03220 else if (has_ascii_compat_fixed) {
03221 result_enc = has_ascii_compat_fixed;
03222 }
03223 else {
03224 result_enc = rb_ascii8bit_encoding();
03225 }
03226
03227 rb_enc_associate(source, result_enc);
03228 return rb_class_new_instance(1, &source, rb_cRegexp);
03229 }
03230 }
03231
03232
03233
03234
03235
03236
03237
03238
03239
03240
03241
03242
03243
03244
03245
03246
03247
03248
03249
03250 static VALUE
03251 rb_reg_s_union_m(VALUE self, VALUE args)
03252 {
03253 VALUE v;
03254 if (RARRAY_LEN(args) == 1 &&
03255 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03256 return rb_reg_s_union(self, v);
03257 }
03258 return rb_reg_s_union(self, args);
03259 }
03260
03261
03262 static VALUE
03263 rb_reg_init_copy(VALUE copy, VALUE re)
03264 {
03265 onig_errmsg_buffer err = "";
03266 const char *s;
03267 long len;
03268
03269 if (!OBJ_INIT_COPY(copy, re)) return copy;
03270 rb_reg_check(re);
03271 s = RREGEXP_SRC_PTR(re);
03272 len = RREGEXP_SRC_LEN(re);
03273 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03274 err, NULL, 0) != 0) {
03275 rb_reg_raise(s, len, err, re);
03276 }
03277 return copy;
03278 }
03279
03280 VALUE
03281 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03282 {
03283 VALUE val = 0;
03284 char *p, *s, *e;
03285 int no, clen;
03286 rb_encoding *str_enc = rb_enc_get(str);
03287 rb_encoding *src_enc = rb_enc_get(src);
03288 int acompat = rb_enc_asciicompat(str_enc);
03289 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
03290
03291 p = s = RSTRING_PTR(str);
03292 e = s + RSTRING_LEN(str);
03293
03294 while (s < e) {
03295 int c = ASCGET(s, e, &clen);
03296 char *ss;
03297
03298 if (c == -1) {
03299 s += mbclen(s, e, str_enc);
03300 continue;
03301 }
03302 ss = s;
03303 s += clen;
03304
03305 if (c != '\\' || s == e) continue;
03306
03307 if (!val) {
03308 val = rb_str_buf_new(ss-p);
03309 }
03310 rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03311
03312 c = ASCGET(s, e, &clen);
03313 if (c == -1) {
03314 s += mbclen(s, e, str_enc);
03315 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03316 p = s;
03317 continue;
03318 }
03319 s += clen;
03320
03321 p = s;
03322 switch (c) {
03323 case '1': case '2': case '3': case '4':
03324 case '5': case '6': case '7': case '8': case '9':
03325 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03326 no = c - '0';
03327 }
03328 else {
03329 continue;
03330 }
03331 break;
03332
03333 case 'k':
03334 if (s < e && ASCGET(s, e, &clen) == '<') {
03335 char *name, *name_end;
03336
03337 name_end = name = s + clen;
03338 while (name_end < e) {
03339 c = ASCGET(name_end, e, &clen);
03340 if (c == '>') break;
03341 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03342 }
03343 if (name_end < e) {
03344 VALUE n = rb_str_subseq(str, (long)(name - RSTRING_PTR(str)),
03345 (long)(name_end - name));
03346 if (!rb_enc_compatible(RREGEXP(regexp)->src, n) ||
03347 (no = name_to_backref_number(regs, regexp, name, name_end)) < 1) {
03348 name_to_backref_error(n);
03349 }
03350 p = s = name_end + clen;
03351 break;
03352 }
03353 else {
03354 rb_raise(rb_eRuntimeError, "invalid group name reference format");
03355 }
03356 }
03357
03358 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03359 continue;
03360
03361 case '0':
03362 case '&':
03363 no = 0;
03364 break;
03365
03366 case '`':
03367 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03368 continue;
03369
03370 case '\'':
03371 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03372 continue;
03373
03374 case '+':
03375 no = regs->num_regs-1;
03376 while (BEG(no) == -1 && no > 0) no--;
03377 if (no == 0) continue;
03378 break;
03379
03380 case '\\':
03381 rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03382 continue;
03383
03384 default:
03385 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03386 continue;
03387 }
03388
03389 if (no >= 0) {
03390 if (no >= regs->num_regs) continue;
03391 if (BEG(no) == -1) continue;
03392 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03393 }
03394 }
03395
03396 if (!val) return str;
03397 if (p < e) {
03398 rb_enc_str_buf_cat(val, p, e-p, str_enc);
03399 }
03400
03401 return val;
03402 }
03403
03404 static VALUE
03405 kcode_getter(void)
03406 {
03407 rb_warn("variable $KCODE is no longer effective");
03408 return Qnil;
03409 }
03410
03411 static void
03412 kcode_setter(VALUE val, ID id)
03413 {
03414 rb_warn("variable $KCODE is no longer effective; ignored");
03415 }
03416
03417 static VALUE
03418 ignorecase_getter(void)
03419 {
03420 rb_warn("variable $= is no longer effective");
03421 return Qfalse;
03422 }
03423
03424 static void
03425 ignorecase_setter(VALUE val, ID id)
03426 {
03427 rb_warn("variable $= is no longer effective; ignored");
03428 }
03429
03430 static VALUE
03431 match_getter(void)
03432 {
03433 VALUE match = rb_backref_get();
03434
03435 if (NIL_P(match)) return Qnil;
03436 rb_match_busy(match);
03437 return match;
03438 }
03439
03440 static void
03441 match_setter(VALUE val)
03442 {
03443 if (!NIL_P(val)) {
03444 Check_Type(val, T_MATCH);
03445 }
03446 rb_backref_set(val);
03447 }
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457
03458
03459
03460
03461
03462
03463
03464
03465
03466
03467
03468
03469
03470
03471
03472
03473
03474
03475
03476
03477 static VALUE
03478 rb_reg_s_last_match(int argc, VALUE *argv)
03479 {
03480 VALUE nth;
03481
03482 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03483 VALUE match = rb_backref_get();
03484 int n;
03485 if (NIL_P(match)) return Qnil;
03486 n = match_backref_number(match, nth);
03487 return rb_reg_nth_match(n, match);
03488 }
03489 return match_getter();
03490 }
03491
/*
 * Warning callback installed with onig_set_warn_func() and
 * onig_set_verb_warn_func() in Init_Regexp.  The message is forwarded to
 * rb_warn() through a "%s" format so that it is never interpreted as a
 * format string itself.
 */
static void
re_warn(const char *s)
{
    const char *message = s;

    rb_warn("%s", message);
}
03497
03498
03499
03500
03501
03502
03503
03504
03505
03506
03507
03508
03509
03510
03511
03512
03513
03514
03515
03516
03517
03518
03519
03520
03521 void
03522 Init_Regexp(void)
03523 {
03524 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03525
03526 onigenc_set_default_caseconv_table((UChar*)casetable);
03527 onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03528 onig_set_warn_func(re_warn);
03529 onig_set_verb_warn_func(re_warn);
03530
03531 rb_define_virtual_variable("$~", match_getter, match_setter);
03532 rb_define_virtual_variable("$&", last_match_getter, 0);
03533 rb_define_virtual_variable("$`", prematch_getter, 0);
03534 rb_define_virtual_variable("$'", postmatch_getter, 0);
03535 rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03536
03537 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03538 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03539 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03540
03541 rb_cRegexp = rb_define_class("Regexp", rb_cObject);
03542 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03543 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03544 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03545 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03546 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03547 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03548 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03549
03550 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03551 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03552 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03553 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03554 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03555 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03556 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03557 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03558 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03559 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03560 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03561 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03562 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03563 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03564 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0);
03565 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03566 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03567 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03568
03569
03570 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03571
03572 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03573
03574 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03575
03576 rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03577
03578 rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE));
03579
03580 rb_global_variable(®_cache);
03581
03582 rb_cMatch = rb_define_class("MatchData", rb_cObject);
03583 rb_define_alloc_func(rb_cMatch, match_alloc);
03584 rb_undef_method(CLASS_OF(rb_cMatch), "new");
03585
03586 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03587 rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03588 rb_define_method(rb_cMatch, "names", match_names, 0);
03589 rb_define_method(rb_cMatch, "size", match_size, 0);
03590 rb_define_method(rb_cMatch, "length", match_size, 0);
03591 rb_define_method(rb_cMatch, "offset", match_offset, 1);
03592 rb_define_method(rb_cMatch, "begin", match_begin, 1);
03593 rb_define_method(rb_cMatch, "end", match_end, 1);
03594 rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03595 rb_define_method(rb_cMatch, "[]", match_aref, -1);
03596 rb_define_method(rb_cMatch, "captures", match_captures, 0);
03597 rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03598 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03599 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03600 rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03601 rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03602 rb_define_method(rb_cMatch, "string", match_string, 0);
03603 rb_define_method(rb_cMatch, "hash", match_hash, 0);
03604 rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03605 rb_define_method(rb_cMatch, "==", match_equal, 1);
03606 }
03607