00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "internal.h"
00017 #include "regint.h"
00018 #include <ctype.h>
00019
/* Exception class used by this file when raising or building regexp errors
 * (see the rb_raise / rb_exc_new3 calls that pass it). */
00020 VALUE rb_eRegexpError;
00021
/* Fixed-size character buffer that receives Oniguruma error messages. */
00022 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
/* Copy msg into an error buffer, bounded by ONIG_MAX_ERROR_MESSAGE_LEN. */
00023 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
00024
/* Begin / end offset of capture group `no`.  Both macros expand to an
 * expression on a variable named `regs`, which must be in scope (a
 * struct re_registers pointer) wherever they are used. */
00025 #define BEG(no) (regs->beg[(no)])
00026 #define END(no) (regs->end[(no)])
00027
#if 'a' == 97
/*
 * ASCII case-folding table: entry i is the folded form of byte value i.
 * 'A'..'Z' (0x41-0x5A) map to 'a'..'z' (0x61-0x7A); every other byte maps
 * to itself.
 */
static const char casetable[] = {
    /* 0x00 - 0x3F: identity */
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
    /* 0x40 '@', then 'A'..'Z' folded to lower case, then '[' .. '_' */
    '\x40', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
    '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
    '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
    '\x78', '\x79', '\x7a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
    /* 0x60 - 0x7F: identity (already lower case) */
    '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
    '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
    '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
    '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
    /* 0x80 - 0xFF: identity */
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
#else
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif

/*
 * Compare the first len bytes of x and y, ignoring ASCII letter case.
 *
 * Returns 0 when all len folded bytes are equal; otherwise the difference
 * between the folded table entries at the first position that differs.
 */
int
rb_memcicmp(const void *x, const void *y, long len)
{
    const unsigned char *lhs = x;
    const unsigned char *rhs = y;

    for (; len != 0; --len, ++lhs, ++rhs) {
        const int delta = casetable[*lhs] - casetable[*rhs];

        if (delta != 0)
            return delta;
    }
    return 0;
}
00091
#undef rb_memcmp

/*
 * Out-of-line wrapper around memcmp(): compares the first len bytes of p1
 * and p2 and returns memcmp's result unchanged.  Any macro of the same
 * name is removed first so that a real function is emitted.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, len);

    return order;
}
00099
00100 #ifdef HAVE_MEMMEM
00101 static inline long
00102 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00103 {
00104 const unsigned char *y;
00105
00106 if (y = memmem(ys, n, xs, m))
00107 return y - ys;
00108 else
00109 return -1;
00110 }
00111 #else
00112 static inline long
00113 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00114 {
00115 const unsigned char *x = xs, *xe = xs + m;
00116 const unsigned char *y = ys, *ye = ys + n;
00117 #ifndef VALUE_MAX
00118 # if SIZEOF_VALUE == 8
00119 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00120 # elif SIZEOF_VALUE == 4
00121 # define VALUE_MAX 0xFFFFFFFFUL
00122 # endif
00123 #endif
00124 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00125
00126 if (m > SIZEOF_VALUE)
00127 rb_bug("!!too long pattern string!!");
00128
00129 if (!(y = memchr(y, *x, n - m + 1)))
00130 return -1;
00131
00132
00133 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00134 hx <<= CHAR_BIT;
00135 hy <<= CHAR_BIT;
00136 hx |= *x;
00137 hy |= *y;
00138 }
00139
00140 while (hx != hy) {
00141 if (y == ye)
00142 return -1;
00143 hy <<= CHAR_BIT;
00144 hy |= *y;
00145 hy &= mask;
00146 y++;
00147 }
00148 return y - ys - m;
00149 }
00150 #endif
00151
/*
 * Search for the m-byte pattern xs inside the n-byte buffer ys using a
 * byte-indexed shift table: after a failed comparison at offset pos, the
 * byte just behind the window (ys[pos + m]) selects how far to advance.
 *
 * Returns the byte offset of the first occurrence, or -1 if there is none.
 *
 * The look-ahead byte is only read while it lies inside ys; at the last
 * possible alignment the search stops instead of reading ys[n].
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    long shift[256];
    const long last = n - m;    /* last offset at which the pattern fits */
    long pos, i;

    /* Bytes absent from the pattern allow skipping the whole window. */
    for (i = 0; i < 256; ++i)
        shift[i] = m + 1;
    /* Later occurrences overwrite earlier ones, giving the smallest shift. */
    for (i = 0; i < m; ++i)
        shift[xs[i]] = m - i;

    for (pos = 0; pos <= last; pos += shift[ys[pos + m]]) {
        if (xs[0] == ys[pos] && memcmp(xs, ys + pos, m) == 0)
            return pos;
        if (pos == last)
            break;              /* ys[pos + m] would be out of bounds */
    }
    return -1;
}
00171
/*
 * Map the byte sequence starting at x to a shift-table index in 0..511.
 *
 * A first byte below 0xC0 or at/above 0xF5 yields (byte + 256).  Otherwise
 * the first byte is combined with the following 1, 2 or 3 bytes (for first
 * bytes below 0xE0, below 0xF0, and below 0xF5 respectively) and the result
 * is reduced to 0..255.
 *
 * The caller must guarantee that those following bytes are readable.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = x[0];
    int trail, i;

    if (h < 0xC0 || h >= 0xF5)
        return h + 256;

    trail = (h < 0xE0) ? 1 : (h < 0xF0) ? 2 : 3;
    for (i = 1; i <= trail; ++i)
        h = h * mix + x[i];
    return (unsigned char)h;
}

/*
 * Same as rb_memsearch_qs_utf8_hash(), but never reads at or beyond end:
 * when fewer than 4 bytes remain, they are copied into a zero-padded
 * scratch buffer and that buffer is hashed instead.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash_bounded(const unsigned char *p, const unsigned char *end)
{
    unsigned char padded[4] = {0, 0, 0, 0};
    const long avail = (long)(end - p);

    if (avail >= 4)
        return rb_memsearch_qs_utf8_hash(p);
    if (avail > 0)
        memcpy(padded, p, (size_t)avail);
    return rb_memsearch_qs_utf8_hash(padded);
}

/*
 * Search for the m-byte pattern xs inside the n-byte buffer ys, choosing
 * the shift after each failed comparison from a 512-entry table indexed by
 * the hash of the byte sequence just behind the window.
 *
 * Returns the byte offset of the first occurrence, or -1 if there is none.
 *
 * All hashing goes through the bounded helper, so neither the pattern nor
 * the text is read past its end, even when a multibyte lead byte sits in
 * the last three bytes of a buffer.
 */
static inline long
rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *const xe = xs + m;
    const unsigned char *const ye = ys + n;
    long shift[512];
    long pos, i;

    for (i = 0; i < 512; ++i)
        shift[i] = m + 1;
    for (i = 0; i < m; ++i)
        shift[rb_memsearch_qs_utf8_hash_bounded(xs + i, xe)] = m - i;

    for (pos = 0; pos + m <= n;
         pos += shift[rb_memsearch_qs_utf8_hash_bounded(ys + pos + m, ye)]) {
        if (xs[0] == ys[pos] && memcmp(xs, ys + pos, m) == 0)
            return pos;
    }
    return -1;
}
00225
/*
 * Search for the m-byte pattern xs inside the n-byte buffer ys, testing
 * only offsets that are multiples of 2 bytes.
 *
 * Returns the byte offset of the first occurrence, or -1 if there is none.
 */
static inline long
rb_memsearch_wchar(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    enum { unit = 2 };
    const unsigned char lead = xs[0];
    const long limit = n - m;   /* last offset at which the pattern fits */
    long off;

    for (off = 0; off <= limit; off += unit) {
        /* Cheap first-byte test before comparing the remaining m-1 bytes. */
        if (ys[off] == lead && memcmp(xs + 1, ys + off + 1, m - 1) == 0)
            return off;
    }
    return -1;
}
00238
/*
 * Search for the m-byte pattern xs inside the n-byte buffer ys, testing
 * only offsets that are multiples of 4 bytes.
 *
 * Returns the byte offset of the first occurrence, or -1 if there is none.
 */
static inline long
rb_memsearch_qchar(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    enum { unit = 4 };
    const unsigned char lead = xs[0];
    const long limit = n - m;   /* last offset at which the pattern fits */
    long off;

    for (off = 0; off <= limit; off += unit) {
        /* Cheap first-byte test before comparing the remaining m-1 bytes. */
        if (ys[off] == lead && memcmp(xs + 1, ys + off + 1, m - 1) == 0)
            return off;
    }
    return -1;
}
00251
00252 long
00253 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00254 {
00255 const unsigned char *x = x0, *y = y0;
00256
00257 if (m > n) return -1;
00258 else if (m == n) {
00259 return memcmp(x0, y0, m) == 0 ? 0 : -1;
00260 }
00261 else if (m < 1) {
00262 return 0;
00263 }
00264 else if (m == 1) {
00265 const unsigned char *ys = memchr(y, *x, n);
00266
00267 if (ys)
00268 return ys - y;
00269 else
00270 return -1;
00271 }
00272 else if (rb_enc_mbminlen(enc) == 1) {
00273 if (m <= SIZEOF_VALUE) {
00274 return rb_memsearch_ss(x0, m, y0, n);
00275 }
00276 else if (enc == rb_utf8_encoding()){
00277 return rb_memsearch_qs_utf8(x0, m, y0, n);
00278 }
00279 }
00280 else if (rb_enc_mbminlen(enc) == 2) {
00281 return rb_memsearch_wchar(x0, m, y0, n);
00282 }
00283 else if (rb_enc_mbminlen(enc) == 4) {
00284 return rb_memsearch_qchar(x0, m, y0, n);
00285 }
00286 return rb_memsearch_qs(x0, m, y0, n);
00287 }
00288
/* Flag bits kept in a Regexp object's flags word. */
00289 #define REG_LITERAL FL_USER5
/* When set, rb_reg_desc appends "n" to the printed options. */
00290 #define REG_ENCODING_NONE FL_USER6
00291
/* Tested by rb_reg_fixed_encoding_p. */
00292 #define KCODE_FIXED FL_USER4
00293
/* The three Oniguruma option bits that char_to_option / option_to_str map
 * to the letters 'i', 'm' and 'x'. */
00294 #define ARG_REG_OPTION_MASK \
00295 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/* Values stored through *option by rb_char_to_option_kcode: FIXED for the
 * 'e', 's' and 'u' letters, NONE for 'n'. */
00296 #define ARG_ENCODING_FIXED 16
00297 #define ARG_ENCODING_NONE 32
00298
00299 static int
00300 char_to_option(int c)
00301 {
00302 int val;
00303
00304 switch (c) {
00305 case 'i':
00306 val = ONIG_OPTION_IGNORECASE;
00307 break;
00308 case 'x':
00309 val = ONIG_OPTION_EXTEND;
00310 break;
00311 case 'm':
00312 val = ONIG_OPTION_MULTILINE;
00313 break;
00314 default:
00315 val = 0;
00316 break;
00317 }
00318 return val;
00319 }
00320
00321 static char *
00322 option_to_str(char str[4], int options)
00323 {
00324 char *p = str;
00325 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00326 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00327 if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00328 *p = 0;
00329 return str;
00330 }
00331
00332 extern int
00333 rb_char_to_option_kcode(int c, int *option, int *kcode)
00334 {
00335 *option = 0;
00336
00337 switch (c) {
00338 case 'n':
00339 *kcode = rb_ascii8bit_encindex();
00340 return (*option = ARG_ENCODING_NONE);
00341 case 'e':
00342 *kcode = ENCINDEX_EUC_JP;
00343 break;
00344 case 's':
00345 *kcode = ENCINDEX_Windows_31J;
00346 break;
00347 case 'u':
00348 *kcode = rb_utf8_encindex();
00349 break;
00350 default:
00351 *kcode = -1;
00352 return (*option = char_to_option(c));
00353 }
00354 *option = ARG_ENCODING_FIXED;
00355 return 1;
00356 }
00357
00358 static void
00359 rb_reg_check(VALUE re)
00360 {
00361 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00362 rb_raise(rb_eTypeError, "uninitialized Regexp");
00363 }
00364 }
00365
00366 static void
00367 rb_reg_expr_str(VALUE str, const char *s, long len,
00368 rb_encoding *enc, rb_encoding *resenc)
00369 {
00370 const char *p, *pend;
00371 int cr = ENC_CODERANGE_UNKNOWN;
00372 int need_escape = 0;
00373 int c, clen;
00374
00375 p = s; pend = p + len;
00376 rb_str_coderange_scan_restartable(p, pend, enc, &cr);
00377 if (rb_enc_asciicompat(enc) &&
00378 (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) {
00379 while (p < pend) {
00380 c = rb_enc_ascget(p, pend, &clen, enc);
00381 if (c == -1) {
00382 if (enc == resenc) {
00383 p += mbclen(p, pend, enc);
00384 }
00385 else {
00386 need_escape = 1;
00387 break;
00388 }
00389 }
00390 else if (c != '/' && rb_enc_isprint(c, enc)) {
00391 p += clen;
00392 }
00393 else {
00394 need_escape = 1;
00395 break;
00396 }
00397 }
00398 }
00399 else {
00400 need_escape = 1;
00401 }
00402
00403 if (!need_escape) {
00404 rb_str_buf_cat(str, s, len);
00405 }
00406 else {
00407 int unicode_p = rb_enc_unicode_p(enc);
00408 p = s;
00409 while (p<pend) {
00410 c = rb_enc_ascget(p, pend, &clen, enc);
00411 if (c == '\\' && p+clen < pend) {
00412 int n = clen + mbclen(p+clen, pend, enc);
00413 rb_str_buf_cat(str, p, n);
00414 p += n;
00415 continue;
00416 }
00417 else if (c == '/') {
00418 char c = '\\';
00419 rb_str_buf_cat(str, &c, 1);
00420 rb_str_buf_cat(str, p, clen);
00421 }
00422 else if (c == -1) {
00423 clen = rb_enc_precise_mbclen(p, pend, enc);
00424 if (!MBCLEN_CHARFOUND_P(clen)) {
00425 c = (unsigned char)*p;
00426 clen = 1;
00427 goto hex;
00428 }
00429 if (resenc) {
00430 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
00431 rb_str_buf_cat_escaped_char(str, c, unicode_p);
00432 }
00433 else {
00434 clen = MBCLEN_CHARFOUND_LEN(clen);
00435 rb_str_buf_cat(str, p, clen);
00436 }
00437 }
00438 else if (rb_enc_isprint(c, enc)) {
00439 rb_str_buf_cat(str, p, clen);
00440 }
00441 else if (!rb_enc_isspace(c, enc)) {
00442 char b[8];
00443
00444 hex:
00445 snprintf(b, sizeof(b), "\\x%02X", c);
00446 rb_str_buf_cat(str, b, 4);
00447 }
00448 else {
00449 rb_str_buf_cat(str, p, clen);
00450 }
00451 p += clen;
00452 }
00453 }
00454 }
00455
00456 static VALUE
00457 rb_reg_desc(const char *s, long len, VALUE re)
00458 {
00459 rb_encoding *enc = rb_enc_get(re);
00460 VALUE str = rb_str_buf_new2("/");
00461 rb_encoding *resenc = rb_default_internal_encoding();
00462 if (resenc == NULL) resenc = rb_default_external_encoding();
00463
00464 if (re && rb_enc_asciicompat(enc)) {
00465 rb_enc_copy(str, re);
00466 }
00467 else {
00468 rb_enc_associate(str, rb_usascii_encoding());
00469 }
00470 rb_reg_expr_str(str, s, len, enc, resenc);
00471 rb_str_buf_cat2(str, "/");
00472 if (re) {
00473 char opts[4];
00474 rb_reg_check(re);
00475 if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00476 rb_str_buf_cat2(str, opts);
00477 if (RBASIC(re)->flags & REG_ENCODING_NONE)
00478 rb_str_buf_cat2(str, "n");
00479 }
00480 OBJ_INFECT(str, re);
00481 return str;
00482 }
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499 static VALUE
00500 rb_reg_source(VALUE re)
00501 {
00502 VALUE str;
00503
00504 rb_reg_check(re);
00505 str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00506 if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00507 return str;
00508 }
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522 static VALUE
00523 rb_reg_inspect(VALUE re)
00524 {
00525 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00526 return rb_any_to_s(re);
00527 }
00528 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00529 }
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552 static VALUE
00553 rb_reg_to_s(VALUE re)
00554 {
00555 int options, opt;
00556 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
00557 long len;
00558 const UChar* ptr;
00559 VALUE str = rb_str_buf_new2("(?");
00560 char optbuf[5];
00561 rb_encoding *enc = rb_enc_get(re);
00562
00563 rb_reg_check(re);
00564
00565 rb_enc_copy(str, re);
00566 options = RREGEXP(re)->ptr->options;
00567 ptr = (UChar*)RREGEXP_SRC_PTR(re);
00568 len = RREGEXP_SRC_LEN(re);
00569 again:
00570 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
00571 int err = 1;
00572 ptr += 2;
00573 if ((len -= 2) > 0) {
00574 do {
00575 opt = char_to_option((int )*ptr);
00576 if (opt != 0) {
00577 options |= opt;
00578 }
00579 else {
00580 break;
00581 }
00582 ++ptr;
00583 } while (--len > 0);
00584 }
00585 if (len > 1 && *ptr == '-') {
00586 ++ptr;
00587 --len;
00588 do {
00589 opt = char_to_option((int )*ptr);
00590 if (opt != 0) {
00591 options &= ~opt;
00592 }
00593 else {
00594 break;
00595 }
00596 ++ptr;
00597 } while (--len > 0);
00598 }
00599 if (*ptr == ')') {
00600 --len;
00601 ++ptr;
00602 goto again;
00603 }
00604 if (*ptr == ':' && ptr[len-1] == ')') {
00605 Regexp *rp;
00606 VALUE verbose = ruby_verbose;
00607 ruby_verbose = Qfalse;
00608
00609 ++ptr;
00610 len -= 2;
00611 err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
00612 enc, OnigDefaultSyntax, NULL);
00613 onig_free(rp);
00614 ruby_verbose = verbose;
00615 }
00616 if (err) {
00617 options = RREGEXP(re)->ptr->options;
00618 ptr = (UChar*)RREGEXP_SRC_PTR(re);
00619 len = RREGEXP_SRC_LEN(re);
00620 }
00621 }
00622
00623 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
00624
00625 if ((options & embeddable) != embeddable) {
00626 optbuf[0] = '-';
00627 option_to_str(optbuf + 1, ~options);
00628 rb_str_buf_cat2(str, optbuf);
00629 }
00630
00631 rb_str_buf_cat2(str, ":");
00632 if (rb_enc_asciicompat(enc)) {
00633 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
00634 rb_str_buf_cat2(str, ")");
00635 }
00636 else {
00637 const char *s, *e;
00638 char *paren;
00639 ptrdiff_t n;
00640 rb_str_buf_cat2(str, ")");
00641 rb_enc_associate(str, rb_usascii_encoding());
00642 str = rb_str_encode(str, rb_enc_from_encoding(enc), 0, Qnil);
00643
00644
00645 s = RSTRING_PTR(str);
00646 e = RSTRING_END(str);
00647 s = rb_enc_left_char_head(s, e-1, e, enc);
00648 n = e - s;
00649 paren = ALLOCA_N(char, n);
00650 memcpy(paren, s, n);
00651 rb_str_resize(str, RSTRING_LEN(str) - n);
00652
00653 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
00654 rb_str_buf_cat(str, paren, n);
00655 }
00656 rb_enc_copy(str, re);
00657
00658 OBJ_INFECT(str, re);
00659 return str;
00660 }
00661
00662 static void
00663 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00664 {
00665 volatile VALUE desc = rb_reg_desc(s, len, re);
00666
00667 rb_raise(rb_eRegexpError, "%s: %"PRIsVALUE, err, desc);
00668 }
00669
00670 static VALUE
00671 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00672 {
00673 char opts[6];
00674 VALUE desc = rb_str_buf_new2(err);
00675 rb_encoding *resenc = rb_default_internal_encoding();
00676 if (resenc == NULL) resenc = rb_default_external_encoding();
00677
00678 rb_enc_associate(desc, enc);
00679 rb_str_buf_cat2(desc, ": /");
00680 rb_reg_expr_str(desc, s, len, enc, resenc);
00681 opts[0] = '/';
00682 option_to_str(opts + 1, options);
00683 rb_str_buf_cat2(desc, opts);
00684 return rb_exc_new3(rb_eRegexpError, desc);
00685 }
00686
00687 static void
00688 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00689 {
00690 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00691 }
00692
00693 static VALUE
00694 rb_reg_error_desc(VALUE str, int options, const char *err)
00695 {
00696 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00697 rb_enc_get(str), options, err);
00698 }
00699
00700 static void
00701 rb_reg_raise_str(VALUE str, int options, const char *err)
00702 {
00703 rb_exc_raise(rb_reg_error_desc(str, options, err));
00704 }
00705
00706
00707
00708
00709
00710
00711
00712
00713
00714
00715
00716
00717
00718 static VALUE
00719 rb_reg_casefold_p(VALUE re)
00720 {
00721 rb_reg_check(re);
00722 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00723 return Qfalse;
00724 }
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734
00735
00736
00737
00738
00739
00740
00741
00742
00743
00744
00745
00746
00747
00748
00749
00750 static VALUE
00751 rb_reg_options_m(VALUE re)
00752 {
00753 int options = rb_reg_options(re);
00754 return INT2NUM(options);
00755 }
00756
00757 static int
00758 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00759 int back_num, int *back_refs, OnigRegex regex, void *arg)
00760 {
00761 VALUE ary = (VALUE)arg;
00762 rb_ary_push(ary, rb_enc_str_new((const char *)name, name_end-name, regex->enc));
00763 return 0;
00764 }
00765
00766
00767
00768
00769
00770
00771
00772
00773
00774
00775
00776
00777
00778
00779
00780
00781
00782 static VALUE
00783 rb_reg_names(VALUE re)
00784 {
00785 VALUE ary = rb_ary_new();
00786 rb_reg_check(re);
00787 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00788 return ary;
00789 }
00790
00791 static int
00792 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00793 int back_num, int *back_refs, OnigRegex regex, void *arg)
00794 {
00795 VALUE hash = (VALUE)arg;
00796 VALUE ary = rb_ary_new2(back_num);
00797 int i;
00798
00799 for (i = 0; i < back_num; i++)
00800 rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00801
00802 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00803
00804 return 0;
00805 }
00806
00807
00808
00809
00810
00811
00812
00813
00814
00815
00816
00817
00818
00819
00820
00821
00822
00823
00824
00825
00826
00827
00828
00829 static VALUE
00830 rb_reg_named_captures(VALUE re)
00831 {
00832 VALUE hash = rb_hash_new();
00833 rb_reg_check(re);
00834 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00835 return hash;
00836 }
00837
00838 static int
00839 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00840 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00841 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00842 {
00843 int r;
00844
00845 *reg = (regex_t* )malloc(sizeof(regex_t));
00846 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00847
00848 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00849 if (r) goto err;
00850
00851 r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00852 if (r) {
00853 err:
00854 onig_free(*reg);
00855 *reg = NULL;
00856 }
00857 return r;
00858 }
00859
00860 static Regexp*
00861 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00862 const char *sourcefile, int sourceline)
00863 {
00864 Regexp *rp;
00865 int r;
00866 OnigErrorInfo einfo;
00867
00868
00869
00870
00871
00872
00873
00874
00875 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00876 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00877 if (r) {
00878 onig_error_code_to_str((UChar*)err, r, &einfo);
00879 return 0;
00880 }
00881 return rp;
00882 }
00883
00884
00885
00886
00887
00888
00889
00890
00891
00892
00893
00894
00895
00896
00897 VALUE rb_cMatch;
00898
00899 static VALUE
00900 match_alloc(VALUE klass)
00901 {
00902 NEWOBJ_OF(match, struct RMatch, klass, T_MATCH);
00903
00904 match->str = 0;
00905 match->rmatch = 0;
00906 match->regexp = 0;
00907 match->rmatch = ALLOC(struct rmatch);
00908 MEMZERO(match->rmatch, struct rmatch, 1);
00909
00910 return (VALUE)match;
00911 }
00912
/* One position inside the matched string, as a byte offset together with
 * the corresponding character offset. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t entries by byte_pos.
 *
 * Returns -1, 0 or 1.  The two offsets are compared directly rather than
 * subtracted, so the result cannot overflow or be truncated when narrowed
 * to int, and does not depend on SIZEOF_LONG / SIZEOF_INT being defined.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const long lhs = ((const pair_t *)pair1)->byte_pos;
    const long rhs = ((const pair_t *)pair2)->byte_pos;

    return (lhs > rhs) - (lhs < rhs);
}
00928
00929 static void
00930 update_char_offset(VALUE match)
00931 {
00932 struct rmatch *rm = RMATCH(match)->rmatch;
00933 struct re_registers *regs;
00934 int i, num_regs, num_pos;
00935 long c;
00936 char *s, *p, *q;
00937 rb_encoding *enc;
00938 pair_t *pairs;
00939
00940 if (rm->char_offset_updated)
00941 return;
00942
00943 regs = &rm->regs;
00944 num_regs = rm->regs.num_regs;
00945
00946 if (rm->char_offset_num_allocated < num_regs) {
00947 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00948 rm->char_offset_num_allocated = num_regs;
00949 }
00950
00951 enc = rb_enc_get(RMATCH(match)->str);
00952 if (rb_enc_mbmaxlen(enc) == 1) {
00953 for (i = 0; i < num_regs; i++) {
00954 rm->char_offset[i].beg = BEG(i);
00955 rm->char_offset[i].end = END(i);
00956 }
00957 rm->char_offset_updated = 1;
00958 return;
00959 }
00960
00961 pairs = ALLOCA_N(pair_t, num_regs*2);
00962 num_pos = 0;
00963 for (i = 0; i < num_regs; i++) {
00964 if (BEG(i) < 0)
00965 continue;
00966 pairs[num_pos++].byte_pos = BEG(i);
00967 pairs[num_pos++].byte_pos = END(i);
00968 }
00969 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00970
00971 s = p = RSTRING_PTR(RMATCH(match)->str);
00972 c = 0;
00973 for (i = 0; i < num_pos; i++) {
00974 q = s + pairs[i].byte_pos;
00975 c += rb_enc_strlen(p, q, enc);
00976 pairs[i].char_pos = c;
00977 p = q;
00978 }
00979
00980 for (i = 0; i < num_regs; i++) {
00981 pair_t key, *found;
00982 if (BEG(i) < 0) {
00983 rm->char_offset[i].beg = -1;
00984 rm->char_offset[i].end = -1;
00985 continue;
00986 }
00987
00988 key.byte_pos = BEG(i);
00989 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00990 rm->char_offset[i].beg = found->char_pos;
00991
00992 key.byte_pos = END(i);
00993 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00994 rm->char_offset[i].end = found->char_pos;
00995 }
00996
00997 rm->char_offset_updated = 1;
00998 }
00999
01000 static void
01001 match_check(VALUE match)
01002 {
01003 if (!RMATCH(match)->regexp) {
01004 rb_raise(rb_eTypeError, "uninitialized Match");
01005 }
01006 }
01007
01008
01009 static VALUE
01010 match_init_copy(VALUE obj, VALUE orig)
01011 {
01012 struct rmatch *rm;
01013
01014 if (!OBJ_INIT_COPY(obj, orig)) return obj;
01015
01016 RMATCH(obj)->str = RMATCH(orig)->str;
01017 RMATCH(obj)->regexp = RMATCH(orig)->regexp;
01018
01019 rm = RMATCH(obj)->rmatch;
01020 onig_region_copy(&rm->regs, RMATCH_REGS(orig));
01021
01022 if (!RMATCH(orig)->rmatch->char_offset_updated) {
01023 rm->char_offset_updated = 0;
01024 }
01025 else {
01026 if (rm->char_offset_num_allocated < rm->regs.num_regs) {
01027 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
01028 rm->char_offset_num_allocated = rm->regs.num_regs;
01029 }
01030 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
01031 struct rmatch_offset, rm->regs.num_regs);
01032 rm->char_offset_updated = 1;
01033 }
01034
01035 return obj;
01036 }
01037
01038
01039
01040
01041
01042
01043
01044
01045
01046
01047
01048
01049 static VALUE
01050 match_regexp(VALUE match)
01051 {
01052 match_check(match);
01053 return RMATCH(match)->regexp;
01054 }
01055
01056
01057
01058
01059
01060
01061
01062
01063
01064
01065
01066
01067
01068
01069
01070 static VALUE
01071 match_names(VALUE match)
01072 {
01073 match_check(match);
01074 return rb_reg_names(RMATCH(match)->regexp);
01075 }
01076
01077
01078
01079
01080
01081
01082
01083
01084
01085
01086
01087
01088
01089 static VALUE
01090 match_size(VALUE match)
01091 {
01092 match_check(match);
01093 return INT2FIX(RMATCH_REGS(match)->num_regs);
01094 }
01095
01096 static int
01097 match_backref_number(VALUE match, VALUE backref)
01098 {
01099 const char *name;
01100 int num;
01101
01102 struct re_registers *regs = RMATCH_REGS(match);
01103 VALUE regexp = RMATCH(match)->regexp;
01104
01105 match_check(match);
01106 switch (TYPE(backref)) {
01107 default:
01108 return NUM2INT(backref);
01109
01110 case T_SYMBOL:
01111 name = rb_id2name(SYM2ID(backref));
01112 break;
01113
01114 case T_STRING:
01115 name = StringValueCStr(backref);
01116 break;
01117 }
01118
01119 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01120 (const unsigned char*)name,
01121 (const unsigned char*)name + strlen(name),
01122 regs);
01123
01124 if (num < 1) {
01125 rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01126 }
01127
01128 return num;
01129 }
01130
01131 int
01132 rb_reg_backref_number(VALUE match, VALUE backref)
01133 {
01134 return match_backref_number(match, backref);
01135 }
01136
01137
01138
01139
01140
01141
01142
01143
01144
01145
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155 static VALUE
01156 match_offset(VALUE match, VALUE n)
01157 {
01158 int i = match_backref_number(match, n);
01159 struct re_registers *regs = RMATCH_REGS(match);
01160
01161 match_check(match);
01162 if (i < 0 || regs->num_regs <= i)
01163 rb_raise(rb_eIndexError, "index %d out of matches", i);
01164
01165 if (BEG(i) < 0)
01166 return rb_assoc_new(Qnil, Qnil);
01167
01168 update_char_offset(match);
01169 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01170 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01171 }
01172
01173
01174
01175
01176
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191 static VALUE
01192 match_begin(VALUE match, VALUE n)
01193 {
01194 int i = match_backref_number(match, n);
01195 struct re_registers *regs = RMATCH_REGS(match);
01196
01197 match_check(match);
01198 if (i < 0 || regs->num_regs <= i)
01199 rb_raise(rb_eIndexError, "index %d out of matches", i);
01200
01201 if (BEG(i) < 0)
01202 return Qnil;
01203
01204 update_char_offset(match);
01205 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01206 }
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217
01218
01219
01220
01221
01222
01223
01224
01225
01226 static VALUE
01227 match_end(VALUE match, VALUE n)
01228 {
01229 int i = match_backref_number(match, n);
01230 struct re_registers *regs = RMATCH_REGS(match);
01231
01232 match_check(match);
01233 if (i < 0 || regs->num_regs <= i)
01234 rb_raise(rb_eIndexError, "index %d out of matches", i);
01235
01236 if (BEG(i) < 0)
01237 return Qnil;
01238
01239 update_char_offset(match);
01240 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01241 }
01242
01243 #define MATCH_BUSY FL_USER2
01244
01245 void
01246 rb_match_busy(VALUE match)
01247 {
01248 FL_SET(match, MATCH_BUSY);
01249 }
01250
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270
01271
01272
01273
01274
01275
01276
01277
01278
01279
01280 static VALUE
01281 rb_reg_fixed_encoding_p(VALUE re)
01282 {
01283 if (FL_TEST(re, KCODE_FIXED))
01284 return Qtrue;
01285 else
01286 return Qfalse;
01287 }
01288
/*
 * Forward declaration; the definition is not in this part of the file.
 * As used by rb_reg_prepare_re below: takes the pattern text [p, end) and
 * an encoding, returns a string VALUE (Qnil is treated as failure by the
 * caller, with the message left in err) and may store through fixed_enc.
 */
01289 static VALUE
01290 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
01291 rb_encoding **fixed_enc, onig_errmsg_buffer err);
01292
01293
01294 static void
01295 reg_enc_error(VALUE re, VALUE str)
01296 {
01297 rb_raise(rb_eEncCompatError,
01298 "incompatible encoding regexp match (%s regexp with %s string)",
01299 rb_enc_name(rb_enc_get(re)),
01300 rb_enc_name(rb_enc_get(str)));
01301 }
01302
01303 static rb_encoding*
01304 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01305 {
01306 rb_encoding *enc = 0;
01307
01308 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01309 rb_raise(rb_eArgError,
01310 "invalid byte sequence in %s",
01311 rb_enc_name(rb_enc_get(str)));
01312 }
01313
01314 rb_reg_check(re);
01315 enc = rb_enc_get(str);
01316 if (!rb_enc_str_asciicompat_p(str)) {
01317 if (RREGEXP(re)->ptr->enc != enc) {
01318 reg_enc_error(re, str);
01319 }
01320 }
01321 else if (rb_reg_fixed_encoding_p(re)) {
01322 if (RREGEXP(re)->ptr->enc != enc &&
01323 (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01324 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01325 reg_enc_error(re, str);
01326 }
01327 enc = RREGEXP(re)->ptr->enc;
01328 }
01329 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01330 enc != rb_ascii8bit_encoding() &&
01331 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01332 rb_warn("regexp match /.../n against to %s string",
01333 rb_enc_name(enc));
01334 }
01335 return enc;
01336 }
01337
01338 regex_t *
01339 rb_reg_prepare_re(VALUE re, VALUE str)
01340 {
01341 regex_t *reg = RREGEXP(re)->ptr;
01342 onig_errmsg_buffer err = "";
01343 int r;
01344 OnigErrorInfo einfo;
01345 const char *pattern;
01346 VALUE unescaped;
01347 rb_encoding *fixed_enc = 0;
01348 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01349
01350 if (reg->enc == enc) return reg;
01351
01352 rb_reg_check(re);
01353 reg = RREGEXP(re)->ptr;
01354 pattern = RREGEXP_SRC_PTR(re);
01355
01356 unescaped = rb_reg_preprocess(
01357 pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01358 &fixed_enc, err);
01359
01360 if (unescaped == Qnil) {
01361 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01362 }
01363
01364 r = onig_new(®, (UChar* )RSTRING_PTR(unescaped),
01365 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01366 reg->options, enc,
01367 OnigDefaultSyntax, &einfo);
01368 if (r) {
01369 onig_error_code_to_str((UChar*)err, r, &einfo);
01370 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01371 }
01372
01373 RB_GC_GUARD(unescaped);
01374 return reg;
01375 }
01376
01377 long
01378 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01379 {
01380 long range;
01381 rb_encoding *enc;
01382 UChar *p, *string;
01383
01384 enc = rb_reg_prepare_enc(re, str, 0);
01385
01386 if (reverse) {
01387 range = -pos;
01388 }
01389 else {
01390 range = RSTRING_LEN(str) - pos;
01391 }
01392
01393 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01394 string = (UChar*)RSTRING_PTR(str);
01395
01396 if (range > 0) {
01397 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01398 }
01399 else {
01400 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01401 }
01402 return p - string;
01403 }
01404
01405 return pos;
01406 }
01407
01408
01409 long
01410 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01411 {
01412 long result;
01413 VALUE match;
01414 struct re_registers regi, *regs = ®i;
01415 char *range = RSTRING_PTR(str);
01416 regex_t *reg;
01417 int tmpreg;
01418
01419 if (pos > RSTRING_LEN(str) || pos < 0) {
01420 rb_backref_set(Qnil);
01421 return -1;
01422 }
01423
01424 reg = rb_reg_prepare_re(re, str);
01425 tmpreg = reg != RREGEXP(re)->ptr;
01426 if (!tmpreg) RREGEXP(re)->usecnt++;
01427
01428 match = rb_backref_get();
01429 if (!NIL_P(match)) {
01430 if (FL_TEST(match, MATCH_BUSY)) {
01431 match = Qnil;
01432 }
01433 else {
01434 regs = RMATCH_REGS(match);
01435 }
01436 }
01437 if (NIL_P(match)) {
01438 MEMZERO(regs, struct re_registers, 1);
01439 }
01440 if (!reverse) {
01441 range += RSTRING_LEN(str);
01442 }
01443 result = onig_search(reg,
01444 (UChar*)(RSTRING_PTR(str)),
01445 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01446 ((UChar*)(RSTRING_PTR(str)) + pos),
01447 ((UChar*)range),
01448 regs, ONIG_OPTION_NONE);
01449 if (!tmpreg) RREGEXP(re)->usecnt--;
01450 if (tmpreg) {
01451 if (RREGEXP(re)->usecnt) {
01452 onig_free(reg);
01453 }
01454 else {
01455 onig_free(RREGEXP(re)->ptr);
01456 RREGEXP(re)->ptr = reg;
01457 }
01458 }
01459 if (result < 0) {
01460 if (regs == ®i)
01461 onig_region_free(regs, 0);
01462 if (result == ONIG_MISMATCH) {
01463 rb_backref_set(Qnil);
01464 return result;
01465 }
01466 else {
01467 onig_errmsg_buffer err = "";
01468 onig_error_code_to_str((UChar*)err, (int)result);
01469 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
01470 }
01471 }
01472
01473 if (NIL_P(match)) {
01474 match = match_alloc(rb_cMatch);
01475 onig_region_copy(RMATCH_REGS(match), regs);
01476 onig_region_free(regs, 0);
01477 }
01478 else {
01479 if (rb_safe_level() >= 3)
01480 OBJ_TAINT(match);
01481 else
01482 FL_UNSET(match, FL_TAINT);
01483 }
01484
01485 RMATCH(match)->str = rb_str_new4(str);
01486 RMATCH(match)->regexp = re;
01487 RMATCH(match)->rmatch->char_offset_updated = 0;
01488 rb_backref_set(match);
01489
01490 OBJ_INFECT(match, re);
01491 OBJ_INFECT(match, str);
01492
01493 return result;
01494 }
01495
01496 VALUE
01497 rb_reg_nth_defined(int nth, VALUE match)
01498 {
01499 struct re_registers *regs;
01500 if (NIL_P(match)) return Qnil;
01501 match_check(match);
01502 regs = RMATCH_REGS(match);
01503 if (nth >= regs->num_regs) {
01504 return Qnil;
01505 }
01506 if (nth < 0) {
01507 nth += regs->num_regs;
01508 if (nth <= 0) return Qnil;
01509 }
01510 if (BEG(nth) == -1) return Qfalse;
01511 return Qtrue;
01512 }
01513
01514 VALUE
01515 rb_reg_nth_match(int nth, VALUE match)
01516 {
01517 VALUE str;
01518 long start, end, len;
01519 struct re_registers *regs;
01520
01521 if (NIL_P(match)) return Qnil;
01522 match_check(match);
01523 regs = RMATCH_REGS(match);
01524 if (nth >= regs->num_regs) {
01525 return Qnil;
01526 }
01527 if (nth < 0) {
01528 nth += regs->num_regs;
01529 if (nth <= 0) return Qnil;
01530 }
01531 start = BEG(nth);
01532 if (start == -1) return Qnil;
01533 end = END(nth);
01534 len = end - start;
01535 str = rb_str_subseq(RMATCH(match)->str, start, len);
01536 OBJ_INFECT(str, match);
01537 return str;
01538 }
01539
01540 VALUE
01541 rb_reg_last_match(VALUE match)
01542 {
01543 return rb_reg_nth_match(0, match);
01544 }
01545
01546
01547
01548
01549
01550
01551
01552
01553
01554
01555
01556
01557
01558 VALUE
01559 rb_reg_match_pre(VALUE match)
01560 {
01561 VALUE str;
01562 struct re_registers *regs;
01563
01564 if (NIL_P(match)) return Qnil;
01565 match_check(match);
01566 regs = RMATCH_REGS(match);
01567 if (BEG(0) == -1) return Qnil;
01568 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01569 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01570 return str;
01571 }
01572
01573
01574
01575
01576
01577
01578
01579
01580
01581
01582
01583
01584
01585 VALUE
01586 rb_reg_match_post(VALUE match)
01587 {
01588 VALUE str;
01589 long pos;
01590 struct re_registers *regs;
01591
01592 if (NIL_P(match)) return Qnil;
01593 match_check(match);
01594 regs = RMATCH_REGS(match);
01595 if (BEG(0) == -1) return Qnil;
01596 str = RMATCH(match)->str;
01597 pos = END(0);
01598 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01599 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01600 return str;
01601 }
01602
01603 VALUE
01604 rb_reg_match_last(VALUE match)
01605 {
01606 int i;
01607 struct re_registers *regs;
01608
01609 if (NIL_P(match)) return Qnil;
01610 match_check(match);
01611 regs = RMATCH_REGS(match);
01612 if (BEG(0) == -1) return Qnil;
01613
01614 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01615 ;
01616 if (i == 0) return Qnil;
01617 return rb_reg_nth_match(i, match);
01618 }
01619
01620 static VALUE
01621 last_match_getter(void)
01622 {
01623 return rb_reg_last_match(rb_backref_get());
01624 }
01625
01626 static VALUE
01627 prematch_getter(void)
01628 {
01629 return rb_reg_match_pre(rb_backref_get());
01630 }
01631
01632 static VALUE
01633 postmatch_getter(void)
01634 {
01635 return rb_reg_match_post(rb_backref_get());
01636 }
01637
01638 static VALUE
01639 last_paren_match_getter(void)
01640 {
01641 return rb_reg_match_last(rb_backref_get());
01642 }
01643
01644 static VALUE
01645 match_array(VALUE match, int start)
01646 {
01647 struct re_registers *regs;
01648 VALUE ary;
01649 VALUE target;
01650 int i;
01651 int taint = OBJ_TAINTED(match);
01652
01653 match_check(match);
01654 regs = RMATCH_REGS(match);
01655 ary = rb_ary_new2(regs->num_regs);
01656 target = RMATCH(match)->str;
01657
01658 for (i=start; i<regs->num_regs; i++) {
01659 if (regs->beg[i] == -1) {
01660 rb_ary_push(ary, Qnil);
01661 }
01662 else {
01663 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01664 if (taint) OBJ_TAINT(str);
01665 rb_ary_push(ary, str);
01666 }
01667 }
01668 return ary;
01669 }
01670
01671
01672
01673
01674
01675
01676
01677
01678
01679
01680
01681
01682
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695
01696
01697
01698 static VALUE
01699 match_to_a(VALUE match)
01700 {
01701 return match_array(match, 0);
01702 }
01703
01704
01705
01706
01707
01708
01709
01710
01711
01712
01713
01714
01715
01716
01717 static VALUE
01718 match_captures(VALUE match)
01719 {
01720 return match_array(match, 1);
01721 }
01722
01723 static int
01724 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01725 {
01726 return onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01727 (const unsigned char* )name, (const unsigned char* )name_end, regs);
01728 }
01729
01730 NORETURN(static void name_to_backref_error(VALUE name));
01731 static void
01732 name_to_backref_error(VALUE name)
01733 {
01734 rb_raise(rb_eIndexError, "undefined group name reference: % "PRIsVALUE,
01735 name);
01736 }
01737
01738
01739
01740
01741
01742
01743
01744
01745
01746
01747
01748
01749
01750
01751
01752
01753
01754
01755
01756
01757
01758
01759
01760
01761
01762
01763
01764
01765 static VALUE
01766 match_aref(int argc, VALUE *argv, VALUE match)
01767 {
01768 VALUE idx, rest;
01769
01770 match_check(match);
01771 rb_scan_args(argc, argv, "11", &idx, &rest);
01772
01773 if (NIL_P(rest)) {
01774 if (FIXNUM_P(idx)) {
01775 if (FIX2INT(idx) >= 0) {
01776 return rb_reg_nth_match(FIX2INT(idx), match);
01777 }
01778 }
01779 else {
01780 const char *p;
01781 int num;
01782
01783 switch (TYPE(idx)) {
01784 case T_SYMBOL:
01785 idx = rb_id2str(SYM2ID(idx));
01786
01787 case T_STRING:
01788 p = StringValuePtr(idx);
01789 if (!rb_enc_compatible(RREGEXP(RMATCH(match)->regexp)->src, idx) ||
01790 (num = name_to_backref_number(RMATCH_REGS(match), RMATCH(match)->regexp,
01791 p, p + RSTRING_LEN(idx))) < 1) {
01792 name_to_backref_error(idx);
01793 }
01794 return rb_reg_nth_match(num, match);
01795
01796 default:
01797 break;
01798 }
01799 }
01800 }
01801
01802 return rb_ary_aref(argc, argv, match_to_a(match));
01803 }
01804
01805 static VALUE
01806 match_entry(VALUE match, long n)
01807 {
01808
01809 return rb_reg_nth_match((int)n, match);
01810 }
01811
01812
01813
01814
01815
01816
01817
01818
01819
01820
01821
01822
01823
01824
01825
01826 static VALUE
01827 match_values_at(int argc, VALUE *argv, VALUE match)
01828 {
01829 struct re_registers *regs;
01830
01831 match_check(match);
01832 regs = RMATCH_REGS(match);
01833 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01834 }
01835
01836
01837
01838
01839
01840
01841
01842
01843
01844
01845
01846
01847 static VALUE
01848 match_to_s(VALUE match)
01849 {
01850 VALUE str = rb_reg_last_match(match);
01851
01852 match_check(match);
01853 if (NIL_P(str)) str = rb_str_new(0,0);
01854 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01855 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01856 return str;
01857 }
01858
01859
01860
01861
01862
01863
01864
01865
01866
01867
01868
01869
01870 static VALUE
01871 match_string(VALUE match)
01872 {
01873 match_check(match);
01874 return RMATCH(match)->str;
01875 }
01876
/*
 * Name of a capture group as recorded by match_inspect_name_iter():
 * `name` points at the first byte of the name and `len` is its length
 * (name_end - name).  match_inspect() zero-fills its table of these and
 * treats an entry whose `name` is still null as unnamed.
 */
struct backref_name_tag {
    const UChar *name;
    long len;
};
01881
01882 static int
01883 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01884 int back_num, int *back_refs, OnigRegex regex, void *arg0)
01885 {
01886 struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01887 int i;
01888
01889 for (i = 0; i < back_num; i++) {
01890 arg[back_refs[i]].name = name;
01891 arg[back_refs[i]].len = name_end - name;
01892 }
01893 return 0;
01894 }
01895
01896
01897
01898
01899
01900
01901
01902
01903
01904
01905
01906
01907
01908
01909
01910
01911
01912
01913
01914
01915
01916 static VALUE
01917 match_inspect(VALUE match)
01918 {
01919 const char *cname = rb_obj_classname(match);
01920 VALUE str;
01921 int i;
01922 struct re_registers *regs = RMATCH_REGS(match);
01923 int num_regs = regs->num_regs;
01924 struct backref_name_tag *names;
01925 VALUE regexp = RMATCH(match)->regexp;
01926
01927 if (regexp == 0) {
01928 return rb_sprintf("#<%s:%p>", cname, (void*)match);
01929 }
01930
01931 names = ALLOCA_N(struct backref_name_tag, num_regs);
01932 MEMZERO(names, struct backref_name_tag, num_regs);
01933
01934 onig_foreach_name(RREGEXP(regexp)->ptr,
01935 match_inspect_name_iter, names);
01936
01937 str = rb_str_buf_new2("#<");
01938 rb_str_buf_cat2(str, cname);
01939
01940 for (i = 0; i < num_regs; i++) {
01941 VALUE v;
01942 rb_str_buf_cat2(str, " ");
01943 if (0 < i) {
01944 if (names[i].name)
01945 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01946 else {
01947 rb_str_catf(str, "%d", i);
01948 }
01949 rb_str_buf_cat2(str, ":");
01950 }
01951 v = rb_reg_nth_match(i, match);
01952 if (v == Qnil)
01953 rb_str_buf_cat2(str, "nil");
01954 else
01955 rb_str_buf_append(str, rb_str_inspect(v));
01956 }
01957 rb_str_buf_cat2(str, ">");
01958
01959 return str;
01960 }
01961
/* Class object that rb_reg_alloc() hands to rb_reg_s_alloc() for new regexps. */
VALUE rb_cRegexp;
01963
01964 static int
01965 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
01966 {
01967 const char *p = *pp;
01968 int code;
01969 int meta_prefix = 0, ctrl_prefix = 0;
01970 size_t len;
01971
01972 if (p == end || *p++ != '\\') {
01973 errcpy(err, "too short escaped multibyte character");
01974 return -1;
01975 }
01976
01977 again:
01978 if (p == end) {
01979 errcpy(err, "too short escape sequence");
01980 return -1;
01981 }
01982 switch (*p++) {
01983 case '\\': code = '\\'; break;
01984 case 'n': code = '\n'; break;
01985 case 't': code = '\t'; break;
01986 case 'r': code = '\r'; break;
01987 case 'f': code = '\f'; break;
01988 case 'v': code = '\013'; break;
01989 case 'a': code = '\007'; break;
01990 case 'e': code = '\033'; break;
01991
01992
01993 case '0': case '1': case '2': case '3':
01994 case '4': case '5': case '6': case '7':
01995 p--;
01996 code = scan_oct(p, end < p+3 ? end-p : 3, &len);
01997 p += len;
01998 break;
01999
02000 case 'x':
02001 code = scan_hex(p, end < p+2 ? end-p : 2, &len);
02002 if (len < 1) {
02003 errcpy(err, "invalid hex escape");
02004 return -1;
02005 }
02006 p += len;
02007 break;
02008
02009 case 'M':
02010 if (meta_prefix) {
02011 errcpy(err, "duplicate meta escape");
02012 return -1;
02013 }
02014 meta_prefix = 1;
02015 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
02016 if (*p == '\\') {
02017 p++;
02018 goto again;
02019 }
02020 else {
02021 code = *p++;
02022 break;
02023 }
02024 }
02025 errcpy(err, "too short meta escape");
02026 return -1;
02027
02028 case 'C':
02029 if (p == end || *p++ != '-') {
02030 errcpy(err, "too short control escape");
02031 return -1;
02032 }
02033 case 'c':
02034 if (ctrl_prefix) {
02035 errcpy(err, "duplicate control escape");
02036 return -1;
02037 }
02038 ctrl_prefix = 1;
02039 if (p < end && (*p & 0x80) == 0) {
02040 if (*p == '\\') {
02041 p++;
02042 goto again;
02043 }
02044 else {
02045 code = *p++;
02046 break;
02047 }
02048 }
02049 errcpy(err, "too short control escape");
02050 return -1;
02051
02052 default:
02053 errcpy(err, "unexpected escape sequence");
02054 return -1;
02055 }
02056 if (code < 0 || 0xff < code) {
02057 errcpy(err, "invalid escape code");
02058 return -1;
02059 }
02060
02061 if (ctrl_prefix)
02062 code &= 0x1f;
02063 if (meta_prefix)
02064 code |= 0x80;
02065
02066 *pp = p;
02067 return code;
02068 }
02069
02070 static int
02071 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02072 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02073 {
02074 const char *p = *pp;
02075 int chmaxlen = rb_enc_mbmaxlen(enc);
02076 char *chbuf = ALLOCA_N(char, chmaxlen);
02077 int chlen = 0;
02078 int byte;
02079 int l;
02080
02081 memset(chbuf, 0, chmaxlen);
02082
02083 byte = read_escaped_byte(&p, end, err);
02084 if (byte == -1) {
02085 return -1;
02086 }
02087
02088 chbuf[chlen++] = byte;
02089 while (chlen < chmaxlen &&
02090 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02091 byte = read_escaped_byte(&p, end, err);
02092 if (byte == -1) {
02093 return -1;
02094 }
02095 chbuf[chlen++] = byte;
02096 }
02097
02098 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02099 if (MBCLEN_INVALID_P(l)) {
02100 errcpy(err, "invalid multibyte escape");
02101 return -1;
02102 }
02103 if (1 < chlen || (chbuf[0] & 0x80)) {
02104 rb_str_buf_cat(buf, chbuf, chlen);
02105
02106 if (*encp == 0)
02107 *encp = enc;
02108 else if (*encp != enc) {
02109 errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02110 return -1;
02111 }
02112 }
02113 else {
02114 char escbuf[5];
02115 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02116 rb_str_buf_cat(buf, escbuf, 4);
02117 }
02118 *pp = p;
02119 return 0;
02120 }
02121
02122 static int
02123 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02124 {
02125 if ((0xd800 <= code && code <= 0xdfff) ||
02126 0x10ffff < code) {
02127 errcpy(err, "invalid Unicode range");
02128 return -1;
02129 }
02130 return 0;
02131 }
02132
02133 static int
02134 append_utf8(unsigned long uv,
02135 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02136 {
02137 if (check_unicode_range(uv, err) != 0)
02138 return -1;
02139 if (uv < 0x80) {
02140 char escbuf[5];
02141 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02142 rb_str_buf_cat(buf, escbuf, 4);
02143 }
02144 else {
02145 int len;
02146 char utf8buf[6];
02147 len = rb_uv_to_utf8(utf8buf, uv);
02148 rb_str_buf_cat(buf, utf8buf, len);
02149
02150 if (*encp == 0)
02151 *encp = rb_utf8_encoding();
02152 else if (*encp != rb_utf8_encoding()) {
02153 errcpy(err, "UTF-8 character in non UTF-8 regexp");
02154 return -1;
02155 }
02156 }
02157 return 0;
02158 }
02159
02160 static int
02161 unescape_unicode_list(const char **pp, const char *end,
02162 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02163 {
02164 const char *p = *pp;
02165 int has_unicode = 0;
02166 unsigned long code;
02167 size_t len;
02168
02169 while (p < end && ISSPACE(*p)) p++;
02170
02171 while (1) {
02172 code = ruby_scan_hex(p, end-p, &len);
02173 if (len == 0)
02174 break;
02175 if (6 < len) {
02176 errcpy(err, "invalid Unicode range");
02177 return -1;
02178 }
02179 p += len;
02180 if (append_utf8(code, buf, encp, err) != 0)
02181 return -1;
02182 has_unicode = 1;
02183
02184 while (p < end && ISSPACE(*p)) p++;
02185 }
02186
02187 if (has_unicode == 0) {
02188 errcpy(err, "invalid Unicode list");
02189 return -1;
02190 }
02191
02192 *pp = p;
02193
02194 return 0;
02195 }
02196
02197 static int
02198 unescape_unicode_bmp(const char **pp, const char *end,
02199 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02200 {
02201 const char *p = *pp;
02202 size_t len;
02203 unsigned long code;
02204
02205 if (end < p+4) {
02206 errcpy(err, "invalid Unicode escape");
02207 return -1;
02208 }
02209 code = ruby_scan_hex(p, 4, &len);
02210 if (len != 4) {
02211 errcpy(err, "invalid Unicode escape");
02212 return -1;
02213 }
02214 if (append_utf8(code, buf, encp, err) != 0)
02215 return -1;
02216 *pp = p + 4;
02217 return 0;
02218 }
02219
02220 static int
02221 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
02222 VALUE buf, rb_encoding **encp, int *has_property,
02223 onig_errmsg_buffer err)
02224 {
02225 char c;
02226 char smallbuf[2];
02227
02228 while (p < end) {
02229 int chlen = rb_enc_precise_mbclen(p, end, enc);
02230 if (!MBCLEN_CHARFOUND_P(chlen)) {
02231 errcpy(err, "invalid multibyte character");
02232 return -1;
02233 }
02234 chlen = MBCLEN_CHARFOUND_LEN(chlen);
02235 if (1 < chlen || (*p & 0x80)) {
02236 rb_str_buf_cat(buf, p, chlen);
02237 p += chlen;
02238 if (*encp == 0)
02239 *encp = enc;
02240 else if (*encp != enc) {
02241 errcpy(err, "non ASCII character in UTF-8 regexp");
02242 return -1;
02243 }
02244 continue;
02245 }
02246
02247 switch (c = *p++) {
02248 case '\\':
02249 if (p == end) {
02250 errcpy(err, "too short escape sequence");
02251 return -1;
02252 }
02253 switch (c = *p++) {
02254 case '1': case '2': case '3':
02255 case '4': case '5': case '6': case '7':
02256 {
02257 size_t octlen;
02258 if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
02259
02260
02261
02262 goto escape_asis;
02263 }
02264 }
02265
02266
02267 case '0':
02268
02269 case 'x':
02270 case 'c':
02271 case 'C':
02272 case 'M':
02273 p = p-2;
02274 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
02275 return -1;
02276 break;
02277
02278 case 'u':
02279 if (p == end) {
02280 errcpy(err, "too short escape sequence");
02281 return -1;
02282 }
02283 if (*p == '{') {
02284
02285 p++;
02286 if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
02287 return -1;
02288 if (p == end || *p++ != '}') {
02289 errcpy(err, "invalid Unicode list");
02290 return -1;
02291 }
02292 break;
02293 }
02294 else {
02295
02296 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
02297 return -1;
02298 break;
02299 }
02300
02301 case 'p':
02302 case 'P':
02303 if (!*encp) {
02304 *has_property = 1;
02305 }
02306 goto escape_asis;
02307
02308 default:
02309 escape_asis:
02310 smallbuf[0] = '\\';
02311 smallbuf[1] = c;
02312 rb_str_buf_cat(buf, smallbuf, 2);
02313 break;
02314 }
02315 break;
02316
02317 default:
02318 rb_str_buf_cat(buf, &c, 1);
02319 break;
02320 }
02321 }
02322
02323 return 0;
02324 }
02325
02326 static VALUE
02327 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02328 rb_encoding **fixed_enc, onig_errmsg_buffer err)
02329 {
02330 VALUE buf;
02331 int has_property = 0;
02332
02333 buf = rb_str_buf_new(0);
02334
02335 if (rb_enc_asciicompat(enc))
02336 *fixed_enc = 0;
02337 else {
02338 *fixed_enc = enc;
02339 rb_enc_associate(buf, enc);
02340 }
02341
02342 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02343 return Qnil;
02344
02345 if (has_property && !*fixed_enc) {
02346 *fixed_enc = enc;
02347 }
02348
02349 if (*fixed_enc) {
02350 rb_enc_associate(buf, *fixed_enc);
02351 }
02352
02353 return buf;
02354 }
02355
02356 VALUE
02357 rb_reg_check_preprocess(VALUE str)
02358 {
02359 rb_encoding *fixed_enc = 0;
02360 onig_errmsg_buffer err = "";
02361 VALUE buf;
02362 char *p, *end;
02363 rb_encoding *enc;
02364
02365 StringValue(str);
02366 p = RSTRING_PTR(str);
02367 end = p + RSTRING_LEN(str);
02368 enc = rb_enc_get(str);
02369
02370 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02371 RB_GC_GUARD(str);
02372
02373 if (buf == Qnil) {
02374 return rb_reg_error_desc(str, 0, err);
02375 }
02376 return Qnil;
02377 }
02378
02379 static VALUE
02380 rb_reg_preprocess_dregexp(VALUE ary, int options)
02381 {
02382 rb_encoding *fixed_enc = 0;
02383 rb_encoding *regexp_enc = 0;
02384 onig_errmsg_buffer err = "";
02385 int i;
02386 VALUE result = 0;
02387 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02388
02389 if (RARRAY_LEN(ary) == 0) {
02390 rb_raise(rb_eArgError, "no arguments given");
02391 }
02392
02393 for (i = 0; i < RARRAY_LEN(ary); i++) {
02394 VALUE str = RARRAY_AREF(ary, i);
02395 VALUE buf;
02396 char *p, *end;
02397 rb_encoding *src_enc;
02398
02399 src_enc = rb_enc_get(str);
02400 if (options & ARG_ENCODING_NONE &&
02401 src_enc != ascii8bit) {
02402 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02403 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02404 else
02405 src_enc = ascii8bit;
02406 }
02407
02408 StringValue(str);
02409 p = RSTRING_PTR(str);
02410 end = p + RSTRING_LEN(str);
02411
02412 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02413
02414 if (buf == Qnil)
02415 rb_raise(rb_eArgError, "%s", err);
02416
02417 if (fixed_enc != 0) {
02418 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02419 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02420 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02421 }
02422 regexp_enc = fixed_enc;
02423 }
02424
02425 if (!result)
02426 result = rb_str_new3(str);
02427 else
02428 rb_str_buf_append(result, str);
02429 }
02430 if (regexp_enc) {
02431 rb_enc_associate(result, regexp_enc);
02432 }
02433
02434 return result;
02435 }
02436
02437 static int
02438 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02439 int options, onig_errmsg_buffer err,
02440 const char *sourcefile, int sourceline)
02441 {
02442 struct RRegexp *re = RREGEXP(obj);
02443 VALUE unescaped;
02444 rb_encoding *fixed_enc = 0;
02445 rb_encoding *a_enc = rb_ascii8bit_encoding();
02446
02447 rb_check_frozen(obj);
02448 if (FL_TEST(obj, REG_LITERAL))
02449 rb_raise(rb_eSecurityError, "can't modify literal regexp");
02450 if (re->ptr)
02451 rb_raise(rb_eTypeError, "already initialized regexp");
02452 re->ptr = 0;
02453
02454 if (rb_enc_dummy_p(enc)) {
02455 errcpy(err, "can't make regexp with dummy encoding");
02456 return -1;
02457 }
02458
02459 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02460 if (unescaped == Qnil)
02461 return -1;
02462
02463 if (fixed_enc) {
02464 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02465 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02466 errcpy(err, "incompatible character encoding");
02467 return -1;
02468 }
02469 if (fixed_enc != a_enc) {
02470 options |= ARG_ENCODING_FIXED;
02471 enc = fixed_enc;
02472 }
02473 }
02474 else if (!(options & ARG_ENCODING_FIXED)) {
02475 enc = rb_usascii_encoding();
02476 }
02477
02478 rb_enc_associate((VALUE)re, enc);
02479 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02480 re->basic.flags |= KCODE_FIXED;
02481 }
02482 if (options & ARG_ENCODING_NONE) {
02483 re->basic.flags |= REG_ENCODING_NONE;
02484 }
02485
02486 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02487 options & ARG_REG_OPTION_MASK, err,
02488 sourcefile, sourceline);
02489 if (!re->ptr) return -1;
02490 RB_OBJ_WRITE(obj, &re->src, rb_fstring(rb_enc_str_new(s, len, enc)));
02491 RB_GC_GUARD(unescaped);
02492 return 0;
02493 }
02494
02495 static int
02496 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02497 const char *sourcefile, int sourceline)
02498 {
02499 int ret;
02500 rb_encoding *enc = rb_enc_get(str);
02501 if (options & ARG_ENCODING_NONE) {
02502 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02503 if (enc != ascii8bit) {
02504 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02505 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02506 return -1;
02507 }
02508 enc = ascii8bit;
02509 }
02510 }
02511 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02512 options, err, sourcefile, sourceline);
02513 OBJ_INFECT(obj, str);
02514 RB_GC_GUARD(str);
02515 return ret;
02516 }
02517
02518 static VALUE
02519 rb_reg_s_alloc(VALUE klass)
02520 {
02521 NEWOBJ_OF(re, struct RRegexp, klass, T_REGEXP | (RGENGC_WB_PROTECTED_REGEXP ? FL_WB_PROTECTED : 0));
02522
02523 re->ptr = 0;
02524 RB_OBJ_WRITE(re, &re->src, 0);
02525 re->usecnt = 0;
02526
02527 return (VALUE)re;
02528 }
02529
02530 VALUE
02531 rb_reg_alloc(void)
02532 {
02533 return rb_reg_s_alloc(rb_cRegexp);
02534 }
02535
02536 VALUE
02537 rb_reg_new_str(VALUE s, int options)
02538 {
02539 return rb_reg_init_str(rb_reg_alloc(), s, options);
02540 }
02541
02542 VALUE
02543 rb_reg_init_str(VALUE re, VALUE s, int options)
02544 {
02545 onig_errmsg_buffer err = "";
02546
02547 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02548 rb_reg_raise_str(s, options, err);
02549 }
02550
02551 return re;
02552 }
02553
02554 VALUE
02555 rb_reg_new_ary(VALUE ary, int opt)
02556 {
02557 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02558 }
02559
02560 VALUE
02561 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02562 {
02563 VALUE re = rb_reg_alloc();
02564 onig_errmsg_buffer err = "";
02565
02566 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02567 rb_enc_reg_raise(s, len, enc, options, err);
02568 }
02569
02570 return re;
02571 }
02572
02573 VALUE
02574 rb_reg_new(const char *s, long len, int options)
02575 {
02576 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02577 }
02578
02579 VALUE
02580 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02581 {
02582 VALUE re = rb_reg_alloc();
02583 onig_errmsg_buffer err = "";
02584
02585 if (!str) str = rb_str_new(0,0);
02586 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02587 rb_set_errinfo(rb_reg_error_desc(str, options, err));
02588 return Qnil;
02589 }
02590 FL_SET(re, REG_LITERAL);
02591 return re;
02592 }
02593
02594 static VALUE reg_cache;
02595
02596 VALUE
02597 rb_reg_regcomp(VALUE str)
02598 {
02599 volatile VALUE save_str = str;
02600 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02601 && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02602 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02603 return reg_cache;
02604
02605 return reg_cache = rb_reg_new_str(save_str, 0);
02606 }
02607
02608 static st_index_t reg_hash(VALUE re);
02609
02610
02611
02612
02613
02614
02615
02616 static VALUE
02617 rb_reg_hash(VALUE re)
02618 {
02619 st_index_t hashval = reg_hash(re);
02620 return LONG2FIX(hashval);
02621 }
02622
02623 static st_index_t
02624 reg_hash(VALUE re)
02625 {
02626 st_index_t hashval;
02627
02628 rb_reg_check(re);
02629 hashval = RREGEXP(re)->ptr->options;
02630 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02631 return rb_hash_end(hashval);
02632 }
02633
02634
02635
02636
02637
02638
02639
02640
02641
02642
02643
02644
02645
02646
02647
02648
02649
02650 static VALUE
02651 rb_reg_equal(VALUE re1, VALUE re2)
02652 {
02653 if (re1 == re2) return Qtrue;
02654 if (!RB_TYPE_P(re2, T_REGEXP)) return Qfalse;
02655 rb_reg_check(re1); rb_reg_check(re2);
02656 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02657 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02658 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02659 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02660 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02661 return Qtrue;
02662 }
02663 return Qfalse;
02664 }
02665
02666
02667
02668
02669
02670
02671
02672
02673
02674 static VALUE
02675 match_hash(VALUE match)
02676 {
02677 const struct re_registers *regs;
02678 st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02679
02680 rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02681 regs = RMATCH_REGS(match);
02682 hashval = rb_hash_uint(hashval, regs->num_regs);
02683 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02684 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02685 hashval = rb_hash_end(hashval);
02686 return LONG2FIX(hashval);
02687 }
02688
02689
02690
02691
02692
02693
02694
02695
02696
02697
02698 static VALUE
02699 match_equal(VALUE match1, VALUE match2)
02700 {
02701 const struct re_registers *regs1, *regs2;
02702 if (match1 == match2) return Qtrue;
02703 if (!RB_TYPE_P(match2, T_MATCH)) return Qfalse;
02704 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02705 if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02706 regs1 = RMATCH_REGS(match1);
02707 regs2 = RMATCH_REGS(match2);
02708 if (regs1->num_regs != regs2->num_regs) return Qfalse;
02709 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02710 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02711 return Qtrue;
02712 }
02713
02714 static VALUE
02715 reg_operand(VALUE s, int check)
02716 {
02717 if (SYMBOL_P(s)) {
02718 return rb_sym_to_s(s);
02719 }
02720 else {
02721 return (check ? rb_str_to_str : rb_check_string_type)(s);
02722 }
02723 }
02724
02725 static long
02726 reg_match_pos(VALUE re, VALUE *strp, long pos)
02727 {
02728 VALUE str = *strp;
02729
02730 if (NIL_P(str)) {
02731 rb_backref_set(Qnil);
02732 return -1;
02733 }
02734 *strp = str = reg_operand(str, TRUE);
02735 if (pos != 0) {
02736 if (pos < 0) {
02737 VALUE l = rb_str_length(str);
02738 pos += NUM2INT(l);
02739 if (pos < 0) {
02740 return pos;
02741 }
02742 }
02743 pos = rb_str_offset(str, pos);
02744 }
02745 return rb_reg_search(re, str, pos, 0);
02746 }
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764
02765
02766
02767
02768
02769
02770
02771
02772
02773
02774
02775
02776
02777
02778
02779
02780
02781
02782
02783
02784
02785
02786
02787
02788
02789
02790
02791
02792
02793
02794
02795
02796 VALUE
02797 rb_reg_match(VALUE re, VALUE str)
02798 {
02799 long pos = reg_match_pos(re, &str, 0);
02800 if (pos < 0) return Qnil;
02801 pos = rb_str_sublen(str, pos);
02802 return LONG2FIX(pos);
02803 }
02804
02805
02806
02807
02808
02809
02810
02811
02812
02813
02814
02815
02816
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826 VALUE
02827 rb_reg_eqq(VALUE re, VALUE str)
02828 {
02829 long start;
02830
02831 str = reg_operand(str, FALSE);
02832 if (NIL_P(str)) {
02833 rb_backref_set(Qnil);
02834 return Qfalse;
02835 }
02836 start = rb_reg_search(re, str, 0, 0);
02837 if (start < 0) {
02838 return Qfalse;
02839 }
02840 return Qtrue;
02841 }
02842
02843
02844
02845
02846
02847
02848
02849
02850
02851
02852
02853
02854
02855 VALUE
02856 rb_reg_match2(VALUE re)
02857 {
02858 long start;
02859 VALUE line = rb_lastline_get();
02860
02861 if (!RB_TYPE_P(line, T_STRING)) {
02862 rb_backref_set(Qnil);
02863 return Qnil;
02864 }
02865
02866 start = rb_reg_search(re, line, 0, 0);
02867 if (start < 0) {
02868 return Qnil;
02869 }
02870 start = rb_str_sublen(line, start);
02871 return LONG2FIX(start);
02872 }
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896
02897
02898
02899
02900
02901
02902
02903 static VALUE
02904 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02905 {
02906 VALUE result, str, initpos;
02907 long pos;
02908
02909 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02910 pos = NUM2LONG(initpos);
02911 }
02912 else {
02913 pos = 0;
02914 }
02915
02916 pos = reg_match_pos(re, &str, pos);
02917 if (pos < 0) {
02918 rb_backref_set(Qnil);
02919 return Qnil;
02920 }
02921 result = rb_backref_get();
02922 rb_match_busy(result);
02923 if (!NIL_P(result) && rb_block_given_p()) {
02924 return rb_yield(result);
02925 }
02926 return result;
02927 }
02928
02929
02930
02931
02932
02933
02934
02935
02936
02937
02938
02939
02940
02941
02942
02943
02944
02945
02946
02947
02948
02949
02950
02951
02952
02953
02954
02955
02956
02957
02958 static VALUE
02959 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02960 {
02961 onig_errmsg_buffer err = "";
02962 int flags = 0;
02963 VALUE str;
02964 rb_encoding *enc;
02965 const char *ptr;
02966 long len;
02967
02968 rb_check_arity(argc, 1, 3);
02969 if (RB_TYPE_P(argv[0], T_REGEXP)) {
02970 VALUE re = argv[0];
02971
02972 if (argc > 1) {
02973 rb_warn("flags ignored");
02974 }
02975 rb_reg_check(re);
02976 flags = rb_reg_options(re);
02977 ptr = RREGEXP_SRC_PTR(re);
02978 len = RREGEXP_SRC_LEN(re);
02979 enc = rb_enc_get(re);
02980 if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02981 str = rb_enc_str_new(ptr, len, enc);
02982 rb_reg_raise_str(str, flags, err);
02983 }
02984 }
02985 else {
02986 if (argc >= 2) {
02987 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02988 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02989 }
02990 enc = 0;
02991 if (argc == 3 && !NIL_P(argv[2])) {
02992 char *kcode = StringValuePtr(argv[2]);
02993 if (kcode[0] == 'n' || kcode[0] == 'N') {
02994 enc = rb_ascii8bit_encoding();
02995 flags |= ARG_ENCODING_NONE;
02996 }
02997 else {
02998 rb_warn("encoding option is ignored - %s", kcode);
02999 }
03000 }
03001 str = argv[0];
03002 ptr = StringValuePtr(str);
03003 if (enc
03004 ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
03005 : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
03006 rb_reg_raise_str(str, flags, err);
03007 }
03008 }
03009 return self;
03010 }
03011
03012 VALUE
03013 rb_reg_quote(VALUE str)
03014 {
03015 rb_encoding *enc = rb_enc_get(str);
03016 char *s, *send, *t;
03017 VALUE tmp;
03018 int c, clen;
03019 int ascii_only = rb_enc_str_asciionly_p(str);
03020
03021 s = RSTRING_PTR(str);
03022 send = s + RSTRING_LEN(str);
03023 while (s < send) {
03024 c = rb_enc_ascget(s, send, &clen, enc);
03025 if (c == -1) {
03026 s += mbclen(s, send, enc);
03027 continue;
03028 }
03029 switch (c) {
03030 case '[': case ']': case '{': case '}':
03031 case '(': case ')': case '|': case '-':
03032 case '*': case '.': case '\\':
03033 case '?': case '+': case '^': case '$':
03034 case ' ': case '#':
03035 case '\t': case '\f': case '\v': case '\n': case '\r':
03036 goto meta_found;
03037 }
03038 s += clen;
03039 }
03040 tmp = rb_str_new3(str);
03041 if (ascii_only) {
03042 rb_enc_associate(tmp, rb_usascii_encoding());
03043 }
03044 return tmp;
03045
03046 meta_found:
03047 tmp = rb_str_new(0, RSTRING_LEN(str)*2);
03048 if (ascii_only) {
03049 rb_enc_associate(tmp, rb_usascii_encoding());
03050 }
03051 else {
03052 rb_enc_copy(tmp, str);
03053 }
03054 t = RSTRING_PTR(tmp);
03055
03056 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
03057 t += s - RSTRING_PTR(str);
03058
03059 while (s < send) {
03060 c = rb_enc_ascget(s, send, &clen, enc);
03061 if (c == -1) {
03062 int n = mbclen(s, send, enc);
03063
03064 while (n--)
03065 *t++ = *s++;
03066 continue;
03067 }
03068 s += clen;
03069 switch (c) {
03070 case '[': case ']': case '{': case '}':
03071 case '(': case ')': case '|': case '-':
03072 case '*': case '.': case '\\':
03073 case '?': case '+': case '^': case '$':
03074 case '#':
03075 t += rb_enc_mbcput('\\', t, enc);
03076 break;
03077 case ' ':
03078 t += rb_enc_mbcput('\\', t, enc);
03079 t += rb_enc_mbcput(' ', t, enc);
03080 continue;
03081 case '\t':
03082 t += rb_enc_mbcput('\\', t, enc);
03083 t += rb_enc_mbcput('t', t, enc);
03084 continue;
03085 case '\n':
03086 t += rb_enc_mbcput('\\', t, enc);
03087 t += rb_enc_mbcput('n', t, enc);
03088 continue;
03089 case '\r':
03090 t += rb_enc_mbcput('\\', t, enc);
03091 t += rb_enc_mbcput('r', t, enc);
03092 continue;
03093 case '\f':
03094 t += rb_enc_mbcput('\\', t, enc);
03095 t += rb_enc_mbcput('f', t, enc);
03096 continue;
03097 case '\v':
03098 t += rb_enc_mbcput('\\', t, enc);
03099 t += rb_enc_mbcput('v', t, enc);
03100 continue;
03101 }
03102 t += rb_enc_mbcput(c, t, enc);
03103 }
03104 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03105 OBJ_INFECT(tmp, str);
03106 return tmp;
03107 }
03108
03109
03110
03111
03112
03113
03114
03115
03116
03117
03118
03119
03120
03121
03122
03123
03124 static VALUE
03125 rb_reg_s_quote(VALUE c, VALUE str)
03126 {
03127 return rb_reg_quote(reg_operand(str, TRUE));
03128 }
03129
03130 int
03131 rb_reg_options(VALUE re)
03132 {
03133 int options;
03134
03135 rb_reg_check(re);
03136 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03137 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03138 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03139 return options;
03140 }
03141
03142 VALUE
03143 rb_check_regexp_type(VALUE re)
03144 {
03145 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03146 }
03147
03148
03149
03150
03151
03152
03153
03154
03155
03156
03157
03158
03159
03160
03161
03162
03163
03164
03165 static VALUE
03166 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03167 {
03168 return rb_check_regexp_type(re);
03169 }
03170
03171 static VALUE
03172 rb_reg_s_union(VALUE self, VALUE args0)
03173 {
03174 long argc = RARRAY_LEN(args0);
03175
03176 if (argc == 0) {
03177 VALUE args[1];
03178 args[0] = rb_str_new2("(?!)");
03179 return rb_class_new_instance(1, args, rb_cRegexp);
03180 }
03181 else if (argc == 1) {
03182 VALUE arg = rb_ary_entry(args0, 0);
03183 VALUE re = rb_check_regexp_type(arg);
03184 if (!NIL_P(re))
03185 return re;
03186 else {
03187 VALUE quoted;
03188 quoted = rb_reg_s_quote(Qnil, arg);
03189 return rb_reg_new_str(quoted, 0);
03190 }
03191 }
03192 else {
03193 int i;
03194 VALUE source = rb_str_buf_new(0);
03195 rb_encoding *result_enc;
03196
03197 int has_asciionly = 0;
03198 rb_encoding *has_ascii_compat_fixed = 0;
03199 rb_encoding *has_ascii_incompat = 0;
03200
03201 for (i = 0; i < argc; i++) {
03202 volatile VALUE v;
03203 VALUE e = rb_ary_entry(args0, i);
03204
03205 if (0 < i)
03206 rb_str_buf_cat_ascii(source, "|");
03207
03208 v = rb_check_regexp_type(e);
03209 if (!NIL_P(v)) {
03210 rb_encoding *enc = rb_enc_get(v);
03211 if (!rb_enc_asciicompat(enc)) {
03212 if (!has_ascii_incompat)
03213 has_ascii_incompat = enc;
03214 else if (has_ascii_incompat != enc)
03215 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03216 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03217 }
03218 else if (rb_reg_fixed_encoding_p(v)) {
03219 if (!has_ascii_compat_fixed)
03220 has_ascii_compat_fixed = enc;
03221 else if (has_ascii_compat_fixed != enc)
03222 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03223 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03224 }
03225 else {
03226 has_asciionly = 1;
03227 }
03228 v = rb_reg_to_s(v);
03229 }
03230 else {
03231 rb_encoding *enc;
03232 StringValue(e);
03233 enc = rb_enc_get(e);
03234 if (!rb_enc_str_asciicompat_p(e)) {
03235 if (!has_ascii_incompat)
03236 has_ascii_incompat = enc;
03237 else if (has_ascii_incompat != enc)
03238 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03239 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03240 }
03241 else if (rb_enc_str_asciionly_p(e)) {
03242 has_asciionly = 1;
03243 }
03244 else {
03245 if (!has_ascii_compat_fixed)
03246 has_ascii_compat_fixed = enc;
03247 else if (has_ascii_compat_fixed != enc)
03248 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03249 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03250 }
03251 v = rb_reg_s_quote(Qnil, e);
03252 }
03253 if (has_ascii_incompat) {
03254 if (has_asciionly) {
03255 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03256 rb_enc_name(has_ascii_incompat));
03257 }
03258 if (has_ascii_compat_fixed) {
03259 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03260 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03261 }
03262 }
03263
03264 if (i == 0) {
03265 rb_enc_copy(source, v);
03266 }
03267 rb_str_append(source, v);
03268 }
03269
03270 if (has_ascii_incompat) {
03271 result_enc = has_ascii_incompat;
03272 }
03273 else if (has_ascii_compat_fixed) {
03274 result_enc = has_ascii_compat_fixed;
03275 }
03276 else {
03277 result_enc = rb_ascii8bit_encoding();
03278 }
03279
03280 rb_enc_associate(source, result_enc);
03281 return rb_class_new_instance(1, &source, rb_cRegexp);
03282 }
03283 }
03284
03285
03286
03287
03288
03289
03290
03291
03292
03293
03294
03295
03296
03297
03298
03299
03300
03301
03302
03303
03304
03305
03306 static VALUE
03307 rb_reg_s_union_m(VALUE self, VALUE args)
03308 {
03309 VALUE v;
03310 if (RARRAY_LEN(args) == 1 &&
03311 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03312 return rb_reg_s_union(self, v);
03313 }
03314 return rb_reg_s_union(self, args);
03315 }
03316
03317
03318 static VALUE
03319 rb_reg_init_copy(VALUE copy, VALUE re)
03320 {
03321 onig_errmsg_buffer err = "";
03322 const char *s;
03323 long len;
03324
03325 if (!OBJ_INIT_COPY(copy, re)) return copy;
03326 rb_reg_check(re);
03327 s = RREGEXP_SRC_PTR(re);
03328 len = RREGEXP_SRC_LEN(re);
03329 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03330 err, NULL, 0) != 0) {
03331 rb_reg_raise(s, len, err, re);
03332 }
03333 return copy;
03334 }
03335
03336 VALUE
03337 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03338 {
03339 VALUE val = 0;
03340 char *p, *s, *e;
03341 int no, clen;
03342 rb_encoding *str_enc = rb_enc_get(str);
03343 rb_encoding *src_enc = rb_enc_get(src);
03344 int acompat = rb_enc_asciicompat(str_enc);
03345 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
03346
03347 p = s = RSTRING_PTR(str);
03348 e = s + RSTRING_LEN(str);
03349
03350 while (s < e) {
03351 int c = ASCGET(s, e, &clen);
03352 char *ss;
03353
03354 if (c == -1) {
03355 s += mbclen(s, e, str_enc);
03356 continue;
03357 }
03358 ss = s;
03359 s += clen;
03360
03361 if (c != '\\' || s == e) continue;
03362
03363 if (!val) {
03364 val = rb_str_buf_new(ss-p);
03365 }
03366 rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03367
03368 c = ASCGET(s, e, &clen);
03369 if (c == -1) {
03370 s += mbclen(s, e, str_enc);
03371 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03372 p = s;
03373 continue;
03374 }
03375 s += clen;
03376
03377 p = s;
03378 switch (c) {
03379 case '1': case '2': case '3': case '4':
03380 case '5': case '6': case '7': case '8': case '9':
03381 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03382 no = c - '0';
03383 }
03384 else {
03385 continue;
03386 }
03387 break;
03388
03389 case 'k':
03390 if (s < e && ASCGET(s, e, &clen) == '<') {
03391 char *name, *name_end;
03392
03393 name_end = name = s + clen;
03394 while (name_end < e) {
03395 c = ASCGET(name_end, e, &clen);
03396 if (c == '>') break;
03397 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03398 }
03399 if (name_end < e) {
03400 VALUE n = rb_str_subseq(str, (long)(name - RSTRING_PTR(str)),
03401 (long)(name_end - name));
03402 if (!rb_enc_compatible(RREGEXP(regexp)->src, n) ||
03403 (no = name_to_backref_number(regs, regexp, name, name_end)) < 1) {
03404 name_to_backref_error(n);
03405 }
03406 p = s = name_end + clen;
03407 break;
03408 }
03409 else {
03410 rb_raise(rb_eRuntimeError, "invalid group name reference format");
03411 }
03412 }
03413
03414 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03415 continue;
03416
03417 case '0':
03418 case '&':
03419 no = 0;
03420 break;
03421
03422 case '`':
03423 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03424 continue;
03425
03426 case '\'':
03427 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03428 continue;
03429
03430 case '+':
03431 no = regs->num_regs-1;
03432 while (BEG(no) == -1 && no > 0) no--;
03433 if (no == 0) continue;
03434 break;
03435
03436 case '\\':
03437 rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03438 continue;
03439
03440 default:
03441 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03442 continue;
03443 }
03444
03445 if (no >= 0) {
03446 if (no >= regs->num_regs) continue;
03447 if (BEG(no) == -1) continue;
03448 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03449 }
03450 }
03451
03452 if (!val) return str;
03453 if (p < e) {
03454 rb_enc_str_buf_cat(val, p, e-p, str_enc);
03455 }
03456
03457 return val;
03458 }
03459
03460 static VALUE
03461 kcode_getter(void)
03462 {
03463 rb_warn("variable $KCODE is no longer effective");
03464 return Qnil;
03465 }
03466
03467 static void
03468 kcode_setter(VALUE val, ID id)
03469 {
03470 rb_warn("variable $KCODE is no longer effective; ignored");
03471 }
03472
03473 static VALUE
03474 ignorecase_getter(void)
03475 {
03476 rb_warn("variable $= is no longer effective");
03477 return Qfalse;
03478 }
03479
03480 static void
03481 ignorecase_setter(VALUE val, ID id)
03482 {
03483 rb_warn("variable $= is no longer effective; ignored");
03484 }
03485
03486 static VALUE
03487 match_getter(void)
03488 {
03489 VALUE match = rb_backref_get();
03490
03491 if (NIL_P(match)) return Qnil;
03492 rb_match_busy(match);
03493 return match;
03494 }
03495
03496 static void
03497 match_setter(VALUE val)
03498 {
03499 if (!NIL_P(val)) {
03500 Check_Type(val, T_MATCH);
03501 }
03502 rb_backref_set(val);
03503 }
03504
03505
03506
03507
03508
03509
03510
03511
03512
03513
03514
03515
03516
03517
03518
03519
03520
03521
03522
03523
03524
03525
03526
03527
03528
03529
03530
03531
03532
03533 static VALUE
03534 rb_reg_s_last_match(int argc, VALUE *argv)
03535 {
03536 VALUE nth;
03537
03538 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03539 VALUE match = rb_backref_get();
03540 int n;
03541 if (NIL_P(match)) return Qnil;
03542 n = match_backref_number(match, nth);
03543 return rb_reg_nth_match(n, match);
03544 }
03545 return match_getter();
03546 }
03547
/*
 * Warning callback installed in Init_Regexp through onig_set_warn_func()
 * and onig_set_verb_warn_func().
 *
 * The message is passed as an argument to a fixed "%s" format, never as
 * the format itself.
 */
static void
re_warn(const char *s)
{
    const char *message = s;

    rb_warn("%s", message);
}
03553
03554
03555
03556
03557
03558
03559
03560
03561
03562
03563
03564
03565
03566
03567
03568
03569
03570
03571
03572
03573
03574
03575
03576
03577 void
03578 Init_Regexp(void)
03579 {
03580 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03581
03582 onigenc_set_default_caseconv_table((UChar*)casetable);
03583 onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03584 onig_set_warn_func(re_warn);
03585 onig_set_verb_warn_func(re_warn);
03586
03587 rb_define_virtual_variable("$~", match_getter, match_setter);
03588 rb_define_virtual_variable("$&", last_match_getter, 0);
03589 rb_define_virtual_variable("$`", prematch_getter, 0);
03590 rb_define_virtual_variable("$'", postmatch_getter, 0);
03591 rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03592
03593 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03594 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03595 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03596
03597 rb_cRegexp = rb_define_class("Regexp", rb_cObject);
03598 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03599 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03600 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03601 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03602 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03603 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03604 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03605
03606 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03607 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03608 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03609 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03610 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03611 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03612 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03613 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03614 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03615 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03616 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03617 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03618 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03619 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03620 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0);
03621 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03622 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03623 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03624
03625
03626 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03627
03628 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03629
03630 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03631
03632 rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03633
03634 rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE));
03635
03636 rb_global_variable(®_cache);
03637
03638 rb_cMatch = rb_define_class("MatchData", rb_cObject);
03639 rb_define_alloc_func(rb_cMatch, match_alloc);
03640 rb_undef_method(CLASS_OF(rb_cMatch), "new");
03641
03642 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03643 rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03644 rb_define_method(rb_cMatch, "names", match_names, 0);
03645 rb_define_method(rb_cMatch, "size", match_size, 0);
03646 rb_define_method(rb_cMatch, "length", match_size, 0);
03647 rb_define_method(rb_cMatch, "offset", match_offset, 1);
03648 rb_define_method(rb_cMatch, "begin", match_begin, 1);
03649 rb_define_method(rb_cMatch, "end", match_end, 1);
03650 rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03651 rb_define_method(rb_cMatch, "[]", match_aref, -1);
03652 rb_define_method(rb_cMatch, "captures", match_captures, 0);
03653 rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03654 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03655 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03656 rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03657 rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03658 rb_define_method(rb_cMatch, "string", match_string, 0);
03659 rb_define_method(rb_cMatch, "hash", match_hash, 0);
03660 rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03661 rb_define_method(rb_cMatch, "==", match_equal, 1);
03662 }
03663