00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "vm_core.h"
00018 #include "internal.h"
00019 #include "probes.h"
00020 #include <assert.h>
00021
/* Entry `no` of the beg[] / end[] arrays reached through a `regs` pointer
 * that must be in scope wherever these macros are expanded. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
00024
00025 #include <math.h>
00026 #include <ctype.h>
00027
00028 #ifdef HAVE_UNISTD_H
00029 #include <unistd.h>
00030 #endif
00031
00032 #define STRING_ENUMERATORS_WANTARRAY 0
00033
00034 #undef rb_str_new_cstr
00035 #undef rb_tainted_str_new_cstr
00036 #undef rb_usascii_str_new_cstr
00037 #undef rb_enc_str_new_cstr
00038 #undef rb_external_str_new_cstr
00039 #undef rb_locale_str_new_cstr
00040 #undef rb_str_dup_frozen
00041 #undef rb_str_buf_new_cstr
00042 #undef rb_str_buf_cat2
00043 #undef rb_str_cat2
00044
00045 static VALUE rb_str_clear(VALUE str);
00046
00047 VALUE rb_cString;
00048 VALUE rb_cSymbol;
00049
/* NOTE(review): presumably an upper bound on the byte length of one
 * character -- not used in the part of the file visible here. */
#define RUBY_MAX_CHAR_LEN 16
/* Flag bit alias (FL_USER7). */
#define STR_TMPLOCK FL_USER7
/* For a string that has STR_NOEMBED set, clear both ELTS_SHARED and
 * STR_ASSOC; strings without STR_NOEMBED are left untouched. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
} while (0)

/* Set STR_NOEMBED on `str` and zero the embedded-length bits in its flags. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
/* Clear STR_NOEMBED on `str`. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
/* Store `n` in the embedded-length bit field of the flags word.  `n` is
 * evaluated exactly once (captured in tmp_n); `str` is evaluated twice. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
00066
/* Set the length of `str` to `n`, writing it either to the embedded-length
 * flag bits or to as.heap.len depending on STR_EMBED_P(str). */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrease the length of `str` by one, for either representation. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
00086
/* Terminator width for `str`: the minimum character length of its encoding. */
#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/* Write a terminator at `ptr`: always one NUL byte, and when `termlen` is
 * greater than 1, `termlen` zero bytes.  Both arguments are evaluated once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
00095
/*
 * Change the capacity of `str` to `capacity` bytes (plus terminator room).
 *
 *  - embedded and capacity > RSTRING_EMBED_LEN_MAX: allocate a heap buffer of
 *    capacity + TERM_LEN(str) bytes, copy the current contents into it and
 *    switch the string to the heap representation;
 *  - embedded and capacity fits: nothing happens;
 *  - heap: REALLOC_N the buffer to capacity + TERM_LEN(str) bytes and record
 *    the new capacity unless STR_NOCAPA_P(str).
 *
 * `capacity` and `str` are evaluated more than once; pass side-effect-free
 * expressions.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *const tmp = ALLOC_N(char, (capacity)+termlen);\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+termlen);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00115
00116 #define STR_SET_SHARED(str, shared_str) do { \
00117 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
00118 FL_SET((str), ELTS_SHARED); \
00119 } while (0)
00120
00121 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
00122 #define STR_HEAP_SIZE(str) (RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
00123
00124 #define STR_ENC_GET(str) get_encoding(str)
00125
00126 rb_encoding *rb_enc_get_from_index(int index);
00127
00128 static rb_encoding *
00129 get_actual_encoding(const int encidx, VALUE str)
00130 {
00131 const unsigned char *q;
00132
00133 switch (encidx) {
00134 case ENCINDEX_UTF_16:
00135 if (RSTRING_LEN(str) < 2) break;
00136 q = (const unsigned char *)RSTRING_PTR(str);
00137 if (q[0] == 0xFE && q[1] == 0xFF) {
00138 return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
00139 }
00140 if (q[0] == 0xFF && q[1] == 0xFE) {
00141 return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
00142 }
00143 return rb_ascii8bit_encoding();
00144 case ENCINDEX_UTF_32:
00145 if (RSTRING_LEN(str) < 4) break;
00146 q = (const unsigned char *)RSTRING_PTR(str);
00147 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
00148 return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
00149 }
00150 if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
00151 return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
00152 }
00153 return rb_ascii8bit_encoding();
00154 }
00155 return rb_enc_from_index(encidx);
00156 }
00157
00158 static rb_encoding *
00159 get_encoding(VALUE str)
00160 {
00161 return get_actual_encoding(ENCODING_GET(str), str);
00162 }
00163
00164 static int fstring_cmp(VALUE a, VALUE b);
00165
00166 static st_table* frozen_strings;
00167
00168 static const struct st_hash_type fstring_hash_type = {
00169 fstring_cmp,
00170 rb_str_hash,
00171 };
00172
00173 static int
00174 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
00175 {
00176 VALUE *fstr = (VALUE *)arg;
00177 VALUE str = (VALUE)*key;
00178
00179 if (existing) {
00180
00181
00182 rb_gc_resurrect(*fstr = *key);
00183 return ST_STOP;
00184 }
00185
00186 if (STR_SHARED_P(str)) {
00187
00188 str = rb_enc_str_new(RSTRING_PTR(str), RSTRING_LEN(str), STR_ENC_GET(str));
00189 OBJ_FREEZE(str);
00190 }
00191 else {
00192 str = rb_str_new_frozen(str);
00193 }
00194 RBASIC(str)->flags |= RSTRING_FSTR;
00195
00196 *key = *value = *fstr = str;
00197 return ST_CONTINUE;
00198 }
00199
00200 VALUE
00201 rb_fstring(VALUE str)
00202 {
00203 VALUE fstr = Qnil;
00204 Check_Type(str, T_STRING);
00205
00206 if (!frozen_strings)
00207 frozen_strings = st_init_table(&fstring_hash_type);
00208
00209 if (FL_TEST(str, RSTRING_FSTR))
00210 return str;
00211
00212 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&fstr);
00213 return fstr;
00214 }
00215
00216 static int
00217 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
00218 {
00219 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
00220 return ST_CONTINUE;
00221 }
00222
00223 static int
00224 fstring_cmp(VALUE a, VALUE b)
00225 {
00226 int cmp = rb_str_hash_cmp(a, b);
00227 if (cmp != 0) {
00228 return cmp;
00229 }
00230 return ENCODING_GET(b) - ENCODING_GET(a);
00231 }
00232
00233 static inline int
00234 single_byte_optimizable(VALUE str)
00235 {
00236 rb_encoding *enc;
00237
00238
00239 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00240 return 1;
00241
00242 enc = STR_ENC_GET(str);
00243 if (rb_enc_mbmaxlen(enc) == 1)
00244 return 1;
00245
00246
00247
00248 return 0;
00249 }
00250
00251 VALUE rb_fs;
00252
/*
 * Return a pointer to the first byte in [p, e) for which ISASCII() is false,
 * or NULL when there is none.
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two words, the
 * aligned middle of the range is tested one VALUE at a time against
 * NONASCII_MASK (high bit of every byte); the byte loops handle the
 * unaligned head and pinpoint the exact byte.  NONASCII_MASK stays defined
 * after this function and is tested again later in the file.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p */
        const VALUE *word = (const VALUE *)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end;

        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* last word boundary at or before e */
        word_end = (const VALUE *)(~align_mask & (VALUE)e);
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK) {
                word_end = word;
                break;
            }
        }
        /* resume the byte scan at the offending word, or at the tail */
        p = (const char *)word_end;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00289
00290 static int
00291 coderange_scan(const char *p, long len, rb_encoding *enc)
00292 {
00293 const char *e = p + len;
00294
00295 if (rb_enc_to_index(enc) == 0) {
00296
00297 p = search_nonascii(p, e);
00298 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00299 }
00300
00301 if (rb_enc_asciicompat(enc)) {
00302 p = search_nonascii(p, e);
00303 if (!p) {
00304 return ENC_CODERANGE_7BIT;
00305 }
00306 while (p < e) {
00307 int ret = rb_enc_precise_mbclen(p, e, enc);
00308 if (!MBCLEN_CHARFOUND_P(ret)) {
00309 return ENC_CODERANGE_BROKEN;
00310 }
00311 p += MBCLEN_CHARFOUND_LEN(ret);
00312 if (p < e) {
00313 p = search_nonascii(p, e);
00314 if (!p) {
00315 return ENC_CODERANGE_VALID;
00316 }
00317 }
00318 }
00319 if (e < p) {
00320 return ENC_CODERANGE_BROKEN;
00321 }
00322 return ENC_CODERANGE_VALID;
00323 }
00324
00325 while (p < e) {
00326 int ret = rb_enc_precise_mbclen(p, e, enc);
00327
00328 if (!MBCLEN_CHARFOUND_P(ret)) {
00329 return ENC_CODERANGE_BROKEN;
00330 }
00331 p += MBCLEN_CHARFOUND_LEN(ret);
00332 }
00333 if (e < p) {
00334 return ENC_CODERANGE_BROKEN;
00335 }
00336 return ENC_CODERANGE_VALID;
00337 }
00338
00339 long
00340 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00341 {
00342 const char *p = s;
00343
00344 if (*cr == ENC_CODERANGE_BROKEN)
00345 return e - s;
00346
00347 if (rb_enc_to_index(enc) == 0) {
00348
00349 p = search_nonascii(p, e);
00350 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00351 return e - s;
00352 }
00353 else if (rb_enc_asciicompat(enc)) {
00354 p = search_nonascii(p, e);
00355 if (!p) {
00356 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00357 return e - s;
00358 }
00359 while (p < e) {
00360 int ret = rb_enc_precise_mbclen(p, e, enc);
00361 if (!MBCLEN_CHARFOUND_P(ret)) {
00362 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00363 return p - s;
00364 }
00365 p += MBCLEN_CHARFOUND_LEN(ret);
00366 if (p < e) {
00367 p = search_nonascii(p, e);
00368 if (!p) {
00369 *cr = ENC_CODERANGE_VALID;
00370 return e - s;
00371 }
00372 }
00373 }
00374 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00375 return p - s;
00376 }
00377 else {
00378 while (p < e) {
00379 int ret = rb_enc_precise_mbclen(p, e, enc);
00380 if (!MBCLEN_CHARFOUND_P(ret)) {
00381 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00382 return p - s;
00383 }
00384 p += MBCLEN_CHARFOUND_LEN(ret);
00385 }
00386 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00387 return p - s;
00388 }
00389 }
00390
00391 static inline void
00392 str_enc_copy(VALUE str1, VALUE str2)
00393 {
00394 rb_enc_set_index(str1, ENCODING_GET(str2));
00395 }
00396
00397 static void
00398 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00399 {
00400
00401
00402
00403 str_enc_copy(dest, src);
00404 if (RSTRING_LEN(dest) == 0) {
00405 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00406 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00407 else
00408 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00409 return;
00410 }
00411 switch (ENC_CODERANGE(src)) {
00412 case ENC_CODERANGE_7BIT:
00413 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00414 break;
00415 case ENC_CODERANGE_VALID:
00416 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00417 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00418 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00419 else
00420 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00421 break;
00422 default:
00423 break;
00424 }
00425 }
00426
00427 static void
00428 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00429 {
00430 str_enc_copy(dest, src);
00431 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00432 }
00433
00434 int
00435 rb_enc_str_coderange(VALUE str)
00436 {
00437 int cr = ENC_CODERANGE(str);
00438
00439 if (cr == ENC_CODERANGE_UNKNOWN) {
00440 rb_encoding *enc = STR_ENC_GET(str);
00441 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00442 ENC_CODERANGE_SET(str, cr);
00443 }
00444 return cr;
00445 }
00446
00447 int
00448 rb_enc_str_asciionly_p(VALUE str)
00449 {
00450 rb_encoding *enc = STR_ENC_GET(str);
00451
00452 if (!rb_enc_asciicompat(enc))
00453 return FALSE;
00454 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00455 return TRUE;
00456 return FALSE;
00457 }
00458
00459 static inline void
00460 str_mod_check(VALUE s, const char *p, long len)
00461 {
00462 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00463 rb_raise(rb_eRuntimeError, "string modified");
00464 }
00465 }
00466
00467 size_t
00468 rb_str_capacity(VALUE str)
00469 {
00470 if (STR_EMBED_P(str)) {
00471 return RSTRING_EMBED_LEN_MAX;
00472 }
00473 else if (STR_NOCAPA_P(str)) {
00474 return RSTRING(str)->as.heap.len;
00475 }
00476 else {
00477 return RSTRING(str)->as.heap.aux.capa;
00478 }
00479 }
00480
00481 static inline VALUE
00482 str_alloc(VALUE klass)
00483 {
00484 NEWOBJ_OF(str, struct RString, klass, T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0));
00485 return (VALUE)str;
00486 }
00487
00488 static inline VALUE
00489 empty_str_alloc(VALUE klass)
00490 {
00491 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00492 RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
00493 }
00494 return str_alloc(klass);
00495 }
00496
00497 static VALUE
00498 str_new0(VALUE klass, const char *ptr, long len, int termlen)
00499 {
00500 VALUE str;
00501
00502 if (len < 0) {
00503 rb_raise(rb_eArgError, "negative string size (or size too big)");
00504 }
00505
00506 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00507 RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
00508 }
00509
00510 str = str_alloc(klass);
00511 if (len > RSTRING_EMBED_LEN_MAX) {
00512 RSTRING(str)->as.heap.aux.capa = len;
00513 RSTRING(str)->as.heap.ptr = ALLOC_N(char, len + termlen);
00514 STR_SET_NOEMBED(str);
00515 }
00516 else if (len == 0) {
00517 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00518 }
00519 if (ptr) {
00520 memcpy(RSTRING_PTR(str), ptr, len);
00521 }
00522 STR_SET_LEN(str, len);
00523 TERM_FILL(RSTRING_PTR(str) + len, termlen);
00524 return str;
00525 }
00526
00527 static VALUE
00528 str_new(VALUE klass, const char *ptr, long len)
00529 {
00530 return str_new0(klass, ptr, len, 1);
00531 }
00532
00533 VALUE
00534 rb_str_new(const char *ptr, long len)
00535 {
00536 return str_new(rb_cString, ptr, len);
00537 }
00538
00539 VALUE
00540 rb_usascii_str_new(const char *ptr, long len)
00541 {
00542 VALUE str = rb_str_new(ptr, len);
00543 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00544 return str;
00545 }
00546
00547 VALUE
00548 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00549 {
00550 VALUE str;
00551
00552 if (!enc) return rb_str_new(ptr, len);
00553
00554 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
00555 rb_enc_associate(str, enc);
00556 return str;
00557 }
00558
00559 VALUE
00560 rb_str_new_cstr(const char *ptr)
00561 {
00562 if (!ptr) {
00563 rb_raise(rb_eArgError, "NULL pointer given");
00564 }
00565 return rb_str_new(ptr, strlen(ptr));
00566 }
00567
00568 VALUE
00569 rb_usascii_str_new_cstr(const char *ptr)
00570 {
00571 VALUE str = rb_str_new2(ptr);
00572 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00573 return str;
00574 }
00575
00576 VALUE
00577 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
00578 {
00579 if (!ptr) {
00580 rb_raise(rb_eArgError, "NULL pointer given");
00581 }
00582 if (rb_enc_mbminlen(enc) != 1) {
00583 rb_raise(rb_eArgError, "wchar encoding given");
00584 }
00585 return rb_enc_str_new(ptr, strlen(ptr), enc);
00586 }
00587
00588 VALUE
00589 rb_tainted_str_new(const char *ptr, long len)
00590 {
00591 VALUE str = rb_str_new(ptr, len);
00592
00593 OBJ_TAINT(str);
00594 return str;
00595 }
00596
00597 VALUE
00598 rb_tainted_str_new_cstr(const char *ptr)
00599 {
00600 VALUE str = rb_str_new2(ptr);
00601
00602 OBJ_TAINT(str);
00603 return str;
00604 }
00605
00606 VALUE
00607 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00608 {
00609 extern VALUE rb_cEncodingConverter;
00610 rb_econv_t *ec;
00611 rb_econv_result_t ret;
00612 long len, olen;
00613 VALUE econv_wrapper;
00614 VALUE newstr;
00615 const unsigned char *start, *sp;
00616 unsigned char *dest, *dp;
00617 size_t converted_output = 0;
00618
00619 if (!to) return str;
00620 if (!from) from = rb_enc_get(str);
00621 if (from == to) return str;
00622 if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
00623 to == rb_ascii8bit_encoding()) {
00624 if (STR_ENC_GET(str) != to) {
00625 str = rb_str_dup(str);
00626 rb_enc_associate(str, to);
00627 }
00628 return str;
00629 }
00630
00631 len = RSTRING_LEN(str);
00632 newstr = rb_str_new(0, len);
00633 OBJ_INFECT(newstr, str);
00634 olen = len;
00635
00636 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
00637 RBASIC_CLEAR_CLASS(econv_wrapper);
00638 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00639 if (!ec) return str;
00640 DATA_PTR(econv_wrapper) = ec;
00641
00642 sp = (unsigned char*)RSTRING_PTR(str);
00643 start = sp;
00644 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
00645 (dp = dest + converted_output),
00646 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
00647 ret == econv_destination_buffer_full) {
00648
00649 size_t converted_input = sp - start;
00650 size_t rest = len - converted_input;
00651 converted_output = dp - dest;
00652 rb_str_set_len(newstr, converted_output);
00653 if (converted_input && converted_output &&
00654 rest < (LONG_MAX / converted_output)) {
00655 rest = (rest * converted_output) / converted_input;
00656 }
00657 else {
00658 rest = olen;
00659 }
00660 olen += rest < 2 ? 2 : rest;
00661 rb_str_resize(newstr, olen);
00662 }
00663 DATA_PTR(econv_wrapper) = 0;
00664 rb_econv_close(ec);
00665 rb_gc_force_recycle(econv_wrapper);
00666 switch (ret) {
00667 case econv_finished:
00668 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00669 rb_str_set_len(newstr, len);
00670 rb_enc_associate(newstr, to);
00671 return newstr;
00672
00673 default:
00674
00675 return str;
00676 }
00677 }
00678
00679 VALUE
00680 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00681 {
00682 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00683 }
00684
00685 VALUE
00686 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00687 {
00688 VALUE str;
00689
00690 str = rb_tainted_str_new(ptr, len);
00691 return rb_external_str_with_enc(str, eenc);
00692 }
00693
00694 VALUE
00695 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
00696 {
00697 if (eenc == rb_usascii_encoding() &&
00698 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00699 rb_enc_associate(str, rb_ascii8bit_encoding());
00700 return str;
00701 }
00702 rb_enc_associate(str, eenc);
00703 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00704 }
00705
00706 VALUE
00707 rb_external_str_new(const char *ptr, long len)
00708 {
00709 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00710 }
00711
00712 VALUE
00713 rb_external_str_new_cstr(const char *ptr)
00714 {
00715 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00716 }
00717
00718 VALUE
00719 rb_locale_str_new(const char *ptr, long len)
00720 {
00721 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00722 }
00723
00724 VALUE
00725 rb_locale_str_new_cstr(const char *ptr)
00726 {
00727 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00728 }
00729
00730 VALUE
00731 rb_filesystem_str_new(const char *ptr, long len)
00732 {
00733 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00734 }
00735
00736 VALUE
00737 rb_filesystem_str_new_cstr(const char *ptr)
00738 {
00739 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00740 }
00741
00742 VALUE
00743 rb_str_export(VALUE str)
00744 {
00745 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00746 }
00747
00748 VALUE
00749 rb_str_export_locale(VALUE str)
00750 {
00751 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00752 }
00753
00754 VALUE
00755 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00756 {
00757 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00758 }
00759
00760 static VALUE
00761 str_replace_shared_without_enc(VALUE str2, VALUE str)
00762 {
00763 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00764 STR_SET_EMBED(str2);
00765 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00766 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00767 }
00768 else {
00769 str = rb_str_new_frozen(str);
00770 FL_SET(str2, STR_NOEMBED);
00771 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00772 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00773 STR_SET_SHARED(str2, str);
00774 }
00775 return str2;
00776 }
00777
00778 static VALUE
00779 str_replace_shared(VALUE str2, VALUE str)
00780 {
00781 str_replace_shared_without_enc(str2, str);
00782 rb_enc_cr_str_exact_copy(str2, str);
00783 return str2;
00784 }
00785
00786 static VALUE
00787 str_new_shared(VALUE klass, VALUE str)
00788 {
00789 return str_replace_shared(str_alloc(klass), str);
00790 }
00791
00792 static VALUE
00793 str_new3(VALUE klass, VALUE str)
00794 {
00795 return str_new_shared(klass, str);
00796 }
00797
00798 VALUE
00799 rb_str_new_shared(VALUE str)
00800 {
00801 VALUE str2 = str_new3(rb_obj_class(str), str);
00802
00803 OBJ_INFECT(str2, str);
00804 return str2;
00805 }
00806
00807 static VALUE
00808 str_new4(VALUE klass, VALUE str)
00809 {
00810 VALUE str2;
00811
00812 str2 = str_alloc(klass);
00813 STR_SET_NOEMBED(str2);
00814 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00815 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00816 if (STR_SHARED_P(str)) {
00817 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00818 assert(OBJ_FROZEN(shared));
00819 STR_SET_SHARED(str2, shared);
00820 }
00821 else {
00822 if (!STR_ASSOC_P(str)) {
00823 RSTRING(str2)->as.heap.aux.capa = RSTRING(str)->as.heap.aux.capa;
00824 }
00825 STR_SET_SHARED(str, str2);
00826 }
00827 rb_enc_cr_str_exact_copy(str2, str);
00828 OBJ_INFECT(str2, str);
00829 return str2;
00830 }
00831
00832 VALUE
00833 rb_str_new_frozen(VALUE orig)
00834 {
00835 VALUE klass, str;
00836
00837 if (OBJ_FROZEN(orig)) return orig;
00838 klass = rb_obj_class(orig);
00839 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00840 long ofs;
00841 assert(OBJ_FROZEN(str));
00842 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00843 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00844 ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & FL_TAINT) ||
00845 ENCODING_GET(str) != ENCODING_GET(orig)) {
00846 str = str_new3(klass, str);
00847 RSTRING(str)->as.heap.ptr += ofs;
00848 RSTRING(str)->as.heap.len -= ofs;
00849 rb_enc_cr_str_exact_copy(str, orig);
00850 OBJ_INFECT(str, orig);
00851 }
00852 }
00853 else if (STR_EMBED_P(orig)) {
00854 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00855 rb_enc_cr_str_exact_copy(str, orig);
00856 OBJ_INFECT(str, orig);
00857 }
00858 else if (STR_ASSOC_P(orig)) {
00859 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00860 FL_UNSET(orig, STR_ASSOC);
00861 str = str_new4(klass, orig);
00862 FL_SET(str, STR_ASSOC);
00863 RB_OBJ_WRITE(str, &RSTRING(str)->as.heap.aux.shared, assoc);
00864
00865 }
00866 else {
00867 str = str_new4(klass, orig);
00868 }
00869 OBJ_FREEZE(str);
00870 return str;
00871 }
00872
00873 VALUE
00874 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00875 {
00876 return str_new(rb_obj_class(obj), ptr, len);
00877 }
00878
00879 static VALUE
00880 str_new_empty(VALUE str)
00881 {
00882 VALUE v = rb_str_new5(str, 0, 0);
00883 rb_enc_copy(v, str);
00884 OBJ_INFECT(v, str);
00885 return v;
00886 }
00887
00888 #define STR_BUF_MIN_SIZE 128
00889
00890 VALUE
00891 rb_str_buf_new(long capa)
00892 {
00893 VALUE str = str_alloc(rb_cString);
00894
00895 if (capa < STR_BUF_MIN_SIZE) {
00896 capa = STR_BUF_MIN_SIZE;
00897 }
00898 FL_SET(str, STR_NOEMBED);
00899 RSTRING(str)->as.heap.aux.capa = capa;
00900 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00901 RSTRING(str)->as.heap.ptr[0] = '\0';
00902
00903 return str;
00904 }
00905
00906 VALUE
00907 rb_str_buf_new_cstr(const char *ptr)
00908 {
00909 VALUE str;
00910 long len = strlen(ptr);
00911
00912 str = rb_str_buf_new(len);
00913 rb_str_buf_cat(str, ptr, len);
00914
00915 return str;
00916 }
00917
00918 VALUE
00919 rb_str_tmp_new(long len)
00920 {
00921 return str_new(0, 0, len);
00922 }
00923
00924 void *
00925 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00926 {
00927 VALUE s = rb_str_tmp_new(len);
00928 *store = s;
00929 return RSTRING_PTR(s);
00930 }
00931
00932 void
00933 rb_free_tmp_buffer(volatile VALUE *store)
00934 {
00935 VALUE s = *store;
00936 *store = 0;
00937 if (s) rb_str_clear(s);
00938 }
00939
00940 void
00941 rb_str_free(VALUE str)
00942 {
00943 if (FL_TEST(str, RSTRING_FSTR)) {
00944 st_data_t fstr = (st_data_t)str;
00945 st_delete(frozen_strings, &fstr, NULL);
00946 }
00947 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00948 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
00949 }
00950 }
00951
00952 RUBY_FUNC_EXPORTED size_t
00953 rb_str_memsize(VALUE str)
00954 {
00955 if (FL_TEST(str, STR_NOEMBED|ELTS_SHARED) == STR_NOEMBED) {
00956 return STR_HEAP_SIZE(str);
00957 }
00958 else {
00959 return 0;
00960 }
00961 }
00962
00963 VALUE
00964 rb_str_to_str(VALUE str)
00965 {
00966 return rb_convert_type(str, T_STRING, "String", "to_str");
00967 }
00968
00969 static inline void str_discard(VALUE str);
00970
00971 void
00972 rb_str_shared_replace(VALUE str, VALUE str2)
00973 {
00974 rb_encoding *enc;
00975 int cr;
00976 if (str == str2) return;
00977 enc = STR_ENC_GET(str2);
00978 cr = ENC_CODERANGE(str2);
00979 str_discard(str);
00980 OBJ_INFECT(str, str2);
00981 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00982 STR_SET_EMBED(str);
00983 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00984 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00985 rb_enc_associate(str, enc);
00986 ENC_CODERANGE_SET(str, cr);
00987 return;
00988 }
00989 STR_SET_NOEMBED(str);
00990 STR_UNSET_NOCAPA(str);
00991 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00992 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00993 if (STR_NOCAPA_P(str2)) {
00994 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00995 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00996 RB_OBJ_WRITE(str, &RSTRING(str)->as.heap.aux.shared, shared);
00997 }
00998 else {
00999 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
01000 }
01001 STR_SET_EMBED(str2);
01002 RSTRING_PTR(str2)[0] = 0;
01003 STR_SET_EMBED_LEN(str2, 0);
01004 rb_enc_associate(str, enc);
01005 ENC_CODERANGE_SET(str, cr);
01006 }
01007
01008 static ID id_to_s;
01009
01010 VALUE
01011 rb_obj_as_string(VALUE obj)
01012 {
01013 VALUE str;
01014
01015 if (RB_TYPE_P(obj, T_STRING)) {
01016 return obj;
01017 }
01018 str = rb_funcall(obj, id_to_s, 0);
01019 if (!RB_TYPE_P(str, T_STRING))
01020 return rb_any_to_s(obj);
01021 OBJ_INFECT(str, obj);
01022 return str;
01023 }
01024
01025 static VALUE
01026 str_replace(VALUE str, VALUE str2)
01027 {
01028 long len;
01029
01030 len = RSTRING_LEN(str2);
01031 if (STR_ASSOC_P(str2)) {
01032 str2 = rb_str_new4(str2);
01033 }
01034 if (STR_SHARED_P(str2)) {
01035 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
01036 assert(OBJ_FROZEN(shared));
01037 STR_SET_NOEMBED(str);
01038 RSTRING(str)->as.heap.len = len;
01039 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
01040 FL_SET(str, ELTS_SHARED);
01041 FL_UNSET(str, STR_ASSOC);
01042 STR_SET_SHARED(str, shared);
01043 }
01044 else {
01045 str_replace_shared(str, str2);
01046 }
01047
01048 OBJ_INFECT(str, str2);
01049 rb_enc_cr_str_exact_copy(str, str2);
01050 return str;
01051 }
01052
01053 static VALUE
01054 str_duplicate(VALUE klass, VALUE str)
01055 {
01056 VALUE dup = str_alloc(klass);
01057 str_replace(dup, str);
01058 return dup;
01059 }
01060
01061 VALUE
01062 rb_str_dup(VALUE str)
01063 {
01064 return str_duplicate(rb_obj_class(str), str);
01065 }
01066
01067 VALUE
01068 rb_str_resurrect(VALUE str)
01069 {
01070 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
01071 RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
01072 rb_sourcefile(), rb_sourceline());
01073 }
01074 return str_duplicate(rb_cString, str);
01075 }
01076
01077
01078
01079
01080
01081
01082
01083
01084 static VALUE
01085 rb_str_init(int argc, VALUE *argv, VALUE str)
01086 {
01087 VALUE orig;
01088
01089 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
01090 rb_str_replace(str, orig);
01091 return str;
01092 }
01093
01094 static inline long
01095 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
01096 {
01097 long c;
01098 const char *q;
01099
01100 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01101 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
01102 }
01103 else if (rb_enc_asciicompat(enc)) {
01104 c = 0;
01105 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
01106 while (p < e) {
01107 if (ISASCII(*p)) {
01108 q = search_nonascii(p, e);
01109 if (!q)
01110 return c + (e - p);
01111 c += q - p;
01112 p = q;
01113 }
01114 p += rb_enc_fast_mbclen(p, e, enc);
01115 c++;
01116 }
01117 }
01118 else {
01119 while (p < e) {
01120 if (ISASCII(*p)) {
01121 q = search_nonascii(p, e);
01122 if (!q)
01123 return c + (e - p);
01124 c += q - p;
01125 p = q;
01126 }
01127 p += rb_enc_mbclen(p, e, enc);
01128 c++;
01129 }
01130 }
01131 return c;
01132 }
01133
01134 for (c=0; p<e; c++) {
01135 p += rb_enc_mbclen(p, e, enc);
01136 }
01137 return c;
01138 }
01139
01140 long
01141 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
01142 {
01143 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
01144 }
01145
01146 long
01147 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
01148 {
01149 long c;
01150 const char *q;
01151 int ret;
01152
01153 *cr = 0;
01154 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01155 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
01156 }
01157 else if (rb_enc_asciicompat(enc)) {
01158 c = 0;
01159 while (p < e) {
01160 if (ISASCII(*p)) {
01161 q = search_nonascii(p, e);
01162 if (!q) {
01163 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01164 return c + (e - p);
01165 }
01166 c += q - p;
01167 p = q;
01168 }
01169 ret = rb_enc_precise_mbclen(p, e, enc);
01170 if (MBCLEN_CHARFOUND_P(ret)) {
01171 *cr |= ENC_CODERANGE_VALID;
01172 p += MBCLEN_CHARFOUND_LEN(ret);
01173 }
01174 else {
01175 *cr = ENC_CODERANGE_BROKEN;
01176 p++;
01177 }
01178 c++;
01179 }
01180 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01181 return c;
01182 }
01183
01184 for (c=0; p<e; c++) {
01185 ret = rb_enc_precise_mbclen(p, e, enc);
01186 if (MBCLEN_CHARFOUND_P(ret)) {
01187 *cr |= ENC_CODERANGE_VALID;
01188 p += MBCLEN_CHARFOUND_LEN(ret);
01189 }
01190 else {
01191 *cr = ENC_CODERANGE_BROKEN;
01192 if (p + rb_enc_mbminlen(enc) <= e)
01193 p += rb_enc_mbminlen(enc);
01194 else
01195 p = e;
01196 }
01197 }
01198 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01199 return c;
01200 }
01201
#ifdef NONASCII_MASK
/* True unless `c` has the bit pattern 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Count, in the VALUE-sized word at `s`, the bytes that do not have the
 * 10xxxxxx bit pattern.
 *
 * Step 1 leaves in the lowest bit of every byte the value (b6 | ~b7), which
 * is 0 exactly for 10xxxxxx bytes.  Step 2 adds those per-byte bits together
 * so the total ends up in the low nibble of the word.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE bits = *s;

    /* step 1: one flag bit per byte */
    bits |= ~(bits >> 1);
    bits >>= 6;
    bits &= NONASCII_MASK >> 7;

    /* step 2: horizontal sum of the flag bits */
    bits += (bits >> 8);
    bits += (bits >> 16);
#if SIZEOF_VALUE == 8
    bits += (bits >> 32);
#endif
    return (bits & 0xF);
}
#endif
01236
01237 static long
01238 str_strlen(VALUE str, rb_encoding *enc)
01239 {
01240 const char *p, *e;
01241 long n;
01242 int cr;
01243
01244 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01245 if (!enc) enc = STR_ENC_GET(str);
01246 p = RSTRING_PTR(str);
01247 e = RSTRING_END(str);
01248 cr = ENC_CODERANGE(str);
01249 #ifdef NONASCII_MASK
01250 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01251 enc == rb_utf8_encoding()) {
01252
01253 VALUE len = 0;
01254 if ((int)sizeof(VALUE) * 2 < e - p) {
01255 const VALUE *s, *t;
01256 const VALUE lowbits = sizeof(VALUE) - 1;
01257 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01258 t = (const VALUE*)(~lowbits & (VALUE)e);
01259 while (p < (const char *)s) {
01260 if (is_utf8_lead_byte(*p)) len++;
01261 p++;
01262 }
01263 while (s < t) {
01264 len += count_utf8_lead_bytes_with_word(s);
01265 s++;
01266 }
01267 p = (const char *)s;
01268 }
01269 while (p < e) {
01270 if (is_utf8_lead_byte(*p)) len++;
01271 p++;
01272 }
01273 return (long)len;
01274 }
01275 #endif
01276 n = rb_enc_strlen_cr(p, e, enc, &cr);
01277 if (cr) {
01278 ENC_CODERANGE_SET(str, cr);
01279 }
01280 return n;
01281 }
01282
01283 long
01284 rb_str_strlen(VALUE str)
01285 {
01286 return str_strlen(str, STR_ENC_GET(str));
01287 }
01288
01289
01290
01291
01292
01293
01294
01295
01296
01297 VALUE
01298 rb_str_length(VALUE str)
01299 {
01300 long len;
01301
01302 len = str_strlen(str, STR_ENC_GET(str));
01303 return LONG2NUM(len);
01304 }
01305
01306
01307
01308
01309
01310
01311
01312
01313
01314
01315
01316 static VALUE
01317 rb_str_bytesize(VALUE str)
01318 {
01319 return LONG2NUM(RSTRING_LEN(str));
01320 }
01321
01322
01323
01324
01325
01326
01327
01328
01329
01330
01331
01332
01333 static VALUE
01334 rb_str_empty(VALUE str)
01335 {
01336 if (RSTRING_LEN(str) == 0)
01337 return Qtrue;
01338 return Qfalse;
01339 }
01340
01341
01342
01343
01344
01345
01346
01347
01348
01349
01350
01351 VALUE
01352 rb_str_plus(VALUE str1, VALUE str2)
01353 {
01354 VALUE str3;
01355 rb_encoding *enc;
01356
01357 StringValue(str2);
01358 enc = rb_enc_check(str1, str2);
01359 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01360 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01361 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01362 RSTRING_PTR(str2), RSTRING_LEN(str2));
01363 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01364
01365 FL_SET_RAW(str3, OBJ_TAINTED_RAW(str1) | OBJ_TAINTED_RAW(str2));
01366 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01367 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01368 return str3;
01369 }
01370
01371
01372
01373
01374
01375
01376
01377
01378
01379
01380
01381
01382 VALUE
01383 rb_str_times(VALUE str, VALUE times)
01384 {
01385 VALUE str2;
01386 long n, len;
01387 char *ptr2;
01388
01389 len = NUM2LONG(times);
01390 if (len < 0) {
01391 rb_raise(rb_eArgError, "negative argument");
01392 }
01393 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01394 rb_raise(rb_eArgError, "argument too big");
01395 }
01396
01397 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01398 ptr2 = RSTRING_PTR(str2);
01399 if (len) {
01400 n = RSTRING_LEN(str);
01401 memcpy(ptr2, RSTRING_PTR(str), n);
01402 while (n <= len/2) {
01403 memcpy(ptr2 + n, ptr2, n);
01404 n *= 2;
01405 }
01406 memcpy(ptr2 + n, ptr2, len-n);
01407 }
01408 ptr2[RSTRING_LEN(str2)] = '\0';
01409 OBJ_INFECT(str2, str);
01410 rb_enc_cr_str_copy_for_substr(str2, str);
01411
01412 return str2;
01413 }
01414
01415
01416
01417
01418
01419
01420
01421
01422
01423
01424
01425
01426
01427
01428
01429
01430 static VALUE
01431 rb_str_format_m(VALUE str, VALUE arg)
01432 {
01433 volatile VALUE tmp = rb_check_array_type(arg);
01434
01435 if (!NIL_P(tmp)) {
01436 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
01437 }
01438 return rb_str_format(1, &arg, str);
01439 }
01440
01441 static inline void
01442 str_modifiable(VALUE str)
01443 {
01444 if (FL_TEST(str, STR_TMPLOCK)) {
01445 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01446 }
01447 rb_check_frozen(str);
01448 }
01449
01450 static inline int
01451 str_independent(VALUE str)
01452 {
01453 str_modifiable(str);
01454 if (!STR_SHARED_P(str)) return 1;
01455 if (STR_EMBED_P(str)) return 1;
01456 return 0;
01457 }
01458
01459 static void
01460 str_make_independent_expand(VALUE str, long expand)
01461 {
01462 char *ptr;
01463 long len = RSTRING_LEN(str);
01464 const int termlen = TERM_LEN(str);
01465 long capa = len + expand;
01466
01467 if (len > capa) len = capa;
01468 ptr = ALLOC_N(char, capa + termlen);
01469 if (RSTRING_PTR(str)) {
01470 memcpy(ptr, RSTRING_PTR(str), len);
01471 }
01472 STR_SET_NOEMBED(str);
01473 STR_UNSET_NOCAPA(str);
01474 TERM_FILL(ptr + len, termlen);
01475 RSTRING(str)->as.heap.ptr = ptr;
01476 RSTRING(str)->as.heap.len = len;
01477 RSTRING(str)->as.heap.aux.capa = capa;
01478 }
01479
01480 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01481
01482 void
01483 rb_str_modify(VALUE str)
01484 {
01485 if (!str_independent(str))
01486 str_make_independent(str);
01487 ENC_CODERANGE_CLEAR(str);
01488 }
01489
01490 void
01491 rb_str_modify_expand(VALUE str, long expand)
01492 {
01493 if (expand < 0) {
01494 rb_raise(rb_eArgError, "negative expanding string size");
01495 }
01496 if (!str_independent(str)) {
01497 str_make_independent_expand(str, expand);
01498 }
01499 else if (expand > 0) {
01500 long len = RSTRING_LEN(str);
01501 long capa = len + expand;
01502 int termlen = TERM_LEN(str);
01503 if (!STR_EMBED_P(str)) {
01504 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa + termlen);
01505 STR_UNSET_NOCAPA(str);
01506 RSTRING(str)->as.heap.aux.capa = capa;
01507 }
01508 else if (capa + termlen > RSTRING_EMBED_LEN_MAX + 1) {
01509 str_make_independent_expand(str, expand);
01510 }
01511 }
01512 ENC_CODERANGE_CLEAR(str);
01513 }
01514
01515
01516 static void
01517 str_modify_keep_cr(VALUE str)
01518 {
01519 if (!str_independent(str))
01520 str_make_independent(str);
01521 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01522
01523 ENC_CODERANGE_CLEAR(str);
01524 }
01525
01526 static inline void
01527 str_discard(VALUE str)
01528 {
01529 str_modifiable(str);
01530 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01531 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
01532 RSTRING(str)->as.heap.ptr = 0;
01533 RSTRING(str)->as.heap.len = 0;
01534 }
01535 }
01536
01537 void
01538 rb_str_associate(VALUE str, VALUE add)
01539 {
01540
01541 rb_check_frozen(str);
01542 if (STR_ASSOC_P(str)) {
01543
01544 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01545 }
01546 else {
01547 if (STR_SHARED_P(str)) {
01548 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01549 str_make_independent(str);
01550 if (STR_ASSOC_P(assoc)) {
01551 assoc = RSTRING(assoc)->as.heap.aux.shared;
01552 rb_ary_concat(assoc, add);
01553 add = assoc;
01554 }
01555 }
01556 else if (STR_EMBED_P(str)) {
01557 str_make_independent(str);
01558 }
01559 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01560 RESIZE_CAPA(str, RSTRING_LEN(str));
01561 }
01562 FL_SET(str, STR_ASSOC);
01563 RBASIC_CLEAR_CLASS(add);
01564 RB_OBJ_WRITE(str, &RSTRING(str)->as.heap.aux.shared, add);
01565 }
01566 }
01567
01568 VALUE
01569 rb_str_associated(VALUE str)
01570 {
01571 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01572 if (STR_ASSOC_P(str)) {
01573 return RSTRING(str)->as.heap.aux.shared;
01574 }
01575 return Qfalse;
01576 }
01577
01578 void
01579 rb_must_asciicompat(VALUE str)
01580 {
01581 rb_encoding *enc = rb_enc_get(str);
01582 if (!rb_enc_asciicompat(enc)) {
01583 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
01584 }
01585 }
01586
01587 VALUE
01588 rb_string_value(volatile VALUE *ptr)
01589 {
01590 VALUE s = *ptr;
01591 if (!RB_TYPE_P(s, T_STRING)) {
01592 s = rb_str_to_str(s);
01593 *ptr = s;
01594 }
01595 return s;
01596 }
01597
01598 char *
01599 rb_string_value_ptr(volatile VALUE *ptr)
01600 {
01601 VALUE str = rb_string_value(ptr);
01602 return RSTRING_PTR(str);
01603 }
01604
/*
 * Return 1 when the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
01613
01614 static const char *
01615 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
01616 {
01617 const char *e = s + len;
01618
01619 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
01620 if (zero_filled(s, minlen)) return s;
01621 }
01622 return 0;
01623 }
01624
01625 static char *
01626 str_fill_term(VALUE str, char *s, long len, int oldtermlen, int termlen)
01627 {
01628 long capa = rb_str_capacity(str) + 1;
01629
01630 if (capa < len + termlen) {
01631 rb_str_modify_expand(str, termlen);
01632 }
01633 else if (!str_independent(str)) {
01634 if (zero_filled(s + len, termlen)) return s;
01635 str_make_independent(str);
01636 }
01637 s = RSTRING_PTR(str);
01638 TERM_FILL(s + len, termlen);
01639 return s;
01640 }
01641
01642 char *
01643 rb_string_value_cstr(volatile VALUE *ptr)
01644 {
01645 VALUE str = rb_string_value(ptr);
01646 char *s = RSTRING_PTR(str);
01647 long len = RSTRING_LEN(str);
01648 rb_encoding *enc = rb_enc_get(str);
01649 const int minlen = rb_enc_mbminlen(enc);
01650
01651 if (minlen > 1) {
01652 if (str_null_char(s, len, minlen, enc)) {
01653 rb_raise(rb_eArgError, "string contains null char");
01654 }
01655 return str_fill_term(str, s, len, minlen, minlen);
01656 }
01657 if (!s || memchr(s, 0, len)) {
01658 rb_raise(rb_eArgError, "string contains null byte");
01659 }
01660 if (s[len]) {
01661 rb_str_modify(str);
01662 s = RSTRING_PTR(str);
01663 s[RSTRING_LEN(str)] = 0;
01664 }
01665 return s;
01666 }
01667
01668 void
01669 rb_str_fill_terminator(VALUE str, const int newminlen)
01670 {
01671 char *s = RSTRING_PTR(str);
01672 long len = RSTRING_LEN(str);
01673 rb_encoding *enc = rb_enc_get(str);
01674 str_fill_term(str, s, len, rb_enc_mbminlen(enc), newminlen);
01675 }
01676
01677 VALUE
01678 rb_check_string_type(VALUE str)
01679 {
01680 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01681 return str;
01682 }
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695 static VALUE
01696 rb_str_s_try_convert(VALUE dummy, VALUE str)
01697 {
01698 return rb_check_string_type(str);
01699 }
01700
01701 static char*
01702 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01703 {
01704 long nth = *nthp;
01705 if (rb_enc_mbmaxlen(enc) == 1) {
01706 p += nth;
01707 }
01708 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01709 p += nth * rb_enc_mbmaxlen(enc);
01710 }
01711 else if (rb_enc_asciicompat(enc)) {
01712 const char *p2, *e2;
01713 int n;
01714
01715 while (p < e && 0 < nth) {
01716 e2 = p + nth;
01717 if (e < e2) {
01718 *nthp = nth;
01719 return (char *)e;
01720 }
01721 if (ISASCII(*p)) {
01722 p2 = search_nonascii(p, e2);
01723 if (!p2) {
01724 nth -= e2 - p;
01725 *nthp = nth;
01726 return (char *)e2;
01727 }
01728 nth -= p2 - p;
01729 p = p2;
01730 }
01731 n = rb_enc_mbclen(p, e, enc);
01732 p += n;
01733 nth--;
01734 }
01735 *nthp = nth;
01736 if (nth != 0) {
01737 return (char *)e;
01738 }
01739 return (char *)p;
01740 }
01741 else {
01742 while (p < e && nth--) {
01743 p += rb_enc_mbclen(p, e, enc);
01744 }
01745 }
01746 if (p > e) p = e;
01747 *nthp = nth;
01748 return (char*)p;
01749 }
01750
01751 char*
01752 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01753 {
01754 return str_nth_len(p, e, &nth, enc);
01755 }
01756
01757 static char*
01758 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01759 {
01760 if (singlebyte)
01761 p += nth;
01762 else {
01763 p = str_nth_len(p, e, &nth, enc);
01764 }
01765 if (!p) return 0;
01766 if (p > e) p = e;
01767 return (char *)p;
01768 }
01769
01770
01771 static long
01772 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01773 {
01774 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01775 if (!pp) return e - p;
01776 return pp - p;
01777 }
01778
01779 long
01780 rb_str_offset(VALUE str, long pos)
01781 {
01782 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01783 STR_ENC_GET(str), single_byte_optimizable(str));
01784 }
01785
01786 #ifdef NONASCII_MASK
01787 static char *
01788 str_utf8_nth(const char *p, const char *e, long *nthp)
01789 {
01790 long nth = *nthp;
01791 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01792 const VALUE *s, *t;
01793 const VALUE lowbits = sizeof(VALUE) - 1;
01794 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01795 t = (const VALUE*)(~lowbits & (VALUE)e);
01796 while (p < (const char *)s) {
01797 if (is_utf8_lead_byte(*p)) nth--;
01798 p++;
01799 }
01800 do {
01801 nth -= count_utf8_lead_bytes_with_word(s);
01802 s++;
01803 } while (s < t && (int)sizeof(VALUE) <= nth);
01804 p = (char *)s;
01805 }
01806 while (p < e) {
01807 if (is_utf8_lead_byte(*p)) {
01808 if (nth == 0) break;
01809 nth--;
01810 }
01811 p++;
01812 }
01813 *nthp = nth;
01814 return (char *)p;
01815 }
01816
/* Byte offset from p of the nth character in the UTF-8 bytes [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long count = nth;
    const char *target = str_utf8_nth(p, e, &count);

    return target - p;
}
01823 #endif
01824
01825
01826 long
01827 rb_str_sublen(VALUE str, long pos)
01828 {
01829 if (single_byte_optimizable(str) || pos < 0)
01830 return pos;
01831 else {
01832 char *p = RSTRING_PTR(str);
01833 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01834 }
01835 }
01836
01837 VALUE
01838 rb_str_subseq(VALUE str, long beg, long len)
01839 {
01840 VALUE str2;
01841
01842 if (RSTRING_LEN(str) == beg + len &&
01843 RSTRING_EMBED_LEN_MAX < len) {
01844 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01845 rb_str_drop_bytes(str2, beg);
01846 }
01847 else {
01848 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01849 RB_GC_GUARD(str);
01850 }
01851
01852 rb_enc_cr_str_copy_for_substr(str2, str);
01853 OBJ_INFECT(str2, str);
01854
01855 return str2;
01856 }
01857
01858 char *
01859 rb_str_subpos(VALUE str, long beg, long *lenp)
01860 {
01861 long len = *lenp;
01862 long slen = -1L;
01863 long blen = RSTRING_LEN(str);
01864 rb_encoding *enc = STR_ENC_GET(str);
01865 char *p, *s = RSTRING_PTR(str), *e = s + blen;
01866
01867 if (len < 0) return 0;
01868 if (!blen) {
01869 len = 0;
01870 }
01871 if (single_byte_optimizable(str)) {
01872 if (beg > blen) return 0;
01873 if (beg < 0) {
01874 beg += blen;
01875 if (beg < 0) return 0;
01876 }
01877 if (beg + len > blen)
01878 len = blen - beg;
01879 if (len < 0) return 0;
01880 p = s + beg;
01881 goto end;
01882 }
01883 if (beg < 0) {
01884 if (len > -beg) len = -beg;
01885 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01886 beg = -beg;
01887 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01888 p = e;
01889 if (!p) return 0;
01890 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01891 if (!p) return 0;
01892 len = e - p;
01893 goto end;
01894 }
01895 else {
01896 slen = str_strlen(str, enc);
01897 beg += slen;
01898 if (beg < 0) return 0;
01899 p = s + beg;
01900 if (len == 0) goto end;
01901 }
01902 }
01903 else if (beg > 0 && beg > RSTRING_LEN(str)) {
01904 return 0;
01905 }
01906 if (len == 0) {
01907 if (beg > str_strlen(str, enc)) return 0;
01908 p = s + beg;
01909 }
01910 #ifdef NONASCII_MASK
01911 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01912 enc == rb_utf8_encoding()) {
01913 p = str_utf8_nth(s, e, &beg);
01914 if (beg > 0) return 0;
01915 len = str_utf8_offset(p, e, len);
01916 }
01917 #endif
01918 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01919 int char_sz = rb_enc_mbmaxlen(enc);
01920
01921 p = s + beg * char_sz;
01922 if (p > e) {
01923 return 0;
01924 }
01925 else if (len * char_sz > e - p)
01926 len = e - p;
01927 else
01928 len *= char_sz;
01929 }
01930 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01931 if (beg > 0) return 0;
01932 len = 0;
01933 }
01934 else {
01935 len = str_offset(p, e, len, enc, 0);
01936 }
01937 end:
01938 *lenp = len;
01939 RB_GC_GUARD(str);
01940 return p;
01941 }
01942
01943 VALUE
01944 rb_str_substr(VALUE str, long beg, long len)
01945 {
01946 VALUE str2;
01947 char *p = rb_str_subpos(str, beg, &len);
01948
01949 if (!p) return Qnil;
01950 if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
01951 str2 = rb_str_new4(str);
01952 str2 = str_new3(rb_obj_class(str2), str2);
01953 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01954 RSTRING(str2)->as.heap.len = len;
01955 }
01956 else {
01957 str2 = rb_str_new5(str, p, len);
01958 OBJ_INFECT(str2, str);
01959 RB_GC_GUARD(str);
01960 }
01961 rb_enc_cr_str_copy_for_substr(str2, str);
01962
01963 return str2;
01964 }
01965
01966 VALUE
01967 rb_str_freeze(VALUE str)
01968 {
01969 if (STR_ASSOC_P(str)) {
01970 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01971 OBJ_FREEZE(ary);
01972 }
01973 return rb_obj_freeze(str);
01974 }
01975
01976 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01977 #define rb_str_dup_frozen rb_str_new_frozen
01978
01979 VALUE
01980 rb_str_locktmp(VALUE str)
01981 {
01982 if (FL_TEST(str, STR_TMPLOCK)) {
01983 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01984 }
01985 FL_SET(str, STR_TMPLOCK);
01986 return str;
01987 }
01988
01989 VALUE
01990 rb_str_unlocktmp(VALUE str)
01991 {
01992 if (!FL_TEST(str, STR_TMPLOCK)) {
01993 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01994 }
01995 FL_UNSET(str, STR_TMPLOCK);
01996 return str;
01997 }
01998
01999 VALUE
02000 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
02001 {
02002 rb_str_locktmp(str);
02003 return rb_ensure(func, arg, rb_str_unlocktmp, str);
02004 }
02005
02006 void
02007 rb_str_set_len(VALUE str, long len)
02008 {
02009 long capa;
02010 const int termlen = TERM_LEN(str);
02011
02012 str_modifiable(str);
02013 if (STR_SHARED_P(str)) {
02014 rb_raise(rb_eRuntimeError, "can't set length of shared string");
02015 }
02016 if (len + termlen - 1 > (capa = (long)rb_str_capacity(str))) {
02017 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
02018 }
02019 STR_SET_LEN(str, len);
02020 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
02021 }
02022
02023 VALUE
02024 rb_str_resize(VALUE str, long len)
02025 {
02026 long slen;
02027 int independent;
02028
02029 if (len < 0) {
02030 rb_raise(rb_eArgError, "negative string size (or size too big)");
02031 }
02032
02033 independent = str_independent(str);
02034 ENC_CODERANGE_CLEAR(str);
02035 slen = RSTRING_LEN(str);
02036 {
02037 long capa;
02038 const int termlen = TERM_LEN(str);
02039 if (STR_EMBED_P(str)) {
02040 if (len == slen) return str;
02041 if (len + termlen <= RSTRING_EMBED_LEN_MAX + 1) {
02042 STR_SET_EMBED_LEN(str, len);
02043 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
02044 return str;
02045 }
02046 str_make_independent_expand(str, len - slen);
02047 STR_SET_NOEMBED(str);
02048 }
02049 else if (len + termlen <= RSTRING_EMBED_LEN_MAX + 1) {
02050 char *ptr = STR_HEAP_PTR(str);
02051 STR_SET_EMBED(str);
02052 if (slen > len) slen = len;
02053 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
02054 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
02055 STR_SET_EMBED_LEN(str, len);
02056 if (independent) ruby_xfree(ptr);
02057 return str;
02058 }
02059 else if (!independent) {
02060 if (len == slen) return str;
02061 str_make_independent_expand(str, len - slen);
02062 }
02063 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
02064 (capa - len) > (len < 1024 ? len : 1024)) {
02065 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len + termlen);
02066 RSTRING(str)->as.heap.aux.capa = len;
02067 }
02068 else if (len == slen) return str;
02069 RSTRING(str)->as.heap.len = len;
02070 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen);
02071 }
02072 return str;
02073 }
02074
02075 static VALUE
02076 str_buf_cat(VALUE str, const char *ptr, long len)
02077 {
02078 long capa, total, off = -1;
02079 const int termlen = TERM_LEN(str);
02080
02081 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
02082 off = ptr - RSTRING_PTR(str);
02083 }
02084 rb_str_modify(str);
02085 if (len == 0) return 0;
02086 if (STR_ASSOC_P(str)) {
02087 FL_UNSET(str, STR_ASSOC);
02088 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
02089 }
02090 else if (STR_EMBED_P(str)) {
02091 capa = RSTRING_EMBED_LEN_MAX;
02092 }
02093 else {
02094 capa = RSTRING(str)->as.heap.aux.capa;
02095 }
02096 if (RSTRING_LEN(str) >= LONG_MAX - len) {
02097 rb_raise(rb_eArgError, "string sizes too big");
02098 }
02099 total = RSTRING_LEN(str)+len;
02100 if (capa <= total) {
02101 while (total > capa) {
02102 if (capa + termlen >= LONG_MAX / 2) {
02103 capa = (total + 4095) / 4096 * 4096;
02104 break;
02105 }
02106 capa = (capa + termlen) * 2;
02107 }
02108 RESIZE_CAPA(str, capa);
02109 }
02110 if (off != -1) {
02111 ptr = RSTRING_PTR(str) + off;
02112 }
02113 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
02114 STR_SET_LEN(str, total);
02115 RSTRING_PTR(str)[total] = '\0';
02116
02117 return str;
02118 }
02119
02120 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
02121
02122 VALUE
02123 rb_str_buf_cat(VALUE str, const char *ptr, long len)
02124 {
02125 if (len == 0) return str;
02126 if (len < 0) {
02127 rb_raise(rb_eArgError, "negative string size (or size too big)");
02128 }
02129 return str_buf_cat(str, ptr, len);
02130 }
02131
02132 VALUE
02133 rb_str_buf_cat2(VALUE str, const char *ptr)
02134 {
02135 return rb_str_buf_cat(str, ptr, strlen(ptr));
02136 }
02137
02138 VALUE
02139 rb_str_cat(VALUE str, const char *ptr, long len)
02140 {
02141 if (len < 0) {
02142 rb_raise(rb_eArgError, "negative string size (or size too big)");
02143 }
02144 if (STR_ASSOC_P(str)) {
02145 char *p;
02146 rb_str_modify_expand(str, len);
02147 p = RSTRING(str)->as.heap.ptr;
02148 memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
02149 len = RSTRING(str)->as.heap.len += len;
02150 TERM_FILL(p, TERM_LEN(str));
02151 return str;
02152 }
02153
02154 return rb_str_buf_cat(str, ptr, len);
02155 }
02156
02157 VALUE
02158 rb_str_cat2(VALUE str, const char *ptr)
02159 {
02160 return rb_str_cat(str, ptr, strlen(ptr));
02161 }
02162
02163 static VALUE
02164 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
02165 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
02166 {
02167 int str_encindex = ENCODING_GET(str);
02168 int res_encindex;
02169 int str_cr, res_cr;
02170
02171 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
02172
02173 if (str_encindex == ptr_encindex) {
02174 if (str_cr == ENC_CODERANGE_UNKNOWN)
02175 ptr_cr = ENC_CODERANGE_UNKNOWN;
02176 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
02177 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
02178 }
02179 }
02180 else {
02181 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
02182 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
02183 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
02184 if (len == 0)
02185 return str;
02186 if (RSTRING_LEN(str) == 0) {
02187 rb_str_buf_cat(str, ptr, len);
02188 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
02189 return str;
02190 }
02191 goto incompatible;
02192 }
02193 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
02194 ptr_cr = coderange_scan(ptr, len, ptr_enc);
02195 }
02196 if (str_cr == ENC_CODERANGE_UNKNOWN) {
02197 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
02198 str_cr = rb_enc_str_coderange(str);
02199 }
02200 }
02201 }
02202 if (ptr_cr_ret)
02203 *ptr_cr_ret = ptr_cr;
02204
02205 if (str_encindex != ptr_encindex &&
02206 str_cr != ENC_CODERANGE_7BIT &&
02207 ptr_cr != ENC_CODERANGE_7BIT) {
02208 incompatible:
02209 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
02210 rb_enc_name(rb_enc_from_index(str_encindex)),
02211 rb_enc_name(rb_enc_from_index(ptr_encindex)));
02212 }
02213
02214 if (str_cr == ENC_CODERANGE_UNKNOWN) {
02215 res_encindex = str_encindex;
02216 res_cr = ENC_CODERANGE_UNKNOWN;
02217 }
02218 else if (str_cr == ENC_CODERANGE_7BIT) {
02219 if (ptr_cr == ENC_CODERANGE_7BIT) {
02220 res_encindex = str_encindex;
02221 res_cr = ENC_CODERANGE_7BIT;
02222 }
02223 else {
02224 res_encindex = ptr_encindex;
02225 res_cr = ptr_cr;
02226 }
02227 }
02228 else if (str_cr == ENC_CODERANGE_VALID) {
02229 res_encindex = str_encindex;
02230 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
02231 res_cr = str_cr;
02232 else
02233 res_cr = ptr_cr;
02234 }
02235 else {
02236 res_encindex = str_encindex;
02237 res_cr = str_cr;
02238 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
02239 }
02240
02241 if (len < 0) {
02242 rb_raise(rb_eArgError, "negative string size (or size too big)");
02243 }
02244 str_buf_cat(str, ptr, len);
02245 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
02246 return str;
02247 }
02248
02249 VALUE
02250 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
02251 {
02252 return rb_enc_cr_str_buf_cat(str, ptr, len,
02253 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
02254 }
02255
02256 VALUE
02257 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02258 {
02259
02260 int encindex = ENCODING_GET(str);
02261 rb_encoding *enc = rb_enc_from_index(encindex);
02262 if (rb_enc_asciicompat(enc)) {
02263 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02264 encindex, ENC_CODERANGE_7BIT, 0);
02265 }
02266 else {
02267 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02268 while (*ptr) {
02269 unsigned int c = (unsigned char)*ptr;
02270 int len = rb_enc_codelen(c, enc);
02271 rb_enc_mbcput(c, buf, enc);
02272 rb_enc_cr_str_buf_cat(str, buf, len,
02273 encindex, ENC_CODERANGE_VALID, 0);
02274 ptr++;
02275 }
02276 return str;
02277 }
02278 }
02279
02280 VALUE
02281 rb_str_buf_append(VALUE str, VALUE str2)
02282 {
02283 int str2_cr;
02284
02285 str2_cr = ENC_CODERANGE(str2);
02286
02287 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02288 ENCODING_GET(str2), str2_cr, &str2_cr);
02289
02290 OBJ_INFECT(str, str2);
02291 ENC_CODERANGE_SET(str2, str2_cr);
02292
02293 return str;
02294 }
02295
02296 VALUE
02297 rb_str_append(VALUE str, VALUE str2)
02298 {
02299 rb_encoding *enc;
02300 int cr, cr2;
02301 long len2;
02302
02303 StringValue(str2);
02304 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02305 long len1 = RSTRING(str)->as.heap.len, len = len1 + len2;
02306 enc = rb_enc_check(str, str2);
02307 cr = ENC_CODERANGE(str);
02308 if ((cr2 = ENC_CODERANGE(str2)) > cr || RSTRING_LEN(str) == 0)
02309 cr = cr2;
02310 rb_str_modify_expand(str, len2);
02311 memcpy(RSTRING(str)->as.heap.ptr + len1, RSTRING_PTR(str2), len2);
02312 TERM_FILL(RSTRING(str)->as.heap.ptr + len, rb_enc_mbminlen(enc));
02313 RSTRING(str)->as.heap.len = len;
02314 rb_enc_associate(str, enc);
02315 ENC_CODERANGE_SET(str, cr);
02316 OBJ_INFECT(str, str2);
02317 return str;
02318 }
02319 return rb_str_buf_append(str, str2);
02320 }
02321
02322
02323
02324
02325
02326
02327
02328
02329
02330
02331
02332
02333
02334
02335
02336
02337
02338 VALUE
02339 rb_str_concat(VALUE str1, VALUE str2)
02340 {
02341 unsigned int code;
02342 rb_encoding *enc = STR_ENC_GET(str1);
02343
02344 if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
02345 if (rb_num_to_uint(str2, &code) == 0) {
02346 }
02347 else if (FIXNUM_P(str2)) {
02348 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02349 }
02350 else {
02351 rb_raise(rb_eRangeError, "bignum out of char range");
02352 }
02353 }
02354 else {
02355 return rb_str_append(str1, str2);
02356 }
02357
02358 if (enc == rb_usascii_encoding()) {
02359
02360 char buf[1];
02361 buf[0] = (char)code;
02362 if (code > 0xFF) {
02363 rb_raise(rb_eRangeError, "%u out of char range", code);
02364 }
02365 rb_str_cat(str1, buf, 1);
02366 if (code > 127) {
02367 rb_enc_associate(str1, rb_ascii8bit_encoding());
02368 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02369 }
02370 }
02371 else {
02372 long pos = RSTRING_LEN(str1);
02373 int cr = ENC_CODERANGE(str1);
02374 int len;
02375 char *buf;
02376
02377 switch (len = rb_enc_codelen(code, enc)) {
02378 case ONIGERR_INVALID_CODE_POINT_VALUE:
02379 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02380 break;
02381 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02382 case 0:
02383 rb_raise(rb_eRangeError, "%u out of char range", code);
02384 break;
02385 }
02386 buf = ALLOCA_N(char, len + 1);
02387 rb_enc_mbcput(code, buf, enc);
02388 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02389 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02390 }
02391 rb_str_resize(str1, pos+len);
02392 memcpy(RSTRING_PTR(str1) + pos, buf, len);
02393 if (cr == ENC_CODERANGE_7BIT && code > 127)
02394 cr = ENC_CODERANGE_VALID;
02395 ENC_CODERANGE_SET(str1, cr);
02396 }
02397 return str1;
02398 }
02399
02400
02401
02402
02403
02404
02405
02406
02407
02408
02409
02410
02411 static VALUE
02412 rb_str_prepend(VALUE str, VALUE str2)
02413 {
02414 StringValue(str2);
02415 StringValue(str);
02416 rb_str_update(str, 0L, 0L, str2);
02417 return str;
02418 }
02419
02420 st_index_t
02421 rb_str_hash(VALUE str)
02422 {
02423 int e = ENCODING_GET(str);
02424 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02425 e = 0;
02426 }
02427 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02428 }
02429
02430 int
02431 rb_str_hash_cmp(VALUE str1, VALUE str2)
02432 {
02433 long len;
02434
02435 if (!rb_str_comparable(str1, str2)) return 1;
02436 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02437 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02438 return 0;
02439 }
02440 return 1;
02441 }
02442
02443
02444
02445
02446
02447
02448
02449
02450 static VALUE
02451 rb_str_hash_m(VALUE str)
02452 {
02453 st_index_t hval = rb_str_hash(str);
02454 return INT2FIX(hval);
02455 }
02456
02457 #define lesser(a,b) (((a)>(b))?(b):(a))
02458
02459 int
02460 rb_str_comparable(VALUE str1, VALUE str2)
02461 {
02462 int idx1, idx2;
02463 int rc1, rc2;
02464
02465 if (RSTRING_LEN(str1) == 0) return TRUE;
02466 if (RSTRING_LEN(str2) == 0) return TRUE;
02467 idx1 = ENCODING_GET(str1);
02468 idx2 = ENCODING_GET(str2);
02469 if (idx1 == idx2) return TRUE;
02470 rc1 = rb_enc_str_coderange(str1);
02471 rc2 = rb_enc_str_coderange(str2);
02472 if (rc1 == ENC_CODERANGE_7BIT) {
02473 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02474 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02475 return TRUE;
02476 }
02477 if (rc2 == ENC_CODERANGE_7BIT) {
02478 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02479 return TRUE;
02480 }
02481 return FALSE;
02482 }
02483
02484 int
02485 rb_str_cmp(VALUE str1, VALUE str2)
02486 {
02487 long len1, len2;
02488 const char *ptr1, *ptr2;
02489 int retval;
02490
02491 if (str1 == str2) return 0;
02492 RSTRING_GETMEM(str1, ptr1, len1);
02493 RSTRING_GETMEM(str2, ptr2, len2);
02494 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02495 if (len1 == len2) {
02496 if (!rb_str_comparable(str1, str2)) {
02497 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02498 return 1;
02499 return -1;
02500 }
02501 return 0;
02502 }
02503 if (len1 > len2) return 1;
02504 return -1;
02505 }
02506 if (retval > 0) return 1;
02507 return -1;
02508 }
02509
02510
02511 static VALUE
02512 str_eql(const VALUE str1, const VALUE str2)
02513 {
02514 const long len = RSTRING_LEN(str1);
02515 const char *ptr1, *ptr2;
02516
02517 if (len != RSTRING_LEN(str2)) return Qfalse;
02518 if (!rb_str_comparable(str1, str2)) return Qfalse;
02519 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02520 return Qtrue;
02521 if (memcmp(ptr1, ptr2, len) == 0)
02522 return Qtrue;
02523 return Qfalse;
02524 }
02525
02526
02527
02528
02529
02530
02531
02532
02533
02534
02535
02536
02537
02538
02539
02540
02541 VALUE
02542 rb_str_equal(VALUE str1, VALUE str2)
02543 {
02544 if (str1 == str2) return Qtrue;
02545 if (!RB_TYPE_P(str2, T_STRING)) {
02546 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02547 return Qfalse;
02548 }
02549 return rb_equal(str2, str1);
02550 }
02551 return str_eql(str1, str2);
02552 }
02553
02554
02555
02556
02557
02558
02559
02560
02561 static VALUE
02562 rb_str_eql(VALUE str1, VALUE str2)
02563 {
02564 if (str1 == str2) return Qtrue;
02565 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
02566 return str_eql(str1, str2);
02567 }
02568
02569
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586
02587
02588
02589
02590
02591
02592
02593
02594 static VALUE
02595 rb_str_cmp_m(VALUE str1, VALUE str2)
02596 {
02597 int result;
02598
02599 if (!RB_TYPE_P(str2, T_STRING)) {
02600 VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
02601 if (RB_TYPE_P(tmp, T_STRING)) {
02602 result = rb_str_cmp(str1, tmp);
02603 }
02604 else {
02605 return rb_invcmp(str1, str2);
02606 }
02607 }
02608 else {
02609 result = rb_str_cmp(str1, str2);
02610 }
02611 return INT2FIX(result);
02612 }
02613
02614
02615
02616
02617
02618
02619
02620
02621
02622
02623
02624
02625
02626 static VALUE
02627 rb_str_casecmp(VALUE str1, VALUE str2)
02628 {
02629 long len;
02630 rb_encoding *enc;
02631 char *p1, *p1end, *p2, *p2end;
02632
02633 StringValue(str2);
02634 enc = rb_enc_compatible(str1, str2);
02635 if (!enc) {
02636 return Qnil;
02637 }
02638
02639 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02640 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02641 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02642 while (p1 < p1end && p2 < p2end) {
02643 if (*p1 != *p2) {
02644 unsigned int c1 = TOUPPER(*p1 & 0xff);
02645 unsigned int c2 = TOUPPER(*p2 & 0xff);
02646 if (c1 != c2)
02647 return INT2FIX(c1 < c2 ? -1 : 1);
02648 }
02649 p1++;
02650 p2++;
02651 }
02652 }
02653 else {
02654 while (p1 < p1end && p2 < p2end) {
02655 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02656 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02657
02658 if (0 <= c1 && 0 <= c2) {
02659 c1 = TOUPPER(c1);
02660 c2 = TOUPPER(c2);
02661 if (c1 != c2)
02662 return INT2FIX(c1 < c2 ? -1 : 1);
02663 }
02664 else {
02665 int r;
02666 l1 = rb_enc_mbclen(p1, p1end, enc);
02667 l2 = rb_enc_mbclen(p2, p2end, enc);
02668 len = l1 < l2 ? l1 : l2;
02669 r = memcmp(p1, p2, len);
02670 if (r != 0)
02671 return INT2FIX(r < 0 ? -1 : 1);
02672 if (l1 != l2)
02673 return INT2FIX(l1 < l2 ? -1 : 1);
02674 }
02675 p1 += l1;
02676 p2 += l2;
02677 }
02678 }
02679 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02680 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02681 return INT2FIX(-1);
02682 }
02683
02684 static long
02685 rb_str_index(VALUE str, VALUE sub, long offset)
02686 {
02687 char *s, *sptr, *e;
02688 long pos, len, slen;
02689 int single_byte = single_byte_optimizable(str);
02690 rb_encoding *enc;
02691
02692 enc = rb_enc_check(str, sub);
02693 if (is_broken_string(sub)) return -1;
02694
02695 len = single_byte ? RSTRING_LEN(str) : str_strlen(str, enc);
02696 slen = str_strlen(sub, enc);
02697 if (offset < 0) {
02698 offset += len;
02699 if (offset < 0) return -1;
02700 }
02701 if (len - offset < slen) return -1;
02702
02703 s = RSTRING_PTR(str);
02704 e = RSTRING_END(str);
02705 if (offset) {
02706 offset = str_offset(s, e, offset, enc, single_byte);
02707 s += offset;
02708 }
02709 if (slen == 0) return offset;
02710
02711 sptr = RSTRING_PTR(sub);
02712 slen = RSTRING_LEN(sub);
02713 len = RSTRING_LEN(str) - offset;
02714 for (;;) {
02715 char *t;
02716 pos = rb_memsearch(sptr, slen, s, len, enc);
02717 if (pos < 0) return pos;
02718 t = rb_enc_right_char_head(s, s+pos, e, enc);
02719 if (t == s + pos) break;
02720 len -= t - s;
02721 if (len <= 0) return -1;
02722 offset += t - s;
02723 s = t;
02724 }
02725 return pos + offset;
02726 }
02727
02728
02729
02730
02731
02732
02733
02734
02735
02736
02737
02738
02739
02740
02741
02742
02743
02744
02745
02746 static VALUE
02747 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02748 {
02749 VALUE sub;
02750 VALUE initpos;
02751 long pos;
02752
02753 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02754 pos = NUM2LONG(initpos);
02755 }
02756 else {
02757 pos = 0;
02758 }
02759 if (pos < 0) {
02760 pos += str_strlen(str, STR_ENC_GET(str));
02761 if (pos < 0) {
02762 if (RB_TYPE_P(sub, T_REGEXP)) {
02763 rb_backref_set(Qnil);
02764 }
02765 return Qnil;
02766 }
02767 }
02768
02769 if (SPECIAL_CONST_P(sub)) goto generic;
02770 switch (BUILTIN_TYPE(sub)) {
02771 case T_REGEXP:
02772 if (pos > str_strlen(str, STR_ENC_GET(str)))
02773 return Qnil;
02774 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02775 rb_enc_check(str, sub), single_byte_optimizable(str));
02776
02777 pos = rb_reg_search(sub, str, pos, 0);
02778 pos = rb_str_sublen(str, pos);
02779 break;
02780
02781 generic:
02782 default: {
02783 VALUE tmp;
02784
02785 tmp = rb_check_string_type(sub);
02786 if (NIL_P(tmp)) {
02787 rb_raise(rb_eTypeError, "type mismatch: %s given",
02788 rb_obj_classname(sub));
02789 }
02790 sub = tmp;
02791 }
02792
02793 case T_STRING:
02794 pos = rb_str_index(str, sub, pos);
02795 pos = rb_str_sublen(str, pos);
02796 break;
02797 }
02798
02799 if (pos == -1) return Qnil;
02800 return LONG2NUM(pos);
02801 }
02802
02803 #ifdef HAVE_MEMRCHR
02804 static long
02805 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
02806 {
02807 char *hit, *adjusted;
02808 int c;
02809 long slen, searchlen;
02810 char *sbeg, *e, *t;
02811
02812 slen = RSTRING_LEN(sub);
02813 if (slen == 0) return pos;
02814 sbeg = RSTRING_PTR(str);
02815 e = RSTRING_END(str);
02816 t = RSTRING_PTR(sub);
02817 c = *t & 0xff;
02818 searchlen = s - sbeg + 1;
02819
02820 do {
02821 hit = memrchr(sbeg, c, searchlen);
02822 if (!hit) break;
02823 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
02824 if (hit != adjusted) {
02825 searchlen = adjusted - sbeg;
02826 continue;
02827 }
02828 if (memcmp(hit, t, slen) == 0)
02829 return rb_str_sublen(str, hit - sbeg);
02830 searchlen = adjusted - sbeg;
02831 } while (searchlen > 0);
02832
02833 return -1;
02834 }
02835 #else
02836 static long
02837 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
02838 {
02839 long slen;
02840 char *sbeg, *e, *t;
02841
02842 sbeg = RSTRING_PTR(str);
02843 e = RSTRING_END(str);
02844 t = RSTRING_PTR(sub);
02845 slen = RSTRING_LEN(sub);
02846
02847 while (s) {
02848 if (memcmp(s, t, slen) == 0) {
02849 return pos;
02850 }
02851 if (pos == 0) break;
02852 pos--;
02853 s = rb_enc_prev_char(sbeg, s, e, enc);
02854 }
02855
02856 return -1;
02857 }
02858 #endif
02859
02860 static long
02861 rb_str_rindex(VALUE str, VALUE sub, long pos)
02862 {
02863 long len, slen;
02864 char *sbeg, *s;
02865 rb_encoding *enc;
02866 int singlebyte;
02867
02868 enc = rb_enc_check(str, sub);
02869 if (is_broken_string(sub)) return -1;
02870 singlebyte = single_byte_optimizable(str);
02871 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
02872 slen = str_strlen(sub, enc);
02873
02874
02875 if (len < slen) return -1;
02876 if (len - pos < slen) pos = len - slen;
02877 if (len == 0) return pos;
02878
02879 sbeg = RSTRING_PTR(str);
02880
02881 if (pos == 0) {
02882 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
02883 return 0;
02884 else
02885 return -1;
02886 }
02887
02888 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
02889 return str_rindex(str, sub, s, pos, enc);
02890 }
02891
02892
02893
02894
02895
02896
02897
02898
02899
02900
02901
02902
02903
02904
02905
02906
02907
02908
02909
02910
02911 static VALUE
02912 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02913 {
02914 VALUE sub;
02915 VALUE vpos;
02916 rb_encoding *enc = STR_ENC_GET(str);
02917 long pos, len = str_strlen(str, enc);
02918
02919 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02920 pos = NUM2LONG(vpos);
02921 if (pos < 0) {
02922 pos += len;
02923 if (pos < 0) {
02924 if (RB_TYPE_P(sub, T_REGEXP)) {
02925 rb_backref_set(Qnil);
02926 }
02927 return Qnil;
02928 }
02929 }
02930 if (pos > len) pos = len;
02931 }
02932 else {
02933 pos = len;
02934 }
02935
02936 if (SPECIAL_CONST_P(sub)) goto generic;
02937 switch (BUILTIN_TYPE(sub)) {
02938 case T_REGEXP:
02939
02940 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02941 STR_ENC_GET(str), single_byte_optimizable(str));
02942
02943 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02944 pos = rb_reg_search(sub, str, pos, 1);
02945 pos = rb_str_sublen(str, pos);
02946 }
02947 if (pos >= 0) return LONG2NUM(pos);
02948 break;
02949
02950 generic:
02951 default: {
02952 VALUE tmp;
02953
02954 tmp = rb_check_string_type(sub);
02955 if (NIL_P(tmp)) {
02956 rb_raise(rb_eTypeError, "type mismatch: %s given",
02957 rb_obj_classname(sub));
02958 }
02959 sub = tmp;
02960 }
02961
02962 case T_STRING:
02963 pos = rb_str_rindex(str, sub, pos);
02964 if (pos >= 0) return LONG2NUM(pos);
02965 break;
02966 }
02967 return Qnil;
02968 }
02969
02970
02971
02972
02973
02974
02975
02976
02977
02978
02979
02980
02981
02982
02983
02984
02985
02986
02987
02988 static VALUE
02989 rb_str_match(VALUE x, VALUE y)
02990 {
02991 if (SPECIAL_CONST_P(y)) goto generic;
02992 switch (BUILTIN_TYPE(y)) {
02993 case T_STRING:
02994 rb_raise(rb_eTypeError, "type mismatch: String given");
02995
02996 case T_REGEXP:
02997 return rb_reg_match(y, x);
02998
02999 generic:
03000 default:
03001 return rb_funcall(y, rb_intern("=~"), 1, x);
03002 }
03003 }
03004
03005
03006 static VALUE get_pat(VALUE, int);
03007
03008
03009
03010
03011
03012
03013
03014
03015
03016
03017
03018
03019
03020
03021
03022
03023
03024
03025
03026
03027
03028
03029
03030
03031
03032
03033
03034
03035
03036
03037
03038 static VALUE
03039 rb_str_match_m(int argc, VALUE *argv, VALUE str)
03040 {
03041 VALUE re, result;
03042 if (argc < 1)
03043 rb_check_arity(argc, 1, 2);
03044 re = argv[0];
03045 argv[0] = str;
03046 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
03047 if (!NIL_P(result) && rb_block_given_p()) {
03048 return rb_yield(result);
03049 }
03050 return result;
03051 }
03052
/*
 * Result of stepping a single character to its neighbour, as returned by
 * enc_succ_char(), enc_pred_char() and enc_succ_alnum_char().
 */
03053 enum neighbor_char {
/* the bytes are not (or did not become) a usable character */
03054 NEIGHBOR_NOT_CHAR,
/* a neighbouring character of the same byte length was stored */
03055 NEIGHBOR_FOUND,
/* no neighbour of the same byte length exists; the value wrapped around */
03056 NEIGHBOR_WRAPPED
03057 };
03058
/*
 * Replace, in place, the character stored in the len bytes at p by the
 * next character of the same byte length in encoding enc.
 *
 * Returns NEIGHBOR_FOUND on success, NEIGHBOR_WRAPPED when no successor
 * of that length exists, and NEIGHBOR_NOT_CHAR (wide-encoding path only)
 * when the input or the result is not a valid character.
 */
03059 static enum neighbor_char
03060 enc_succ_char(char *p, long len, rb_encoding *enc)
03061 {
03062 long i;
03063 int l;
03064
/* Encodings whose minimum character length exceeds one byte: work on
 * the code point instead of on individual bytes. */
03065 if (rb_enc_mbminlen(enc) > 1) {
03066
03067 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
03068 if (!MBCLEN_CHARFOUND_P(r)) {
03069 return NEIGHBOR_NOT_CHAR;
03070 }
03071 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
03072 l = rb_enc_code_to_mbclen(c, enc);
03073 if (!l) return NEIGHBOR_NOT_CHAR;
/* the successor needs a different number of bytes: report a wrap
 * without touching p */
03074 if (l != len) return NEIGHBOR_WRAPPED;
03075 rb_enc_mbcput(c, p, enc);
03076 r = rb_enc_precise_mbclen(p, p + len, enc);
03077 if (!MBCLEN_CHARFOUND_P(r)) {
03078 return NEIGHBOR_NOT_CHAR;
03079 }
03080 return NEIGHBOR_FOUND;
03081 }
/* Otherwise treat the bytes as a counter whose last byte is least
 * significant and keep incrementing until a valid character of
 * exactly len bytes appears. */
03082 while (1) {
/* carry: trailing 0xff bytes roll over to 0x00 */
03083 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
03084 p[i] = '\0';
/* every byte was 0xff: the counter overflowed (p is now all zero) */
03085 if (i < 0)
03086 return NEIGHBOR_WRAPPED;
03087 ++((unsigned char*)p)[i];
03088 l = rb_enc_precise_mbclen(p, p+len, enc);
03089 if (MBCLEN_CHARFOUND_P(l)) {
03090 l = MBCLEN_CHARFOUND_LEN(l);
03091 if (l == len) {
03092 return NEIGHBOR_FOUND;
03093 }
03094 else {
/* a shorter character was recognised: saturate the remaining
 * bytes so the next iteration carries into the recognised part */
03095 memset(p+l, 0xff, len-l);
03096 }
03097 }
/* Invalid sequence and the incremented byte was not the last one:
 * find the longest proper prefix that is not invalid and saturate
 * the bytes after the one following it, skipping combinations that
 * share the same invalid prefix. */
03098 if (MBCLEN_INVALID_P(l) && i < len-1) {
03099 long len2;
03100 int l2;
03101 for (len2 = len-1; 0 < len2; len2--) {
03102 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
03103 if (!MBCLEN_INVALID_P(l2))
03104 break;
03105 }
03106 memset(p+len2+1, 0xff, len-(len2+1));
03107 }
03108 }
03109 }
03110
/*
 * Replace, in place, the character stored in the len bytes at p by the
 * previous character of the same byte length in encoding enc.
 * Mirror image of enc_succ_char().
 *
 * Returns NEIGHBOR_FOUND on success, NEIGHBOR_WRAPPED when no
 * predecessor of that length exists, and NEIGHBOR_NOT_CHAR (wide-encoding
 * path only) when the input is invalid, is code point 0, or the result
 * is not a valid character.
 */
03111 static enum neighbor_char
03112 enc_pred_char(char *p, long len, rb_encoding *enc)
03113 {
03114 long i;
03115 int l;
/* Encodings whose minimum character length exceeds one byte: work on
 * the code point instead of on individual bytes. */
03116 if (rb_enc_mbminlen(enc) > 1) {
03117
03118 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
03119 if (!MBCLEN_CHARFOUND_P(r)) {
03120 return NEIGHBOR_NOT_CHAR;
03121 }
03122 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
/* code point 0 has no predecessor */
03123 if (!c) return NEIGHBOR_NOT_CHAR;
03124 --c;
03125 l = rb_enc_code_to_mbclen(c, enc);
03126 if (!l) return NEIGHBOR_NOT_CHAR;
/* the predecessor needs a different number of bytes: report a wrap
 * without touching p */
03127 if (l != len) return NEIGHBOR_WRAPPED;
03128 rb_enc_mbcput(c, p, enc);
03129 r = rb_enc_precise_mbclen(p, p + len, enc);
03130 if (!MBCLEN_CHARFOUND_P(r)) {
03131 return NEIGHBOR_NOT_CHAR;
03132 }
03133 return NEIGHBOR_FOUND;
03134 }
/* Otherwise treat the bytes as a counter whose last byte is least
 * significant and keep decrementing until a valid character of
 * exactly len bytes appears. */
03135 while (1) {
/* borrow: trailing 0x00 bytes roll under to 0xff */
03136 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
03137 p[i] = '\xff';
/* every byte was zero: the counter underflowed (p is now all 0xff) */
03138 if (i < 0)
03139 return NEIGHBOR_WRAPPED;
03140 --((unsigned char*)p)[i];
03141 l = rb_enc_precise_mbclen(p, p+len, enc);
03142 if (MBCLEN_CHARFOUND_P(l)) {
03143 l = MBCLEN_CHARFOUND_LEN(l);
03144 if (l == len) {
03145 return NEIGHBOR_FOUND;
03146 }
03147 else {
/* a shorter character was recognised: zero the remaining bytes
 * so the next iteration borrows from the recognised part */
03148 memset(p+l, 0, len-l);
03149 }
03150 }
/* Invalid sequence and the decremented byte was not the last one:
 * find the longest proper prefix that is not invalid and zero the
 * bytes after the one following it, skipping combinations that
 * share the same invalid prefix. */
03151 if (MBCLEN_INVALID_P(l) && i < len-1) {
03152 long len2;
03153 int l2;
03154 for (len2 = len-1; 0 < len2; len2--) {
03155 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
03156 if (!MBCLEN_INVALID_P(l2))
03157 break;
03158 }
03159 memset(p+len2+1, 0, len-(len2+1));
03160 }
03161 }
03162 }
03163
03164
03165
03166
03167
03168
03169
03170
03171
03172
/*
 * Advance the alphanumeric character in the len bytes at p to the next
 * character of the same class (digit or alphabetic), in place.
 *
 * Returns
 *   NEIGHBOR_NOT_CHAR  if the character is neither a digit nor alphabetic,
 *                      or if no earlier character of its class could be
 *                      found to wrap to;
 *   NEIGHBOR_FOUND     if a successor of the same class was stored in p;
 *   NEIGHBOR_WRAPPED   if the run of that class was exhausted: p is reset
 *                      to the first character of the run and carry
 *                      receives the len bytes to be inserted by the caller.
 */
03173 static enum neighbor_char
03174 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
03175 {
03176 enum neighbor_char ret;
03177 unsigned int c;
03178 int ctype;
03179 int range;
03180 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
03181
03182
03183 int try;
/* number of non-matching successors tolerated before giving up */
03184 const int max_gaps = 1;
03185
03186 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
03187 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
03188 ctype = ONIGENC_CTYPE_DIGIT;
03189 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
03190 ctype = ONIGENC_CTYPE_ALPHA;
03191 else
03192 return NEIGHBOR_NOT_CHAR;
03193
/* Step forward (p is modified cumulatively across tries) until a
 * character of the same class shows up or the tries are used up. */
03194 MEMCPY(save, p, char, len);
03195 for (try = 0; try <= max_gaps; ++try) {
03196 ret = enc_succ_char(p, len, enc);
03197 if (ret == NEIGHBOR_FOUND) {
03198 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
03199 if (rb_enc_isctype(c, ctype, enc))
03200 return NEIGHBOR_FOUND;
03201 }
03202 }
/* No successor in the class: restore the original character and walk
 * backwards while the predecessor is still in the class.  range counts
 * the characters seen, starting with the original one. */
03203 MEMCPY(p, save, char, len);
03204 range = 1;
03205 while (1) {
03206 MEMCPY(save, p, char, len);
03207 ret = enc_pred_char(p, len, enc);
03208 if (ret == NEIGHBOR_FOUND) {
03209 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
03210 if (!rb_enc_isctype(c, ctype, enc)) {
/* stepped out of the class: undo the last step and stop */
03211 MEMCPY(p, save, char, len);
03212 break;
03213 }
03214 }
03215 else {
03216 MEMCPY(p, save, char, len);
03217 break;
03218 }
03219 range++;
03220 }
/* the character is alone in its class: nothing to wrap to */
03221 if (range == 1) {
03222 return NEIGHBOR_NOT_CHAR;
03223 }
03224
/* alphabetic: the carry is the first character of the run itself */
03225 if (ctype != ONIGENC_CTYPE_DIGIT) {
03226 MEMCPY(carry, p, char, len);
03227 return NEIGHBOR_WRAPPED;
03228 }
03229
/* digit: the carry is the successor of the first digit of the run
 * (presumably '1' after a wrap to '0' -- confirm for non-ASCII digits) */
03230 MEMCPY(carry, p, char, len);
03231 enc_succ_char(carry, len, enc);
03232 return NEIGHBOR_WRAPPED;
03233 }
03234
03235
03236
03237
03238
03239
03240
03241
03242
03243
03244
03245
03246
03247
03248
03249
03250
03251
03252
03253
03254
03255
03256
03257
03258
03259
03260
/*
 * Return a new string that is the successor of orig; orig itself is not
 * modified (all edits are applied to a copy).
 *
 * The copy is scanned from its last character towards the first.
 * Alphanumeric characters are incremented with enc_succ_alnum_char();
 * when one wraps, the scan continues to the left and the carry bytes are
 * remembered.  If the string contains no alphanumeric character that can
 * be incremented, the characters are incremented as raw characters with
 * enc_succ_char() instead.  If the leftmost processed character wrapped
 * too, the carry is inserted in front of it, growing the string.
 *
 * An empty string yields an empty copy.
 */
03261 VALUE
03262 rb_str_succ(VALUE orig)
03263 {
03264 rb_encoding *enc;
03265 VALUE str;
03266 char *sbeg, *s, *e, *last_alnum = 0;
/* c stays -1 until an alphanumeric character has wrapped */
03267 int c = -1;
03268 long l;
/* default carry: a single byte 0x01 */
03269 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
03270 long carry_pos = 0, carry_len = 1;
03271 enum neighbor_char neighbor = NEIGHBOR_FOUND;
03272
03273 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
03274 rb_enc_cr_str_copy_for_substr(str, orig);
03275 OBJ_INFECT(str, orig);
03276 if (RSTRING_LEN(str) == 0) return str;
03277
03278 enc = STR_ENC_GET(orig);
03279 sbeg = RSTRING_PTR(str);
03280 s = e = sbeg + RSTRING_LEN(str);
03281
/* Pass 1: increment alphanumerics, right to left. */
03282 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
/* After skipping a non-alphanumeric, stop if the character now
 * reached belongs to the other class (letter vs. digit) than the
 * alphanumeric that wrapped last: the carry is not propagated
 * across such a boundary. */
03283 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
03284 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
03285 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
03286 s = last_alnum;
03287 break;
03288 }
03289 }
03290 l = rb_enc_precise_mbclen(s, e, enc);
/* skip positions that do not hold a valid character */
03291 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
03292 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
03293 neighbor = enc_succ_alnum_char(s, l, enc, carry);
03294 switch (neighbor) {
03295 case NEIGHBOR_NOT_CHAR:
03296 continue;
03297 case NEIGHBOR_FOUND:
03298 return str;
03299 case NEIGHBOR_WRAPPED:
03300 last_alnum = s;
03301 break;
03302 }
/* wrapped: remember where the carry would have to be inserted */
03303 c = 1;
03304 carry_pos = s - sbeg;
03305 carry_len = l;
03306 }
/* Pass 2: no alphanumeric wrapped, so increment the characters
 * themselves, right to left. */
03307 if (c == -1) {
03308 s = e;
03309 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
03310 enum neighbor_char neighbor;
03311 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
03312 l = rb_enc_precise_mbclen(s, e, enc);
03313 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
03314 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
/* work on a copy so that a NEIGHBOR_NOT_CHAR result leaves the
 * string untouched */
03315 MEMCPY(tmp, s, char, l);
03316 neighbor = enc_succ_char(tmp, l, enc);
03317 switch (neighbor) {
03318 case NEIGHBOR_FOUND:
03319 MEMCPY(s, tmp, char, l);
03320 return str;
03321 break;
03322 case NEIGHBOR_WRAPPED:
03323 MEMCPY(s, tmp, char, l);
03324 break;
03325 case NEIGHBOR_NOT_CHAR:
03326 break;
03327 }
/* NOTE(review): this compares the raw rb_enc_precise_mbclen()
 * result with l; presumably it advances a wrapped value that is
 * not a valid l-byte character -- confirm. */
03328 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
03329
03330 enc_succ_char(s, l, enc);
03331 }
/* for encodings that are not ASCII compatible the default one-byte
 * carry is replaced by a copy of the current character */
03332 if (!rb_enc_asciicompat(enc)) {
03333 MEMCPY(carry, s, char, l);
03334 carry_len = l;
03335 }
03336 carry_pos = s - sbeg;
03337 }
03338 }
/* Everything wrapped: make room and insert the carry at carry_pos. */
03339 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
03340 s = RSTRING_PTR(str) + carry_pos;
03341 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
03342 memmove(s, carry, carry_len);
03343 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
03344 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03345 rb_enc_str_coderange(str);
03346 return str;
03347 }
03348
03349
03350
03351
03352
03353
03354
03355
03356
03357
03358
03359 static VALUE
03360 rb_str_succ_bang(VALUE str)
03361 {
03362 rb_str_shared_replace(str, rb_str_succ(str));
03363
03364 return str;
03365 }
03366
03367
03368
03369
03370
03371
03372
03373
03374
03375
03376
03377
03378
03379
03380
03381
03382
03383
03384
03385
03386
03387
03388
03389
03390
03391
03392
03393
03394
03395
03396
03397
03398
03399
/*
 * Method entry: upto(end [, exclusive]).
 *
 * Yields successive strings from beg up to end (end itself is skipped
 * when exclusive is true) and returns beg.  Without a block an
 * enumerator is returned.  Three strategies are used:
 *
 *   1. both strings are single ASCII characters: iterate over the byte;
 *   2. both strings are ASCII and consist only of digits: iterate
 *      numerically, formatting with the width of beg;
 *   3. otherwise: repeatedly call "succ" on the current string.
 */
03400 static VALUE
03401 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03402 {
03403 VALUE end, exclusive;
03404 VALUE current, after_end;
03405 ID succ;
03406 int n, excl, ascii;
03407 rb_encoding *enc;
03408
03409 rb_scan_args(argc, argv, "11", &end, &exclusive);
03410 RETURN_ENUMERATOR(beg, argc, argv);
03411 excl = RTEST(exclusive);
03412 CONST_ID(succ, "succ");
03413 StringValue(end);
03414 enc = rb_enc_check(beg, end);
03415 ascii = (is_ascii_string(beg) && is_ascii_string(end));
03416
/* Strategy 1: single ASCII characters. */
03417 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03418 char c = RSTRING_PTR(beg)[0];
03419 char e = RSTRING_PTR(end)[0];
03420
03421 if (c > e || (excl && c == e)) return beg;
03422 for (;;) {
03423 rb_yield(rb_enc_str_new(&c, 1, enc));
/* inclusive range: stop after yielding e, before incrementing */
03424 if (!excl && c == e) break;
03425 c++;
/* exclusive range: stop before yielding e */
03426 if (excl && c == e) break;
03427 }
03428 return beg;
03429 }
03430
/* Strategy 2: both strings start with a digit; verify below that they
 * contain digits only, otherwise fall back to strategy 3. */
03431 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03432 char *s, *send;
03433 VALUE b, e;
03434 int width;
03435
03436 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
/* width of beg, used as the printf precision for every yielded number */
03437 width = rb_long2int(send - s);
03438 while (s < send) {
03439 if (!ISDIGIT(*s)) goto no_digits;
03440 s++;
03441 }
03442 s = RSTRING_PTR(end); send = RSTRING_END(end);
03443 while (s < send) {
03444 if (!ISDIGIT(*s)) goto no_digits;
03445 s++;
03446 }
03447 b = rb_str_to_inum(beg, 10, FALSE);
03448 e = rb_str_to_inum(end, 10, FALSE);
/* both bounds fit in a Fixnum: plain C loop */
03449 if (FIXNUM_P(b) && FIXNUM_P(e)) {
03450 long bi = FIX2LONG(b);
03451 long ei = FIX2LONG(e);
03452 rb_encoding *usascii = rb_usascii_encoding();
03453
03454 while (bi <= ei) {
03455 if (excl && bi == ei) break;
03456 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03457 bi++;
03458 }
03459 }
/* otherwise compare and increment through method calls */
03460 else {
03461 ID op = excl ? '<' : rb_intern("<=");
03462 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03463
03464 args[0] = INT2FIX(width);
/* NOTE(review): the raw VALUE returned by rb_funcall() is used as the
 * loop condition instead of RTEST(); presumably this relies on a
 * false result being zero -- confirm. */
03465 while (rb_funcall(b, op, 1, e)) {
03466 args[1] = b;
03467 rb_yield(rb_str_format(numberof(args), args, fmt));
03468 b = rb_funcall(b, succ, 0, 0);
03469 }
03470 }
03471 return beg;
03472 }
03473
/* Strategy 3: generic iteration via "succ". */
03474 no_digits:
03475 n = rb_str_cmp(beg, end);
03476 if (n > 0 || (excl && n == 0)) return beg;
03477
03478 after_end = rb_funcall(end, succ, 0, 0);
03479 current = rb_str_dup(beg);
03480 while (!rb_str_equal(current, after_end)) {
03481 VALUE next = Qnil;
/* compute the successor before yielding; it is left nil when the
 * inclusive end has been reached so the loop stops after this yield */
03482 if (excl || !rb_str_equal(current, end))
03483 next = rb_funcall(current, succ, 0, 0);
03484 rb_yield(current);
03485 if (NIL_P(next)) break;
03486 current = next;
03487 StringValue(current);
03488 if (excl && rb_str_equal(current, end)) break;
/* stop once the current string is longer than end or has become empty */
03489 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03490 break;
03491 }
03492
03493 return beg;
03494 }
03495
03496 static VALUE
03497 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03498 {
03499 if (rb_reg_search(re, str, 0, 0) >= 0) {
03500 VALUE match = rb_backref_get();
03501 int nth = rb_reg_backref_number(match, backref);
03502 return rb_reg_nth_match(nth, match);
03503 }
03504 return Qnil;
03505 }
03506
03507 static VALUE
03508 rb_str_aref(VALUE str, VALUE indx)
03509 {
03510 long idx;
03511
03512 if (FIXNUM_P(indx)) {
03513 idx = FIX2LONG(indx);
03514
03515 num_index:
03516 str = rb_str_substr(str, idx, 1);
03517 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03518 return str;
03519 }
03520
03521 if (SPECIAL_CONST_P(indx)) goto generic;
03522 switch (BUILTIN_TYPE(indx)) {
03523 case T_REGEXP:
03524 return rb_str_subpat(str, indx, INT2FIX(0));
03525
03526 case T_STRING:
03527 if (rb_str_index(str, indx, 0) != -1)
03528 return rb_str_dup(indx);
03529 return Qnil;
03530
03531 generic:
03532 default:
03533
03534 {
03535 long beg, len;
03536 VALUE tmp;
03537
03538 len = str_strlen(str, STR_ENC_GET(str));
03539 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03540 case Qfalse:
03541 break;
03542 case Qnil:
03543 return Qnil;
03544 default:
03545 tmp = rb_str_substr(str, beg, len);
03546 return tmp;
03547 }
03548 }
03549 idx = NUM2LONG(indx);
03550 goto num_index;
03551 }
03552
03553 UNREACHABLE;
03554 }
03555
03556
03557
03558
03559
03560
03561
03562
03563
03564
03565
03566
03567
03568
03569
03570
03571
03572
03573
03574
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
03587
03588
03589
03590
03591
03592
03593
03594
03595
03596
03597
03598
03599
03600
03601
03602
03603
03604
03605
03606
03607
03608
03609
03610
03611
03612
03613
03614
03615
03616
03617
03618
03619
03620
03621
03622
03623
03624
03625
03626 static VALUE
03627 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03628 {
03629 if (argc == 2) {
03630 if (RB_TYPE_P(argv[0], T_REGEXP)) {
03631 return rb_str_subpat(str, argv[0], argv[1]);
03632 }
03633 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03634 }
03635 rb_check_arity(argc, 1, 2);
03636 return rb_str_aref(str, argv[0]);
03637 }
03638
03639 VALUE
03640 rb_str_drop_bytes(VALUE str, long len)
03641 {
03642 char *ptr = RSTRING_PTR(str);
03643 long olen = RSTRING_LEN(str), nlen;
03644
03645 str_modifiable(str);
03646 if (len > olen) len = olen;
03647 nlen = olen - len;
03648 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03649 char *oldptr = ptr;
03650 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03651 STR_SET_EMBED(str);
03652 STR_SET_EMBED_LEN(str, nlen);
03653 ptr = RSTRING(str)->as.ary;
03654 memmove(ptr, oldptr + len, nlen);
03655 if (fl == STR_NOEMBED) xfree(oldptr);
03656 }
03657 else {
03658 if (!STR_SHARED_P(str)) rb_str_new4(str);
03659 ptr = RSTRING(str)->as.heap.ptr += len;
03660 RSTRING(str)->as.heap.len = nlen;
03661 }
03662 ptr[nlen] = 0;
03663 ENC_CODERANGE_CLEAR(str);
03664 return str;
03665 }
03666
03667 static void
03668 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03669 {
03670 if (beg == 0 && RSTRING_LEN(val) == 0) {
03671 rb_str_drop_bytes(str, len);
03672 OBJ_INFECT(str, val);
03673 return;
03674 }
03675
03676 rb_str_modify(str);
03677 if (len < RSTRING_LEN(val)) {
03678
03679 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + TERM_LEN(str));
03680 }
03681
03682 if (RSTRING_LEN(val) != len) {
03683 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03684 RSTRING_PTR(str) + beg + len,
03685 RSTRING_LEN(str) - (beg + len));
03686 }
03687 if (RSTRING_LEN(val) < beg && len < 0) {
03688 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03689 }
03690 if (RSTRING_LEN(val) > 0) {
03691 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03692 }
03693 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03694 if (RSTRING_PTR(str)) {
03695 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03696 }
03697 OBJ_INFECT(str, val);
03698 }
03699
03700 static void
03701 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03702 {
03703 long slen;
03704 char *p, *e;
03705 rb_encoding *enc;
03706 int singlebyte = single_byte_optimizable(str);
03707 int cr;
03708
03709 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03710
03711 StringValue(val);
03712 enc = rb_enc_check(str, val);
03713 slen = str_strlen(str, enc);
03714
03715 if (slen < beg) {
03716 out_of_range:
03717 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03718 }
03719 if (beg < 0) {
03720 if (-beg > slen) {
03721 goto out_of_range;
03722 }
03723 beg += slen;
03724 }
03725 if (slen < len || slen < beg + len) {
03726 len = slen - beg;
03727 }
03728 str_modify_keep_cr(str);
03729 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03730 if (!p) p = RSTRING_END(str);
03731 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03732 if (!e) e = RSTRING_END(str);
03733
03734 beg = p - RSTRING_PTR(str);
03735 len = e - p;
03736 rb_str_splice_0(str, beg, len, val);
03737 rb_enc_associate(str, enc);
03738 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03739 if (cr != ENC_CODERANGE_BROKEN)
03740 ENC_CODERANGE_SET(str, cr);
03741 }
03742
03743 void
03744 rb_str_update(VALUE str, long beg, long len, VALUE val)
03745 {
03746 rb_str_splice(str, beg, len, val);
03747 }
03748
03749 static void
03750 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03751 {
03752 int nth;
03753 VALUE match;
03754 long start, end, len;
03755 rb_encoding *enc;
03756 struct re_registers *regs;
03757
03758 if (rb_reg_search(re, str, 0, 0) < 0) {
03759 rb_raise(rb_eIndexError, "regexp not matched");
03760 }
03761 match = rb_backref_get();
03762 nth = rb_reg_backref_number(match, backref);
03763 regs = RMATCH_REGS(match);
03764 if (nth >= regs->num_regs) {
03765 out_of_range:
03766 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03767 }
03768 if (nth < 0) {
03769 if (-nth >= regs->num_regs) {
03770 goto out_of_range;
03771 }
03772 nth += regs->num_regs;
03773 }
03774
03775 start = BEG(nth);
03776 if (start == -1) {
03777 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03778 }
03779 end = END(nth);
03780 len = end - start;
03781 StringValue(val);
03782 enc = rb_enc_check(str, val);
03783 rb_str_splice_0(str, start, len, val);
03784 rb_enc_associate(str, enc);
03785 }
03786
03787 static VALUE
03788 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03789 {
03790 long idx, beg;
03791
03792 if (FIXNUM_P(indx)) {
03793 idx = FIX2LONG(indx);
03794 num_index:
03795 rb_str_splice(str, idx, 1, val);
03796 return val;
03797 }
03798
03799 if (SPECIAL_CONST_P(indx)) goto generic;
03800 switch (TYPE(indx)) {
03801 case T_REGEXP:
03802 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03803 return val;
03804
03805 case T_STRING:
03806 beg = rb_str_index(str, indx, 0);
03807 if (beg < 0) {
03808 rb_raise(rb_eIndexError, "string not matched");
03809 }
03810 beg = rb_str_sublen(str, beg);
03811 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03812 return val;
03813
03814 generic:
03815 default:
03816
03817 {
03818 long beg, len;
03819 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03820 rb_str_splice(str, beg, len, val);
03821 return val;
03822 }
03823 }
03824 idx = NUM2LONG(indx);
03825 goto num_index;
03826 }
03827 }
03828
03829
03830
03831
03832
03833
03834
03835
03836
03837
03838
03839
03840
03841
03842
03843
03844
03845
03846
03847
03848
03849
03850
03851
03852
03853
03854 static VALUE
03855 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03856 {
03857 if (argc == 3) {
03858 if (RB_TYPE_P(argv[0], T_REGEXP)) {
03859 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03860 }
03861 else {
03862 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03863 }
03864 return argv[2];
03865 }
03866 rb_check_arity(argc, 2, 3);
03867 return rb_str_aset(str, argv[0], argv[1]);
03868 }
03869
03870
03871
03872
03873
03874
03875
03876
03877
03878
03879
03880
03881
03882
03883
03884
03885
03886
03887 static VALUE
03888 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03889 {
03890 long pos = NUM2LONG(idx);
03891
03892 if (pos == -1) {
03893 return rb_str_append(str, str2);
03894 }
03895 else if (pos < 0) {
03896 pos++;
03897 }
03898 rb_str_splice(str, pos, 0, str2);
03899 return str;
03900 }
03901
03902
03903
03904
03905
03906
03907
03908
03909
03910
03911
03912
03913
03914
03915
03916
03917
03918
03919
03920
03921
03922 static VALUE
03923 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03924 {
03925 VALUE result;
03926 VALUE buf[3];
03927 int i;
03928
03929 rb_check_arity(argc, 1, 2);
03930 for (i=0; i<argc; i++) {
03931 buf[i] = argv[i];
03932 }
03933 str_modify_keep_cr(str);
03934 result = rb_str_aref_m(argc, buf, str);
03935 if (!NIL_P(result)) {
03936 buf[i] = rb_str_new(0,0);
03937 rb_str_aset_m(argc+1, buf, str);
03938 }
03939 return result;
03940 }
03941
03942 static VALUE
03943 get_pat(VALUE pat, int quote)
03944 {
03945 VALUE val;
03946
03947 switch (TYPE(pat)) {
03948 case T_REGEXP:
03949 return pat;
03950
03951 case T_STRING:
03952 break;
03953
03954 default:
03955 val = rb_check_string_type(pat);
03956 if (NIL_P(val)) {
03957 Check_Type(pat, T_REGEXP);
03958 }
03959 pat = val;
03960 }
03961
03962 if (quote) {
03963 pat = rb_reg_quote(pat);
03964 }
03965
03966 return rb_reg_regcomp(pat);
03967 }
03968
03969
03970
03971
03972
03973
03974
03975
03976
03977
03978
03979
03980
/*
 * Method entry: sub!(pattern, replacement), sub!(pattern, hash) or
 * sub!(pattern) { |match| ... }.
 *
 * Replaces the first match of pattern in str in place.  Returns str when
 * a substitution was made and nil when the pattern did not match.
 * The replacement is taken from the block result, from the hash entry
 * for the matched text, or from the replacement string expanded by
 * rb_reg_regsub().
 */
03981 static VALUE
03982 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03983 {
03984 VALUE pat, repl, hash = Qnil;
03985 int iter = 0;
03986 int tainted = 0;
03987 long plen;
/* the replacement argument may be omitted only when a block is given */
03988 int min_arity = rb_block_given_p() ? 1 : 2;
03989
03990 rb_check_arity(argc, min_arity, 2);
03991 if (argc == 1) {
03992 iter = 1;
03993 }
03994 else {
03995 repl = argv[1];
03996 hash = rb_check_hash_type(argv[1]);
/* not a hash: the replacement must be usable as a String */
03997 if (NIL_P(hash)) {
03998 StringValue(repl);
03999 }
04000 tainted = OBJ_TAINTED_RAW(repl);
04001 }
04002
/* a String pattern is quoted before being compiled (quote = 1) */
04003 pat = get_pat(argv[0], 1);
04004 str_modifiable(str);
04005 if (rb_reg_search(pat, str, 0, 0) >= 0) {
04006 rb_encoding *enc;
04007 int cr = ENC_CODERANGE(str);
04008 VALUE match = rb_backref_get();
04009 struct re_registers *regs = RMATCH_REGS(match);
/* offsets of the whole match (group 0) within str */
04010 long beg0 = BEG(0);
04011 long end0 = END(0);
04012 char *p, *rp;
04013 long len, rlen;
04014
04015 if (iter || !NIL_P(hash)) {
/* remember pointer and length so that a modification of str by
 * the block or the hash lookup can be detected afterwards */
04016 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
04017
04018 if (iter) {
04019 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
04020 }
04021 else {
04022 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
04023 repl = rb_obj_as_string(repl);
04024 }
04025 str_mod_check(str, p, len);
04026 rb_check_frozen(str);
04027 }
04028 else {
04029 repl = rb_reg_regsub(repl, str, regs, pat);
04030 }
04031 enc = rb_enc_compatible(str, repl);
04032 if (!enc) {
/* Incompatible encodings are tolerated only when everything that
 * remains of str (the parts before and after the match) scans as
 * 7-bit; the result then takes the encoding of the replacement. */
04033 rb_encoding *str_enc = STR_ENC_GET(str);
04034 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
04035 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
04036 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
04037 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
04038 rb_enc_name(str_enc),
04039 rb_enc_name(STR_ENC_GET(repl)));
04040 }
04041 enc = STR_ENC_GET(repl);
04042 }
04043 rb_str_modify(str);
04044 rb_enc_associate(str, enc);
04045 tainted |= OBJ_TAINTED_RAW(repl);
/* Combine the code ranges: when str's range is known and not broken,
 * the result takes the replacement's range, except that it becomes
 * unknown if the replacement is broken or if str was VALID and the
 * replacement is 7-bit. */
04046 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
04047 int cr2 = ENC_CODERANGE(repl);
04048 if (cr2 == ENC_CODERANGE_BROKEN ||
04049 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
04050 cr = ENC_CODERANGE_UNKNOWN;
04051 else
04052 cr = cr2;
04053 }
/* plen: bytes being replaced; rlen: bytes of the replacement */
04054 plen = end0 - beg0;
04055 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
04056 len = RSTRING_LEN(str);
04057 if (rlen > plen) {
04058 RESIZE_CAPA(str, len + rlen - plen);
04059 }
/* re-read the pointer: the buffer may have been reallocated above */
04060 p = RSTRING_PTR(str);
04061 if (rlen != plen) {
/* move the tail so the hole has exactly the replacement's size */
04062 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
04063 }
04064 memcpy(p + beg0, rp, rlen);
04065 len += rlen - plen;
04066 STR_SET_LEN(str, len);
04067 RSTRING_PTR(str)[len] = '\0';
04068 ENC_CODERANGE_SET(str, cr);
04069 FL_SET_RAW(str, tainted);
04070
04071 return str;
04072 }
04073 return Qnil;
04074 }
04075
04076
04077
04078
04079
04080
04081
04082
04083
04084
04085
04086
04087
04088
04089
04090
04091
04092
04093
04094
04095
04096
04097
04098
04099
04100
04101
04102
04103
04104
04105
04106
04107
04108
04109
04110
04111
04112
04113
04114
04115
04116
04117
04118
04119 static VALUE
04120 rb_str_sub(int argc, VALUE *argv, VALUE str)
04121 {
04122 str = rb_str_dup(str);
04123 rb_str_sub_bang(argc, argv, str);
04124 return str;
04125 }
04126
/*
 * Shared implementation behind rb_str_gsub and rb_str_gsub_bang.
 *
 * argv[0] is the pattern (converted with get_pat).  With one argument the
 * replacement text comes from the block (RETURN_ENUMERATOR handles the
 * block-less call); with two arguments argv[1] is either a hash, which is
 * looked up with the matched text, or a string that is expanded per match
 * by rb_reg_regsub.
 *
 * When +bang+ is non-zero the result replaces the contents of +str+ and
 * Qnil is returned if the pattern never matched.  Otherwise a new string
 * carrying str's class is returned (a plain rb_str_dup when nothing
 * matched).
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val, repl, match, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, n;
    long beg0, end0;
    long offset, blen, slen, len, last;
    int iter = 0;
    char *sp, *cp;
    int tainted = 0;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        iter = 1;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            /* not a hash: the replacement must be usable as a string */
            StringValue(repl);
        }
        tainted = OBJ_TAINTED_RAW(repl);
        break;
      default:
        rb_check_arity(argc, 1, 2);
    }

    pat = get_pat(argv[0], 1);
    beg = rb_reg_search(pat, str, 0, 0);
    if (beg < 0) {
        /* no match at all */
        if (bang) return Qnil;
        return rb_str_dup(str);
    }

    offset = 0;
    n = 0;
    blen = RSTRING_LEN(str) + 30; /* initial capacity of the output buffer */
    dest = rb_str_buf_new(blen);
    /* remember pointer and length so str_mod_check can be called after callbacks */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        n++;
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        beg0 = BEG(0);
        end0 = END(0);
        if (iter || !NIL_P(hash)) {
            if (iter) {
                /* block form: yield the whole match, stringify the result */
                val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                /* hash form: look the matched text up in the hash */
                val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
                val = rb_obj_as_string(val);
            }
            /* checked right after the block / hash lookup ran, using the saved pointer and length */
            str_mod_check(str, sp, slen);
            if (val == dest) {	/* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else {
            val = rb_reg_regsub(repl, str, regs, pat);
        }

        tainted |= OBJ_TAINTED_RAW(val);

        /* copy the unmatched text between the previous match and this one */
        len = beg0 - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Empty match: always consume at least one character of the
             * input string so that the loop makes progress.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_reg_search(pat, str, offset, 0);
    } while (beg >= 0);
    /* append whatever follows the final match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /*
     * Search again from the offset that preceded the final match; the return
     * value is ignored.  NOTE(review): presumably done so the backref reflects
     * the last successful match rather than the failed search -- confirm.
     */
    rb_reg_search(pat, str, last, 0);
    if (bang) {
        rb_str_shared_replace(str, dest);
    }
    else {
        RBASIC_SET_CLASS(dest, rb_obj_class(str));
        tainted |= OBJ_TAINTED_RAW(str);
        str = dest;
    }

    FL_SET_RAW(str, tainted);
    return str;
}
04239
04240
04241
04242
04243
04244
04245
04246
04247
04248
04249
04250
04251
04252 static VALUE
04253 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
04254 {
04255 str_modify_keep_cr(str);
04256 return str_gsub(argc, argv, str, 1);
04257 }
04258
04259
04260
04261
04262
04263
04264
04265
04266
04267
04268
04269
04270
04271
04272
04273
04274
04275
04276
04277
04278
04279
04280
04281
04282
04283
04284
04285
04286
04287
04288
04289
04290
04291
04292
04293
04294
04295
04296
04297
04298
04299
04300
04301
04302
04303 static VALUE
04304 rb_str_gsub(int argc, VALUE *argv, VALUE str)
04305 {
04306 return str_gsub(argc, argv, str, 0);
04307 }
04308
04309
04310
04311
04312
04313
04314
04315
04316
04317
04318
04319
04320
04321 VALUE
04322 rb_str_replace(VALUE str, VALUE str2)
04323 {
04324 str_modifiable(str);
04325 if (str == str2) return str;
04326
04327 StringValue(str2);
04328 str_discard(str);
04329 return str_replace(str, str2);
04330 }
04331
04332
04333
04334
04335
04336
04337
04338
04339
04340
04341
04342 static VALUE
04343 rb_str_clear(VALUE str)
04344 {
04345 str_discard(str);
04346 STR_SET_EMBED(str);
04347 STR_SET_EMBED_LEN(str, 0);
04348 RSTRING_PTR(str)[0] = 0;
04349 if (rb_enc_asciicompat(STR_ENC_GET(str)))
04350 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04351 else
04352 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04353 return str;
04354 }
04355
04356
04357
04358
04359
04360
04361
04362
04363
04364
04365
04366 static VALUE
04367 rb_str_chr(VALUE str)
04368 {
04369 return rb_str_substr(str, 0, 1);
04370 }
04371
04372
04373
04374
04375
04376
04377
04378 static VALUE
04379 rb_str_getbyte(VALUE str, VALUE index)
04380 {
04381 long pos = NUM2LONG(index);
04382
04383 if (pos < 0)
04384 pos += RSTRING_LEN(str);
04385 if (pos < 0 || RSTRING_LEN(str) <= pos)
04386 return Qnil;
04387
04388 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
04389 }
04390
04391
04392
04393
04394
04395
04396
04397 static VALUE
04398 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
04399 {
04400 long pos = NUM2LONG(index);
04401 int byte = NUM2INT(value);
04402
04403 rb_str_modify(str);
04404
04405 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04406 rb_raise(rb_eIndexError, "index %ld out of string", pos);
04407 if (pos < 0)
04408 pos += RSTRING_LEN(str);
04409
04410 RSTRING_PTR(str)[pos] = byte;
04411
04412 return value;
04413 }
04414
04415 static VALUE
04416 str_byte_substr(VALUE str, long beg, long len)
04417 {
04418 char *p, *s = RSTRING_PTR(str);
04419 long n = RSTRING_LEN(str);
04420 VALUE str2;
04421
04422 if (beg > n || len < 0) return Qnil;
04423 if (beg < 0) {
04424 beg += n;
04425 if (beg < 0) return Qnil;
04426 }
04427 if (beg + len > n)
04428 len = n - beg;
04429 if (len <= 0) {
04430 len = 0;
04431 p = 0;
04432 }
04433 else
04434 p = s + beg;
04435
04436 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04437 str2 = rb_str_new4(str);
04438 str2 = str_new3(rb_obj_class(str2), str2);
04439 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04440 RSTRING(str2)->as.heap.len = len;
04441 }
04442 else {
04443 str2 = rb_str_new5(str, p, len);
04444 }
04445
04446 str_enc_copy(str2, str);
04447
04448 if (RSTRING_LEN(str2) == 0) {
04449 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
04450 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
04451 else
04452 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04453 }
04454 else {
04455 switch (ENC_CODERANGE(str)) {
04456 case ENC_CODERANGE_7BIT:
04457 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04458 break;
04459 default:
04460 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
04461 break;
04462 }
04463 }
04464
04465 OBJ_INFECT_RAW(str2, str);
04466
04467 return str2;
04468 }
04469
04470 static VALUE
04471 str_byte_aref(VALUE str, VALUE indx)
04472 {
04473 long idx;
04474 switch (TYPE(indx)) {
04475 case T_FIXNUM:
04476 idx = FIX2LONG(indx);
04477
04478 num_index:
04479 str = str_byte_substr(str, idx, 1);
04480 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04481 return str;
04482
04483 default:
04484
04485 {
04486 long beg, len = RSTRING_LEN(str);
04487
04488 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04489 case Qfalse:
04490 break;
04491 case Qnil:
04492 return Qnil;
04493 default:
04494 return str_byte_substr(str, beg, len);
04495 }
04496 }
04497 idx = NUM2LONG(indx);
04498 goto num_index;
04499 }
04500
04501 UNREACHABLE;
04502 }
04503
04504
04505
04506
04507
04508
04509
04510
04511
04512
04513
04514
04515
04516
04517
04518
04519
04520
04521
04522
04523
04524
04525
04526
04527 static VALUE
04528 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04529 {
04530 if (argc == 2) {
04531 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04532 }
04533 rb_check_arity(argc, 1, 2);
04534 return str_byte_aref(str, argv[0]);
04535 }
04536
04537
04538
04539
04540
04541
04542
04543
04544
04545
04546 static VALUE
04547 rb_str_reverse(VALUE str)
04548 {
04549 rb_encoding *enc;
04550 VALUE rev;
04551 char *s, *e, *p;
04552 int cr;
04553
04554 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04555 enc = STR_ENC_GET(str);
04556 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04557 s = RSTRING_PTR(str); e = RSTRING_END(str);
04558 p = RSTRING_END(rev);
04559 cr = ENC_CODERANGE(str);
04560
04561 if (RSTRING_LEN(str) > 1) {
04562 if (single_byte_optimizable(str)) {
04563 while (s < e) {
04564 *--p = *s++;
04565 }
04566 }
04567 else if (cr == ENC_CODERANGE_VALID) {
04568 while (s < e) {
04569 int clen = rb_enc_fast_mbclen(s, e, enc);
04570
04571 p -= clen;
04572 memcpy(p, s, clen);
04573 s += clen;
04574 }
04575 }
04576 else {
04577 cr = rb_enc_asciicompat(enc) ?
04578 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
04579 while (s < e) {
04580 int clen = rb_enc_mbclen(s, e, enc);
04581
04582 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
04583 p -= clen;
04584 memcpy(p, s, clen);
04585 s += clen;
04586 }
04587 }
04588 }
04589 STR_SET_LEN(rev, RSTRING_LEN(str));
04590 OBJ_INFECT_RAW(rev, str);
04591 str_enc_copy(rev, str);
04592 ENC_CODERANGE_SET(rev, cr);
04593
04594 return rev;
04595 }
04596
04597
04598
04599
04600
04601
04602
04603
04604
04605 static VALUE
04606 rb_str_reverse_bang(VALUE str)
04607 {
04608 if (RSTRING_LEN(str) > 1) {
04609 if (single_byte_optimizable(str)) {
04610 char *s, *e, c;
04611
04612 str_modify_keep_cr(str);
04613 s = RSTRING_PTR(str);
04614 e = RSTRING_END(str) - 1;
04615 while (s < e) {
04616 c = *s;
04617 *s++ = *e;
04618 *e-- = c;
04619 }
04620 }
04621 else {
04622 rb_str_shared_replace(str, rb_str_reverse(str));
04623 }
04624 }
04625 else {
04626 str_modify_keep_cr(str);
04627 }
04628 return str;
04629 }
04630
04631
04632
04633
04634
04635
04636
04637
04638
04639
04640
04641
04642
04643
04644 static VALUE
04645 rb_str_include(VALUE str, VALUE arg)
04646 {
04647 long i;
04648
04649 StringValue(arg);
04650 i = rb_str_index(str, arg, 0);
04651
04652 if (i == -1) return Qfalse;
04653 return Qtrue;
04654 }
04655
04656
04657
04658
04659
04660
04661
04662
04663
04664
04665
04666
04667
04668
04669
04670
04671
04672
04673
04674
04675
04676
04677
04678 static VALUE
04679 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04680 {
04681 int base;
04682
04683 if (argc == 0) base = 10;
04684 else {
04685 VALUE b;
04686
04687 rb_scan_args(argc, argv, "01", &b);
04688 base = NUM2INT(b);
04689 }
04690 if (base < 0) {
04691 rb_raise(rb_eArgError, "invalid radix %d", base);
04692 }
04693 return rb_str_to_inum(str, base, FALSE);
04694 }
04695
04696
04697
04698
04699
04700
04701
04702
04703
04704
04705
04706
04707
04708
04709
04710
04711 static VALUE
04712 rb_str_to_f(VALUE str)
04713 {
04714 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04715 }
04716
04717
04718
04719
04720
04721
04722
04723
04724
04725
04726 static VALUE
04727 rb_str_to_s(VALUE str)
04728 {
04729 if (rb_obj_class(str) != rb_cString) {
04730 return str_duplicate(rb_cString, str);
04731 }
04732 return str;
04733 }
04734
#if 0
/*
 * Compiled out by the surrounding "#if 0".
 *
 * Encodes code point +c+ in +enc+ into a local buffer of RUBY_MAX_CHAR_LEN
 * bytes (length taken from rb_enc_codelen) and appends those bytes to +str+
 * with rb_enc_str_buf_cat.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
04746
04747 #define CHAR_ESC_LEN 13
04748
04749 int
04750 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04751 {
04752 char buf[CHAR_ESC_LEN + 1];
04753 int l;
04754
04755 #if SIZEOF_INT > 4
04756 c &= 0xffffffff;
04757 #endif
04758 if (unicode_p) {
04759 if (c < 0x7F && ISPRINT(c)) {
04760 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04761 }
04762 else if (c < 0x10000) {
04763 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04764 }
04765 else {
04766 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04767 }
04768 }
04769 else {
04770 if (c < 0x100) {
04771 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04772 }
04773 else {
04774 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04775 }
04776 }
04777 l = (int)strlen(buf);
04778 rb_str_buf_cat(result, buf, l);
04779 return l;
04780 }
04781
04782
04783
04784
04785
04786
04787
04788
04789
04790
04791
04792
04793
/*
 * Builds a double-quoted, escaped, printable rendering of +str+ and returns
 * it as a new string infected by +str+.
 *
 * The result is associated with the default internal encoding, or the
 * default external encoding when no internal one is set; if that encoding
 * is not ASCII compatible, US-ASCII is used instead.
 *
 * The source is walked character by character.  Bytes that need no escaping
 * are not copied one at a time: +prev+ marks the start of the pending run
 * and the run [prev, p) is flushed with str_buf_cat whenever an escape has
 * to be emitted, and once more at the end.
 */
VALUE
rb_str_inspect(VALUE str)
{
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
    const char *p, *pend, *prev;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    rb_encoding *resenc = rb_default_internal_encoding();
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    if (resenc == NULL) resenc = rb_default_external_encoding();
    if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
    rb_enc_associate(result, resenc);
    str_buf_cat2(result, "\"");

    p = RSTRING_PTR(str); pend = RSTRING_END(str);
    prev = p;
    /* the encoding actually used for scanning may differ from the declared one */
    actenc = get_actual_encoding(encidx, str);
    if (actenc != enc) {
        enc = actenc;
        if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
    }
    while (p < pend) {
        unsigned int c, cc;
        int n;

        n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /*
             * No complete character here: flush the pending run, then emit
             * up to mbminlen bytes (bounded by the end of the string) as
             * \xHH escapes.
             */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;
        /*
         * A double quote, a backslash, or a '#' followed by '$', '@' or '{'
         * gets a backslash in front of it.
         */
        if ((asciicompat || unicode_p) &&
          (c == '"'|| c == '\\' ||
            (c == '#' &&
             p < pend &&
             MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
             (cc = rb_enc_codepoint(p,pend,enc),
              (cc == '$' || cc == '@' || cc == '{'))))) {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat2(result, "\\");
            if (asciicompat || enc == resenc) {
                /* restart the pending run at this character so it is copied verbatim later */
                prev = p - n;
                continue;
            }
        }
        /* control characters that have a one-letter backslash escape */
        switch (c) {
          case '\n': cc = 'n'; break;
          case '\r': cc = 'r'; break;
          case '\t': cc = 't'; break;
          case '\f': cc = 'f'; break;
          case '\013': cc = 'v'; break;
          case '\010': cc = 'b'; break;
          case '\007': cc = 'a'; break;
          case 033: cc = 'e'; break;
          default: cc = 0; break;
        }
        if (cc) {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            buf[0] = '\\';
            buf[1] = (char)cc;
            str_buf_cat(result, buf, 2);
            prev = p;
            continue;
        }
        if ((enc == resenc && rb_enc_isprint(c, enc)) ||
            (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
            /* printable: leave it in the pending run */
            continue;
        }
        else {
            /* anything else is written as a numeric escape */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
            continue;
        }
    }
    /* flush the final pending run and close the quote */
    if (p > prev) str_buf_cat(result, prev, p - prev);
    str_buf_cat2(result, "\"");

    OBJ_INFECT_RAW(result, str);
    return result;
}
04888
/* True when p is inside [.., e) and points at '$', '@' or '{'. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))

/*
 * Produces a double-quoted rendering of +str+ in which special and
 * non-printable bytes are written as escapes, and returns it as a new
 * string infected by +str+ with a 7BIT code range.
 *
 * The work is done in two passes over the bytes of +str+: the first
 * computes the exact output length, the second writes into a buffer of
 * that size.  The two passes must therefore classify every byte the same
 * way.
 *
 * For encodings that are not ASCII compatible, a
 * .force_encoding("NAME") suffix is appended and the result is associated
 * with the ASCII-8BIT encoding instead of the source encoding.
 */
VALUE
rb_str_dump(VALUE str)
{
    rb_encoding *enc = rb_enc_get(str);
    long len;
    const char *p, *pend;
    char *q, *qend;
    VALUE result;
    int u8 = (enc == rb_utf8_encoding());

    /* ---- pass 1: compute the output length ---- */
    len = 2;			/* opening and closing quote */
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    while (p < pend) {
        unsigned char c = *p++;
        switch (c) {
          case '"': case '\\':
          case '\n': case '\r':
          case '\t': case '\f':
          case '\013': case '\010': case '\007': case '\033':
            len += 2;		/* backslash + one character */
            break;

          case '#':
            /* '#' needs a backslash only in front of '$', '@' or '{' */
            len += IS_EVSTR(p, pend) ? 2 : 1;
            break;

          default:
            if (ISPRINT(c)) {
                len++;
            }
            else {
                if (u8 && c > 0x7F) {	/* \u{NN} */
                    int n = rb_enc_precise_mbclen(p-1, pend, enc);
                    if (MBCLEN_CHARFOUND_P(n)) {
                        unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                        /* one per hex digit beyond the first ... */
                        while (cc >>= 4) len++;
                        /* ... plus "\u{", the first digit and "}" */
                        len += 5;
                        p += MBCLEN_CHARFOUND_LEN(n)-1;
                        break;
                    }
                }
                len += 4;		/* \xNN */
            }
            break;
        }
    }
    if (!rb_enc_asciicompat(enc)) {
        len += 19;		/* ".force_encoding(\"\")" without the name */
        len += strlen(enc->name);
    }

    /* ---- pass 2: write the escaped text ---- */
    result = rb_str_new5(str, 0, len);
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    q = RSTRING_PTR(result); qend = q + len + 1;

    *q++ = '"';
    while (p < pend) {
        unsigned char c = *p++;

        if (c == '"' || c == '\\') {
            *q++ = '\\';
            *q++ = c;
        }
        else if (c == '#') {
            if (IS_EVSTR(p, pend)) *q++ = '\\';
            *q++ = '#';
        }
        else if (c == '\n') {
            *q++ = '\\';
            *q++ = 'n';
        }
        else if (c == '\r') {
            *q++ = '\\';
            *q++ = 'r';
        }
        else if (c == '\t') {
            *q++ = '\\';
            *q++ = 't';
        }
        else if (c == '\f') {
            *q++ = '\\';
            *q++ = 'f';
        }
        else if (c == '\013') {
            *q++ = '\\';
            *q++ = 'v';
        }
        else if (c == '\010') {
            *q++ = '\\';
            *q++ = 'b';
        }
        else if (c == '\007') {
            *q++ = '\\';
            *q++ = 'a';
        }
        else if (c == '\033') {
            *q++ = '\\';
            *q++ = 'e';
        }
        else if (ISPRINT(c)) {
            *q++ = c;
        }
        else {
            *q++ = '\\';
            if (u8) {
                /*
                 * n is the character length minus one, so the CHARFOUND
                 * test below only succeeds for characters of two or more
                 * bytes; single bytes and invalid sequences fall through
                 * to the \xNN form.
                 */
                int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
                if (MBCLEN_CHARFOUND_P(n)) {
                    int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                    p += n;
                    snprintf(q, qend-q, "u{%x}", cc);
                    q += strlen(q);
                    continue;
                }
            }
            snprintf(q, qend-q, "x%02X", c);
            q += 3;		/* 'x' plus two hex digits */
        }
    }
    *q++ = '"';
    *q = '\0';
    if (!rb_enc_asciicompat(enc)) {
        snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
        enc = rb_ascii8bit_encoding();
    }
    OBJ_INFECT_RAW(result, str);
    /* result from dump is ASCII */
    rb_enc_associate(result, enc);
    ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
    return result;
}
05031
05032
05033 static void
05034 rb_str_check_dummy_enc(rb_encoding *enc)
05035 {
05036 if (rb_enc_dummy_p(enc)) {
05037 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
05038 rb_enc_name(enc));
05039 }
05040 }
05041
05042
05043
05044
05045
05046
05047
05048
05049
05050
05051 static VALUE
05052 rb_str_upcase_bang(VALUE str)
05053 {
05054 rb_encoding *enc;
05055 char *s, *send;
05056 int modify = 0;
05057 int n;
05058
05059 str_modify_keep_cr(str);
05060 enc = STR_ENC_GET(str);
05061 rb_str_check_dummy_enc(enc);
05062 s = RSTRING_PTR(str); send = RSTRING_END(str);
05063 if (single_byte_optimizable(str)) {
05064 while (s < send) {
05065 unsigned int c = *(unsigned char*)s;
05066
05067 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
05068 *s = 'A' + (c - 'a');
05069 modify = 1;
05070 }
05071 s++;
05072 }
05073 }
05074 else {
05075 int ascompat = rb_enc_asciicompat(enc);
05076
05077 while (s < send) {
05078 unsigned int c;
05079
05080 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05081 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
05082 *s = 'A' + (c - 'a');
05083 modify = 1;
05084 }
05085 s++;
05086 }
05087 else {
05088 c = rb_enc_codepoint_len(s, send, &n, enc);
05089 if (rb_enc_islower(c, enc)) {
05090
05091 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
05092 modify = 1;
05093 }
05094 s += n;
05095 }
05096 }
05097 }
05098
05099 if (modify) return str;
05100 return Qnil;
05101 }
05102
05103
05104
05105
05106
05107
05108
05109
05110
05111
05112
05113
05114
05115
05116 static VALUE
05117 rb_str_upcase(VALUE str)
05118 {
05119 str = rb_str_dup(str);
05120 rb_str_upcase_bang(str);
05121 return str;
05122 }
05123
05124
05125
05126
05127
05128
05129
05130
05131
05132
05133
05134 static VALUE
05135 rb_str_downcase_bang(VALUE str)
05136 {
05137 rb_encoding *enc;
05138 char *s, *send;
05139 int modify = 0;
05140
05141 str_modify_keep_cr(str);
05142 enc = STR_ENC_GET(str);
05143 rb_str_check_dummy_enc(enc);
05144 s = RSTRING_PTR(str); send = RSTRING_END(str);
05145 if (single_byte_optimizable(str)) {
05146 while (s < send) {
05147 unsigned int c = *(unsigned char*)s;
05148
05149 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
05150 *s = 'a' + (c - 'A');
05151 modify = 1;
05152 }
05153 s++;
05154 }
05155 }
05156 else {
05157 int ascompat = rb_enc_asciicompat(enc);
05158
05159 while (s < send) {
05160 unsigned int c;
05161 int n;
05162
05163 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05164 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
05165 *s = 'a' + (c - 'A');
05166 modify = 1;
05167 }
05168 s++;
05169 }
05170 else {
05171 c = rb_enc_codepoint_len(s, send, &n, enc);
05172 if (rb_enc_isupper(c, enc)) {
05173
05174 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
05175 modify = 1;
05176 }
05177 s += n;
05178 }
05179 }
05180 }
05181
05182 if (modify) return str;
05183 return Qnil;
05184 }
05185
05186
05187
05188
05189
05190
05191
05192
05193
05194
05195
05196
05197
05198
05199 static VALUE
05200 rb_str_downcase(VALUE str)
05201 {
05202 str = rb_str_dup(str);
05203 rb_str_downcase_bang(str);
05204 return str;
05205 }
05206
05207
05208
05209
05210
05211
05212
05213
05214
05215
05216
05217
05218
05219
05220
05221
05222 static VALUE
05223 rb_str_capitalize_bang(VALUE str)
05224 {
05225 rb_encoding *enc;
05226 char *s, *send;
05227 int modify = 0;
05228 unsigned int c;
05229 int n;
05230
05231 str_modify_keep_cr(str);
05232 enc = STR_ENC_GET(str);
05233 rb_str_check_dummy_enc(enc);
05234 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05235 s = RSTRING_PTR(str); send = RSTRING_END(str);
05236
05237 c = rb_enc_codepoint_len(s, send, &n, enc);
05238 if (rb_enc_islower(c, enc)) {
05239 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
05240 modify = 1;
05241 }
05242 s += n;
05243 while (s < send) {
05244 c = rb_enc_codepoint_len(s, send, &n, enc);
05245 if (rb_enc_isupper(c, enc)) {
05246 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
05247 modify = 1;
05248 }
05249 s += n;
05250 }
05251
05252 if (modify) return str;
05253 return Qnil;
05254 }
05255
05256
05257
05258
05259
05260
05261
05262
05263
05264
05265
05266
05267
05268
05269
05270 static VALUE
05271 rb_str_capitalize(VALUE str)
05272 {
05273 str = rb_str_dup(str);
05274 rb_str_capitalize_bang(str);
05275 return str;
05276 }
05277
05278
05279
05280
05281
05282
05283
05284
05285
05286
05287
05288 static VALUE
05289 rb_str_swapcase_bang(VALUE str)
05290 {
05291 rb_encoding *enc;
05292 char *s, *send;
05293 int modify = 0;
05294 int n;
05295
05296 str_modify_keep_cr(str);
05297 enc = STR_ENC_GET(str);
05298 rb_str_check_dummy_enc(enc);
05299 s = RSTRING_PTR(str); send = RSTRING_END(str);
05300 while (s < send) {
05301 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
05302
05303 if (rb_enc_isupper(c, enc)) {
05304
05305 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
05306 modify = 1;
05307 }
05308 else if (rb_enc_islower(c, enc)) {
05309
05310 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
05311 modify = 1;
05312 }
05313 s += n;
05314 }
05315
05316 if (modify) return str;
05317 return Qnil;
05318 }
05319
05320
05321
05322
05323
05324
05325
05326
05327
05328
05329
05330
05331
05332
05333 static VALUE
05334 rb_str_swapcase(VALUE str)
05335 {
05336 str = rb_str_dup(str);
05337 rb_str_swapcase_bang(str);
05338 return str;
05339 }
05340
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/*
 * Cursor over a transliteration specification string, advanced by trnext().
 */
struct tr {
    int gen;			/* non-zero while a range "a-z" is being expanded */
    unsigned int now, max;	/* current code point; upper bound of the range being expanded */
    char *p, *pend;		/* read position in the specification and its end */
};
05348
/*
 * Returns the next code point described by the transliteration
 * specification behind +t+, or (unsigned int)-1 once the specification is
 * exhausted.
 *
 * Outside a range (t->gen == 0) one character is read; a backslash that is
 * not the last character is skipped so that the following character is
 * taken literally.  If the character is followed by '-' and at least one
 * more character, a range is opened: t->max is set to the range end and
 * t->gen to 1.  A range whose start is greater than its end raises
 * ArgumentError.
 *
 * Inside a range (t->gen != 0) successive calls step t->now up to t->max,
 * skipping code points for which ONIGENC_CODE_TO_MBCLEN is not positive.
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
        if (!t->gen) {
nextpart:
            if (t->p == t->pend) return -1;
            /* a backslash escapes the next character, unless it is the last one */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            /* a '-' that is not the last character opens a range */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        /* the range endpoints are only echoed when both are ASCII */
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached, assuming rb_raise does not return */
                    }
                    t->gen = 1;
                    t->max = c;
                }
            }
            return t->now;
        }
        else {
            /* advance within the range, skipping code points without a valid length */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* reached the end of the range: leave range mode */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
05402
static VALUE rb_str_delete_bang(int,VALUE*,VALUE);

/*
 * In-place transliteration of +str+.
 *
 * src   - specification of the characters to translate; a leading '^'
 *         (when followed by at least one more character) selects the
 *         complement.
 * repl  - specification of the replacement characters.  An empty +repl+
 *         turns the call into rb_str_delete_bang(src).
 * sflag - when non-zero, a translated character equal to the translated
 *         character emitted immediately before it is dropped.
 *
 * Returns +str+ when it was modified and Qnil otherwise (including for an
 * empty +str+).
 *
 * Mappings for code points below 256 live in the trans[] array; larger
 * code points are kept in a lazily created hash.  The value errc
 * ((unsigned int)-1) means "no translation" in both.
 */
static VALUE
tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
{
    const unsigned int errc = -1;
    unsigned int trans[256];
    rb_encoding *enc, *e1, *e2;
    struct tr trsrc, trrepl;
    int cflag = 0;
    unsigned int c, c0, last = 0;
    int modify = 0, i, l;
    char *s, *send;
    VALUE hash = 0;
    int singlebyte = single_byte_optimizable(str);
    int cr;

/* Promote the tracked code range from 7BIT to VALID when c is not ASCII. */
#define CHECK_IF_ASCII(c) \
    (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
           (cr = ENC_CODERANGE_VALID) : 0)

    StringValue(src);
    StringValue(repl);
    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
    if (RSTRING_LEN(repl) == 0) {
        return rb_str_delete_bang(1, &src, str);
    }

    cr = ENC_CODERANGE(str);
    e1 = rb_enc_check(str, src);
    e2 = rb_enc_check(str, repl);
    if (e1 == e2) {
        enc = e1;
    }
    else {
        enc = rb_enc_check(src, repl);
    }
    trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
    /* a leading '^' followed by something else selects the complement of src */
    if (RSTRING_LEN(src) > 1 &&
        rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
        trsrc.p + l < trsrc.pend) {
        cflag = 1;
        trsrc.p += l;
    }
    trrepl.p = RSTRING_PTR(repl);
    trrepl.pend = trrepl.p + RSTRING_LEN(repl);
    trsrc.gen = trrepl.gen = 0;
    trsrc.now = trrepl.now = 0;
    trsrc.max = trrepl.max = 0;

    if (cflag) {
        /*
         * Complement mode: start with every slot marked for translation,
         * exempt the characters listed in src, then map everything that is
         * still marked to the last character of repl.
         */
        for (i=0; i<256; i++) {
            trans[i] = 1;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            if (c < 256) {
                trans[c] = errc;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), Qtrue);
            }
        }
        /* drain repl so that trrepl.now ends up at its last character */
        while ((c = trnext(&trrepl, enc)) != errc)
            /* retrieve last replacer */;
        last = trrepl.now;
        for (i=0; i<256; i++) {
            if (trans[i] != errc) {
                trans[i] = last;
            }
        }
    }
    else {
        unsigned int r;

        /*
         * Normal mode: pair each src character with the next repl
         * character; once repl is exhausted its last character is reused.
         */
        for (i=0; i<256; i++) {
            trans[i] = errc;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            r = trnext(&trrepl, enc);
            if (r == errc) r = trrepl.now;
            if (c < 256) {
                trans[c] = r;
                /* a multibyte replacement rules out the byte-wise fast path */
                if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
            }
        }
    }

    /* restart code range tracking at 7BIT; CHECK_IF_ASCII raises it again as needed */
    if (cr == ENC_CODERANGE_VALID)
        cr = ENC_CODERANGE_7BIT;
    str_modify_keep_cr(str);
    s = RSTRING_PTR(str); send = RSTRING_END(str);
    if (sflag) {
        /* squeezing variant: build the result in a freshly allocated buffer */
        int clen, tlen;
        long offset, max = RSTRING_LEN(str);
        unsigned int save = -1;	/* last translated character emitted, or -1 */
        char *buf = ALLOC_N(char, max), *t = buf;

        while (s < send) {
            int may_modify = 0;

            c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            s += clen;
            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = errc;
            }
            if (c != (unsigned int)-1) {
                /* translated: drop it when it repeats the previous translated character */
                if (save == c) {
                    CHECK_IF_ASCII(c);
                    continue;
                }
                save = c;
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* untranslated: copy the original character through */
                save = -1;
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* grow the buffer until the character plus a terminator fits */
            while (t - buf + tlen >= max) {
                offset = t - buf;
                max *= 2;
                REALLOC_N(buf, char, max);
                t = buf + offset;
            }
            rb_enc_mbcput(c, t, enc);
            /*
             * NOTE(review): s was already advanced past the current
             * character above, so this compares the output against the
             * bytes that follow it in the input -- confirm this is intended.
             */
            if (may_modify && memcmp(s, t, tlen) != 0) {
                modify = 1;
            }
            CHECK_IF_ASCII(c);
            t += tlen;
        }
        /* release the old heap buffer (if any) and install the new one */
        if (!STR_EMBED_P(str)) {
            ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        *t = '\0';
        RSTRING(str)->as.heap.ptr = buf;
        RSTRING(str)->as.heap.len = t - buf;
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }
    else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
        /* byte-wise fast path: translate in place, one byte per character */
        while (s < send) {
            c = (unsigned char)*s;
            if (trans[c] != errc) {
                if (!cflag) {
                    c = trans[c];
                    *s = c;
                    modify = 1;
                }
                else {
                    *s = last;
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s++;
        }
    }
    else {
        /* general multibyte path: build the result in a freshly allocated buffer */
        int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
        long offset;
        char *buf = ALLOC_N(char, max), *t = buf;

        while (s < send) {
            int may_modify = 0;
            c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = cflag ? last : errc;
            }
            if (c != errc) {
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* untranslated: copy the original character through */
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* grow the buffer until the character plus a terminator fits */
            while (t - buf + tlen >= max) {
                offset = t - buf;
                max *= 2;
                REALLOC_N(buf, char, max);
                t = buf + offset;
            }
            if (s != t) {
                rb_enc_mbcput(c, t, enc);
                if (may_modify && memcmp(s, t, tlen) != 0) {
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s += clen;
            t += tlen;
        }
        /* release the old heap buffer (if any) and install the new one */
        if (!STR_EMBED_P(str)) {
            ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        *t = '\0';
        RSTRING(str)->as.heap.ptr = buf;
        RSTRING(str)->as.heap.len = t - buf;
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }

    if (modify) {
        /* a BROKEN code range is never written back */
        if (cr != ENC_CODERANGE_BROKEN)
            ENC_CODERANGE_SET(str, cr);
        rb_enc_associate(str, enc);
        return str;
    }
    return Qnil;
}
05648
05649
05650
05651
05652
05653
05654
05655
05656
05657
05658
05659 static VALUE
05660 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05661 {
05662 return tr_trans(str, src, repl, 0);
05663 }
05664
05665
05666
05667
05668
05669
05670
05671
05672
05673
05674
05675
05676
05677
05678
05679
05680
05681
05682
05683
05684
05685
05686
05687
05688
05689
05690
05691
05692
05693
05694
05695
05696
05697
05698
05699
05700
05701 static VALUE
05702 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05703 {
05704 str = rb_str_dup(str);
05705 tr_trans(str, src, repl, 0);
05706 return str;
05707 }
05708
05709 #define TR_TABLE_SIZE 257
05710 static void
05711 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05712 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05713 {
05714 const unsigned int errc = -1;
05715 char buf[256];
05716 struct tr tr;
05717 unsigned int c;
05718 VALUE table = 0, ptable = 0;
05719 int i, l, cflag = 0;
05720
05721 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05722 tr.gen = tr.now = tr.max = 0;
05723
05724 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05725 cflag = 1;
05726 tr.p += l;
05727 }
05728 if (first) {
05729 for (i=0; i<256; i++) {
05730 stable[i] = 1;
05731 }
05732 stable[256] = cflag;
05733 }
05734 else if (stable[256] && !cflag) {
05735 stable[256] = 0;
05736 }
05737 for (i=0; i<256; i++) {
05738 buf[i] = cflag;
05739 }
05740
05741 while ((c = trnext(&tr, enc)) != errc) {
05742 if (c < 256) {
05743 buf[c & 0xff] = !cflag;
05744 }
05745 else {
05746 VALUE key = UINT2NUM(c);
05747
05748 if (!table && (first || *tablep || stable[256])) {
05749 if (cflag) {
05750 ptable = *ctablep;
05751 table = ptable ? ptable : rb_hash_new();
05752 *ctablep = table;
05753 }
05754 else {
05755 table = rb_hash_new();
05756 ptable = *tablep;
05757 *tablep = table;
05758 }
05759 }
05760 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
05761 rb_hash_aset(table, key, Qtrue);
05762 }
05763 }
05764 }
05765 for (i=0; i<256; i++) {
05766 stable[i] = stable[i] && buf[i];
05767 }
05768 if (!table && !cflag) {
05769 *tablep = 0;
05770 }
05771 }
05772
05773
05774 static int
05775 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05776 {
05777 if (c < 256) {
05778 return table[c] != 0;
05779 }
05780 else {
05781 VALUE v = UINT2NUM(c);
05782
05783 if (del) {
05784 if (!NIL_P(rb_hash_lookup(del, v)) &&
05785 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05786 return TRUE;
05787 }
05788 }
05789 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05790 return FALSE;
05791 }
05792 return table[256] ? TRUE : FALSE;
05793 }
05794 }
05795
05796
05797
05798
05799
05800
05801
05802
05803
05804 static VALUE
05805 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05806 {
05807 char squeez[TR_TABLE_SIZE];
05808 rb_encoding *enc = 0;
05809 char *s, *send, *t;
05810 VALUE del = 0, nodel = 0;
05811 int modify = 0;
05812 int i, ascompat, cr;
05813
05814 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05815 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
05816 for (i=0; i<argc; i++) {
05817 VALUE s = argv[i];
05818
05819 StringValue(s);
05820 enc = rb_enc_check(str, s);
05821 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05822 }
05823
05824 str_modify_keep_cr(str);
05825 ascompat = rb_enc_asciicompat(enc);
05826 s = t = RSTRING_PTR(str);
05827 send = RSTRING_END(str);
05828 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05829 while (s < send) {
05830 unsigned int c;
05831 int clen;
05832
05833 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05834 if (squeez[c]) {
05835 modify = 1;
05836 }
05837 else {
05838 if (t != s) *t = c;
05839 t++;
05840 }
05841 s++;
05842 }
05843 else {
05844 c = rb_enc_codepoint_len(s, send, &clen, enc);
05845
05846 if (tr_find(c, squeez, del, nodel)) {
05847 modify = 1;
05848 }
05849 else {
05850 if (t != s) rb_enc_mbcput(c, t, enc);
05851 t += clen;
05852 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05853 }
05854 s += clen;
05855 }
05856 }
05857 *t = '\0';
05858 STR_SET_LEN(str, t - RSTRING_PTR(str));
05859 ENC_CODERANGE_SET(str, cr);
05860
05861 if (modify) return str;
05862 return Qnil;
05863 }
05864
05865
05866
05867
05868
05869
05870
05871
05872
05873
05874
05875
05876
05877
05878
05879
05880 static VALUE
05881 rb_str_delete(int argc, VALUE *argv, VALUE str)
05882 {
05883 str = rb_str_dup(str);
05884 rb_str_delete_bang(argc, argv, str);
05885 return str;
05886 }
05887
05888
05889
05890
05891
05892
05893
05894
05895
05896
05897 static VALUE
05898 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05899 {
05900 char squeez[TR_TABLE_SIZE];
05901 rb_encoding *enc = 0;
05902 VALUE del = 0, nodel = 0;
05903 char *s, *send, *t;
05904 int i, modify = 0;
05905 int ascompat, singlebyte = single_byte_optimizable(str);
05906 unsigned int save;
05907
05908 if (argc == 0) {
05909 enc = STR_ENC_GET(str);
05910 }
05911 else {
05912 for (i=0; i<argc; i++) {
05913 VALUE s = argv[i];
05914
05915 StringValue(s);
05916 enc = rb_enc_check(str, s);
05917 if (singlebyte && !single_byte_optimizable(s))
05918 singlebyte = 0;
05919 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05920 }
05921 }
05922
05923 str_modify_keep_cr(str);
05924 s = t = RSTRING_PTR(str);
05925 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05926 send = RSTRING_END(str);
05927 save = -1;
05928 ascompat = rb_enc_asciicompat(enc);
05929
05930 if (singlebyte) {
05931 while (s < send) {
05932 unsigned int c = *(unsigned char*)s++;
05933 if (c != save || (argc > 0 && !squeez[c])) {
05934 *t++ = save = c;
05935 }
05936 }
05937 } else {
05938 while (s < send) {
05939 unsigned int c;
05940 int clen;
05941
05942 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05943 if (c != save || (argc > 0 && !squeez[c])) {
05944 *t++ = save = c;
05945 }
05946 s++;
05947 }
05948 else {
05949 c = rb_enc_codepoint_len(s, send, &clen, enc);
05950
05951 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05952 if (t != s) rb_enc_mbcput(c, t, enc);
05953 save = c;
05954 t += clen;
05955 }
05956 s += clen;
05957 }
05958 }
05959 }
05960
05961 *t = '\0';
05962 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05963 STR_SET_LEN(str, t - RSTRING_PTR(str));
05964 modify = 1;
05965 }
05966
05967 if (modify) return str;
05968 return Qnil;
05969 }
05970
05971
05972
05973
05974
05975
05976
05977
05978
05979
05980
05981
05982
05983
05984
05985
05986
05987 static VALUE
05988 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05989 {
05990 str = rb_str_dup(str);
05991 rb_str_squeeze_bang(argc, argv, str);
05992 return str;
05993 }
05994
05995
05996
05997
05998
05999
06000
06001
06002
06003
06004 static VALUE
06005 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
06006 {
06007 return tr_trans(str, src, repl, 1);
06008 }
06009
06010
06011
06012
06013
06014
06015
06016
06017
06018
06019
06020
06021
06022
06023
06024 static VALUE
06025 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
06026 {
06027 str = rb_str_dup(str);
06028 tr_trans(str, src, repl, 1);
06029 return str;
06030 }
06031
06032
06033
06034
06035
06036
06037
06038
06039
06040
06041
06042
06043
06044
06045
06046
06047
06048
06049
06050
06051
06052
06053
06054
06055
06056
06057
06058
06059
06060 static VALUE
06061 rb_str_count(int argc, VALUE *argv, VALUE str)
06062 {
06063 char table[TR_TABLE_SIZE];
06064 rb_encoding *enc = 0;
06065 VALUE del = 0, nodel = 0, tstr;
06066 char *s, *send;
06067 int i;
06068 int ascompat;
06069
06070 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
06071
06072 tstr = argv[0];
06073 StringValue(tstr);
06074 enc = rb_enc_check(str, tstr);
06075 if (argc == 1) {
06076 const char *ptstr;
06077 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
06078 (ptstr = RSTRING_PTR(tstr),
06079 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
06080 !is_broken_string(str)) {
06081 int n = 0;
06082 int clen;
06083 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
06084
06085 s = RSTRING_PTR(str);
06086 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
06087 send = RSTRING_END(str);
06088 while (s < send) {
06089 if (*(unsigned char*)s++ == c) n++;
06090 }
06091 return INT2NUM(n);
06092 }
06093 }
06094
06095 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
06096 for (i=1; i<argc; i++) {
06097 tstr = argv[i];
06098 StringValue(tstr);
06099 enc = rb_enc_check(str, tstr);
06100 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
06101 }
06102
06103 s = RSTRING_PTR(str);
06104 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
06105 send = RSTRING_END(str);
06106 ascompat = rb_enc_asciicompat(enc);
06107 i = 0;
06108 while (s < send) {
06109 unsigned int c;
06110
06111 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
06112 if (table[c]) {
06113 i++;
06114 }
06115 s++;
06116 }
06117 else {
06118 int clen;
06119 c = rb_enc_codepoint_len(s, send, &clen, enc);
06120 if (tr_find(c, table, del, nodel)) {
06121 i++;
06122 }
06123 s += clen;
06124 }
06125 }
06126
06127 return INT2NUM(i);
06128 }
06129
/*
 * Byte-indexed whitespace table: the entry is 1 for 0x09-0x0d
 * (\t \n \v \f \r) and for 0x20 (space), and 0 for every other byte.
 * Each row below covers 32 consecutive byte values.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0, 0,1,1,1,1,1,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x20 */ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xa0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xc0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xe0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0
};

/* Table lookup; the cast keeps negative char values from indexing backwards. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
06150
06151
06152
06153
06154
06155
06156
06157
06158
06159
06160
06161
06162
06163
06164
06165
06166
06167
06168
06169
06170
06171
06172
06173
06174
06175
06176
06177
06178
06179
06180
06181
06182
06183
06184
06185
06186
06187
06188
06189
06190
06191
06192
06193
06194
06195
06196
06197
/*
 * Splits str into an array of substrings.
 *
 * Takes up to two optional arguments (pattern, limit).  Three strategies
 * are used:
 *   awk    - split on runs of whitespace, ignoring leading whitespace;
 *            chosen when no pattern is available (argument and rb_fs both
 *            nil) or when the pattern is the single-space string;
 *   string - split on literal occurrences of a non-empty string pattern;
 *   regexp - split on regexp matches (an empty string pattern is compiled
 *            to a regexp); capture groups are added to the result.
 *
 * A limit of 1 returns the whole string as the only element (or an empty
 * array for an empty receiver).  When no positive limit is in effect and
 * the limit was 0 or omitted, trailing empty strings are removed from the
 * result.
 */
06198 static VALUE
06199 rb_str_split_m(int argc, VALUE *argv, VALUE str)
06200 {
06201 rb_encoding *enc;
06202 VALUE spat;
06203 VALUE limit;
06204 enum {awk, string, regexp} split_type;
06205 long beg, end, i = 0;
06206 int lim = 0;
06207 VALUE result, tmp;
06208
/* Both arguments supplied: decode the limit.  A non-positive limit clears
 * `limit` to nil but leaves its sign in `lim`, which is consulted again at
 * the end of the function. */
06209 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
06210 lim = NUM2INT(limit);
06211 if (lim <= 0) limit = Qnil;
06212 else if (lim == 1) {
06213 if (RSTRING_LEN(str) == 0)
06214 return rb_ary_new2(0);
06215 return rb_ary_new3(1, str);
06216 }
06217 i = 1;
06218 }
06219
/* Choose the split strategy from the pattern. */
06220 enc = STR_ENC_GET(str);
06221 if (NIL_P(spat) && NIL_P(spat = rb_fs)) {
06222 split_type = awk;
06223 }
06224 else {
06225 if (RB_TYPE_P(spat, T_STRING)) {
06226 rb_encoding *enc2 = STR_ENC_GET(spat);
06227
06228 split_type = string;
06229 if (RSTRING_LEN(spat) == 0) {
06230
/* Empty string pattern: handled by the regexp branch below. */
06231 spat = rb_reg_regcomp(spat);
06232 split_type = regexp;
06233 }
06234 else if (rb_enc_asciicompat(enc2) == 1) {
06235 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
06236 split_type = awk;
06237 }
06238 }
06239 else {
/* Non-ASCII-compatible encoding: the pattern must be exactly one
 * character that decodes to ' '. */
06240 int l;
06241 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
06242 RSTRING_LEN(spat) == l) {
06243 split_type = awk;
06244 }
06245 }
06246 }
06247 else {
06248 spat = get_pat(spat, 1);
06249 split_type = regexp;
06250 }
06251 }
06252
06253 result = rb_ary_new();
06254 beg = 0;
06255 if (split_type == awk) {
06256 char *ptr = RSTRING_PTR(str);
06257 char *eptr = RSTRING_END(str);
06258 char *bptr = ptr;
/* skip is 1 while consuming whitespace between fields. */
06259 int skip = 1;
06260 unsigned int c;
06261
06262 end = beg;
/* ASCII-only receiver: scan byte by byte with the lookup table. */
06263 if (is_ascii_string(str)) {
06264 while (ptr < eptr) {
06265 c = (unsigned char)*ptr++;
06266 if (skip) {
06267 if (ascii_isspace(c)) {
06268 beg = ptr - bptr;
06269 }
06270 else {
06271 end = ptr - bptr;
06272 skip = 0;
/* Limit reached: stop here so the rest becomes the final element. */
06273 if (!NIL_P(limit) && lim <= i) break;
06274 }
06275 }
06276 else if (ascii_isspace(c)) {
06277 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06278 skip = 1;
06279 beg = ptr - bptr;
06280 if (!NIL_P(limit)) ++i;
06281 }
06282 else {
06283 end = ptr - bptr;
06284 }
06285 }
06286 }
06287 else {
/* Same state machine as above, but decoding one character at a time. */
06288 while (ptr < eptr) {
06289 int n;
06290
06291 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
06292 ptr += n;
06293 if (skip) {
06294 if (rb_isspace(c)) {
06295 beg = ptr - bptr;
06296 }
06297 else {
06298 end = ptr - bptr;
06299 skip = 0;
06300 if (!NIL_P(limit) && lim <= i) break;
06301 }
06302 }
06303 else if (rb_isspace(c)) {
06304 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06305 skip = 1;
06306 beg = ptr - bptr;
06307 if (!NIL_P(limit)) ++i;
06308 }
06309 else {
06310 end = ptr - bptr;
06311 }
06312 }
06313 }
06314 }
06315 else if (split_type == string) {
06316 char *ptr = RSTRING_PTR(str);
06317 char *temp = ptr;
06318 char *eptr = RSTRING_END(str);
06319 char *sptr = RSTRING_PTR(spat);
06320 long slen = RSTRING_LEN(spat);
06321
/* Literal search refuses broken byte sequences on either side. */
06322 if (is_broken_string(str)) {
06323 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
06324 }
06325 if (is_broken_string(spat)) {
06326 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
06327 }
06328 enc = rb_enc_check(str, spat);
06329 while (ptr < eptr &&
06330 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
06331
/* A hit that does not start on a character head is skipped:
 * resume searching from the next character head. */
06332 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
06333 if (t != ptr + end) {
06334 ptr = t;
06335 continue;
06336 }
06337 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
06338 ptr += end + slen;
06339 if (!NIL_P(limit) && lim <= ++i) break;
06340 }
06341 beg = ptr - temp;
06342 }
06343 else {
06344 char *ptr = RSTRING_PTR(str);
06345 long len = RSTRING_LEN(str);
06346 long start = beg;
06347 long idx;
/* last_null records that the previous search produced an empty match at
 * the search start, so the next one emits a single character. */
06348 int last_null = 0;
06349 struct re_registers *regs;
06350
06351 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
06352 regs = RMATCH_REGS(rb_backref_get());
06353 if (start == end && BEG(0) == END(0)) {
06354 if (!ptr) {
06355 rb_ary_push(result, str_new_empty(str));
06356 break;
06357 }
06358 else if (last_null == 1) {
06359 rb_ary_push(result, rb_str_subseq(str, beg,
06360 rb_enc_fast_mbclen(ptr+beg,
06361 ptr+len,
06362 enc)));
06363 beg = start;
06364 }
06365 else {
/* First empty match here: advance one character (or one byte at the
 * end of the string) and search again. */
06366 if (ptr+start == ptr+len)
06367 start++;
06368 else
06369 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
06370 last_null = 1;
06371 continue;
06372 }
06373 }
06374 else {
06375 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06376 beg = start = END(0);
06377 }
06378 last_null = 0;
06379
/* Append every capture group that participated in the match. */
06380 for (idx=1; idx < regs->num_regs; idx++) {
06381 if (BEG(idx) == -1) continue;
06382 if (BEG(idx) == END(idx))
06383 tmp = str_new_empty(str);
06384 else
06385 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
06386 rb_ary_push(result, tmp);
06387 }
06388 if (!NIL_P(limit) && lim <= ++i) break;
06389 }
06390 }
/* Push whatever follows the last separator; an empty tail is pushed only
 * when a limit is active or the limit was negative. */
06391 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
06392 if (RSTRING_LEN(str) == beg)
06393 tmp = str_new_empty(str);
06394 else
06395 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
06396 rb_ary_push(result, tmp);
06397 }
/* Limit omitted or zero: drop trailing empty strings. */
06398 if (NIL_P(limit) && lim == 0) {
06399 long len;
06400 while ((len = RARRAY_LEN(result)) > 0 &&
06401 (tmp = RARRAY_AREF(result, len-1), RSTRING_LEN(tmp) == 0))
06402 rb_ary_pop(result);
06403 }
06404
06405 return result;
06406 }
06407
06408 VALUE
06409 rb_str_split(VALUE str, const char *sep0)
06410 {
06411 VALUE sep;
06412
06413 StringValue(str);
06414 sep = rb_str_new2(sep0);
06415 return rb_str_split_m(1, &sep, str);
06416 }
06417
06418
06419 static VALUE
06420 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
06421 {
06422 rb_encoding *enc;
06423 VALUE line, rs, orig = str;
06424 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
06425 long pos, len, rslen;
06426 int paragraph_mode = 0;
06427
06428 VALUE UNINITIALIZED_VAR(ary);
06429
06430 if (argc == 0)
06431 rs = rb_rs;
06432 else
06433 rb_scan_args(argc, argv, "01", &rs);
06434
06435 if (rb_block_given_p()) {
06436 if (wantarray) {
06437 #if STRING_ENUMERATORS_WANTARRAY
06438 rb_warn("given block not used");
06439 ary = rb_ary_new();
06440 #else
06441 rb_warning("passing a block to String#lines is deprecated");
06442 wantarray = 0;
06443 #endif
06444 }
06445 }
06446 else {
06447 if (wantarray)
06448 ary = rb_ary_new();
06449 else
06450 RETURN_ENUMERATOR(str, argc, argv);
06451 }
06452
06453 if (NIL_P(rs)) {
06454 if (wantarray) {
06455 rb_ary_push(ary, str);
06456 return ary;
06457 }
06458 else {
06459 rb_yield(str);
06460 return orig;
06461 }
06462 }
06463
06464 str = rb_str_new4(str);
06465 ptr = subptr = RSTRING_PTR(str);
06466 pend = RSTRING_END(str);
06467 len = RSTRING_LEN(str);
06468 StringValue(rs);
06469 rslen = RSTRING_LEN(rs);
06470
06471 if (rs == rb_default_rs)
06472 enc = rb_enc_get(str);
06473 else
06474 enc = rb_enc_check(str, rs);
06475
06476 if (rslen == 0) {
06477 rsptr = "\n\n";
06478 rslen = 2;
06479 paragraph_mode = 1;
06480 }
06481 else {
06482 rsptr = RSTRING_PTR(rs);
06483 }
06484
06485 if ((rs == rb_default_rs || paragraph_mode) && !rb_enc_asciicompat(enc)) {
06486 rs = rb_str_new(rsptr, rslen);
06487 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
06488 rsptr = RSTRING_PTR(rs);
06489 rslen = RSTRING_LEN(rs);
06490 }
06491
06492 while (subptr < pend) {
06493 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
06494 if (pos < 0) break;
06495 hit = subptr + pos;
06496 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
06497 if (hit != adjusted) {
06498 subptr = adjusted;
06499 continue;
06500 }
06501 subend = hit + rslen;
06502 if (paragraph_mode) {
06503 while (subend < pend && rb_enc_is_newline(subend, pend, enc)) {
06504 subend += rb_enc_mbclen(subend, pend, enc);
06505 }
06506 }
06507 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
06508 if (wantarray) {
06509 rb_ary_push(ary, line);
06510 }
06511 else {
06512 rb_yield(line);
06513 str_mod_check(str, ptr, len);
06514 }
06515 subptr = subend;
06516 }
06517
06518 if (subptr != pend) {
06519 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
06520 if (wantarray)
06521 rb_ary_push(ary, line);
06522 else
06523 rb_yield(line);
06524 RB_GC_GUARD(str);
06525 }
06526
06527 if (wantarray)
06528 return ary;
06529 else
06530 return orig;
06531 }
06532
06533
06534
06535
06536
06537
06538
06539
06540
06541
06542
06543
06544
06545
06546
06547
06548
06549
06550
06551
06552
06553
06554
06555
06556
06557
06558
06559
06560
06561
06562
06563
06564
06565
06566
06567
06568 static VALUE
06569 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06570 {
06571 return rb_str_enumerate_lines(argc, argv, str, 0);
06572 }
06573
06574
06575
06576
06577
06578
06579
06580
06581
06582
06583
06584
06585
06586 static VALUE
06587 rb_str_lines(int argc, VALUE *argv, VALUE str)
06588 {
06589 return rb_str_enumerate_lines(argc, argv, str, 1);
06590 }
06591
06592 static VALUE
06593 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
06594 {
06595 return LONG2FIX(RSTRING_LEN(str));
06596 }
06597
06598 static VALUE
06599 rb_str_enumerate_bytes(VALUE str, int wantarray)
06600 {
06601 long i;
06602 VALUE UNINITIALIZED_VAR(ary);
06603
06604 if (rb_block_given_p()) {
06605 if (wantarray) {
06606 #if STRING_ENUMERATORS_WANTARRAY
06607 rb_warn("given block not used");
06608 ary = rb_ary_new();
06609 #else
06610 rb_warning("passing a block to String#bytes is deprecated");
06611 wantarray = 0;
06612 #endif
06613 }
06614 }
06615 else {
06616 if (wantarray)
06617 ary = rb_ary_new2(RSTRING_LEN(str));
06618 else
06619 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
06620 }
06621
06622 for (i=0; i<RSTRING_LEN(str); i++) {
06623 if (wantarray)
06624 rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06625 else
06626 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06627 }
06628 if (wantarray)
06629 return ary;
06630 else
06631 return str;
06632 }
06633
06634
06635
06636
06637
06638
06639
06640
06641
06642
06643
06644
06645
06646
06647
06648
06649 static VALUE
06650 rb_str_each_byte(VALUE str)
06651 {
06652 return rb_str_enumerate_bytes(str, 0);
06653 }
06654
06655
06656
06657
06658
06659
06660
06661
06662
06663
06664
06665
06666 static VALUE
06667 rb_str_bytes(VALUE str)
06668 {
06669 return rb_str_enumerate_bytes(str, 1);
06670 }
06671
06672 static VALUE
06673 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
06674 {
06675 return rb_str_length(str);
06676 }
06677
06678 static VALUE
06679 rb_str_enumerate_chars(VALUE str, int wantarray)
06680 {
06681 VALUE orig = str;
06682 VALUE substr;
06683 long i, len, n;
06684 const char *ptr;
06685 rb_encoding *enc;
06686 VALUE UNINITIALIZED_VAR(ary);
06687
06688 str = rb_str_new4(str);
06689 ptr = RSTRING_PTR(str);
06690 len = RSTRING_LEN(str);
06691 enc = rb_enc_get(str);
06692
06693 if (rb_block_given_p()) {
06694 if (wantarray) {
06695 #if STRING_ENUMERATORS_WANTARRAY
06696 rb_warn("given block not used");
06697 ary = rb_ary_new_capa(str_strlen(str, enc));
06698 #else
06699 rb_warning("passing a block to String#chars is deprecated");
06700 wantarray = 0;
06701 #endif
06702 }
06703 }
06704 else {
06705 if (wantarray)
06706 ary = rb_ary_new_capa(str_strlen(str, enc));
06707 else
06708 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
06709 }
06710
06711 switch (ENC_CODERANGE(str)) {
06712 case ENC_CODERANGE_VALID:
06713 case ENC_CODERANGE_7BIT:
06714 for (i = 0; i < len; i += n) {
06715 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06716 substr = rb_str_subseq(str, i, n);
06717 if (wantarray)
06718 rb_ary_push(ary, substr);
06719 else
06720 rb_yield(substr);
06721 }
06722 break;
06723 default:
06724 for (i = 0; i < len; i += n) {
06725 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06726 substr = rb_str_subseq(str, i, n);
06727 if (wantarray)
06728 rb_ary_push(ary, substr);
06729 else
06730 rb_yield(substr);
06731 }
06732 }
06733 RB_GC_GUARD(str);
06734 if (wantarray)
06735 return ary;
06736 else
06737 return orig;
06738 }
06739
06740
06741
06742
06743
06744
06745
06746
06747
06748
06749
06750
06751
06752
06753
06754
06755 static VALUE
06756 rb_str_each_char(VALUE str)
06757 {
06758 return rb_str_enumerate_chars(str, 0);
06759 }
06760
06761
06762
06763
06764
06765
06766
06767
06768
06769
06770
06771
06772 static VALUE
06773 rb_str_chars(VALUE str)
06774 {
06775 return rb_str_enumerate_chars(str, 1);
06776 }
06777
06778
06779 static VALUE
06780 rb_str_enumerate_codepoints(VALUE str, int wantarray)
06781 {
06782 VALUE orig = str;
06783 int n;
06784 unsigned int c;
06785 const char *ptr, *end;
06786 rb_encoding *enc;
06787 VALUE UNINITIALIZED_VAR(ary);
06788
06789 if (single_byte_optimizable(str))
06790 return rb_str_enumerate_bytes(str, wantarray);
06791
06792 str = rb_str_new4(str);
06793 ptr = RSTRING_PTR(str);
06794 end = RSTRING_END(str);
06795 enc = STR_ENC_GET(str);
06796
06797 if (rb_block_given_p()) {
06798 if (wantarray) {
06799 #if STRING_ENUMERATORS_WANTARRAY
06800 rb_warn("given block not used");
06801 ary = rb_ary_new_capa(str_strlen(str, enc));
06802 #else
06803 rb_warning("passing a block to String#codepoints is deprecated");
06804 wantarray = 0;
06805 #endif
06806 }
06807 }
06808 else {
06809 if (wantarray)
06810 ary = rb_ary_new_capa(str_strlen(str, enc));
06811 else
06812 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
06813 }
06814
06815 while (ptr < end) {
06816 c = rb_enc_codepoint_len(ptr, end, &n, enc);
06817 if (wantarray)
06818 rb_ary_push(ary, UINT2NUM(c));
06819 else
06820 rb_yield(UINT2NUM(c));
06821 ptr += n;
06822 }
06823 RB_GC_GUARD(str);
06824 if (wantarray)
06825 return ary;
06826 else
06827 return orig;
06828 }
06829
06830
06831
06832
06833
06834
06835
06836
06837
06838
06839
06840
06841
06842
06843
06844
06845
06846
06847
06848 static VALUE
06849 rb_str_each_codepoint(VALUE str)
06850 {
06851 return rb_str_enumerate_codepoints(str, 0);
06852 }
06853
06854
06855
06856
06857
06858
06859
06860
06861
06862
06863
06864
06865
06866 static VALUE
06867 rb_str_codepoints(VALUE str)
06868 {
06869 return rb_str_enumerate_codepoints(str, 1);
06870 }
06871
06872
06873 static long
06874 chopped_length(VALUE str)
06875 {
06876 rb_encoding *enc = STR_ENC_GET(str);
06877 const char *p, *p2, *beg, *end;
06878
06879 beg = RSTRING_PTR(str);
06880 end = beg + RSTRING_LEN(str);
06881 if (beg > end) return 0;
06882 p = rb_enc_prev_char(beg, end, end, enc);
06883 if (!p) return 0;
06884 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06885 p2 = rb_enc_prev_char(beg, p, end, enc);
06886 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06887 }
06888 return p - beg;
06889 }
06890
06891
06892
06893
06894
06895
06896
06897
06898
06899
06900 static VALUE
06901 rb_str_chop_bang(VALUE str)
06902 {
06903 str_modify_keep_cr(str);
06904 if (RSTRING_LEN(str) > 0) {
06905 long len;
06906 len = chopped_length(str);
06907 STR_SET_LEN(str, len);
06908 RSTRING_PTR(str)[len] = '\0';
06909 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06910 ENC_CODERANGE_CLEAR(str);
06911 }
06912 return str;
06913 }
06914 return Qnil;
06915 }
06916
06917
06918
06919
06920
06921
06922
06923
06924
06925
06926
06927
06928
06929
06930
06931
06932
06933
06934
06935 static VALUE
06936 rb_str_chop(VALUE str)
06937 {
06938 return rb_str_subseq(str, 0, chopped_length(str));
06939 }
06940
06941
06942
06943
06944
06945
06946
06947
06948
06949
/*
 * Removes a trailing record separator from str in place.
 *
 * The separator is the optional argument, or rb_rs when no argument is
 * given.  Returns str when something was removed and Qnil otherwise
 * (including for an empty receiver or a nil separator).
 *
 *   - default separator, or a separator that is exactly "\n": "smart"
 *     chomp, which removes a trailing "\n", "\r\n" or "\r";
 *   - empty separator: removes every trailing "\n" / "\r\n";
 *   - anything else: removes the separator only if str ends with it on a
 *     character boundary.
 */
06950 static VALUE
06951 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06952 {
06953 rb_encoding *enc;
06954 VALUE rs;
06955 int newline;
06956 char *p, *pp, *e;
06957 long len, rslen;
06958
06959 str_modify_keep_cr(str);
06960 len = RSTRING_LEN(str);
06961 if (len == 0) return Qnil;
06962 p = RSTRING_PTR(str);
06963 e = p + len;
06964 if (argc == 0) {
06965 rs = rb_rs;
06966 if (rs == rb_default_rs) {
/* Also entered by goto from below when the separator is a lone "\n". */
06967 smart_chomp:
06968 enc = rb_enc_get(str);
/* Encodings whose minimum character length exceeds one byte: step back
 * by whole characters instead of inspecting single bytes. */
06969 if (rb_enc_mbminlen(enc) > 1) {
06970 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06971 if (rb_enc_is_newline(pp, e, enc)) {
06972 e = pp;
06973 }
06974 pp = e - rb_enc_mbminlen(enc);
06975 if (pp >= p) {
06976 pp = rb_enc_left_char_head(p, pp, e, enc);
06977 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06978 e = pp;
06979 }
06980 }
/* e did not move: nothing was chomped. */
06981 if (e == RSTRING_END(str)) {
06982 return Qnil;
06983 }
06984 len = e - RSTRING_PTR(str);
06985 STR_SET_LEN(str, len);
06986 }
06987 else {
/* Byte-wise case: drop "\n" (and a "\r" before it), or a lone "\r". */
06988 if (RSTRING_PTR(str)[len-1] == '\n') {
06989 STR_DEC_LEN(str);
06990 if (RSTRING_LEN(str) > 0 &&
06991 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06992 STR_DEC_LEN(str);
06993 }
06994 }
06995 else if (RSTRING_PTR(str)[len-1] == '\r') {
06996 STR_DEC_LEN(str);
06997 }
06998 else {
06999 return Qnil;
07000 }
07001 }
07002 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
07003 return str;
07004 }
07005 }
07006 else {
07007 rb_scan_args(argc, argv, "01", &rs);
07008 }
07009 if (NIL_P(rs)) return Qnil;
07010 StringValue(rs);
07011 rslen = RSTRING_LEN(rs);
/* Empty separator: strip all trailing newlines, each optionally
 * preceded by "\r". */
07012 if (rslen == 0) {
07013 while (len>0 && p[len-1] == '\n') {
07014 len--;
07015 if (len>0 && p[len-1] == '\r')
07016 len--;
07017 }
07018 if (len < RSTRING_LEN(str)) {
07019 STR_SET_LEN(str, len);
07020 RSTRING_PTR(str)[len] = '\0';
07021 return str;
07022 }
07023 return Qnil;
07024 }
07025 if (rslen > len) return Qnil;
/* newline holds the separator's last byte, used as a cheap pre-check. */
07026 newline = RSTRING_PTR(rs)[rslen-1];
07027 if (rslen == 1 && newline == '\n')
07028 goto smart_chomp;
07029
07030 enc = rb_enc_check(str, rs);
07031 if (is_broken_string(rs)) {
07032 return Qnil;
07033 }
07034 pp = e - rslen;
07035 if (p[len-1] == newline &&
07036 (rslen <= 1 ||
07037 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
/* The match must begin at a character head, not inside a character. */
07038 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
07039 return Qnil;
07040 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
07041 ENC_CODERANGE_CLEAR(str);
07042 }
07043 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
07044 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
07045 return str;
07046 }
07047 return Qnil;
07048 }
07049
07050
07051
07052
07053
07054
07055
07056
07057
07058
07059
07060
07061
07062
07063
07064
07065
07066
07067
07068
07069
07070
07071
07072
07073 static VALUE
07074 rb_str_chomp(int argc, VALUE *argv, VALUE str)
07075 {
07076 str = rb_str_dup(str);
07077 rb_str_chomp_bang(argc, argv, str);
07078 return str;
07079 }
07080
07081
07082
07083
07084
07085
07086
07087
07088
07089
07090
07091
07092
07093 static VALUE
07094 rb_str_lstrip_bang(VALUE str)
07095 {
07096 rb_encoding *enc;
07097 char *s, *t, *e;
07098
07099 str_modify_keep_cr(str);
07100 enc = STR_ENC_GET(str);
07101 s = RSTRING_PTR(str);
07102 if (!s || RSTRING_LEN(str) == 0) return Qnil;
07103 e = t = RSTRING_END(str);
07104
07105 while (s < e) {
07106 int n;
07107 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
07108
07109 if (!rb_isspace(cc)) break;
07110 s += n;
07111 }
07112
07113 if (s > RSTRING_PTR(str)) {
07114 STR_SET_LEN(str, t-s);
07115 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
07116 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
07117 return str;
07118 }
07119 return Qnil;
07120 }
07121
07122
07123
07124
07125
07126
07127
07128
07129
07130
07131
07132
07133
07134 static VALUE
07135 rb_str_lstrip(VALUE str)
07136 {
07137 str = rb_str_dup(str);
07138 rb_str_lstrip_bang(str);
07139 return str;
07140 }
07141
07142
07143
07144
07145
07146
07147
07148
07149
07150
07151
07152
07153
07154
07155 static VALUE
07156 rb_str_rstrip_bang(VALUE str)
07157 {
07158 rb_encoding *enc;
07159 char *s, *t, *e;
07160
07161 str_modify_keep_cr(str);
07162 enc = STR_ENC_GET(str);
07163 rb_str_check_dummy_enc(enc);
07164 s = RSTRING_PTR(str);
07165 if (!s || RSTRING_LEN(str) == 0) return Qnil;
07166 t = e = RSTRING_END(str);
07167
07168
07169 if (single_byte_optimizable(str)) {
07170 unsigned char c;
07171 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
07172 }
07173 else {
07174 char *tp;
07175
07176 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
07177 unsigned int c = rb_enc_codepoint(tp, e, enc);
07178 if (c && !rb_isspace(c)) break;
07179 t = tp;
07180 }
07181 }
07182 if (t < e) {
07183 long len = t-RSTRING_PTR(str);
07184
07185 STR_SET_LEN(str, len);
07186 RSTRING_PTR(str)[len] = '\0';
07187 return str;
07188 }
07189 return Qnil;
07190 }
07191
07192
07193
07194
07195
07196
07197
07198
07199
07200
07201
07202
07203
07204 static VALUE
07205 rb_str_rstrip(VALUE str)
07206 {
07207 str = rb_str_dup(str);
07208 rb_str_rstrip_bang(str);
07209 return str;
07210 }
07211
07212
07213
07214
07215
07216
07217
07218
07219
07220
07221 static VALUE
07222 rb_str_strip_bang(VALUE str)
07223 {
07224 VALUE l = rb_str_lstrip_bang(str);
07225 VALUE r = rb_str_rstrip_bang(str);
07226
07227 if (NIL_P(l) && NIL_P(r)) return Qnil;
07228 return str;
07229 }
07230
07231
07232
07233
07234
07235
07236
07237
07238
07239
07240
07241
07242 static VALUE
07243 rb_str_strip(VALUE str)
07244 {
07245 str = rb_str_dup(str);
07246 rb_str_strip_bang(str);
07247 return str;
07248 }
07249
07250 static VALUE
07251 scan_once(VALUE str, VALUE pat, long *start)
07252 {
07253 VALUE result, match;
07254 struct re_registers *regs;
07255 int i;
07256
07257 if (rb_reg_search(pat, str, *start, 0) >= 0) {
07258 match = rb_backref_get();
07259 regs = RMATCH_REGS(match);
07260 if (BEG(0) == END(0)) {
07261 rb_encoding *enc = STR_ENC_GET(str);
07262
07263
07264
07265 if (RSTRING_LEN(str) > END(0))
07266 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
07267 RSTRING_END(str), enc);
07268 else
07269 *start = END(0)+1;
07270 }
07271 else {
07272 *start = END(0);
07273 }
07274 if (regs->num_regs == 1) {
07275 return rb_reg_nth_match(0, match);
07276 }
07277 result = rb_ary_new2(regs->num_regs);
07278 for (i=1; i < regs->num_regs; i++) {
07279 rb_ary_push(result, rb_reg_nth_match(i, match));
07280 }
07281
07282 return result;
07283 }
07284 return Qnil;
07285 }
07286
07287
07288
07289
07290
07291
07292
07293
07294
07295
07296
07297
07298
07299
07300
07301
07302
07303
07304
07305
07306
07307
07308
07309
07310
07311
07312
07313
07314
07315
07316
07317
07318
07319 static VALUE
07320 rb_str_scan(VALUE str, VALUE pat)
07321 {
07322 VALUE result;
07323 long start = 0;
07324 long last = -1, prev = 0;
07325 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
07326
07327 pat = get_pat(pat, 1);
07328 if (!rb_block_given_p()) {
07329 VALUE ary = rb_ary_new();
07330
07331 while (!NIL_P(result = scan_once(str, pat, &start))) {
07332 last = prev;
07333 prev = start;
07334 rb_ary_push(ary, result);
07335 }
07336 if (last >= 0) rb_reg_search(pat, str, last, 0);
07337 return ary;
07338 }
07339
07340 while (!NIL_P(result = scan_once(str, pat, &start))) {
07341 last = prev;
07342 prev = start;
07343 rb_yield(result);
07344 str_mod_check(str, p, len);
07345 }
07346 if (last >= 0) rb_reg_search(pat, str, last, 0);
07347 return str;
07348 }
07349
07350
07351
07352
07353
07354
07355
07356
07357
07358
07359
07360
07361
07362
07363
07364
07365 static VALUE
07366 rb_str_hex(VALUE str)
07367 {
07368 return rb_str_to_inum(str, 16, FALSE);
07369 }
07370
07371
07372
07373
07374
07375
07376
07377
07378
07379
07380
07381
07382
07383
07384
07385
07386 static VALUE
07387 rb_str_oct(VALUE str)
07388 {
07389 return rb_str_to_inum(str, -8, FALSE);
07390 }
07391
07392
07393
07394
07395
07396
07397
07398
07399
07400
07401
07402
07403
07404
07405
07406
07407
07408
07409
07410 static VALUE
07411 rb_str_crypt(VALUE str, VALUE salt)
07412 {
07413 extern char *crypt(const char *, const char *);
07414 VALUE result;
07415 const char *s, *saltp;
07416 char *res;
07417 #ifdef BROKEN_CRYPT
07418 char salt_8bit_clean[3];
07419 #endif
07420
07421 StringValue(salt);
07422 if (RSTRING_LEN(salt) < 2)
07423 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
07424
07425 s = RSTRING_PTR(str);
07426 if (!s) s = "";
07427 saltp = RSTRING_PTR(salt);
07428 #ifdef BROKEN_CRYPT
07429 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
07430 salt_8bit_clean[0] = saltp[0] & 0x7f;
07431 salt_8bit_clean[1] = saltp[1] & 0x7f;
07432 salt_8bit_clean[2] = '\0';
07433 saltp = salt_8bit_clean;
07434 }
07435 #endif
07436 res = crypt(s, saltp);
07437 if (!res) {
07438 rb_sys_fail("crypt");
07439 }
07440 result = rb_str_new2(res);
07441 FL_SET_RAW(result, OBJ_TAINTED_RAW(str) | OBJ_TAINTED_RAW(salt));
07442 return result;
07443 }
07444
07445
07446
07447
07448
07449
07450
07451
07452
07453
07454
07455
07456
07457
07458
07459
07460
07461
07462
07463
07464
07465
07466 VALUE
07467 rb_str_intern(VALUE s)
07468 {
07469 VALUE str = RB_GC_GUARD(s);
07470 ID id;
07471
07472 id = rb_intern_str(str);
07473 return ID2SYM(id);
07474 }
07475
07476
07477
07478
07479
07480
07481
07482
07483
07484
07485
07486 VALUE
07487 rb_str_ord(VALUE s)
07488 {
07489 unsigned int c;
07490
07491 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
07492 return UINT2NUM(c);
07493 }
07494
07495
07496
07497
07498
07499
07500
07501
07502
07503
07504
07505 static VALUE
07506 rb_str_sum(int argc, VALUE *argv, VALUE str)
07507 {
07508 VALUE vbits;
07509 int bits;
07510 char *ptr, *p, *pend;
07511 long len;
07512 VALUE sum = INT2FIX(0);
07513 unsigned long sum0 = 0;
07514
07515 if (argc == 0) {
07516 bits = 16;
07517 }
07518 else {
07519 rb_scan_args(argc, argv, "01", &vbits);
07520 bits = NUM2INT(vbits);
07521 }
07522 ptr = p = RSTRING_PTR(str);
07523 len = RSTRING_LEN(str);
07524 pend = p + len;
07525
07526 while (p < pend) {
07527 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
07528 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07529 str_mod_check(str, ptr, len);
07530 sum0 = 0;
07531 }
07532 sum0 += (unsigned char)*p;
07533 p++;
07534 }
07535
07536 if (bits == 0) {
07537 if (sum0) {
07538 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07539 }
07540 }
07541 else {
07542 if (sum == INT2FIX(0)) {
07543 if (bits < (int)sizeof(long)*CHAR_BIT) {
07544 sum0 &= (((unsigned long)1)<<bits)-1;
07545 }
07546 sum = LONG2FIX(sum0);
07547 }
07548 else {
07549 VALUE mod;
07550
07551 if (sum0) {
07552 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07553 }
07554
07555 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
07556 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
07557 sum = rb_funcall(sum, '&', 1, mod);
07558 }
07559 }
07560 return sum;
07561 }
07562
/*
 * Shared implementation of String#ljust, #rjust and #center.
 *
 * argv holds the target width (in characters) and an optional padding
 * string; jflag selects the mode: 'l' pads on the right only, 'r' pads on
 * the left only, and any other value splits the padding between both
 * sides, the left side receiving n/2 (rounded down).
 *
 * Returns a copy of str (rb_str_dup) when width is negative or str is
 * already at least width characters long.  Raises ArgumentError for a
 * padding string with zero bytes or zero characters ("zero width padding")
 * and when the resulting byte length would overflow ("argument too big").
 */
static VALUE
rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
{
    rb_encoding *enc;
    VALUE w;
    /* flen: pad length in bytes; fclen: pad length in characters */
    long width, len, flen = 1, fclen = 1;
    VALUE res;
    char *p;
    const char *f = " ";        /* default padding: a single space */
    /* llen/rlen: characters to pad left/right;
     * llen2/rlen2: byte length of the trailing partial copy of the pad */
    long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
    volatile VALUE pad;
    int singlebyte = 1, cr;

    rb_scan_args(argc, argv, "11", &w, &pad);
    enc = STR_ENC_GET(str);
    width = NUM2LONG(w);
    if (argc == 2) {
        StringValue(pad);
        enc = rb_enc_check(str, pad);
        f = RSTRING_PTR(pad);
        flen = RSTRING_LEN(pad);
        fclen = str_strlen(pad, enc);
        singlebyte = single_byte_optimizable(pad);
        if (flen == 0 || fclen == 0) {
            rb_raise(rb_eArgError, "zero width padding");
        }
    }
    len = str_strlen(str, enc);
    if (width < 0 || len >= width) return rb_str_dup(str);
    n = width - len;            /* total number of padding characters */
    llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
    rlen = n - llen;
    cr = ENC_CODERANGE(str);
    if (flen > 1) {
        /* byte offsets covering the leftover (llen % fclen) characters */
        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
    }
    size = RSTRING_LEN(str);
    /* Compute the padding's byte length step by step, checking each step
     * against LONG_MAX before the value is used. */
    if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
        (len += llen2 + rlen2) >= LONG_MAX - size) {
        rb_raise(rb_eArgError, "argument too big");
    }
    len += size;
    res = rb_str_new5(str, 0, len);
    p = RSTRING_PTR(res);
    /* left padding */
    if (flen <= 1) {
        memset(p, *f, llen);
        p += llen;
    }
    else {
        while (llen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            llen -= fclen;
        }
        if (llen > 0) {
            memcpy(p, f, llen2);
            p += llen2;
        }
    }
    /* original contents */
    memcpy(p, RSTRING_PTR(str), size);
    p += size;
    /* right padding */
    if (flen <= 1) {
        memset(p, *f, rlen);
        p += rlen;
    }
    else {
        while (rlen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            rlen -= fclen;
        }
        if (rlen > 0) {
            memcpy(p, f, rlen2);
            p += rlen2;
        }
    }
    *p = '\0';
    STR_SET_LEN(res, p-RSTRING_PTR(res));
    OBJ_INFECT_RAW(res, str);
    if (!NIL_P(pad)) OBJ_INFECT_RAW(res, pad);
    rb_enc_associate(res, enc);
    /* Combine the code ranges of str and pad; a broken range is not
     * recorded on the result. */
    if (argc == 2)
        cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
    if (cr != ENC_CODERANGE_BROKEN)
        ENC_CODERANGE_SET(res, cr);
    return res;
}
07652
07653
07654
07655
07656
07657
07658
07659
07660
07661
07662
07663
07664
07665
07666
07667 static VALUE
07668 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07669 {
07670 return rb_str_justify(argc, argv, str, 'l');
07671 }
07672
07673
07674
07675
07676
07677
07678
07679
07680
07681
07682
07683
07684
07685
07686
07687 static VALUE
07688 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07689 {
07690 return rb_str_justify(argc, argv, str, 'r');
07691 }
07692
07693
07694
07695
07696
07697
07698
07699
07700
07701
07702
07703
07704
07705
07706
07707 static VALUE
07708 rb_str_center(int argc, VALUE *argv, VALUE str)
07709 {
07710 return rb_str_justify(argc, argv, str, 'c');
07711 }
07712
07713
07714
07715
07716
07717
07718
07719
07720
07721
07722
07723
07724
07725
07726
07727
07728 static VALUE
07729 rb_str_partition(VALUE str, VALUE sep)
07730 {
07731 long pos;
07732 int regex = FALSE;
07733
07734 if (RB_TYPE_P(sep, T_REGEXP)) {
07735 pos = rb_reg_search(sep, str, 0, 0);
07736 regex = TRUE;
07737 }
07738 else {
07739 VALUE tmp;
07740
07741 tmp = rb_check_string_type(sep);
07742 if (NIL_P(tmp)) {
07743 rb_raise(rb_eTypeError, "type mismatch: %s given",
07744 rb_obj_classname(sep));
07745 }
07746 sep = tmp;
07747 pos = rb_str_index(str, sep, 0);
07748 }
07749 if (pos < 0) {
07750 failed:
07751 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07752 }
07753 if (regex) {
07754 sep = rb_str_subpat(str, sep, INT2FIX(0));
07755 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07756 }
07757 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07758 sep,
07759 rb_str_subseq(str, pos+RSTRING_LEN(sep),
07760 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07761 }
07762
07763
07764
07765
07766
07767
07768
07769
07770
07771
07772
07773
07774
07775
07776
07777
07778 static VALUE
07779 rb_str_rpartition(VALUE str, VALUE sep)
07780 {
07781 long pos = RSTRING_LEN(str);
07782 int regex = FALSE;
07783
07784 if (RB_TYPE_P(sep, T_REGEXP)) {
07785 pos = rb_reg_search(sep, str, pos, 1);
07786 regex = TRUE;
07787 }
07788 else {
07789 VALUE tmp;
07790
07791 tmp = rb_check_string_type(sep);
07792 if (NIL_P(tmp)) {
07793 rb_raise(rb_eTypeError, "type mismatch: %s given",
07794 rb_obj_classname(sep));
07795 }
07796 sep = tmp;
07797 pos = rb_str_sublen(str, pos);
07798 pos = rb_str_rindex(str, sep, pos);
07799 }
07800 if (pos < 0) {
07801 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07802 }
07803 if (regex) {
07804 sep = rb_reg_nth_match(0, rb_backref_get());
07805 }
07806 else {
07807 pos = rb_str_offset(str, pos);
07808 }
07809 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07810 sep,
07811 rb_str_subseq(str, pos+RSTRING_LEN(sep),
07812 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07813 }
07814
07815
07816
07817
07818
07819
07820
07821
07822
07823
07824
07825
07826
07827
07828 static VALUE
07829 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07830 {
07831 int i;
07832
07833 for (i=0; i<argc; i++) {
07834 VALUE tmp = argv[i];
07835 StringValue(tmp);
07836 rb_enc_check(str, tmp);
07837 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07838 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07839 return Qtrue;
07840 }
07841 return Qfalse;
07842 }
07843
07844
07845
07846
07847
07848
07849
07850
07851 static VALUE
07852 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07853 {
07854 int i;
07855 char *p, *s, *e;
07856 rb_encoding *enc;
07857
07858 for (i=0; i<argc; i++) {
07859 VALUE tmp = argv[i];
07860 StringValue(tmp);
07861 enc = rb_enc_check(str, tmp);
07862 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07863 p = RSTRING_PTR(str);
07864 e = p + RSTRING_LEN(str);
07865 s = e - RSTRING_LEN(tmp);
07866 if (rb_enc_left_char_head(p, s, e, enc) != s)
07867 continue;
07868 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07869 return Qtrue;
07870 }
07871 return Qfalse;
07872 }
07873
07874 void
07875 rb_str_setter(VALUE val, ID id, VALUE *var)
07876 {
07877 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
07878 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07879 }
07880 *var = val;
07881 }
07882
07883
07884
07885
07886
07887
07888
07889
07890
07891 static VALUE
07892 rb_str_force_encoding(VALUE str, VALUE enc)
07893 {
07894 str_modifiable(str);
07895 rb_enc_associate(str, rb_to_encoding(enc));
07896 ENC_CODERANGE_CLEAR(str);
07897 return str;
07898 }
07899
07900
07901
07902
07903
07904
07905
07906
07907 static VALUE
07908 rb_str_b(VALUE str)
07909 {
07910 VALUE str2 = str_alloc(rb_cString);
07911 str_replace_shared_without_enc(str2, str);
07912 OBJ_INFECT_RAW(str2, str);
07913 ENC_CODERANGE_CLEAR(str2);
07914 return str2;
07915 }
07916
07917
07918
07919
07920
07921
07922
07923
07924
07925
07926
07927
07928 static VALUE
07929 rb_str_valid_encoding_p(VALUE str)
07930 {
07931 int cr = rb_enc_str_coderange(str);
07932
07933 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07934 }
07935
07936
07937
07938
07939
07940
07941
07942
07943
07944
07945
07946 static VALUE
07947 rb_str_is_ascii_only_p(VALUE str)
07948 {
07949 int cr = rb_enc_str_coderange(str);
07950
07951 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07952 }
07953
/*
 * Shortens +str+ to at most +len+ characters, marking the cut with "...".
 *
 * Returns +str+ itself when it already fits.  When +len+ is no larger than
 * the ellipsis, the result is just the first +len+ bytes of "...".
 * Otherwise the result is a leading part of +str+ followed by "...".
 * For encodings that are not ASCII compatible the ellipsis is created as
 * US-ASCII and transcoded with rb_str_encode().
 *
 * Raises IndexError if +len+ is negative.
 */
VALUE
rb_str_ellipsize(VALUE str, long len)
{
    static const char ellipsis[] = "...";
    const long ellipsislen = sizeof(ellipsis) - 1;
    rb_encoding *const enc = rb_enc_get(str);
    const long blen = RSTRING_LEN(str);
    const char *const p = RSTRING_PTR(str), *e = p + blen;
    VALUE estr, ret = 0;

    if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
    /* Fits already: either even the minimum byte count for len characters
     * covers the string, or (second test, which also moves e)
     * rb_enc_nth(..., len, ...) lands on the end of the string. */
    if (len * rb_enc_mbminlen(enc) >= blen ||
        (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
        ret = str;
    }
    /* No room for text plus ellipsis.  Note the side effect in the second
     * test: len is overwritten with ellipsislen before rb_enc_step_back()
     * is called, and e is replaced by its result.
     * NOTE(review): presumably rb_enc_step_back() returns NULL when it
     * cannot move back that many characters -- confirm. */
    else if (len <= ellipsislen ||
             !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
        if (rb_enc_asciicompat(enc)) {
            ret = rb_str_new_with_class(str, ellipsis, len);
            rb_enc_associate(ret, enc);
        }
        else {
            estr = rb_usascii_str_new(ellipsis, len);
            ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
        }
    }
    /* General case: take the bytes [p, e) and append the ellipsis.  The
     * comma expression assigns ret before the encoding test is made. */
    else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
        rb_str_cat(ret, ellipsis, ellipsislen);
    }
    else {
        estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
                             rb_enc_from_encoding(enc), 0, Qnil);
        rb_str_append(ret, estr);
    }
    return ret;
}
08004
08005 static VALUE
08006 str_compat_and_valid(VALUE str, rb_encoding *enc)
08007 {
08008 int cr;
08009 str = StringValue(str);
08010 cr = rb_enc_str_coderange(str);
08011 if (cr == ENC_CODERANGE_BROKEN) {
08012 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
08013 }
08014 else if (cr == ENC_CODERANGE_7BIT) {
08015 rb_encoding *e = STR_ENC_GET(str);
08016 if (!rb_enc_asciicompat(enc)) {
08017 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
08018 rb_enc_name(enc), rb_enc_name(e));
08019 }
08020 }
08021 else {
08022 rb_encoding *e = STR_ENC_GET(str);
08023 if (enc != e) {
08024 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
08025 rb_enc_name(enc), rb_enc_name(e));
08026 }
08027 }
08028 return str;
08029 }
08030
08036 VALUE
08037 rb_str_scrub(VALUE str, VALUE repl)
08038 {
08039 int cr = ENC_CODERANGE(str);
08040 rb_encoding *enc;
08041 int encidx;
08042 VALUE buf = Qnil;
08043 const char *rep;
08044 long replen;
08045 int tainted = 0;
08046
08047 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
08048 return Qnil;
08049
08050 enc = STR_ENC_GET(str);
08051 if (!NIL_P(repl)) {
08052 repl = str_compat_and_valid(repl, enc);
08053 tainted = OBJ_TAINTED_RAW(repl);
08054 }
08055
08056 if (rb_enc_dummy_p(enc)) {
08057 return Qnil;
08058 }
08059 encidx = rb_enc_to_index(enc);
08060
08061 #define DEFAULT_REPLACE_CHAR(str) do { \
08062 static const char replace[sizeof(str)-1] = str; \
08063 rep = replace; replen = (int)sizeof(replace); \
08064 } while (0)
08065
08066 if (rb_enc_asciicompat(enc)) {
08067 const char *p = RSTRING_PTR(str);
08068 const char *e = RSTRING_END(str);
08069 const char *p1 = p;
08070 int rep7bit_p;
08071 if (rb_block_given_p()) {
08072 rep = NULL;
08073 replen = 0;
08074 rep7bit_p = FALSE;
08075 }
08076 else if (!NIL_P(repl)) {
08077 rep = RSTRING_PTR(repl);
08078 replen = RSTRING_LEN(repl);
08079 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
08080 }
08081 else if (encidx == rb_utf8_encindex()) {
08082 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
08083 rep7bit_p = FALSE;
08084 }
08085 else {
08086 DEFAULT_REPLACE_CHAR("?");
08087 rep7bit_p = TRUE;
08088 }
08089 cr = ENC_CODERANGE_7BIT;
08090
08091 p = search_nonascii(p, e);
08092 if (!p) {
08093 p = e;
08094 }
08095 while (p < e) {
08096 int ret = rb_enc_precise_mbclen(p, e, enc);
08097 if (MBCLEN_NEEDMORE_P(ret)) {
08098 break;
08099 }
08100 else if (MBCLEN_CHARFOUND_P(ret)) {
08101 cr = ENC_CODERANGE_VALID;
08102 p += MBCLEN_CHARFOUND_LEN(ret);
08103 }
08104 else if (MBCLEN_INVALID_P(ret)) {
08105
08106
08107
08108
08109 long clen = rb_enc_mbmaxlen(enc);
08110 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
08111 if (p > p1) {
08112 rb_str_buf_cat(buf, p1, p - p1);
08113 }
08114
08115 if (e - p < clen) clen = e - p;
08116 if (clen <= 2) {
08117 clen = 1;
08118 }
08119 else {
08120 const char *q = p;
08121 clen--;
08122 for (; clen > 1; clen--) {
08123 ret = rb_enc_precise_mbclen(q, q + clen, enc);
08124 if (MBCLEN_NEEDMORE_P(ret)) break;
08125 if (MBCLEN_INVALID_P(ret)) continue;
08126 UNREACHABLE;
08127 }
08128 }
08129 if (rep) {
08130 rb_str_buf_cat(buf, rep, replen);
08131 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
08132 }
08133 else {
08134 repl = rb_yield(rb_enc_str_new(p, clen, enc));
08135 repl = str_compat_and_valid(repl, enc);
08136 tainted |= OBJ_TAINTED_RAW(repl);
08137 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
08138 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
08139 cr = ENC_CODERANGE_VALID;
08140 }
08141 p += clen;
08142 p1 = p;
08143 p = search_nonascii(p, e);
08144 if (!p) {
08145 p = e;
08146 break;
08147 }
08148 }
08149 else {
08150 UNREACHABLE;
08151 }
08152 }
08153 if (NIL_P(buf)) {
08154 if (p == e) {
08155 ENC_CODERANGE_SET(str, cr);
08156 return Qnil;
08157 }
08158 buf = rb_str_buf_new(RSTRING_LEN(str));
08159 }
08160 if (p1 < p) {
08161 rb_str_buf_cat(buf, p1, p - p1);
08162 }
08163 if (p < e) {
08164 if (rep) {
08165 rb_str_buf_cat(buf, rep, replen);
08166 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
08167 }
08168 else {
08169 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
08170 repl = str_compat_and_valid(repl, enc);
08171 tainted |= OBJ_TAINTED_RAW(repl);
08172 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
08173 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
08174 cr = ENC_CODERANGE_VALID;
08175 }
08176 }
08177 }
08178 else {
08179
08180 const char *p = RSTRING_PTR(str);
08181 const char *e = RSTRING_END(str);
08182 const char *p1 = p;
08183 long mbminlen = rb_enc_mbminlen(enc);
08184 if (!NIL_P(repl)) {
08185 rep = RSTRING_PTR(repl);
08186 replen = RSTRING_LEN(repl);
08187 }
08188 else if (encidx == ENCINDEX_UTF_16BE) {
08189 DEFAULT_REPLACE_CHAR("\xFF\xFD");
08190 }
08191 else if (encidx == ENCINDEX_UTF_16LE) {
08192 DEFAULT_REPLACE_CHAR("\xFD\xFF");
08193 }
08194 else if (encidx == ENCINDEX_UTF_32BE) {
08195 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
08196 }
08197 else if (encidx == ENCINDEX_UTF_32LE) {
08198 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
08199 }
08200 else {
08201 DEFAULT_REPLACE_CHAR("?");
08202 }
08203
08204 while (p < e) {
08205 int ret = rb_enc_precise_mbclen(p, e, enc);
08206 if (MBCLEN_NEEDMORE_P(ret)) {
08207 break;
08208 }
08209 else if (MBCLEN_CHARFOUND_P(ret)) {
08210 p += MBCLEN_CHARFOUND_LEN(ret);
08211 }
08212 else if (MBCLEN_INVALID_P(ret)) {
08213 const char *q = p;
08214 long clen = rb_enc_mbmaxlen(enc);
08215 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
08216 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
08217
08218 if (e - p < clen) clen = e - p;
08219 if (clen <= mbminlen * 2) {
08220 clen = mbminlen;
08221 }
08222 else {
08223 clen -= mbminlen;
08224 for (; clen > mbminlen; clen-=mbminlen) {
08225 ret = rb_enc_precise_mbclen(q, q + clen, enc);
08226 if (MBCLEN_NEEDMORE_P(ret)) break;
08227 if (MBCLEN_INVALID_P(ret)) continue;
08228 UNREACHABLE;
08229 }
08230 }
08231 if (rep) {
08232 rb_str_buf_cat(buf, rep, replen);
08233 }
08234 else {
08235 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
08236 repl = str_compat_and_valid(repl, enc);
08237 tainted |= OBJ_TAINTED_RAW(repl);
08238 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
08239 }
08240 p += clen;
08241 p1 = p;
08242 }
08243 else {
08244 UNREACHABLE;
08245 }
08246 }
08247 if (NIL_P(buf)) {
08248 if (p == e) {
08249 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
08250 return Qnil;
08251 }
08252 buf = rb_str_buf_new(RSTRING_LEN(str));
08253 }
08254 if (p1 < p) {
08255 rb_str_buf_cat(buf, p1, p - p1);
08256 }
08257 if (p < e) {
08258 if (rep) {
08259 rb_str_buf_cat(buf, rep, replen);
08260 }
08261 else {
08262 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
08263 repl = str_compat_and_valid(repl, enc);
08264 tainted |= OBJ_TAINTED_RAW(repl);
08265 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
08266 }
08267 }
08268 cr = ENC_CODERANGE_VALID;
08269 }
08270 FL_SET_RAW(buf, tainted|OBJ_TAINTED_RAW(str));
08271 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
08272 return buf;
08273 }
08274
08275
08276
08277
08278
08279
08280
08281
08282
08283
08284
08285
08286
08287
08288
08289 static VALUE
08290 str_scrub(int argc, VALUE *argv, VALUE str)
08291 {
08292 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
08293 VALUE new = rb_str_scrub(str, repl);
08294 return NIL_P(new) ? rb_str_dup(str): new;
08295 }
08296
08297
08298
08299
08300
08301
08302
08303
08304
08305
08306
08307
08308
08309
08310
08311 static VALUE
08312 str_scrub_bang(int argc, VALUE *argv, VALUE str)
08313 {
08314 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
08315 VALUE new = rb_str_scrub(str, repl);
08316 if (!NIL_P(new)) rb_str_replace(str, new);
08317 return str;
08318 }
08319
08320
08321
08322
08323
08324
08325
08326
08327
08328
08329
08330
08331
08332
08333
08334
08335
08336
08337
08338
08339
08340
08341
08342
08343
08344
08345
08346
08347
08348
08349
08350
08351
08352
08353
08354
08355
08356
08357
08358
08359
08360
08361
08362 static VALUE
08363 sym_equal(VALUE sym1, VALUE sym2)
08364 {
08365 if (sym1 == sym2) return Qtrue;
08366 return Qfalse;
08367 }
08368
08369
08370 static int
08371 sym_printable(const char *s, const char *send, rb_encoding *enc)
08372 {
08373 while (s < send) {
08374 int n;
08375 int c = rb_enc_codepoint_len(s, send, &n, enc);
08376
08377 if (!rb_enc_isprint(c, enc)) return FALSE;
08378 s += n;
08379 }
08380 return TRUE;
08381 }
08382
08383 int
08384 rb_str_symname_p(VALUE sym)
08385 {
08386 rb_encoding *enc;
08387 const char *ptr;
08388 long len;
08389 rb_encoding *resenc = rb_default_internal_encoding();
08390
08391 if (resenc == NULL) resenc = rb_default_external_encoding();
08392 enc = STR_ENC_GET(sym);
08393 ptr = RSTRING_PTR(sym);
08394 len = RSTRING_LEN(sym);
08395 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
08396 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
08397 return FALSE;
08398 }
08399 return TRUE;
08400 }
08401
08402 VALUE
08403 rb_str_quote_unprintable(VALUE str)
08404 {
08405 rb_encoding *enc;
08406 const char *ptr;
08407 long len;
08408 rb_encoding *resenc;
08409
08410 Check_Type(str, T_STRING);
08411 resenc = rb_default_internal_encoding();
08412 if (resenc == NULL) resenc = rb_default_external_encoding();
08413 enc = STR_ENC_GET(str);
08414 ptr = RSTRING_PTR(str);
08415 len = RSTRING_LEN(str);
08416 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
08417 !sym_printable(ptr, ptr + len, enc)) {
08418 return rb_str_inspect(str);
08419 }
08420 return str;
08421 }
08422
08423 VALUE
08424 rb_id_quote_unprintable(ID id)
08425 {
08426 return rb_str_quote_unprintable(rb_id2str(id));
08427 }
08428
08429
08430
08431
08432
08433
08434
08435
08436
08437
08438 static VALUE
08439 sym_inspect(VALUE sym)
08440 {
08441 VALUE str;
08442 const char *ptr;
08443 long len;
08444 ID id = SYM2ID(sym);
08445 char *dest;
08446
08447 sym = rb_id2str(id);
08448 if (!rb_str_symname_p(sym)) {
08449 str = rb_str_inspect(sym);
08450 len = RSTRING_LEN(str);
08451 rb_str_resize(str, len + 1);
08452 dest = RSTRING_PTR(str);
08453 memmove(dest + 1, dest, len);
08454 dest[0] = ':';
08455 }
08456 else {
08457 rb_encoding *enc = STR_ENC_GET(sym);
08458 ptr = RSTRING_PTR(sym);
08459 len = RSTRING_LEN(sym);
08460 str = rb_enc_str_new(0, len + 1, enc);
08461 dest = RSTRING_PTR(str);
08462 dest[0] = ':';
08463 memcpy(dest + 1, ptr, len);
08464 }
08465 return str;
08466 }
08467
08468
08469
08470
08471
08472
08473
08474
08475
08476
08477
08478
08479
08480 VALUE
08481 rb_sym_to_s(VALUE sym)
08482 {
08483 ID id = SYM2ID(sym);
08484
08485 return str_new3(rb_cString, rb_id2str(id));
08486 }
08487
08488
08489
08490
08491
08492
08493
08494
08495
08496
08497
08498
08499 static VALUE
08500 sym_to_sym(VALUE sym)
08501 {
08502 return sym;
08503 }
08504
08505 static VALUE
08506 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
08507 {
08508 VALUE obj;
08509
08510 if (argc < 1) {
08511 rb_raise(rb_eArgError, "no receiver given");
08512 }
08513 obj = argv[0];
08514 return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
08515 }
08516
08517
08518
08519
08520
08521
08522
08523
08524
08525
08526 static VALUE
08527 sym_to_proc(VALUE sym)
08528 {
08529 static VALUE sym_proc_cache = Qfalse;
08530 enum {SYM_PROC_CACHE_SIZE = 67};
08531 VALUE proc;
08532 long id, index;
08533 VALUE *aryp;
08534
08535 if (!sym_proc_cache) {
08536 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
08537 rb_gc_register_mark_object(sym_proc_cache);
08538 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
08539 }
08540
08541 id = SYM2ID(sym);
08542 index = (id % SYM_PROC_CACHE_SIZE) << 1;
08543
08544 aryp = RARRAY_PTR(sym_proc_cache);
08545 if (aryp[index] == sym) {
08546 return aryp[index + 1];
08547 }
08548 else {
08549 proc = rb_proc_new(sym_call, (VALUE)id);
08550 rb_block_clear_env_self(proc);
08551 aryp[index] = sym;
08552 aryp[index + 1] = proc;
08553 return proc;
08554 }
08555 }
08556
08557
08558
08559
08560
08561
08562
08563
08564
08565 static VALUE
08566 sym_succ(VALUE sym)
08567 {
08568 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
08569 }
08570
08571
08572
08573
08574
08575
08576
08577
08578
08579
08580
08581
08582
08583
08584
08585 static VALUE
08586 sym_cmp(VALUE sym, VALUE other)
08587 {
08588 if (!SYMBOL_P(other)) {
08589 return Qnil;
08590 }
08591 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
08592 }
08593
08594
08595
08596
08597
08598
08599
08600
08601
08602 static VALUE
08603 sym_casecmp(VALUE sym, VALUE other)
08604 {
08605 if (!SYMBOL_P(other)) {
08606 return Qnil;
08607 }
08608 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
08609 }
08610
08611
08612
08613
08614
08615
08616
08617
08618
08619 static VALUE
08620 sym_match(VALUE sym, VALUE other)
08621 {
08622 return rb_str_match(rb_sym_to_s(sym), other);
08623 }
08624
08625
08626
08627
08628
08629
08630
08631
08632
08633
08634
08635 static VALUE
08636 sym_aref(int argc, VALUE *argv, VALUE sym)
08637 {
08638 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
08639 }
08640
08641
08642
08643
08644
08645
08646
08647
08648
08649 static VALUE
08650 sym_length(VALUE sym)
08651 {
08652 return rb_str_length(rb_id2str(SYM2ID(sym)));
08653 }
08654
08655
08656
08657
08658
08659
08660
08661
08662 static VALUE
08663 sym_empty(VALUE sym)
08664 {
08665 return rb_str_empty(rb_id2str(SYM2ID(sym)));
08666 }
08667
08668
08669
08670
08671
08672
08673
08674
08675 static VALUE
08676 sym_upcase(VALUE sym)
08677 {
08678 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
08679 }
08680
08681
08682
08683
08684
08685
08686
08687
08688 static VALUE
08689 sym_downcase(VALUE sym)
08690 {
08691 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
08692 }
08693
08694
08695
08696
08697
08698
08699
08700
08701 static VALUE
08702 sym_capitalize(VALUE sym)
08703 {
08704 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
08705 }
08706
08707
08708
08709
08710
08711
08712
08713
08714 static VALUE
08715 sym_swapcase(VALUE sym)
08716 {
08717 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
08718 }
08719
08720
08721
08722
08723
08724
08725
08726
08727 static VALUE
08728 sym_encoding(VALUE sym)
08729 {
08730 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
08731 }
08732
08733 ID
08734 rb_to_id(VALUE name)
08735 {
08736 VALUE tmp;
08737
08738 if (SYMBOL_P(name)) {
08739 return SYM2ID(name);
08740 }
08741 if (!RB_TYPE_P(name, T_STRING)) {
08742 tmp = rb_check_string_type(name);
08743 if (NIL_P(tmp)) {
08744 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
08745 name);
08746 }
08747 name = tmp;
08748 }
08749 return rb_intern_str(name);
08750 }
08751
08752
08753
08754
08755
08756
08757
08758
08759
08760
08761
08762
08763
08764
08765 void
08766 Init_String(void)
08767 {
08768 #undef rb_intern
08769 #define rb_intern(str) rb_intern_const(str)
08770
08771 rb_cString = rb_define_class("String", rb_cObject);
08772 rb_include_module(rb_cString, rb_mComparable);
08773 rb_define_alloc_func(rb_cString, empty_str_alloc);
08774 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
08775 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
08776 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
08777 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
08778 rb_define_method(rb_cString, "==", rb_str_equal, 1);
08779 rb_define_method(rb_cString, "===", rb_str_equal, 1);
08780 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
08781 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
08782 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
08783 rb_define_method(rb_cString, "+", rb_str_plus, 1);
08784 rb_define_method(rb_cString, "*", rb_str_times, 1);
08785 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
08786 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
08787 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
08788 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
08789 rb_define_method(rb_cString, "length", rb_str_length, 0);
08790 rb_define_method(rb_cString, "size", rb_str_length, 0);
08791 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
08792 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
08793 rb_define_method(rb_cString, "=~", rb_str_match, 1);
08794 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
08795 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
08796 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
08797 rb_define_method(rb_cString, "next", rb_str_succ, 0);
08798 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
08799 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
08800 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
08801 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
08802 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
08803 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
08804 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
08805 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
08806 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
08807 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
08808 rb_define_method(rb_cString, "scrub", str_scrub, -1);
08809 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
08810 rb_define_method(rb_cString, "freeze", rb_obj_freeze, 0);
08811
08812 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
08813 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
08814 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
08815 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
08816 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
08817 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
08818
08819 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
08820 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
08821 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
08822 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
08823
08824 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
08825 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
08826 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
08827 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
08828
08829 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
08830 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
08831 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
08832 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
08833 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
08834 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
08835 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
08836 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
08837 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
08838 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
08839 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
08840 rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
08841 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
08842 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
08843 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
08844 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
08845
08846 rb_define_method(rb_cString, "include?", rb_str_include, 1);
08847 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
08848 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
08849
08850 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
08851
08852 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
08853 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
08854 rb_define_method(rb_cString, "center", rb_str_center, -1);
08855
08856 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
08857 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
08858 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
08859 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
08860 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
08861 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
08862 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
08863
08864 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
08865 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
08866 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
08867 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
08868 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
08869 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
08870 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
08871
08872 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
08873 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
08874 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
08875 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
08876 rb_define_method(rb_cString, "count", rb_str_count, -1);
08877
08878 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
08879 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
08880 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
08881 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
08882
08883 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
08884 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
08885 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
08886 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
08887
08888 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
08889
08890 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
08891 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
08892
08893 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
08894 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
08895
08896 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
08897 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
08898 rb_define_method(rb_cString, "b", rb_str_b, 0);
08899 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
08900 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
08901
08902 id_to_s = rb_intern("to_s");
08903
08904 rb_fs = Qnil;
08905 rb_define_variable("$;", &rb_fs);
08906 rb_define_variable("$-F", &rb_fs);
08907
08908 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
08909 rb_include_module(rb_cSymbol, rb_mComparable);
08910 rb_undef_alloc_func(rb_cSymbol);
08911 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
08912 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
08913
08914 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
08915 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
08916 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
08917 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
08918 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
08919 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
08920 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
08921 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
08922 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
08923 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
08924
08925 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
08926 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
08927 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
08928
08929 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
08930 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
08931 rb_define_method(rb_cSymbol, "length", sym_length, 0);
08932 rb_define_method(rb_cSymbol, "size", sym_length, 0);
08933 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
08934 rb_define_method(rb_cSymbol, "match", sym_match, 1);
08935
08936 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
08937 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
08938 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
08939 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
08940
08941 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
08942
08943 if (frozen_strings)
08944 st_foreach(frozen_strings, fstring_set_class_i, rb_cString);
08945 }
08946