00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "vm_core.h"
00018 #include "internal.h"
00019 #include "probes.h"
00020 #include <assert.h>
00021
/*
 * Shorthand accessors for entry +no+ of the beg[] / end[] arrays reached
 * through a pointer variable named `regs`.  The macros do not take `regs`
 * as an argument, so a variable of that name must be in scope wherever
 * they are expanded (presumably regexp match registers -- TODO confirm;
 * they are not used in the visible part of the file).
 */
00022 #define BEG(no) (regs->beg[(no)])
00023 #define END(no) (regs->end[(no)])
00024
00025 #include <math.h>
00026 #include <ctype.h>
00027
00028 #ifdef HAVE_UNISTD_H
00029 #include <unistd.h>
00030 #endif
00031
/*
 * Element count of a true array, cast to int.  Only meaningful for real
 * array objects: applied to a pointer it divides the pointer size by the
 * element size instead.
 */
00032 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00033
00034 #undef rb_str_new_cstr
00035 #undef rb_tainted_str_new_cstr
00036 #undef rb_usascii_str_new_cstr
00037 #undef rb_external_str_new_cstr
00038 #undef rb_locale_str_new_cstr
00039 #undef rb_str_new2
00040 #undef rb_str_new3
00041 #undef rb_str_new4
00042 #undef rb_str_new5
00043 #undef rb_tainted_str_new2
00044 #undef rb_usascii_str_new2
00045 #undef rb_str_dup_frozen
00046 #undef rb_str_buf_new_cstr
00047 #undef rb_str_buf_new2
00048 #undef rb_str_buf_cat2
00049 #undef rb_str_cat2
00050
/* Forward declaration: rb_free_tmp_buffer() calls rb_str_clear() before
 * its definition appears in the file. */
00051 static VALUE rb_str_clear(VALUE str);
00052
/* Global class objects.  rb_cString is the class handed to str_new() by
 * rb_str_new() and to str_alloc() by rb_str_buf_new()/rb_str_resurrect();
 * rb_cSymbol is not referenced in the visible part of the file. */
00053 VALUE rb_cString;
00054 VALUE rb_cSymbol;
00055
/*
 * Per-string flag bits and the predicates built on them.
 *
 *  RUBY_MAX_CHAR_LEN  not referenced in the visible part of the file.
 *  STR_TMPLOCK        when set, str_modifiable() raises
 *                     "can't modify string; temporarily locked".
 *  STR_NOEMBED        when set, the bytes live behind as.heap.ptr; when
 *                     clear the string is "embedded" (see STR_EMBED_P).
 *  STR_SHARED /       flag bits; note that the *_P predicates for sharing
 *  STR_ASSOC          test ELTS_SHARED, not STR_SHARED (presumably the same
 *                     bit -- TODO confirm against the headers).
 *  STR_SHARED_P(s)    true when both STR_NOEMBED and ELTS_SHARED are set.
 *  STR_ASSOC_P(s)     true when both STR_NOEMBED and STR_ASSOC are set.
 *  STR_NOCAPA         mask of the flags copied by rb_str_shared_replace()
 *                     when the source string has no usable capacity field.
 *  STR_NOCAPA_P(s)    true for a non-embedded string that is shared or
 *                     assoc; for those, as.heap.aux holds `shared` instead
 *                     of `capa` (rb_str_capacity() returns as.heap.len).
 *  STR_UNSET_NOCAPA   clears ELTS_SHARED and STR_ASSOC, but only when
 *                     STR_NOEMBED is set.
 */
00056 #define RUBY_MAX_CHAR_LEN 16
00057 #define STR_TMPLOCK FL_USER7
00058 #define STR_NOEMBED FL_USER1
00059 #define STR_SHARED FL_USER2
00060 #define STR_ASSOC FL_USER3
00061 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
00062 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
00063 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
00064 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
00065 #define STR_UNSET_NOCAPA(s) do {\
00066 if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
00067 } while (0)
00068
00069
/*
 * Switching between embedded and heap representation.
 *
 *  STR_SET_NOEMBED(str)      sets STR_NOEMBED and zeroes the embedded-length
 *                            bits in the flags word.
 *  STR_SET_EMBED(str)        clears STR_NOEMBED (the embedded length is NOT
 *                            touched; callers set it separately).
 *  STR_EMBED_P(str)          true when STR_NOEMBED is clear.
 *  STR_SET_EMBED_LEN(str,n)  stores n in the flags word: the bits under
 *                            RSTRING_EMBED_LEN_MASK are cleared, then n
 *                            shifted by RSTRING_EMBED_LEN_SHIFT is OR-ed in.
 *                            n is evaluated once (copied to tmp_n); str is
 *                            evaluated twice.  n is not range-checked here.
 */
00070 #define STR_SET_NOEMBED(str) do {\
00071 FL_SET((str), STR_NOEMBED);\
00072 STR_SET_EMBED_LEN((str), 0);\
00073 } while (0)
00074 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
00075 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
00076 #define STR_SET_EMBED_LEN(str, n) do { \
00077 long tmp_n = (n);\
00078 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00079 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00080 } while (0)
00081
/*
 * Length setters that work for both representations.
 *
 *  STR_SET_LEN(str, n)  embedded: store n via STR_SET_EMBED_LEN;
 *                       otherwise:  assign as.heap.len.
 *  STR_DEC_LEN(str)     decrement the length by one, in the flags word for
 *                       embedded strings or in as.heap.len otherwise.
 *
 * Neither macro writes a terminator or checks bounds; `str` is evaluated
 * more than once.
 */
00082 #define STR_SET_LEN(str, n) do { \
00083 if (STR_EMBED_P(str)) {\
00084 STR_SET_EMBED_LEN((str), (n));\
00085 }\
00086 else {\
00087 RSTRING(str)->as.heap.len = (n);\
00088 }\
00089 } while (0)
00090
00091 #define STR_DEC_LEN(str) do {\
00092 if (STR_EMBED_P(str)) {\
00093 long n = RSTRING_LEN(str);\
00094 n--;\
00095 STR_SET_EMBED_LEN((str), n);\
00096 }\
00097 else {\
00098 RSTRING(str)->as.heap.len--;\
00099 }\
00100 } while (0)
00101
/*
 * RESIZE_CAPA(str, capacity): make room for `capacity` bytes plus one
 * extra byte (the allocations are capacity+1).
 *
 *  - Embedded string, capacity > RSTRING_EMBED_LEN_MAX: allocate a heap
 *    buffer, copy the current RSTRING_LEN(str) bytes into it, then switch
 *    the string to heap representation.  The length is read while the
 *    string is still embedded, i.e. before STR_SET_NOEMBED clears the
 *    embedded-length bits.  No terminator is written by this macro.
 *  - Embedded string, capacity fits: nothing happens.
 *  - Heap string: the buffer is reallocated in place; the capa field is
 *    updated only when the string has one (!STR_NOCAPA_P).
 *
 * Both arguments are evaluated several times, so they must be free of
 * side effects.
 */
00102 #define RESIZE_CAPA(str,capacity) do {\
00103 if (STR_EMBED_P(str)) {\
00104 if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
00105 char *tmp = ALLOC_N(char, (capacity)+1);\
00106 memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
00107 RSTRING(str)->as.heap.ptr = tmp;\
00108 RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
00109 STR_SET_NOEMBED(str);\
00110 RSTRING(str)->as.heap.aux.capa = (capacity);\
00111 }\
00112 }\
00113 else {\
00114 REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
00115 if (!STR_NOCAPA_P(str))\
00116 RSTRING(str)->as.heap.aux.capa = (capacity);\
00117 }\
00118 } while (0)
00119
/*
 * Code-range / encoding conveniences.
 *
 *  is_ascii_string(str)   code range (scanned on demand by
 *                         rb_enc_str_coderange) is ENC_CODERANGE_7BIT.
 *  is_broken_string(str)  code range is ENC_CODERANGE_BROKEN.
 *  STR_ENC_GET(str)       rb_encoding* looked up from the encoding index
 *                         stored on the object.
 */
00120 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00121 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00122
00123 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00124
00125 static inline int
00126 single_byte_optimizable(VALUE str)
00127 {
00128 rb_encoding *enc;
00129
00130
00131 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00132 return 1;
00133
00134 enc = STR_ENC_GET(str);
00135 if (rb_enc_mbmaxlen(enc) == 1)
00136 return 1;
00137
00138
00139
00140 return 0;
00141 }
00142
/* Global VALUE; not referenced in the visible part of the file
 * (presumably the default field separator -- TODO confirm). */
00143 VALUE rb_fs;
00144
/*
 * Find the first byte in [p, e) for which ISASCII() is false.
 *
 * Returns a pointer to that byte, or NULL when every byte is ASCII.
 *
 * When VALUE is 4 or 8 bytes wide and the range is longer than two words,
 * the middle of the range is examined one aligned word at a time by
 * testing the high bit of every byte at once (NONASCII_MASK).  The word
 * scan only narrows the search: the exact byte is always located by the
 * final byte loop.
 *
 * Side effect: defines NONASCII_MASK for the rest of the translation
 * unit; later code tests `#ifdef NONASCII_MASK`.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE *)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE *)(~align_mask & (VALUE)e);

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p)) {
                return p;
            }
        }
        /* aligned middle: stop at the first word with any high bit set */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        p = (const char *)word;
    }
#endif
    /* tail, or the word that tripped the mask, or the whole short range */
    for (; p < e; p++) {
        if (!ISASCII(*p)) {
            return p;
        }
    }
    return NULL;
}
00181
00182 static int
00183 coderange_scan(const char *p, long len, rb_encoding *enc)
00184 {
00185 const char *e = p + len;
00186
00187 if (rb_enc_to_index(enc) == 0) {
00188
00189 p = search_nonascii(p, e);
00190 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00191 }
00192
00193 if (rb_enc_asciicompat(enc)) {
00194 p = search_nonascii(p, e);
00195 if (!p) {
00196 return ENC_CODERANGE_7BIT;
00197 }
00198 while (p < e) {
00199 int ret = rb_enc_precise_mbclen(p, e, enc);
00200 if (!MBCLEN_CHARFOUND_P(ret)) {
00201 return ENC_CODERANGE_BROKEN;
00202 }
00203 p += MBCLEN_CHARFOUND_LEN(ret);
00204 if (p < e) {
00205 p = search_nonascii(p, e);
00206 if (!p) {
00207 return ENC_CODERANGE_VALID;
00208 }
00209 }
00210 }
00211 if (e < p) {
00212 return ENC_CODERANGE_BROKEN;
00213 }
00214 return ENC_CODERANGE_VALID;
00215 }
00216
00217 while (p < e) {
00218 int ret = rb_enc_precise_mbclen(p, e, enc);
00219
00220 if (!MBCLEN_CHARFOUND_P(ret)) {
00221 return ENC_CODERANGE_BROKEN;
00222 }
00223 p += MBCLEN_CHARFOUND_LEN(ret);
00224 }
00225 if (e < p) {
00226 return ENC_CODERANGE_BROKEN;
00227 }
00228 return ENC_CODERANGE_VALID;
00229 }
00230
00231 long
00232 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00233 {
00234 const char *p = s;
00235
00236 if (*cr == ENC_CODERANGE_BROKEN)
00237 return e - s;
00238
00239 if (rb_enc_to_index(enc) == 0) {
00240
00241 p = search_nonascii(p, e);
00242 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00243 return e - s;
00244 }
00245 else if (rb_enc_asciicompat(enc)) {
00246 p = search_nonascii(p, e);
00247 if (!p) {
00248 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00249 return e - s;
00250 }
00251 while (p < e) {
00252 int ret = rb_enc_precise_mbclen(p, e, enc);
00253 if (!MBCLEN_CHARFOUND_P(ret)) {
00254 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00255 return p - s;
00256 }
00257 p += MBCLEN_CHARFOUND_LEN(ret);
00258 if (p < e) {
00259 p = search_nonascii(p, e);
00260 if (!p) {
00261 *cr = ENC_CODERANGE_VALID;
00262 return e - s;
00263 }
00264 }
00265 }
00266 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00267 return p - s;
00268 }
00269 else {
00270 while (p < e) {
00271 int ret = rb_enc_precise_mbclen(p, e, enc);
00272 if (!MBCLEN_CHARFOUND_P(ret)) {
00273 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00274 return p - s;
00275 }
00276 p += MBCLEN_CHARFOUND_LEN(ret);
00277 }
00278 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00279 return p - s;
00280 }
00281 }
00282
00283 static inline void
00284 str_enc_copy(VALUE str1, VALUE str2)
00285 {
00286 rb_enc_set_index(str1, ENCODING_GET(str2));
00287 }
00288
00289 static void
00290 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00291 {
00292
00293
00294
00295 str_enc_copy(dest, src);
00296 if (RSTRING_LEN(dest) == 0) {
00297 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00298 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299 else
00300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301 return;
00302 }
00303 switch (ENC_CODERANGE(src)) {
00304 case ENC_CODERANGE_7BIT:
00305 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00306 break;
00307 case ENC_CODERANGE_VALID:
00308 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00309 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00311 else
00312 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00313 break;
00314 default:
00315 break;
00316 }
00317 }
00318
00319 static void
00320 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00321 {
00322 str_enc_copy(dest, src);
00323 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00324 }
00325
00326 int
00327 rb_enc_str_coderange(VALUE str)
00328 {
00329 int cr = ENC_CODERANGE(str);
00330
00331 if (cr == ENC_CODERANGE_UNKNOWN) {
00332 rb_encoding *enc = STR_ENC_GET(str);
00333 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00334 ENC_CODERANGE_SET(str, cr);
00335 }
00336 return cr;
00337 }
00338
00339 int
00340 rb_enc_str_asciionly_p(VALUE str)
00341 {
00342 rb_encoding *enc = STR_ENC_GET(str);
00343
00344 if (!rb_enc_asciicompat(enc))
00345 return FALSE;
00346 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00347 return TRUE;
00348 return FALSE;
00349 }
00350
00351 static inline void
00352 str_mod_check(VALUE s, const char *p, long len)
00353 {
00354 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00355 rb_raise(rb_eRuntimeError, "string modified");
00356 }
00357 }
00358
00359 size_t
00360 rb_str_capacity(VALUE str)
00361 {
00362 if (STR_EMBED_P(str)) {
00363 return RSTRING_EMBED_LEN_MAX;
00364 }
00365 else if (STR_NOCAPA_P(str)) {
00366 return RSTRING(str)->as.heap.len;
00367 }
00368 else {
00369 return RSTRING(str)->as.heap.aux.capa;
00370 }
00371 }
00372
00373 static inline VALUE
00374 str_alloc(VALUE klass)
00375 {
00376 NEWOBJ_OF(str, struct RString, klass, T_STRING);
00377
00378 str->as.heap.ptr = 0;
00379 str->as.heap.len = 0;
00380 str->as.heap.aux.capa = 0;
00381
00382 return (VALUE)str;
00383 }
00384
00385 static inline VALUE
00386 empty_str_alloc(VALUE klass)
00387 {
00388 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00389 RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
00390 }
00391 return str_alloc(klass);
00392 }
00393
00394 static VALUE
00395 str_new(VALUE klass, const char *ptr, long len)
00396 {
00397 VALUE str;
00398
00399 if (len < 0) {
00400 rb_raise(rb_eArgError, "negative string size (or size too big)");
00401 }
00402
00403 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00404 RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
00405 }
00406
00407 str = str_alloc(klass);
00408 if (len > RSTRING_EMBED_LEN_MAX) {
00409 RSTRING(str)->as.heap.aux.capa = len;
00410 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00411 STR_SET_NOEMBED(str);
00412 }
00413 else if (len == 0) {
00414 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00415 }
00416 if (ptr) {
00417 memcpy(RSTRING_PTR(str), ptr, len);
00418 }
00419 STR_SET_LEN(str, len);
00420 RSTRING_PTR(str)[len] = '\0';
00421 return str;
00422 }
00423
00424 VALUE
00425 rb_str_new(const char *ptr, long len)
00426 {
00427 return str_new(rb_cString, ptr, len);
00428 }
00429
00430 VALUE
00431 rb_usascii_str_new(const char *ptr, long len)
00432 {
00433 VALUE str = rb_str_new(ptr, len);
00434 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00435 return str;
00436 }
00437
00438 VALUE
00439 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00440 {
00441 VALUE str = rb_str_new(ptr, len);
00442 rb_enc_associate(str, enc);
00443 return str;
00444 }
00445
00446 VALUE
00447 rb_str_new_cstr(const char *ptr)
00448 {
00449 if (!ptr) {
00450 rb_raise(rb_eArgError, "NULL pointer given");
00451 }
00452 return rb_str_new(ptr, strlen(ptr));
00453 }
00454
00455 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00456 #define rb_str_new2 rb_str_new_cstr
00457
00458 VALUE
00459 rb_usascii_str_new_cstr(const char *ptr)
00460 {
00461 VALUE str = rb_str_new2(ptr);
00462 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00463 return str;
00464 }
00465
00466 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00467 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00468
00469 VALUE
00470 rb_tainted_str_new(const char *ptr, long len)
00471 {
00472 VALUE str = rb_str_new(ptr, len);
00473
00474 OBJ_TAINT(str);
00475 return str;
00476 }
00477
00478 VALUE
00479 rb_tainted_str_new_cstr(const char *ptr)
00480 {
00481 VALUE str = rb_str_new2(ptr);
00482
00483 OBJ_TAINT(str);
00484 return str;
00485 }
00486
00487 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00488 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00489
00490 VALUE
00491 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00492 {
00493 extern VALUE rb_cEncodingConverter;
00494 rb_econv_t *ec;
00495 rb_econv_result_t ret;
00496 long len, olen;
00497 VALUE econv_wrapper;
00498 VALUE newstr;
00499 const unsigned char *start, *sp;
00500 unsigned char *dest, *dp;
00501 size_t converted_output = 0;
00502
00503 if (!to) return str;
00504 if (!from) from = rb_enc_get(str);
00505 if (from == to) return str;
00506 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00507 to == rb_ascii8bit_encoding()) {
00508 if (STR_ENC_GET(str) != to) {
00509 str = rb_str_dup(str);
00510 rb_enc_associate(str, to);
00511 }
00512 return str;
00513 }
00514
00515 len = RSTRING_LEN(str);
00516 newstr = rb_str_new(0, len);
00517 olen = len;
00518
00519 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
00520 RBASIC(econv_wrapper)->klass = 0;
00521 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00522 if (!ec) return str;
00523 DATA_PTR(econv_wrapper) = ec;
00524
00525 sp = (unsigned char*)RSTRING_PTR(str);
00526 start = sp;
00527 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
00528 (dp = dest + converted_output),
00529 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
00530 ret == econv_destination_buffer_full) {
00531
00532 size_t converted_input = sp - start;
00533 size_t rest = len - converted_input;
00534 converted_output = dp - dest;
00535 rb_str_set_len(newstr, converted_output);
00536 if (converted_input && converted_output &&
00537 rest < (LONG_MAX / converted_output)) {
00538 rest = (rest * converted_output) / converted_input;
00539 }
00540 else {
00541 rest = olen;
00542 }
00543 olen += rest < 2 ? 2 : rest;
00544 rb_str_resize(newstr, olen);
00545 }
00546 DATA_PTR(econv_wrapper) = 0;
00547 rb_econv_close(ec);
00548 rb_gc_force_recycle(econv_wrapper);
00549 switch (ret) {
00550 case econv_finished:
00551 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00552 rb_str_set_len(newstr, len);
00553 rb_enc_associate(newstr, to);
00554 return newstr;
00555
00556 default:
00557
00558 return str;
00559 }
00560 }
00561
00562 VALUE
00563 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00564 {
00565 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00566 }
00567
00568 VALUE
00569 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00570 {
00571 VALUE str;
00572
00573 str = rb_tainted_str_new(ptr, len);
00574 if (eenc == rb_usascii_encoding() &&
00575 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00576 rb_enc_associate(str, rb_ascii8bit_encoding());
00577 return str;
00578 }
00579 rb_enc_associate(str, eenc);
00580 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00581 }
00582
00583 VALUE
00584 rb_external_str_new(const char *ptr, long len)
00585 {
00586 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00587 }
00588
00589 VALUE
00590 rb_external_str_new_cstr(const char *ptr)
00591 {
00592 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00593 }
00594
00595 VALUE
00596 rb_locale_str_new(const char *ptr, long len)
00597 {
00598 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00599 }
00600
00601 VALUE
00602 rb_locale_str_new_cstr(const char *ptr)
00603 {
00604 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00605 }
00606
00607 VALUE
00608 rb_filesystem_str_new(const char *ptr, long len)
00609 {
00610 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00611 }
00612
00613 VALUE
00614 rb_filesystem_str_new_cstr(const char *ptr)
00615 {
00616 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00617 }
00618
00619 VALUE
00620 rb_str_export(VALUE str)
00621 {
00622 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00623 }
00624
00625 VALUE
00626 rb_str_export_locale(VALUE str)
00627 {
00628 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00629 }
00630
00631 VALUE
00632 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00633 {
00634 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00635 }
00636
00637 static VALUE
00638 str_replace_shared_without_enc(VALUE str2, VALUE str)
00639 {
00640 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00641 STR_SET_EMBED(str2);
00642 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00643 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00644 }
00645 else {
00646 str = rb_str_new_frozen(str);
00647 FL_SET(str2, STR_NOEMBED);
00648 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00649 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00650 RSTRING(str2)->as.heap.aux.shared = str;
00651 FL_SET(str2, ELTS_SHARED);
00652 }
00653 return str2;
00654 }
00655
00656 static VALUE
00657 str_replace_shared(VALUE str2, VALUE str)
00658 {
00659 str_replace_shared_without_enc(str2, str);
00660 rb_enc_cr_str_exact_copy(str2, str);
00661 return str2;
00662 }
00663
00664 static VALUE
00665 str_new_shared(VALUE klass, VALUE str)
00666 {
00667 return str_replace_shared(str_alloc(klass), str);
00668 }
00669
00670 static VALUE
00671 str_new3(VALUE klass, VALUE str)
00672 {
00673 return str_new_shared(klass, str);
00674 }
00675
00676 VALUE
00677 rb_str_new_shared(VALUE str)
00678 {
00679 VALUE str2 = str_new3(rb_obj_class(str), str);
00680
00681 OBJ_INFECT(str2, str);
00682 return str2;
00683 }
00684
00685 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00686 #define rb_str_new3 rb_str_new_shared
00687
00688 static VALUE
00689 str_new4(VALUE klass, VALUE str)
00690 {
00691 VALUE str2;
00692
00693 str2 = str_alloc(klass);
00694 STR_SET_NOEMBED(str2);
00695 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00696 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00697 if (STR_SHARED_P(str)) {
00698 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00699 assert(OBJ_FROZEN(shared));
00700 FL_SET(str2, ELTS_SHARED);
00701 RSTRING(str2)->as.heap.aux.shared = shared;
00702 }
00703 else {
00704 FL_SET(str, ELTS_SHARED);
00705 RSTRING(str)->as.heap.aux.shared = str2;
00706 }
00707 rb_enc_cr_str_exact_copy(str2, str);
00708 OBJ_INFECT(str2, str);
00709 return str2;
00710 }
00711
00712 VALUE
00713 rb_str_new_frozen(VALUE orig)
00714 {
00715 VALUE klass, str;
00716
00717 if (OBJ_FROZEN(orig)) return orig;
00718 klass = rb_obj_class(orig);
00719 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00720 long ofs;
00721 assert(OBJ_FROZEN(str));
00722 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00723 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00724 ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
00725 ENCODING_GET(str) != ENCODING_GET(orig)) {
00726 str = str_new3(klass, str);
00727 RSTRING(str)->as.heap.ptr += ofs;
00728 RSTRING(str)->as.heap.len -= ofs;
00729 rb_enc_cr_str_exact_copy(str, orig);
00730 OBJ_INFECT(str, orig);
00731 }
00732 }
00733 else if (STR_EMBED_P(orig)) {
00734 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00735 rb_enc_cr_str_exact_copy(str, orig);
00736 OBJ_INFECT(str, orig);
00737 }
00738 else if (STR_ASSOC_P(orig)) {
00739 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00740 FL_UNSET(orig, STR_ASSOC);
00741 str = str_new4(klass, orig);
00742 FL_SET(str, STR_ASSOC);
00743 RSTRING(str)->as.heap.aux.shared = assoc;
00744 }
00745 else {
00746 str = str_new4(klass, orig);
00747 }
00748 OBJ_FREEZE(str);
00749 return str;
00750 }
00751
00752 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00753 #define rb_str_new4 rb_str_new_frozen
00754
00755 VALUE
00756 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00757 {
00758 return str_new(rb_obj_class(obj), ptr, len);
00759 }
00760
00761 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00762 rb_str_new_with_class, (obj, ptr, len))
00763 #define rb_str_new5 rb_str_new_with_class
00764
00765 static VALUE
00766 str_new_empty(VALUE str)
00767 {
00768 VALUE v = rb_str_new5(str, 0, 0);
00769 rb_enc_copy(v, str);
00770 OBJ_INFECT(v, str);
00771 return v;
00772 }
00773
00774 #define STR_BUF_MIN_SIZE 128
00775
00776 VALUE
00777 rb_str_buf_new(long capa)
00778 {
00779 VALUE str = str_alloc(rb_cString);
00780
00781 if (capa < STR_BUF_MIN_SIZE) {
00782 capa = STR_BUF_MIN_SIZE;
00783 }
00784 FL_SET(str, STR_NOEMBED);
00785 RSTRING(str)->as.heap.aux.capa = capa;
00786 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00787 RSTRING(str)->as.heap.ptr[0] = '\0';
00788
00789 return str;
00790 }
00791
00792 VALUE
00793 rb_str_buf_new_cstr(const char *ptr)
00794 {
00795 VALUE str;
00796 long len = strlen(ptr);
00797
00798 str = rb_str_buf_new(len);
00799 rb_str_buf_cat(str, ptr, len);
00800
00801 return str;
00802 }
00803
00804 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00805 #define rb_str_buf_new2 rb_str_buf_new_cstr
00806
00807 VALUE
00808 rb_str_tmp_new(long len)
00809 {
00810 return str_new(0, 0, len);
00811 }
00812
00813 void *
00814 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00815 {
00816 VALUE s = rb_str_tmp_new(len);
00817 *store = s;
00818 return RSTRING_PTR(s);
00819 }
00820
00821 void
00822 rb_free_tmp_buffer(volatile VALUE *store)
00823 {
00824 VALUE s = *store;
00825 *store = 0;
00826 if (s) rb_str_clear(s);
00827 }
00828
00829 void
00830 rb_str_free(VALUE str)
00831 {
00832 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00833 xfree(RSTRING(str)->as.heap.ptr);
00834 }
00835 }
00836
00837 RUBY_FUNC_EXPORTED size_t
00838 rb_str_memsize(VALUE str)
00839 {
00840 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00841 return RSTRING(str)->as.heap.aux.capa + 1;
00842 }
00843 else {
00844 return 0;
00845 }
00846 }
00847
00848 VALUE
00849 rb_str_to_str(VALUE str)
00850 {
00851 return rb_convert_type(str, T_STRING, "String", "to_str");
00852 }
00853
00854 static inline void str_discard(VALUE str);
00855
00856 void
00857 rb_str_shared_replace(VALUE str, VALUE str2)
00858 {
00859 rb_encoding *enc;
00860 int cr;
00861 if (str == str2) return;
00862 enc = STR_ENC_GET(str2);
00863 cr = ENC_CODERANGE(str2);
00864 str_discard(str);
00865 OBJ_INFECT(str, str2);
00866 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00867 STR_SET_EMBED(str);
00868 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00869 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00870 rb_enc_associate(str, enc);
00871 ENC_CODERANGE_SET(str, cr);
00872 return;
00873 }
00874 STR_SET_NOEMBED(str);
00875 STR_UNSET_NOCAPA(str);
00876 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00877 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00878 if (STR_NOCAPA_P(str2)) {
00879 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00880 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00881 }
00882 else {
00883 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00884 }
00885 STR_SET_EMBED(str2);
00886 RSTRING_PTR(str2)[0] = 0;
00887 STR_SET_EMBED_LEN(str2, 0);
00888 rb_enc_associate(str, enc);
00889 ENC_CODERANGE_SET(str, cr);
00890 }
00891
00892 static ID id_to_s;
00893
00894 VALUE
00895 rb_obj_as_string(VALUE obj)
00896 {
00897 VALUE str;
00898
00899 if (RB_TYPE_P(obj, T_STRING)) {
00900 return obj;
00901 }
00902 str = rb_funcall(obj, id_to_s, 0);
00903 if (!RB_TYPE_P(str, T_STRING))
00904 return rb_any_to_s(obj);
00905 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00906 return str;
00907 }
00908
00909 static VALUE
00910 str_replace(VALUE str, VALUE str2)
00911 {
00912 long len;
00913
00914 len = RSTRING_LEN(str2);
00915 if (STR_ASSOC_P(str2)) {
00916 str2 = rb_str_new4(str2);
00917 }
00918 if (STR_SHARED_P(str2)) {
00919 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00920 assert(OBJ_FROZEN(shared));
00921 STR_SET_NOEMBED(str);
00922 RSTRING(str)->as.heap.len = len;
00923 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00924 FL_SET(str, ELTS_SHARED);
00925 FL_UNSET(str, STR_ASSOC);
00926 RSTRING(str)->as.heap.aux.shared = shared;
00927 }
00928 else {
00929 str_replace_shared(str, str2);
00930 }
00931
00932 OBJ_INFECT(str, str2);
00933 rb_enc_cr_str_exact_copy(str, str2);
00934 return str;
00935 }
00936
00937 static VALUE
00938 str_duplicate(VALUE klass, VALUE str)
00939 {
00940 VALUE dup = str_alloc(klass);
00941 str_replace(dup, str);
00942 return dup;
00943 }
00944
00945 VALUE
00946 rb_str_dup(VALUE str)
00947 {
00948 return str_duplicate(rb_obj_class(str), str);
00949 }
00950
00951 VALUE
00952 rb_str_resurrect(VALUE str)
00953 {
00954 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00955 RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
00956 rb_sourcefile(), rb_sourceline());
00957 }
00958 return str_replace(str_alloc(rb_cString), str);
00959 }
00960
00961
00962
00963
00964
00965
00966
00967
00968 static VALUE
00969 rb_str_init(int argc, VALUE *argv, VALUE str)
00970 {
00971 VALUE orig;
00972
00973 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00974 rb_str_replace(str, orig);
00975 return str;
00976 }
00977
00978 static inline long
00979 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00980 {
00981 long c;
00982 const char *q;
00983
00984 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00985 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00986 }
00987 else if (rb_enc_asciicompat(enc)) {
00988 c = 0;
00989 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00990 while (p < e) {
00991 if (ISASCII(*p)) {
00992 q = search_nonascii(p, e);
00993 if (!q)
00994 return c + (e - p);
00995 c += q - p;
00996 p = q;
00997 }
00998 p += rb_enc_fast_mbclen(p, e, enc);
00999 c++;
01000 }
01001 }
01002 else {
01003 while (p < e) {
01004 if (ISASCII(*p)) {
01005 q = search_nonascii(p, e);
01006 if (!q)
01007 return c + (e - p);
01008 c += q - p;
01009 p = q;
01010 }
01011 p += rb_enc_mbclen(p, e, enc);
01012 c++;
01013 }
01014 }
01015 return c;
01016 }
01017
01018 for (c=0; p<e; c++) {
01019 p += rb_enc_mbclen(p, e, enc);
01020 }
01021 return c;
01022 }
01023
01024 long
01025 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
01026 {
01027 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
01028 }
01029
01030 long
01031 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
01032 {
01033 long c;
01034 const char *q;
01035 int ret;
01036
01037 *cr = 0;
01038 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01039 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
01040 }
01041 else if (rb_enc_asciicompat(enc)) {
01042 c = 0;
01043 while (p < e) {
01044 if (ISASCII(*p)) {
01045 q = search_nonascii(p, e);
01046 if (!q) {
01047 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01048 return c + (e - p);
01049 }
01050 c += q - p;
01051 p = q;
01052 }
01053 ret = rb_enc_precise_mbclen(p, e, enc);
01054 if (MBCLEN_CHARFOUND_P(ret)) {
01055 *cr |= ENC_CODERANGE_VALID;
01056 p += MBCLEN_CHARFOUND_LEN(ret);
01057 }
01058 else {
01059 *cr = ENC_CODERANGE_BROKEN;
01060 p++;
01061 }
01062 c++;
01063 }
01064 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01065 return c;
01066 }
01067
01068 for (c=0; p<e; c++) {
01069 ret = rb_enc_precise_mbclen(p, e, enc);
01070 if (MBCLEN_CHARFOUND_P(ret)) {
01071 *cr |= ENC_CODERANGE_VALID;
01072 p += MBCLEN_CHARFOUND_LEN(ret);
01073 }
01074 else {
01075 *cr = ENC_CODERANGE_BROKEN;
01076 if (p + rb_enc_mbminlen(enc) <= e)
01077 p += rb_enc_mbminlen(enc);
01078 else
01079 p = e;
01080 }
01081 }
01082 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01083 return c;
01084 }
01085
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx, i.e. every byte
 * that is not a UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Count, for the whole word at +s+, the bytes that are not UTF-8
 * continuation bytes (same test as is_utf8_lead_byte, applied to all
 * bytes of the word in parallel).
 *
 * Step 1 leaves a 1 in the lowest bit of each byte lane that qualifies:
 * within a byte, (b | ~(b >> 1)) has bit 6 set exactly when bit 6 of b is
 * set or bit 7 of b is clear; shifting right by 6 moves that bit to the
 * bottom of the lane and the mask (0x01 in every lane) discards the rest.
 * Step 2 adds the lanes together by repeated folding; the total is at
 * most sizeof(VALUE) and therefore fits in the low four bits.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    const VALUE word = *s;
    VALUE lanes;

    /* step 1: one flag bit per byte lane */
    lanes = ((word | ~(word >> 1)) >> 6) & (NONASCII_MASK >> 7);

    /* step 2: horizontal sum of the lanes */
    lanes += (lanes >> 8);
    lanes += (lanes >> 16);
#if SIZEOF_VALUE == 8
    lanes += (lanes >> 32);
#endif
    return (lanes & 0xF);
}
#endif
01120
01121 static long
01122 str_strlen(VALUE str, rb_encoding *enc)
01123 {
01124 const char *p, *e;
01125 long n;
01126 int cr;
01127
01128 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01129 if (!enc) enc = STR_ENC_GET(str);
01130 p = RSTRING_PTR(str);
01131 e = RSTRING_END(str);
01132 cr = ENC_CODERANGE(str);
01133 #ifdef NONASCII_MASK
01134 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01135 enc == rb_utf8_encoding()) {
01136
01137 VALUE len = 0;
01138 if ((int)sizeof(VALUE) * 2 < e - p) {
01139 const VALUE *s, *t;
01140 const VALUE lowbits = sizeof(VALUE) - 1;
01141 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01142 t = (const VALUE*)(~lowbits & (VALUE)e);
01143 while (p < (const char *)s) {
01144 if (is_utf8_lead_byte(*p)) len++;
01145 p++;
01146 }
01147 while (s < t) {
01148 len += count_utf8_lead_bytes_with_word(s);
01149 s++;
01150 }
01151 p = (const char *)s;
01152 }
01153 while (p < e) {
01154 if (is_utf8_lead_byte(*p)) len++;
01155 p++;
01156 }
01157 return (long)len;
01158 }
01159 #endif
01160 n = rb_enc_strlen_cr(p, e, enc, &cr);
01161 if (cr) {
01162 ENC_CODERANGE_SET(str, cr);
01163 }
01164 return n;
01165 }
01166
01167 long
01168 rb_str_strlen(VALUE str)
01169 {
01170 return str_strlen(str, STR_ENC_GET(str));
01171 }
01172
01173
01174
01175
01176
01177
01178
01179
01180
01181 VALUE
01182 rb_str_length(VALUE str)
01183 {
01184 long len;
01185
01186 len = str_strlen(str, STR_ENC_GET(str));
01187 return LONG2NUM(len);
01188 }
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199
01200 static VALUE
01201 rb_str_bytesize(VALUE str)
01202 {
01203 return LONG2NUM(RSTRING_LEN(str));
01204 }
01205
01206
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217 static VALUE
01218 rb_str_empty(VALUE str)
01219 {
01220 if (RSTRING_LEN(str) == 0)
01221 return Qtrue;
01222 return Qfalse;
01223 }
01224
01225
01226
01227
01228
01229
01230
01231
01232
01233
01234
01235 VALUE
01236 rb_str_plus(VALUE str1, VALUE str2)
01237 {
01238 VALUE str3;
01239 rb_encoding *enc;
01240
01241 StringValue(str2);
01242 enc = rb_enc_check(str1, str2);
01243 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01244 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01245 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01246 RSTRING_PTR(str2), RSTRING_LEN(str2));
01247 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01248
01249 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01250 OBJ_TAINT(str3);
01251 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01252 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01253 return str3;
01254 }
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266
01267 VALUE
01268 rb_str_times(VALUE str, VALUE times)
01269 {
01270 VALUE str2;
01271 long n, len;
01272 char *ptr2;
01273
01274 len = NUM2LONG(times);
01275 if (len < 0) {
01276 rb_raise(rb_eArgError, "negative argument");
01277 }
01278 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01279 rb_raise(rb_eArgError, "argument too big");
01280 }
01281
01282 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01283 ptr2 = RSTRING_PTR(str2);
01284 if (len) {
01285 n = RSTRING_LEN(str);
01286 memcpy(ptr2, RSTRING_PTR(str), n);
01287 while (n <= len/2) {
01288 memcpy(ptr2 + n, ptr2, n);
01289 n *= 2;
01290 }
01291 memcpy(ptr2 + n, ptr2, len-n);
01292 }
01293 ptr2[RSTRING_LEN(str2)] = '\0';
01294 OBJ_INFECT(str2, str);
01295 rb_enc_cr_str_copy_for_substr(str2, str);
01296
01297 return str2;
01298 }
01299
01300
01301
01302
01303
01304
01305
01306
01307
01308
01309
01310
01311
01312
01313
01314
01315 static VALUE
01316 rb_str_format_m(VALUE str, VALUE arg)
01317 {
01318 volatile VALUE tmp = rb_check_array_type(arg);
01319
01320 if (!NIL_P(tmp)) {
01321 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01322 }
01323 return rb_str_format(1, &arg, str);
01324 }
01325
01326 static inline void
01327 str_modifiable(VALUE str)
01328 {
01329 if (FL_TEST(str, STR_TMPLOCK)) {
01330 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01331 }
01332 rb_check_frozen(str);
01333 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01334 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01335 }
01336
01337 static inline int
01338 str_independent(VALUE str)
01339 {
01340 str_modifiable(str);
01341 if (!STR_SHARED_P(str)) return 1;
01342 if (STR_EMBED_P(str)) return 1;
01343 return 0;
01344 }
01345
01346 static void
01347 str_make_independent_expand(VALUE str, long expand)
01348 {
01349 char *ptr;
01350 long len = RSTRING_LEN(str);
01351 long capa = len + expand;
01352
01353 if (len > capa) len = capa;
01354 ptr = ALLOC_N(char, capa + 1);
01355 if (RSTRING_PTR(str)) {
01356 memcpy(ptr, RSTRING_PTR(str), len);
01357 }
01358 STR_SET_NOEMBED(str);
01359 STR_UNSET_NOCAPA(str);
01360 ptr[len] = 0;
01361 RSTRING(str)->as.heap.ptr = ptr;
01362 RSTRING(str)->as.heap.len = len;
01363 RSTRING(str)->as.heap.aux.capa = capa;
01364 }
01365
01366 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01367
01368 void
01369 rb_str_modify(VALUE str)
01370 {
01371 if (!str_independent(str))
01372 str_make_independent(str);
01373 ENC_CODERANGE_CLEAR(str);
01374 }
01375
01376 void
01377 rb_str_modify_expand(VALUE str, long expand)
01378 {
01379 if (expand < 0) {
01380 rb_raise(rb_eArgError, "negative expanding string size");
01381 }
01382 if (!str_independent(str)) {
01383 str_make_independent_expand(str, expand);
01384 }
01385 else if (expand > 0) {
01386 long len = RSTRING_LEN(str);
01387 long capa = len + expand;
01388 if (!STR_EMBED_P(str)) {
01389 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01390 STR_UNSET_NOCAPA(str);
01391 RSTRING(str)->as.heap.aux.capa = capa;
01392 }
01393 else if (capa > RSTRING_EMBED_LEN_MAX) {
01394 str_make_independent_expand(str, expand);
01395 }
01396 }
01397 ENC_CODERANGE_CLEAR(str);
01398 }
01399
01400
01401 static void
01402 str_modify_keep_cr(VALUE str)
01403 {
01404 if (!str_independent(str))
01405 str_make_independent(str);
01406 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01407
01408 ENC_CODERANGE_CLEAR(str);
01409 }
01410
01411 static inline void
01412 str_discard(VALUE str)
01413 {
01414 str_modifiable(str);
01415 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01416 xfree(RSTRING_PTR(str));
01417 RSTRING(str)->as.heap.ptr = 0;
01418 RSTRING(str)->as.heap.len = 0;
01419 }
01420 }
01421
01422 void
01423 rb_str_associate(VALUE str, VALUE add)
01424 {
01425
01426 rb_check_frozen(str);
01427 if (STR_ASSOC_P(str)) {
01428
01429 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01430 }
01431 else {
01432 if (STR_SHARED_P(str)) {
01433 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01434 str_make_independent(str);
01435 if (STR_ASSOC_P(assoc)) {
01436 assoc = RSTRING(assoc)->as.heap.aux.shared;
01437 rb_ary_concat(assoc, add);
01438 add = assoc;
01439 }
01440 }
01441 else if (STR_EMBED_P(str)) {
01442 str_make_independent(str);
01443 }
01444 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01445 RESIZE_CAPA(str, RSTRING_LEN(str));
01446 }
01447 FL_SET(str, STR_ASSOC);
01448 RBASIC(add)->klass = 0;
01449 RSTRING(str)->as.heap.aux.shared = add;
01450 }
01451 }
01452
01453 VALUE
01454 rb_str_associated(VALUE str)
01455 {
01456 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01457 if (STR_ASSOC_P(str)) {
01458 return RSTRING(str)->as.heap.aux.shared;
01459 }
01460 return Qfalse;
01461 }
01462
01463 void
01464 rb_must_asciicompat(VALUE str)
01465 {
01466 rb_encoding *enc = rb_enc_get(str);
01467 if (!rb_enc_asciicompat(enc)) {
01468 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
01469 }
01470 }
01471
01472 VALUE
01473 rb_string_value(volatile VALUE *ptr)
01474 {
01475 VALUE s = *ptr;
01476 if (!RB_TYPE_P(s, T_STRING)) {
01477 s = rb_str_to_str(s);
01478 *ptr = s;
01479 }
01480 return s;
01481 }
01482
01483 char *
01484 rb_string_value_ptr(volatile VALUE *ptr)
01485 {
01486 VALUE str = rb_string_value(ptr);
01487 return RSTRING_PTR(str);
01488 }
01489
01490 char *
01491 rb_string_value_cstr(volatile VALUE *ptr)
01492 {
01493 VALUE str = rb_string_value(ptr);
01494 char *s = RSTRING_PTR(str);
01495 long len = RSTRING_LEN(str);
01496
01497 if (!s || memchr(s, 0, len)) {
01498 rb_raise(rb_eArgError, "string contains null byte");
01499 }
01500 if (s[len]) {
01501 rb_str_modify(str);
01502 s = RSTRING_PTR(str);
01503 s[RSTRING_LEN(str)] = 0;
01504 }
01505 return s;
01506 }
01507
01508 VALUE
01509 rb_check_string_type(VALUE str)
01510 {
01511 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01512 return str;
01513 }
01514
01515
01516
01517
01518
01519
01520
01521
01522
01523
01524
01525
01526 static VALUE
01527 rb_str_s_try_convert(VALUE dummy, VALUE str)
01528 {
01529 return rb_check_string_type(str);
01530 }
01531
01532 static char*
01533 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01534 {
01535 long nth = *nthp;
01536 if (rb_enc_mbmaxlen(enc) == 1) {
01537 p += nth;
01538 }
01539 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01540 p += nth * rb_enc_mbmaxlen(enc);
01541 }
01542 else if (rb_enc_asciicompat(enc)) {
01543 const char *p2, *e2;
01544 int n;
01545
01546 while (p < e && 0 < nth) {
01547 e2 = p + nth;
01548 if (e < e2) {
01549 *nthp = nth;
01550 return (char *)e;
01551 }
01552 if (ISASCII(*p)) {
01553 p2 = search_nonascii(p, e2);
01554 if (!p2) {
01555 nth -= e2 - p;
01556 *nthp = nth;
01557 return (char *)e2;
01558 }
01559 nth -= p2 - p;
01560 p = p2;
01561 }
01562 n = rb_enc_mbclen(p, e, enc);
01563 p += n;
01564 nth--;
01565 }
01566 *nthp = nth;
01567 if (nth != 0) {
01568 return (char *)e;
01569 }
01570 return (char *)p;
01571 }
01572 else {
01573 while (p < e && nth--) {
01574 p += rb_enc_mbclen(p, e, enc);
01575 }
01576 }
01577 if (p > e) p = e;
01578 *nthp = nth;
01579 return (char*)p;
01580 }
01581
01582 char*
01583 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01584 {
01585 return str_nth_len(p, e, &nth, enc);
01586 }
01587
01588 static char*
01589 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01590 {
01591 if (singlebyte)
01592 p += nth;
01593 else {
01594 p = str_nth_len(p, e, &nth, enc);
01595 }
01596 if (!p) return 0;
01597 if (p > e) p = e;
01598 return (char *)p;
01599 }
01600
01601
01602 static long
01603 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01604 {
01605 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01606 if (!pp) return e - p;
01607 return pp - p;
01608 }
01609
01610 long
01611 rb_str_offset(VALUE str, long pos)
01612 {
01613 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01614 STR_ENC_GET(str), single_byte_optimizable(str));
01615 }
01616
01617 #ifdef NONASCII_MASK
01618 static char *
01619 str_utf8_nth(const char *p, const char *e, long *nthp)
01620 {
01621 long nth = *nthp;
01622 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01623 const VALUE *s, *t;
01624 const VALUE lowbits = sizeof(VALUE) - 1;
01625 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01626 t = (const VALUE*)(~lowbits & (VALUE)e);
01627 while (p < (const char *)s) {
01628 if (is_utf8_lead_byte(*p)) nth--;
01629 p++;
01630 }
01631 do {
01632 nth -= count_utf8_lead_bytes_with_word(s);
01633 s++;
01634 } while (s < t && (int)sizeof(VALUE) <= nth);
01635 p = (char *)s;
01636 }
01637 while (p < e) {
01638 if (is_utf8_lead_byte(*p)) {
01639 if (nth == 0) break;
01640 nth--;
01641 }
01642 p++;
01643 }
01644 *nthp = nth;
01645 return (char *)p;
01646 }
01647
/*
 * Returns the byte distance from +p+ to the position str_utf8_nth()
 * finds for +nth+ characters (bounded by +e+).
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *stop = str_utf8_nth(p, e, &remaining);

    return stop - p;
}
01654 #endif
01655
01656
01657 long
01658 rb_str_sublen(VALUE str, long pos)
01659 {
01660 if (single_byte_optimizable(str) || pos < 0)
01661 return pos;
01662 else {
01663 char *p = RSTRING_PTR(str);
01664 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01665 }
01666 }
01667
01668 VALUE
01669 rb_str_subseq(VALUE str, long beg, long len)
01670 {
01671 VALUE str2;
01672
01673 if (RSTRING_LEN(str) == beg + len &&
01674 RSTRING_EMBED_LEN_MAX < len) {
01675 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01676 rb_str_drop_bytes(str2, beg);
01677 }
01678 else {
01679 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01680 RB_GC_GUARD(str);
01681 }
01682
01683 rb_enc_cr_str_copy_for_substr(str2, str);
01684 OBJ_INFECT(str2, str);
01685
01686 return str2;
01687 }
01688
01689 static char *
01690 rb_str_subpos(VALUE str, long beg, long *lenp)
01691 {
01692 long len = *lenp;
01693 long slen = -1L;
01694 long blen = RSTRING_LEN(str);
01695 rb_encoding *enc = STR_ENC_GET(str);
01696 char *p, *s = RSTRING_PTR(str), *e = s + blen;
01697
01698 if (len < 0) return 0;
01699 if (!blen) {
01700 len = 0;
01701 }
01702 if (single_byte_optimizable(str)) {
01703 if (beg > blen) return 0;
01704 if (beg < 0) {
01705 beg += blen;
01706 if (beg < 0) return 0;
01707 }
01708 if (beg + len > blen)
01709 len = blen - beg;
01710 if (len < 0) return 0;
01711 p = s + beg;
01712 goto end;
01713 }
01714 if (beg < 0) {
01715 if (len > -beg) len = -beg;
01716 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01717 beg = -beg;
01718 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01719 p = e;
01720 if (!p) return 0;
01721 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01722 if (!p) return 0;
01723 len = e - p;
01724 goto end;
01725 }
01726 else {
01727 slen = str_strlen(str, enc);
01728 beg += slen;
01729 if (beg < 0) return 0;
01730 p = s + beg;
01731 if (len == 0) goto end;
01732 }
01733 }
01734 else if (beg > 0 && beg > RSTRING_LEN(str)) {
01735 return 0;
01736 }
01737 if (len == 0) {
01738 if (beg > str_strlen(str, enc)) return 0;
01739 p = s + beg;
01740 }
01741 #ifdef NONASCII_MASK
01742 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01743 enc == rb_utf8_encoding()) {
01744 p = str_utf8_nth(s, e, &beg);
01745 if (beg > 0) return 0;
01746 len = str_utf8_offset(p, e, len);
01747 }
01748 #endif
01749 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01750 int char_sz = rb_enc_mbmaxlen(enc);
01751
01752 p = s + beg * char_sz;
01753 if (p > e) {
01754 return 0;
01755 }
01756 else if (len * char_sz > e - p)
01757 len = e - p;
01758 else
01759 len *= char_sz;
01760 }
01761 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01762 if (beg > 0) return 0;
01763 len = 0;
01764 }
01765 else {
01766 len = str_offset(p, e, len, enc, 0);
01767 }
01768 end:
01769 *lenp = len;
01770 RB_GC_GUARD(str);
01771 return p;
01772 }
01773
01774 VALUE
01775 rb_str_substr(VALUE str, long beg, long len)
01776 {
01777 VALUE str2;
01778 char *p = rb_str_subpos(str, beg, &len);
01779
01780 if (!p) return Qnil;
01781 if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
01782 str2 = rb_str_new4(str);
01783 str2 = str_new3(rb_obj_class(str2), str2);
01784 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01785 RSTRING(str2)->as.heap.len = len;
01786 }
01787 else {
01788 str2 = rb_str_new5(str, p, len);
01789 OBJ_INFECT(str2, str);
01790 RB_GC_GUARD(str);
01791 }
01792 rb_enc_cr_str_copy_for_substr(str2, str);
01793
01794 return str2;
01795 }
01796
01797 VALUE
01798 rb_str_freeze(VALUE str)
01799 {
01800 if (STR_ASSOC_P(str)) {
01801 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01802 OBJ_FREEZE(ary);
01803 }
01804 return rb_obj_freeze(str);
01805 }
01806
01807 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01808 #define rb_str_dup_frozen rb_str_new_frozen
01809
01810 VALUE
01811 rb_str_locktmp(VALUE str)
01812 {
01813 if (FL_TEST(str, STR_TMPLOCK)) {
01814 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01815 }
01816 FL_SET(str, STR_TMPLOCK);
01817 return str;
01818 }
01819
01820 VALUE
01821 rb_str_unlocktmp(VALUE str)
01822 {
01823 if (!FL_TEST(str, STR_TMPLOCK)) {
01824 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01825 }
01826 FL_UNSET(str, STR_TMPLOCK);
01827 return str;
01828 }
01829
01830 VALUE
01831 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
01832 {
01833 rb_str_locktmp(str);
01834 return rb_ensure(func, arg, rb_str_unlocktmp, str);
01835 }
01836
01837 void
01838 rb_str_set_len(VALUE str, long len)
01839 {
01840 long capa;
01841
01842 str_modifiable(str);
01843 if (STR_SHARED_P(str)) {
01844 rb_raise(rb_eRuntimeError, "can't set length of shared string");
01845 }
01846 if (len > (capa = (long)rb_str_capacity(str))) {
01847 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01848 }
01849 STR_SET_LEN(str, len);
01850 RSTRING_PTR(str)[len] = '\0';
01851 }
01852
01853 VALUE
01854 rb_str_resize(VALUE str, long len)
01855 {
01856 long slen;
01857 int independent;
01858
01859 if (len < 0) {
01860 rb_raise(rb_eArgError, "negative string size (or size too big)");
01861 }
01862
01863 independent = str_independent(str);
01864 ENC_CODERANGE_CLEAR(str);
01865 slen = RSTRING_LEN(str);
01866 {
01867 long capa;
01868 if (STR_EMBED_P(str)) {
01869 if (len == slen) return str;
01870 if (len + 1 <= RSTRING_EMBED_LEN_MAX + 1) {
01871 STR_SET_EMBED_LEN(str, len);
01872 RSTRING(str)->as.ary[len] = '\0';
01873 return str;
01874 }
01875 str_make_independent_expand(str, len - slen);
01876 STR_SET_NOEMBED(str);
01877 }
01878 else if (len <= RSTRING_EMBED_LEN_MAX) {
01879 char *ptr = RSTRING(str)->as.heap.ptr;
01880 STR_SET_EMBED(str);
01881 if (slen > len) slen = len;
01882 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01883 RSTRING(str)->as.ary[len] = '\0';
01884 STR_SET_EMBED_LEN(str, len);
01885 if (independent) xfree(ptr);
01886 return str;
01887 }
01888 else if (!independent) {
01889 if (len == slen) return str;
01890 str_make_independent_expand(str, len - slen);
01891 }
01892 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
01893 (capa - len) > (len < 1024 ? len : 1024)) {
01894 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01895 RSTRING(str)->as.heap.aux.capa = len;
01896 }
01897 else if (len == slen) return str;
01898 RSTRING(str)->as.heap.len = len;
01899 RSTRING(str)->as.heap.ptr[len] = '\0';
01900 }
01901 return str;
01902 }
01903
01904 static VALUE
01905 str_buf_cat(VALUE str, const char *ptr, long len)
01906 {
01907 long capa, total, off = -1;
01908
01909 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01910 off = ptr - RSTRING_PTR(str);
01911 }
01912 rb_str_modify(str);
01913 if (len == 0) return 0;
01914 if (STR_ASSOC_P(str)) {
01915 FL_UNSET(str, STR_ASSOC);
01916 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01917 }
01918 else if (STR_EMBED_P(str)) {
01919 capa = RSTRING_EMBED_LEN_MAX;
01920 }
01921 else {
01922 capa = RSTRING(str)->as.heap.aux.capa;
01923 }
01924 if (RSTRING_LEN(str) >= LONG_MAX - len) {
01925 rb_raise(rb_eArgError, "string sizes too big");
01926 }
01927 total = RSTRING_LEN(str)+len;
01928 if (capa <= total) {
01929 while (total > capa) {
01930 if (capa + 1 >= LONG_MAX / 2) {
01931 capa = (total + 4095) / 4096 * 4096;
01932 break;
01933 }
01934 capa = (capa + 1) * 2;
01935 }
01936 RESIZE_CAPA(str, capa);
01937 }
01938 if (off != -1) {
01939 ptr = RSTRING_PTR(str) + off;
01940 }
01941 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01942 STR_SET_LEN(str, total);
01943 RSTRING_PTR(str)[total] = '\0';
01944
01945 return str;
01946 }
01947
01948 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01949
01950 VALUE
01951 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01952 {
01953 if (len == 0) return str;
01954 if (len < 0) {
01955 rb_raise(rb_eArgError, "negative string size (or size too big)");
01956 }
01957 return str_buf_cat(str, ptr, len);
01958 }
01959
01960 VALUE
01961 rb_str_buf_cat2(VALUE str, const char *ptr)
01962 {
01963 return rb_str_buf_cat(str, ptr, strlen(ptr));
01964 }
01965
01966 VALUE
01967 rb_str_cat(VALUE str, const char *ptr, long len)
01968 {
01969 if (len < 0) {
01970 rb_raise(rb_eArgError, "negative string size (or size too big)");
01971 }
01972 if (STR_ASSOC_P(str)) {
01973 char *p;
01974 rb_str_modify_expand(str, len);
01975 p = RSTRING(str)->as.heap.ptr;
01976 memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01977 len = RSTRING(str)->as.heap.len += len;
01978 p[len] = '\0';
01979 return str;
01980 }
01981
01982 return rb_str_buf_cat(str, ptr, len);
01983 }
01984
01985 VALUE
01986 rb_str_cat2(VALUE str, const char *ptr)
01987 {
01988 return rb_str_cat(str, ptr, strlen(ptr));
01989 }
01990
01991 static VALUE
01992 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01993 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01994 {
01995 int str_encindex = ENCODING_GET(str);
01996 int res_encindex;
01997 int str_cr, res_cr;
01998
01999 str_cr = ENC_CODERANGE(str);
02000
02001 if (str_encindex == ptr_encindex) {
02002 if (str_cr == ENC_CODERANGE_UNKNOWN)
02003 ptr_cr = ENC_CODERANGE_UNKNOWN;
02004 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
02005 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
02006 }
02007 }
02008 else {
02009 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
02010 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
02011 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
02012 if (len == 0)
02013 return str;
02014 if (RSTRING_LEN(str) == 0) {
02015 rb_str_buf_cat(str, ptr, len);
02016 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
02017 return str;
02018 }
02019 goto incompatible;
02020 }
02021 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
02022 ptr_cr = coderange_scan(ptr, len, ptr_enc);
02023 }
02024 if (str_cr == ENC_CODERANGE_UNKNOWN) {
02025 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
02026 str_cr = rb_enc_str_coderange(str);
02027 }
02028 }
02029 }
02030 if (ptr_cr_ret)
02031 *ptr_cr_ret = ptr_cr;
02032
02033 if (str_encindex != ptr_encindex &&
02034 str_cr != ENC_CODERANGE_7BIT &&
02035 ptr_cr != ENC_CODERANGE_7BIT) {
02036 incompatible:
02037 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
02038 rb_enc_name(rb_enc_from_index(str_encindex)),
02039 rb_enc_name(rb_enc_from_index(ptr_encindex)));
02040 }
02041
02042 if (str_cr == ENC_CODERANGE_UNKNOWN) {
02043 res_encindex = str_encindex;
02044 res_cr = ENC_CODERANGE_UNKNOWN;
02045 }
02046 else if (str_cr == ENC_CODERANGE_7BIT) {
02047 if (ptr_cr == ENC_CODERANGE_7BIT) {
02048 res_encindex = str_encindex;
02049 res_cr = ENC_CODERANGE_7BIT;
02050 }
02051 else {
02052 res_encindex = ptr_encindex;
02053 res_cr = ptr_cr;
02054 }
02055 }
02056 else if (str_cr == ENC_CODERANGE_VALID) {
02057 res_encindex = str_encindex;
02058 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
02059 res_cr = str_cr;
02060 else
02061 res_cr = ptr_cr;
02062 }
02063 else {
02064 res_encindex = str_encindex;
02065 res_cr = str_cr;
02066 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
02067 }
02068
02069 if (len < 0) {
02070 rb_raise(rb_eArgError, "negative string size (or size too big)");
02071 }
02072 str_buf_cat(str, ptr, len);
02073 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
02074 return str;
02075 }
02076
02077 VALUE
02078 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
02079 {
02080 return rb_enc_cr_str_buf_cat(str, ptr, len,
02081 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
02082 }
02083
02084 VALUE
02085 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02086 {
02087
02088 int encindex = ENCODING_GET(str);
02089 rb_encoding *enc = rb_enc_from_index(encindex);
02090 if (rb_enc_asciicompat(enc)) {
02091 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02092 encindex, ENC_CODERANGE_7BIT, 0);
02093 }
02094 else {
02095 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02096 while (*ptr) {
02097 unsigned int c = (unsigned char)*ptr;
02098 int len = rb_enc_codelen(c, enc);
02099 rb_enc_mbcput(c, buf, enc);
02100 rb_enc_cr_str_buf_cat(str, buf, len,
02101 encindex, ENC_CODERANGE_VALID, 0);
02102 ptr++;
02103 }
02104 return str;
02105 }
02106 }
02107
02108 VALUE
02109 rb_str_buf_append(VALUE str, VALUE str2)
02110 {
02111 int str2_cr;
02112
02113 str2_cr = ENC_CODERANGE(str2);
02114
02115 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02116 ENCODING_GET(str2), str2_cr, &str2_cr);
02117
02118 OBJ_INFECT(str, str2);
02119 ENC_CODERANGE_SET(str2, str2_cr);
02120
02121 return str;
02122 }
02123
02124 VALUE
02125 rb_str_append(VALUE str, VALUE str2)
02126 {
02127 rb_encoding *enc;
02128 int cr, cr2;
02129 long len2;
02130
02131 StringValue(str2);
02132 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02133 long len = RSTRING_LEN(str) + len2;
02134 enc = rb_enc_check(str, str2);
02135 cr = ENC_CODERANGE(str);
02136 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
02137 rb_str_modify_expand(str, len2);
02138 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
02139 RSTRING_PTR(str2), len2+1);
02140 RSTRING(str)->as.heap.len = len;
02141 rb_enc_associate(str, enc);
02142 ENC_CODERANGE_SET(str, cr);
02143 OBJ_INFECT(str, str2);
02144 return str;
02145 }
02146 return rb_str_buf_append(str, str2);
02147 }
02148
02149
02150
02151
02152
02153
02154
02155
02156
02157
02158
02159
02160
02161
02162
02163
02164
02165 VALUE
02166 rb_str_concat(VALUE str1, VALUE str2)
02167 {
02168 unsigned int code;
02169 rb_encoding *enc = STR_ENC_GET(str1);
02170
02171 if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
02172 if (rb_num_to_uint(str2, &code) == 0) {
02173 }
02174 else if (FIXNUM_P(str2)) {
02175 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02176 }
02177 else {
02178 rb_raise(rb_eRangeError, "bignum out of char range");
02179 }
02180 }
02181 else {
02182 return rb_str_append(str1, str2);
02183 }
02184
02185 if (enc == rb_usascii_encoding()) {
02186
02187 char buf[1];
02188 buf[0] = (char)code;
02189 if (code > 0xFF) {
02190 rb_raise(rb_eRangeError, "%u out of char range", code);
02191 }
02192 rb_str_cat(str1, buf, 1);
02193 if (code > 127) {
02194 rb_enc_associate(str1, rb_ascii8bit_encoding());
02195 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02196 }
02197 }
02198 else {
02199 long pos = RSTRING_LEN(str1);
02200 int cr = ENC_CODERANGE(str1);
02201 int len;
02202 char *buf;
02203
02204 switch (len = rb_enc_codelen(code, enc)) {
02205 case ONIGERR_INVALID_CODE_POINT_VALUE:
02206 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02207 break;
02208 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02209 case 0:
02210 rb_raise(rb_eRangeError, "%u out of char range", code);
02211 break;
02212 }
02213 buf = ALLOCA_N(char, len + 1);
02214 rb_enc_mbcput(code, buf, enc);
02215 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02216 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02217 }
02218 rb_str_resize(str1, pos+len);
02219 memcpy(RSTRING_PTR(str1) + pos, buf, len);
02220 if (cr == ENC_CODERANGE_7BIT && code > 127)
02221 cr = ENC_CODERANGE_VALID;
02222 ENC_CODERANGE_SET(str1, cr);
02223 }
02224 return str1;
02225 }
02226
02227
02228
02229
02230
02231
02232
02233
02234
02235
02236
02237
02238 static VALUE
02239 rb_str_prepend(VALUE str, VALUE str2)
02240 {
02241 StringValue(str2);
02242 StringValue(str);
02243 rb_str_update(str, 0L, 0L, str2);
02244 return str;
02245 }
02246
02247 st_index_t
02248 rb_str_hash(VALUE str)
02249 {
02250 int e = ENCODING_GET(str);
02251 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02252 e = 0;
02253 }
02254 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02255 }
02256
02257 int
02258 rb_str_hash_cmp(VALUE str1, VALUE str2)
02259 {
02260 long len;
02261
02262 if (!rb_str_comparable(str1, str2)) return 1;
02263 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02264 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02265 return 0;
02266 }
02267 return 1;
02268 }
02269
02270
02271
02272
02273
02274
02275
02276
02277 static VALUE
02278 rb_str_hash_m(VALUE str)
02279 {
02280 st_index_t hval = rb_str_hash(str);
02281 return INT2FIX(hval);
02282 }
02283
02284 #define lesser(a,b) (((a)>(b))?(b):(a))
02285
02286 int
02287 rb_str_comparable(VALUE str1, VALUE str2)
02288 {
02289 int idx1, idx2;
02290 int rc1, rc2;
02291
02292 if (RSTRING_LEN(str1) == 0) return TRUE;
02293 if (RSTRING_LEN(str2) == 0) return TRUE;
02294 idx1 = ENCODING_GET(str1);
02295 idx2 = ENCODING_GET(str2);
02296 if (idx1 == idx2) return TRUE;
02297 rc1 = rb_enc_str_coderange(str1);
02298 rc2 = rb_enc_str_coderange(str2);
02299 if (rc1 == ENC_CODERANGE_7BIT) {
02300 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02301 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02302 return TRUE;
02303 }
02304 if (rc2 == ENC_CODERANGE_7BIT) {
02305 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02306 return TRUE;
02307 }
02308 return FALSE;
02309 }
02310
02311 int
02312 rb_str_cmp(VALUE str1, VALUE str2)
02313 {
02314 long len1, len2;
02315 const char *ptr1, *ptr2;
02316 int retval;
02317
02318 if (str1 == str2) return 0;
02319 RSTRING_GETMEM(str1, ptr1, len1);
02320 RSTRING_GETMEM(str2, ptr2, len2);
02321 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02322 if (len1 == len2) {
02323 if (!rb_str_comparable(str1, str2)) {
02324 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02325 return 1;
02326 return -1;
02327 }
02328 return 0;
02329 }
02330 if (len1 > len2) return 1;
02331 return -1;
02332 }
02333 if (retval > 0) return 1;
02334 return -1;
02335 }
02336
02337
02338 static VALUE
02339 str_eql(const VALUE str1, const VALUE str2)
02340 {
02341 const long len = RSTRING_LEN(str1);
02342 const char *ptr1, *ptr2;
02343
02344 if (len != RSTRING_LEN(str2)) return Qfalse;
02345 if (!rb_str_comparable(str1, str2)) return Qfalse;
02346 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02347 return Qtrue;
02348 if (memcmp(ptr1, ptr2, len) == 0)
02349 return Qtrue;
02350 return Qfalse;
02351 }
02352
02353
02354
02355
02356
02357
02358
02359
02360
02361
02362 VALUE
02363 rb_str_equal(VALUE str1, VALUE str2)
02364 {
02365 if (str1 == str2) return Qtrue;
02366 if (!RB_TYPE_P(str2, T_STRING)) {
02367 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02368 return Qfalse;
02369 }
02370 return rb_equal(str2, str1);
02371 }
02372 return str_eql(str1, str2);
02373 }
02374
02375
02376
02377
02378
02379
02380
02381
02382 static VALUE
02383 rb_str_eql(VALUE str1, VALUE str2)
02384 {
02385 if (str1 == str2) return Qtrue;
02386 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
02387 return str_eql(str1, str2);
02388 }
02389
02390
02391
02392
02393
02394
02395
02396
02397
02398
02399
02400
02401
02402
02403
02404
02405
02406
02407
02408
02409
02410
02411
02412
02413
02414
02415 static VALUE
02416 rb_str_cmp_m(VALUE str1, VALUE str2)
02417 {
02418 int result;
02419
02420 if (!RB_TYPE_P(str2, T_STRING)) {
02421 VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
02422 if (RB_TYPE_P(tmp, T_STRING)) {
02423 result = rb_str_cmp(str1, tmp);
02424 }
02425 else {
02426 return rb_invcmp(str1, str2);
02427 }
02428 }
02429 else {
02430 result = rb_str_cmp(str1, str2);
02431 }
02432 return INT2FIX(result);
02433 }
02434
02435
02436
02437
02438
02439
02440
02441
02442
02443
02444
02445
02446
02447 static VALUE
02448 rb_str_casecmp(VALUE str1, VALUE str2)
02449 {
02450 long len;
02451 rb_encoding *enc;
02452 char *p1, *p1end, *p2, *p2end;
02453
02454 StringValue(str2);
02455 enc = rb_enc_compatible(str1, str2);
02456 if (!enc) {
02457 return Qnil;
02458 }
02459
02460 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02461 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02462 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02463 while (p1 < p1end && p2 < p2end) {
02464 if (*p1 != *p2) {
02465 unsigned int c1 = TOUPPER(*p1 & 0xff);
02466 unsigned int c2 = TOUPPER(*p2 & 0xff);
02467 if (c1 != c2)
02468 return INT2FIX(c1 < c2 ? -1 : 1);
02469 }
02470 p1++;
02471 p2++;
02472 }
02473 }
02474 else {
02475 while (p1 < p1end && p2 < p2end) {
02476 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02477 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02478
02479 if (0 <= c1 && 0 <= c2) {
02480 c1 = TOUPPER(c1);
02481 c2 = TOUPPER(c2);
02482 if (c1 != c2)
02483 return INT2FIX(c1 < c2 ? -1 : 1);
02484 }
02485 else {
02486 int r;
02487 l1 = rb_enc_mbclen(p1, p1end, enc);
02488 l2 = rb_enc_mbclen(p2, p2end, enc);
02489 len = l1 < l2 ? l1 : l2;
02490 r = memcmp(p1, p2, len);
02491 if (r != 0)
02492 return INT2FIX(r < 0 ? -1 : 1);
02493 if (l1 != l2)
02494 return INT2FIX(l1 < l2 ? -1 : 1);
02495 }
02496 p1 += l1;
02497 p2 += l2;
02498 }
02499 }
02500 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02501 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02502 return INT2FIX(-1);
02503 }
02504
02505 static long
02506 rb_str_index(VALUE str, VALUE sub, long offset)
02507 {
02508 long pos;
02509 char *s, *sptr, *e;
02510 long len, slen;
02511 rb_encoding *enc;
02512
02513 enc = rb_enc_check(str, sub);
02514 if (is_broken_string(sub)) {
02515 return -1;
02516 }
02517 len = str_strlen(str, enc);
02518 slen = str_strlen(sub, enc);
02519 if (offset < 0) {
02520 offset += len;
02521 if (offset < 0) return -1;
02522 }
02523 if (len - offset < slen) return -1;
02524 s = RSTRING_PTR(str);
02525 e = s + RSTRING_LEN(str);
02526 if (offset) {
02527 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02528 s += offset;
02529 }
02530 if (slen == 0) return offset;
02531
02532 sptr = RSTRING_PTR(sub);
02533 slen = RSTRING_LEN(sub);
02534 len = RSTRING_LEN(str) - offset;
02535 for (;;) {
02536 char *t;
02537 pos = rb_memsearch(sptr, slen, s, len, enc);
02538 if (pos < 0) return pos;
02539 t = rb_enc_right_char_head(s, s+pos, e, enc);
02540 if (t == s + pos) break;
02541 if ((len -= t - s) <= 0) return -1;
02542 offset += t - s;
02543 s = t;
02544 }
02545 return pos + offset;
02546 }
02547
02548
02549
02550
02551
02552
02553
02554
02555
02556
02557
02558
02559
02560
02561
02562
02563
02564
02565
02566 static VALUE
02567 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02568 {
02569 VALUE sub;
02570 VALUE initpos;
02571 long pos;
02572
02573 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02574 pos = NUM2LONG(initpos);
02575 }
02576 else {
02577 pos = 0;
02578 }
02579 if (pos < 0) {
02580 pos += str_strlen(str, STR_ENC_GET(str));
02581 if (pos < 0) {
02582 if (RB_TYPE_P(sub, T_REGEXP)) {
02583 rb_backref_set(Qnil);
02584 }
02585 return Qnil;
02586 }
02587 }
02588
02589 if (SPECIAL_CONST_P(sub)) goto generic;
02590 switch (BUILTIN_TYPE(sub)) {
02591 case T_REGEXP:
02592 if (pos > str_strlen(str, STR_ENC_GET(str)))
02593 return Qnil;
02594 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02595 rb_enc_check(str, sub), single_byte_optimizable(str));
02596
02597 pos = rb_reg_search(sub, str, pos, 0);
02598 pos = rb_str_sublen(str, pos);
02599 break;
02600
02601 generic:
02602 default: {
02603 VALUE tmp;
02604
02605 tmp = rb_check_string_type(sub);
02606 if (NIL_P(tmp)) {
02607 rb_raise(rb_eTypeError, "type mismatch: %s given",
02608 rb_obj_classname(sub));
02609 }
02610 sub = tmp;
02611 }
02612
02613 case T_STRING:
02614 pos = rb_str_index(str, sub, pos);
02615 pos = rb_str_sublen(str, pos);
02616 break;
02617 }
02618
02619 if (pos == -1) return Qnil;
02620 return LONG2NUM(pos);
02621 }
02622
02623 static long
02624 rb_str_rindex(VALUE str, VALUE sub, long pos)
02625 {
02626 long len, slen;
02627 char *s, *sbeg, *e, *t;
02628 rb_encoding *enc;
02629 int singlebyte = single_byte_optimizable(str);
02630
02631 enc = rb_enc_check(str, sub);
02632 if (is_broken_string(sub)) {
02633 return -1;
02634 }
02635 len = str_strlen(str, enc);
02636 slen = str_strlen(sub, enc);
02637
02638 if (len < slen) return -1;
02639 if (len - pos < slen) {
02640 pos = len - slen;
02641 }
02642 if (len == 0) {
02643 return pos;
02644 }
02645 sbeg = RSTRING_PTR(str);
02646 e = RSTRING_END(str);
02647 t = RSTRING_PTR(sub);
02648 slen = RSTRING_LEN(sub);
02649 s = str_nth(sbeg, e, pos, enc, singlebyte);
02650 while (s) {
02651 if (memcmp(s, t, slen) == 0) {
02652 return pos;
02653 }
02654 if (pos == 0) break;
02655 pos--;
02656 s = rb_enc_prev_char(sbeg, s, e, enc);
02657 }
02658 return -1;
02659 }
02660
02661
02662
02663
02664
02665
02666
02667
02668
02669
02670
02671
02672
02673
02674
02675
02676
02677
02678
02679
02680 static VALUE
02681 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02682 {
02683 VALUE sub;
02684 VALUE vpos;
02685 rb_encoding *enc = STR_ENC_GET(str);
02686 long pos, len = str_strlen(str, enc);
02687
02688 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02689 pos = NUM2LONG(vpos);
02690 if (pos < 0) {
02691 pos += len;
02692 if (pos < 0) {
02693 if (RB_TYPE_P(sub, T_REGEXP)) {
02694 rb_backref_set(Qnil);
02695 }
02696 return Qnil;
02697 }
02698 }
02699 if (pos > len) pos = len;
02700 }
02701 else {
02702 pos = len;
02703 }
02704
02705 if (SPECIAL_CONST_P(sub)) goto generic;
02706 switch (BUILTIN_TYPE(sub)) {
02707 case T_REGEXP:
02708
02709 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02710 STR_ENC_GET(str), single_byte_optimizable(str));
02711
02712 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02713 pos = rb_reg_search(sub, str, pos, 1);
02714 pos = rb_str_sublen(str, pos);
02715 }
02716 if (pos >= 0) return LONG2NUM(pos);
02717 break;
02718
02719 generic:
02720 default: {
02721 VALUE tmp;
02722
02723 tmp = rb_check_string_type(sub);
02724 if (NIL_P(tmp)) {
02725 rb_raise(rb_eTypeError, "type mismatch: %s given",
02726 rb_obj_classname(sub));
02727 }
02728 sub = tmp;
02729 }
02730
02731 case T_STRING:
02732 pos = rb_str_rindex(str, sub, pos);
02733 if (pos >= 0) return LONG2NUM(pos);
02734 break;
02735 }
02736 return Qnil;
02737 }
02738
02739
02740
02741
02742
02743
02744
02745
02746
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757 static VALUE
02758 rb_str_match(VALUE x, VALUE y)
02759 {
02760 if (SPECIAL_CONST_P(y)) goto generic;
02761 switch (BUILTIN_TYPE(y)) {
02762 case T_STRING:
02763 rb_raise(rb_eTypeError, "type mismatch: String given");
02764
02765 case T_REGEXP:
02766 return rb_reg_match(y, x);
02767
02768 generic:
02769 default:
02770 return rb_funcall(y, rb_intern("=~"), 1, x);
02771 }
02772 }
02773
02774
02775 static VALUE get_pat(VALUE, int);
02776
02777
02778
02779
02780
02781
02782
02783
02784
02785
02786
02787
02788
02789
02790
02791
02792
02793
02794
02795
02796
02797
02798
02799
02800
02801
02802
02803
02804
02805
02806
02807 static VALUE
02808 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02809 {
02810 VALUE re, result;
02811 if (argc < 1)
02812 rb_check_arity(argc, 1, 2);
02813 re = argv[0];
02814 argv[0] = str;
02815 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02816 if (!NIL_P(result) && rb_block_given_p()) {
02817 return rb_yield(result);
02818 }
02819 return result;
02820 }
02821
/*
 * Result of trying to step a single character to its neighbour
 * (see enc_succ_char(), enc_pred_char(), enc_succ_alnum_char()).
 */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* input is not a character of the wanted kind */
    NEIGHBOR_FOUND    = 1,  /* a neighbouring character was produced */
    NEIGHBOR_WRAPPED  = 2   /* stepping ran past the end of the range */
};
02827
02828 static enum neighbor_char
02829 enc_succ_char(char *p, long len, rb_encoding *enc)
02830 {
02831 long i;
02832 int l;
02833 while (1) {
02834 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02835 p[i] = '\0';
02836 if (i < 0)
02837 return NEIGHBOR_WRAPPED;
02838 ++((unsigned char*)p)[i];
02839 l = rb_enc_precise_mbclen(p, p+len, enc);
02840 if (MBCLEN_CHARFOUND_P(l)) {
02841 l = MBCLEN_CHARFOUND_LEN(l);
02842 if (l == len) {
02843 return NEIGHBOR_FOUND;
02844 }
02845 else {
02846 memset(p+l, 0xff, len-l);
02847 }
02848 }
02849 if (MBCLEN_INVALID_P(l) && i < len-1) {
02850 long len2;
02851 int l2;
02852 for (len2 = len-1; 0 < len2; len2--) {
02853 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02854 if (!MBCLEN_INVALID_P(l2))
02855 break;
02856 }
02857 memset(p+len2+1, 0xff, len-(len2+1));
02858 }
02859 }
02860 }
02861
02862 static enum neighbor_char
02863 enc_pred_char(char *p, long len, rb_encoding *enc)
02864 {
02865 long i;
02866 int l;
02867 while (1) {
02868 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02869 p[i] = '\xff';
02870 if (i < 0)
02871 return NEIGHBOR_WRAPPED;
02872 --((unsigned char*)p)[i];
02873 l = rb_enc_precise_mbclen(p, p+len, enc);
02874 if (MBCLEN_CHARFOUND_P(l)) {
02875 l = MBCLEN_CHARFOUND_LEN(l);
02876 if (l == len) {
02877 return NEIGHBOR_FOUND;
02878 }
02879 else {
02880 memset(p+l, 0, len-l);
02881 }
02882 }
02883 if (MBCLEN_INVALID_P(l) && i < len-1) {
02884 long len2;
02885 int l2;
02886 for (len2 = len-1; 0 < len2; len2--) {
02887 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02888 if (!MBCLEN_INVALID_P(l2))
02889 break;
02890 }
02891 memset(p+len2+1, 0, len-(len2+1));
02892 }
02893 }
02894 }
02895
02896
02897
02898
02899
02900
02901
02902
02903
02904
02905 static enum neighbor_char
02906 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02907 {
02908 enum neighbor_char ret;
02909 unsigned int c;
02910 int ctype;
02911 int range;
02912 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02913
02914 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02915 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02916 ctype = ONIGENC_CTYPE_DIGIT;
02917 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02918 ctype = ONIGENC_CTYPE_ALPHA;
02919 else
02920 return NEIGHBOR_NOT_CHAR;
02921
02922 MEMCPY(save, p, char, len);
02923 ret = enc_succ_char(p, len, enc);
02924 if (ret == NEIGHBOR_FOUND) {
02925 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02926 if (rb_enc_isctype(c, ctype, enc))
02927 return NEIGHBOR_FOUND;
02928 }
02929 MEMCPY(p, save, char, len);
02930 range = 1;
02931 while (1) {
02932 MEMCPY(save, p, char, len);
02933 ret = enc_pred_char(p, len, enc);
02934 if (ret == NEIGHBOR_FOUND) {
02935 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02936 if (!rb_enc_isctype(c, ctype, enc)) {
02937 MEMCPY(p, save, char, len);
02938 break;
02939 }
02940 }
02941 else {
02942 MEMCPY(p, save, char, len);
02943 break;
02944 }
02945 range++;
02946 }
02947 if (range == 1) {
02948 return NEIGHBOR_NOT_CHAR;
02949 }
02950
02951 if (ctype != ONIGENC_CTYPE_DIGIT) {
02952 MEMCPY(carry, p, char, len);
02953 return NEIGHBOR_WRAPPED;
02954 }
02955
02956 MEMCPY(carry, p, char, len);
02957 enc_succ_char(carry, len, enc);
02958 return NEIGHBOR_WRAPPED;
02959 }
02960
02961
02962
02963
02964
02965
02966
02967
02968
02969
02970
02971
02972
02973
02974
02975
02976
02977
02978
02979
02980
02981
02982
02983
02984
02985
02986
02987 VALUE
02988 rb_str_succ(VALUE orig)
02989 {
02990 rb_encoding *enc;
02991 VALUE str;
02992 char *sbeg, *s, *e, *last_alnum = 0;
02993 int c = -1;
02994 long l;
02995 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02996 long carry_pos = 0, carry_len = 1;
02997 enum neighbor_char neighbor = NEIGHBOR_FOUND;
02998
02999 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
03000 rb_enc_cr_str_copy_for_substr(str, orig);
03001 OBJ_INFECT(str, orig);
03002 if (RSTRING_LEN(str) == 0) return str;
03003
03004 enc = STR_ENC_GET(orig);
03005 sbeg = RSTRING_PTR(str);
03006 s = e = sbeg + RSTRING_LEN(str);
03007
03008 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
03009 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
03010 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
03011 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
03012 s = last_alnum;
03013 break;
03014 }
03015 }
03016 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
03017 neighbor = enc_succ_alnum_char(s, l, enc, carry);
03018 switch (neighbor) {
03019 case NEIGHBOR_NOT_CHAR:
03020 continue;
03021 case NEIGHBOR_FOUND:
03022 return str;
03023 case NEIGHBOR_WRAPPED:
03024 last_alnum = s;
03025 break;
03026 }
03027 c = 1;
03028 carry_pos = s - sbeg;
03029 carry_len = l;
03030 }
03031 if (c == -1) {
03032 s = e;
03033 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
03034 enum neighbor_char neighbor;
03035 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
03036 neighbor = enc_succ_char(s, l, enc);
03037 if (neighbor == NEIGHBOR_FOUND)
03038 return str;
03039 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
03040
03041 enc_succ_char(s, l, enc);
03042 }
03043 if (!rb_enc_asciicompat(enc)) {
03044 MEMCPY(carry, s, char, l);
03045 carry_len = l;
03046 }
03047 carry_pos = s - sbeg;
03048 }
03049 }
03050 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
03051 s = RSTRING_PTR(str) + carry_pos;
03052 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
03053 memmove(s, carry, carry_len);
03054 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
03055 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03056 rb_enc_str_coderange(str);
03057 return str;
03058 }
03059
03060
03061
03062
03063
03064
03065
03066
03067
03068
03069
03070 static VALUE
03071 rb_str_succ_bang(VALUE str)
03072 {
03073 rb_str_shared_replace(str, rb_str_succ(str));
03074
03075 return str;
03076 }
03077
03078
03079
03080
03081
03082
03083
03084
03085
03086
03087
03088
03089
03090
03091
03092
03093
03094
03095
03096
03097
03098
03099
03100
03101
03102
03103
03104
03105
03106
03107
03108
03109
03110
03111 static VALUE
03112 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03113 {
03114 VALUE end, exclusive;
03115 VALUE current, after_end;
03116 ID succ;
03117 int n, excl, ascii;
03118 rb_encoding *enc;
03119
03120 rb_scan_args(argc, argv, "11", &end, &exclusive);
03121 RETURN_ENUMERATOR(beg, argc, argv);
03122 excl = RTEST(exclusive);
03123 CONST_ID(succ, "succ");
03124 StringValue(end);
03125 enc = rb_enc_check(beg, end);
03126 ascii = (is_ascii_string(beg) && is_ascii_string(end));
03127
03128 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03129 char c = RSTRING_PTR(beg)[0];
03130 char e = RSTRING_PTR(end)[0];
03131
03132 if (c > e || (excl && c == e)) return beg;
03133 for (;;) {
03134 rb_yield(rb_enc_str_new(&c, 1, enc));
03135 if (!excl && c == e) break;
03136 c++;
03137 if (excl && c == e) break;
03138 }
03139 return beg;
03140 }
03141
03142 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03143 char *s, *send;
03144 VALUE b, e;
03145 int width;
03146
03147 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
03148 width = rb_long2int(send - s);
03149 while (s < send) {
03150 if (!ISDIGIT(*s)) goto no_digits;
03151 s++;
03152 }
03153 s = RSTRING_PTR(end); send = RSTRING_END(end);
03154 while (s < send) {
03155 if (!ISDIGIT(*s)) goto no_digits;
03156 s++;
03157 }
03158 b = rb_str_to_inum(beg, 10, FALSE);
03159 e = rb_str_to_inum(end, 10, FALSE);
03160 if (FIXNUM_P(b) && FIXNUM_P(e)) {
03161 long bi = FIX2LONG(b);
03162 long ei = FIX2LONG(e);
03163 rb_encoding *usascii = rb_usascii_encoding();
03164
03165 while (bi <= ei) {
03166 if (excl && bi == ei) break;
03167 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03168 bi++;
03169 }
03170 }
03171 else {
03172 ID op = excl ? '<' : rb_intern("<=");
03173 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03174
03175 args[0] = INT2FIX(width);
03176 while (rb_funcall(b, op, 1, e)) {
03177 args[1] = b;
03178 rb_yield(rb_str_format(numberof(args), args, fmt));
03179 b = rb_funcall(b, succ, 0, 0);
03180 }
03181 }
03182 return beg;
03183 }
03184
03185 no_digits:
03186 n = rb_str_cmp(beg, end);
03187 if (n > 0 || (excl && n == 0)) return beg;
03188
03189 after_end = rb_funcall(end, succ, 0, 0);
03190 current = rb_str_dup(beg);
03191 while (!rb_str_equal(current, after_end)) {
03192 VALUE next = Qnil;
03193 if (excl || !rb_str_equal(current, end))
03194 next = rb_funcall(current, succ, 0, 0);
03195 rb_yield(current);
03196 if (NIL_P(next)) break;
03197 current = next;
03198 StringValue(current);
03199 if (excl && rb_str_equal(current, end)) break;
03200 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03201 break;
03202 }
03203
03204 return beg;
03205 }
03206
03207 static VALUE
03208 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03209 {
03210 if (rb_reg_search(re, str, 0, 0) >= 0) {
03211 VALUE match = rb_backref_get();
03212 int nth = rb_reg_backref_number(match, backref);
03213 return rb_reg_nth_match(nth, match);
03214 }
03215 return Qnil;
03216 }
03217
03218 static VALUE
03219 rb_str_aref(VALUE str, VALUE indx)
03220 {
03221 long idx;
03222
03223 if (FIXNUM_P(indx)) {
03224 idx = FIX2LONG(indx);
03225
03226 num_index:
03227 str = rb_str_substr(str, idx, 1);
03228 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03229 return str;
03230 }
03231
03232 if (SPECIAL_CONST_P(indx)) goto generic;
03233 switch (BUILTIN_TYPE(indx)) {
03234 case T_REGEXP:
03235 return rb_str_subpat(str, indx, INT2FIX(0));
03236
03237 case T_STRING:
03238 if (rb_str_index(str, indx, 0) != -1)
03239 return rb_str_dup(indx);
03240 return Qnil;
03241
03242 generic:
03243 default:
03244
03245 {
03246 long beg, len;
03247 VALUE tmp;
03248
03249 len = str_strlen(str, STR_ENC_GET(str));
03250 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03251 case Qfalse:
03252 break;
03253 case Qnil:
03254 return Qnil;
03255 default:
03256 tmp = rb_str_substr(str, beg, len);
03257 return tmp;
03258 }
03259 }
03260 idx = NUM2LONG(indx);
03261 goto num_index;
03262 }
03263
03264 UNREACHABLE;
03265 }
03266
03267
03268
03269
03270
03271
03272
03273
03274
03275
03276
03277
03278
03279
03280
03281
03282
03283
03284
03285
03286
03287
03288
03289
03290
03291
03292
03293
03294
03295
03296
03297
03298
03299
03300
03301
03302
03303
03304
03305
03306
03307
03308
03309
03310
03311
03312
03313
03314
03315
03316
03317
03318
03319
03320
03321
03322
03323
03324
03325
03326
03327
03328
03329
03330
03331
03332
03333
03334
03335
03336
03337 static VALUE
03338 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03339 {
03340 if (argc == 2) {
03341 if (RB_TYPE_P(argv[0], T_REGEXP)) {
03342 return rb_str_subpat(str, argv[0], argv[1]);
03343 }
03344 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03345 }
03346 rb_check_arity(argc, 1, 2);
03347 return rb_str_aref(str, argv[0]);
03348 }
03349
03350 VALUE
03351 rb_str_drop_bytes(VALUE str, long len)
03352 {
03353 char *ptr = RSTRING_PTR(str);
03354 long olen = RSTRING_LEN(str), nlen;
03355
03356 str_modifiable(str);
03357 if (len > olen) len = olen;
03358 nlen = olen - len;
03359 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03360 char *oldptr = ptr;
03361 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03362 STR_SET_EMBED(str);
03363 STR_SET_EMBED_LEN(str, nlen);
03364 ptr = RSTRING(str)->as.ary;
03365 memmove(ptr, oldptr + len, nlen);
03366 if (fl == STR_NOEMBED) xfree(oldptr);
03367 }
03368 else {
03369 if (!STR_SHARED_P(str)) rb_str_new4(str);
03370 ptr = RSTRING(str)->as.heap.ptr += len;
03371 RSTRING(str)->as.heap.len = nlen;
03372 }
03373 ptr[nlen] = 0;
03374 ENC_CODERANGE_CLEAR(str);
03375 return str;
03376 }
03377
03378 static void
03379 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03380 {
03381 if (beg == 0 && RSTRING_LEN(val) == 0) {
03382 rb_str_drop_bytes(str, len);
03383 OBJ_INFECT(str, val);
03384 return;
03385 }
03386
03387 rb_str_modify(str);
03388 if (len < RSTRING_LEN(val)) {
03389
03390 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03391 }
03392
03393 if (RSTRING_LEN(val) != len) {
03394 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03395 RSTRING_PTR(str) + beg + len,
03396 RSTRING_LEN(str) - (beg + len));
03397 }
03398 if (RSTRING_LEN(val) < beg && len < 0) {
03399 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03400 }
03401 if (RSTRING_LEN(val) > 0) {
03402 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03403 }
03404 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03405 if (RSTRING_PTR(str)) {
03406 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03407 }
03408 OBJ_INFECT(str, val);
03409 }
03410
03411 static void
03412 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03413 {
03414 long slen;
03415 char *p, *e;
03416 rb_encoding *enc;
03417 int singlebyte = single_byte_optimizable(str);
03418 int cr;
03419
03420 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03421
03422 StringValue(val);
03423 enc = rb_enc_check(str, val);
03424 slen = str_strlen(str, enc);
03425
03426 if (slen < beg) {
03427 out_of_range:
03428 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03429 }
03430 if (beg < 0) {
03431 if (-beg > slen) {
03432 goto out_of_range;
03433 }
03434 beg += slen;
03435 }
03436 if (slen < len || slen < beg + len) {
03437 len = slen - beg;
03438 }
03439 str_modify_keep_cr(str);
03440 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03441 if (!p) p = RSTRING_END(str);
03442 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03443 if (!e) e = RSTRING_END(str);
03444
03445 beg = p - RSTRING_PTR(str);
03446 len = e - p;
03447 rb_str_splice_0(str, beg, len, val);
03448 rb_enc_associate(str, enc);
03449 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03450 if (cr != ENC_CODERANGE_BROKEN)
03451 ENC_CODERANGE_SET(str, cr);
03452 }
03453
/*
 * Externally visible wrapper around rb_str_splice(): replaces +len+
 * characters of +str+ starting at character index +beg+ with +val+.
 * All arguments are forwarded unchanged.
 */
void
rb_str_update(VALUE str, long beg, long len, VALUE val)
{
    rb_str_splice(str, beg, len, val);
}
03459
03460 static void
03461 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03462 {
03463 int nth;
03464 VALUE match;
03465 long start, end, len;
03466 rb_encoding *enc;
03467 struct re_registers *regs;
03468
03469 if (rb_reg_search(re, str, 0, 0) < 0) {
03470 rb_raise(rb_eIndexError, "regexp not matched");
03471 }
03472 match = rb_backref_get();
03473 nth = rb_reg_backref_number(match, backref);
03474 regs = RMATCH_REGS(match);
03475 if (nth >= regs->num_regs) {
03476 out_of_range:
03477 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03478 }
03479 if (nth < 0) {
03480 if (-nth >= regs->num_regs) {
03481 goto out_of_range;
03482 }
03483 nth += regs->num_regs;
03484 }
03485
03486 start = BEG(nth);
03487 if (start == -1) {
03488 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03489 }
03490 end = END(nth);
03491 len = end - start;
03492 StringValue(val);
03493 enc = rb_enc_check(str, val);
03494 rb_str_splice_0(str, start, len, val);
03495 rb_enc_associate(str, enc);
03496 }
03497
03498 static VALUE
03499 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03500 {
03501 long idx, beg;
03502
03503 if (FIXNUM_P(indx)) {
03504 idx = FIX2LONG(indx);
03505 num_index:
03506 rb_str_splice(str, idx, 1, val);
03507 return val;
03508 }
03509
03510 if (SPECIAL_CONST_P(indx)) goto generic;
03511 switch (TYPE(indx)) {
03512 case T_REGEXP:
03513 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03514 return val;
03515
03516 case T_STRING:
03517 beg = rb_str_index(str, indx, 0);
03518 if (beg < 0) {
03519 rb_raise(rb_eIndexError, "string not matched");
03520 }
03521 beg = rb_str_sublen(str, beg);
03522 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03523 return val;
03524
03525 generic:
03526 default:
03527
03528 {
03529 long beg, len;
03530 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03531 rb_str_splice(str, beg, len, val);
03532 return val;
03533 }
03534 }
03535 idx = NUM2LONG(indx);
03536 goto num_index;
03537 }
03538 }
03539
03540
03541
03542
03543
03544
03545
03546
03547
03548
03549
03550
03551
03552
03553
03554
03555
03556
03557
03558
03559
03560
03561
03562
03563
03564
03565 static VALUE
03566 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03567 {
03568 if (argc == 3) {
03569 if (RB_TYPE_P(argv[0], T_REGEXP)) {
03570 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03571 }
03572 else {
03573 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03574 }
03575 return argv[2];
03576 }
03577 rb_check_arity(argc, 2, 3);
03578 return rb_str_aset(str, argv[0], argv[1]);
03579 }
03580
03581
03582
03583
03584
03585
03586
03587
03588
03589
03590
03591
03592
03593
03594
03595
03596
03597
03598 static VALUE
03599 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03600 {
03601 long pos = NUM2LONG(idx);
03602
03603 if (pos == -1) {
03604 return rb_str_append(str, str2);
03605 }
03606 else if (pos < 0) {
03607 pos++;
03608 }
03609 rb_str_splice(str, pos, 0, str2);
03610 return str;
03611 }
03612
03613
03614
03615
03616
03617
03618
03619
03620
03621
03622
03623
03624
03625
03626
03627
03628
03629
03630
03631
03632
03633 static VALUE
03634 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03635 {
03636 VALUE result;
03637 VALUE buf[3];
03638 int i;
03639
03640 rb_check_arity(argc, 1, 2);
03641 for (i=0; i<argc; i++) {
03642 buf[i] = argv[i];
03643 }
03644 str_modify_keep_cr(str);
03645 result = rb_str_aref_m(argc, buf, str);
03646 if (!NIL_P(result)) {
03647 buf[i] = rb_str_new(0,0);
03648 rb_str_aset_m(argc+1, buf, str);
03649 }
03650 return result;
03651 }
03652
03653 static VALUE
03654 get_pat(VALUE pat, int quote)
03655 {
03656 VALUE val;
03657
03658 switch (TYPE(pat)) {
03659 case T_REGEXP:
03660 return pat;
03661
03662 case T_STRING:
03663 break;
03664
03665 default:
03666 val = rb_check_string_type(pat);
03667 if (NIL_P(val)) {
03668 Check_Type(pat, T_REGEXP);
03669 }
03670 pat = val;
03671 }
03672
03673 if (quote) {
03674 pat = rb_reg_quote(pat);
03675 }
03676
03677 return rb_reg_regcomp(pat);
03678 }
03679
03680
03681
03682
03683
03684
03685
03686
03687
03688
03689
03690
03691
03692 static VALUE
03693 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03694 {
03695 VALUE pat, repl, hash = Qnil;
03696 int iter = 0;
03697 int tainted = 0;
03698 int untrusted = 0;
03699 long plen;
03700 int min_arity = rb_block_given_p() ? 1 : 2;
03701
03702 rb_check_arity(argc, min_arity, 2);
03703 if (argc == 1) {
03704 iter = 1;
03705 }
03706 else {
03707 repl = argv[1];
03708 hash = rb_check_hash_type(argv[1]);
03709 if (NIL_P(hash)) {
03710 StringValue(repl);
03711 }
03712 if (OBJ_TAINTED(repl)) tainted = 1;
03713 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03714 }
03715
03716 pat = get_pat(argv[0], 1);
03717 str_modifiable(str);
03718 if (rb_reg_search(pat, str, 0, 0) >= 0) {
03719 rb_encoding *enc;
03720 int cr = ENC_CODERANGE(str);
03721 VALUE match = rb_backref_get();
03722 struct re_registers *regs = RMATCH_REGS(match);
03723 long beg0 = BEG(0);
03724 long end0 = END(0);
03725 char *p, *rp;
03726 long len, rlen;
03727
03728 if (iter || !NIL_P(hash)) {
03729 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03730
03731 if (iter) {
03732 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03733 }
03734 else {
03735 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03736 repl = rb_obj_as_string(repl);
03737 }
03738 str_mod_check(str, p, len);
03739 rb_check_frozen(str);
03740 }
03741 else {
03742 repl = rb_reg_regsub(repl, str, regs, pat);
03743 }
03744 enc = rb_enc_compatible(str, repl);
03745 if (!enc) {
03746 rb_encoding *str_enc = STR_ENC_GET(str);
03747 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03748 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03749 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03750 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03751 rb_enc_name(str_enc),
03752 rb_enc_name(STR_ENC_GET(repl)));
03753 }
03754 enc = STR_ENC_GET(repl);
03755 }
03756 rb_str_modify(str);
03757 rb_enc_associate(str, enc);
03758 if (OBJ_TAINTED(repl)) tainted = 1;
03759 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03760 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03761 int cr2 = ENC_CODERANGE(repl);
03762 if (cr2 == ENC_CODERANGE_BROKEN ||
03763 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03764 cr = ENC_CODERANGE_UNKNOWN;
03765 else
03766 cr = cr2;
03767 }
03768 plen = end0 - beg0;
03769 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03770 len = RSTRING_LEN(str);
03771 if (rlen > plen) {
03772 RESIZE_CAPA(str, len + rlen - plen);
03773 }
03774 p = RSTRING_PTR(str);
03775 if (rlen != plen) {
03776 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03777 }
03778 memcpy(p + beg0, rp, rlen);
03779 len += rlen - plen;
03780 STR_SET_LEN(str, len);
03781 RSTRING_PTR(str)[len] = '\0';
03782 ENC_CODERANGE_SET(str, cr);
03783 if (tainted) OBJ_TAINT(str);
03784 if (untrusted) OBJ_UNTRUST(str);
03785
03786 return str;
03787 }
03788 return Qnil;
03789 }
03790
03791
03792
03793
03794
03795
03796
03797
03798
03799
03800
03801
03802
03803
03804
03805
03806
03807
03808
03809
03810
03811
03812
03813
03814
03815
03816
03817
03818
03819
03820
03821
03822
03823
03824
03825
03826
03827
03828
03829
03830
03831 static VALUE
03832 rb_str_sub(int argc, VALUE *argv, VALUE str)
03833 {
03834 str = rb_str_dup(str);
03835 rb_str_sub_bang(argc, argv, str);
03836 return str;
03837 }
03838
03839 static VALUE
03840 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03841 {
03842 VALUE pat, val, repl, match, dest, hash = Qnil;
03843 struct re_registers *regs;
03844 long beg, n;
03845 long beg0, end0;
03846 long offset, blen, slen, len, last;
03847 int iter = 0;
03848 char *sp, *cp;
03849 int tainted = 0;
03850 rb_encoding *str_enc;
03851
03852 switch (argc) {
03853 case 1:
03854 RETURN_ENUMERATOR(str, argc, argv);
03855 iter = 1;
03856 break;
03857 case 2:
03858 repl = argv[1];
03859 hash = rb_check_hash_type(argv[1]);
03860 if (NIL_P(hash)) {
03861 StringValue(repl);
03862 }
03863 if (OBJ_TAINTED(repl)) tainted = 1;
03864 break;
03865 default:
03866 rb_check_arity(argc, 1, 2);
03867 }
03868
03869 pat = get_pat(argv[0], 1);
03870 beg = rb_reg_search(pat, str, 0, 0);
03871 if (beg < 0) {
03872 if (bang) return Qnil;
03873 return rb_str_dup(str);
03874 }
03875
03876 offset = 0;
03877 n = 0;
03878 blen = RSTRING_LEN(str) + 30;
03879 dest = rb_str_buf_new(blen);
03880 sp = RSTRING_PTR(str);
03881 slen = RSTRING_LEN(str);
03882 cp = sp;
03883 str_enc = STR_ENC_GET(str);
03884 rb_enc_associate(dest, str_enc);
03885 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
03886
03887 do {
03888 n++;
03889 match = rb_backref_get();
03890 regs = RMATCH_REGS(match);
03891 beg0 = BEG(0);
03892 end0 = END(0);
03893 if (iter || !NIL_P(hash)) {
03894 if (iter) {
03895 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03896 }
03897 else {
03898 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03899 val = rb_obj_as_string(val);
03900 }
03901 str_mod_check(str, sp, slen);
03902 if (val == dest) {
03903 rb_raise(rb_eRuntimeError, "block should not cheat");
03904 }
03905 }
03906 else {
03907 val = rb_reg_regsub(repl, str, regs, pat);
03908 }
03909
03910 if (OBJ_TAINTED(val)) tainted = 1;
03911
03912 len = beg0 - offset;
03913 if (len) {
03914 rb_enc_str_buf_cat(dest, cp, len, str_enc);
03915 }
03916
03917 rb_str_buf_append(dest, val);
03918
03919 last = offset;
03920 offset = end0;
03921 if (beg0 == end0) {
03922
03923
03924
03925
03926 if (RSTRING_LEN(str) <= end0) break;
03927 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03928 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03929 offset = end0 + len;
03930 }
03931 cp = RSTRING_PTR(str) + offset;
03932 if (offset > RSTRING_LEN(str)) break;
03933 beg = rb_reg_search(pat, str, offset, 0);
03934 } while (beg >= 0);
03935 if (RSTRING_LEN(str) > offset) {
03936 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03937 }
03938 rb_reg_search(pat, str, last, 0);
03939 if (bang) {
03940 rb_str_shared_replace(str, dest);
03941 }
03942 else {
03943 RBASIC(dest)->klass = rb_obj_class(str);
03944 OBJ_INFECT(dest, str);
03945 str = dest;
03946 }
03947
03948 if (tainted) OBJ_TAINT(str);
03949 return str;
03950 }
03951
03952
03953
03954
03955
03956
03957
03958
03959
03960
03961
03962
03963
03964 static VALUE
03965 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03966 {
03967 str_modify_keep_cr(str);
03968 return str_gsub(argc, argv, str, 1);
03969 }
03970
03971
03972
03973
03974
03975
03976
03977
03978
03979
03980
03981
03982
03983
03984
03985
03986
03987
03988
03989
03990
03991
03992
03993
03994
03995
03996
03997
03998
03999
04000
04001
04002
04003
04004
04005
04006
04007
04008
04009
04010
04011
04012
04013
04014
04015 static VALUE
04016 rb_str_gsub(int argc, VALUE *argv, VALUE str)
04017 {
04018 return str_gsub(argc, argv, str, 0);
04019 }
04020
04021
04022
04023
04024
04025
04026
04027
04028
04029
04030
04031
04032
04033 VALUE
04034 rb_str_replace(VALUE str, VALUE str2)
04035 {
04036 str_modifiable(str);
04037 if (str == str2) return str;
04038
04039 StringValue(str2);
04040 str_discard(str);
04041 return str_replace(str, str2);
04042 }
04043
04044
04045
04046
04047
04048
04049
04050
04051
04052
04053
04054 static VALUE
04055 rb_str_clear(VALUE str)
04056 {
04057 str_discard(str);
04058 STR_SET_EMBED(str);
04059 STR_SET_EMBED_LEN(str, 0);
04060 RSTRING_PTR(str)[0] = 0;
04061 if (rb_enc_asciicompat(STR_ENC_GET(str)))
04062 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04063 else
04064 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04065 return str;
04066 }
04067
04068
04069
04070
04071
04072
04073
04074
04075
04076
04077
04078 static VALUE
04079 rb_str_chr(VALUE str)
04080 {
04081 return rb_str_substr(str, 0, 1);
04082 }
04083
04084
04085
04086
04087
04088
04089
04090 static VALUE
04091 rb_str_getbyte(VALUE str, VALUE index)
04092 {
04093 long pos = NUM2LONG(index);
04094
04095 if (pos < 0)
04096 pos += RSTRING_LEN(str);
04097 if (pos < 0 || RSTRING_LEN(str) <= pos)
04098 return Qnil;
04099
04100 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
04101 }
04102
04103
04104
04105
04106
04107
04108
04109 static VALUE
04110 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
04111 {
04112 long pos = NUM2LONG(index);
04113 int byte = NUM2INT(value);
04114
04115 rb_str_modify(str);
04116
04117 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04118 rb_raise(rb_eIndexError, "index %ld out of string", pos);
04119 if (pos < 0)
04120 pos += RSTRING_LEN(str);
04121
04122 RSTRING_PTR(str)[pos] = byte;
04123
04124 return value;
04125 }
04126
04127 static VALUE
04128 str_byte_substr(VALUE str, long beg, long len)
04129 {
04130 char *p, *s = RSTRING_PTR(str);
04131 long n = RSTRING_LEN(str);
04132 VALUE str2;
04133
04134 if (beg > n || len < 0) return Qnil;
04135 if (beg < 0) {
04136 beg += n;
04137 if (beg < 0) return Qnil;
04138 }
04139 if (beg + len > n)
04140 len = n - beg;
04141 if (len <= 0) {
04142 len = 0;
04143 p = 0;
04144 }
04145 else
04146 p = s + beg;
04147
04148 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04149 str2 = rb_str_new4(str);
04150 str2 = str_new3(rb_obj_class(str2), str2);
04151 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04152 RSTRING(str2)->as.heap.len = len;
04153 }
04154 else {
04155 str2 = rb_str_new5(str, p, len);
04156 }
04157
04158 str_enc_copy(str2, str);
04159
04160 if (RSTRING_LEN(str2) == 0) {
04161 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
04162 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
04163 else
04164 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04165 }
04166 else {
04167 switch (ENC_CODERANGE(str)) {
04168 case ENC_CODERANGE_7BIT:
04169 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04170 break;
04171 default:
04172 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
04173 break;
04174 }
04175 }
04176
04177 OBJ_INFECT(str2, str);
04178
04179 return str2;
04180 }
04181
04182 static VALUE
04183 str_byte_aref(VALUE str, VALUE indx)
04184 {
04185 long idx;
04186 switch (TYPE(indx)) {
04187 case T_FIXNUM:
04188 idx = FIX2LONG(indx);
04189
04190 num_index:
04191 str = str_byte_substr(str, idx, 1);
04192 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04193 return str;
04194
04195 default:
04196
04197 {
04198 long beg, len = RSTRING_LEN(str);
04199
04200 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04201 case Qfalse:
04202 break;
04203 case Qnil:
04204 return Qnil;
04205 default:
04206 return str_byte_substr(str, beg, len);
04207 }
04208 }
04209 idx = NUM2LONG(indx);
04210 goto num_index;
04211 }
04212
04213 UNREACHABLE;
04214 }
04215
04216
04217
04218
04219
04220
04221
04222
04223
04224
04225
04226
04227
04228
04229
04230
04231
04232
04233
04234
04235
04236
04237
04238
04239 static VALUE
04240 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04241 {
04242 if (argc == 2) {
04243 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04244 }
04245 rb_check_arity(argc, 1, 2);
04246 return str_byte_aref(str, argv[0]);
04247 }
04248
04249
04250
04251
04252
04253
04254
04255
04256
04257
04258 static VALUE
04259 rb_str_reverse(VALUE str)
04260 {
04261 rb_encoding *enc;
04262 VALUE rev;
04263 char *s, *e, *p;
04264 int single = 1;
04265
04266 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04267 enc = STR_ENC_GET(str);
04268 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04269 s = RSTRING_PTR(str); e = RSTRING_END(str);
04270 p = RSTRING_END(rev);
04271
04272 if (RSTRING_LEN(str) > 1) {
04273 if (single_byte_optimizable(str)) {
04274 while (s < e) {
04275 *--p = *s++;
04276 }
04277 }
04278 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04279 while (s < e) {
04280 int clen = rb_enc_fast_mbclen(s, e, enc);
04281
04282 if (clen > 1 || (*s & 0x80)) single = 0;
04283 p -= clen;
04284 memcpy(p, s, clen);
04285 s += clen;
04286 }
04287 }
04288 else {
04289 while (s < e) {
04290 int clen = rb_enc_mbclen(s, e, enc);
04291
04292 if (clen > 1 || (*s & 0x80)) single = 0;
04293 p -= clen;
04294 memcpy(p, s, clen);
04295 s += clen;
04296 }
04297 }
04298 }
04299 STR_SET_LEN(rev, RSTRING_LEN(str));
04300 OBJ_INFECT(rev, str);
04301 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04302 if (single) {
04303 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04304 }
04305 else {
04306 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04307 }
04308 }
04309 rb_enc_cr_str_copy_for_substr(rev, str);
04310
04311 return rev;
04312 }
04313
04314
04315
04316
04317
04318
04319
04320
04321
04322 static VALUE
04323 rb_str_reverse_bang(VALUE str)
04324 {
04325 if (RSTRING_LEN(str) > 1) {
04326 if (single_byte_optimizable(str)) {
04327 char *s, *e, c;
04328
04329 str_modify_keep_cr(str);
04330 s = RSTRING_PTR(str);
04331 e = RSTRING_END(str) - 1;
04332 while (s < e) {
04333 c = *s;
04334 *s++ = *e;
04335 *e-- = c;
04336 }
04337 }
04338 else {
04339 rb_str_shared_replace(str, rb_str_reverse(str));
04340 }
04341 }
04342 else {
04343 str_modify_keep_cr(str);
04344 }
04345 return str;
04346 }
04347
04348
04349
04350
04351
04352
04353
04354
04355
04356
04357
04358
04359
04360
04361 static VALUE
04362 rb_str_include(VALUE str, VALUE arg)
04363 {
04364 long i;
04365
04366 StringValue(arg);
04367 i = rb_str_index(str, arg, 0);
04368
04369 if (i == -1) return Qfalse;
04370 return Qtrue;
04371 }
04372
04373
04374
04375
04376
04377
04378
04379
04380
04381
04382
04383
04384
04385
04386
04387
04388
04389
04390
04391
04392
04393
04394
04395 static VALUE
04396 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04397 {
04398 int base;
04399
04400 if (argc == 0) base = 10;
04401 else {
04402 VALUE b;
04403
04404 rb_scan_args(argc, argv, "01", &b);
04405 base = NUM2INT(b);
04406 }
04407 if (base < 0) {
04408 rb_raise(rb_eArgError, "invalid radix %d", base);
04409 }
04410 return rb_str_to_inum(str, base, FALSE);
04411 }
04412
04413
04414
04415
04416
04417
04418
04419
04420
04421
04422
04423
04424
04425
04426
04427
04428 static VALUE
04429 rb_str_to_f(VALUE str)
04430 {
04431 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04432 }
04433
04434
04435
04436
04437
04438
04439
04440
04441
04442
04443 static VALUE
04444 rb_str_to_s(VALUE str)
04445 {
04446 if (rb_obj_class(str) != rb_cString) {
04447 return str_duplicate(rb_cString, str);
04448 }
04449 return str;
04450 }
04451
#if 0
/*
 * Disabled helper (compiled out): encodes code point +c+ in +enc+ into a
 * temporary buffer and appends the encoded bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
04463
04464 #define CHAR_ESC_LEN 13
04465
04466 int
04467 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04468 {
04469 char buf[CHAR_ESC_LEN + 1];
04470 int l;
04471
04472 #if SIZEOF_INT > 4
04473 c &= 0xffffffff;
04474 #endif
04475 if (unicode_p) {
04476 if (c < 0x7F && ISPRINT(c)) {
04477 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04478 }
04479 else if (c < 0x10000) {
04480 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04481 }
04482 else {
04483 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04484 }
04485 }
04486 else {
04487 if (c < 0x100) {
04488 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04489 }
04490 else {
04491 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04492 }
04493 }
04494 l = (int)strlen(buf);
04495 rb_str_buf_cat(result, buf, l);
04496 return l;
04497 }
04498
04499
04500
04501
04502
04503
04504
04505
04506
04507
04508
04509
04510
04511 VALUE
04512 rb_str_inspect(VALUE str)
04513 {
04514 rb_encoding *enc = STR_ENC_GET(str);
04515 const char *p, *pend, *prev;
04516 char buf[CHAR_ESC_LEN + 1];
04517 VALUE result = rb_str_buf_new(0);
04518 rb_encoding *resenc = rb_default_internal_encoding();
04519 int unicode_p = rb_enc_unicode_p(enc);
04520 int asciicompat = rb_enc_asciicompat(enc);
04521 static rb_encoding *utf16, *utf32;
04522
04523 if (!utf16) utf16 = rb_enc_find("UTF-16");
04524 if (!utf32) utf32 = rb_enc_find("UTF-32");
04525 if (resenc == NULL) resenc = rb_default_external_encoding();
04526 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04527 rb_enc_associate(result, resenc);
04528 str_buf_cat2(result, "\"");
04529
04530 p = RSTRING_PTR(str); pend = RSTRING_END(str);
04531 prev = p;
04532 if (enc == utf16) {
04533 const unsigned char *q = (const unsigned char *)p;
04534 if (q[0] == 0xFE && q[1] == 0xFF)
04535 enc = rb_enc_find("UTF-16BE");
04536 else if (q[0] == 0xFF && q[1] == 0xFE)
04537 enc = rb_enc_find("UTF-16LE");
04538 else
04539 unicode_p = 0;
04540 }
04541 else if (enc == utf32) {
04542 const unsigned char *q = (const unsigned char *)p;
04543 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04544 enc = rb_enc_find("UTF-32BE");
04545 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04546 enc = rb_enc_find("UTF-32LE");
04547 else
04548 unicode_p = 0;
04549 }
04550 while (p < pend) {
04551 unsigned int c, cc;
04552 int n;
04553
04554 n = rb_enc_precise_mbclen(p, pend, enc);
04555 if (!MBCLEN_CHARFOUND_P(n)) {
04556 if (p > prev) str_buf_cat(result, prev, p - prev);
04557 n = rb_enc_mbminlen(enc);
04558 if (pend < p + n)
04559 n = (int)(pend - p);
04560 while (n--) {
04561 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04562 str_buf_cat(result, buf, strlen(buf));
04563 prev = ++p;
04564 }
04565 continue;
04566 }
04567 n = MBCLEN_CHARFOUND_LEN(n);
04568 c = rb_enc_mbc_to_codepoint(p, pend, enc);
04569 p += n;
04570 if ((asciicompat || unicode_p) &&
04571 (c == '"'|| c == '\\' ||
04572 (c == '#' &&
04573 p < pend &&
04574 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04575 (cc = rb_enc_codepoint(p,pend,enc),
04576 (cc == '$' || cc == '@' || cc == '{'))))) {
04577 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04578 str_buf_cat2(result, "\\");
04579 if (asciicompat || enc == resenc) {
04580 prev = p - n;
04581 continue;
04582 }
04583 }
04584 switch (c) {
04585 case '\n': cc = 'n'; break;
04586 case '\r': cc = 'r'; break;
04587 case '\t': cc = 't'; break;
04588 case '\f': cc = 'f'; break;
04589 case '\013': cc = 'v'; break;
04590 case '\010': cc = 'b'; break;
04591 case '\007': cc = 'a'; break;
04592 case 033: cc = 'e'; break;
04593 default: cc = 0; break;
04594 }
04595 if (cc) {
04596 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04597 buf[0] = '\\';
04598 buf[1] = (char)cc;
04599 str_buf_cat(result, buf, 2);
04600 prev = p;
04601 continue;
04602 }
04603 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04604 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04605 continue;
04606 }
04607 else {
04608 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04609 rb_str_buf_cat_escaped_char(result, c, unicode_p);
04610 prev = p;
04611 continue;
04612 }
04613 }
04614 if (p > prev) str_buf_cat(result, prev, p - prev);
04615 str_buf_cat2(result, "\"");
04616
04617 OBJ_INFECT(result, str);
04618 return result;
04619 }
04620
04621 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04622
04623
04624
04625
04626
04627
04628
04629
04630
04631
04632
04633 VALUE
04634 rb_str_dump(VALUE str)
04635 {
04636 rb_encoding *enc = rb_enc_get(str);
04637 long len;
04638 const char *p, *pend;
04639 char *q, *qend;
04640 VALUE result;
04641 int u8 = (enc == rb_utf8_encoding());
04642
04643 len = 2;
04644 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04645 while (p < pend) {
04646 unsigned char c = *p++;
04647 switch (c) {
04648 case '"': case '\\':
04649 case '\n': case '\r':
04650 case '\t': case '\f':
04651 case '\013': case '\010': case '\007': case '\033':
04652 len += 2;
04653 break;
04654
04655 case '#':
04656 len += IS_EVSTR(p, pend) ? 2 : 1;
04657 break;
04658
04659 default:
04660 if (ISPRINT(c)) {
04661 len++;
04662 }
04663 else {
04664 if (u8) {
04665 int n = rb_enc_precise_mbclen(p-1, pend, enc);
04666 if (MBCLEN_CHARFOUND_P(n-1)) {
04667 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04668 while (cc >>= 4) len++;
04669 len += 5;
04670 p += MBCLEN_CHARFOUND_LEN(n)-1;
04671 break;
04672 }
04673 }
04674 len += 4;
04675 }
04676 break;
04677 }
04678 }
04679 if (!rb_enc_asciicompat(enc)) {
04680 len += 19;
04681 len += strlen(enc->name);
04682 }
04683
04684 result = rb_str_new5(str, 0, len);
04685 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04686 q = RSTRING_PTR(result); qend = q + len + 1;
04687
04688 *q++ = '"';
04689 while (p < pend) {
04690 unsigned char c = *p++;
04691
04692 if (c == '"' || c == '\\') {
04693 *q++ = '\\';
04694 *q++ = c;
04695 }
04696 else if (c == '#') {
04697 if (IS_EVSTR(p, pend)) *q++ = '\\';
04698 *q++ = '#';
04699 }
04700 else if (c == '\n') {
04701 *q++ = '\\';
04702 *q++ = 'n';
04703 }
04704 else if (c == '\r') {
04705 *q++ = '\\';
04706 *q++ = 'r';
04707 }
04708 else if (c == '\t') {
04709 *q++ = '\\';
04710 *q++ = 't';
04711 }
04712 else if (c == '\f') {
04713 *q++ = '\\';
04714 *q++ = 'f';
04715 }
04716 else if (c == '\013') {
04717 *q++ = '\\';
04718 *q++ = 'v';
04719 }
04720 else if (c == '\010') {
04721 *q++ = '\\';
04722 *q++ = 'b';
04723 }
04724 else if (c == '\007') {
04725 *q++ = '\\';
04726 *q++ = 'a';
04727 }
04728 else if (c == '\033') {
04729 *q++ = '\\';
04730 *q++ = 'e';
04731 }
04732 else if (ISPRINT(c)) {
04733 *q++ = c;
04734 }
04735 else {
04736 *q++ = '\\';
04737 if (u8) {
04738 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04739 if (MBCLEN_CHARFOUND_P(n)) {
04740 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04741 p += n;
04742 snprintf(q, qend-q, "u{%x}", cc);
04743 q += strlen(q);
04744 continue;
04745 }
04746 }
04747 snprintf(q, qend-q, "x%02X", c);
04748 q += 3;
04749 }
04750 }
04751 *q++ = '"';
04752 *q = '\0';
04753 if (!rb_enc_asciicompat(enc)) {
04754 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04755 enc = rb_ascii8bit_encoding();
04756 }
04757 OBJ_INFECT(result, str);
04758
04759 rb_enc_associate(result, enc);
04760 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04761 return result;
04762 }
04763
04764
04765 static void
04766 rb_str_check_dummy_enc(rb_encoding *enc)
04767 {
04768 if (rb_enc_dummy_p(enc)) {
04769 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04770 rb_enc_name(enc));
04771 }
04772 }
04773
04774
04775
04776
04777
04778
04779
04780
04781
04782
04783 static VALUE
04784 rb_str_upcase_bang(VALUE str)
04785 {
04786 rb_encoding *enc;
04787 char *s, *send;
04788 int modify = 0;
04789 int n;
04790
04791 str_modify_keep_cr(str);
04792 enc = STR_ENC_GET(str);
04793 rb_str_check_dummy_enc(enc);
04794 s = RSTRING_PTR(str); send = RSTRING_END(str);
04795 if (single_byte_optimizable(str)) {
04796 while (s < send) {
04797 unsigned int c = *(unsigned char*)s;
04798
04799 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04800 *s = 'A' + (c - 'a');
04801 modify = 1;
04802 }
04803 s++;
04804 }
04805 }
04806 else {
04807 int ascompat = rb_enc_asciicompat(enc);
04808
04809 while (s < send) {
04810 unsigned int c;
04811
04812 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04813 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04814 *s = 'A' + (c - 'a');
04815 modify = 1;
04816 }
04817 s++;
04818 }
04819 else {
04820 c = rb_enc_codepoint_len(s, send, &n, enc);
04821 if (rb_enc_islower(c, enc)) {
04822
04823 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04824 modify = 1;
04825 }
04826 s += n;
04827 }
04828 }
04829 }
04830
04831 if (modify) return str;
04832 return Qnil;
04833 }
04834
04835
04836
04837
04838
04839
04840
04841
04842
04843
04844
04845
04846
04847
04848 static VALUE
04849 rb_str_upcase(VALUE str)
04850 {
04851 str = rb_str_dup(str);
04852 rb_str_upcase_bang(str);
04853 return str;
04854 }
04855
04856
04857
04858
04859
04860
04861
04862
04863
04864
04865
04866 static VALUE
04867 rb_str_downcase_bang(VALUE str)
04868 {
04869 rb_encoding *enc;
04870 char *s, *send;
04871 int modify = 0;
04872
04873 str_modify_keep_cr(str);
04874 enc = STR_ENC_GET(str);
04875 rb_str_check_dummy_enc(enc);
04876 s = RSTRING_PTR(str); send = RSTRING_END(str);
04877 if (single_byte_optimizable(str)) {
04878 while (s < send) {
04879 unsigned int c = *(unsigned char*)s;
04880
04881 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04882 *s = 'a' + (c - 'A');
04883 modify = 1;
04884 }
04885 s++;
04886 }
04887 }
04888 else {
04889 int ascompat = rb_enc_asciicompat(enc);
04890
04891 while (s < send) {
04892 unsigned int c;
04893 int n;
04894
04895 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04896 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04897 *s = 'a' + (c - 'A');
04898 modify = 1;
04899 }
04900 s++;
04901 }
04902 else {
04903 c = rb_enc_codepoint_len(s, send, &n, enc);
04904 if (rb_enc_isupper(c, enc)) {
04905
04906 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04907 modify = 1;
04908 }
04909 s += n;
04910 }
04911 }
04912 }
04913
04914 if (modify) return str;
04915 return Qnil;
04916 }
04917
04918
04919
04920
04921
04922
04923
04924
04925
04926
04927
04928
04929
04930
04931 static VALUE
04932 rb_str_downcase(VALUE str)
04933 {
04934 str = rb_str_dup(str);
04935 rb_str_downcase_bang(str);
04936 return str;
04937 }
04938
04939
04940
04941
04942
04943
04944
04945
04946
04947
04948
04949
04950
04951
04952
04953
04954 static VALUE
04955 rb_str_capitalize_bang(VALUE str)
04956 {
04957 rb_encoding *enc;
04958 char *s, *send;
04959 int modify = 0;
04960 unsigned int c;
04961 int n;
04962
04963 str_modify_keep_cr(str);
04964 enc = STR_ENC_GET(str);
04965 rb_str_check_dummy_enc(enc);
04966 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04967 s = RSTRING_PTR(str); send = RSTRING_END(str);
04968
04969 c = rb_enc_codepoint_len(s, send, &n, enc);
04970 if (rb_enc_islower(c, enc)) {
04971 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04972 modify = 1;
04973 }
04974 s += n;
04975 while (s < send) {
04976 c = rb_enc_codepoint_len(s, send, &n, enc);
04977 if (rb_enc_isupper(c, enc)) {
04978 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04979 modify = 1;
04980 }
04981 s += n;
04982 }
04983
04984 if (modify) return str;
04985 return Qnil;
04986 }
04987
04988
04989
04990
04991
04992
04993
04994
04995
04996
04997
04998
04999
05000
05001
05002 static VALUE
05003 rb_str_capitalize(VALUE str)
05004 {
05005 str = rb_str_dup(str);
05006 rb_str_capitalize_bang(str);
05007 return str;
05008 }
05009
05010
05011
05012
05013
05014
05015
05016
05017
05018
05019
05020 static VALUE
05021 rb_str_swapcase_bang(VALUE str)
05022 {
05023 rb_encoding *enc;
05024 char *s, *send;
05025 int modify = 0;
05026 int n;
05027
05028 str_modify_keep_cr(str);
05029 enc = STR_ENC_GET(str);
05030 rb_str_check_dummy_enc(enc);
05031 s = RSTRING_PTR(str); send = RSTRING_END(str);
05032 while (s < send) {
05033 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
05034
05035 if (rb_enc_isupper(c, enc)) {
05036
05037 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
05038 modify = 1;
05039 }
05040 else if (rb_enc_islower(c, enc)) {
05041
05042 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
05043 modify = 1;
05044 }
05045 s += n;
05046 }
05047
05048 if (modify) return str;
05049 return Qnil;
05050 }
05051
05052
05053
05054
05055
05056
05057
05058
05059
05060
05061
05062
05063
05064
05065 static VALUE
05066 rb_str_swapcase(VALUE str)
05067 {
05068 str = rb_str_dup(str);
05069 rb_str_swapcase_bang(str);
05070 return str;
05071 }
05072
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification (see trnext()). */
struct tr {
    int gen;            /* non-zero while a "lo-hi" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
05080
05081 static unsigned int
05082 trnext(struct tr *t, rb_encoding *enc)
05083 {
05084 int n;
05085
05086 for (;;) {
05087 if (!t->gen) {
05088 nextpart:
05089 if (t->p == t->pend) return -1;
05090 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
05091 t->p += n;
05092 }
05093 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
05094 t->p += n;
05095 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
05096 t->p += n;
05097 if (t->p < t->pend) {
05098 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
05099 t->p += n;
05100 if (t->now > c) {
05101 if (t->now < 0x80 && c < 0x80) {
05102 rb_raise(rb_eArgError,
05103 "invalid range \"%c-%c\" in string transliteration",
05104 t->now, c);
05105 }
05106 else {
05107 rb_raise(rb_eArgError, "invalid range in string transliteration");
05108 }
05109 continue;
05110 }
05111 t->gen = 1;
05112 t->max = c;
05113 }
05114 }
05115 return t->now;
05116 }
05117 else {
05118 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
05119 if (t->now == t->max) {
05120 t->gen = 0;
05121 goto nextpart;
05122 }
05123 }
05124 if (t->now < t->max) {
05125 return t->now;
05126 }
05127 else {
05128 t->gen = 0;
05129 return t->max;
05130 }
05131 }
05132 }
05133 }
05134
05135 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
05136
05137 static VALUE
05138 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
05139 {
05140 const unsigned int errc = -1;
05141 unsigned int trans[256];
05142 rb_encoding *enc, *e1, *e2;
05143 struct tr trsrc, trrepl;
05144 int cflag = 0;
05145 unsigned int c, c0, last = 0;
05146 int modify = 0, i, l;
05147 char *s, *send;
05148 VALUE hash = 0;
05149 int singlebyte = single_byte_optimizable(str);
05150 int cr;
05151
05152 #define CHECK_IF_ASCII(c) \
05153 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05154 (cr = ENC_CODERANGE_VALID) : 0)
05155
05156 StringValue(src);
05157 StringValue(repl);
05158 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05159 if (RSTRING_LEN(repl) == 0) {
05160 return rb_str_delete_bang(1, &src, str);
05161 }
05162
05163 cr = ENC_CODERANGE(str);
05164 e1 = rb_enc_check(str, src);
05165 e2 = rb_enc_check(str, repl);
05166 if (e1 == e2) {
05167 enc = e1;
05168 }
05169 else {
05170 enc = rb_enc_check(src, repl);
05171 }
05172 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05173 if (RSTRING_LEN(src) > 1 &&
05174 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05175 trsrc.p + l < trsrc.pend) {
05176 cflag = 1;
05177 trsrc.p += l;
05178 }
05179 trrepl.p = RSTRING_PTR(repl);
05180 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05181 trsrc.gen = trrepl.gen = 0;
05182 trsrc.now = trrepl.now = 0;
05183 trsrc.max = trrepl.max = 0;
05184
05185 if (cflag) {
05186 for (i=0; i<256; i++) {
05187 trans[i] = 1;
05188 }
05189 while ((c = trnext(&trsrc, enc)) != errc) {
05190 if (c < 256) {
05191 trans[c] = errc;
05192 }
05193 else {
05194 if (!hash) hash = rb_hash_new();
05195 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05196 }
05197 }
05198 while ((c = trnext(&trrepl, enc)) != errc)
05199 ;
05200 last = trrepl.now;
05201 for (i=0; i<256; i++) {
05202 if (trans[i] != errc) {
05203 trans[i] = last;
05204 }
05205 }
05206 }
05207 else {
05208 unsigned int r;
05209
05210 for (i=0; i<256; i++) {
05211 trans[i] = errc;
05212 }
05213 while ((c = trnext(&trsrc, enc)) != errc) {
05214 r = trnext(&trrepl, enc);
05215 if (r == errc) r = trrepl.now;
05216 if (c < 256) {
05217 trans[c] = r;
05218 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05219 }
05220 else {
05221 if (!hash) hash = rb_hash_new();
05222 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05223 }
05224 }
05225 }
05226
05227 if (cr == ENC_CODERANGE_VALID)
05228 cr = ENC_CODERANGE_7BIT;
05229 str_modify_keep_cr(str);
05230 s = RSTRING_PTR(str); send = RSTRING_END(str);
05231 if (sflag) {
05232 int clen, tlen;
05233 long offset, max = RSTRING_LEN(str);
05234 unsigned int save = -1;
05235 char *buf = ALLOC_N(char, max), *t = buf;
05236
05237 while (s < send) {
05238 int may_modify = 0;
05239
05240 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05241 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05242
05243 s += clen;
05244 if (c < 256) {
05245 c = trans[c];
05246 }
05247 else if (hash) {
05248 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05249 if (NIL_P(tmp)) {
05250 if (cflag) c = last;
05251 else c = errc;
05252 }
05253 else if (cflag) c = errc;
05254 else c = NUM2INT(tmp);
05255 }
05256 else {
05257 c = errc;
05258 }
05259 if (c != (unsigned int)-1) {
05260 if (save == c) {
05261 CHECK_IF_ASCII(c);
05262 continue;
05263 }
05264 save = c;
05265 tlen = rb_enc_codelen(c, enc);
05266 modify = 1;
05267 }
05268 else {
05269 save = -1;
05270 c = c0;
05271 if (enc != e1) may_modify = 1;
05272 }
05273 while (t - buf + tlen >= max) {
05274 offset = t - buf;
05275 max *= 2;
05276 REALLOC_N(buf, char, max);
05277 t = buf + offset;
05278 }
05279 rb_enc_mbcput(c, t, enc);
05280 if (may_modify && memcmp(s, t, tlen) != 0) {
05281 modify = 1;
05282 }
05283 CHECK_IF_ASCII(c);
05284 t += tlen;
05285 }
05286 if (!STR_EMBED_P(str)) {
05287 xfree(RSTRING(str)->as.heap.ptr);
05288 }
05289 *t = '\0';
05290 RSTRING(str)->as.heap.ptr = buf;
05291 RSTRING(str)->as.heap.len = t - buf;
05292 STR_SET_NOEMBED(str);
05293 RSTRING(str)->as.heap.aux.capa = max;
05294 }
05295 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05296 while (s < send) {
05297 c = (unsigned char)*s;
05298 if (trans[c] != errc) {
05299 if (!cflag) {
05300 c = trans[c];
05301 *s = c;
05302 modify = 1;
05303 }
05304 else {
05305 *s = last;
05306 modify = 1;
05307 }
05308 }
05309 CHECK_IF_ASCII(c);
05310 s++;
05311 }
05312 }
05313 else {
05314 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05315 long offset;
05316 char *buf = ALLOC_N(char, max), *t = buf;
05317
05318 while (s < send) {
05319 int may_modify = 0;
05320 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05321 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05322
05323 if (c < 256) {
05324 c = trans[c];
05325 }
05326 else if (hash) {
05327 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05328 if (NIL_P(tmp)) {
05329 if (cflag) c = last;
05330 else c = errc;
05331 }
05332 else if (cflag) c = errc;
05333 else c = NUM2INT(tmp);
05334 }
05335 else {
05336 c = cflag ? last : errc;
05337 }
05338 if (c != errc) {
05339 tlen = rb_enc_codelen(c, enc);
05340 modify = 1;
05341 }
05342 else {
05343 c = c0;
05344 if (enc != e1) may_modify = 1;
05345 }
05346 while (t - buf + tlen >= max) {
05347 offset = t - buf;
05348 max *= 2;
05349 REALLOC_N(buf, char, max);
05350 t = buf + offset;
05351 }
05352 if (s != t) {
05353 rb_enc_mbcput(c, t, enc);
05354 if (may_modify && memcmp(s, t, tlen) != 0) {
05355 modify = 1;
05356 }
05357 }
05358 CHECK_IF_ASCII(c);
05359 s += clen;
05360 t += tlen;
05361 }
05362 if (!STR_EMBED_P(str)) {
05363 xfree(RSTRING(str)->as.heap.ptr);
05364 }
05365 *t = '\0';
05366 RSTRING(str)->as.heap.ptr = buf;
05367 RSTRING(str)->as.heap.len = t - buf;
05368 STR_SET_NOEMBED(str);
05369 RSTRING(str)->as.heap.aux.capa = max;
05370 }
05371
05372 if (modify) {
05373 if (cr != ENC_CODERANGE_BROKEN)
05374 ENC_CODERANGE_SET(str, cr);
05375 rb_enc_associate(str, enc);
05376 return str;
05377 }
05378 return Qnil;
05379 }
05380
05381
05382
05383
05384
05385
05386
05387
05388
05389
05390
05391 static VALUE
05392 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05393 {
05394 return tr_trans(str, src, repl, 0);
05395 }
05396
05397
05398
05399
05400
05401
05402
05403
05404
05405
05406
05407
05408
05409
05410
05411
05412
05413
05414
05415
05416
05417
05418
05419
05420
05421
05422
05423
05424
05425
05426
05427
05428
05429
05430
05431
05432
05433 static VALUE
05434 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05435 {
05436 str = rb_str_dup(str);
05437 tr_trans(str, src, repl, 0);
05438 return str;
05439 }
05440
05441 #define TR_TABLE_SIZE 257
05442 static void
05443 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05444 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05445 {
05446 const unsigned int errc = -1;
05447 char buf[256];
05448 struct tr tr;
05449 unsigned int c;
05450 VALUE table = 0, ptable = 0;
05451 int i, l, cflag = 0;
05452
05453 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05454 tr.gen = tr.now = tr.max = 0;
05455
05456 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05457 cflag = 1;
05458 tr.p += l;
05459 }
05460 if (first) {
05461 for (i=0; i<256; i++) {
05462 stable[i] = 1;
05463 }
05464 stable[256] = cflag;
05465 }
05466 else if (stable[256] && !cflag) {
05467 stable[256] = 0;
05468 }
05469 for (i=0; i<256; i++) {
05470 buf[i] = cflag;
05471 }
05472
05473 while ((c = trnext(&tr, enc)) != errc) {
05474 if (c < 256) {
05475 buf[c & 0xff] = !cflag;
05476 }
05477 else {
05478 VALUE key = UINT2NUM(c);
05479
05480 if (!table && (first || *tablep || stable[256])) {
05481 if (cflag) {
05482 ptable = *ctablep;
05483 table = ptable ? ptable : rb_hash_new();
05484 *ctablep = table;
05485 }
05486 else {
05487 table = rb_hash_new();
05488 ptable = *tablep;
05489 *tablep = table;
05490 }
05491 }
05492 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
05493 rb_hash_aset(table, key, Qtrue);
05494 }
05495 }
05496 }
05497 for (i=0; i<256; i++) {
05498 stable[i] = stable[i] && buf[i];
05499 }
05500 if (!table && !cflag) {
05501 *tablep = 0;
05502 }
05503 }
05504
05505
05506 static int
05507 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05508 {
05509 if (c < 256) {
05510 return table[c] != 0;
05511 }
05512 else {
05513 VALUE v = UINT2NUM(c);
05514
05515 if (del) {
05516 if (!NIL_P(rb_hash_lookup(del, v)) &&
05517 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05518 return TRUE;
05519 }
05520 }
05521 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05522 return FALSE;
05523 }
05524 return table[256] ? TRUE : FALSE;
05525 }
05526 }
05527
05528
05529
05530
05531
05532
05533
05534
05535
05536 static VALUE
05537 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05538 {
05539 char squeez[TR_TABLE_SIZE];
05540 rb_encoding *enc = 0;
05541 char *s, *send, *t;
05542 VALUE del = 0, nodel = 0;
05543 int modify = 0;
05544 int i, ascompat, cr;
05545
05546 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05547 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
05548 for (i=0; i<argc; i++) {
05549 VALUE s = argv[i];
05550
05551 StringValue(s);
05552 enc = rb_enc_check(str, s);
05553 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05554 }
05555
05556 str_modify_keep_cr(str);
05557 ascompat = rb_enc_asciicompat(enc);
05558 s = t = RSTRING_PTR(str);
05559 send = RSTRING_END(str);
05560 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05561 while (s < send) {
05562 unsigned int c;
05563 int clen;
05564
05565 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05566 if (squeez[c]) {
05567 modify = 1;
05568 }
05569 else {
05570 if (t != s) *t = c;
05571 t++;
05572 }
05573 s++;
05574 }
05575 else {
05576 c = rb_enc_codepoint_len(s, send, &clen, enc);
05577
05578 if (tr_find(c, squeez, del, nodel)) {
05579 modify = 1;
05580 }
05581 else {
05582 if (t != s) rb_enc_mbcput(c, t, enc);
05583 t += clen;
05584 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05585 }
05586 s += clen;
05587 }
05588 }
05589 *t = '\0';
05590 STR_SET_LEN(str, t - RSTRING_PTR(str));
05591 ENC_CODERANGE_SET(str, cr);
05592
05593 if (modify) return str;
05594 return Qnil;
05595 }
05596
05597
05598
05599
05600
05601
05602
05603
05604
05605
05606
05607
05608
05609
05610
05611
05612 static VALUE
05613 rb_str_delete(int argc, VALUE *argv, VALUE str)
05614 {
05615 str = rb_str_dup(str);
05616 rb_str_delete_bang(argc, argv, str);
05617 return str;
05618 }
05619
05620
05621
05622
05623
05624
05625
05626
05627
05628
05629 static VALUE
05630 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05631 {
05632 char squeez[TR_TABLE_SIZE];
05633 rb_encoding *enc = 0;
05634 VALUE del = 0, nodel = 0;
05635 char *s, *send, *t;
05636 int i, modify = 0;
05637 int ascompat, singlebyte = single_byte_optimizable(str);
05638 unsigned int save;
05639
05640 if (argc == 0) {
05641 enc = STR_ENC_GET(str);
05642 }
05643 else {
05644 for (i=0; i<argc; i++) {
05645 VALUE s = argv[i];
05646
05647 StringValue(s);
05648 enc = rb_enc_check(str, s);
05649 if (singlebyte && !single_byte_optimizable(s))
05650 singlebyte = 0;
05651 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05652 }
05653 }
05654
05655 str_modify_keep_cr(str);
05656 s = t = RSTRING_PTR(str);
05657 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05658 send = RSTRING_END(str);
05659 save = -1;
05660 ascompat = rb_enc_asciicompat(enc);
05661
05662 if (singlebyte) {
05663 while (s < send) {
05664 unsigned int c = *(unsigned char*)s++;
05665 if (c != save || (argc > 0 && !squeez[c])) {
05666 *t++ = save = c;
05667 }
05668 }
05669 } else {
05670 while (s < send) {
05671 unsigned int c;
05672 int clen;
05673
05674 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05675 if (c != save || (argc > 0 && !squeez[c])) {
05676 *t++ = save = c;
05677 }
05678 s++;
05679 }
05680 else {
05681 c = rb_enc_codepoint_len(s, send, &clen, enc);
05682
05683 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05684 if (t != s) rb_enc_mbcput(c, t, enc);
05685 save = c;
05686 t += clen;
05687 }
05688 s += clen;
05689 }
05690 }
05691 }
05692
05693 *t = '\0';
05694 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05695 STR_SET_LEN(str, t - RSTRING_PTR(str));
05696 modify = 1;
05697 }
05698
05699 if (modify) return str;
05700 return Qnil;
05701 }
05702
05703
05704
05705
05706
05707
05708
05709
05710
05711
05712
05713
05714
05715
05716
05717
05718
05719 static VALUE
05720 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05721 {
05722 str = rb_str_dup(str);
05723 rb_str_squeeze_bang(argc, argv, str);
05724 return str;
05725 }
05726
05727
05728
05729
05730
05731
05732
05733
05734
05735
05736 static VALUE
05737 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05738 {
05739 return tr_trans(str, src, repl, 1);
05740 }
05741
05742
05743
05744
05745
05746
05747
05748
05749
05750
05751
05752
05753
05754
05755
05756 static VALUE
05757 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05758 {
05759 str = rb_str_dup(str);
05760 tr_trans(str, src, repl, 1);
05761 return str;
05762 }
05763
05764
05765
05766
05767
05768
05769
05770
05771
05772
05773
05774
05775
05776
05777
05778
05779
05780
05781
05782
05783
05784
05785
05786
05787
05788
05789
05790
05791
05792 static VALUE
05793 rb_str_count(int argc, VALUE *argv, VALUE str)
05794 {
05795 char table[TR_TABLE_SIZE];
05796 rb_encoding *enc = 0;
05797 VALUE del = 0, nodel = 0, tstr;
05798 char *s, *send;
05799 int i;
05800 int ascompat;
05801
05802 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
05803
05804 tstr = argv[0];
05805 StringValue(tstr);
05806 enc = rb_enc_check(str, tstr);
05807 if (argc == 1) {
05808 const char *ptstr;
05809 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05810 (ptstr = RSTRING_PTR(tstr),
05811 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
05812 !is_broken_string(str)) {
05813 int n = 0;
05814 int clen;
05815 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
05816
05817 s = RSTRING_PTR(str);
05818 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05819 send = RSTRING_END(str);
05820 while (s < send) {
05821 if (*(unsigned char*)s++ == c) n++;
05822 }
05823 return INT2NUM(n);
05824 }
05825 }
05826
05827 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
05828 for (i=1; i<argc; i++) {
05829 tstr = argv[i];
05830 StringValue(tstr);
05831 enc = rb_enc_check(str, tstr);
05832 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
05833 }
05834
05835 s = RSTRING_PTR(str);
05836 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05837 send = RSTRING_END(str);
05838 ascompat = rb_enc_asciicompat(enc);
05839 i = 0;
05840 while (s < send) {
05841 unsigned int c;
05842
05843 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05844 if (table[c]) {
05845 i++;
05846 }
05847 s++;
05848 }
05849 else {
05850 int clen;
05851 c = rb_enc_codepoint_len(s, send, &clen, enc);
05852 if (tr_find(c, table, del, nodel)) {
05853 i++;
05854 }
05855 s += clen;
05856 }
05857 }
05858
05859 return INT2NUM(i);
05860 }
05861
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes
 * (0x09-0x0d and 0x20), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Table-driven whitespace test; the argument is narrowed to one byte. */
#define ascii_isspace(c) (isspacetable[(unsigned char)(c)])
05882
05883
05884
05885
05886
05887
05888
05889
05890
05891
05892
05893
05894
05895
05896
05897
05898
05899
05900
05901
05902
05903
05904
05905
05906
05907
05908
05909
05910
05911
05912
05913
05914
05915
05916
05917
05918
05919
05920
05921
05922
05923
05924
05925
05926
05927
05928
05929
05930 static VALUE
05931 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05932 {
05933 rb_encoding *enc;
05934 VALUE spat;
05935 VALUE limit;
05936 enum {awk, string, regexp} split_type;
05937 long beg, end, i = 0;
05938 int lim = 0;
05939 VALUE result, tmp;
05940
05941 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05942 lim = NUM2INT(limit);
05943 if (lim <= 0) limit = Qnil;
05944 else if (lim == 1) {
05945 if (RSTRING_LEN(str) == 0)
05946 return rb_ary_new2(0);
05947 return rb_ary_new3(1, str);
05948 }
05949 i = 1;
05950 }
05951
05952 enc = STR_ENC_GET(str);
05953 if (NIL_P(spat)) {
05954 if (!NIL_P(rb_fs)) {
05955 spat = rb_fs;
05956 goto fs_set;
05957 }
05958 split_type = awk;
05959 }
05960 else {
05961 fs_set:
05962 if (RB_TYPE_P(spat, T_STRING)) {
05963 rb_encoding *enc2 = STR_ENC_GET(spat);
05964
05965 split_type = string;
05966 if (RSTRING_LEN(spat) == 0) {
05967
05968 spat = rb_reg_regcomp(spat);
05969 split_type = regexp;
05970 }
05971 else if (rb_enc_asciicompat(enc2) == 1) {
05972 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05973 split_type = awk;
05974 }
05975 }
05976 else {
05977 int l;
05978 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05979 RSTRING_LEN(spat) == l) {
05980 split_type = awk;
05981 }
05982 }
05983 }
05984 else {
05985 spat = get_pat(spat, 1);
05986 split_type = regexp;
05987 }
05988 }
05989
05990 result = rb_ary_new();
05991 beg = 0;
05992 if (split_type == awk) {
05993 char *ptr = RSTRING_PTR(str);
05994 char *eptr = RSTRING_END(str);
05995 char *bptr = ptr;
05996 int skip = 1;
05997 unsigned int c;
05998
05999 end = beg;
06000 if (is_ascii_string(str)) {
06001 while (ptr < eptr) {
06002 c = (unsigned char)*ptr++;
06003 if (skip) {
06004 if (ascii_isspace(c)) {
06005 beg = ptr - bptr;
06006 }
06007 else {
06008 end = ptr - bptr;
06009 skip = 0;
06010 if (!NIL_P(limit) && lim <= i) break;
06011 }
06012 }
06013 else if (ascii_isspace(c)) {
06014 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06015 skip = 1;
06016 beg = ptr - bptr;
06017 if (!NIL_P(limit)) ++i;
06018 }
06019 else {
06020 end = ptr - bptr;
06021 }
06022 }
06023 }
06024 else {
06025 while (ptr < eptr) {
06026 int n;
06027
06028 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
06029 ptr += n;
06030 if (skip) {
06031 if (rb_isspace(c)) {
06032 beg = ptr - bptr;
06033 }
06034 else {
06035 end = ptr - bptr;
06036 skip = 0;
06037 if (!NIL_P(limit) && lim <= i) break;
06038 }
06039 }
06040 else if (rb_isspace(c)) {
06041 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06042 skip = 1;
06043 beg = ptr - bptr;
06044 if (!NIL_P(limit)) ++i;
06045 }
06046 else {
06047 end = ptr - bptr;
06048 }
06049 }
06050 }
06051 }
06052 else if (split_type == string) {
06053 char *ptr = RSTRING_PTR(str);
06054 char *temp = ptr;
06055 char *eptr = RSTRING_END(str);
06056 char *sptr = RSTRING_PTR(spat);
06057 long slen = RSTRING_LEN(spat);
06058
06059 if (is_broken_string(str)) {
06060 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
06061 }
06062 if (is_broken_string(spat)) {
06063 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
06064 }
06065 enc = rb_enc_check(str, spat);
06066 while (ptr < eptr &&
06067 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
06068
06069 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
06070 if (t != ptr + end) {
06071 ptr = t;
06072 continue;
06073 }
06074 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
06075 ptr += end + slen;
06076 if (!NIL_P(limit) && lim <= ++i) break;
06077 }
06078 beg = ptr - temp;
06079 }
06080 else {
06081 char *ptr = RSTRING_PTR(str);
06082 long len = RSTRING_LEN(str);
06083 long start = beg;
06084 long idx;
06085 int last_null = 0;
06086 struct re_registers *regs;
06087
06088 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
06089 regs = RMATCH_REGS(rb_backref_get());
06090 if (start == end && BEG(0) == END(0)) {
06091 if (!ptr) {
06092 rb_ary_push(result, str_new_empty(str));
06093 break;
06094 }
06095 else if (last_null == 1) {
06096 rb_ary_push(result, rb_str_subseq(str, beg,
06097 rb_enc_fast_mbclen(ptr+beg,
06098 ptr+len,
06099 enc)));
06100 beg = start;
06101 }
06102 else {
06103 if (ptr+start == ptr+len)
06104 start++;
06105 else
06106 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
06107 last_null = 1;
06108 continue;
06109 }
06110 }
06111 else {
06112 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06113 beg = start = END(0);
06114 }
06115 last_null = 0;
06116
06117 for (idx=1; idx < regs->num_regs; idx++) {
06118 if (BEG(idx) == -1) continue;
06119 if (BEG(idx) == END(idx))
06120 tmp = str_new_empty(str);
06121 else
06122 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
06123 rb_ary_push(result, tmp);
06124 }
06125 if (!NIL_P(limit) && lim <= ++i) break;
06126 }
06127 }
06128 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
06129 if (RSTRING_LEN(str) == beg)
06130 tmp = str_new_empty(str);
06131 else
06132 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
06133 rb_ary_push(result, tmp);
06134 }
06135 if (NIL_P(limit) && lim == 0) {
06136 long len;
06137 while ((len = RARRAY_LEN(result)) > 0 &&
06138 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
06139 rb_ary_pop(result);
06140 }
06141
06142 return result;
06143 }
06144
06145 VALUE
06146 rb_str_split(VALUE str, const char *sep0)
06147 {
06148 VALUE sep;
06149
06150 StringValue(str);
06151 sep = rb_str_new2(sep0);
06152 return rb_str_split_m(1, &sep, str);
06153 }
06154
06155
/*
 * Shared implementation behind rb_str_each_line (wantarray == 0) and
 * rb_str_lines (wantarray == 1).
 *
 * The record separator is rb_rs when no argument is given, otherwise
 * the optional argument.  Each line (including its separator) is either
 * yielded to the block or pushed onto an array.
 *
 * Returns the array when wantarray is still set at the end, otherwise
 * the original receiver; without a block and without wantarray an
 * enumerator is returned via RETURN_ENUMERATOR.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
{
    rb_encoding *enc;
    VALUE rs;
    unsigned int newline;
    const char *p, *pend, *s, *ptr;   /* p: scan cursor, s: start of current line */
    long len, rslen;
    VALUE line;
    int n;
    VALUE orig = str;
    VALUE UNINITIALIZED_VAR(ary);

    if (argc == 0) {
        rs = rb_rs;
    }
    else {
        rb_scan_args(argc, argv, "01", &rs);
    }

    if (rb_block_given_p()) {
        if (wantarray) {
#if 0
            rb_warn("given block not used");
            ary = rb_ary_new();
#else
            /* A block passed to the array form: warn and yield instead. */
            rb_warning("passing a block to String#lines is deprecated");
            wantarray = 0;
#endif
        }
    }
    else {
        if (wantarray)
            ary = rb_ary_new();
        else
            RETURN_ENUMERATOR(str, argc, argv);
    }

    /* A nil separator means the whole string is a single line. */
    if (NIL_P(rs)) {
        if (wantarray) {
            rb_ary_push(ary, str);
            return ary;
        }
        else {
            rb_yield(str);
            return orig;
        }
    }
    /* Iterate over the value returned by rb_str_new4, not the receiver. */
    str = rb_str_new4(str);
    ptr = p = s = RSTRING_PTR(str);
    pend = p + RSTRING_LEN(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    if (rs == rb_default_rs) {
        /* Default separator: locate '\n' bytes with memchr. */
        enc = rb_enc_get(str);
        while (p < pend) {
            char *p0;

            p = memchr(p, '\n', pend - p);
            if (!p) break;
            /* Make sure the byte found really is a newline character. */
            p0 = rb_enc_left_char_head(s, p, pend, enc);
            if (!rb_enc_is_newline(p0, pend, enc)) {
                p++;
                continue;
            }
            p = p0 + rb_enc_mbclen(p0, pend, enc);
            line = rb_str_subseq(str, s - ptr, p - s);
            if (wantarray)
                rb_ary_push(ary, line);
            else
                rb_yield(line);
            /* NOTE(review): presumably detects modification of str during the yield -- confirm */
            str_mod_check(str, ptr, len);
            s = p;
        }
        goto finish;
    }

    enc = rb_enc_check(str, rs);
    rslen = RSTRING_LEN(rs);
    if (rslen == 0) {
        /* Empty separator: lines are delimited by runs of '\n'. */
        newline = '\n';
    }
    else {
        /* Otherwise the first character of rs triggers the comparison. */
        newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
    }

    while (p < pend) {
        unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);

      again:
        if (rslen == 0 && c == newline) {
            p += n;
            /* A newline followed by a non-newline does not end the line. */
            if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
                goto again;
            }
            /* Consume the remaining newlines of the run. */
            while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
                p += n;
            }
            p -= n;
        }
        if (c == newline &&
            (rslen <= 1 ||
             (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
            /* The line ends after the separator (or after the newline run). */
            const char *pp = p + (rslen ? rslen : n);
            line = rb_str_subseq(str, s - ptr, pp - s);
            if (wantarray)
                rb_ary_push(ary, line);
            else
                rb_yield(line);
            str_mod_check(str, ptr, len);
            s = pp;
        }
        p += n;
    }

  finish:
    /* Emit whatever follows the last separator. */
    if (s != pend) {
        line = rb_str_subseq(str, s - ptr, pend - s);
        if (wantarray)
            rb_ary_push(ary, line);
        else
            rb_yield(line);
        RB_GC_GUARD(str);
    }

    if (wantarray)
        return ary;
    else
        return orig;
}
06286
06287
06288
06289
06290
06291
06292
06293
06294
06295
06296
06297
06298
06299
06300
06301
06302
06303
06304
06305
06306
06307
06308
06309
06310
06311
06312
06313
06314
06315
06316
06317
06318
06319
06320
06321
06322 static VALUE
06323 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06324 {
06325 return rb_str_enumerate_lines(argc, argv, str, 0);
06326 }
06327
06328
06329
06330
06331
06332
06333
06334
06335
06336
06337
06338
06339
06340 static VALUE
06341 rb_str_lines(int argc, VALUE *argv, VALUE str)
06342 {
06343 return rb_str_enumerate_lines(argc, argv, str, 1);
06344 }
06345
06346 static VALUE
06347 rb_str_each_byte_size(VALUE str, VALUE args)
06348 {
06349 return LONG2FIX(RSTRING_LEN(str));
06350 }
06351
06352 static VALUE
06353 rb_str_enumerate_bytes(VALUE str, int wantarray)
06354 {
06355 long i;
06356 VALUE UNINITIALIZED_VAR(ary);
06357
06358 if (rb_block_given_p()) {
06359 if (wantarray) {
06360 #if 0
06361 rb_warn("given block not used");
06362 ary = rb_ary_new();
06363 #else
06364 rb_warning("passing a block to String#bytes is deprecated");
06365 wantarray = 0;
06366 #endif
06367 }
06368 }
06369 else {
06370 if (wantarray)
06371 ary = rb_ary_new2(RSTRING_LEN(str));
06372 else
06373 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
06374 }
06375
06376 for (i=0; i<RSTRING_LEN(str); i++) {
06377 if (wantarray)
06378 rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06379 else
06380 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06381 }
06382 if (wantarray)
06383 return ary;
06384 else
06385 return str;
06386 }
06387
06388
06389
06390
06391
06392
06393
06394
06395
06396
06397
06398
06399
06400
06401
06402
06403 static VALUE
06404 rb_str_each_byte(VALUE str)
06405 {
06406 return rb_str_enumerate_bytes(str, 0);
06407 }
06408
06409
06410
06411
06412
06413
06414
06415
06416
06417
06418
06419
06420 static VALUE
06421 rb_str_bytes(VALUE str)
06422 {
06423 return rb_str_enumerate_bytes(str, 1);
06424 }
06425
06426 static VALUE
06427 rb_str_each_char_size(VALUE str)
06428 {
06429 long len = RSTRING_LEN(str);
06430 if (!single_byte_optimizable(str)) {
06431 const char *ptr = RSTRING_PTR(str);
06432 rb_encoding *enc = rb_enc_get(str);
06433 const char *end_ptr = ptr + len;
06434 for (len = 0; ptr < end_ptr; ++len) {
06435 ptr += rb_enc_mbclen(ptr, end_ptr, enc);
06436 }
06437 }
06438 return LONG2FIX(len);
06439 }
06440
06441 static VALUE
06442 rb_str_enumerate_chars(VALUE str, int wantarray)
06443 {
06444 VALUE orig = str;
06445 VALUE substr;
06446 long i, len, n;
06447 const char *ptr;
06448 rb_encoding *enc;
06449 VALUE UNINITIALIZED_VAR(ary);
06450
06451 if (rb_block_given_p()) {
06452 if (wantarray) {
06453 #if 0
06454 rb_warn("given block not used");
06455 ary = rb_ary_new();
06456 #else
06457 rb_warning("passing a block to String#chars is deprecated");
06458 wantarray = 0;
06459 #endif
06460 }
06461 }
06462 else {
06463 if (wantarray)
06464 ary = rb_ary_new();
06465 else
06466 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
06467 }
06468
06469 str = rb_str_new4(str);
06470 ptr = RSTRING_PTR(str);
06471 len = RSTRING_LEN(str);
06472 enc = rb_enc_get(str);
06473 switch (ENC_CODERANGE(str)) {
06474 case ENC_CODERANGE_VALID:
06475 case ENC_CODERANGE_7BIT:
06476 for (i = 0; i < len; i += n) {
06477 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06478 substr = rb_str_subseq(str, i, n);
06479 if (wantarray)
06480 rb_ary_push(ary, substr);
06481 else
06482 rb_yield(substr);
06483 }
06484 break;
06485 default:
06486 for (i = 0; i < len; i += n) {
06487 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06488 substr = rb_str_subseq(str, i, n);
06489 if (wantarray)
06490 rb_ary_push(ary, substr);
06491 else
06492 rb_yield(substr);
06493 }
06494 }
06495 RB_GC_GUARD(str);
06496 if (wantarray)
06497 return ary;
06498 else
06499 return orig;
06500 }
06501
06502
06503
06504
06505
06506
06507
06508
06509
06510
06511
06512
06513
06514
06515
06516
06517 static VALUE
06518 rb_str_each_char(VALUE str)
06519 {
06520 return rb_str_enumerate_chars(str, 0);
06521 }
06522
06523
06524
06525
06526
06527
06528
06529
06530
06531
06532
06533
06534 static VALUE
06535 rb_str_chars(VALUE str)
06536 {
06537 return rb_str_enumerate_chars(str, 1);
06538 }
06539
06540
06541 static VALUE
06542 rb_str_enumerate_codepoints(VALUE str, int wantarray)
06543 {
06544 VALUE orig = str;
06545 int n;
06546 unsigned int c;
06547 const char *ptr, *end;
06548 rb_encoding *enc;
06549 VALUE UNINITIALIZED_VAR(ary);
06550
06551 if (single_byte_optimizable(str))
06552 return rb_str_enumerate_bytes(str, wantarray);
06553
06554 if (rb_block_given_p()) {
06555 if (wantarray) {
06556 #if 0
06557 rb_warn("given block not used");
06558 ary = rb_ary_new();
06559 #else
06560 rb_warning("passing a block to String#codepoints is deprecated");
06561 wantarray = 0;
06562 #endif
06563 }
06564 }
06565 else {
06566 if (wantarray)
06567 ary = rb_ary_new();
06568 else
06569 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
06570 }
06571
06572 str = rb_str_new4(str);
06573 ptr = RSTRING_PTR(str);
06574 end = RSTRING_END(str);
06575 enc = STR_ENC_GET(str);
06576 while (ptr < end) {
06577 c = rb_enc_codepoint_len(ptr, end, &n, enc);
06578 if (wantarray)
06579 rb_ary_push(ary, UINT2NUM(c));
06580 else
06581 rb_yield(UINT2NUM(c));
06582 ptr += n;
06583 }
06584 RB_GC_GUARD(str);
06585 if (wantarray)
06586 return ary;
06587 else
06588 return orig;
06589 }
06590
06591
06592
06593
06594
06595
06596
06597
06598
06599
06600
06601
06602
06603
06604
06605
06606
06607
06608
06609 static VALUE
06610 rb_str_each_codepoint(VALUE str)
06611 {
06612 return rb_str_enumerate_codepoints(str, 0);
06613 }
06614
06615
06616
06617
06618
06619
06620
06621
06622
06623
06624
06625
06626
06627 static VALUE
06628 rb_str_codepoints(VALUE str)
06629 {
06630 return rb_str_enumerate_codepoints(str, 1);
06631 }
06632
06633
06634 static long
06635 chopped_length(VALUE str)
06636 {
06637 rb_encoding *enc = STR_ENC_GET(str);
06638 const char *p, *p2, *beg, *end;
06639
06640 beg = RSTRING_PTR(str);
06641 end = beg + RSTRING_LEN(str);
06642 if (beg > end) return 0;
06643 p = rb_enc_prev_char(beg, end, end, enc);
06644 if (!p) return 0;
06645 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06646 p2 = rb_enc_prev_char(beg, p, end, enc);
06647 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06648 }
06649 return p - beg;
06650 }
06651
06652
06653
06654
06655
06656
06657
06658
06659
06660
06661 static VALUE
06662 rb_str_chop_bang(VALUE str)
06663 {
06664 str_modify_keep_cr(str);
06665 if (RSTRING_LEN(str) > 0) {
06666 long len;
06667 len = chopped_length(str);
06668 STR_SET_LEN(str, len);
06669 RSTRING_PTR(str)[len] = '\0';
06670 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06671 ENC_CODERANGE_CLEAR(str);
06672 }
06673 return str;
06674 }
06675 return Qnil;
06676 }
06677
06678
06679
06680
06681
06682
06683
06684
06685
06686
06687
06688
06689
06690
06691
06692
06693
06694
06695
06696 static VALUE
06697 rb_str_chop(VALUE str)
06698 {
06699 return rb_str_subseq(str, 0, chopped_length(str));
06700 }
06701
06702
06703
06704
06705
06706
06707
06708
06709
06710
/*
 * Destructive chomp: removes a trailing record separator from +str+.
 *
 * With no argument the separator is rb_rs.  The default separator (and
 * an explicit "\n") use "smart chomp", which removes a trailing "\n",
 * "\r\n" or "\r".  An empty separator removes all trailing "\n" /
 * "\r\n" sequences.  A nil separator removes nothing.
 *
 * Returns +str+ when something was removed, nil otherwise.
 */
static VALUE
rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE rs;
    int newline;
    char *p, *pp, *e;
    long len, rslen;

    str_modify_keep_cr(str);
    len = RSTRING_LEN(str);
    if (len == 0) return Qnil;
    p = RSTRING_PTR(str);
    e = p + len;
    if (argc == 0) {
        rs = rb_rs;
        if (rs == rb_default_rs) {
            /* Also entered by goto below when rs is exactly "\n". */
          smart_chomp:
            enc = rb_enc_get(str);
            if (rb_enc_mbminlen(enc) > 1) {
                /* Encodings whose minimum character length exceeds one byte. */
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (rb_enc_is_newline(pp, e, enc)) {
                    e = pp;
                }
                /* Then look for a "\r" in front of the (possibly moved) end. */
                pp = e - rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
                if (e == RSTRING_END(str)) {
                    return Qnil;
                }
                len = e - RSTRING_PTR(str);
                STR_SET_LEN(str, len);
            }
            else {
                /* Byte-wise check of the last one or two bytes. */
                if (RSTRING_PTR(str)[len-1] == '\n') {
                    STR_DEC_LEN(str);
                    if (RSTRING_LEN(str) > 0 &&
                        RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
                        STR_DEC_LEN(str);
                    }
                }
                else if (RSTRING_PTR(str)[len-1] == '\r') {
                    STR_DEC_LEN(str);
                }
                else {
                    return Qnil;
                }
            }
            RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
            return str;
        }
    }
    else {
        rb_scan_args(argc, argv, "01", &rs);
    }
    if (NIL_P(rs)) return Qnil;
    StringValue(rs);
    rslen = RSTRING_LEN(rs);
    if (rslen == 0) {
        /* Empty separator: strip every trailing "\n" or "\r\n". */
        while (len>0 && p[len-1] == '\n') {
            len--;
            if (len>0 && p[len-1] == '\r')
                len--;
        }
        if (len < RSTRING_LEN(str)) {
            STR_SET_LEN(str, len);
            RSTRING_PTR(str)[len] = '\0';
            return str;
        }
        return Qnil;
    }
    if (rslen > len) return Qnil;
    newline = RSTRING_PTR(rs)[rslen-1];
    if (rslen == 1 && newline == '\n')
        goto smart_chomp;

    /* Arbitrary separator: compare the tail of str with rs. */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return Qnil;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
        /* The match must begin on a character boundary. */
        if (rb_enc_left_char_head(p, pp, e, enc) != pp)
            return Qnil;
        if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
            ENC_CODERANGE_CLEAR(str);
        }
        STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
        RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
        return str;
    }
    return Qnil;
}
06810
06811
06812
06813
06814
06815
06816
06817
06818
06819
06820
06821
06822
06823
06824
06825
06826
06827
06828
06829
06830
06831
06832
06833
06834 static VALUE
06835 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06836 {
06837 str = rb_str_dup(str);
06838 rb_str_chomp_bang(argc, argv, str);
06839 return str;
06840 }
06841
06842
06843
06844
06845
06846
06847
06848
06849
06850
06851
06852
06853
06854 static VALUE
06855 rb_str_lstrip_bang(VALUE str)
06856 {
06857 rb_encoding *enc;
06858 char *s, *t, *e;
06859
06860 str_modify_keep_cr(str);
06861 enc = STR_ENC_GET(str);
06862 s = RSTRING_PTR(str);
06863 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06864 e = t = RSTRING_END(str);
06865
06866 while (s < e) {
06867 int n;
06868 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06869
06870 if (!rb_isspace(cc)) break;
06871 s += n;
06872 }
06873
06874 if (s > RSTRING_PTR(str)) {
06875 STR_SET_LEN(str, t-s);
06876 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06877 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06878 return str;
06879 }
06880 return Qnil;
06881 }
06882
06883
06884
06885
06886
06887
06888
06889
06890
06891
06892
06893
06894
06895 static VALUE
06896 rb_str_lstrip(VALUE str)
06897 {
06898 str = rb_str_dup(str);
06899 rb_str_lstrip_bang(str);
06900 return str;
06901 }
06902
06903
06904
06905
06906
06907
06908
06909
06910
06911
06912
06913
06914
06915
06916 static VALUE
06917 rb_str_rstrip_bang(VALUE str)
06918 {
06919 rb_encoding *enc;
06920 char *s, *t, *e;
06921
06922 str_modify_keep_cr(str);
06923 enc = STR_ENC_GET(str);
06924 rb_str_check_dummy_enc(enc);
06925 s = RSTRING_PTR(str);
06926 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06927 t = e = RSTRING_END(str);
06928
06929
06930 if (single_byte_optimizable(str)) {
06931 unsigned char c;
06932 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06933 }
06934 else {
06935 char *tp;
06936
06937 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06938 unsigned int c = rb_enc_codepoint(tp, e, enc);
06939 if (c && !rb_isspace(c)) break;
06940 t = tp;
06941 }
06942 }
06943 if (t < e) {
06944 long len = t-RSTRING_PTR(str);
06945
06946 STR_SET_LEN(str, len);
06947 RSTRING_PTR(str)[len] = '\0';
06948 return str;
06949 }
06950 return Qnil;
06951 }
06952
06953
06954
06955
06956
06957
06958
06959
06960
06961
06962
06963
06964
06965 static VALUE
06966 rb_str_rstrip(VALUE str)
06967 {
06968 str = rb_str_dup(str);
06969 rb_str_rstrip_bang(str);
06970 return str;
06971 }
06972
06973
06974
06975
06976
06977
06978
06979
06980
06981
06982 static VALUE
06983 rb_str_strip_bang(VALUE str)
06984 {
06985 VALUE l = rb_str_lstrip_bang(str);
06986 VALUE r = rb_str_rstrip_bang(str);
06987
06988 if (NIL_P(l) && NIL_P(r)) return Qnil;
06989 return str;
06990 }
06991
06992
06993
06994
06995
06996
06997
06998
06999
07000
07001
07002
07003 static VALUE
07004 rb_str_strip(VALUE str)
07005 {
07006 str = rb_str_dup(str);
07007 rb_str_strip_bang(str);
07008 return str;
07009 }
07010
07011 static VALUE
07012 scan_once(VALUE str, VALUE pat, long *start)
07013 {
07014 VALUE result, match;
07015 struct re_registers *regs;
07016 int i;
07017
07018 if (rb_reg_search(pat, str, *start, 0) >= 0) {
07019 match = rb_backref_get();
07020 regs = RMATCH_REGS(match);
07021 if (BEG(0) == END(0)) {
07022 rb_encoding *enc = STR_ENC_GET(str);
07023
07024
07025
07026 if (RSTRING_LEN(str) > END(0))
07027 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
07028 RSTRING_END(str), enc);
07029 else
07030 *start = END(0)+1;
07031 }
07032 else {
07033 *start = END(0);
07034 }
07035 if (regs->num_regs == 1) {
07036 return rb_reg_nth_match(0, match);
07037 }
07038 result = rb_ary_new2(regs->num_regs);
07039 for (i=1; i < regs->num_regs; i++) {
07040 rb_ary_push(result, rb_reg_nth_match(i, match));
07041 }
07042
07043 return result;
07044 }
07045 return Qnil;
07046 }
07047
07048
07049
07050
07051
07052
07053
07054
07055
07056
07057
07058
07059
07060
07061
07062
07063
07064
07065
07066
07067
07068
07069
07070
07071
07072
07073
07074
07075
07076
07077
07078
07079
07080 static VALUE
07081 rb_str_scan(VALUE str, VALUE pat)
07082 {
07083 VALUE result;
07084 long start = 0;
07085 long last = -1, prev = 0;
07086 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
07087
07088 pat = get_pat(pat, 1);
07089 if (!rb_block_given_p()) {
07090 VALUE ary = rb_ary_new();
07091
07092 while (!NIL_P(result = scan_once(str, pat, &start))) {
07093 last = prev;
07094 prev = start;
07095 rb_ary_push(ary, result);
07096 }
07097 if (last >= 0) rb_reg_search(pat, str, last, 0);
07098 return ary;
07099 }
07100
07101 while (!NIL_P(result = scan_once(str, pat, &start))) {
07102 last = prev;
07103 prev = start;
07104 rb_yield(result);
07105 str_mod_check(str, p, len);
07106 }
07107 if (last >= 0) rb_reg_search(pat, str, last, 0);
07108 return str;
07109 }
07110
07111
07112
07113
07114
07115
07116
07117
07118
07119
07120
07121
07122
07123
07124
07125
07126 static VALUE
07127 rb_str_hex(VALUE str)
07128 {
07129 return rb_str_to_inum(str, 16, FALSE);
07130 }
07131
07132
07133
07134
07135
07136
07137
07138
07139
07140
07141
07142
07143
07144
07145
07146
07147 static VALUE
07148 rb_str_oct(VALUE str)
07149 {
07150 return rb_str_to_inum(str, -8, FALSE);
07151 }
07152
07153
07154
07155
07156
07157
07158
07159
07160
07161
07162
07163
07164
07165
07166
07167
07168
07169
07170
07171 static VALUE
07172 rb_str_crypt(VALUE str, VALUE salt)
07173 {
07174 extern char *crypt(const char *, const char *);
07175 VALUE result;
07176 const char *s, *saltp;
07177 char *res;
07178 #ifdef BROKEN_CRYPT
07179 char salt_8bit_clean[3];
07180 #endif
07181
07182 StringValue(salt);
07183 if (RSTRING_LEN(salt) < 2)
07184 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
07185
07186 s = RSTRING_PTR(str);
07187 if (!s) s = "";
07188 saltp = RSTRING_PTR(salt);
07189 #ifdef BROKEN_CRYPT
07190 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
07191 salt_8bit_clean[0] = saltp[0] & 0x7f;
07192 salt_8bit_clean[1] = saltp[1] & 0x7f;
07193 salt_8bit_clean[2] = '\0';
07194 saltp = salt_8bit_clean;
07195 }
07196 #endif
07197 res = crypt(s, saltp);
07198 if (!res) {
07199 rb_sys_fail("crypt");
07200 }
07201 result = rb_str_new2(res);
07202 OBJ_INFECT(result, str);
07203 OBJ_INFECT(result, salt);
07204 return result;
07205 }
07206
07207
07208
07209
07210
07211
07212
07213
07214
07215
07216
07217
07218
07219
07220
07221
07222
07223
07224
07225
07226
07227
07228 VALUE
07229 rb_str_intern(VALUE s)
07230 {
07231 VALUE str = RB_GC_GUARD(s);
07232 ID id;
07233
07234 id = rb_intern_str(str);
07235 return ID2SYM(id);
07236 }
07237
07238
07239
07240
07241
07242
07243
07244
07245
07246
07247
07248 VALUE
07249 rb_str_ord(VALUE s)
07250 {
07251 unsigned int c;
07252
07253 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
07254 return UINT2NUM(c);
07255 }
07256
07257
07258
07259
07260
07261
07262
07263
07264
07265
07266
07267 static VALUE
07268 rb_str_sum(int argc, VALUE *argv, VALUE str)
07269 {
07270 VALUE vbits;
07271 int bits;
07272 char *ptr, *p, *pend;
07273 long len;
07274 VALUE sum = INT2FIX(0);
07275 unsigned long sum0 = 0;
07276
07277 if (argc == 0) {
07278 bits = 16;
07279 }
07280 else {
07281 rb_scan_args(argc, argv, "01", &vbits);
07282 bits = NUM2INT(vbits);
07283 }
07284 ptr = p = RSTRING_PTR(str);
07285 len = RSTRING_LEN(str);
07286 pend = p + len;
07287
07288 while (p < pend) {
07289 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
07290 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07291 str_mod_check(str, ptr, len);
07292 sum0 = 0;
07293 }
07294 sum0 += (unsigned char)*p;
07295 p++;
07296 }
07297
07298 if (bits == 0) {
07299 if (sum0) {
07300 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07301 }
07302 }
07303 else {
07304 if (sum == INT2FIX(0)) {
07305 if (bits < (int)sizeof(long)*CHAR_BIT) {
07306 sum0 &= (((unsigned long)1)<<bits)-1;
07307 }
07308 sum = LONG2FIX(sum0);
07309 }
07310 else {
07311 VALUE mod;
07312
07313 if (sum0) {
07314 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07315 }
07316
07317 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
07318 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
07319 sum = rb_funcall(sum, '&', 1, mod);
07320 }
07321 }
07322 return sum;
07323 }
07324
/*
 * Shared implementation of ljust/rjust/center.
 *
 * Arguments (parsed with "11"): the target width in characters and an
 * optional padding string (default a single space).  +jflag+ selects
 * where the padding goes: 'l' puts it all on the right, 'r' all on the
 * left, any other value splits it (the left side gets n/2).
 *
 * Returns a duplicate of +str+ when width is negative or not larger
 * than the current length; otherwise a new padded string.
 *
 * Raises ArgumentError for an empty padding string or when the result
 * size would overflow.
 */
static VALUE
rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
{
    rb_encoding *enc;
    VALUE w;
    long width, len, flen = 1, fclen = 1;   /* flen: pad bytes, fclen: pad chars */
    VALUE res;
    char *p;
    const char *f = " ";
    long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
    volatile VALUE pad;
    int singlebyte = 1, cr;

    rb_scan_args(argc, argv, "11", &w, &pad);
    enc = STR_ENC_GET(str);
    width = NUM2LONG(w);
    if (argc == 2) {
        StringValue(pad);
        enc = rb_enc_check(str, pad);
        f = RSTRING_PTR(pad);
        flen = RSTRING_LEN(pad);
        fclen = str_strlen(pad, enc);
        singlebyte = single_byte_optimizable(pad);
        if (flen == 0 || fclen == 0) {
            rb_raise(rb_eArgError, "zero width padding");
        }
    }
    len = str_strlen(str, enc);
    if (width < 0 || len >= width) return rb_str_dup(str);
    n = width - len;                /* characters of padding needed */
    llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
    rlen = n - llen;
    cr = ENC_CODERANGE(str);
    if (flen > 1) {
        /* Byte length of the partial pad that ends each side. */
        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
    }
    size = RSTRING_LEN(str);
    /* Compute the result byte length, rejecting anything that overflows. */
    if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
        (len += llen2 + rlen2) >= LONG_MAX - size) {
        rb_raise(rb_eArgError, "argument too big");
    }
    len += size;
    res = rb_str_new5(str, 0, len);
    p = RSTRING_PTR(res);
    /* Left padding. */
    if (flen <= 1) {
        memset(p, *f, llen);
        p += llen;
    }
    else {
        while (llen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            llen -= fclen;
        }
        if (llen > 0) {
            memcpy(p, f, llen2);
            p += llen2;
        }
    }
    /* Original contents. */
    memcpy(p, RSTRING_PTR(str), size);
    p += size;
    /* Right padding. */
    if (flen <= 1) {
        memset(p, *f, rlen);
        p += rlen;
    }
    else {
        while (rlen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            rlen -= fclen;
        }
        if (rlen > 0) {
            memcpy(p, f, rlen2);
            p += rlen2;
        }
    }
    *p = '\0';
    STR_SET_LEN(res, p-RSTRING_PTR(res));
    OBJ_INFECT(res, str);
    if (!NIL_P(pad)) OBJ_INFECT(res, pad);
    rb_enc_associate(res, enc);
    /* Combine the code ranges of the receiver and the pad string. */
    if (argc == 2)
        cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
    if (cr != ENC_CODERANGE_BROKEN)
        ENC_CODERANGE_SET(res, cr);
    return res;
}
07414
07415
07416
07417
07418
07419
07420
07421
07422
07423
07424
07425
07426
07427
07428
07429 static VALUE
07430 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07431 {
07432 return rb_str_justify(argc, argv, str, 'l');
07433 }
07434
07435
07436
07437
07438
07439
07440
07441
07442
07443
07444
07445
07446
07447
07448
07449 static VALUE
07450 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07451 {
07452 return rb_str_justify(argc, argv, str, 'r');
07453 }
07454
07455
07456
07457
07458
07459
07460
07461
07462
07463
07464
07465
07466
07467
07468
07469 static VALUE
07470 rb_str_center(int argc, VALUE *argv, VALUE str)
07471 {
07472 return rb_str_justify(argc, argv, str, 'c');
07473 }
07474
07475
07476
07477
07478
07479
07480
07481
07482
07483
07484
07485
07486
07487
07488
07489
07490 static VALUE
07491 rb_str_partition(VALUE str, VALUE sep)
07492 {
07493 long pos;
07494 int regex = FALSE;
07495
07496 if (RB_TYPE_P(sep, T_REGEXP)) {
07497 pos = rb_reg_search(sep, str, 0, 0);
07498 regex = TRUE;
07499 }
07500 else {
07501 VALUE tmp;
07502
07503 tmp = rb_check_string_type(sep);
07504 if (NIL_P(tmp)) {
07505 rb_raise(rb_eTypeError, "type mismatch: %s given",
07506 rb_obj_classname(sep));
07507 }
07508 sep = tmp;
07509 pos = rb_str_index(str, sep, 0);
07510 }
07511 if (pos < 0) {
07512 failed:
07513 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07514 }
07515 if (regex) {
07516 sep = rb_str_subpat(str, sep, INT2FIX(0));
07517 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07518 }
07519 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07520 sep,
07521 rb_str_subseq(str, pos+RSTRING_LEN(sep),
07522 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07523 }
07524
07525
07526
07527
07528
07529
07530
07531
07532
07533
07534
07535
07536
07537
07538
07539
07540 static VALUE
07541 rb_str_rpartition(VALUE str, VALUE sep)
07542 {
07543 long pos = RSTRING_LEN(str);
07544 int regex = FALSE;
07545
07546 if (RB_TYPE_P(sep, T_REGEXP)) {
07547 pos = rb_reg_search(sep, str, pos, 1);
07548 regex = TRUE;
07549 }
07550 else {
07551 VALUE tmp;
07552
07553 tmp = rb_check_string_type(sep);
07554 if (NIL_P(tmp)) {
07555 rb_raise(rb_eTypeError, "type mismatch: %s given",
07556 rb_obj_classname(sep));
07557 }
07558 sep = tmp;
07559 pos = rb_str_sublen(str, pos);
07560 pos = rb_str_rindex(str, sep, pos);
07561 }
07562 if (pos < 0) {
07563 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07564 }
07565 if (regex) {
07566 sep = rb_reg_nth_match(0, rb_backref_get());
07567 }
07568 return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07569 sep,
07570 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07571 }
07572
07573
07574
07575
07576
07577
07578
07579
07580
07581
07582
07583
07584
07585
07586 static VALUE
07587 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07588 {
07589 int i;
07590
07591 for (i=0; i<argc; i++) {
07592 VALUE tmp = argv[i];
07593 StringValue(tmp);
07594 rb_enc_check(str, tmp);
07595 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07596 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07597 return Qtrue;
07598 }
07599 return Qfalse;
07600 }
07601
07602
07603
07604
07605
07606
07607
07608
07609 static VALUE
07610 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07611 {
07612 int i;
07613 char *p, *s, *e;
07614 rb_encoding *enc;
07615
07616 for (i=0; i<argc; i++) {
07617 VALUE tmp = argv[i];
07618 StringValue(tmp);
07619 enc = rb_enc_check(str, tmp);
07620 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07621 p = RSTRING_PTR(str);
07622 e = p + RSTRING_LEN(str);
07623 s = e - RSTRING_LEN(tmp);
07624 if (rb_enc_left_char_head(p, s, e, enc) != s)
07625 continue;
07626 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07627 return Qtrue;
07628 }
07629 return Qfalse;
07630 }
07631
07632 void
07633 rb_str_setter(VALUE val, ID id, VALUE *var)
07634 {
07635 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
07636 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07637 }
07638 *var = val;
07639 }
07640
07641
07642
07643
07644
07645
07646
07647
07648
07649 static VALUE
07650 rb_str_force_encoding(VALUE str, VALUE enc)
07651 {
07652 str_modifiable(str);
07653 rb_enc_associate(str, rb_to_encoding(enc));
07654 ENC_CODERANGE_CLEAR(str);
07655 return str;
07656 }
07657
07658
07659
07660
07661
07662
07663
07664
07665 static VALUE
07666 rb_str_b(VALUE str)
07667 {
07668 VALUE str2 = str_alloc(rb_cString);
07669 str_replace_shared_without_enc(str2, str);
07670 OBJ_INFECT(str2, str);
07671 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
07672 return str2;
07673 }
07674
07675
07676
07677
07678
07679
07680
07681
07682
07683
07684
07685
07686 static VALUE
07687 rb_str_valid_encoding_p(VALUE str)
07688 {
07689 int cr = rb_enc_str_coderange(str);
07690
07691 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07692 }
07693
07694
07695
07696
07697
07698
07699
07700
07701
07702
07703
07704 static VALUE
07705 rb_str_is_ascii_only_p(VALUE str)
07706 {
07707 int cr = rb_enc_str_coderange(str);
07708
07709 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07710 }
07711
07726 VALUE
07727 rb_str_ellipsize(VALUE str, long len)
07728 {
07729 static const char ellipsis[] = "...";
07730 const long ellipsislen = sizeof(ellipsis) - 1;
07731 rb_encoding *const enc = rb_enc_get(str);
07732 const long blen = RSTRING_LEN(str);
07733 const char *const p = RSTRING_PTR(str), *e = p + blen;
07734 VALUE estr, ret = 0;
07735
07736 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
07737 if (len * rb_enc_mbminlen(enc) >= blen ||
07738 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07739 ret = str;
07740 }
07741 else if (len <= ellipsislen ||
07742 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07743 if (rb_enc_asciicompat(enc)) {
07744 ret = rb_str_new_with_class(str, ellipsis, len);
07745 rb_enc_associate(ret, enc);
07746 }
07747 else {
07748 estr = rb_usascii_str_new(ellipsis, len);
07749 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07750 }
07751 }
07752 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07753 rb_str_cat(ret, ellipsis, ellipsislen);
07754 }
07755 else {
07756 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07757 rb_enc_from_encoding(enc), 0, Qnil);
07758 rb_str_append(ret, estr);
07759 }
07760 return ret;
07761 }
07762
07763
07764
07765
07766
07767
07768
07769
07770
07771
07772
07773
07774
07775
07776
07777
07778
07779
07780
07781
07782
07783
07784
07785
07786
07787
07788
07789
07790
07791
07792
07793
07794
07795
07796
07797
07798
07799
07800
07801
07802
07803
07804
07805 static VALUE
07806 sym_equal(VALUE sym1, VALUE sym2)
07807 {
07808 if (sym1 == sym2) return Qtrue;
07809 return Qfalse;
07810 }
07811
07812
07813 static int
07814 sym_printable(const char *s, const char *send, rb_encoding *enc)
07815 {
07816 while (s < send) {
07817 int n;
07818 int c = rb_enc_codepoint_len(s, send, &n, enc);
07819
07820 if (!rb_enc_isprint(c, enc)) return FALSE;
07821 s += n;
07822 }
07823 return TRUE;
07824 }
07825
07826 int
07827 rb_str_symname_p(VALUE sym)
07828 {
07829 rb_encoding *enc;
07830 const char *ptr;
07831 long len;
07832 rb_encoding *resenc = rb_default_internal_encoding();
07833
07834 if (resenc == NULL) resenc = rb_default_external_encoding();
07835 enc = STR_ENC_GET(sym);
07836 ptr = RSTRING_PTR(sym);
07837 len = RSTRING_LEN(sym);
07838 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07839 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07840 return FALSE;
07841 }
07842 return TRUE;
07843 }
07844
07845 VALUE
07846 rb_str_quote_unprintable(VALUE str)
07847 {
07848 rb_encoding *enc;
07849 const char *ptr;
07850 long len;
07851 rb_encoding *resenc;
07852
07853 Check_Type(str, T_STRING);
07854 resenc = rb_default_internal_encoding();
07855 if (resenc == NULL) resenc = rb_default_external_encoding();
07856 enc = STR_ENC_GET(str);
07857 ptr = RSTRING_PTR(str);
07858 len = RSTRING_LEN(str);
07859 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
07860 !sym_printable(ptr, ptr + len, enc)) {
07861 return rb_str_inspect(str);
07862 }
07863 return str;
07864 }
07865
07866 VALUE
07867 rb_id_quote_unprintable(ID id)
07868 {
07869 return rb_str_quote_unprintable(rb_id2str(id));
07870 }
07871
07872
07873
07874
07875
07876
07877
07878
07879
07880
07881 static VALUE
07882 sym_inspect(VALUE sym)
07883 {
07884 VALUE str;
07885 const char *ptr;
07886 long len;
07887 ID id = SYM2ID(sym);
07888 char *dest;
07889
07890 sym = rb_id2str(id);
07891 if (!rb_str_symname_p(sym)) {
07892 str = rb_str_inspect(sym);
07893 len = RSTRING_LEN(str);
07894 rb_str_resize(str, len + 1);
07895 dest = RSTRING_PTR(str);
07896 memmove(dest + 1, dest, len);
07897 dest[0] = ':';
07898 }
07899 else {
07900 rb_encoding *enc = STR_ENC_GET(sym);
07901 ptr = RSTRING_PTR(sym);
07902 len = RSTRING_LEN(sym);
07903 str = rb_enc_str_new(0, len + 1, enc);
07904 dest = RSTRING_PTR(str);
07905 dest[0] = ':';
07906 memcpy(dest + 1, ptr, len);
07907 }
07908 return str;
07909 }
07910
07911
07912
07913
07914
07915
07916
07917
07918
07919
07920
07921
07922
07923 VALUE
07924 rb_sym_to_s(VALUE sym)
07925 {
07926 ID id = SYM2ID(sym);
07927
07928 return str_new3(rb_cString, rb_id2str(id));
07929 }
07930
07931
07932
07933
07934
07935
07936
07937
07938
07939
07940
07941
07942 static VALUE
07943 sym_to_sym(VALUE sym)
07944 {
07945 return sym;
07946 }
07947
07948 static VALUE
07949 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
07950 {
07951 VALUE obj;
07952
07953 if (argc < 1) {
07954 rb_raise(rb_eArgError, "no receiver given");
07955 }
07956 obj = argv[0];
07957 return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
07958 }
07959
07960
07961
07962
07963
07964
07965
07966
07967
07968
07969 static VALUE
07970 sym_to_proc(VALUE sym)
07971 {
07972 static VALUE sym_proc_cache = Qfalse;
07973 enum {SYM_PROC_CACHE_SIZE = 67};
07974 VALUE proc;
07975 long id, index;
07976 VALUE *aryp;
07977
07978 if (!sym_proc_cache) {
07979 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07980 rb_gc_register_mark_object(sym_proc_cache);
07981 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07982 }
07983
07984 id = SYM2ID(sym);
07985 index = (id % SYM_PROC_CACHE_SIZE) << 1;
07986
07987 aryp = RARRAY_PTR(sym_proc_cache);
07988 if (aryp[index] == sym) {
07989 return aryp[index + 1];
07990 }
07991 else {
07992 proc = rb_proc_new(sym_call, (VALUE)id);
07993 aryp[index] = sym;
07994 aryp[index + 1] = proc;
07995 return proc;
07996 }
07997 }
07998
07999
08000
08001
08002
08003
08004
08005
08006
08007 static VALUE
08008 sym_succ(VALUE sym)
08009 {
08010 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
08011 }
08012
08013
08014
08015
08016
08017
08018
08019
08020
08021
08022
08023
08024
08025
08026
08027 static VALUE
08028 sym_cmp(VALUE sym, VALUE other)
08029 {
08030 if (!SYMBOL_P(other)) {
08031 return Qnil;
08032 }
08033 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
08034 }
08035
08036
08037
08038
08039
08040
08041
08042
08043
08044 static VALUE
08045 sym_casecmp(VALUE sym, VALUE other)
08046 {
08047 if (!SYMBOL_P(other)) {
08048 return Qnil;
08049 }
08050 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
08051 }
08052
08053
08054
08055
08056
08057
08058
08059
08060 static VALUE
08061 sym_match(VALUE sym, VALUE other)
08062 {
08063 return rb_str_match(rb_sym_to_s(sym), other);
08064 }
08065
08066
08067
08068
08069
08070
08071
08072
08073
08074 static VALUE
08075 sym_aref(int argc, VALUE *argv, VALUE sym)
08076 {
08077 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
08078 }
08079
08080
08081
08082
08083
08084
08085
08086
08087 static VALUE
08088 sym_length(VALUE sym)
08089 {
08090 return rb_str_length(rb_id2str(SYM2ID(sym)));
08091 }
08092
08093
08094
08095
08096
08097
08098
08099
08100 static VALUE
08101 sym_empty(VALUE sym)
08102 {
08103 return rb_str_empty(rb_id2str(SYM2ID(sym)));
08104 }
08105
08106
08107
08108
08109
08110
08111
08112
08113 static VALUE
08114 sym_upcase(VALUE sym)
08115 {
08116 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
08117 }
08118
08119
08120
08121
08122
08123
08124
08125
08126 static VALUE
08127 sym_downcase(VALUE sym)
08128 {
08129 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
08130 }
08131
08132
08133
08134
08135
08136
08137
08138
08139 static VALUE
08140 sym_capitalize(VALUE sym)
08141 {
08142 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
08143 }
08144
08145
08146
08147
08148
08149
08150
08151
08152 static VALUE
08153 sym_swapcase(VALUE sym)
08154 {
08155 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
08156 }
08157
08158
08159
08160
08161
08162
08163
08164
08165 static VALUE
08166 sym_encoding(VALUE sym)
08167 {
08168 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
08169 }
08170
08171 ID
08172 rb_to_id(VALUE name)
08173 {
08174 VALUE tmp;
08175
08176 switch (TYPE(name)) {
08177 default:
08178 tmp = rb_check_string_type(name);
08179 if (NIL_P(tmp)) {
08180 tmp = rb_inspect(name);
08181 rb_raise(rb_eTypeError, "%s is not a symbol",
08182 RSTRING_PTR(tmp));
08183 }
08184 name = tmp;
08185
08186 case T_STRING:
08187 name = rb_str_intern(name);
08188
08189 case T_SYMBOL:
08190 return SYM2ID(name);
08191 }
08192
08193 UNREACHABLE;
08194 }
08195
08196
08197
08198
08199
08200
08201
08202
08203
08204
08205
08206
08207
08208
08209 void
08210 Init_String(void)
08211 {
08212 #undef rb_intern
08213 #define rb_intern(str) rb_intern_const(str)
08214
08215 rb_cString = rb_define_class("String", rb_cObject);
08216 rb_include_module(rb_cString, rb_mComparable);
08217 rb_define_alloc_func(rb_cString, empty_str_alloc);
08218 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
08219 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
08220 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
08221 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
08222 rb_define_method(rb_cString, "==", rb_str_equal, 1);
08223 rb_define_method(rb_cString, "===", rb_str_equal, 1);
08224 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
08225 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
08226 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
08227 rb_define_method(rb_cString, "+", rb_str_plus, 1);
08228 rb_define_method(rb_cString, "*", rb_str_times, 1);
08229 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
08230 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
08231 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
08232 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
08233 rb_define_method(rb_cString, "length", rb_str_length, 0);
08234 rb_define_method(rb_cString, "size", rb_str_length, 0);
08235 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
08236 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
08237 rb_define_method(rb_cString, "=~", rb_str_match, 1);
08238 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
08239 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
08240 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
08241 rb_define_method(rb_cString, "next", rb_str_succ, 0);
08242 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
08243 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
08244 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
08245 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
08246 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
08247 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
08248 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
08249 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
08250 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
08251 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
08252
08253 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
08254 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
08255 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
08256 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
08257 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
08258 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
08259
08260 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
08261 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
08262 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
08263 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
08264
08265 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
08266 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
08267 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
08268 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
08269
08270 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
08271 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
08272 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
08273 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
08274 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
08275 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
08276 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
08277 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
08278 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
08279 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
08280 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
08281 rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
08282 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
08283 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
08284 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
08285 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
08286
08287 rb_define_method(rb_cString, "include?", rb_str_include, 1);
08288 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
08289 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
08290
08291 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
08292
08293 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
08294 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
08295 rb_define_method(rb_cString, "center", rb_str_center, -1);
08296
08297 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
08298 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
08299 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
08300 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
08301 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
08302 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
08303 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
08304
08305 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
08306 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
08307 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
08308 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
08309 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
08310 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
08311 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
08312
08313 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
08314 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
08315 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
08316 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
08317 rb_define_method(rb_cString, "count", rb_str_count, -1);
08318
08319 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
08320 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
08321 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
08322 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
08323
08324 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
08325 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
08326 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
08327 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
08328
08329 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
08330
08331 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
08332 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
08333
08334 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
08335 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
08336
08337 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
08338 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
08339 rb_define_method(rb_cString, "b", rb_str_b, 0);
08340 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
08341 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
08342
08343 id_to_s = rb_intern("to_s");
08344
08345 rb_fs = Qnil;
08346 rb_define_variable("$;", &rb_fs);
08347 rb_define_variable("$-F", &rb_fs);
08348
08349 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
08350 rb_include_module(rb_cSymbol, rb_mComparable);
08351 rb_undef_alloc_func(rb_cSymbol);
08352 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
08353 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
08354
08355 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
08356 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
08357 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
08358 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
08359 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
08360 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
08361 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
08362 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
08363 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
08364 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
08365
08366 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
08367 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
08368 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
08369
08370 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
08371 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
08372 rb_define_method(rb_cSymbol, "length", sym_length, 0);
08373 rb_define_method(rb_cSymbol, "size", sym_length, 0);
08374 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
08375 rb_define_method(rb_cSymbol, "match", sym_match, 1);
08376
08377 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
08378 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
08379 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
08380 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
08381
08382 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
08383 }
08384