00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/encoding.h"
00014 #include "internal.h"
00015 #include "regenc.h"
00016 #include <ctype.h>
00017 #include "ruby/util.h"
00018
00019 #undef rb_ascii8bit_encindex
00020 #undef rb_utf8_encindex
00021 #undef rb_usascii_encindex
00022
00023 #if defined __GNUC__ && __GNUC__ >= 4
00024 #pragma GCC visibility push(default)
00025 int rb_enc_register(const char *name, rb_encoding *encoding);
00026 void rb_enc_set_base(const char *name, const char *orig);
00027 int rb_enc_set_dummy(int index);
00028 void rb_encdb_declare(const char *name);
00029 int rb_encdb_replicate(const char *name, const char *orig);
00030 int rb_encdb_dummy(const char *name);
00031 int rb_encdb_alias(const char *alias, const char *orig);
00032 void rb_encdb_set_unicode(int index);
00033 #pragma GCC visibility pop
00034 #endif
00035
00036 static ID id_encoding;
00037 VALUE rb_cEncoding;
00038 static VALUE rb_encoding_list;
00039
00040 struct rb_encoding_entry {
00041 const char *name;
00042 rb_encoding *enc;
00043 rb_encoding *base;
00044 };
00045
00046 static struct {
00047 struct rb_encoding_entry *list;
00048 int count;
00049 int size;
00050 st_table *names;
00051 } enc_table;
00052
00053 void rb_enc_init(void);
00054
00055 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
00056 #define UNSPECIFIED_ENCODING INT_MAX
00057
00058 #define ENCODING_NAMELEN_MAX 63
00059 #define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
00060
00061 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))
00062
00063 static int load_encoding(const char *name);
00064
/* Size callback for the Encoding typed-data type; always reports 0. */
static size_t
enc_memsize(const void *p)
{
    (void)p;
    return 0;
}
00070
00071 static const rb_data_type_t encoding_data_type = {
00072 "encoding",
00073 {0, 0, enc_memsize,},
00074 NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
00075 };
00076
00077 #define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type)
00078 #define is_obj_encoding(obj) (RB_TYPE_P((obj), T_DATA) && is_data_encoding(obj))
00079
00080 static VALUE
00081 enc_new(rb_encoding *encoding)
00082 {
00083 return TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, encoding);
00084 }
00085
00086 static VALUE
00087 rb_enc_from_encoding_index(int idx)
00088 {
00089 VALUE list, enc;
00090
00091 if (!(list = rb_encoding_list)) {
00092 rb_bug("rb_enc_from_encoding_index(%d): no rb_encoding_list", idx);
00093 }
00094 enc = rb_ary_entry(list, idx);
00095 if (NIL_P(enc)) {
00096 rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx);
00097 }
00098 return enc;
00099 }
00100
00101 VALUE
00102 rb_enc_from_encoding(rb_encoding *encoding)
00103 {
00104 int idx;
00105 if (!encoding) return Qnil;
00106 idx = ENC_TO_ENCINDEX(encoding);
00107 return rb_enc_from_encoding_index(idx);
00108 }
00109
00110 static int enc_autoload(rb_encoding *);
00111
00112 static int
00113 check_encoding(rb_encoding *enc)
00114 {
00115 int index = rb_enc_to_index(enc);
00116 if (rb_enc_from_index(index) != enc)
00117 return -1;
00118 if (enc_autoload_p(enc)) {
00119 index = enc_autoload(enc);
00120 }
00121 return index;
00122 }
00123
00124 static int
00125 enc_check_encoding(VALUE obj)
00126 {
00127 if (!is_obj_encoding(obj)) {
00128 return -1;
00129 }
00130 return check_encoding(RDATA(obj)->data);
00131 }
00132
00133 NORETURN(static void not_encoding(VALUE enc));
00134 static void
00135 not_encoding(VALUE enc)
00136 {
00137 rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Encoding)",
00138 rb_obj_class(enc));
00139 }
00140
00141 static rb_encoding *
00142 must_encoding(VALUE enc)
00143 {
00144 int index = enc_check_encoding(enc);
00145 if (index < 0) {
00146 not_encoding(enc);
00147 }
00148 return DATA_PTR(enc);
00149 }
00150
00151 static rb_encoding *
00152 must_encindex(int index)
00153 {
00154 rb_encoding *enc = rb_enc_from_index(index);
00155 if (!enc) {
00156 rb_raise(rb_eEncodingError, "encoding index out of bound: %d",
00157 index);
00158 }
00159 if (ENC_TO_ENCINDEX(enc) != (int)(index & ENC_INDEX_MASK)) {
00160 rb_raise(rb_eEncodingError, "wrong encoding index %d for %s (expected %d)",
00161 index, rb_enc_name(enc), ENC_TO_ENCINDEX(enc));
00162 }
00163 if (enc_autoload_p(enc) && enc_autoload(enc) == -1) {
00164 rb_loaderror("failed to load encoding (%s)",
00165 rb_enc_name(enc));
00166 }
00167 return enc;
00168 }
00169
00170 int
00171 rb_to_encoding_index(VALUE enc)
00172 {
00173 int idx;
00174
00175 idx = enc_check_encoding(enc);
00176 if (idx >= 0) {
00177 return idx;
00178 }
00179 else if (NIL_P(enc = rb_check_string_type(enc))) {
00180 return -1;
00181 }
00182 if (!rb_enc_asciicompat(rb_enc_get(enc))) {
00183 return -1;
00184 }
00185 return rb_enc_find_index(StringValueCStr(enc));
00186 }
00187
00188
00189 static int
00190 str_find_encindex(VALUE enc)
00191 {
00192 int idx;
00193
00194 StringValue(enc);
00195 if (!rb_enc_asciicompat(rb_enc_get(enc))) {
00196 rb_raise(rb_eArgError, "invalid name encoding (non ASCII)");
00197 }
00198 idx = rb_enc_find_index(StringValueCStr(enc));
00199 return idx;
00200 }
00201
00202 static int
00203 str_to_encindex(VALUE enc)
00204 {
00205 int idx = str_find_encindex(enc);
00206 if (idx < 0) {
00207 rb_raise(rb_eArgError, "unknown encoding name - %"PRIsVALUE, enc);
00208 }
00209 return idx;
00210 }
00211
00212 static rb_encoding *
00213 str_to_encoding(VALUE enc)
00214 {
00215 return rb_enc_from_index(str_to_encindex(enc));
00216 }
00217
00218 rb_encoding *
00219 rb_to_encoding(VALUE enc)
00220 {
00221 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
00222 return str_to_encoding(enc);
00223 }
00224
00225 rb_encoding *
00226 rb_find_encoding(VALUE enc)
00227 {
00228 int idx;
00229 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
00230 idx = str_find_encindex(enc);
00231 if (idx < 0) return NULL;
00232 return rb_enc_from_index(idx);
00233 }
00234
/* GC hook for encodings; intentionally does nothing. */
void
rb_gc_mark_encodings(void)
{
    /* nothing to mark */
}
00239
00240 static int
00241 enc_table_expand(int newsize)
00242 {
00243 struct rb_encoding_entry *ent;
00244 int count = newsize;
00245
00246 if (enc_table.size >= newsize) return newsize;
00247 newsize = (newsize + 7) / 8 * 8;
00248 ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize);
00249 if (!ent) return -1;
00250 memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size));
00251 enc_table.list = ent;
00252 enc_table.size = newsize;
00253 return count;
00254 }
00255
00256 static int
00257 enc_register_at(int index, const char *name, rb_encoding *encoding)
00258 {
00259 struct rb_encoding_entry *ent = &enc_table.list[index];
00260 VALUE list;
00261
00262 if (!valid_encoding_name_p(name)) return -1;
00263 if (!ent->name) {
00264 ent->name = name = strdup(name);
00265 }
00266 else if (STRCASECMP(name, ent->name)) {
00267 return -1;
00268 }
00269 if (!ent->enc) {
00270 ent->enc = xmalloc(sizeof(rb_encoding));
00271 }
00272 if (encoding) {
00273 *ent->enc = *encoding;
00274 }
00275 else {
00276 memset(ent->enc, 0, sizeof(*ent->enc));
00277 }
00278 encoding = ent->enc;
00279 encoding->name = name;
00280 encoding->ruby_encoding_index = index;
00281 st_insert(enc_table.names, (st_data_t)name, (st_data_t)index);
00282 list = rb_encoding_list;
00283 if (list && NIL_P(rb_ary_entry(list, index))) {
00284
00285 rb_ary_store(list, index, enc_new(encoding));
00286 }
00287 return index;
00288 }
00289
00290 static int
00291 enc_register(const char *name, rb_encoding *encoding)
00292 {
00293 int index = enc_table.count;
00294
00295 if ((index = enc_table_expand(index + 1)) < 0) return -1;
00296 enc_table.count = index;
00297 return enc_register_at(index - 1, name, encoding);
00298 }
00299
00300 static void set_encoding_const(const char *, rb_encoding *);
00301 int rb_enc_registered(const char *name);
00302
00303 int
00304 rb_enc_register(const char *name, rb_encoding *encoding)
00305 {
00306 int index = rb_enc_registered(name);
00307
00308 if (index >= 0) {
00309 rb_encoding *oldenc = rb_enc_from_index(index);
00310 if (STRCASECMP(name, rb_enc_name(oldenc))) {
00311 index = enc_register(name, encoding);
00312 }
00313 else if (enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) {
00314 enc_register_at(index, name, encoding);
00315 }
00316 else {
00317 rb_raise(rb_eArgError, "encoding %s is already registered", name);
00318 }
00319 }
00320 else {
00321 index = enc_register(name, encoding);
00322 set_encoding_const(name, rb_enc_from_index(index));
00323 }
00324 return index;
00325 }
00326
/*
 * Declare encoding +name+: register an empty placeholder if the name is
 * not known yet, then define its constant.
 */
void
rb_encdb_declare(const char *name)
{
    int slot = rb_enc_registered(name);

    if (slot < 0) {
        slot = enc_register(name, 0);
    }
    set_encoding_const(name, rb_enc_from_index(slot));
}
00336
00337 static void
00338 enc_check_duplication(const char *name)
00339 {
00340 if (rb_enc_registered(name) >= 0) {
00341 rb_raise(rb_eArgError, "encoding %s is already registered", name);
00342 }
00343 }
00344
00345 static rb_encoding*
00346 set_base_encoding(int index, rb_encoding *base)
00347 {
00348 rb_encoding *enc = enc_table.list[index].enc;
00349
00350 enc_table.list[index].base = base;
00351 if (rb_enc_dummy_p(base)) ENC_SET_DUMMY(enc);
00352 return enc;
00353 }
00354
00355
00356
00357
00358
00359 void
00360 rb_enc_set_base(const char *name, const char *orig)
00361 {
00362 int idx = rb_enc_registered(name);
00363 int origidx = rb_enc_registered(orig);
00364 set_base_encoding(idx, rb_enc_from_index(origidx));
00365 }
00366
00367
00368
00369
00370 int
00371 rb_enc_set_dummy(int index)
00372 {
00373 rb_encoding *enc = enc_table.list[index].enc;
00374
00375 ENC_SET_DUMMY(enc);
00376 return index;
00377 }
00378
00379 int
00380 rb_enc_replicate(const char *name, rb_encoding *encoding)
00381 {
00382 int idx;
00383
00384 enc_check_duplication(name);
00385 idx = enc_register(name, encoding);
00386 set_base_encoding(idx, encoding);
00387 set_encoding_const(name, rb_enc_from_index(idx));
00388 return idx;
00389 }
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400 static VALUE
00401 enc_replicate(VALUE encoding, VALUE name)
00402 {
00403 return rb_enc_from_encoding_index(
00404 rb_enc_replicate(StringValueCStr(name),
00405 rb_to_encoding(encoding)));
00406 }
00407
00408 static int
00409 enc_replicate_with_index(const char *name, rb_encoding *origenc, int idx)
00410 {
00411 if (idx < 0) {
00412 idx = enc_register(name, origenc);
00413 }
00414 else {
00415 idx = enc_register_at(idx, name, origenc);
00416 }
00417 if (idx >= 0) {
00418 set_base_encoding(idx, origenc);
00419 set_encoding_const(name, rb_enc_from_index(idx));
00420 }
00421 return idx;
00422 }
00423
/*
 * Encoding-database entry point: make +name+ a replica of +orig+.  If
 * +orig+ is not registered yet an empty placeholder is registered for it
 * first.
 */
int
rb_encdb_replicate(const char *name, const char *orig)
{
    int base_idx = rb_enc_registered(orig);
    int own_idx = rb_enc_registered(name);

    if (base_idx < 0) {
        base_idx = enc_register(orig, 0);
    }
    return enc_replicate_with_index(name, rb_enc_from_index(base_idx), own_idx);
}
00435
00436 int
00437 rb_define_dummy_encoding(const char *name)
00438 {
00439 int index = rb_enc_replicate(name, rb_ascii8bit_encoding());
00440 rb_encoding *enc = enc_table.list[index].enc;
00441
00442 ENC_SET_DUMMY(enc);
00443 return index;
00444 }
00445
00446 int
00447 rb_encdb_dummy(const char *name)
00448 {
00449 int index = enc_replicate_with_index(name, rb_ascii8bit_encoding(),
00450 rb_enc_registered(name));
00451 rb_encoding *enc = enc_table.list[index].enc;
00452
00453 ENC_SET_DUMMY(enc);
00454 return index;
00455 }
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470 static VALUE
00471 enc_dummy_p(VALUE enc)
00472 {
00473 return ENC_DUMMY_P(must_encoding(enc)) ? Qtrue : Qfalse;
00474 }
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486 static VALUE
00487 enc_ascii_compatible_p(VALUE enc)
00488 {
00489 return rb_enc_asciicompat(must_encoding(enc)) ? Qtrue : Qfalse;
00490 }
00491
00492
00493
00494
00495 int
00496 rb_enc_unicode_p(rb_encoding *enc)
00497 {
00498 return ONIGENC_IS_UNICODE(enc);
00499 }
00500
00501 static st_data_t
00502 enc_dup_name(st_data_t name)
00503 {
00504 return (st_data_t)strdup((const char *)name);
00505 }
00506
00507
00508
00509
00510
00511 static int
00512 enc_alias_internal(const char *alias, int idx)
00513 {
00514 return st_insert2(enc_table.names, (st_data_t)alias, (st_data_t)idx,
00515 enc_dup_name);
00516 }
00517
/*
 * Add +alias+ for the encoding at +idx+.  The constant is defined only
 * when enc_alias_internal() returns 0.  Returns +idx+, or -1 for an
 * invalid alias name.
 */
static int
enc_alias(const char *alias, int idx)
{
    if (!valid_encoding_name_p(alias)) {
        return -1;
    }
    if (enc_alias_internal(alias, idx) == 0) {
        set_encoding_const(alias, rb_enc_from_index(idx));
    }
    return idx;
}
00526
00527 int
00528 rb_enc_alias(const char *alias, const char *orig)
00529 {
00530 int idx;
00531
00532 enc_check_duplication(alias);
00533 if (!enc_table.list) {
00534 rb_enc_init();
00535 }
00536 if ((idx = rb_enc_find_index(orig)) < 0) {
00537 return -1;
00538 }
00539 return enc_alias(alias, idx);
00540 }
00541
/*
 * Encoding-database entry point: alias +alias+ to +orig+, registering an
 * empty placeholder for +orig+ first if it is not known yet.
 */
int
rb_encdb_alias(const char *alias, const char *orig)
{
    int target = rb_enc_registered(orig);

    if (target < 0) {
        target = enc_register(orig, 0);
    }
    return enc_alias(alias, target);
}
00552
00553 void
00554 rb_encdb_set_unicode(int index)
00555 {
00556 rb_enc_from_index(index)->flags |= ONIGENC_FLAG_UNICODE;
00557 }
00558
00559 extern rb_encoding OnigEncodingUTF_8;
00560 extern rb_encoding OnigEncodingUS_ASCII;
00561
00562 void
00563 rb_enc_init(void)
00564 {
00565 enc_table_expand(ENCODING_COUNT + 1);
00566 if (!enc_table.names) {
00567 enc_table.names = st_init_strcasetable();
00568 }
00569 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
00570 ENC_REGISTER(ASCII);
00571 ENC_REGISTER(UTF_8);
00572 ENC_REGISTER(US_ASCII);
00573 #undef ENC_REGISTER
00574 #define ENCDB_REGISTER(name, enc) enc_register_at(ENCINDEX_##enc, name, NULL)
00575 ENCDB_REGISTER("UTF-16BE", UTF_16BE);
00576 ENCDB_REGISTER("UTF-16LE", UTF_16LE);
00577 ENCDB_REGISTER("UTF-32BE", UTF_32BE);
00578 ENCDB_REGISTER("UTF-32LE", UTF_32LE);
00579 ENCDB_REGISTER("UTF-16", UTF_16);
00580 ENCDB_REGISTER("UTF-32", UTF_32);
00581 ENCDB_REGISTER("UTF8-MAC", UTF8_MAC);
00582
00583 ENCDB_REGISTER("EUC-JP", EUC_JP);
00584 ENCDB_REGISTER("Windows-31J", Windows_31J);
00585 #undef ENCDB_REGISTER
00586 enc_table.count = ENCINDEX_BUILTIN_MAX;
00587 }
00588
00589 rb_encoding *
00590 rb_enc_from_index(int index)
00591 {
00592 if (!enc_table.list) {
00593 rb_enc_init();
00594 }
00595 if (index < 0 || enc_table.count <= (index &= ENC_INDEX_MASK)) {
00596 return 0;
00597 }
00598 return enc_table.list[index].enc;
00599 }
00600
00601 rb_encoding *
00602 rb_enc_get_from_index(int index)
00603 {
00604 return must_encindex(index);
00605 }
00606
00607 int
00608 rb_enc_registered(const char *name)
00609 {
00610 st_data_t idx = 0;
00611
00612 if (!name) return -1;
00613 if (!enc_table.list) return -1;
00614 if (st_lookup(enc_table.names, (st_data_t)name, &idx)) {
00615 return (int)idx;
00616 }
00617 return -1;
00618 }
00619
00620 static VALUE
00621 require_enc(VALUE enclib)
00622 {
00623 int safe = rb_safe_level();
00624 return rb_require_safe(enclib, safe > 3 ? 3 : safe);
00625 }
00626
00627 static int
00628 load_encoding(const char *name)
00629 {
00630 VALUE enclib = rb_sprintf("enc/%s.so", name);
00631 VALUE verbose = ruby_verbose;
00632 VALUE debug = ruby_debug;
00633 VALUE errinfo;
00634 VALUE loaded;
00635 char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3;
00636 int idx;
00637
00638 while (s < e) {
00639 if (!ISALNUM(*s)) *s = '_';
00640 else if (ISUPPER(*s)) *s = (char)TOLOWER(*s);
00641 ++s;
00642 }
00643 FL_UNSET(enclib, FL_TAINT);
00644 OBJ_FREEZE(enclib);
00645 ruby_verbose = Qfalse;
00646 ruby_debug = Qfalse;
00647 errinfo = rb_errinfo();
00648 loaded = rb_protect(require_enc, enclib, 0);
00649 ruby_verbose = verbose;
00650 ruby_debug = debug;
00651 rb_set_errinfo(errinfo);
00652 if (NIL_P(loaded)) return -1;
00653 if ((idx = rb_enc_registered(name)) < 0) return -1;
00654 if (enc_autoload_p(enc_table.list[idx].enc)) return -1;
00655 return idx;
00656 }
00657
00658 static int
00659 enc_autoload(rb_encoding *enc)
00660 {
00661 int i;
00662 rb_encoding *base = enc_table.list[ENC_TO_ENCINDEX(enc)].base;
00663
00664 if (base) {
00665 i = 0;
00666 do {
00667 if (i >= enc_table.count) return -1;
00668 } while (enc_table.list[i].enc != base && (++i, 1));
00669 if (enc_autoload_p(base)) {
00670 if (enc_autoload(base) < 0) return -1;
00671 }
00672 i = enc->ruby_encoding_index;
00673 enc_register_at(i & ENC_INDEX_MASK, rb_enc_name(enc), base);
00674 enc->ruby_encoding_index = i;
00675 }
00676 else {
00677 i = load_encoding(rb_enc_name(enc));
00678 }
00679 return i;
00680 }
00681
00682
00683 int
00684 rb_enc_find_index(const char *name)
00685 {
00686 int i = rb_enc_registered(name);
00687 rb_encoding *enc;
00688
00689 if (i < 0) {
00690 i = load_encoding(name);
00691 }
00692 else if (!(enc = rb_enc_from_index(i))) {
00693 if (i != UNSPECIFIED_ENCODING) {
00694 rb_raise(rb_eArgError, "encoding %s is not registered", name);
00695 }
00696 }
00697 else if (enc_autoload_p(enc)) {
00698 if (enc_autoload(enc) < 0) {
00699 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
00700 name);
00701 return 0;
00702 }
00703 }
00704 return i;
00705 }
00706
00707 rb_encoding *
00708 rb_enc_find(const char *name)
00709 {
00710 int idx = rb_enc_find_index(name);
00711 if (idx < 0) idx = 0;
00712 return rb_enc_from_index(idx);
00713 }
00714
00715 static inline int
00716 enc_capable(VALUE obj)
00717 {
00718 if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj);
00719 switch (BUILTIN_TYPE(obj)) {
00720 case T_STRING:
00721 case T_REGEXP:
00722 case T_FILE:
00723 return TRUE;
00724 case T_DATA:
00725 if (is_data_encoding(obj)) return TRUE;
00726 default:
00727 return FALSE;
00728 }
00729 }
00730
00731 ID
00732 rb_id_encoding(void)
00733 {
00734 CONST_ID(id_encoding, "encoding");
00735 return id_encoding;
00736 }
00737
00738 int
00739 rb_enc_get_index(VALUE obj)
00740 {
00741 int i = -1;
00742 VALUE tmp;
00743
00744 if (SPECIAL_CONST_P(obj)) {
00745 if (!SYMBOL_P(obj)) return -1;
00746 obj = rb_id2str(SYM2ID(obj));
00747 }
00748 switch (BUILTIN_TYPE(obj)) {
00749 as_default:
00750 default:
00751 case T_STRING:
00752 case T_REGEXP:
00753 i = ENCODING_GET_INLINED(obj);
00754 if (i == ENCODING_INLINE_MAX) {
00755 VALUE iv;
00756
00757 iv = rb_ivar_get(obj, rb_id_encoding());
00758 i = NUM2INT(iv);
00759 }
00760 break;
00761 case T_FILE:
00762 tmp = rb_funcall(obj, rb_intern("internal_encoding"), 0, 0);
00763 if (NIL_P(tmp)) obj = rb_funcall(obj, rb_intern("external_encoding"), 0, 0);
00764 else obj = tmp;
00765 if (NIL_P(obj)) break;
00766 case T_DATA:
00767 if (is_data_encoding(obj)) {
00768 i = enc_check_encoding(obj);
00769 }
00770 else {
00771 goto as_default;
00772 }
00773 break;
00774 }
00775 return i;
00776 }
00777
00778 static void
00779 enc_set_index(VALUE obj, int idx)
00780 {
00781 if (idx < ENCODING_INLINE_MAX) {
00782 ENCODING_SET_INLINED(obj, idx);
00783 return;
00784 }
00785 ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
00786 rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
00787 }
00788
00789 void
00790 rb_enc_set_index(VALUE obj, int idx)
00791 {
00792 rb_check_frozen(obj);
00793 must_encindex(idx);
00794 enc_set_index(obj, idx);
00795 }
00796
00797 VALUE
00798 rb_enc_associate_index(VALUE obj, int idx)
00799 {
00800 rb_encoding *enc;
00801 int oldidx, oldtermlen, termlen;
00802
00803
00804 rb_check_frozen(obj);
00805 oldidx = rb_enc_get_index(obj);
00806 if (oldidx == idx)
00807 return obj;
00808 if (SPECIAL_CONST_P(obj)) {
00809 rb_raise(rb_eArgError, "cannot set encoding");
00810 }
00811 enc = must_encindex(idx);
00812 if (!ENC_CODERANGE_ASCIIONLY(obj) ||
00813 !rb_enc_asciicompat(enc)) {
00814 ENC_CODERANGE_CLEAR(obj);
00815 }
00816 termlen = rb_enc_mbminlen(enc);
00817 oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx));
00818 if (oldtermlen < termlen && RB_TYPE_P(obj, T_STRING)) {
00819 rb_str_fill_terminator(obj, termlen);
00820 }
00821 enc_set_index(obj, idx);
00822 return obj;
00823 }
00824
00825 VALUE
00826 rb_enc_associate(VALUE obj, rb_encoding *enc)
00827 {
00828 return rb_enc_associate_index(obj, rb_enc_to_index(enc));
00829 }
00830
00831 rb_encoding*
00832 rb_enc_get(VALUE obj)
00833 {
00834 return rb_enc_from_index(rb_enc_get_index(obj));
00835 }
00836
00837 rb_encoding*
00838 rb_enc_check(VALUE str1, VALUE str2)
00839 {
00840 rb_encoding *enc = rb_enc_compatible(str1, str2);
00841 if (!enc)
00842 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
00843 rb_enc_name(rb_enc_get(str1)),
00844 rb_enc_name(rb_enc_get(str2)));
00845 return enc;
00846 }
00847
00848 rb_encoding*
00849 rb_enc_compatible(VALUE str1, VALUE str2)
00850 {
00851 int idx1, idx2;
00852 rb_encoding *enc1, *enc2;
00853 int isstr1, isstr2;
00854
00855 idx1 = rb_enc_get_index(str1);
00856 idx2 = rb_enc_get_index(str2);
00857
00858 if (idx1 < 0 || idx2 < 0)
00859 return 0;
00860
00861 if (idx1 == idx2) {
00862 return rb_enc_from_index(idx1);
00863 }
00864 enc1 = rb_enc_from_index(idx1);
00865 enc2 = rb_enc_from_index(idx2);
00866
00867 isstr2 = RB_TYPE_P(str2, T_STRING);
00868 if (isstr2 && RSTRING_LEN(str2) == 0)
00869 return enc1;
00870 isstr1 = RB_TYPE_P(str1, T_STRING);
00871 if (isstr1 && RSTRING_LEN(str1) == 0)
00872 return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
00873 if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
00874 return 0;
00875 }
00876
00877
00878 if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
00879 return enc1;
00880 if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
00881 return enc2;
00882
00883 if (!isstr1) {
00884 VALUE tmp = str1;
00885 int idx0 = idx1;
00886 str1 = str2;
00887 str2 = tmp;
00888 idx1 = idx2;
00889 idx2 = idx0;
00890 idx0 = isstr1;
00891 isstr1 = isstr2;
00892 isstr2 = idx0;
00893 }
00894 if (isstr1) {
00895 int cr1, cr2;
00896
00897 cr1 = rb_enc_str_coderange(str1);
00898 if (isstr2) {
00899 cr2 = rb_enc_str_coderange(str2);
00900 if (cr1 != cr2) {
00901
00902 if (cr1 == ENC_CODERANGE_7BIT) return enc2;
00903 if (cr2 == ENC_CODERANGE_7BIT) return enc1;
00904 }
00905 if (cr2 == ENC_CODERANGE_7BIT) {
00906 return enc1;
00907 }
00908 }
00909 if (cr1 == ENC_CODERANGE_7BIT)
00910 return enc2;
00911 }
00912 return 0;
00913 }
00914
00915 void
00916 rb_enc_copy(VALUE obj1, VALUE obj2)
00917 {
00918 rb_enc_associate_index(obj1, rb_enc_get_index(obj2));
00919 }
00920
00921
00922
00923
00924
00925
00926
00927
00928
00929 VALUE
00930 rb_obj_encoding(VALUE obj)
00931 {
00932 int idx = rb_enc_get_index(obj);
00933 if (idx < 0) {
00934 rb_raise(rb_eTypeError, "unknown encoding");
00935 }
00936 return rb_enc_from_encoding_index(idx & ENC_INDEX_MASK);
00937 }
00938
00939 int
00940 rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
00941 {
00942 return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
00943 }
00944
00945 int
00946 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
00947 {
00948 int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
00949 if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
00950 return MBCLEN_CHARFOUND_LEN(n);
00951 else {
00952 int min = rb_enc_mbminlen(enc);
00953 return min <= e-p ? min : (int)(e-p);
00954 }
00955 }
00956
00957 int
00958 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
00959 {
00960 int n;
00961 if (e <= p)
00962 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00963 n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
00964 if (e-p < n)
00965 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
00966 return n;
00967 }
00968
00969 int
00970 rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
00971 {
00972 unsigned int c, l;
00973 if (e <= p)
00974 return -1;
00975 if (rb_enc_asciicompat(enc)) {
00976 c = (unsigned char)*p;
00977 if (!ISASCII(c))
00978 return -1;
00979 if (len) *len = 1;
00980 return c;
00981 }
00982 l = rb_enc_precise_mbclen(p, e, enc);
00983 if (!MBCLEN_CHARFOUND_P(l))
00984 return -1;
00985 c = rb_enc_mbc_to_codepoint(p, e, enc);
00986 if (!rb_enc_isascii(c, enc))
00987 return -1;
00988 if (len) *len = l;
00989 return c;
00990 }
00991
00992 unsigned int
00993 rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
00994 {
00995 int r;
00996 if (e <= p)
00997 rb_raise(rb_eArgError, "empty string");
00998 r = rb_enc_precise_mbclen(p, e, enc);
00999 if (!MBCLEN_CHARFOUND_P(r)) {
01000 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
01001 }
01002 if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
01003 return rb_enc_mbc_to_codepoint(p, e, enc);
01004 }
01005
01006 #undef rb_enc_codepoint
01007 unsigned int
01008 rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
01009 {
01010 return rb_enc_codepoint_len(p, e, 0, enc);
01011 }
01012
01013 int
01014 rb_enc_codelen(int c, rb_encoding *enc)
01015 {
01016 int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
01017 if (n == 0) {
01018 rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc));
01019 }
01020 return n;
01021 }
01022
01023 #undef rb_enc_code_to_mbclen
01024 int
01025 rb_enc_code_to_mbclen(int code, rb_encoding *enc)
01026 {
01027 return ONIGENC_CODE_TO_MBCLEN(enc, code);
01028 }
01029
01030 int
01031 rb_enc_toupper(int c, rb_encoding *enc)
01032 {
01033 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
01034 }
01035
01036 int
01037 rb_enc_tolower(int c, rb_encoding *enc)
01038 {
01039 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
01040 }
01041
01042
01043
01044
01045
01046
01047
01048
01049
01050
01051 static VALUE
01052 enc_inspect(VALUE self)
01053 {
01054 rb_encoding *enc;
01055
01056 if (!is_data_encoding(self)) {
01057 not_encoding(self);
01058 }
01059 if (!(enc = DATA_PTR(self)) || rb_enc_from_index(rb_enc_to_index(enc)) != enc) {
01060 rb_raise(rb_eTypeError, "broken Encoding");
01061 }
01062 return rb_enc_sprintf(rb_usascii_encoding(),
01063 "#<%"PRIsVALUE":%s%s%s>", rb_obj_class(self),
01064 rb_enc_name(enc),
01065 (ENC_DUMMY_P(enc) ? " (dummy)" : ""),
01066 enc_autoload_p(enc) ? " (autoload)" : "");
01067 }
01068
01069
01070
01071
01072
01073
01074
01075
01076
01077
01078 static VALUE
01079 enc_name(VALUE self)
01080 {
01081 return rb_usascii_str_new2(rb_enc_name((rb_encoding*)DATA_PTR(self)));
01082 }
01083
01084 static int
01085 enc_names_i(st_data_t name, st_data_t idx, st_data_t args)
01086 {
01087 VALUE *arg = (VALUE *)args;
01088
01089 if ((int)idx == (int)arg[0]) {
01090 VALUE str = rb_usascii_str_new2((char *)name);
01091 OBJ_FREEZE(str);
01092 rb_ary_push(arg[1], str);
01093 }
01094 return ST_CONTINUE;
01095 }
01096
01097
01098
01099
01100
01101
01102
01103
01104
01105 static VALUE
01106 enc_names(VALUE self)
01107 {
01108 VALUE args[2];
01109
01110 args[0] = (VALUE)rb_to_encoding_index(self);
01111 args[1] = rb_ary_new2(0);
01112 st_foreach(enc_table.names, enc_names_i, (st_data_t)args);
01113 return args[1];
01114 }
01115
01116
01117
01118
01119
01120
01121
01122
01123
01124
01125
01126
01127
01128
01129
01130
01131
01132
01133
01134 static VALUE
01135 enc_list(VALUE klass)
01136 {
01137 VALUE ary = rb_ary_new2(0);
01138 rb_ary_replace(ary, rb_encoding_list);
01139 return ary;
01140 }
01141
01142
01143
01144
01145
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156
01157
01158
01159
01160
01161
01162
01163
01164 static VALUE
01165 enc_find(VALUE klass, VALUE enc)
01166 {
01167 int idx;
01168 if (is_obj_encoding(enc))
01169 return enc;
01170 idx = str_to_encindex(enc);
01171 if (idx == UNSPECIFIED_ENCODING) return Qnil;
01172 return rb_enc_from_encoding_index(idx);
01173 }
01174
01175
01176
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199 static VALUE
01200 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
01201 {
01202 rb_encoding *enc;
01203
01204 if (!enc_capable(str1)) return Qnil;
01205 if (!enc_capable(str2)) return Qnil;
01206 enc = rb_enc_compatible(str1, str2);
01207 if (!enc) return Qnil;
01208 return rb_enc_from_encoding(enc);
01209 }
01210
01211
01212 static VALUE
01213 enc_s_alloc(VALUE klass)
01214 {
01215 rb_undefined_alloc(klass);
01216 return Qnil;
01217 }
01218
01219
01220 static VALUE
01221 enc_dump(int argc, VALUE *argv, VALUE self)
01222 {
01223 rb_scan_args(argc, argv, "01", 0);
01224 return enc_name(self);
01225 }
01226
01227
01228 static VALUE
01229 enc_load(VALUE klass, VALUE str)
01230 {
01231 return str;
01232 }
01233
01234
01235 static VALUE
01236 enc_m_loader(VALUE klass, VALUE str)
01237 {
01238 return enc_find(klass, str);
01239 }
01240
01241 rb_encoding *
01242 rb_ascii8bit_encoding(void)
01243 {
01244 if (!enc_table.list) {
01245 rb_enc_init();
01246 }
01247 return enc_table.list[ENCINDEX_ASCII].enc;
01248 }
01249
01250 int
01251 rb_ascii8bit_encindex(void)
01252 {
01253 return ENCINDEX_ASCII;
01254 }
01255
01256 rb_encoding *
01257 rb_utf8_encoding(void)
01258 {
01259 if (!enc_table.list) {
01260 rb_enc_init();
01261 }
01262 return enc_table.list[ENCINDEX_UTF_8].enc;
01263 }
01264
01265 int
01266 rb_utf8_encindex(void)
01267 {
01268 return ENCINDEX_UTF_8;
01269 }
01270
01271 rb_encoding *
01272 rb_usascii_encoding(void)
01273 {
01274 if (!enc_table.list) {
01275 rb_enc_init();
01276 }
01277 return enc_table.list[ENCINDEX_US_ASCII].enc;
01278 }
01279
01280 int
01281 rb_usascii_encindex(void)
01282 {
01283 return ENCINDEX_US_ASCII;
01284 }
01285
01286 int
01287 rb_locale_encindex(void)
01288 {
01289 VALUE charmap = rb_locale_charmap(rb_cEncoding);
01290 int idx;
01291
01292 if (NIL_P(charmap))
01293 idx = ENCINDEX_US_ASCII;
01294 else if ((idx = rb_enc_find_index(StringValueCStr(charmap))) < 0)
01295 idx = ENCINDEX_ASCII;
01296
01297 if (rb_enc_registered("locale") < 0) {
01298 # if defined _WIN32
01299 void Init_w32_codepage(void);
01300 Init_w32_codepage();
01301 # endif
01302 enc_alias_internal("locale", idx);
01303 }
01304
01305 return idx;
01306 }
01307
01308 rb_encoding *
01309 rb_locale_encoding(void)
01310 {
01311 return rb_enc_from_index(rb_locale_encindex());
01312 }
01313
01314 int
01315 rb_filesystem_encindex(void)
01316 {
01317 int idx = rb_enc_registered("filesystem");
01318 if (idx < 0)
01319 idx = ENCINDEX_ASCII;
01320 return idx;
01321 }
01322
01323 rb_encoding *
01324 rb_filesystem_encoding(void)
01325 {
01326 return rb_enc_from_index(rb_filesystem_encindex());
01327 }
01328
01329 struct default_encoding {
01330 int index;
01331 rb_encoding *enc;
01332 };
01333
01334 static struct default_encoding default_external = {0};
01335
01336 extern int Init_enc_set_filesystem_encoding(void);
01337
01338 static int
01339 enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name)
01340 {
01341 int overridden = FALSE;
01342
01343 if (def->index != -2)
01344
01345 overridden = TRUE;
01346
01347 if (NIL_P(encoding)) {
01348 def->index = -1;
01349 def->enc = 0;
01350 st_insert(enc_table.names, (st_data_t)strdup(name),
01351 (st_data_t)UNSPECIFIED_ENCODING);
01352 }
01353 else {
01354 def->index = rb_enc_to_index(rb_to_encoding(encoding));
01355 def->enc = 0;
01356 enc_alias_internal(name, def->index);
01357 }
01358
01359 if (def == &default_external)
01360 enc_alias_internal("filesystem", Init_enc_set_filesystem_encoding());
01361
01362 return overridden;
01363 }
01364
01365 rb_encoding *
01366 rb_default_external_encoding(void)
01367 {
01368 if (default_external.enc) return default_external.enc;
01369
01370 if (default_external.index >= 0) {
01371 default_external.enc = rb_enc_from_index(default_external.index);
01372 return default_external.enc;
01373 }
01374 else {
01375 return rb_locale_encoding();
01376 }
01377 }
01378
01379 VALUE
01380 rb_enc_default_external(void)
01381 {
01382 return rb_enc_from_encoding(rb_default_external_encoding());
01383 }
01384
01385
01386
01387
01388
01389
01390
01391
01392
01393
01394
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410
01411 static VALUE
01412 get_default_external(VALUE klass)
01413 {
01414 return rb_enc_default_external();
01415 }
01416
01417 void
01418 rb_enc_set_default_external(VALUE encoding)
01419 {
01420 if (NIL_P(encoding)) {
01421 rb_raise(rb_eArgError, "default external can not be nil");
01422 }
01423 enc_set_default_encoding(&default_external, encoding,
01424 "external");
01425 }
01426
01427
01428
01429
01430
01431
01432
01433
01434
01435
01436
01437
01438
01439
01440 static VALUE
01441 set_default_external(VALUE klass, VALUE encoding)
01442 {
01443 rb_warning("setting Encoding.default_external");
01444 rb_enc_set_default_external(encoding);
01445 return encoding;
01446 }
01447
01448 static struct default_encoding default_internal = {-2};
01449
01450 rb_encoding *
01451 rb_default_internal_encoding(void)
01452 {
01453 if (!default_internal.enc && default_internal.index >= 0) {
01454 default_internal.enc = rb_enc_from_index(default_internal.index);
01455 }
01456 return default_internal.enc;
01457 }
01458
01459 VALUE
01460 rb_enc_default_internal(void)
01461 {
01462
01463 return rb_enc_from_encoding(rb_default_internal_encoding());
01464 }
01465
01466
01467
01468
01469
01470
01471
01472
01473
01474
01475
01476
01477
01478
01479
01480
01481
01482
01483
01484
01485
01486
01487
01488
01489
01490
01491
01492
01493
01494
01495
01496 static VALUE
01497 get_default_internal(VALUE klass)
01498 {
01499 return rb_enc_default_internal();
01500 }
01501
01502 void
01503 rb_enc_set_default_internal(VALUE encoding)
01504 {
01505 enc_set_default_encoding(&default_internal, encoding,
01506 "internal");
01507 }
01508
01509
01510
01511
01512
01513
01514
01515
01516
01517
01518
01519
01520
01521
01522 static VALUE
01523 set_default_internal(VALUE klass, VALUE encoding)
01524 {
01525 rb_warning("setting Encoding.default_internal");
01526 rb_enc_set_default_internal(encoding);
01527 return encoding;
01528 }
01529
01530
01531
01532
01533
01534
01535
01536
01537
01538
01539
01540
01541
01542
01543
01544
01545
01546
01547
01548
01549
01550
01551
01552
01553
01554
01555 VALUE
01556 rb_locale_charmap(VALUE klass);
01557
01558 static void
01559 set_encoding_const(const char *name, rb_encoding *enc)
01560 {
01561 VALUE encoding = rb_enc_from_encoding(enc);
01562 char *s = (char *)name;
01563 int haslower = 0, hasupper = 0, valid = 0;
01564
01565 if (ISDIGIT(*s)) return;
01566 if (ISUPPER(*s)) {
01567 hasupper = 1;
01568 while (*++s && (ISALNUM(*s) || *s == '_')) {
01569 if (ISLOWER(*s)) haslower = 1;
01570 }
01571 }
01572 if (!*s) {
01573 if (s - name > ENCODING_NAMELEN_MAX) return;
01574 valid = 1;
01575 rb_define_const(rb_cEncoding, name, encoding);
01576 }
01577 if (!valid || haslower) {
01578 size_t len = s - name;
01579 if (len > ENCODING_NAMELEN_MAX) return;
01580 if (!haslower || !hasupper) {
01581 do {
01582 if (ISLOWER(*s)) haslower = 1;
01583 if (ISUPPER(*s)) hasupper = 1;
01584 } while (*++s && (!haslower || !hasupper));
01585 len = s - name;
01586 }
01587 len += strlen(s);
01588 if (len++ > ENCODING_NAMELEN_MAX) return;
01589 MEMCPY(s = ALLOCA_N(char, len), name, char, len);
01590 name = s;
01591 if (!valid) {
01592 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
01593 for (; *s; ++s) {
01594 if (!ISALNUM(*s)) *s = '_';
01595 }
01596 if (hasupper) {
01597 rb_define_const(rb_cEncoding, name, encoding);
01598 }
01599 }
01600 if (haslower) {
01601 for (s = (char *)name; *s; ++s) {
01602 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
01603 }
01604 rb_define_const(rb_cEncoding, name, encoding);
01605 }
01606 }
01607 }
01608
01609 static int
01610 rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
01611 {
01612 VALUE ary = (VALUE)arg;
01613 VALUE str = rb_usascii_str_new2((char *)name);
01614 OBJ_FREEZE(str);
01615 rb_ary_push(ary, str);
01616 return ST_CONTINUE;
01617 }
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627
01628
01629
01630
01631
01632
01633 static VALUE
01634 rb_enc_name_list(VALUE klass)
01635 {
01636 VALUE ary = rb_ary_new2(enc_table.names->num_entries);
01637 st_foreach(enc_table.names, rb_enc_name_list_i, (st_data_t)ary);
01638 return ary;
01639 }
01640
01641 static int
01642 rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
01643 {
01644 VALUE *p = (VALUE *)arg;
01645 VALUE aliases = p[0], ary = p[1];
01646 int idx = (int)orig;
01647 VALUE key, str = rb_ary_entry(ary, idx);
01648
01649 if (NIL_P(str)) {
01650 rb_encoding *enc = rb_enc_from_index(idx);
01651
01652 if (!enc) return ST_CONTINUE;
01653 if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
01654 return ST_CONTINUE;
01655 }
01656 str = rb_usascii_str_new2(rb_enc_name(enc));
01657 OBJ_FREEZE(str);
01658 rb_ary_store(ary, idx, str);
01659 }
01660 key = rb_usascii_str_new2((char *)name);
01661 OBJ_FREEZE(key);
01662 rb_hash_aset(aliases, key, str);
01663 return ST_CONTINUE;
01664 }
01665
01666
01667
01668
01669
01670
01671
01672
01673
01674
01675
01676
01677
01678 static VALUE
01679 rb_enc_aliases(VALUE klass)
01680 {
01681 VALUE aliases[2];
01682 aliases[0] = rb_hash_new();
01683 aliases[1] = rb_ary_new();
01684 st_foreach(enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases);
01685 return aliases[0];
01686 }
01687
01688
01689
01690
01691
01692
01693
01694
01695
01696
01697
01698
01699
01700
01701
01702
01703
01704
01705
01706
01707
01708
01709
01710
01711
01712
01713
01714
01715
01716
01717
01718
01719
01720
01721
01722
01723
01724
01725
01726
01727
01728
01729
01730
01731
01732
01733
01734
01735
01736
01737
01738
01739
01740
01741
01742
01743
01744
01745
01746
01747
01748
01749
01750
01751
01752
01753
01754
01755
01756
01757
01758
01759
01760
01761
01762
01763
01764
01765
01766
01767
01768
01769
01770
01771
01772
01773
01774
01775
01776
01777
01778
01779
01780
01781
01782
01783
01784
01785
01786
01787
01788
01789
01790
01791
01792
01793
01794
01795
01796
01797
01798
01799
01800
01801
01802
01803
01804
01805
01806
01807
01808
01809
01810
01811
01812
01813
01814
01815
01816
01817
01818
01819
01820
01821
01822
01823
01824
01825
01826
01827
01828
01829
01830
01831
01832
01833
01834
01835
01836
01837
01838
01839
01840
01841
01842
01843
01844
01845
01846
01847
01848
01849
01850
01851
01852
01853
01854
01855
01856
01857
01858
01859
01860
01861
01862
01863
01864
01865
01866
01867
01868
01869
01870
01871
01872
01873
01874
01875
01876
01877
01878
01879
01880
01881
01882
01883
01884
01885
01886
01887
01888 void
01889 Init_Encoding(void)
01890 {
01891 #undef rb_intern
01892 #define rb_intern(str) rb_intern_const(str)
01893 VALUE list;
01894 int i;
01895
01896 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
01897 rb_define_alloc_func(rb_cEncoding, enc_s_alloc);
01898 rb_undef_method(CLASS_OF(rb_cEncoding), "new");
01899 rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
01900 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
01901 rb_define_method(rb_cEncoding, "name", enc_name, 0);
01902 rb_define_method(rb_cEncoding, "names", enc_names, 0);
01903 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
01904 rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0);
01905 rb_define_method(rb_cEncoding, "replicate", enc_replicate, 1);
01906 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
01907 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
01908 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
01909 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
01910 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
01911
01912 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
01913 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
01914
01915 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
01916 rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1);
01917 rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0);
01918 rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1);
01919 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
01920
01921 list = rb_ary_new2(enc_table.count);
01922 RBASIC_CLEAR_CLASS(list);
01923 rb_encoding_list = list;
01924 rb_gc_register_mark_object(list);
01925
01926 for (i = 0; i < enc_table.count; ++i) {
01927 rb_ary_push(list, enc_new(enc_table.list[i].enc));
01928 }
01929
01930 rb_marshal_define_compat(rb_cEncoding, Qnil, NULL, enc_m_loader);
01931 }
01932
01933
01934
01935 #define ctype_test(c, ctype) \
01936 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), (ctype)))
01937
01938 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
01939 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
01940 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
01941 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
01942 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
01943 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
01944 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
01945 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
01946 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
01947 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
01948 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
01949 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
01950
/*
 * Returns the lowercase form of `c` as computed by
 * ONIGENC_ASCII_CODE_TO_LOWER_CASE when rb_isascii(c) holds; any other
 * value is returned unchanged.
 */
int
rb_tolower(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
}
01956
/*
 * Returns the uppercase form of `c` as computed by
 * ONIGENC_ASCII_CODE_TO_UPPER_CASE when rb_isascii(c) holds; any other
 * value is returned unchanged.
 */
int
rb_toupper(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_UPPER_CASE(c);
}
01962
01963 void
01964 rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg)
01965 {
01966 st_foreach(enc_table.names, func, arg);
01967 }
01968