00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/encoding.h"
00014 #include "internal.h"
00015 #include "transcode_data.h"
00016 #include <ctype.h>
00017
/* Feature switch for the newline-option symbols declared below.
 * NOTE(review): it is tested with #ifdef further down, so defining it as 0
 * would still enable the guarded declarations. */
00018 #define ENABLE_ECONV_NEWLINE_OPTION 1
00019
00020
/* Globally visible VALUE handles named after the three conversion error
 * classes.  They are only declared here; nothing in this section assigns
 * them. */
00021 VALUE rb_eUndefinedConversionError;
00022 VALUE rb_eInvalidByteSequenceError;
00023 VALUE rb_eConverterNotFoundError;
00024
/* Handle named after the converter class (assigned outside this section). */
00025 VALUE rb_cEncodingConverter;
00026
/* File-local VALUE slots.  Judging by the sym_ prefix these presumably cache
 * interned Symbols used as option keys/values -- TODO confirm at the
 * initialization site, which is not visible here. */
00027 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback, sym_aref;
00028 static VALUE sym_xml, sym_text, sym_attr;
00029 static VALUE sym_universal_newline;
00030 static VALUE sym_crlf_newline;
00031 static VALUE sym_cr_newline;
00032 #ifdef ENABLE_ECONV_NEWLINE_OPTION
00033 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
00034 #endif
00035 static VALUE sym_partial_input;
00036
/* The names below mirror the econv_* result codes returned by the
 * transcoding functions in this file (presumably their Symbol forms --
 * verify against the initialization code). */
00037 static VALUE sym_invalid_byte_sequence;
00038 static VALUE sym_undefined_conversion;
00039 static VALUE sym_destination_buffer_full;
00040 static VALUE sym_source_buffer_empty;
00041 static VALUE sym_finished;
00042 static VALUE sym_after_output;
00043 static VALUE sym_incomplete_input;
00044
/* Forward declaration; the definition is not in this section.
 * From the signature: takes source/destination encoding names and an input
 * byte range, an optional caller-supplied destination buffer with its size,
 * and reports the produced length through dst_len_ptr.
 * NOTE(review): ownership of the returned pointer (caller buffer vs. fresh
 * allocation) must be confirmed at the definition. */
00045 static unsigned char *
00046 allocate_converted_string(const char *sname, const char *dname,
00047 const unsigned char *str, size_t len,
00048 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
00049 size_t *dst_len_ptr);
00050
00051
00052
/* Run-time state of one transcoder instance.  It carries everything
 * transcode_restartable0() needs to suspend in the middle of a character and
 * resume on a later call. */
00053 typedef struct rb_transcoding {
00054 const rb_transcoder *transcoder; /* static description driving this instance */
00055
00056 int flags; /* stored from the flags argument at open time */
00057
00058 int resume_position; /* resume label number for transcode_restartable0(); 0 = start */
00059 unsigned int next_table; /* lookup table currently being walked */
00060 VALUE next_info; /* action word fetched for the current byte */
00061 unsigned char next_byte; /* input byte currently being dispatched */
00062 unsigned int output_index; /* progress counter while emitting a STR1 output */
00063
00064 ssize_t recognized_len; /* bytes of the current character already consumed and saved in readbuf */
00065 ssize_t readagain_len; /* bytes stored after recognized_len in readbuf that must be fed again */
/* Saved input bytes: inline array, or heap pointer when the transcoder's
 * max_input exceeds the array size (see TRANSCODING_READBUF). */
00066 union {
00067 unsigned char ary[8];
00068 unsigned char *ptr;
00069 } readbuf;
00070
00071 ssize_t writebuf_off; /* next byte of writebuf to copy to the output */
00072 ssize_t writebuf_len; /* number of valid bytes in writebuf */
/* Staging area for output that did not fit directly: inline array, or heap
 * pointer when max_output exceeds the array size (see TRANSCODING_WRITEBUF). */
00073 union {
00074 unsigned char ary[8];
00075 unsigned char *ptr;
00076 } writebuf;
00077
/* Transcoder-private state handed to the func_* / state_*_func callbacks:
 * embedded in ary when state_size fits, otherwise heap-allocated via ptr
 * (see TRANSCODING_STATE).  dummy_for_alignment only exists to force
 * double alignment on the embedded storage. */
00078 union rb_transcoding_state_t {
00079 void *ptr;
00080 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
00081 double dummy_for_alignment;
00082 } state;
00083 } rb_transcoding;
/* Address of the saved-input buffer of tc: the inline array when the
 * transcoder's max_input fits in it, otherwise the heap pointer. */
00084 #define TRANSCODING_READBUF(tc) \
00085 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
00086 (tc)->readbuf.ary : \
00087 (tc)->readbuf.ptr)
/* Address of the output staging buffer of tc, selected the same way using
 * max_output. */
00088 #define TRANSCODING_WRITEBUF(tc) \
00089 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00090 (tc)->writebuf.ary : \
00091 (tc)->writebuf.ptr)
/* Capacity in bytes of the buffer returned by TRANSCODING_WRITEBUF(tc). */
00092 #define TRANSCODING_WRITEBUF_SIZE(tc) \
00093 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00094 sizeof((tc)->writebuf.ary) : \
00095 (size_t)(tc)->transcoder->max_output)
/* Largest state_size that is stored embedded in rb_transcoding. */
00096 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* Address of the transcoder-private state of tc: embedded storage when
 * state_size fits, otherwise the heap pointer. */
00097 #define TRANSCODING_STATE(tc) \
00098 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
00099 (tc)->state.ary : \
00100 (tc)->state.ptr)
00101
/* One stage of a converter pipeline: a transcoding instance plus the buffer
 * that receives its output. */
00102 typedef struct {
00103 struct rb_transcoding *tc; /* transcoding state for this stage */
00104 unsigned char *out_buf_start; /* start of the allocated output buffer */
00105 unsigned char *out_data_start; /* first byte of pending (unconsumed) output */
00106 unsigned char *out_data_end; /* one past the last pending output byte */
00107 unsigned char *out_buf_end; /* one past the end of the allocated buffer */
00108 rb_econv_result_t last_result; /* result of the most recent conversion call on this stage */
00109 } rb_econv_elem_t;
00110
/* A converter: an ordered array of transcoding stages (elems) together with
 * its configuration and a record of the last error. */
00111 struct rb_econv_t {
00112 int flags; /* ECONV_* option bits, set by rb_econv_open0()/rb_econv_open() */
00113 const char *source_encoding_name; /* sname passed to rb_econv_open0() (pointer stored, not copied) */
00114 const char *destination_encoding_name; /* dname passed to rb_econv_open0() (pointer stored, not copied) */
00115
00116 int started; /* initialized to 0; presumably set once conversion begins -- TODO confirm */
00117
/* Replacement string settings; initialized to NULL/0 by rb_econv_alloc().
 * NOTE(review): replacement_allocated looks like an ownership flag for
 * replacement_str -- confirm where it is set. */
00118 const unsigned char *replacement_str;
00119 size_t replacement_len;
00120 const char *replacement_enc;
00121 int replacement_allocated;
00122
/* Input-side buffer, described with the same start/data/end layout as a
 * stage's output buffer; all NULL after rb_econv_alloc(). */
00123 unsigned char *in_buf_start;
00124 unsigned char *in_data_start;
00125 unsigned char *in_data_end;
00126 unsigned char *in_buf_end;
00127 rb_econv_elem_t *elems; /* pipeline stages, num_allocated slots */
00128 int num_allocated; /* capacity of elems */
00129 int num_trans; /* number of stages in use */
00130 int num_finished; /* stage counter compared against the stage index when sweeping */
00131 struct rb_transcoding *last_tc; /* last stage that is not a decorator (see rb_econv_add_transcoder_at) */
00132
00133
/* Details of the most recent error; reset to empty values by rb_econv_alloc(). */
00134 struct {
00135 rb_econv_result_t result;
00136 struct rb_transcoding *error_tc;
00137 const char *source_encoding;
00138 const char *destination_encoding;
00139 const unsigned char *error_bytes_start;
00140 size_t error_bytes_len;
00141 size_t readagain_len;
00142 } last_error;
00143
00144
00145
/* Encoding objects for the two ends; NULL after rb_econv_alloc(). */
00146 rb_encoding *source_encoding;
00147 rb_encoding *destination_encoding;
00148 };
00149
00150
00151
00152
00153
/* True when the transcoder is a decorator, i.e. its source encoding name is
 * the empty string.  The dname argument is accepted but not evaluated. */
00154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
00155
/* Registry record for one (source, destination) encoding pair. */
00156 typedef struct {
00157 const char *sname; /* source encoding name (pointer stored, not copied) */
00158 const char *dname; /* destination encoding name (pointer stored, not copied) */
00159 const char *lib; /* library to require to obtain the transcoder, or NULL */
00160 const rb_transcoder *transcoder; /* registered transcoder, or NULL until registered */
00161 } transcoder_entry_t;
00162
/* Two-level registry: source name -> (st_table: destination name ->
 * transcoder_entry_t *).  Created outside this section. */
00163 static st_table *transcoder_table;
00164
00165 static transcoder_entry_t *
00166 make_transcoder_entry(const char *sname, const char *dname)
00167 {
00168 st_data_t val;
00169 st_table *table2;
00170
00171 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00172 val = (st_data_t)st_init_strcasetable();
00173 st_add_direct(transcoder_table, (st_data_t)sname, val);
00174 }
00175 table2 = (st_table *)val;
00176 if (!st_lookup(table2, (st_data_t)dname, &val)) {
00177 transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
00178 entry->sname = sname;
00179 entry->dname = dname;
00180 entry->lib = NULL;
00181 entry->transcoder = NULL;
00182 val = (st_data_t)entry;
00183 st_add_direct(table2, (st_data_t)dname, val);
00184 }
00185 return (transcoder_entry_t *)val;
00186 }
00187
00188 static transcoder_entry_t *
00189 get_transcoder_entry(const char *sname, const char *dname)
00190 {
00191 st_data_t val;
00192 st_table *table2;
00193
00194 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00195 return NULL;
00196 }
00197 table2 = (st_table *)val;
00198 if (!st_lookup(table2, (st_data_t)dname, &val)) {
00199 return NULL;
00200 }
00201 return (transcoder_entry_t *)val;
00202 }
00203
00204 void
00205 rb_register_transcoder(const rb_transcoder *tr)
00206 {
00207 const char *const sname = tr->src_encoding;
00208 const char *const dname = tr->dst_encoding;
00209
00210 transcoder_entry_t *entry;
00211
00212 entry = make_transcoder_entry(sname, dname);
00213 if (entry->transcoder) {
00214 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
00215 sname, dname);
00216 }
00217
00218 entry->transcoder = tr;
00219 }
00220
00221 static void
00222 declare_transcoder(const char *sname, const char *dname, const char *lib)
00223 {
00224 transcoder_entry_t *entry;
00225
00226 entry = make_transcoder_entry(sname, dname);
00227 entry->lib = lib;
00228 }
00229
/* Path prefix prepended to a declared library name before it is required
 * (see load_transcoder_entry). */
00230 static const char transcoder_lib_prefix[] = "enc/trans/";
00231
00232 void
00233 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
00234 {
00235 if (!lib) {
00236 rb_raise(rb_eArgError, "invalid library name - (null)");
00237 }
00238 declare_transcoder(enc1, enc2, lib);
00239 }
00240
/* Encoding names are equal when STRCASECMP reports no difference. */
00241 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
00242
/* Singly linked queue node used by the breadth-first path search. */
00243 typedef struct search_path_queue_tag {
00244 struct search_path_queue_tag *next; /* following node, or NULL at the tail */
00245 const char *enc; /* encoding name waiting to be expanded */
00246 } search_path_queue_t;
00247
/* Working state of the breadth-first search in transcode_search_path(). */
00248 typedef struct {
00249 st_table *visited; /* encoding name -> name it was reached from (NULL for the start) */
00250 search_path_queue_t *queue; /* head of the pending queue */
00251 search_path_queue_t **queue_last_ptr; /* address of the link where the next node is appended */
00252 const char *base_enc; /* encoding currently being expanded */
00253 } search_path_bfs_t;
00254
00255 static int
00256 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
00257 {
00258 const char *dname = (const char *)key;
00259 search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
00260 search_path_queue_t *q;
00261
00262 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
00263 return ST_CONTINUE;
00264 }
00265
00266 q = ALLOC(search_path_queue_t);
00267 q->enc = dname;
00268 q->next = NULL;
00269 *bfs->queue_last_ptr = q;
00270 bfs->queue_last_ptr = &q->next;
00271
00272 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
00273 return ST_CONTINUE;
00274 }
00275
00276 static int
00277 transcode_search_path(const char *sname, const char *dname,
00278 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
00279 void *arg)
00280 {
00281 search_path_bfs_t bfs;
00282 search_path_queue_t *q;
00283 st_data_t val;
00284 st_table *table2;
00285 int found;
00286 int pathlen = -1;
00287
00288 if (encoding_equal(sname, dname))
00289 return -1;
00290
00291 q = ALLOC(search_path_queue_t);
00292 q->enc = sname;
00293 q->next = NULL;
00294 bfs.queue_last_ptr = &q->next;
00295 bfs.queue = q;
00296
00297 bfs.visited = st_init_strcasetable();
00298 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
00299
00300 while (bfs.queue) {
00301 q = bfs.queue;
00302 bfs.queue = q->next;
00303 if (!bfs.queue)
00304 bfs.queue_last_ptr = &bfs.queue;
00305
00306 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
00307 xfree(q);
00308 continue;
00309 }
00310 table2 = (st_table *)val;
00311
00312 if (st_lookup(table2, (st_data_t)dname, &val)) {
00313 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
00314 xfree(q);
00315 found = 1;
00316 goto cleanup;
00317 }
00318
00319 bfs.base_enc = q->enc;
00320 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
00321 bfs.base_enc = NULL;
00322
00323 xfree(q);
00324 }
00325 found = 0;
00326
00327 cleanup:
00328 while (bfs.queue) {
00329 q = bfs.queue;
00330 bfs.queue = q->next;
00331 xfree(q);
00332 }
00333
00334 if (found) {
00335 const char *enc = dname;
00336 int depth;
00337 pathlen = 0;
00338 while (1) {
00339 st_lookup(bfs.visited, (st_data_t)enc, &val);
00340 if (!val)
00341 break;
00342 pathlen++;
00343 enc = (const char *)val;
00344 }
00345 depth = pathlen;
00346 enc = dname;
00347 while (1) {
00348 st_lookup(bfs.visited, (st_data_t)enc, &val);
00349 if (!val)
00350 break;
00351 callback((const char *)val, enc, --depth, arg);
00352 enc = (const char *)val;
00353 }
00354 }
00355
00356 st_free_table(bfs.visited);
00357
00358 return pathlen;
00359 }
00360
00361 static const rb_transcoder *
00362 load_transcoder_entry(transcoder_entry_t *entry)
00363 {
00364 if (entry->transcoder)
00365 return entry->transcoder;
00366
00367 if (entry->lib) {
00368 const char *const lib = entry->lib;
00369 const size_t len = strlen(lib);
00370 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
00371 const VALUE fn = rb_str_new(0, total_len);
00372 char *const path = RSTRING_PTR(fn);
00373 const int safe = rb_safe_level();
00374
00375 entry->lib = NULL;
00376
00377 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
00378 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
00379 rb_str_set_len(fn, total_len);
00380 FL_UNSET(fn, FL_TAINT|FL_UNTRUSTED);
00381 OBJ_FREEZE(fn);
00382 if (!rb_require_safe(fn, safe > 3 ? 3 : safe))
00383 return NULL;
00384 }
00385
00386 if (entry->transcoder)
00387 return entry->transcoder;
00388
00389 return NULL;
00390 }
00391
/*
 * Default replacement character for the encoding named encname.
 *
 * For UTF-8 this is the 3-byte sequence EF BF BD, reported as encoded in
 * "UTF-8"; for every other encoding it is "?", reported as "US-ASCII".
 * The byte length goes to *len_ret and the encoding name of the returned
 * bytes to *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
00406
00407
00408
00409
00410
00411 static const unsigned char *
00412 transcode_char_start(rb_transcoding *tc,
00413 const unsigned char *in_start,
00414 const unsigned char *inchar_start,
00415 const unsigned char *in_p,
00416 size_t *char_len_ptr)
00417 {
00418 const unsigned char *ptr;
00419 if (inchar_start - in_start < tc->recognized_len) {
00420 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
00421 inchar_start, unsigned char, in_p - inchar_start);
00422 ptr = TRANSCODING_READBUF(tc);
00423 }
00424 else {
00425 ptr = inchar_start - tc->recognized_len;
00426 }
00427 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
00428 return ptr;
00429 }
00430
/*
 * Core table-driven conversion loop, written as a resumable routine.
 *
 * Consumes bytes from *in_pos up to in_stop and writes converted bytes from
 * *out_pos up to out_stop, advancing both positions.  Whenever it has to
 * return (output full, input exhausted, error, finished, ...) it records a
 * resume label number in tc->resume_position; the next call jumps straight
 * back to that label through the switch below.  Values that must survive a
 * return therefore live in *tc rather than in locals.
 *
 * opt may contain ECONV_PARTIAL_INPUT (more input may follow, so an empty
 * source is not the end) and ECONV_AFTER_OUTPUT (return as soon as some
 * output has been produced).
 *
 * Returns one of the econv_* result codes passed to SUSPEND below.
 */
00431 static rb_econv_result_t
00432 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
00433 const unsigned char *in_stop, unsigned char *out_stop,
00434 rb_transcoding *tc,
00435 const int opt)
00436 {
00437 const rb_transcoder *tr = tc->transcoder;
00438 int unitlen = tr->input_unit_length;
00439 ssize_t readagain_len = 0;
00440
00441 const unsigned char *inchar_start;
00442 const unsigned char *in_p;
00443
00444 unsigned char *out_p;
00445
00446 in_p = inchar_start = *in_pos;
00447
00448 out_p = *out_pos;
00449
/* SUSPEND(ret, num): return ret to the caller and arrange for the next call
 * to continue right after this point (label resume_label<num>).  Before
 * returning it saves the bytes of the current character consumed in this
 * call into the read buffer, publishes the input/output positions, and, if
 * readagain_len is set, moves that many bytes from "recognized" to
 * "read again". */
00450 #define SUSPEND(ret, num) \
00451 do { \
00452 tc->resume_position = (num); \
00453 if (0 < in_p - inchar_start) \
00454 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
00455 inchar_start, unsigned char, in_p - inchar_start); \
00456 *in_pos = in_p; \
00457 *out_pos = out_p; \
00458 tc->recognized_len += in_p - inchar_start; \
00459 if (readagain_len) { \
00460 tc->recognized_len -= readagain_len; \
00461 tc->readagain_len = readagain_len; \
00462 } \
00463 return (ret); \
00464 resume_label ## num:; \
00465 } while (0)
/* SUSPEND_OBUF(num): keep suspending with econv_destination_buffer_full
 * until there is room for at least one output byte. */
00466 #define SUSPEND_OBUF(num) \
00467 do { \
00468 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
00469 } while (0)
00470
/* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is requested and this
 * call has produced output, suspend with econv_after_output. */
00471 #define SUSPEND_AFTER_OUTPUT(num) \
00472 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
00473 SUSPEND(econv_after_output, num); \
00474 }
00475
/* Short names for the state that is kept in *tc across suspensions. */
00476 #define next_table (tc->next_table)
00477 #define next_info (tc->next_info)
00478 #define next_byte (tc->next_byte)
00479 #define writebuf_len (tc->writebuf_len)
00480 #define writebuf_off (tc->writebuf_off)
00481
/* Re-entry dispatch: jump to the label following the SUSPEND that returned
 * last time.  Every SUSPEND number used below needs a case here. */
00482 switch (tc->resume_position) {
00483 case 0: break;
00484 case 1: goto resume_label1;
00485 case 2: goto resume_label2;
00486 case 3: goto resume_label3;
00487 case 4: goto resume_label4;
00488 case 5: goto resume_label5;
00489 case 6: goto resume_label6;
00490 case 7: goto resume_label7;
00491 case 8: goto resume_label8;
00492 case 9: goto resume_label9;
00493 case 10: goto resume_label10;
00494 case 11: goto resume_label11;
00495 case 12: goto resume_label12;
00496 case 13: goto resume_label13;
00497 case 14: goto resume_label14;
00498 case 15: goto resume_label15;
00499 case 16: goto resume_label16;
00500 case 17: goto resume_label17;
00501 case 18: goto resume_label18;
00502 case 19: goto resume_label19;
00503 case 20: goto resume_label20;
00504 case 21: goto resume_label21;
00505 case 22: goto resume_label22;
00506 case 23: goto resume_label23;
00507 case 24: goto resume_label24;
00508 case 25: goto resume_label25;
00509 case 26: goto resume_label26;
00510 case 27: goto resume_label27;
00511 case 28: goto resume_label28;
00512 case 29: goto resume_label29;
00513 case 30: goto resume_label30;
00514 case 31: goto resume_label31;
00515 case 32: goto resume_label32;
00516 case 33: goto resume_label33;
00517 case 34: goto resume_label34;
00518 }
00519
/* One iteration per input character. */
00520 while (1) {
/* Start a new character: nothing recognized yet, lookup begins at the
 * root table of the transcoder. */
00521 inchar_start = in_p;
00522 tc->recognized_len = 0;
00523 next_table = tr->conv_tree_start;
00524
00525 SUSPEND_AFTER_OUTPUT(24);
00526
/* Input exhausted at a character boundary: without ECONV_PARTIAL_INPUT
 * leave the loop and run the finish step; otherwise ask for more input. */
00527 if (in_stop <= in_p) {
00528 if (!(opt & ECONV_PARTIAL_INPUT))
00529 break;
00530 SUSPEND(econv_source_buffer_empty, 7);
00531 continue;
00532 }
00533
/* Accessors for the current lookup table: BL_BASE points at its byte
 * part (min byte, max byte, then per-byte offsets) and BL_INFO at its
 * array of action words. */
00534 #define BYTE_ADDR(index) (tr->byte_array + (index))
00535 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
00536 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
00537 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
00538 #define BL_MIN_BYTE (BL_BASE[0])
00539 #define BL_MAX_BYTE (BL_BASE[1])
00540 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
00541 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
00542
00543 next_byte = (unsigned char)*in_p++;
/* Look up the action for next_byte in the current table; bytes outside
 * the table's [min, max] range are invalid. */
00544 follow_byte:
00545 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
00546 next_info = INVALID;
00547 else {
00548 next_info = (VALUE)BL_ACTION(next_byte);
00549 }
/* Dispatch on the action type held in the low five bits of next_info. */
00550 follow_info:
00551 switch (next_info & 0x1F) {
/* NOMAP: pass the input bytes of this character through unchanged,
 * staging them in the write buffer so the copy can be suspended. */
00552 case NOMAP:
00553 {
00554 const unsigned char *p = inchar_start;
00555 writebuf_off = 0;
00556 while (p < in_p) {
00557 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
00558 }
00559 writebuf_len = writebuf_off;
00560 writebuf_off = 0;
00561 while (writebuf_off < writebuf_len) {
00562 SUSPEND_OBUF(3);
00563 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00564 }
00565 }
00566 continue;
/* These action values are not a final action: next_info designates the
 * next lookup table, so fetch another input byte and look it up there.
 * Running out of input here means the character is incomplete. */
00567 case 0x00: case 0x04: case 0x08: case 0x0C:
00568 case 0x10: case 0x14: case 0x18: case 0x1C:
00569 SUSPEND_AFTER_OUTPUT(25);
00570 while (in_p >= in_stop) {
00571 if (!(opt & ECONV_PARTIAL_INPUT))
00572 goto incomplete;
00573 SUSPEND(econv_source_buffer_empty, 5);
00574 }
00575 next_byte = (unsigned char)*in_p++;
00576 next_table = (unsigned int)next_info;
00577 goto follow_byte;
/* ZERObt: the character produces no output. */
00578 case ZERObt:
00579 continue;
/* ONEbt..FOURbt, GB4bt: the output bytes are extracted from next_info
 * itself; each byte may need to wait for output space separately. */
00580 case ONEbt:
00581 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
00582 continue;
00583 case TWObt:
00584 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
00585 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
00586 continue;
00587 case THREEbt:
00588 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
00589 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
00590 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
00591 continue;
00592 case FOURbt:
00593 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
00594 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
00595 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
00596 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
00597 continue;
00598 case GB4bt:
00599 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
00600 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
00601 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
00602 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
00603 continue;
/* STR1: the output is a string stored in the byte array; the bytes are
 * read starting at offset 1 of the record.  output_index lives in *tc
 * so the copy survives a suspension. */
00604 case STR1:
00605 tc->output_index = 0;
00606 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
00607 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
00608 tc->output_index++;
00609 }
00610 continue;
/* FUNii: the transcoder function maps the action word to a new action
 * word, which is dispatched again. */
00611 case FUNii:
00612 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
00613 goto follow_info;
/* FUNsi: the transcoder function maps the character's input bytes to an
 * action word, which is dispatched again. */
00614 case FUNsi:
00615 {
00616 const unsigned char *char_start;
00617 size_t char_len;
00618 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00619 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
00620 goto follow_info;
00621 }
/* FUNio: the transcoder function produces output bytes from the action
 * word.  When at least max_output bytes are free it writes straight to
 * the output; otherwise it writes to the write buffer, which is then
 * drained byte by byte.  The break leaves the switch and reaches the
 * continue that follows it. */
00622 case FUNio:
00623 SUSPEND_OBUF(13);
00624 if (tr->max_output <= out_stop - out_p)
00625 out_p += tr->func_io(TRANSCODING_STATE(tc),
00626 next_info, out_p, out_stop - out_p);
00627 else {
00628 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
00629 next_info,
00630 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00631 writebuf_off = 0;
00632 while (writebuf_off < writebuf_len) {
00633 SUSPEND_OBUF(20);
00634 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00635 }
00636 }
00637 break;
/* FUNso: like FUNio, but the function receives the character's input
 * bytes instead of the action word. */
00638 case FUNso:
00639 {
00640 const unsigned char *char_start;
00641 size_t char_len;
00642 SUSPEND_OBUF(14);
00643 if (tr->max_output <= out_stop - out_p) {
00644 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00645 out_p += tr->func_so(TRANSCODING_STATE(tc),
00646 char_start, (size_t)char_len,
00647 out_p, out_stop - out_p);
00648 }
00649 else {
00650 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00651 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
00652 char_start, (size_t)char_len,
00653 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00654 writebuf_off = 0;
00655 while (writebuf_off < writebuf_len) {
00656 SUSPEND_OBUF(22);
00657 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00658 }
00659 }
00660 break;
00661 }
/* FUNsio: like FUNso, but the function receives both the input bytes
 * and the action word. */
00662 case FUNsio:
00663 {
00664 const unsigned char *char_start;
00665 size_t char_len;
00666 SUSPEND_OBUF(33);
00667 if (tr->max_output <= out_stop - out_p) {
00668 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00669 out_p += tr->func_sio(TRANSCODING_STATE(tc),
00670 char_start, (size_t)char_len, next_info,
00671 out_p, out_stop - out_p);
00672 }
00673 else {
00674 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00675 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
00676 char_start, (size_t)char_len, next_info,
00677 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00678 writebuf_off = 0;
00679 while (writebuf_off < writebuf_len) {
00680 SUSPEND_OBUF(34);
00681 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00682 }
00683 }
00684 break;
00685 }
/* INVALID: decide how many bytes form the invalid sequence, in terms of
 * the transcoder's input unit length (unitlen).
 *  - At most one unit consumed so far: extend the sequence to a whole
 *    unit, waiting for more input first when ECONV_PARTIAL_INPUT allows
 *    it, or taking whatever input remains when less than a unit is left.
 *  - More than one unit consumed: report only whole leading units as
 *    invalid and mark the remainder (readagain_len bytes, including the
 *    byte that triggered the error) to be read again. */
00686 case INVALID:
00687 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
00688 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
00689 SUSPEND_AFTER_OUTPUT(26);
00690 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
00691 in_p = in_stop;
00692 SUSPEND(econv_source_buffer_empty, 8);
00693 }
00694 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
00695 in_p = in_stop;
00696 }
00697 else {
00698 in_p = inchar_start + (unitlen - tc->recognized_len);
00699 }
00700 }
00701 else {
00702 ssize_t invalid_len;
00703 ssize_t discard_len;
00704 invalid_len = tc->recognized_len + (in_p - inchar_start);
00705 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
00706 readagain_len = invalid_len - discard_len;
00707 }
00708 goto invalid;
00709 case UNDEF:
00710 goto undef;
00711 default:
00712 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
00713 }
/* Reached from the FUNio/FUNso/FUNsio cases via break. */
00714 continue;
00715
/* Error exits.  Each one reports the condition to the caller; when the
 * caller resumes, conversion continues with the next character. */
00716 invalid:
00717 SUSPEND(econv_invalid_byte_sequence, 1);
00718 continue;
00719
00720 incomplete:
00721 SUSPEND(econv_incomplete_input, 27);
00722 continue;
00723
00724 undef:
00725 SUSPEND(econv_undefined_conversion, 2);
00726 continue;
00727 }
00728
00729
/* All input consumed: give the transcoder a chance to emit trailing
 * output, using the same direct-or-staged scheme as the FUN*o cases. */
00730 if (tr->finish_func) {
00731 SUSPEND_OBUF(4);
00732 if (tr->max_output <= out_stop - out_p) {
00733 out_p += tr->finish_func(TRANSCODING_STATE(tc),
00734 out_p, out_stop - out_p);
00735 }
00736 else {
00737 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
00738 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00739 writebuf_off = 0;
00740 while (writebuf_off < writebuf_len) {
00741 SUSPEND_OBUF(23);
00742 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00743 }
00744 }
00745 }
/* Terminal state: every further call resumes inside this loop and
 * returns econv_finished again. */
00746 while (1)
00747 SUSPEND(econv_finished, 6);
/* NOTE(review): SUSPEND_OBUF, SUSPEND_AFTER_OUTPUT and the BYTE_ADDR/BL_*
 * helpers are not #undef'ed here and remain defined for the rest of the
 * file. */
00748 #undef SUSPEND
00749 #undef next_table
00750 #undef next_info
00751 #undef next_byte
00752 #undef writebuf_len
00753 #undef writebuf_off
00754 }
00755
00756 static rb_econv_result_t
00757 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
00758 const unsigned char *in_stop, unsigned char *out_stop,
00759 rb_transcoding *tc,
00760 const int opt)
00761 {
00762 if (tc->readagain_len) {
00763 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
00764 const unsigned char *readagain_pos = readagain_buf;
00765 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
00766 rb_econv_result_t res;
00767
00768 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
00769 unsigned char, tc->readagain_len);
00770 tc->readagain_len = 0;
00771 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
00772 if (res != econv_source_buffer_empty) {
00773 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
00774 readagain_pos, unsigned char, readagain_stop - readagain_pos);
00775 tc->readagain_len += readagain_stop - readagain_pos;
00776 return res;
00777 }
00778 }
00779 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
00780 }
00781
00782 static rb_transcoding *
00783 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
00784 {
00785 rb_transcoding *tc;
00786
00787 tc = ALLOC(rb_transcoding);
00788 tc->transcoder = tr;
00789 tc->flags = flags;
00790 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00791 tc->state.ptr = xmalloc(tr->state_size);
00792 if (tr->state_init_func) {
00793 (tr->state_init_func)(TRANSCODING_STATE(tc));
00794 }
00795 tc->resume_position = 0;
00796 tc->recognized_len = 0;
00797 tc->readagain_len = 0;
00798 tc->writebuf_len = 0;
00799 tc->writebuf_off = 0;
00800 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00801 tc->readbuf.ptr = xmalloc(tr->max_input);
00802 }
00803 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00804 tc->writebuf.ptr = xmalloc(tr->max_output);
00805 }
00806 return tc;
00807 }
00808
00809 static rb_econv_result_t
00810 rb_transcoding_convert(rb_transcoding *tc,
00811 const unsigned char **input_ptr, const unsigned char *input_stop,
00812 unsigned char **output_ptr, unsigned char *output_stop,
00813 int flags)
00814 {
00815 return transcode_restartable(
00816 input_ptr, output_ptr,
00817 input_stop, output_stop,
00818 tc, flags);
00819 }
00820
00821 static void
00822 rb_transcoding_close(rb_transcoding *tc)
00823 {
00824 const rb_transcoder *tr = tc->transcoder;
00825 if (tr->state_fini_func) {
00826 (tr->state_fini_func)(TRANSCODING_STATE(tc));
00827 }
00828 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00829 xfree(tc->state.ptr);
00830 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
00831 xfree(tc->readbuf.ptr);
00832 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
00833 xfree(tc->writebuf.ptr);
00834 xfree(tc);
00835 }
00836
00837 static size_t
00838 rb_transcoding_memsize(rb_transcoding *tc)
00839 {
00840 size_t size = sizeof(rb_transcoding);
00841 const rb_transcoder *tr = tc->transcoder;
00842
00843 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
00844 size += tr->state_size;
00845 }
00846 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00847 size += tr->max_input;
00848 }
00849 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00850 size += tr->max_output;
00851 }
00852 return size;
00853 }
00854
00855 static rb_econv_t *
00856 rb_econv_alloc(int n_hint)
00857 {
00858 rb_econv_t *ec;
00859
00860 if (n_hint <= 0)
00861 n_hint = 1;
00862
00863 ec = ALLOC(rb_econv_t);
00864 ec->flags = 0;
00865 ec->source_encoding_name = NULL;
00866 ec->destination_encoding_name = NULL;
00867 ec->started = 0;
00868 ec->replacement_str = NULL;
00869 ec->replacement_len = 0;
00870 ec->replacement_enc = NULL;
00871 ec->replacement_allocated = 0;
00872 ec->in_buf_start = NULL;
00873 ec->in_data_start = NULL;
00874 ec->in_data_end = NULL;
00875 ec->in_buf_end = NULL;
00876 ec->num_allocated = n_hint;
00877 ec->num_trans = 0;
00878 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
00879 ec->num_finished = 0;
00880 ec->last_tc = NULL;
00881 ec->last_error.result = econv_source_buffer_empty;
00882 ec->last_error.error_tc = NULL;
00883 ec->last_error.source_encoding = NULL;
00884 ec->last_error.destination_encoding = NULL;
00885 ec->last_error.error_bytes_start = NULL;
00886 ec->last_error.error_bytes_len = 0;
00887 ec->last_error.readagain_len = 0;
00888 ec->source_encoding = NULL;
00889 ec->destination_encoding = NULL;
00890 return ec;
00891 }
00892
00893 static int
00894 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
00895 {
00896 int n, j;
00897 int bufsize = 4096;
00898 unsigned char *p;
00899
00900 if (ec->num_trans == ec->num_allocated) {
00901 n = ec->num_allocated * 2;
00902 REALLOC_N(ec->elems, rb_econv_elem_t, n);
00903 ec->num_allocated = n;
00904 }
00905
00906 p = xmalloc(bufsize);
00907
00908 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
00909
00910 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
00911 ec->elems[i].out_buf_start = p;
00912 ec->elems[i].out_buf_end = p + bufsize;
00913 ec->elems[i].out_data_start = p;
00914 ec->elems[i].out_data_end = p;
00915 ec->elems[i].last_result = econv_source_buffer_empty;
00916
00917 ec->num_trans++;
00918
00919 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
00920 for (j = ec->num_trans-1; i <= j; j--) {
00921 rb_transcoding *tc = ec->elems[j].tc;
00922 const rb_transcoder *tr2 = tc->transcoder;
00923 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
00924 ec->last_tc = tc;
00925 break;
00926 }
00927 }
00928
00929 return 0;
00930 }
00931
00932 static rb_econv_t *
00933 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
00934 {
00935 rb_econv_t *ec;
00936 int i, ret;
00937
00938 for (i = 0; i < n; i++) {
00939 const rb_transcoder *tr;
00940 tr = load_transcoder_entry(entries[i]);
00941 if (!tr)
00942 return NULL;
00943 }
00944
00945 ec = rb_econv_alloc(n);
00946
00947 for (i = 0; i < n; i++) {
00948 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
00949 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
00950 if (ret == -1) {
00951 rb_econv_close(ec);
00952 return NULL;
00953 }
00954 }
00955
00956 return ec;
00957 }
00958
/* Argument block passed to trans_open_i() through transcode_search_path(). */
00959 struct trans_open_t {
00960 transcoder_entry_t **entries; /* path entries indexed by depth; allocated on the first callback */
00961 int num_additional; /* extra slots to reserve beyond the path length */
00962 };
00963
00964 static void
00965 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
00966 {
00967 struct trans_open_t *toarg = arg;
00968
00969 if (!toarg->entries) {
00970 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
00971 }
00972 toarg->entries[depth] = get_transcoder_entry(sname, dname);
00973 }
00974
00975 static rb_econv_t *
00976 rb_econv_open0(const char *sname, const char *dname, int ecflags)
00977 {
00978 transcoder_entry_t **entries = NULL;
00979 int num_trans;
00980 rb_econv_t *ec;
00981
00982 int sidx, didx;
00983
00984 if (*sname) {
00985 sidx = rb_enc_find_index(sname);
00986 if (0 <= sidx) {
00987 rb_enc_from_index(sidx);
00988 }
00989 }
00990
00991 if (*dname) {
00992 didx = rb_enc_find_index(dname);
00993 if (0 <= didx) {
00994 rb_enc_from_index(didx);
00995 }
00996 }
00997
00998 if (*sname == '\0' && *dname == '\0') {
00999 num_trans = 0;
01000 entries = NULL;
01001 }
01002 else {
01003 struct trans_open_t toarg;
01004 toarg.entries = NULL;
01005 toarg.num_additional = 0;
01006 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
01007 entries = toarg.entries;
01008 if (num_trans < 0) {
01009 xfree(entries);
01010 return NULL;
01011 }
01012 }
01013
01014 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
01015 xfree(entries);
01016 if (!ec)
01017 return NULL;
01018
01019 ec->flags = ecflags;
01020 ec->source_encoding_name = sname;
01021 ec->destination_encoding_name = dname;
01022
01023 return ec;
01024 }
01025
/* Capacity of the decorator-name array filled by decorator_names(). */
01026 #define MAX_ECFLAGS_DECORATORS 32
01027
01028 static int
01029 decorator_names(int ecflags, const char **decorators_ret)
01030 {
01031 int num_decorators;
01032
01033 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
01034 case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
01035 case ECONV_CRLF_NEWLINE_DECORATOR:
01036 case ECONV_CR_NEWLINE_DECORATOR:
01037 case 0:
01038 break;
01039 default:
01040 return -1;
01041 }
01042
01043 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
01044 (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
01045 return -1;
01046
01047 num_decorators = 0;
01048
01049 if (ecflags & ECONV_XML_TEXT_DECORATOR)
01050 decorators_ret[num_decorators++] = "xml_text_escape";
01051 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
01052 decorators_ret[num_decorators++] = "xml_attr_content_escape";
01053 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
01054 decorators_ret[num_decorators++] = "xml_attr_quote";
01055
01056 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
01057 decorators_ret[num_decorators++] = "crlf_newline";
01058 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
01059 decorators_ret[num_decorators++] = "cr_newline";
01060 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
01061 decorators_ret[num_decorators++] = "universal_newline";
01062
01063 return num_decorators;
01064 }
01065
01066 rb_econv_t *
01067 rb_econv_open(const char *sname, const char *dname, int ecflags)
01068 {
01069 rb_econv_t *ec;
01070 int num_decorators;
01071 const char *decorators[MAX_ECFLAGS_DECORATORS];
01072 int i;
01073
01074 num_decorators = decorator_names(ecflags, decorators);
01075 if (num_decorators == -1)
01076 return NULL;
01077
01078 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
01079 if (!ec)
01080 return NULL;
01081
01082 for (i = 0; i < num_decorators; i++)
01083 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
01084 rb_econv_close(ec);
01085 return NULL;
01086 }
01087
01088 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
01089
01090 return ec;
01091 }
01092
01093 static int
01094 trans_sweep(rb_econv_t *ec,
01095 const unsigned char **input_ptr, const unsigned char *input_stop,
01096 unsigned char **output_ptr, unsigned char *output_stop,
01097 int flags,
01098 int start)
01099 {
01100 int try;
01101 int i, f;
01102
01103 const unsigned char **ipp, *is, *iold;
01104 unsigned char **opp, *os, *oold;
01105 rb_econv_result_t res;
01106
01107 try = 1;
01108 while (try) {
01109 try = 0;
01110 for (i = start; i < ec->num_trans; i++) {
01111 rb_econv_elem_t *te = &ec->elems[i];
01112
01113 if (i == 0) {
01114 ipp = input_ptr;
01115 is = input_stop;
01116 }
01117 else {
01118 rb_econv_elem_t *prev_te = &ec->elems[i-1];
01119 ipp = (const unsigned char **)&prev_te->out_data_start;
01120 is = prev_te->out_data_end;
01121 }
01122
01123 if (i == ec->num_trans-1) {
01124 opp = output_ptr;
01125 os = output_stop;
01126 }
01127 else {
01128 if (te->out_buf_start != te->out_data_start) {
01129 ssize_t len = te->out_data_end - te->out_data_start;
01130 ssize_t off = te->out_data_start - te->out_buf_start;
01131 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
01132 te->out_data_start = te->out_buf_start;
01133 te->out_data_end -= off;
01134 }
01135 opp = &te->out_data_end;
01136 os = te->out_buf_end;
01137 }
01138
01139 f = flags;
01140 if (ec->num_finished != i)
01141 f |= ECONV_PARTIAL_INPUT;
01142 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
01143 start = 1;
01144 flags &= ~ECONV_AFTER_OUTPUT;
01145 }
01146 if (i != 0)
01147 f &= ~ECONV_AFTER_OUTPUT;
01148 iold = *ipp;
01149 oold = *opp;
01150 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
01151 if (iold != *ipp || oold != *opp)
01152 try = 1;
01153
01154 switch (res) {
01155 case econv_invalid_byte_sequence:
01156 case econv_incomplete_input:
01157 case econv_undefined_conversion:
01158 case econv_after_output:
01159 return i;
01160
01161 case econv_destination_buffer_full:
01162 case econv_source_buffer_empty:
01163 break;
01164
01165 case econv_finished:
01166 ec->num_finished = i+1;
01167 break;
01168 }
01169 }
01170 }
01171 return -1;
01172 }
01173
01174 static rb_econv_result_t
01175 rb_trans_conv(rb_econv_t *ec,
01176 const unsigned char **input_ptr, const unsigned char *input_stop,
01177 unsigned char **output_ptr, unsigned char *output_stop,
01178 int flags,
01179 int *result_position_ptr)
01180 {
01181 int i;
01182 int needreport_index;
01183 int sweep_start;
01184
01185 unsigned char empty_buf;
01186 unsigned char *empty_ptr = &empty_buf;
01187
01188 if (!input_ptr) {
01189 input_ptr = (const unsigned char **)&empty_ptr;
01190 input_stop = empty_ptr;
01191 }
01192
01193 if (!output_ptr) {
01194 output_ptr = &empty_ptr;
01195 output_stop = empty_ptr;
01196 }
01197
01198 if (ec->elems[0].last_result == econv_after_output)
01199 ec->elems[0].last_result = econv_source_buffer_empty;
01200
01201 needreport_index = -1;
01202 for (i = ec->num_trans-1; 0 <= i; i--) {
01203 switch (ec->elems[i].last_result) {
01204 case econv_invalid_byte_sequence:
01205 case econv_incomplete_input:
01206 case econv_undefined_conversion:
01207 case econv_after_output:
01208 case econv_finished:
01209 sweep_start = i+1;
01210 needreport_index = i;
01211 goto found_needreport;
01212
01213 case econv_destination_buffer_full:
01214 case econv_source_buffer_empty:
01215 break;
01216
01217 default:
01218 rb_bug("unexpected transcode last result");
01219 }
01220 }
01221
01222
01223
01224 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
01225 (flags & ECONV_AFTER_OUTPUT)) {
01226 rb_econv_result_t res;
01227
01228 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
01229 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
01230 result_position_ptr);
01231
01232 if (res == econv_source_buffer_empty)
01233 return econv_after_output;
01234 return res;
01235 }
01236
01237 sweep_start = 0;
01238
01239 found_needreport:
01240
01241 do {
01242 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
01243 sweep_start = needreport_index + 1;
01244 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
01245
01246 for (i = ec->num_trans-1; 0 <= i; i--) {
01247 if (ec->elems[i].last_result != econv_source_buffer_empty) {
01248 rb_econv_result_t res = ec->elems[i].last_result;
01249 if (res == econv_invalid_byte_sequence ||
01250 res == econv_incomplete_input ||
01251 res == econv_undefined_conversion ||
01252 res == econv_after_output) {
01253 ec->elems[i].last_result = econv_source_buffer_empty;
01254 }
01255 if (result_position_ptr)
01256 *result_position_ptr = i;
01257 return res;
01258 }
01259 }
01260 if (result_position_ptr)
01261 *result_position_ptr = -1;
01262 return econv_source_buffer_empty;
01263 }
01264
01265 static rb_econv_result_t
01266 rb_econv_convert0(rb_econv_t *ec,
01267 const unsigned char **input_ptr, const unsigned char *input_stop,
01268 unsigned char **output_ptr, unsigned char *output_stop,
01269 int flags)
01270 {
01271 rb_econv_result_t res;
01272 int result_position;
01273 int has_output = 0;
01274
01275 memset(&ec->last_error, 0, sizeof(ec->last_error));
01276
01277 if (ec->num_trans == 0) {
01278 size_t len;
01279 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
01280 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
01281 len = output_stop - *output_ptr;
01282 memcpy(*output_ptr, ec->in_data_start, len);
01283 *output_ptr = output_stop;
01284 ec->in_data_start += len;
01285 res = econv_destination_buffer_full;
01286 goto gotresult;
01287 }
01288 len = ec->in_data_end - ec->in_data_start;
01289 memcpy(*output_ptr, ec->in_data_start, len);
01290 *output_ptr += len;
01291 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
01292 if (flags & ECONV_AFTER_OUTPUT) {
01293 res = econv_after_output;
01294 goto gotresult;
01295 }
01296 }
01297 if (output_stop - *output_ptr < input_stop - *input_ptr) {
01298 len = output_stop - *output_ptr;
01299 }
01300 else {
01301 len = input_stop - *input_ptr;
01302 }
01303 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
01304 *(*output_ptr)++ = *(*input_ptr)++;
01305 res = econv_after_output;
01306 goto gotresult;
01307 }
01308 memcpy(*output_ptr, *input_ptr, len);
01309 *output_ptr += len;
01310 *input_ptr += len;
01311 if (*input_ptr != input_stop)
01312 res = econv_destination_buffer_full;
01313 else if (flags & ECONV_PARTIAL_INPUT)
01314 res = econv_source_buffer_empty;
01315 else
01316 res = econv_finished;
01317 goto gotresult;
01318 }
01319
01320 if (ec->elems[ec->num_trans-1].out_data_start) {
01321 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
01322 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
01323 if (data_start != data_end) {
01324 size_t len;
01325 if (output_stop - *output_ptr < data_end - data_start) {
01326 len = output_stop - *output_ptr;
01327 memcpy(*output_ptr, data_start, len);
01328 *output_ptr = output_stop;
01329 ec->elems[ec->num_trans-1].out_data_start += len;
01330 res = econv_destination_buffer_full;
01331 goto gotresult;
01332 }
01333 len = data_end - data_start;
01334 memcpy(*output_ptr, data_start, len);
01335 *output_ptr += len;
01336 ec->elems[ec->num_trans-1].out_data_start =
01337 ec->elems[ec->num_trans-1].out_data_end =
01338 ec->elems[ec->num_trans-1].out_buf_start;
01339 has_output = 1;
01340 }
01341 }
01342
01343 if (ec->in_buf_start &&
01344 ec->in_data_start != ec->in_data_end) {
01345 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
01346 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
01347 if (res != econv_source_buffer_empty)
01348 goto gotresult;
01349 }
01350
01351 if (has_output &&
01352 (flags & ECONV_AFTER_OUTPUT) &&
01353 *input_ptr != input_stop) {
01354 input_stop = *input_ptr;
01355 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01356 if (res == econv_source_buffer_empty)
01357 res = econv_after_output;
01358 }
01359 else if ((flags & ECONV_AFTER_OUTPUT) ||
01360 ec->num_trans == 1) {
01361 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01362 }
01363 else {
01364 flags |= ECONV_AFTER_OUTPUT;
01365 do {
01366 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01367 } while (res == econv_after_output);
01368 }
01369
01370 gotresult:
01371 ec->last_error.result = res;
01372 if (res == econv_invalid_byte_sequence ||
01373 res == econv_incomplete_input ||
01374 res == econv_undefined_conversion) {
01375 rb_transcoding *error_tc = ec->elems[result_position].tc;
01376 ec->last_error.error_tc = error_tc;
01377 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
01378 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
01379 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
01380 ec->last_error.error_bytes_len = error_tc->recognized_len;
01381 ec->last_error.readagain_len = error_tc->readagain_len;
01382 }
01383
01384 return res;
01385 }
01386
01387 static int output_replacement_character(rb_econv_t *ec);
01388
01389 static int
01390 output_hex_charref(rb_econv_t *ec)
01391 {
01392 int ret;
01393 unsigned char utfbuf[1024];
01394 const unsigned char *utf;
01395 size_t utf_len;
01396 int utf_allocated = 0;
01397 char charef_buf[16];
01398 const unsigned char *p;
01399
01400 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
01401 utf = ec->last_error.error_bytes_start;
01402 utf_len = ec->last_error.error_bytes_len;
01403 }
01404 else {
01405 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
01406 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
01407 utfbuf, sizeof(utfbuf),
01408 &utf_len);
01409 if (!utf)
01410 return -1;
01411 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
01412 utf_allocated = 1;
01413 }
01414
01415 if (utf_len % 4 != 0)
01416 goto fail;
01417
01418 p = utf;
01419 while (4 <= utf_len) {
01420 unsigned int u = 0;
01421 u += p[0] << 24;
01422 u += p[1] << 16;
01423 u += p[2] << 8;
01424 u += p[3];
01425 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
01426
01427 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
01428 if (ret == -1)
01429 goto fail;
01430
01431 p += 4;
01432 utf_len -= 4;
01433 }
01434
01435 if (utf_allocated)
01436 xfree((void *)utf);
01437 return 0;
01438
01439 fail:
01440 if (utf_allocated)
01441 xfree((void *)utf);
01442 return -1;
01443 }
01444
01445 rb_econv_result_t
01446 rb_econv_convert(rb_econv_t *ec,
01447 const unsigned char **input_ptr, const unsigned char *input_stop,
01448 unsigned char **output_ptr, unsigned char *output_stop,
01449 int flags)
01450 {
01451 rb_econv_result_t ret;
01452
01453 unsigned char empty_buf;
01454 unsigned char *empty_ptr = &empty_buf;
01455
01456 ec->started = 1;
01457
01458 if (!input_ptr) {
01459 input_ptr = (const unsigned char **)&empty_ptr;
01460 input_stop = empty_ptr;
01461 }
01462
01463 if (!output_ptr) {
01464 output_ptr = &empty_ptr;
01465 output_stop = empty_ptr;
01466 }
01467
01468 resume:
01469 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
01470
01471 if (ret == econv_invalid_byte_sequence ||
01472 ret == econv_incomplete_input) {
01473
01474
01475 switch (ec->flags & ECONV_INVALID_MASK) {
01476 case ECONV_INVALID_REPLACE:
01477 if (output_replacement_character(ec) == 0)
01478 goto resume;
01479 }
01480 }
01481
01482 if (ret == econv_undefined_conversion) {
01483
01484
01485
01486 switch (ec->flags & ECONV_UNDEF_MASK) {
01487 case ECONV_UNDEF_REPLACE:
01488 if (output_replacement_character(ec) == 0)
01489 goto resume;
01490 break;
01491
01492 case ECONV_UNDEF_HEX_CHARREF:
01493 if (output_hex_charref(ec) == 0)
01494 goto resume;
01495 break;
01496 }
01497 }
01498
01499 return ret;
01500 }
01501
01502 const char *
01503 rb_econv_encoding_to_insert_output(rb_econv_t *ec)
01504 {
01505 rb_transcoding *tc = ec->last_tc;
01506 const rb_transcoder *tr;
01507
01508 if (tc == NULL)
01509 return "";
01510
01511 tr = tc->transcoder;
01512
01513 if (tr->asciicompat_type == asciicompat_encoder)
01514 return tr->src_encoding;
01515 return tr->dst_encoding;
01516 }
01517
01518 static unsigned char *
01519 allocate_converted_string(const char *sname, const char *dname,
01520 const unsigned char *str, size_t len,
01521 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
01522 size_t *dst_len_ptr)
01523 {
01524 unsigned char *dst_str;
01525 size_t dst_len;
01526 size_t dst_bufsize;
01527
01528 rb_econv_t *ec;
01529 rb_econv_result_t res;
01530
01531 const unsigned char *sp;
01532 unsigned char *dp;
01533
01534 if (caller_dst_buf)
01535 dst_bufsize = caller_dst_bufsize;
01536 else if (len == 0)
01537 dst_bufsize = 1;
01538 else
01539 dst_bufsize = len;
01540
01541 ec = rb_econv_open(sname, dname, 0);
01542 if (ec == NULL)
01543 return NULL;
01544 if (caller_dst_buf)
01545 dst_str = caller_dst_buf;
01546 else
01547 dst_str = xmalloc(dst_bufsize);
01548 dst_len = 0;
01549 sp = str;
01550 dp = dst_str+dst_len;
01551 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01552 dst_len = dp - dst_str;
01553 while (res == econv_destination_buffer_full) {
01554 if (SIZE_MAX/2 < dst_bufsize) {
01555 goto fail;
01556 }
01557 dst_bufsize *= 2;
01558 if (dst_str == caller_dst_buf) {
01559 unsigned char *tmp;
01560 tmp = xmalloc(dst_bufsize);
01561 memcpy(tmp, dst_str, dst_bufsize/2);
01562 dst_str = tmp;
01563 }
01564 else {
01565 dst_str = xrealloc(dst_str, dst_bufsize);
01566 }
01567 dp = dst_str+dst_len;
01568 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01569 dst_len = dp - dst_str;
01570 }
01571 if (res != econv_finished) {
01572 goto fail;
01573 }
01574 rb_econv_close(ec);
01575 *dst_len_ptr = dst_len;
01576 return dst_str;
01577
01578 fail:
01579 if (dst_str != caller_dst_buf)
01580 xfree(dst_str);
01581 rb_econv_close(ec);
01582 return NULL;
01583 }
01584
01585
01586 int
01587 rb_econv_insert_output(rb_econv_t *ec,
01588 const unsigned char *str, size_t len, const char *str_encoding)
01589 {
01590 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
01591 unsigned char insert_buf[4096];
01592 const unsigned char *insert_str = NULL;
01593 size_t insert_len;
01594
01595 int last_trans_index;
01596 rb_transcoding *tc;
01597
01598 unsigned char **buf_start_p;
01599 unsigned char **data_start_p;
01600 unsigned char **data_end_p;
01601 unsigned char **buf_end_p;
01602
01603 size_t need;
01604
01605 ec->started = 1;
01606
01607 if (len == 0)
01608 return 0;
01609
01610 if (encoding_equal(insert_encoding, str_encoding)) {
01611 insert_str = str;
01612 insert_len = len;
01613 }
01614 else {
01615 insert_str = allocate_converted_string(str_encoding, insert_encoding,
01616 str, len, insert_buf, sizeof(insert_buf), &insert_len);
01617 if (insert_str == NULL)
01618 return -1;
01619 }
01620
01621 need = insert_len;
01622
01623 last_trans_index = ec->num_trans-1;
01624 if (ec->num_trans == 0) {
01625 tc = NULL;
01626 buf_start_p = &ec->in_buf_start;
01627 data_start_p = &ec->in_data_start;
01628 data_end_p = &ec->in_data_end;
01629 buf_end_p = &ec->in_buf_end;
01630 }
01631 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
01632 tc = ec->elems[last_trans_index].tc;
01633 need += tc->readagain_len;
01634 if (need < insert_len)
01635 goto fail;
01636 if (last_trans_index == 0) {
01637 buf_start_p = &ec->in_buf_start;
01638 data_start_p = &ec->in_data_start;
01639 data_end_p = &ec->in_data_end;
01640 buf_end_p = &ec->in_buf_end;
01641 }
01642 else {
01643 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
01644 buf_start_p = &ee->out_buf_start;
01645 data_start_p = &ee->out_data_start;
01646 data_end_p = &ee->out_data_end;
01647 buf_end_p = &ee->out_buf_end;
01648 }
01649 }
01650 else {
01651 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
01652 buf_start_p = &ee->out_buf_start;
01653 data_start_p = &ee->out_data_start;
01654 data_end_p = &ee->out_data_end;
01655 buf_end_p = &ee->out_buf_end;
01656 tc = ec->elems[last_trans_index].tc;
01657 }
01658
01659 if (*buf_start_p == NULL) {
01660 unsigned char *buf = xmalloc(need);
01661 *buf_start_p = buf;
01662 *data_start_p = buf;
01663 *data_end_p = buf;
01664 *buf_end_p = buf+need;
01665 }
01666 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
01667 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
01668 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
01669 *data_start_p = *buf_start_p;
01670 if ((size_t)(*buf_end_p - *data_end_p) < need) {
01671 unsigned char *buf;
01672 size_t s = (*data_end_p - *buf_start_p) + need;
01673 if (s < need)
01674 goto fail;
01675 buf = xrealloc(*buf_start_p, s);
01676 *data_start_p = buf;
01677 *data_end_p = buf + (*data_end_p - *buf_start_p);
01678 *buf_start_p = buf;
01679 *buf_end_p = buf + s;
01680 }
01681 }
01682
01683 memcpy(*data_end_p, insert_str, insert_len);
01684 *data_end_p += insert_len;
01685 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
01686 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
01687 *data_end_p += tc->readagain_len;
01688 tc->readagain_len = 0;
01689 }
01690
01691 if (insert_str != str && insert_str != insert_buf)
01692 xfree((void*)insert_str);
01693 return 0;
01694
01695 fail:
01696 if (insert_str != str && insert_str != insert_buf)
01697 xfree((void*)insert_str);
01698 return -1;
01699 }
01700
01701 void
01702 rb_econv_close(rb_econv_t *ec)
01703 {
01704 int i;
01705
01706 if (ec->replacement_allocated) {
01707 xfree((void *)ec->replacement_str);
01708 }
01709 for (i = 0; i < ec->num_trans; i++) {
01710 rb_transcoding_close(ec->elems[i].tc);
01711 if (ec->elems[i].out_buf_start)
01712 xfree(ec->elems[i].out_buf_start);
01713 }
01714 xfree(ec->in_buf_start);
01715 xfree(ec->elems);
01716 xfree(ec);
01717 }
01718
01719 size_t
01720 rb_econv_memsize(rb_econv_t *ec)
01721 {
01722 size_t size = sizeof(rb_econv_t);
01723 int i;
01724
01725 if (ec->replacement_allocated) {
01726 size += ec->replacement_len;
01727 }
01728 for (i = 0; i < ec->num_trans; i++) {
01729 size += rb_transcoding_memsize(ec->elems[i].tc);
01730
01731 if (ec->elems[i].out_buf_start) {
01732 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
01733 }
01734 }
01735 size += ec->in_buf_end - ec->in_buf_start;
01736 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
01737
01738 return size;
01739 }
01740
01741 int
01742 rb_econv_putbackable(rb_econv_t *ec)
01743 {
01744 if (ec->num_trans == 0)
01745 return 0;
01746 #if SIZEOF_SIZE_T > SIZEOF_INT
01747 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
01748 #endif
01749 return (int)ec->elems[0].tc->readagain_len;
01750 }
01751
01752 void
01753 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
01754 {
01755 rb_transcoding *tc;
01756 if (ec->num_trans == 0 || n == 0)
01757 return;
01758 tc = ec->elems[0].tc;
01759 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
01760 tc->readagain_len -= n;
01761 }
01762
01763 struct asciicompat_encoding_t {
01764 const char *ascii_compat_name;
01765 const char *ascii_incompat_name;
01766 };
01767
01768 static int
01769 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
01770 {
01771 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
01772 transcoder_entry_t *entry = (transcoder_entry_t *)val;
01773 const rb_transcoder *tr;
01774
01775 if (DECORATOR_P(entry->sname, entry->dname))
01776 return ST_CONTINUE;
01777 tr = load_transcoder_entry(entry);
01778 if (tr && tr->asciicompat_type == asciicompat_decoder) {
01779 data->ascii_compat_name = tr->dst_encoding;
01780 return ST_STOP;
01781 }
01782 return ST_CONTINUE;
01783 }
01784
01785 const char *
01786 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
01787 {
01788 st_data_t v;
01789 st_table *table2;
01790 struct asciicompat_encoding_t data;
01791
01792 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
01793 return NULL;
01794 table2 = (st_table *)v;
01795
01796
01797
01798
01799
01800
01801
01802
01803 if (table2->num_entries != 1)
01804 return NULL;
01805
01806 data.ascii_incompat_name = ascii_incompat_name;
01807 data.ascii_compat_name = NULL;
01808 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
01809 return data.ascii_compat_name;
01810 }
01811
01812 VALUE
01813 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
01814 {
01815 unsigned const char *ss, *sp, *se;
01816 unsigned char *ds, *dp, *de;
01817 rb_econv_result_t res;
01818 int max_output;
01819
01820 if (NIL_P(dst)) {
01821 dst = rb_str_buf_new(len);
01822 if (ec->destination_encoding)
01823 rb_enc_associate(dst, ec->destination_encoding);
01824 }
01825
01826 if (ec->last_tc)
01827 max_output = ec->last_tc->transcoder->max_output;
01828 else
01829 max_output = 1;
01830
01831 res = econv_destination_buffer_full;
01832 while (res == econv_destination_buffer_full) {
01833 long dlen = RSTRING_LEN(dst);
01834 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
01835 unsigned long new_capa = (unsigned long)dlen + len + max_output;
01836 if (LONG_MAX < new_capa)
01837 rb_raise(rb_eArgError, "too long string");
01838 rb_str_resize(dst, new_capa);
01839 rb_str_set_len(dst, dlen);
01840 }
01841 ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
01842 se = ss + len;
01843 ds = (unsigned char *)RSTRING_PTR(dst);
01844 de = ds + rb_str_capacity(dst);
01845 dp = ds += dlen;
01846 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
01847 off += sp - ss;
01848 len -= sp - ss;
01849 rb_str_set_len(dst, dlen + (dp - ds));
01850 rb_econv_check_error(ec);
01851 }
01852
01853 return dst;
01854 }
01855
01856 VALUE
01857 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
01858 {
01859 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
01860 }
01861
01862 VALUE
01863 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
01864 {
01865 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
01866 }
01867
01868 VALUE
01869 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
01870 {
01871 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
01872 }
01873
01874 static int
01875 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
01876 {
01877 transcoder_entry_t *entry;
01878 const rb_transcoder *tr;
01879
01880 if (ec->started != 0)
01881 return -1;
01882
01883 entry = get_transcoder_entry(sname, dname);
01884 if (!entry)
01885 return -1;
01886
01887 tr = load_transcoder_entry(entry);
01888 if (!tr) return -1;
01889
01890 return rb_econv_add_transcoder_at(ec, tr, n);
01891 }
01892
01893 static int
01894 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
01895 {
01896 return rb_econv_add_converter(ec, "", decorator_name, n);
01897 }
01898
01899 int
01900 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
01901 {
01902 const rb_transcoder *tr;
01903
01904 if (ec->num_trans == 0)
01905 return rb_econv_decorate_at(ec, decorator_name, 0);
01906
01907 tr = ec->elems[0].tc->transcoder;
01908
01909 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01910 tr->asciicompat_type == asciicompat_decoder)
01911 return rb_econv_decorate_at(ec, decorator_name, 1);
01912
01913 return rb_econv_decorate_at(ec, decorator_name, 0);
01914 }
01915
01916 int
01917 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
01918 {
01919 const rb_transcoder *tr;
01920
01921 if (ec->num_trans == 0)
01922 return rb_econv_decorate_at(ec, decorator_name, 0);
01923
01924 tr = ec->elems[ec->num_trans-1].tc->transcoder;
01925
01926 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01927 tr->asciicompat_type == asciicompat_encoder)
01928 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
01929
01930 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
01931 }
01932
01933 void
01934 rb_econv_binmode(rb_econv_t *ec)
01935 {
01936 const char *dname = 0;
01937
01938 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
01939 case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
01940 dname = "universal_newline";
01941 break;
01942 case ECONV_CRLF_NEWLINE_DECORATOR:
01943 dname = "crlf_newline";
01944 break;
01945 case ECONV_CR_NEWLINE_DECORATOR:
01946 dname = "cr_newline";
01947 break;
01948 }
01949
01950 if (dname) {
01951 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
01952 int num_trans = ec->num_trans;
01953 int i, j = 0;
01954
01955 for (i=0; i < num_trans; i++) {
01956 if (transcoder == ec->elems[i].tc->transcoder) {
01957 rb_transcoding_close(ec->elems[i].tc);
01958 xfree(ec->elems[i].out_buf_start);
01959 ec->num_trans--;
01960 }
01961 else
01962 ec->elems[j++] = ec->elems[i];
01963 }
01964 }
01965
01966 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
01967 }
01968
01969 static VALUE
01970 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
01971 {
01972 int has_description = 0;
01973
01974 if (NIL_P(mesg))
01975 mesg = rb_str_new(NULL, 0);
01976
01977 if (*sname != '\0' || *dname != '\0') {
01978 if (*sname == '\0')
01979 rb_str_cat2(mesg, dname);
01980 else if (*dname == '\0')
01981 rb_str_cat2(mesg, sname);
01982 else
01983 rb_str_catf(mesg, "%s to %s", sname, dname);
01984 has_description = 1;
01985 }
01986
01987 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
01988 ECONV_XML_TEXT_DECORATOR|
01989 ECONV_XML_ATTR_CONTENT_DECORATOR|
01990 ECONV_XML_ATTR_QUOTE_DECORATOR)) {
01991 const char *pre = "";
01992 if (has_description)
01993 rb_str_cat2(mesg, " with ");
01994 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
01995 rb_str_cat2(mesg, pre); pre = ",";
01996 rb_str_cat2(mesg, "universal_newline");
01997 }
01998 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
01999 rb_str_cat2(mesg, pre); pre = ",";
02000 rb_str_cat2(mesg, "crlf_newline");
02001 }
02002 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
02003 rb_str_cat2(mesg, pre); pre = ",";
02004 rb_str_cat2(mesg, "cr_newline");
02005 }
02006 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
02007 rb_str_cat2(mesg, pre); pre = ",";
02008 rb_str_cat2(mesg, "xml_text");
02009 }
02010 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
02011 rb_str_cat2(mesg, pre); pre = ",";
02012 rb_str_cat2(mesg, "xml_attr_content");
02013 }
02014 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
02015 rb_str_cat2(mesg, pre); pre = ",";
02016 rb_str_cat2(mesg, "xml_attr_quote");
02017 }
02018 has_description = 1;
02019 }
02020 if (!has_description) {
02021 rb_str_cat2(mesg, "no-conversion");
02022 }
02023
02024 return mesg;
02025 }
02026
02027 VALUE
02028 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
02029 {
02030 VALUE mesg, exc;
02031 mesg = rb_str_new_cstr("code converter not found (");
02032 econv_description(sname, dname, ecflags, mesg);
02033 rb_str_cat2(mesg, ")");
02034 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
02035 return exc;
02036 }
02037
02038 static VALUE
02039 make_econv_exception(rb_econv_t *ec)
02040 {
02041 VALUE mesg, exc;
02042 if (ec->last_error.result == econv_invalid_byte_sequence ||
02043 ec->last_error.result == econv_incomplete_input) {
02044 const char *err = (const char *)ec->last_error.error_bytes_start;
02045 size_t error_len = ec->last_error.error_bytes_len;
02046 VALUE bytes = rb_str_new(err, error_len);
02047 VALUE dumped = rb_str_dump(bytes);
02048 size_t readagain_len = ec->last_error.readagain_len;
02049 VALUE bytes2 = Qnil;
02050 VALUE dumped2;
02051 int idx;
02052 if (ec->last_error.result == econv_incomplete_input) {
02053 mesg = rb_sprintf("incomplete %s on %s",
02054 StringValueCStr(dumped),
02055 ec->last_error.source_encoding);
02056 }
02057 else if (readagain_len) {
02058 bytes2 = rb_str_new(err+error_len, readagain_len);
02059 dumped2 = rb_str_dump(bytes2);
02060 mesg = rb_sprintf("%s followed by %s on %s",
02061 StringValueCStr(dumped),
02062 StringValueCStr(dumped2),
02063 ec->last_error.source_encoding);
02064 }
02065 else {
02066 mesg = rb_sprintf("%s on %s",
02067 StringValueCStr(dumped),
02068 ec->last_error.source_encoding);
02069 }
02070
02071 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
02072 rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
02073 rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
02074 rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
02075
02076 set_encs:
02077 rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
02078 rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
02079 idx = rb_enc_find_index(ec->last_error.source_encoding);
02080 if (0 <= idx)
02081 rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02082 idx = rb_enc_find_index(ec->last_error.destination_encoding);
02083 if (0 <= idx)
02084 rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02085 return exc;
02086 }
02087 if (ec->last_error.result == econv_undefined_conversion) {
02088 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
02089 ec->last_error.error_bytes_len);
02090 VALUE dumped = Qnil;
02091 int idx;
02092 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
02093 rb_encoding *utf8 = rb_utf8_encoding();
02094 const char *start, *end;
02095 int n;
02096 start = (const char *)ec->last_error.error_bytes_start;
02097 end = start + ec->last_error.error_bytes_len;
02098 n = rb_enc_precise_mbclen(start, end, utf8);
02099 if (MBCLEN_CHARFOUND_P(n) &&
02100 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
02101 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
02102 dumped = rb_sprintf("U+%04X", cc);
02103 }
02104 }
02105 if (dumped == Qnil)
02106 dumped = rb_str_dump(bytes);
02107 if (strcmp(ec->last_error.source_encoding,
02108 ec->source_encoding_name) == 0 &&
02109 strcmp(ec->last_error.destination_encoding,
02110 ec->destination_encoding_name) == 0) {
02111 mesg = rb_sprintf("%s from %s to %s",
02112 StringValueCStr(dumped),
02113 ec->last_error.source_encoding,
02114 ec->last_error.destination_encoding);
02115 }
02116 else {
02117 int i;
02118 mesg = rb_sprintf("%s to %s in conversion from %s",
02119 StringValueCStr(dumped),
02120 ec->last_error.destination_encoding,
02121 ec->source_encoding_name);
02122 for (i = 0; i < ec->num_trans; i++) {
02123 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
02124 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
02125 rb_str_catf(mesg, " to %s",
02126 ec->elems[i].tc->transcoder->dst_encoding);
02127 }
02128 }
02129 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
02130 idx = rb_enc_find_index(ec->last_error.source_encoding);
02131 if (0 <= idx)
02132 rb_enc_associate_index(bytes, idx);
02133 rb_ivar_set(exc, rb_intern("error_char"), bytes);
02134 goto set_encs;
02135 }
02136 return Qnil;
02137 }
02138
02139 static void
02140 more_output_buffer(
02141 VALUE destination,
02142 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02143 int max_output,
02144 unsigned char **out_start_ptr,
02145 unsigned char **out_pos,
02146 unsigned char **out_stop_ptr)
02147 {
02148 size_t len = (*out_pos - *out_start_ptr);
02149 size_t new_len = (len + max_output) * 2;
02150 *out_start_ptr = resize_destination(destination, len, new_len);
02151 *out_pos = *out_start_ptr + len;
02152 *out_stop_ptr = *out_start_ptr + new_len;
02153 }
02154
02155 static int
02156 make_replacement(rb_econv_t *ec)
02157 {
02158 rb_transcoding *tc;
02159 const rb_transcoder *tr;
02160 const unsigned char *replacement;
02161 const char *repl_enc;
02162 const char *ins_enc;
02163 size_t len;
02164
02165 if (ec->replacement_str)
02166 return 0;
02167
02168 ins_enc = rb_econv_encoding_to_insert_output(ec);
02169
02170 tc = ec->last_tc;
02171 if (*ins_enc) {
02172 tr = tc->transcoder;
02173 rb_enc_find(tr->dst_encoding);
02174 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
02175 }
02176 else {
02177 replacement = (unsigned char *)"?";
02178 len = 1;
02179 repl_enc = "";
02180 }
02181
02182 ec->replacement_str = replacement;
02183 ec->replacement_len = len;
02184 ec->replacement_enc = repl_enc;
02185 ec->replacement_allocated = 0;
02186 return 0;
02187 }
02188
02189 int
02190 rb_econv_set_replacement(rb_econv_t *ec,
02191 const unsigned char *str, size_t len, const char *encname)
02192 {
02193 unsigned char *str2;
02194 size_t len2;
02195 const char *encname2;
02196
02197 encname2 = rb_econv_encoding_to_insert_output(ec);
02198
02199 if (encoding_equal(encname, encname2)) {
02200 str2 = xmalloc(len);
02201 MEMCPY(str2, str, unsigned char, len);
02202 len2 = len;
02203 encname2 = encname;
02204 }
02205 else {
02206 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
02207 if (!str2)
02208 return -1;
02209 }
02210
02211 if (ec->replacement_allocated) {
02212 xfree((void *)ec->replacement_str);
02213 }
02214 ec->replacement_allocated = 1;
02215 ec->replacement_str = str2;
02216 ec->replacement_len = len2;
02217 ec->replacement_enc = encname2;
02218 return 0;
02219 }
02220
02221 static int
02222 output_replacement_character(rb_econv_t *ec)
02223 {
02224 int ret;
02225
02226 if (make_replacement(ec) == -1)
02227 return -1;
02228
02229 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
02230 if (ret == -1)
02231 return -1;
02232
02233 return 0;
02234 }
02235
02236 #if 1
02237 #define hash_fallback rb_hash_aref
02238
02239 static VALUE
02240 proc_fallback(VALUE fallback, VALUE c)
02241 {
02242 return rb_proc_call(fallback, rb_ary_new4(1, &c));
02243 }
02244
02245 static VALUE
02246 method_fallback(VALUE fallback, VALUE c)
02247 {
02248 return rb_method_call(1, &c, fallback);
02249 }
02250
02251 static VALUE
02252 aref_fallback(VALUE fallback, VALUE c)
02253 {
02254 return rb_funcall3(fallback, sym_aref, 1, &c);
02255 }
02256
02257 static void
02258 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02259 const unsigned char *in_stop, unsigned char *out_stop,
02260 VALUE destination,
02261 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02262 const char *src_encoding,
02263 const char *dst_encoding,
02264 int ecflags,
02265 VALUE ecopts)
02266 {
02267 rb_econv_t *ec;
02268 rb_transcoding *last_tc;
02269 rb_econv_result_t ret;
02270 unsigned char *out_start = *out_pos;
02271 int max_output;
02272 VALUE exc;
02273 VALUE fallback = Qnil;
02274 VALUE (*fallback_func)(VALUE, VALUE) = 0;
02275
02276 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02277 if (!ec)
02278 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02279
02280 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
02281 fallback = rb_hash_aref(ecopts, sym_fallback);
02282 if (RB_TYPE_P(fallback, T_HASH)) {
02283 fallback_func = hash_fallback;
02284 }
02285 else if (rb_obj_is_proc(fallback)) {
02286 fallback_func = proc_fallback;
02287 }
02288 else if (rb_obj_is_method(fallback)) {
02289 fallback_func = method_fallback;
02290 }
02291 else {
02292 fallback_func = aref_fallback;
02293 }
02294 }
02295 last_tc = ec->last_tc;
02296 max_output = last_tc ? last_tc->transcoder->max_output : 1;
02297
02298 resume:
02299 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
02300
02301 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
02302 VALUE rep = rb_enc_str_new(
02303 (const char *)ec->last_error.error_bytes_start,
02304 ec->last_error.error_bytes_len,
02305 rb_enc_find(ec->last_error.source_encoding));
02306 rep = (*fallback_func)(fallback, rep);
02307 if (rep != Qundef && !NIL_P(rep)) {
02308 StringValue(rep);
02309 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
02310 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
02311 if ((int)ret == -1) {
02312 rb_raise(rb_eArgError, "too big fallback string");
02313 }
02314 goto resume;
02315 }
02316 }
02317
02318 if (ret == econv_invalid_byte_sequence ||
02319 ret == econv_incomplete_input ||
02320 ret == econv_undefined_conversion) {
02321 exc = make_econv_exception(ec);
02322 rb_econv_close(ec);
02323 rb_exc_raise(exc);
02324 }
02325
02326 if (ret == econv_destination_buffer_full) {
02327 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02328 goto resume;
02329 }
02330
02331 rb_econv_close(ec);
02332 return;
02333 }
02334 #else
02335
02336 static void
02337 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02338 const unsigned char *in_stop, unsigned char *out_stop,
02339 VALUE destination,
02340 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02341 const char *src_encoding,
02342 const char *dst_encoding,
02343 int ecflags,
02344 VALUE ecopts)
02345 {
02346 rb_econv_t *ec;
02347 rb_transcoding *last_tc;
02348 rb_econv_result_t ret;
02349 unsigned char *out_start = *out_pos;
02350 const unsigned char *ptr;
02351 int max_output;
02352 VALUE exc;
02353
02354 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02355 if (!ec)
02356 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02357
02358 last_tc = ec->last_tc;
02359 max_output = last_tc ? last_tc->transcoder->max_output : 1;
02360
02361 ret = econv_source_buffer_empty;
02362 ptr = *in_pos;
02363 while (ret != econv_finished) {
02364 unsigned char input_byte;
02365 const unsigned char *p = &input_byte;
02366
02367 if (ret == econv_source_buffer_empty) {
02368 if (ptr < in_stop) {
02369 input_byte = *ptr;
02370 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02371 }
02372 else {
02373 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
02374 }
02375 }
02376 else {
02377 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02378 }
02379 if (&input_byte != p)
02380 ptr += p - &input_byte;
02381 switch (ret) {
02382 case econv_invalid_byte_sequence:
02383 case econv_incomplete_input:
02384 case econv_undefined_conversion:
02385 exc = make_econv_exception(ec);
02386 rb_econv_close(ec);
02387 rb_exc_raise(exc);
02388 break;
02389
02390 case econv_destination_buffer_full:
02391 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02392 break;
02393
02394 case econv_source_buffer_empty:
02395 break;
02396
02397 case econv_finished:
02398 break;
02399 }
02400 }
02401 rb_econv_close(ec);
02402 *in_pos = in_stop;
02403 return;
02404 }
02405 #endif
02406
02407
02408
02409
02410
02411
02412 static unsigned char *
02413 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
02414 {
02415 rb_str_resize(destination, new_len);
02416 return (unsigned char *)RSTRING_PTR(destination);
02417 }
02418
02419 static int
02420 econv_opts(VALUE opt, int ecflags)
02421 {
02422 VALUE v;
02423
02424 v = rb_hash_aref(opt, sym_invalid);
02425 if (NIL_P(v)) {
02426 }
02427 else if (v==sym_replace) {
02428 ecflags |= ECONV_INVALID_REPLACE;
02429 }
02430 else {
02431 rb_raise(rb_eArgError, "unknown value for invalid character option");
02432 }
02433
02434 v = rb_hash_aref(opt, sym_undef);
02435 if (NIL_P(v)) {
02436 }
02437 else if (v==sym_replace) {
02438 ecflags |= ECONV_UNDEF_REPLACE;
02439 }
02440 else {
02441 rb_raise(rb_eArgError, "unknown value for undefined character option");
02442 }
02443
02444 v = rb_hash_aref(opt, sym_replace);
02445 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
02446 ecflags |= ECONV_UNDEF_REPLACE;
02447 }
02448
02449 v = rb_hash_aref(opt, sym_xml);
02450 if (!NIL_P(v)) {
02451 if (v==sym_text) {
02452 ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02453 }
02454 else if (v==sym_attr) {
02455 ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02456 }
02457 else if (RB_TYPE_P(v, T_SYMBOL)) {
02458 rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
02459 }
02460 else {
02461 rb_raise(rb_eArgError, "unexpected value for xml option");
02462 }
02463 }
02464
02465 #ifdef ENABLE_ECONV_NEWLINE_OPTION
02466 v = rb_hash_aref(opt, sym_newline);
02467 if (!NIL_P(v)) {
02468 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02469 if (v == sym_universal) {
02470 ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02471 }
02472 else if (v == sym_crlf) {
02473 ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02474 }
02475 else if (v == sym_cr) {
02476 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
02477 }
02478 else if (v == sym_lf) {
02479
02480 }
02481 else if (SYMBOL_P(v)) {
02482 rb_raise(rb_eArgError, "unexpected value for newline option: %s",
02483 rb_id2name(SYM2ID(v)));
02484 }
02485 else {
02486 rb_raise(rb_eArgError, "unexpected value for newline option");
02487 }
02488 }
02489 else
02490 #endif
02491 {
02492 int setflags = 0, newlineflag = 0;
02493
02494 v = rb_hash_aref(opt, sym_universal_newline);
02495 if (RTEST(v))
02496 setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02497 newlineflag |= !NIL_P(v);
02498
02499 v = rb_hash_aref(opt, sym_crlf_newline);
02500 if (RTEST(v))
02501 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02502 newlineflag |= !NIL_P(v);
02503
02504 v = rb_hash_aref(opt, sym_cr_newline);
02505 if (RTEST(v))
02506 setflags |= ECONV_CR_NEWLINE_DECORATOR;
02507 newlineflag |= !NIL_P(v);
02508
02509 if (newlineflag) {
02510 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02511 ecflags |= setflags;
02512 }
02513 }
02514
02515 return ecflags;
02516 }
02517
02518 int
02519 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
02520 {
02521 VALUE newhash = Qnil;
02522 VALUE v;
02523
02524 if (NIL_P(opthash)) {
02525 *opts = Qnil;
02526 return ecflags;
02527 }
02528 ecflags = econv_opts(opthash, ecflags);
02529
02530 v = rb_hash_aref(opthash, sym_replace);
02531 if (!NIL_P(v)) {
02532 StringValue(v);
02533 if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
02534 VALUE dumped = rb_str_dump(v);
02535 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
02536 StringValueCStr(dumped),
02537 rb_enc_name(rb_enc_get(v)));
02538 }
02539 v = rb_str_new_frozen(v);
02540 newhash = rb_hash_new();
02541 rb_hash_aset(newhash, sym_replace, v);
02542 }
02543
02544 v = rb_hash_aref(opthash, sym_fallback);
02545 if (!NIL_P(v)) {
02546 VALUE h = rb_check_hash_type(v);
02547 if (NIL_P(h)
02548 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, sym_aref))
02549 : (v = h, 1)) {
02550 if (NIL_P(newhash))
02551 newhash = rb_hash_new();
02552 rb_hash_aset(newhash, sym_fallback, v);
02553 }
02554 }
02555
02556 if (!NIL_P(newhash))
02557 rb_hash_freeze(newhash);
02558 *opts = newhash;
02559
02560 return ecflags;
02561 }
02562
02563 int
02564 rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
02565 {
02566 return rb_econv_prepare_options(opthash, opts, 0);
02567 }
02568
02569 rb_econv_t *
02570 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
02571 {
02572 rb_econv_t *ec;
02573 VALUE replacement;
02574
02575 if (NIL_P(opthash)) {
02576 replacement = Qnil;
02577 }
02578 else {
02579 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
02580 rb_bug("rb_econv_open_opts called with invalid opthash");
02581 replacement = rb_hash_aref(opthash, sym_replace);
02582 }
02583
02584 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
02585 if (!ec)
02586 return ec;
02587
02588 if (!NIL_P(replacement)) {
02589 int ret;
02590 rb_encoding *enc = rb_enc_get(replacement);
02591
02592 ret = rb_econv_set_replacement(ec,
02593 (const unsigned char *)RSTRING_PTR(replacement),
02594 RSTRING_LEN(replacement),
02595 rb_enc_name(enc));
02596 if (ret == -1) {
02597 rb_econv_close(ec);
02598 return NULL;
02599 }
02600 }
02601 return ec;
02602 }
02603
02604 static int
02605 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
02606 {
02607 rb_encoding *enc;
02608 const char *n;
02609 int encidx;
02610 VALUE encval;
02611
02612 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
02613 !(enc = rb_enc_from_index(encidx))) {
02614 enc = NULL;
02615 encidx = 0;
02616 n = StringValueCStr(*arg);
02617 }
02618 else {
02619 n = rb_enc_name(enc);
02620 }
02621
02622 *name_p = n;
02623 *enc_p = enc;
02624
02625 return encidx;
02626 }
02627
02628 static int
02629 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
02630 const char **sname_p, rb_encoding **senc_p,
02631 const char **dname_p, rb_encoding **denc_p)
02632 {
02633 rb_encoding *senc, *denc;
02634 const char *sname, *dname;
02635 int sencidx, dencidx;
02636
02637 dencidx = enc_arg(arg1, &dname, &denc);
02638
02639 if (NIL_P(*arg2)) {
02640 sencidx = rb_enc_get_index(str);
02641 senc = rb_enc_from_index(sencidx);
02642 sname = rb_enc_name(senc);
02643 }
02644 else {
02645 sencidx = enc_arg(arg2, &sname, &senc);
02646 }
02647
02648 *sname_p = sname;
02649 *senc_p = senc;
02650 *dname_p = dname;
02651 *denc_p = denc;
02652 return dencidx;
02653 }
02654
02655 static int
02656 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
02657 {
02658 VALUE dest;
02659 VALUE str = *self;
02660 volatile VALUE arg1, arg2;
02661 long blen, slen;
02662 unsigned char *buf, *bp, *sp;
02663 const unsigned char *fromp;
02664 rb_encoding *senc, *denc;
02665 const char *sname, *dname;
02666 int dencidx;
02667
02668 rb_check_arity(argc, 0, 2);
02669
02670 if (argc == 0) {
02671 arg1 = rb_enc_default_internal();
02672 if (NIL_P(arg1)) {
02673 if (!ecflags) return -1;
02674 arg1 = rb_obj_encoding(str);
02675 }
02676 ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
02677 }
02678 else {
02679 arg1 = argv[0];
02680 }
02681 arg2 = argc<=1 ? Qnil : argv[1];
02682 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
02683
02684 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
02685 ECONV_XML_TEXT_DECORATOR|
02686 ECONV_XML_ATTR_CONTENT_DECORATOR|
02687 ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
02688 if (senc && senc == denc) {
02689 return NIL_P(arg2) ? -1 : dencidx;
02690 }
02691 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
02692 if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02693 return dencidx;
02694 }
02695 }
02696 if (encoding_equal(sname, dname)) {
02697 return NIL_P(arg2) ? -1 : dencidx;
02698 }
02699 }
02700 else {
02701 if (encoding_equal(sname, dname)) {
02702 sname = "";
02703 dname = "";
02704 }
02705 }
02706
02707 fromp = sp = (unsigned char *)RSTRING_PTR(str);
02708 slen = RSTRING_LEN(str);
02709 blen = slen + 30;
02710 dest = rb_str_tmp_new(blen);
02711 bp = (unsigned char *)RSTRING_PTR(dest);
02712
02713 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
02714 if (fromp != sp+slen) {
02715 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
02716 }
02717 buf = (unsigned char *)RSTRING_PTR(dest);
02718 *bp = '\0';
02719 rb_str_set_len(dest, bp - buf);
02720
02721
02722 if (!denc) {
02723 dencidx = rb_define_dummy_encoding(dname);
02724 }
02725 *self = dest;
02726
02727 return dencidx;
02728 }
02729
02730 static int
02731 str_transcode(int argc, VALUE *argv, VALUE *self)
02732 {
02733 VALUE opt;
02734 int ecflags = 0;
02735 VALUE ecopts = Qnil;
02736
02737 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
02738 if (!NIL_P(opt)) {
02739 ecflags = rb_econv_prepare_opts(opt, &ecopts);
02740 }
02741 return str_transcode0(argc, argv, self, ecflags, ecopts);
02742 }
02743
02744 static inline VALUE
02745 str_encode_associate(VALUE str, int encidx)
02746 {
02747 int cr = 0;
02748
02749 rb_enc_associate_index(str, encidx);
02750
02751
02752 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
02753 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
02754 }
02755 else {
02756 cr = ENC_CODERANGE_VALID;
02757 }
02758 ENC_CODERANGE_SET(str, cr);
02759 return str;
02760 }
02761
02762
02763
02764
02765
02766
02767
02768
02769
02770
02771
02772
02773
02774
02775
02776 static VALUE
02777 str_encode_bang(int argc, VALUE *argv, VALUE str)
02778 {
02779 VALUE newstr;
02780 int encidx;
02781
02782 rb_check_frozen(str);
02783
02784 newstr = str;
02785 encidx = str_transcode(argc, argv, &newstr);
02786
02787 if (encidx < 0) return str;
02788 if (newstr == str) {
02789 rb_enc_associate_index(str, encidx);
02790 return str;
02791 }
02792 rb_str_shared_replace(str, newstr);
02793 return str_encode_associate(str, encidx);
02794 }
02795
02796 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
02797
02798
02799
02800
02801
02802
02803
02804
02805
02806
02807
02808
02809
02810
02811
02812
02813
02814
02815
02816
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826
02827
02828
02829
02830
02831
02832
02833
02834
02835
02836
02837
02838
02839
02840
02841
02842
02843
02844
02845
02846
02847
02848
02849
02850
02851
02852
02853
02854
02855
02856
02857
02858 static VALUE
02859 str_encode(int argc, VALUE *argv, VALUE str)
02860 {
02861 VALUE newstr = str;
02862 int encidx = str_transcode(argc, argv, &newstr);
02863 return encoded_dup(newstr, str, encidx);
02864 }
02865
02866 VALUE
02867 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
02868 {
02869 int argc = 1;
02870 VALUE *argv = &to;
02871 VALUE newstr = str;
02872 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
02873 return encoded_dup(newstr, str, encidx);
02874 }
02875
02876 static VALUE
02877 encoded_dup(VALUE newstr, VALUE str, int encidx)
02878 {
02879 if (encidx < 0) return rb_str_dup(str);
02880 if (newstr == str) {
02881 newstr = rb_str_dup(str);
02882 rb_enc_associate_index(newstr, encidx);
02883 return newstr;
02884 }
02885 else {
02886 RBASIC(newstr)->klass = rb_obj_class(str);
02887 }
02888 return str_encode_associate(newstr, encidx);
02889 }
02890
02891 static void
02892 econv_free(void *ptr)
02893 {
02894 rb_econv_t *ec = ptr;
02895 rb_econv_close(ec);
02896 }
02897
02898 static size_t
02899 econv_memsize(const void *ptr)
02900 {
02901 return ptr ? sizeof(rb_econv_t) : 0;
02902 }
02903
02904 static const rb_data_type_t econv_data_type = {
02905 "econv",
02906 {NULL, econv_free, econv_memsize,},
02907 };
02908
02909 static VALUE
02910 econv_s_allocate(VALUE klass)
02911 {
02912 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
02913 }
02914
02915 static rb_encoding *
02916 make_dummy_encoding(const char *name)
02917 {
02918 rb_encoding *enc;
02919 int idx;
02920 idx = rb_define_dummy_encoding(name);
02921 enc = rb_enc_from_index(idx);
02922 return enc;
02923 }
02924
02925 static rb_encoding *
02926 make_encoding(const char *name)
02927 {
02928 rb_encoding *enc;
02929 enc = rb_enc_find(name);
02930 if (!enc)
02931 enc = make_dummy_encoding(name);
02932 return enc;
02933 }
02934
02935 static VALUE
02936 make_encobj(const char *name)
02937 {
02938 return rb_enc_from_encoding(make_encoding(name));
02939 }
02940
02941
02942
02943
02944
02945
02946
02947
02948
02949
02950
02951
02952
02953
02954
02955
02956
02957
02958
02959 static VALUE
02960 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
02961 {
02962 const char *arg_name, *result_name;
02963 rb_encoding *arg_enc, *result_enc;
02964
02965 enc_arg(&arg, &arg_name, &arg_enc);
02966
02967 result_name = rb_econv_asciicompat_encoding(arg_name);
02968
02969 if (result_name == NULL)
02970 return Qnil;
02971
02972 result_enc = make_encoding(result_name);
02973
02974 return rb_enc_from_encoding(result_enc);
02975 }
02976
02977 static void
02978 econv_args(int argc, VALUE *argv,
02979 volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
02980 const char **sname_p, const char **dname_p,
02981 rb_encoding **senc_p, rb_encoding **denc_p,
02982 int *ecflags_p,
02983 VALUE *ecopts_p)
02984 {
02985 VALUE opt, flags_v, ecopts;
02986 int sidx, didx;
02987 const char *sname, *dname;
02988 rb_encoding *senc, *denc;
02989 int ecflags;
02990
02991 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
02992
02993 if (!NIL_P(flags_v)) {
02994 if (!NIL_P(opt)) {
02995 rb_error_arity(argc + 1, 2, 3);
02996 }
02997 ecflags = NUM2INT(rb_to_int(flags_v));
02998 ecopts = Qnil;
02999 }
03000 else if (!NIL_P(opt)) {
03001 ecflags = rb_econv_prepare_opts(opt, &ecopts);
03002 }
03003 else {
03004 ecflags = 0;
03005 ecopts = Qnil;
03006 }
03007
03008 senc = NULL;
03009 sidx = rb_to_encoding_index(*snamev_p);
03010 if (0 <= sidx) {
03011 senc = rb_enc_from_index(sidx);
03012 }
03013 else {
03014 StringValue(*snamev_p);
03015 }
03016
03017 denc = NULL;
03018 didx = rb_to_encoding_index(*dnamev_p);
03019 if (0 <= didx) {
03020 denc = rb_enc_from_index(didx);
03021 }
03022 else {
03023 StringValue(*dnamev_p);
03024 }
03025
03026 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
03027 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
03028
03029 *sname_p = sname;
03030 *dname_p = dname;
03031 *senc_p = senc;
03032 *denc_p = denc;
03033 *ecflags_p = ecflags;
03034 *ecopts_p = ecopts;
03035 }
03036
03037 static int
03038 decorate_convpath(VALUE convpath, int ecflags)
03039 {
03040 int num_decorators;
03041 const char *decorators[MAX_ECFLAGS_DECORATORS];
03042 int i;
03043 int n, len;
03044
03045 num_decorators = decorator_names(ecflags, decorators);
03046 if (num_decorators == -1)
03047 return -1;
03048
03049 len = n = RARRAY_LENINT(convpath);
03050 if (n != 0) {
03051 VALUE pair = RARRAY_PTR(convpath)[n-1];
03052 if (RB_TYPE_P(pair, T_ARRAY)) {
03053 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
03054 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
03055 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
03056 const rb_transcoder *tr = load_transcoder_entry(entry);
03057 if (!tr)
03058 return -1;
03059 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
03060 tr->asciicompat_type == asciicompat_encoder) {
03061 n--;
03062 rb_ary_store(convpath, len + num_decorators - 1, pair);
03063 }
03064 }
03065 else {
03066 rb_ary_store(convpath, len + num_decorators - 1, pair);
03067 }
03068 }
03069
03070 for (i = 0; i < num_decorators; i++)
03071 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
03072
03073 return 0;
03074 }
03075
03076 static void
03077 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03078 {
03079 VALUE *ary_p = arg;
03080 VALUE v;
03081
03082 if (*ary_p == Qnil) {
03083 *ary_p = rb_ary_new();
03084 }
03085
03086 if (DECORATOR_P(sname, dname)) {
03087 v = rb_str_new_cstr(dname);
03088 }
03089 else {
03090 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
03091 }
03092 rb_ary_store(*ary_p, depth, v);
03093 }
03094
03095
03096
03097
03098
03099
03100
03101
03102
03103
03104
03105
03106
03107
03108
03109
03110
03111
03112
03113
03114
03115
03116
03117
03118
03119
03120 static VALUE
03121 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
03122 {
03123 volatile VALUE snamev, dnamev;
03124 const char *sname, *dname;
03125 rb_encoding *senc, *denc;
03126 int ecflags;
03127 VALUE ecopts;
03128 VALUE convpath;
03129
03130 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03131
03132 convpath = Qnil;
03133 transcode_search_path(sname, dname, search_convpath_i, &convpath);
03134
03135 if (NIL_P(convpath))
03136 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03137
03138 if (decorate_convpath(convpath, ecflags) == -1)
03139 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03140
03141 return convpath;
03142 }
03143
03144
03145
03146
03147
03148
03149 int
03150 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
03151 {
03152 VALUE convpath = Qnil;
03153 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
03154 &convpath);
03155 return RTEST(convpath);
03156 }
03157
03158 struct rb_econv_init_by_convpath_t {
03159 rb_econv_t *ec;
03160 int index;
03161 int ret;
03162 };
03163
03164 static void
03165 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03166 {
03167 struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
03168 int ret;
03169
03170 if (a->ret == -1)
03171 return;
03172
03173 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
03174
03175 a->ret = ret;
03176 return;
03177 }
03178
03179 static rb_econv_t *
03180 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
03181 const char **sname_p, const char **dname_p,
03182 rb_encoding **senc_p, rb_encoding**denc_p)
03183 {
03184 rb_econv_t *ec;
03185 long i;
03186 int ret, first=1;
03187 VALUE elt;
03188 rb_encoding *senc = 0, *denc = 0;
03189 const char *sname, *dname;
03190
03191 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
03192 DATA_PTR(self) = ec;
03193
03194 for (i = 0; i < RARRAY_LEN(convpath); i++) {
03195 volatile VALUE snamev, dnamev;
03196 VALUE pair;
03197 elt = rb_ary_entry(convpath, i);
03198 if (!NIL_P(pair = rb_check_array_type(elt))) {
03199 if (RARRAY_LEN(pair) != 2)
03200 rb_raise(rb_eArgError, "not a 2-element array in convpath");
03201 snamev = rb_ary_entry(pair, 0);
03202 enc_arg(&snamev, &sname, &senc);
03203 dnamev = rb_ary_entry(pair, 1);
03204 enc_arg(&dnamev, &dname, &denc);
03205 }
03206 else {
03207 sname = "";
03208 dname = StringValueCStr(elt);
03209 }
03210 if (DECORATOR_P(sname, dname)) {
03211 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
03212 if (ret == -1)
03213 rb_raise(rb_eArgError, "decoration failed: %s", dname);
03214 }
03215 else {
03216 int j = ec->num_trans;
03217 struct rb_econv_init_by_convpath_t arg;
03218 arg.ec = ec;
03219 arg.index = ec->num_trans;
03220 arg.ret = 0;
03221 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
03222 if (ret == -1 || arg.ret == -1)
03223 rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
03224 if (first) {
03225 first = 0;
03226 *senc_p = senc;
03227 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
03228 }
03229 *denc_p = denc;
03230 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
03231 }
03232 }
03233
03234 if (first) {
03235 *senc_p = NULL;
03236 *denc_p = NULL;
03237 *sname_p = "";
03238 *dname_p = "";
03239 }
03240
03241 ec->source_encoding_name = *sname_p;
03242 ec->destination_encoding_name = *dname_p;
03243
03244 return ec;
03245 }
03246
03247
03248
03249
03250
03251
03252
03253
03254
03255
03256
03257
03258
03259
03260
03261
03262
03263
03264
03265
03266
03267
03268
03269
03270
03271
03272
03273
03274
03275
03276
03277
03278
03279
03280
03281
03282
03283
03284
03285
03286
03287
03288
03289
03290
03291
03292
03293
03294
03295
03296
03297
03298
03299
03300
03301
03302
03303
03304
03305
03306
03307
03308
03309
03310
03311
03312
03313
03314
03315
03316
03317
03318
03319
03320
03321
03322
03323
03324
03325
03326
03327
03328
03329
03330
03331
03332
03333
03334
03335
03336
03337
03338
03339
03340
03341
03342
03343
03344
03345
03346
03347
03348
03349
03350
03351
03352
03353 static VALUE
03354 econv_init(int argc, VALUE *argv, VALUE self)
03355 {
03356 VALUE ecopts;
03357 volatile VALUE snamev, dnamev;
03358 const char *sname, *dname;
03359 rb_encoding *senc, *denc;
03360 rb_econv_t *ec;
03361 int ecflags;
03362 VALUE convpath;
03363
03364 if (rb_check_typeddata(self, &econv_data_type)) {
03365 rb_raise(rb_eTypeError, "already initialized");
03366 }
03367
03368 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
03369 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
03370 ecflags = 0;
03371 ecopts = Qnil;
03372 }
03373 else {
03374 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03375 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
03376 }
03377
03378 if (!ec) {
03379 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03380 }
03381
03382 if (!DECORATOR_P(sname, dname)) {
03383 if (!senc)
03384 senc = make_dummy_encoding(sname);
03385 if (!denc)
03386 denc = make_dummy_encoding(dname);
03387 }
03388
03389 ec->source_encoding = senc;
03390 ec->destination_encoding = denc;
03391
03392 DATA_PTR(self) = ec;
03393
03394 return self;
03395 }
03396
03397
03398
03399
03400
03401
03402
03403
03404
03405
03406
03407 static VALUE
03408 econv_inspect(VALUE self)
03409 {
03410 const char *cname = rb_obj_classname(self);
03411 rb_econv_t *ec;
03412
03413 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03414 if (!ec)
03415 return rb_sprintf("#<%s: uninitialized>", cname);
03416 else {
03417 const char *sname = ec->source_encoding_name;
03418 const char *dname = ec->destination_encoding_name;
03419 VALUE str;
03420 str = rb_sprintf("#<%s: ", cname);
03421 econv_description(sname, dname, ec->flags, str);
03422 rb_str_cat2(str, ">");
03423 return str;
03424 }
03425 }
03426
03427 static rb_econv_t *
03428 check_econv(VALUE self)
03429 {
03430 rb_econv_t *ec;
03431
03432 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03433 if (!ec) {
03434 rb_raise(rb_eTypeError, "uninitialized encoding converter");
03435 }
03436 return ec;
03437 }
03438
03439
03440
03441
03442
03443
03444
03445 static VALUE
03446 econv_source_encoding(VALUE self)
03447 {
03448 rb_econv_t *ec = check_econv(self);
03449 if (!ec->source_encoding)
03450 return Qnil;
03451 return rb_enc_from_encoding(ec->source_encoding);
03452 }
03453
03454
03455
03456
03457
03458
03459
03460 static VALUE
03461 econv_destination_encoding(VALUE self)
03462 {
03463 rb_econv_t *ec = check_econv(self);
03464 if (!ec->destination_encoding)
03465 return Qnil;
03466 return rb_enc_from_encoding(ec->destination_encoding);
03467 }
03468
03469
03470
03471
03472
03473
03474
03475
03476
03477
03478
03479
03480
03481
03482
03483
03484
03485
03486
03487
03488
03489
03490
03491 static VALUE
03492 econv_convpath(VALUE self)
03493 {
03494 rb_econv_t *ec = check_econv(self);
03495 VALUE result;
03496 int i;
03497
03498 result = rb_ary_new();
03499 for (i = 0; i < ec->num_trans; i++) {
03500 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
03501 VALUE v;
03502 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
03503 v = rb_str_new_cstr(tr->dst_encoding);
03504 else
03505 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
03506 rb_ary_push(result, v);
03507 }
03508 return result;
03509 }
03510
03511
03512
03513
03514
03515 static VALUE
03516 econv_equal(VALUE self, VALUE other)
03517 {
03518 rb_econv_t *ec1 = check_econv(self);
03519 rb_econv_t *ec2;
03520 int i;
03521
03522 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
03523 return Qnil;
03524 }
03525 ec2 = DATA_PTR(other);
03526 if (!ec2) return Qfalse;
03527 if (ec1->source_encoding_name != ec2->source_encoding_name &&
03528 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
03529 return Qfalse;
03530 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
03531 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
03532 return Qfalse;
03533 if (ec1->flags != ec2->flags) return Qfalse;
03534 if (ec1->replacement_enc != ec2->replacement_enc &&
03535 strcmp(ec1->replacement_enc, ec2->replacement_enc))
03536 return Qfalse;
03537 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
03538 if (ec1->replacement_str != ec2->replacement_str &&
03539 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
03540 return Qfalse;
03541
03542 if (ec1->num_trans != ec2->num_trans) return Qfalse;
03543 for (i = 0; i < ec1->num_trans; i++) {
03544 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
03545 return Qfalse;
03546 }
03547 return Qtrue;
03548 }
03549
03550 static VALUE
03551 econv_result_to_symbol(rb_econv_result_t res)
03552 {
03553 switch (res) {
03554 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
03555 case econv_incomplete_input: return sym_incomplete_input;
03556 case econv_undefined_conversion: return sym_undefined_conversion;
03557 case econv_destination_buffer_full: return sym_destination_buffer_full;
03558 case econv_source_buffer_empty: return sym_source_buffer_empty;
03559 case econv_finished: return sym_finished;
03560 case econv_after_output: return sym_after_output;
03561 default: return INT2NUM(res);
03562 }
03563 }
03564
03565
03566
03567
03568
03569
03570
03571
03572
03573
03574
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
03587
03588
03589
03590
03591
03592
03593
03594
03595
03596
03597
03598
03599
03600
03601
03602
03603
03604
03605
03606
03607
03608
03609
03610
03611
03612
03613
03614
03615
03616
03617
03618
03619
03620
03621
03622
03623
03624
03625
03626
03627
03628
03629
03630
03631
03632
03633
03634
03635
03636
03637
03638
03639
03640
03641
03642
03643
03644
03645
03646
03647
03648
03649
03650
03651
03652
03653
03654
03655
03656
03657
03658
/*
 * Encoding::Converter#primitive_convert
 *
 * Arguments (parsed with the "23:" scan format: two required, up to three
 * optional, plus an optional keyword hash):
 *   input             - source String, or nil for "no input"
 *   output            - destination String (modified in place)
 *   output_byteoffset - where in +output+ to start writing; nil means the
 *                       current end of +output+
 *   output_bytesize   - maximum number of bytes to write; nil lets this
 *                       function choose and grow the size itself
 *   flags / opt       - either an Integer flag set, or a hash with
 *                       :partial_input / :after_output keys (not both)
 *
 * Consumed bytes are dropped from +input+, +output+ is truncated/extended
 * to the last written byte, and the result code is returned as a symbol
 * (see econv_result_to_symbol).
 *
 * Raises ArgumentError for a negative or out-of-range offset/size, and
 * when an automatically grown output size would overflow.
 */
static VALUE
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
{
    VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
    rb_econv_t *ec = check_econv(self);
    rb_econv_result_t res;
    const unsigned char *ip, *is;
    unsigned char *op, *os;
    long output_byteoffset, output_bytesize;
    unsigned long output_byteend;
    int flags;

    argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);

    /* Placeholder 0 when nil; the real offset is computed after "retry:". */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = 0;
    else
        output_byteoffset = NUM2LONG(output_byteoffset_v);

    /* Placeholder 0 when nil; the real size is chosen further below. */
    if (NIL_P(output_bytesize_v))
        output_bytesize = 0;
    else
        output_bytesize = NUM2LONG(output_bytesize_v);

    if (!NIL_P(flags_v)) {
        /* A positional flags argument and a keyword hash are mutually
         * exclusive; report it as an arity error. */
        if (!NIL_P(opt)) {
            rb_error_arity(argc + 1, 2, 5);
        }
        flags = NUM2INT(rb_to_int(flags_v));
    }
    else if (!NIL_P(opt)) {
        /* Build the flag set from the truthy keyword options. */
        VALUE v;
        flags = 0;
        v = rb_hash_aref(opt, sym_partial_input);
        if (RTEST(v))
            flags |= ECONV_PARTIAL_INPUT;
        v = rb_hash_aref(opt, sym_after_output);
        if (RTEST(v))
            flags |= ECONV_AFTER_OUTPUT;
    }
    else {
        flags = 0;
    }

    StringValue(output);
    if (!NIL_P(input))
        StringValue(input);
    /* output is written through raw pointers below. */
    rb_str_modify(output);

    /* No explicit size: start with the larger of RSTRING_EMBED_LEN_MAX and
     * the input length; it is doubled on demand in the retry path. */
    if (NIL_P(output_bytesize_v)) {
        output_bytesize = RSTRING_EMBED_LEN_MAX;
        if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
            output_bytesize = RSTRING_LEN(input);
    }

  retry:

    /* nil offset means "append": write at the current end of output. */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = RSTRING_LEN(output);

    if (output_byteoffset < 0)
        rb_raise(rb_eArgError, "negative output_byteoffset");

    if (RSTRING_LEN(output) < output_byteoffset)
        rb_raise(rb_eArgError, "output_byteoffset too big");

    if (output_bytesize < 0)
        rb_raise(rb_eArgError, "negative output_bytesize");

    /* Sum in unsigned arithmetic so wrap-around can be detected. */
    output_byteend = (unsigned long)output_byteoffset +
        (unsigned long)output_bytesize;

    if (output_byteend < (unsigned long)output_byteoffset ||
        LONG_MAX < output_byteend)
        rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");

    /* Make sure the buffer can hold [offset, offset + size). */
    if (rb_str_capacity(output) < output_byteend)
        rb_str_resize(output, output_byteend);

    if (NIL_P(input)) {
        ip = is = NULL;
    }
    else {
        ip = (const unsigned char *)RSTRING_PTR(input);
        is = ip + RSTRING_LEN(input);
    }

    op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
    os = op + output_bytesize;

    /* ip and op are advanced past the consumed / produced bytes. */
    res = rb_econv_convert(ec, &ip, is, &op, os, flags);
    rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
    if (!NIL_P(input))
        rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));

    /* Caller gave no size and the buffer filled up: double the size and go
     * again.  Clearing output_byteoffset_v makes the next pass append after
     * what has already been written. */
    if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
        if (LONG_MAX / 2 < output_bytesize)
            rb_raise(rb_eArgError, "too long conversion result");
        output_bytesize *= 2;
        output_byteoffset_v = Qnil;
        goto retry;
    }

    if (ec->destination_encoding) {
        rb_enc_associate(output, ec->destination_encoding);
    }

    return econv_result_to_symbol(res);
}
03768
03769
03770
03771
03772
03773
03774
03775
03776
03777
03778
03779
03780
03781
03782
03783
03784
03785
03786
03787
03788
03789
03790
03791
03792
03793
03794
03795
03796
03797
03798
03799
03800
03801
03802
03803 static VALUE
03804 econv_convert(VALUE self, VALUE source_string)
03805 {
03806 VALUE ret, dst;
03807 VALUE av[5];
03808 int ac;
03809 rb_econv_t *ec = check_econv(self);
03810
03811 StringValue(source_string);
03812
03813 dst = rb_str_new(NULL, 0);
03814
03815 av[0] = rb_str_dup(source_string);
03816 av[1] = dst;
03817 av[2] = Qnil;
03818 av[3] = Qnil;
03819 av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
03820 ac = 5;
03821
03822 ret = econv_primitive_convert(ac, av, self);
03823
03824 if (ret == sym_invalid_byte_sequence ||
03825 ret == sym_undefined_conversion ||
03826 ret == sym_incomplete_input) {
03827 VALUE exc = make_econv_exception(ec);
03828 rb_exc_raise(exc);
03829 }
03830
03831 if (ret == sym_finished) {
03832 rb_raise(rb_eArgError, "converter already finished");
03833 }
03834
03835 if (ret != sym_source_buffer_empty) {
03836 rb_bug("unexpected result of econv_primitive_convert");
03837 }
03838
03839 return dst;
03840 }
03841
03842
03843
03844
03845
03846
03847
03848
03849
03850
03851
03852
03853 static VALUE
03854 econv_finish(VALUE self)
03855 {
03856 VALUE ret, dst;
03857 VALUE av[5];
03858 int ac;
03859 rb_econv_t *ec = check_econv(self);
03860
03861 dst = rb_str_new(NULL, 0);
03862
03863 av[0] = Qnil;
03864 av[1] = dst;
03865 av[2] = Qnil;
03866 av[3] = Qnil;
03867 av[4] = INT2NUM(0);
03868 ac = 5;
03869
03870 ret = econv_primitive_convert(ac, av, self);
03871
03872 if (ret == sym_invalid_byte_sequence ||
03873 ret == sym_undefined_conversion ||
03874 ret == sym_incomplete_input) {
03875 VALUE exc = make_econv_exception(ec);
03876 rb_exc_raise(exc);
03877 }
03878
03879 if (ret != sym_finished) {
03880 rb_bug("unexpected result of econv_primitive_convert");
03881 }
03882
03883 return dst;
03884 }
03885
03886
03887
03888
03889
03890
03891
03892
03893
03894
03895
03896
03897
03898
03899
03900
03901
03902
03903
03904
03905
03906
03907
03908
03909
03910
03911
03912
03913
03914
03915
03916
03917
03918
03919
03920
03921
03922
03923
03924
03925
03926
03927
03928
03929
03930
03931
03932
03933
03934
03935
03936
03937
03938
03939
03940
03941
03942
03943
03944
03945
03946
03947
03948
03949
03950
03951
03952
03953
03954
03955
03956
03957
03958
03959
03960
03961 static VALUE
03962 econv_primitive_errinfo(VALUE self)
03963 {
03964 rb_econv_t *ec = check_econv(self);
03965
03966 VALUE ary;
03967
03968 ary = rb_ary_new2(5);
03969
03970 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
03971 rb_ary_store(ary, 4, Qnil);
03972
03973 if (ec->last_error.source_encoding)
03974 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
03975
03976 if (ec->last_error.destination_encoding)
03977 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
03978
03979 if (ec->last_error.error_bytes_start) {
03980 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
03981 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
03982 }
03983
03984 return ary;
03985 }
03986
03987
03988
03989
03990
03991
03992
03993
03994
03995
03996
03997
03998
03999
04000
04001
04002
04003
04004
04005
04006
04007
04008
04009
04010
04011
04012
04013
04014
04015
04016
04017
04018
04019 static VALUE
04020 econv_insert_output(VALUE self, VALUE string)
04021 {
04022 const char *insert_enc;
04023
04024 int ret;
04025
04026 rb_econv_t *ec = check_econv(self);
04027
04028 StringValue(string);
04029 insert_enc = rb_econv_encoding_to_insert_output(ec);
04030 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
04031
04032 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
04033 if (ret == -1) {
04034 rb_raise(rb_eArgError, "too big string");
04035 }
04036
04037 return Qnil;
04038 }
04039
04040
04041
04042
04043
04044
04045
04046
04047
04048
04049
04050
04051
04052
04053
04054
04055
04056
04057
04058
04059
04060
04061
04062
04063
04064 static VALUE
04065 econv_putback(int argc, VALUE *argv, VALUE self)
04066 {
04067 rb_econv_t *ec = check_econv(self);
04068 int n;
04069 int putbackable;
04070 VALUE str, max;
04071
04072 rb_scan_args(argc, argv, "01", &max);
04073
04074 if (NIL_P(max))
04075 n = rb_econv_putbackable(ec);
04076 else {
04077 n = NUM2INT(max);
04078 putbackable = rb_econv_putbackable(ec);
04079 if (putbackable < n)
04080 n = putbackable;
04081 }
04082
04083 str = rb_str_new(NULL, n);
04084 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
04085
04086 if (ec->source_encoding) {
04087 rb_enc_associate(str, ec->source_encoding);
04088 }
04089
04090 return str;
04091 }
04092
04093
04094
04095
04096
04097
04098
04099
04100
04101
04102
04103
04104
04105
04106
04107
04108
04109
04110
04111
04112
04113 static VALUE
04114 econv_last_error(VALUE self)
04115 {
04116 rb_econv_t *ec = check_econv(self);
04117 VALUE exc;
04118
04119 exc = make_econv_exception(ec);
04120 if (NIL_P(exc))
04121 return Qnil;
04122 return exc;
04123 }
04124
04125
04126
04127
04128
04129
04130
04131
04132
04133
04134
04135
04136
04137 static VALUE
04138 econv_get_replacement(VALUE self)
04139 {
04140 rb_econv_t *ec = check_econv(self);
04141 int ret;
04142 rb_encoding *enc;
04143
04144 ret = make_replacement(ec);
04145 if (ret == -1) {
04146 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04147 }
04148
04149 enc = rb_enc_find(ec->replacement_enc);
04150 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
04151 }
04152
04153
04154
04155
04156
04157
04158
04159
04160
04161
04162
04163 static VALUE
04164 econv_set_replacement(VALUE self, VALUE arg)
04165 {
04166 rb_econv_t *ec = check_econv(self);
04167 VALUE string = arg;
04168 int ret;
04169 rb_encoding *enc;
04170
04171 StringValue(string);
04172 enc = rb_enc_get(string);
04173
04174 ret = rb_econv_set_replacement(ec,
04175 (const unsigned char *)RSTRING_PTR(string),
04176 RSTRING_LEN(string),
04177 rb_enc_name(enc));
04178
04179 if (ret == -1) {
04180
04181 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04182 }
04183
04184 return arg;
04185 }
04186
04187 VALUE
04188 rb_econv_make_exception(rb_econv_t *ec)
04189 {
04190 return make_econv_exception(ec);
04191 }
04192
04193 void
04194 rb_econv_check_error(rb_econv_t *ec)
04195 {
04196 VALUE exc;
04197
04198 exc = make_econv_exception(ec);
04199 if (NIL_P(exc))
04200 return;
04201 rb_exc_raise(exc);
04202 }
04203
04204
04205
04206
04207
04208
04209
04210 static VALUE
04211 ecerr_source_encoding_name(VALUE self)
04212 {
04213 return rb_attr_get(self, rb_intern("source_encoding_name"));
04214 }
04215
04216
04217
04218
04219
04220
04221
04222
04223
04224
04225
04226
04227
04228
04229
04230
04231
04232
04233
04234
04235
04236 static VALUE
04237 ecerr_source_encoding(VALUE self)
04238 {
04239 return rb_attr_get(self, rb_intern("source_encoding"));
04240 }
04241
04242
04243
04244
04245
04246
04247
04248 static VALUE
04249 ecerr_destination_encoding_name(VALUE self)
04250 {
04251 return rb_attr_get(self, rb_intern("destination_encoding_name"));
04252 }
04253
04254
04255
04256
04257
04258
04259
04260 static VALUE
04261 ecerr_destination_encoding(VALUE self)
04262 {
04263 return rb_attr_get(self, rb_intern("destination_encoding"));
04264 }
04265
04266
04267
04268
04269
04270
04271
04272
04273
04274
04275
04276
04277
04278
04279
04280
04281 static VALUE
04282 ecerr_error_char(VALUE self)
04283 {
04284 return rb_attr_get(self, rb_intern("error_char"));
04285 }
04286
04287
04288
04289
04290
04291
04292
04293
04294
04295
04296
04297
04298
04299
04300
04301
04302 static VALUE
04303 ecerr_error_bytes(VALUE self)
04304 {
04305 return rb_attr_get(self, rb_intern("error_bytes"));
04306 }
04307
04308
04309
04310
04311
04312
04313
04314 static VALUE
04315 ecerr_readagain_bytes(VALUE self)
04316 {
04317 return rb_attr_get(self, rb_intern("readagain_bytes"));
04318 }
04319
04320
04321
04322
04323
04324
04325
04326
04327
04328
04329
04330
04331
04332
04333
04334
04335
04336
04337
04338
04339
04340
04341
04342
04343
04344 static VALUE
04345 ecerr_incomplete_input(VALUE self)
04346 {
04347 return rb_attr_get(self, rb_intern("incomplete_input"));
04348 }
04349
04350
04351
04352
04353
04354
04355
04356
04357
04358
04359
04360
04361
04362
04363
04364
04365
04366
04367
04368
04369
04370
04371
/*
 * Initialization of the transcoding subsystem: defines the conversion
 * error classes, creates the transcoder table, interns the symbols used by
 * this file, and registers String#encode / String#encode!, the
 * Encoding::Converter class with its methods and constants, and the
 * accessor methods on the conversion error classes.  Ends by calling
 * Init_newline().
 */
void
Init_transcode(void)
{
    /* Conversion error classes, all nested under Encoding and derived from
     * EncodingError. */
    rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
    rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
    rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);

    /* Registry of transcoders, created as a case-insensitive string table. */
    transcoder_table = st_init_strcasetable();

    /* Symbols used as option keys / option values. */
    sym_invalid = ID2SYM(rb_intern("invalid"));
    sym_undef = ID2SYM(rb_intern("undef"));
    sym_replace = ID2SYM(rb_intern("replace"));
    sym_fallback = ID2SYM(rb_intern("fallback"));
    sym_aref = ID2SYM(rb_intern("[]"));
    sym_xml = ID2SYM(rb_intern("xml"));
    sym_text = ID2SYM(rb_intern("text"));
    sym_attr = ID2SYM(rb_intern("attr"));

    /* Symbols returned for conversion results (see econv_result_to_symbol)
     * and symbols for newline / partial-input options. */
    sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
    sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
    sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
    sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
    sym_finished = ID2SYM(rb_intern("finished"));
    sym_after_output = ID2SYM(rb_intern("after_output"));
    sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
    sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
    sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
    sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
    sym_partial_input = ID2SYM(rb_intern("partial_input"));

#ifdef ENABLE_ECONV_NEWLINE_OPTION
    /* Symbols for the :newline option, only when that option is enabled. */
    sym_newline = ID2SYM(rb_intern("newline"));
    sym_universal = ID2SYM(rb_intern("universal"));
    sym_crlf = ID2SYM(rb_intern("crlf"));
    sym_cr = ID2SYM(rb_intern("cr"));
    sym_lf = ID2SYM(rb_intern("lf"));
#endif

    /* String#encode and String#encode! (variable argument count). */
    rb_define_method(rb_cString, "encode", str_encode, -1);
    rb_define_method(rb_cString, "encode!", str_encode_bang, -1);

    /* Encoding::Converter: class, allocator, singleton and instance
     * methods. */
    rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
    rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
    rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
    rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
    rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
    rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
    rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
    rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
    rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
    rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
    rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
    rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
    rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
    rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
    rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
    rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
    rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
    rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
    rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);

    /* Encoding::Converter::INVALID_MASK mirrors ECONV_INVALID_MASK. */
    rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));

    /* Encoding::Converter::INVALID_REPLACE mirrors ECONV_INVALID_REPLACE. */
    rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));

    /* Encoding::Converter::UNDEF_MASK mirrors ECONV_UNDEF_MASK. */
    rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));

    /* Encoding::Converter::UNDEF_REPLACE mirrors ECONV_UNDEF_REPLACE. */
    rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));

    /* Encoding::Converter::UNDEF_HEX_CHARREF mirrors
     * ECONV_UNDEF_HEX_CHARREF. */
    rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));

    /* Encoding::Converter::PARTIAL_INPUT mirrors ECONV_PARTIAL_INPUT
     * (accepted as a primitive_convert flag). */
    rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));

    /* Encoding::Converter::AFTER_OUTPUT mirrors ECONV_AFTER_OUTPUT
     * (accepted as a primitive_convert flag). */
    rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));

    /* Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR mirrors
     * ECONV_UNIVERSAL_NEWLINE_DECORATOR. */
    rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));

    /* Encoding::Converter::CRLF_NEWLINE_DECORATOR mirrors
     * ECONV_CRLF_NEWLINE_DECORATOR. */
    rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));

    /* Encoding::Converter::CR_NEWLINE_DECORATOR mirrors
     * ECONV_CR_NEWLINE_DECORATOR. */
    rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));

    /* Encoding::Converter::XML_TEXT_DECORATOR mirrors
     * ECONV_XML_TEXT_DECORATOR. */
    rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));

    /* Encoding::Converter::XML_ATTR_CONTENT_DECORATOR mirrors
     * ECONV_XML_ATTR_CONTENT_DECORATOR. */
    rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));

    /* Encoding::Converter::XML_ATTR_QUOTE_DECORATOR mirrors
     * ECONV_XML_ATTR_QUOTE_DECORATOR. */
    rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));

    /* Accessors on Encoding::UndefinedConversionError. */
    rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
    rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
    rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
    rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
    rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);

    /* Accessors on Encoding::InvalidByteSequenceError. */
    rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
    rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
    rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
    rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
    rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
    rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
    rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);

    Init_newline();
}
04532