Ignore:
Timestamp:
Jul 13, 2009, 10:52:08 PM (12 years ago)
Author:
charles
Message:

(trunk libT) update JSON_parser.c: has UTF16 fix

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/libtransmission/JSON_parser.c

    r8409 r8799  
    3333   
    3434    Changelog:
     35        2009-05-17
     36            Incorporated benrudiak@googlemail.com fix for UTF16 decoding.
     37           
    3538        2009-05-14
    3639            Fixed float parsing bug related to a locale being set that didn't
     
    6669
    6770#include "JSON_parser.h"
    68 #include "ConvertUTF.h"
    6971
    7072#ifdef _MSC_VER
     
    8890#endif
    8991
     92typedef unsigned short UTF16;
    9093
    9194struct JSON_parser_struct {
     
    9396    void* ctx;
    9497    signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually;
    95     UTF16 utf16_decode_buffer[2];
     98    UTF16 utf16_high_surrogate;
    9699    long depth;
    97100    long top;
     
    237240    IX = -20, /* integer detected by 1-9 */
    238241    EX = -21, /* next char is escaped */
    239     UC = -22, /* Unicode character read */
     242    UC = -22  /* Unicode character read */
    240243};
    241244
     
    518521}
    519522
     523#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
     524#define IS_LOW_SURROGATE(uc)  (((uc) & 0xFC00) == 0xDC00)
     525#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
     526static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
     527
    520528static int decode_unicode_char(JSON_parser jc)
    521529{
    522     const unsigned chars = jc->utf16_decode_buffer[0] ? 2 : 1;
    523530    int i;
    524     UTF16 *uc = chars == 1 ? &jc->utf16_decode_buffer[0] : &jc->utf16_decode_buffer[1];
    525     UTF16 x;
     531    unsigned uc = 0;
    526532    char* p;
     533    int trail_bytes;
    527534   
    528535    assert(jc->parse_buffer_count >= 6);
     
    530537    p = &jc->parse_buffer[jc->parse_buffer_count - 4];
    531538   
    532     for (i = 0; i < 4; ++i, ++p) {
    533         x = *p;
     539    for (i = 12; i >= 0; i -= 4, ++p) {
     540        unsigned x = *p;
    534541       
    535542        if (x >= 'a') {
     
    538545            x -= ('A' - 10);
    539546        } else {
    540             x &= ~((UTF16) 0x30);
     547            x &= ~0x30u;
    541548        }
    542549       
    543550        assert(x < 16);
    544551       
    545         *uc |= x << ((3u - i) << 2);
    546     }
    547    
    548     /* clear UTF-16 char form buffer */
     552        uc |= x << i;
     553    }
     554   
     555    /* clear UTF-16 char from buffer */
    549556    jc->parse_buffer_count -= 6;
    550557    jc->parse_buffer[jc->parse_buffer_count] = 0;
    551558   
    552559    /* attempt decoding ... */
    553     {
    554         UTF8* dec_start = (UTF8*)&jc->parse_buffer[jc->parse_buffer_count];
    555         UTF8* dec_start_dup = dec_start;
    556         UTF8* dec_end = dec_start + 6;
    557        
    558         const UTF16* enc_start = &jc->utf16_decode_buffer[0];
    559         const UTF16* enc_end = enc_start + chars;
    560    
    561         const ConversionResult result = ConvertUTF16toUTF8(
    562             &enc_start, enc_end, &dec_start, dec_end, strictConversion);
    563        
    564         const size_t new_chars = dec_start - dec_start_dup;
    565        
    566         /* was it a surrogate UTF-16 char? */
    567         if (chars == 1 && result == sourceExhausted) {
     560    if (jc->utf16_high_surrogate) {
     561        if (IS_LOW_SURROGATE(uc)) {
     562            uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc);
     563            trail_bytes = 3;
     564            jc->utf16_high_surrogate = 0;
     565        } else {
     566            /* high surrogate without a following low surrogate */
     567            return false;
     568        }
     569    } else {
     570        if (uc < 0x80) {
     571            trail_bytes = 0;
     572        } else if (uc < 0x800) {
     573            trail_bytes = 1;
     574        } else if (IS_HIGH_SURROGATE(uc)) {
     575            /* save the high surrogate and wait for the low surrogate */
     576            jc->utf16_high_surrogate = uc;
    568577            return true;
    569         }
    570        
    571         if (result != conversionOK) {
     578        } else if (IS_LOW_SURROGATE(uc)) {
     579            /* low surrogate without a preceding high surrogate */
    572580            return false;
    573         }
    574        
    575         /* NOTE: clear decode buffer to resume string reading,
    576            otherwise we continue to read UTF-16 */
    577         jc->utf16_decode_buffer[0] = 0;
    578        
    579         assert(new_chars <= 6);
    580        
    581         jc->parse_buffer_count += new_chars;
    582         jc->parse_buffer[jc->parse_buffer_count] = 0;
    583     }
     581        } else {
     582            trail_bytes = 2;
     583        }
     584    }
     585   
     586    jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]);
     587   
     588    for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) {
     589        jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80);
     590    }
     591
     592    jc->parse_buffer[jc->parse_buffer_count] = 0;
    584593   
    585594    return true;
     
    695704            }
    696705            /* check if we need to read a second UTF-16 char */
    697             if (jc->utf16_decode_buffer[0]) {
     706            if (jc->utf16_high_surrogate) {
    698707                jc->state = D1;
    699708            } else {
Note: See TracChangeset for help on using the changeset viewer.