Changeset 7654
- Timestamp:
- Jan 10, 2009, 5:59:49 PM (12 years ago)
- Location:
- trunk/libtransmission
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/libtransmission/ConvertUTF.c
r6795 r7654 1 1 /* 2 2 * Copyright 2001-2004 Unicode, Inc. 3 * 3 * 4 4 * Disclaimer 5 * 5 * 6 6 * This source code is provided as is by Unicode, Inc. No claims are 7 7 * made as to fitness for any particular purpose. No warranties of any … … 11 11 * sole remedy for any claim will be exchange of defective media 12 12 * within 90 days of receipt. 13 * 13 * 14 14 * Limitations on Rights to Redistribute This Code 15 * 15 * 16 16 * Unicode, Inc. hereby grants the right to freely use the information 17 17 * supplied in this file in the creation of products supporting the … … 27 27 Rev History: Rick McGowan, fixes & updates May 2001. 28 28 Sept 2001: fixed const & error conditions per 29 29 mods suggested by S. Parent & A. Lillich. 30 30 June 2002: Tim Dodd added detection and handling of incomplete 31 32 31 source sequences, enhanced error detection, added casts 32 to eliminate compiler warnings. 33 33 July 2003: slight mods to back out aggressive FFFE detection. 34 34 Jan 2004: updated switches in from-UTF8 conversions. 35 35 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 36 May 2006: updated isLegalUTF8Sequence. 36 37 37 38 See the header file "ConvertUTF.h" for complete documentation. 38 39 39 40 ------------------------------------------------------------------------ */ 40 41 41 42 42 43 #include "ConvertUTF.h" 43 44 #ifdef CVTUTF_DEBUG 44 45 #include <stdio.h> 45 46 #endif 46 47 47 static const int 48 static const int halfShift = 10; /* used for shifting by 10 bits */ 48 49 49 50 static const UTF32 halfBase = 0x0010000UL; … … 54 55 #define UNI_SUR_LOW_START (UTF32)0xDC00 55 56 #define UNI_SUR_LOW_END (UTF32)0xDFFF 56 #define false 0 57 #define true 1 58 59 /* --------------------------------------------------------------------- */ 60 61 ConversionResult 62 ConvertUTF32toUTF16( const UTF32** sourceStart, 63 const UTF32* sourceEnd, 64 UTF16** targetStart, 65 UTF16* targetEnd, 66 ConversionFlags flags ) 67 { 57 #define false 0 58 #define true 1 59 60 /* --------------------------------------------------------------------- */ 61 62 ConversionResult ConvertUTF32toUTF16 ( 63 const UTF32** sourceStart, const UTF32* sourceEnd, 64 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 68 65 ConversionResult result = conversionOK; 69 const UTF32* source = *sourceStart; 70 UTF16* target = *targetStart; 71 72 while( source < sourceEnd ) 73 { 74 UTF32 ch; 75 if( target >= targetEnd ) 76 { 77 result = targetExhausted; break; 78 } 79 ch = *source++; 80 if( ch <= UNI_MAX_BMP ) /* Target is a character <= 0xFFFF */ 81 { /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are 82 both reserved values */ 83 if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) 84 { 85 if( flags == strictConversion ) 86 { 87 --source; /* return to the illegal value itself */ 88 result = sourceIllegal; 89 break; 90 } 91 else 92 { 93 *target++ = UNI_REPLACEMENT_CHAR; 94 } 95 } 96 else 97 { 98 *target++ = (UTF16)ch; /* normal case */ 99 } 100 } 101 else if( ch > UNI_MAX_LEGAL_UTF32 ) 102 { 103 if( flags == strictConversion ) 104 { 105 result = sourceIllegal; 106 } 107 else 108 { 109 *target++ = UNI_REPLACEMENT_CHAR; 110 } 111 } 112 else 113 { 114 /* target is a character in range 0xFFFF - 0x10FFFF. */ 115 if( target + 1 >= targetEnd ) 116 { 117 --source; /* Back up source pointer! */ 118 result = targetExhausted; break; 119 } 120 ch -= halfBase; 121 *target++ = (UTF16)( ( ch >> halfShift ) + UNI_SUR_HIGH_START ); 122 *target++ = (UTF16)( ( ch & halfMask ) + UNI_SUR_LOW_START ); 123 } 124 } 125 66 const UTF32* source = *sourceStart; 67 UTF16* target = *targetStart; 68 while (source < sourceEnd) { 69 UTF32 ch; 70 if (target >= targetEnd) { 71 result = targetExhausted; break; 72 } 73 ch = *source++; 74 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 75 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 76 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 77 if (flags == strictConversion) { 78 --source; /* return to the illegal value itself */ 79 result = sourceIllegal; 80 break; 81 } else { 82 *target++ = UNI_REPLACEMENT_CHAR; 83 } 84 } else { 85 *target++ = (UTF16)ch; /* normal case */ 86 } 87 } else if (ch > UNI_MAX_LEGAL_UTF32) { 88 if (flags == strictConversion) { 89 result = sourceIllegal; 90 } else { 91 *target++ = UNI_REPLACEMENT_CHAR; 92 } 93 } else { 94 /* target is a character in range 0xFFFF - 0x10FFFF. */ 95 if (target + 1 >= targetEnd) { 96 --source; /* Back up source pointer! */ 97 result = targetExhausted; break; 98 } 99 ch -= halfBase; 100 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 101 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 102 } 103 } 126 104 *sourceStart = source; 127 105 *targetStart = target; … … 131 109 /* --------------------------------------------------------------------- */ 132 110 133 ConversionResult 134 ConvertUTF16toUTF32( const UTF16** sourceStart, 135 const UTF16* sourceEnd, 136 UTF32** targetStart, 137 UTF32* targetEnd, 138 ConversionFlags flags ) 139 { 111 ConversionResult ConvertUTF16toUTF32 ( 112 const UTF16** sourceStart, const UTF16* sourceEnd, 113 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 140 114 ConversionResult result = conversionOK; 141 const UTF16* source = *sourceStart; 142 UTF32* target = *targetStart; 143 UTF32 ch, ch2; 144 145 while( source < sourceEnd ) 146 { 147 const UTF16* oldSource = source; /* In case we have to back up because 148 of target overflow. */ 149 ch = *source++; 150 /* If we have a surrogate pair, convert to UTF32 first. */ 151 if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END ) 152 { 153 /* If the 16 bits following the high surrogate are in the source 154 buffer... */ 155 if( source < sourceEnd ) 156 { 157 ch2 = *source; 158 /* If it's a low surrogate, convert to UTF32. */ 159 if( ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END ) 160 { 161 ch = ( ( ch - UNI_SUR_HIGH_START ) << halfShift ) 162 + ( ch2 - UNI_SUR_LOW_START ) + halfBase; 163 ++source; 164 } 165 else if( flags == strictConversion ) /* it's an unpaired high 166 surrogate */ 167 { 168 --source; /* return to the illegal value itself */ 169 result = sourceIllegal; 170 break; 171 } 172 } 173 else /* We don't have the 16 bits following the high surrogate. */ 174 { 175 --source; /* return to the high surrogate */ 176 result = sourceExhausted; 177 break; 178 } 179 } 180 else if( flags == strictConversion ) 181 { 182 /* UTF-16 surrogate values are illegal in UTF-32 */ 183 if( ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END ) 184 { 185 --source; /* return to the illegal value itself */ 186 result = sourceIllegal; 187 break; 188 } 189 } 190 if( target >= targetEnd ) 191 { 192 source = oldSource; /* Back up source pointer! */ 193 result = targetExhausted; break; 194 } 195 *target++ = ch; 196 } 197 115 const UTF16* source = *sourceStart; 116 UTF32* target = *targetStart; 117 UTF32 ch, ch2; 118 while (source < sourceEnd) { 119 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 120 ch = *source++; 121 /* If we have a surrogate pair, convert to UTF32 first. */ 122 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 123 /* If the 16 bits following the high surrogate are in the source buffer... */ 124 if (source < sourceEnd) { 125 ch2 = *source; 126 /* If it's a low surrogate, convert to UTF32. */ 127 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 128 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 129 + (ch2 - UNI_SUR_LOW_START) + halfBase; 130 ++source; 131 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 132 --source; /* return to the illegal value itself */ 133 result = sourceIllegal; 134 break; 135 } 136 } else { /* We don't have the 16 bits following the high surrogate. */ 137 --source; /* return to the high surrogate */ 138 result = sourceExhausted; 139 break; 140 } 141 } else if (flags == strictConversion) { 142 /* UTF-16 surrogate values are illegal in UTF-32 */ 143 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 144 --source; /* return to the illegal value itself */ 145 result = sourceIllegal; 146 break; 147 } 148 } 149 if (target >= targetEnd) { 150 source = oldSource; /* Back up source pointer! */ 151 result = targetExhausted; break; 152 } 153 *target++ = ch; 154 } 198 155 *sourceStart = source; 199 156 *targetStart = target; 200 157 #ifdef CVTUTF_DEBUG 201 if( result == sourceIllegal ) 202 { 203 fprintf( stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", 204 ch, 205 ch2 ); 206 fflush( stderr ); 207 } 158 if (result == sourceIllegal) { 159 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 160 fflush(stderr); 161 } 208 162 #endif 209 163 return result; … … 219 173 * allowed in earlier algorithms. 220 174 */ 221 static const char trailingBytesForUTF8[256] = { 222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0, 234 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 235 1, 1, 1, 1, 1, 1, 1, 1, 236 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 237 4, 4, 4, 4, 5, 5, 5, 5 175 static const char trailingBytesForUTF8[256] = { 176 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 177 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 178 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 179 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 180 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 181 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 182 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 183 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 238 184 }; 239 185 … … 243 189 * in a UTF-8 sequence. 244 190 */ 245 static const UTF32 offsetsFromUTF8[6] = 246 { 0x00000000UL, 0x00003080UL, 247 0x000E2080UL, 248 0x03C82080UL, 0xFA082080UL, 249 0x82082080UL }; 191 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 192 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 250 193 251 194 /* … … 256 199 * for *legal* UTF-8 will be 4 or fewer bytes total. 257 200 */ 258 static const UTF8 firstByteMark[7] = 259 { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 201 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 260 202 261 203 /* --------------------------------------------------------------------- */ … … 271 213 /* --------------------------------------------------------------------- */ 272 214 273 ConversionResult 274 ConvertUTF16toUTF8( const UTF16** sourceStart, 275 const UTF16* sourceEnd, 276 UTF8** targetStart, 277 UTF8* targetEnd, 278 ConversionFlags flags ) 279 { 215 ConversionResult ConvertUTF16toUTF8 ( 216 const UTF16** sourceStart, const UTF16* sourceEnd, 217 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 280 218 ConversionResult result = conversionOK; 281 const UTF16* source = *sourceStart; 282 UTF8* target = *targetStart; 283 284 while( source < sourceEnd ) 285 { 286 UTF32 ch; 287 unsigned short bytesToWrite = 0; 288 const UTF32 byteMask = 0xBF; 289 const UTF32 byteMark = 0x80; 290 const UTF16* oldSource = source; /* In case we have to back up because 291 of target overflow. */ 292 ch = *source++; 293 /* If we have a surrogate pair, convert to UTF32 first. */ 294 if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END ) 295 { 296 /* If the 16 bits following the high surrogate are in the source 297 buffer... */ 298 if( source < sourceEnd ) 299 { 300 UTF32 ch2 = *source; 301 /* If it's a low surrogate, convert to UTF32. */ 302 if( ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END ) 303 { 304 ch = ( ( ch - UNI_SUR_HIGH_START ) << halfShift ) 305 + ( ch2 - UNI_SUR_LOW_START ) + halfBase; 306 ++source; 307 } 308 else if( flags == strictConversion ) /* it's an unpaired high 309 surrogate */ 310 { 311 --source; /* return to the illegal value itself */ 312 result = sourceIllegal; 313 break; 314 } 315 } 316 else /* We don't have the 16 bits following the high surrogate. */ 317 { 318 --source; /* return to the high surrogate */ 319 result = sourceExhausted; 320 break; 321 } 322 } 323 else if( flags == strictConversion ) 324 { 325 /* UTF-16 surrogate values are illegal in UTF-32 */ 326 if( ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END ) 327 { 328 --source; /* return to the illegal value itself */ 329 result = sourceIllegal; 330 break; 331 } 332 } 333 /* Figure out how many bytes the result will require */ 334 if( ch < (UTF32)0x80 ) 335 { 336 bytesToWrite = 1; 337 } 338 else if( ch < (UTF32)0x800 ) 339 { 340 bytesToWrite = 2; 341 } 342 else if( ch < (UTF32)0x10000 ) 343 { 344 bytesToWrite = 3; 345 } 346 else if( ch < (UTF32)0x110000 ) 347 { 348 bytesToWrite = 4; 349 } 350 else 351 { 352 bytesToWrite = 3; 353 ch = UNI_REPLACEMENT_CHAR; 354 } 355 356 target += bytesToWrite; 357 if( target > targetEnd ) 358 { 359 source = oldSource; /* Back up source pointer! */ 360 target -= bytesToWrite; result = targetExhausted; break; 361 } 362 switch( bytesToWrite ) /* note: everything falls through. */ 363 { 364 case 4: 365 *--target = 366 (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; 367 368 case 3: 369 *--target = 370 (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; 371 372 case 2: 373 *--target = 374 (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; 375 376 case 1: 377 *--target = (UTF8)( ch | firstByteMark[bytesToWrite] ); 378 } 379 target += bytesToWrite; 380 } 381 219 const UTF16* source = *sourceStart; 220 UTF8* target = *targetStart; 221 while (source < sourceEnd) { 222 UTF32 ch; 223 unsigned short bytesToWrite = 0; 224 const UTF32 byteMask = 0xBF; 225 const UTF32 byteMark = 0x80; 226 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 227 ch = *source++; 228 /* If we have a surrogate pair, convert to UTF32 first. */ 229 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 230 /* If the 16 bits following the high surrogate are in the source buffer... */ 231 if (source < sourceEnd) { 232 UTF32 ch2 = *source; 233 /* If it's a low surrogate, convert to UTF32. */ 234 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 235 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 236 + (ch2 - UNI_SUR_LOW_START) + halfBase; 237 ++source; 238 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 239 --source; /* return to the illegal value itself */ 240 result = sourceIllegal; 241 break; 242 } 243 } else { /* We don't have the 16 bits following the high surrogate. */ 244 --source; /* return to the high surrogate */ 245 result = sourceExhausted; 246 break; 247 } 248 } else if (flags == strictConversion) { 249 /* UTF-16 surrogate values are illegal in UTF-32 */ 250 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 251 --source; /* return to the illegal value itself */ 252 result = sourceIllegal; 253 break; 254 } 255 } 256 /* Figure out how many bytes the result will require */ 257 if (ch < (UTF32)0x80) { bytesToWrite = 1; 258 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 259 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 260 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 261 } else { bytesToWrite = 3; 262 ch = UNI_REPLACEMENT_CHAR; 263 } 264 265 target += bytesToWrite; 266 if (target > targetEnd) { 267 source = oldSource; /* Back up source pointer! */ 268 target -= bytesToWrite; result = targetExhausted; break; 269 } 270 switch (bytesToWrite) { /* note: everything falls through. */ 271 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 272 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 273 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 274 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 275 } 276 target += bytesToWrite; 277 } 382 278 *sourceStart = source; 383 279 *targetStart = target; … … 398 294 */ 399 295 400 static Boolean 401 isLegalUTF8( const UTF8 *source, 402 int length ) 403 { 404 UTF8 a; 405 const UTF8 *srcptr = source + length; 406 407 switch( length ) 408 { 409 default: 410 return false; 411 412 /* Everything else falls through when "true"... */ 413 case 4: 414 if( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return false; 415 416 case 3: 417 if( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return false; 418 419 case 2: 420 if( ( a = ( *--srcptr ) ) > 0xBF ) return false; 421 422 switch( *source ) 423 { 424 /* no fall-through in this inner switch */ 425 case 0xE0: 426 if( a < 0xA0 ) return false;break; 427 428 case 0xED: 429 if( a > 0x9F ) return false;break; 430 431 case 0xF0: 432 if( a < 0x90 ) return false;break; 433 434 case 0xF4: 435 if( a > 0x8F ) return false;break; 436 437 default: 438 if( a < 0x80 ) return false; 439 } 440 441 case 1: 442 if( *source >= 0x80 && *source < 0xC2 ) return false; 443 } 444 if( *source > 0xF4 ) return false; 296 static Boolean isLegalUTF8(const UTF8 *source, int length) { 297 UTF8 a; 298 const UTF8 *srcptr = source+length; 299 switch (length) { 300 default: return false; 301 /* Everything else falls through when "true"... */ 302 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 303 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 304 case 2: if ((a = (*--srcptr)) > 0xBF) return false; 305 306 switch (*source) { 307 /* no fall-through in this inner switch */ 308 case 0xE0: if (a < 0xA0) return false; break; 309 case 0xED: if ((a < 0x80) || (a > 0x9F)) return false; break; 310 case 0xF0: if (a < 0x90) return false; break; 311 case 0xF4: if (a > 0x8F) return false; break; 312 default: if (a < 0x80) return false; 313 } 314 315 case 1: if (*source >= 0x80 && *source < 0xC2) return false; 316 } 317 if (*source > 0xF4) return false; 445 318 return true; 446 319 } … … 452 325 * This is not used here; it's just exported. 453 326 */ 454 Boolean 455 isLegalUTF8Sequence( const UTF8 *source, 456 const UTF8 *sourceEnd ) 457 { 458 int length = trailingBytesForUTF8[*source] + 1; 459 460 if( source + length > sourceEnd ) 461 { 462 return false; 463 } 464 return isLegalUTF8( source, length ); 465 } 466 467 /* --------------------------------------------------------------------- */ 468 469 ConversionResult 470 ConvertUTF8toUTF16( const UTF8** sourceStart, 471 const UTF8* sourceEnd, 472 UTF16** targetStart, 473 UTF16* targetEnd, 474 ConversionFlags flags ) 475 { 327 328 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 329 int length; 330 if (source == sourceEnd) { 331 return true; 332 } 333 while (true) { 334 length = trailingBytesForUTF8[*source]+1; 335 if (source+length > sourceEnd) { 336 return false; 337 } 338 if (!isLegalUTF8(source, length)) { 339 return false; 340 } 341 source += length; 342 if (source >= sourceEnd) { 343 return true; 344 } 345 } 346 } 347 348 /* --------------------------------------------------------------------- */ 349 350 ConversionResult ConvertUTF8toUTF16 ( 351 const UTF8** sourceStart, const UTF8* sourceEnd, 352 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 476 353 ConversionResult result = conversionOK; 477 const UTF8* source = *sourceStart; 478 UTF16* target = *targetStart; 479 480 while( source < sourceEnd ) 481 { 482 UTF32 ch = 0; 483 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 484 if( source + extraBytesToRead >= sourceEnd ) 485 { 486 result = sourceExhausted; break; 487 } 488 /* Do this check whether lenient or strict */ 489 if( !isLegalUTF8( source, extraBytesToRead + 1 ) ) 490 { 491 result = sourceIllegal; 492 break; 493 } 494 /* 495 * The cases all fall through. See "Note A" below. 496 */ 497 switch( extraBytesToRead ) 498 { 499 case 5: 500 ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 501 502 case 4: 503 ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 504 505 case 3: 506 ch += *source++; ch <<= 6; 507 508 case 2: 509 ch += *source++; ch <<= 6; 510 511 case 1: 512 ch += *source++; ch <<= 6; 513 514 case 0: 515 ch += *source++; 516 } 517 ch -= offsetsFromUTF8[extraBytesToRead]; 518 519 if( target >= targetEnd ) 520 { 521 source -= ( extraBytesToRead + 1 ); /* Back up source pointer! */ 522 result = targetExhausted; break; 523 } 524 if( ch <= UNI_MAX_BMP ) /* Target is a character <= 0xFFFF */ 525 { /* UTF-16 surrogate values are illegal in UTF-32 */ 526 if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) 527 { 528 if( flags == strictConversion ) 529 { 530 source -= ( extraBytesToRead + 1 ); /* return to the illegal 531 value itself */ 532 result = sourceIllegal; 533 break; 534 } 535 else 536 { 537 *target++ = UNI_REPLACEMENT_CHAR; 538 } 539 } 540 else 541 { 542 *target++ = (UTF16)ch; /* normal case */ 543 } 544 } 545 else if( ch > UNI_MAX_UTF16 ) 546 { 547 if( flags == strictConversion ) 548 { 549 result = sourceIllegal; 550 source -= ( extraBytesToRead + 1 ); /* return to the start */ 551 break; /* Bail out; shouldn't continue */ 552 } 553 else 554 { 555 *target++ = UNI_REPLACEMENT_CHAR; 556 } 557 } 558 else 559 { 560 /* target is a character in range 0xFFFF - 0x10FFFF. */ 561 if( target + 1 >= targetEnd ) 562 { 563 source -= ( extraBytesToRead + 1 ); /* Back up source pointer! 564 */ 565 result = targetExhausted; break; 566 } 567 ch -= halfBase; 568 *target++ = (UTF16)( ( ch >> halfShift ) + UNI_SUR_HIGH_START ); 569 *target++ = (UTF16)( ( ch & halfMask ) + UNI_SUR_LOW_START ); 570 } 571 } 572 354 const UTF8* source = *sourceStart; 355 UTF16* target = *targetStart; 356 while (source < sourceEnd) { 357 UTF32 ch = 0; 358 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 359 if (source + extraBytesToRead >= sourceEnd) { 360 result = sourceExhausted; break; 361 } 362 /* Do this check whether lenient or strict */ 363 if (! isLegalUTF8(source, extraBytesToRead+1)) { 364 result = sourceIllegal; 365 break; 366 } 367 /* 368 * The cases all fall through. See "Note A" below. 369 */ 370 switch (extraBytesToRead) { 371 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 372 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 373 case 3: ch += *source++; ch <<= 6; 374 case 2: ch += *source++; ch <<= 6; 375 case 1: ch += *source++; ch <<= 6; 376 case 0: ch += *source++; 377 } 378 ch -= offsetsFromUTF8[extraBytesToRead]; 379 380 if (target >= targetEnd) { 381 source -= (extraBytesToRead+1); /* Back up source pointer! */ 382 result = targetExhausted; break; 383 } 384 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 385 /* UTF-16 surrogate values are illegal in UTF-32 */ 386 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 387 if (flags == strictConversion) { 388 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 389 result = sourceIllegal; 390 break; 391 } else { 392 *target++ = UNI_REPLACEMENT_CHAR; 393 } 394 } else { 395 *target++ = (UTF16)ch; /* normal case */ 396 } 397 } else if (ch > UNI_MAX_UTF16) { 398 if (flags == strictConversion) { 399 result = sourceIllegal; 400 source -= (extraBytesToRead+1); /* return to the start */ 401 break; /* Bail out; shouldn't continue */ 402 } else { 403 *target++ = UNI_REPLACEMENT_CHAR; 404 } 405 } else { 406 /* target is a character in range 0xFFFF - 0x10FFFF. */ 407 if (target + 1 >= targetEnd) { 408 source -= (extraBytesToRead+1); /* Back up source pointer! */ 409 result = targetExhausted; break; 410 } 411 ch -= halfBase; 412 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 413 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 414 } 415 } 573 416 *sourceStart = source; 574 417 *targetStart = target; … … 578 421 /* --------------------------------------------------------------------- */ 579 422 580 ConversionResult 581 ConvertUTF32toUTF8( const UTF32** sourceStart, 582 const UTF32* sourceEnd, 583 UTF8** targetStart, 584 UTF8* targetEnd, 585 ConversionFlags flags ) 586 { 423 ConversionResult ConvertUTF32toUTF8 ( 424 const UTF32** sourceStart, const UTF32* sourceEnd, 425 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 587 426 ConversionResult result = conversionOK; 588 const UTF32* source = *sourceStart; 589 UTF8* target = *targetStart; 590 591 while( source < sourceEnd ) 592 { 593 UTF32 ch; 594 unsigned short bytesToWrite = 0; 595 const UTF32 byteMask = 0xBF; 596 const UTF32 byteMark = 0x80; 597 ch = *source++; 598 if( flags == strictConversion ) 599 { 600 /* UTF-16 surrogate values are illegal in UTF-32 */ 601 if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) 602 { 603 --source; /* return to the illegal value itself */ 604 result = sourceIllegal; 605 break; 606 } 607 } 608 /* 609 * Figure out how many bytes the result will require. Turn any 610 * illegally large UTF32 things (> Plane 17) into replacement chars. 611 */ 612 if( ch < (UTF32)0x80 ) 613 { 614 bytesToWrite = 1; 615 } 616 else if( ch < (UTF32)0x800 ) 617 { 618 bytesToWrite = 2; 619 } 620 else if( ch < (UTF32)0x10000 ) 621 { 622 bytesToWrite = 3; 623 } 624 else if( ch <= UNI_MAX_LEGAL_UTF32 ) 625 { 626 bytesToWrite = 4; 627 } 628 else 629 { 630 bytesToWrite = 3; 631 ch = UNI_REPLACEMENT_CHAR; 632 result = sourceIllegal; 633 } 634 635 target += bytesToWrite; 636 if( target > targetEnd ) 637 { 638 --source; /* Back up source pointer! */ 639 target -= bytesToWrite; result = targetExhausted; break; 640 } 641 switch( bytesToWrite ) /* note: everything falls through. */ 642 { 643 case 4: 644 *--target = 645 (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; 646 647 case 3: 648 *--target = 649 (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; 650 651 case 2: 652 *--target = 653 (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; 654 655 case 1: 656 *--target = (UTF8) ( ch | firstByteMark[bytesToWrite] ); 657 } 658 target += bytesToWrite; 659 } 660 427 const UTF32* source = *sourceStart; 428 UTF8* target = *targetStart; 429 while (source < sourceEnd) { 430 UTF32 ch; 431 unsigned short bytesToWrite = 0; 432 const UTF32 byteMask = 0xBF; 433 const UTF32 byteMark = 0x80; 434 ch = *source++; 435 if (flags == strictConversion ) { 436 /* UTF-16 surrogate values are illegal in UTF-32 */ 437 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 438 --source; /* return to the illegal value itself */ 439 result = sourceIllegal; 440 break; 441 } 442 } 443 /* 444 * Figure out how many bytes the result will require. Turn any 445 * illegally large UTF32 things (> Plane 17) into replacement chars. 446 */ 447 if (ch < (UTF32)0x80) { bytesToWrite = 1; 448 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 449 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 450 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 451 } else { bytesToWrite = 3; 452 ch = UNI_REPLACEMENT_CHAR; 453 result = sourceIllegal; 454 } 455 456 target += bytesToWrite; 457 if (target > targetEnd) { 458 --source; /* Back up source pointer! */ 459 target -= bytesToWrite; result = targetExhausted; break; 460 } 461 switch (bytesToWrite) { /* note: everything falls through. */ 462 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 463 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 464 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 465 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 466 } 467 target += bytesToWrite; 468 } 661 469 *sourceStart = source; 662 470 *targetStart = target; … … 666 474 /* --------------------------------------------------------------------- */ 667 475 668 ConversionResult 669 ConvertUTF8toUTF32( const UTF8** sourceStart, 670 const UTF8* sourceEnd, 671 UTF32** targetStart, 672 UTF32* targetEnd, 673 ConversionFlags flags ) 674 { 476 ConversionResult ConvertUTF8toUTF32 ( 477 const UTF8** sourceStart, const UTF8* sourceEnd, 478 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 675 479 ConversionResult result = conversionOK; 676 const UTF8* source = *sourceStart; 677 UTF32* target = *targetStart; 678 679 while( source < sourceEnd ) 680 { 681 UTF32 ch = 0; 682 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 683 if( source + extraBytesToRead >= sourceEnd ) 684 { 685 result = sourceExhausted; break; 686 } 687 /* Do this check whether lenient or strict */ 688 if( !isLegalUTF8( source, extraBytesToRead + 1 ) ) 689 { 690 result = sourceIllegal; 691 break; 692 } 693 /* 694 * The cases all fall through. See "Note A" below. 695 */ 696 switch( extraBytesToRead ) 697 { 698 case 5: 699 ch += *source++; ch <<= 6; 700 701 case 4: 702 ch += *source++; ch <<= 6; 703 704 case 3: 705 ch += *source++; ch <<= 6; 706 707 case 2: 708 ch += *source++; ch <<= 6; 709 710 case 1: 711 ch += *source++; ch <<= 6; 712 713 case 0: 714 ch += *source++; 715 } 716 ch -= offsetsFromUTF8[extraBytesToRead]; 717 718 if( target >= targetEnd ) 719 { 720 source -= ( extraBytesToRead + 1 ); /* Back up the source pointer! 721 */ 722 result = targetExhausted; break; 723 } 724 if( ch <= UNI_MAX_LEGAL_UTF32 ) 725 { 726 /* 727 * UTF-16 surrogate values are illegal in UTF-32, and anything 728 * over Plane 17 (> 0x10FFFF) is illegal. 729 */ 730 if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) 731 { 732 if( flags == strictConversion ) 733 { 734 source -= ( extraBytesToRead + 1 ); /* return to the illegal 735 value itself */ 736 result = sourceIllegal; 737 break; 738 } 739 else 740 { 741 *target++ = UNI_REPLACEMENT_CHAR; 742 } 743 } 744 else 745 { 746 *target++ = ch; 747 } 748 } 749 else /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 750 { 751 result = sourceIllegal; 752 *target++ = UNI_REPLACEMENT_CHAR; 753 } 754 } 755 480 const UTF8* source = *sourceStart; 481 UTF32* target = *targetStart; 482 while (source < sourceEnd) { 483 UTF32 ch = 0; 484 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 485 if (source + extraBytesToRead >= sourceEnd) { 486 result = sourceExhausted; break; 487 } 488 /* Do this check whether lenient or strict */ 489 if (! isLegalUTF8(source, extraBytesToRead+1)) { 490 result = sourceIllegal; 491 break; 492 } 493 /* 494 * The cases all fall through. See "Note A" below. 495 */ 496 switch (extraBytesToRead) { 497 case 5: ch += *source++; ch <<= 6; 498 case 4: ch += *source++; ch <<= 6; 499 case 3: ch += *source++; ch <<= 6; 500 case 2: ch += *source++; ch <<= 6; 501 case 1: ch += *source++; ch <<= 6; 502 case 0: ch += *source++; 503 } 504 ch -= offsetsFromUTF8[extraBytesToRead]; 505 506 if (target >= targetEnd) { 507 source -= (extraBytesToRead+1); /* Back up the source pointer! */ 508 result = targetExhausted; break; 509 } 510 if (ch <= UNI_MAX_LEGAL_UTF32) { 511 /* 512 * UTF-16 surrogate values are illegal in UTF-32, and anything 513 * over Plane 17 (> 0x10FFFF) is illegal. 514 */ 515 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 516 if (flags == strictConversion) { 517 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 518 result = sourceIllegal; 519 break; 520 } else { 521 *target++ = UNI_REPLACEMENT_CHAR; 522 } 523 } else { 524 *target++ = ch; 525 } 526 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 527 result = sourceIllegal; 528 *target++ = UNI_REPLACEMENT_CHAR; 529 } 530 } 756 531 *sourceStart = source; 757 532 *targetStart = target; … … 765 540 temp variable, some decrements & conditionals. The switches 766 541 are equivalent to the following loop: 767 768 769 770 771 772 773 774 542 { 543 int tmpBytesToRead = extraBytesToRead+1; 544 do { 545 ch += *source++; 546 --tmpBytesToRead; 547 if (tmpBytesToRead) ch <<= 6; 548 } while (tmpBytesToRead > 0); 549 } 775 550 In UTF-8 writing code, the switches on "bytesToWrite" are 776 551 similarly unrolled loops. -
trunk/libtransmission/ConvertUTF.h
r7197 r7654 8 8 /* 9 9 * Copyright 2001-2004 Unicode, Inc. 10 * 10 * 11 11 * Disclaimer 12 * 12 * 13 13 * This source code is provided as is by Unicode, Inc. No claims are 14 14 * made as to fitness for any particular purpose. No warranties of any … … 18 18 * sole remedy for any claim will be exchange of defective media 19 19 * within 90 days of receipt. 20 * 20 * 21 21 * Limitations on Rights to Redistribute This Code 22 * 22 * 23 23 * Unicode, Inc. hereby grants the right to freely use the information 24 24 * supplied in this file in the creation of products supporting the … … 41 41 Each routine converts the text between *sourceStart and sourceEnd, 42 42 putting the result into the buffer between *targetStart and 43 targetEnd. Note: the end pointers are *after* the last item: e.g. 44 *(sourceEnd - 1) is the last item.43 targetEnd. Note: the end pointers are *after* the last item: e.g. 44 *(sourceEnd - 1) is the last item. 45 45 46 46 The return result indicates whether the conversion was successful, … … 53 53 54 54 Input parameters: 55 56 57 58 59 60 55 sourceStart - pointer to a pointer to the source buffer. 56 The contents of this are modified on return so that 57 it points at the next thing to be converted. 58 targetStart - similarly, pointer to pointer to the target buffer. 59 sourceEnd, targetEnd - respectively pointers to the ends of the 60 two buffers, for overflow checking only. 61 61 62 62 These conversion functions take a ConversionFlags argument. When this … … 75 75 76 76 Output parameters: 77 78 79 80 81 malformed sequence. 77 The value "sourceIllegal" is returned from some routines if the input 78 sequence is malformed. When "sourceIllegal" is returned, the source 79 value will point to the illegal value that caused the problem. E.g., 80 in UTF-8 when a sequence is malformed, it points to the start of the 81 malformed sequence. 82 82 83 83 Author: Mark E. Davis, 1994. 84 84 Rev History: Rick McGowan, fixes & updates May 2001. 85 85 Fixes & updates, Sept 2001. 86 86 87 87 ------------------------------------------------------------------------ */ 88 88 89 89 /* --------------------------------------------------------------------- … … 93 93 All should be unsigned values to avoid sign extension during 94 94 bit mask & shift operations. 95 95 ------------------------------------------------------------------------ */ 96 96 97 98 typedef unsigned int UTF32; /* at least 32 bits */ 99 typedef unsigned short UTF16; /* at least 16 bits */ 100 typedef unsigned char UTF8; /* typically 8 bits */ 101 typedef unsigned char Boolean; /* 0 or 1 */ 97 typedef unsigned long UTF32; /* at least 32 bits */ 98 typedef unsigned short UTF16; /* at least 16 bits */ 99 typedef unsigned char UTF8; /* typically 8 bits */ 100 typedef unsigned char Boolean; /* 0 or 1 */ 102 101 103 102 /* Some fundamental constants */ … … 108 107 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF 109 108 110 typedef enum 111 { 112 conversionOK, /* conversion successful */ 113 sourceExhausted, /* partial character in source, but hit end */ 114 targetExhausted, /* insuff. room in target for conversion */ 115 sourceIllegal /* source sequence is illegal/malformed */ 109 typedef enum { 110 conversionOK, /* conversion successful */ 111 sourceExhausted, /* partial character in source, but hit end */ 112 targetExhausted, /* insuff. room in target for conversion */ 113 sourceIllegal /* source sequence is illegal/malformed */ 116 114 } ConversionResult; 117 115 118 typedef enum 119 { 120 strictConversion = 0, 121 lenientConversion 116 typedef enum { 117 strictConversion = 0, 118 lenientConversion 122 119 } ConversionFlags; 123 120 … … 127 124 #endif 128 125 129 ConversionResult ConvertUTF8toUTF16( const UTF8** sourceStart, 130 const UTF8* sourceEnd, 131 UTF16** targetStart, 132 UTF16* targetEnd, 133 ConversionFlags flags ); 126 ConversionResult ConvertUTF8toUTF16 ( 127 const UTF8** sourceStart, const UTF8* sourceEnd, 128 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); 134 129 135 ConversionResult ConvertUTF16toUTF8( const UTF16** sourceStart, 136 const UTF16* sourceEnd, 137 UTF8** targetStart, 138 UTF8* targetEnd, 139 ConversionFlags flags ); 130 ConversionResult ConvertUTF16toUTF8 ( 131 const UTF16** sourceStart, const UTF16* sourceEnd, 132 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); 133 134 ConversionResult ConvertUTF8toUTF32 ( 135 const UTF8** sourceStart, const UTF8* sourceEnd, 136 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); 140 137 141 ConversionResult ConvertUTF8toUTF32( const UTF8** sourceStart, 142 const UTF8* sourceEnd, 143 UTF32** targetStart, 144 UTF32* targetEnd, 145 ConversionFlags flags ); 138 ConversionResult ConvertUTF32toUTF8 ( 139 const UTF32** sourceStart, const UTF32* sourceEnd, 140 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); 141 142 ConversionResult ConvertUTF16toUTF32 ( 143 const UTF16** sourceStart, const UTF16* sourceEnd, 144 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); 146 145 147 ConversionResult ConvertUTF32toUTF8( const UTF32** sourceStart, 148 const UTF32* sourceEnd, 149 UTF8** targetStart, 150 UTF8* targetEnd, 151 ConversionFlags flags ); 146 ConversionResult ConvertUTF32toUTF16 ( 147 const UTF32** sourceStart, const UTF32* sourceEnd, 148 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); 152 149 153 ConversionResult ConvertUTF16toUTF32( const UTF16** sourceStart, 154 const UTF16* sourceEnd, 155 UTF32** targetStart, 156 UTF32* targetEnd, 157 ConversionFlags flags ); 158 159 ConversionResult ConvertUTF32toUTF16( const UTF32** sourceStart, 160 const UTF32* sourceEnd, 161 UTF16** targetStart, 162 UTF16* targetEnd, 163 ConversionFlags flags ); 164 165 Boolean isLegalUTF8Sequence( const UTF8 *source, 166 const UTF8 *sourceEnd ); 150 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); 167 151 168 152 #ifdef __cplusplus
Note: See TracChangeset
for help on using the changeset viewer.