001 /**************************************************************** 002 * Licensed to the Apache Software Foundation (ASF) under one * 003 * or more contributor license agreements. See the NOTICE file * 004 * distributed with this work for additional information * 005 * regarding copyright ownership. The ASF licenses this file * 006 * to you under the Apache License, Version 2.0 (the * 007 * "License"); you may not use this file except in compliance * 008 * with the License. You may obtain a copy of the License at * 009 * * 010 * http://www.apache.org/licenses/LICENSE-2.0 * 011 * * 012 * Unless required by applicable law or agreed to in writing, * 013 * software distributed under the License is distributed on an * 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 015 * KIND, either express or implied. See the License for the * 016 * specific language governing permissions and limitations * 017 * under the License. * 018 ****************************************************************/ 019 020 package org.apache.james.mime4j.util; 021 022 import java.io.UnsupportedEncodingException; 023 import java.nio.charset.IllegalCharsetNameException; 024 import java.nio.charset.UnsupportedCharsetException; 025 import java.util.HashMap; 026 import java.util.Map; 027 import java.util.SortedSet; 028 import java.util.TreeSet; 029 030 import org.apache.commons.logging.Log; 031 import org.apache.commons.logging.LogFactory; 032 033 /** 034 * Utility class for working with character sets. It is somewhat similar to 035 * the Java 1.4 <code>java.nio.charset.Charset</code> class but knows many 036 * more aliases and is compatible with Java 1.3. It will use a simple detection 037 * mechanism to detect what character sets the current VM supports. This will 038 * be a sub-set of the character sets listed in the 039 * <a href="http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html"> 040 * Java 1.5 (J2SE5.0) Supported Encodings</a> document. 041 * <p> 042 * The <a href="http://www.iana.org/assignments/character-sets"> 043 * IANA Character Sets</a> document has been used to determine the preferred 044 * MIME character set names and to get a list of known aliases. 045 * <p> 046 * This is a complete list of the character sets known to this class: 047 * <table> 048 * <tr> 049 * <td>Canonical (Java) name</td> 050 * <td>MIME preferred</td> 051 * <td>Aliases</td> 052 * </tr> 053 * <tr> 054 * <td>ASCII</td> 055 * <td>US-ASCII</td> 056 * <td>ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983 </td> 057 * </tr> 058 * <tr> 059 * <td>Big5</td> 060 * <td>Big5</td> 061 * <td>csBig5 CN-Big5 BIG-FIVE BIGFIVE </td> 062 * </tr> 063 * <tr> 064 * <td>Big5_HKSCS</td> 065 * <td>Big5-HKSCS</td> 066 * <td>big5hkscs </td> 067 * </tr> 068 * <tr> 069 * <td>Big5_Solaris</td> 070 * <td>?</td> 071 * <td></td> 072 * </tr> 073 * <tr> 074 * <td>Cp037</td> 075 * <td>IBM037</td> 076 * <td>ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 </td> 077 * </tr> 078 * <tr> 079 * <td>Cp1006</td> 080 * <td>?</td> 081 * <td></td> 082 * </tr> 083 * <tr> 084 * <td>Cp1025</td> 085 * <td>?</td> 086 * <td></td> 087 * </tr> 088 * <tr> 089 * <td>Cp1026</td> 090 * <td>IBM1026</td> 091 * <td>csIBM1026 </td> 092 * </tr> 093 * <tr> 094 * <td>Cp1046</td> 095 * <td>?</td> 096 * <td></td> 097 * </tr> 098 * <tr> 099 * <td>Cp1047</td> 100 * <td>IBM1047</td> 101 * <td>IBM-1047 </td> 102 * </tr> 103 * <tr> 104 * <td>Cp1097</td> 105 * <td>?</td> 106 * <td></td> 107 * </tr> 108 * <tr> 109 * <td>Cp1098</td> 110 * <td>?</td> 111 * <td></td> 112 * </tr> 113 * <tr> 114 * <td>Cp1112</td> 115 * <td>?</td> 116 * <td></td> 117 * </tr> 118 * <tr> 119 * <td>Cp1122</td> 120 * <td>?</td> 121 * <td></td> 122 * </tr> 123 * <tr> 124 * <td>Cp1123</td> 125 * <td>?</td> 126 * <td></td> 127 * </tr> 128 * <tr> 129 * <td>Cp1124</td> 130 * <td>?</td> 131 * <td></td> 132 * </tr> 133 * <tr> 134 * <td>Cp1140</td> 135 * <td>IBM01140</td> 136 * <td>CCSID01140 CP01140 ebcdic-us-37+euro </td> 137 * </tr> 138 * <tr> 139 * <td>Cp1141</td> 140 * <td>IBM01141</td> 141 * <td>CCSID01141 CP01141 ebcdic-de-273+euro </td> 142 * </tr> 143 * <tr> 144 * <td>Cp1142</td> 145 * <td>IBM01142</td> 146 * <td>CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro </td> 147 * </tr> 148 * <tr> 149 * <td>Cp1143</td> 150 * <td>IBM01143</td> 151 * <td>CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro </td> 152 * </tr> 153 * <tr> 154 * <td>Cp1144</td> 155 * <td>IBM01144</td> 156 * <td>CCSID01144 CP01144 ebcdic-it-280+euro </td> 157 * </tr> 158 * <tr> 159 * <td>Cp1145</td> 160 * <td>IBM01145</td> 161 * <td>CCSID01145 CP01145 ebcdic-es-284+euro </td> 162 * </tr> 163 * <tr> 164 * <td>Cp1146</td> 165 * <td>IBM01146</td> 166 * <td>CCSID01146 CP01146 ebcdic-gb-285+euro </td> 167 * </tr> 168 * <tr> 169 * <td>Cp1147</td> 170 * <td>IBM01147</td> 171 * <td>CCSID01147 CP01147 ebcdic-fr-297+euro </td> 172 * </tr> 173 * <tr> 174 * <td>Cp1148</td> 175 * <td>IBM01148</td> 176 * <td>CCSID01148 CP01148 ebcdic-international-500+euro </td> 177 * </tr> 178 * <tr> 179 * <td>Cp1149</td> 180 * <td>IBM01149</td> 181 * <td>CCSID01149 CP01149 ebcdic-is-871+euro </td> 182 * </tr> 183 * <tr> 184 * <td>Cp1250</td> 185 * <td>windows-1250</td> 186 * <td></td> 187 * </tr> 188 * <tr> 189 * <td>Cp1251</td> 190 * <td>windows-1251</td> 191 * <td></td> 192 * </tr> 193 * <tr> 194 * <td>Cp1252</td> 195 * <td>windows-1252</td> 196 * <td></td> 197 * </tr> 198 * <tr> 199 * <td>Cp1253</td> 200 * <td>windows-1253</td> 201 * <td></td> 202 * </tr> 203 * <tr> 204 * <td>Cp1254</td> 205 * <td>windows-1254</td> 206 * <td></td> 207 * </tr> 208 * <tr> 209 * <td>Cp1255</td> 210 * <td>windows-1255</td> 211 * <td></td> 212 * </tr> 213 * <tr> 214 * <td>Cp1256</td> 215 * <td>windows-1256</td> 216 * <td></td> 217 * </tr> 218 * <tr> 219 * <td>Cp1257</td> 220 * <td>windows-1257</td> 221 * <td></td> 222 * </tr> 223 * <tr> 224 * <td>Cp1258</td> 225 * <td>windows-1258</td> 226 * <td></td> 227 * </tr> 228 * <tr> 229 * <td>Cp1381</td> 230 * <td>?</td> 231 * <td></td> 232 * </tr> 233 * <tr> 234 * <td>Cp1383</td> 235 * <td>?</td> 236 * <td></td> 237 * </tr> 238 * <tr> 239 * <td>Cp273</td> 240 * <td>IBM273</td> 241 * <td>csIBM273 </td> 242 * </tr> 243 * <tr> 244 * <td>Cp277</td> 245 * <td>IBM277</td> 246 * <td>EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 </td> 247 * </tr> 248 * <tr> 249 * <td>Cp278</td> 250 * <td>IBM278</td> 251 * <td>CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 </td> 252 * </tr> 253 * <tr> 254 * <td>Cp280</td> 255 * <td>IBM280</td> 256 * <td>ebcdic-cp-it csIBM280 </td> 257 * </tr> 258 * <tr> 259 * <td>Cp284</td> 260 * <td>IBM284</td> 261 * <td>ebcdic-cp-es csIBM284 </td> 262 * </tr> 263 * <tr> 264 * <td>Cp285</td> 265 * <td>IBM285</td> 266 * <td>ebcdic-cp-gb csIBM285 </td> 267 * </tr> 268 * <tr> 269 * <td>Cp297</td> 270 * <td>IBM297</td> 271 * <td>ebcdic-cp-fr csIBM297 </td> 272 * </tr> 273 * <tr> 274 * <td>Cp33722</td> 275 * <td>?</td> 276 * <td></td> 277 * </tr> 278 * <tr> 279 * <td>Cp420</td> 280 * <td>IBM420</td> 281 * <td>ebcdic-cp-ar1 csIBM420 </td> 282 * </tr> 283 * <tr> 284 * <td>Cp424</td> 285 * <td>IBM424</td> 286 * <td>ebcdic-cp-he csIBM424 </td> 287 * </tr> 288 * <tr> 289 * <td>Cp437</td> 290 * <td>IBM437</td> 291 * <td>437 csPC8CodePage437 </td> 292 * </tr> 293 * <tr> 294 * <td>Cp500</td> 295 * <td>IBM500</td> 296 * <td>ebcdic-cp-be ebcdic-cp-ch csIBM500 </td> 297 * </tr> 298 * <tr> 299 * <td>Cp737</td> 300 * <td>?</td> 301 * <td></td> 302 * </tr> 303 * <tr> 304 * <td>Cp775</td> 305 * <td>IBM775</td> 306 * <td>csPC775Baltic </td> 307 * </tr> 308 * <tr> 309 * <td>Cp838</td> 310 * <td>IBM-Thai</td> 311 * <td></td> 312 * </tr> 313 * <tr> 314 * <td>Cp850</td> 315 * <td>IBM850</td> 316 * <td>850 csPC850Multilingual </td> 317 * </tr> 318 * <tr> 319 * <td>Cp852</td> 320 * <td>IBM852</td> 321 * <td>852 csPCp852 </td> 322 * </tr> 323 * <tr> 324 * <td>Cp855</td> 325 * <td>IBM855</td> 326 * <td>855 csIBM855 </td> 327 * </tr> 328 * <tr> 329 * <td>Cp856</td> 330 * <td>?</td> 331 * <td></td> 332 * </tr> 333 * <tr> 334 * <td>Cp857</td> 335 * <td>IBM857</td> 336 * <td>857 csIBM857 </td> 337 * </tr> 338 * <tr> 339 * <td>Cp858</td> 340 * <td>IBM00858</td> 341 * <td>CCSID00858 CP00858 PC-Multilingual-850+euro </td> 342 * </tr> 343 * <tr> 344 * <td>Cp860</td> 345 * <td>IBM860</td> 346 * <td>860 csIBM860 </td> 347 * </tr> 348 * <tr> 349 * <td>Cp861</td> 350 * <td>IBM861</td> 351 * <td>861 cp-is csIBM861 </td> 352 * </tr> 353 * <tr> 354 * <td>Cp862</td> 355 * <td>IBM862</td> 356 * <td>862 csPC862LatinHebrew </td> 357 * </tr> 358 * <tr> 359 * <td>Cp863</td> 360 * <td>IBM863</td> 361 * <td>863 csIBM863 </td> 362 * </tr> 363 * <tr> 364 * <td>Cp864</td> 365 * <td>IBM864</td> 366 * <td>cp864 csIBM864 </td> 367 * </tr> 368 * <tr> 369 * <td>Cp865</td> 370 * <td>IBM865</td> 371 * <td>865 csIBM865 </td> 372 * </tr> 373 * <tr> 374 * <td>Cp866</td> 375 * <td>IBM866</td> 376 * <td>866 csIBM866 </td> 377 * </tr> 378 * <tr> 379 * <td>Cp868</td> 380 * <td>IBM868</td> 381 * <td>cp-ar csIBM868 </td> 382 * </tr> 383 * <tr> 384 * <td>Cp869</td> 385 * <td>IBM869</td> 386 * <td>cp-gr csIBM869 </td> 387 * </tr> 388 * <tr> 389 * <td>Cp870</td> 390 * <td>IBM870</td> 391 * <td>ebcdic-cp-roece ebcdic-cp-yu csIBM870 </td> 392 * </tr> 393 * <tr> 394 * <td>Cp871</td> 395 * <td>IBM871</td> 396 * <td>ebcdic-cp-is csIBM871 </td> 397 * </tr> 398 * <tr> 399 * <td>Cp875</td> 400 * <td>?</td> 401 * <td></td> 402 * </tr> 403 * <tr> 404 * <td>Cp918</td> 405 * <td>IBM918</td> 406 * <td>ebcdic-cp-ar2 csIBM918 </td> 407 * </tr> 408 * <tr> 409 * <td>Cp921</td> 410 * <td>?</td> 411 * <td></td> 412 * </tr> 413 * <tr> 414 * <td>Cp922</td> 415 * <td>?</td> 416 * <td></td> 417 * </tr> 418 * <tr> 419 * <td>Cp930</td> 420 * <td>?</td> 421 * <td></td> 422 * </tr> 423 * <tr> 424 * <td>Cp933</td> 425 * <td>?</td> 426 * <td></td> 427 * </tr> 428 * <tr> 429 * <td>Cp935</td> 430 * <td>?</td> 431 * <td></td> 432 * </tr> 433 * <tr> 434 * <td>Cp937</td> 435 * <td>?</td> 436 * <td></td> 437 * </tr> 438 * <tr> 439 * <td>Cp939</td> 440 * <td>?</td> 441 * <td></td> 442 * </tr> 443 * <tr> 444 * <td>Cp942</td> 445 * <td>?</td> 446 * <td></td> 447 * </tr> 448 * <tr> 449 * <td>Cp942C</td> 450 * <td>?</td> 451 * <td></td> 452 * </tr> 453 * <tr> 454 * <td>Cp943</td> 455 * <td>?</td> 456 * <td></td> 457 * </tr> 458 * <tr> 459 * <td>Cp943C</td> 460 * <td>?</td> 461 * <td></td> 462 * </tr> 463 * <tr> 464 * <td>Cp948</td> 465 * <td>?</td> 466 * <td></td> 467 * </tr> 468 * <tr> 469 * <td>Cp949</td> 470 * <td>?</td> 471 * <td></td> 472 * </tr> 473 * <tr> 474 * <td>Cp949C</td> 475 * <td>?</td> 476 * <td></td> 477 * </tr> 478 * <tr> 479 * <td>Cp950</td> 480 * <td>?</td> 481 * <td></td> 482 * </tr> 483 * <tr> 484 * <td>Cp964</td> 485 * <td>?</td> 486 * <td></td> 487 * </tr> 488 * <tr> 489 * <td>Cp970</td> 490 * <td>?</td> 491 * <td></td> 492 * </tr> 493 * <tr> 494 * <td>EUC_CN</td> 495 * <td>GB2312</td> 496 * <td>x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165 </td> 497 * </tr> 498 * <tr> 499 * <td>EUC_JP</td> 500 * <td>EUC-JP</td> 501 * <td>csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp </td> 502 * </tr> 503 * <tr> 504 * <td>EUC_JP_LINUX</td> 505 * <td>?</td> 506 * <td></td> 507 * </tr> 508 * <tr> 509 * <td>EUC_JP_Solaris</td> 510 * <td>?</td> 511 * <td></td> 512 * </tr> 513 * <tr> 514 * <td>EUC_KR</td> 515 * <td>EUC-KR</td> 516 * <td>csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr </td> 517 * </tr> 518 * <tr> 519 * <td>EUC_TW</td> 520 * <td>EUC-TW</td> 521 * <td>x-EUC-TW cns11643 euctw </td> 522 * </tr> 523 * <tr> 524 * <td>GB18030</td> 525 * <td>GB18030</td> 526 * <td>gb18030-2000 </td> 527 * </tr> 528 * <tr> 529 * <td>GBK</td> 530 * <td>windows-936</td> 531 * <td>CP936 MS936 ms_936 x-mswin-936 </td> 532 * </tr> 533 * <tr> 534 * <td>ISCII91</td> 535 * <td>?</td> 536 * <td>x-ISCII91 iscii </td> 537 * </tr> 538 * <tr> 539 * <td>ISO2022CN</td> 540 * <td>ISO-2022-CN</td> 541 * <td></td> 542 * </tr> 543 * <tr> 544 * <td>ISO2022JP</td> 545 * <td>ISO-2022-JP</td> 546 * <td>csISO2022JP JIS jis_encoding csjisencoding </td> 547 * </tr> 548 * <tr> 549 * <td>ISO2022KR</td> 550 * <td>ISO-2022-KR</td> 551 * <td>csISO2022KR </td> 552 * </tr> 553 * <tr> 554 * <td>ISO2022_CN_CNS</td> 555 * <td>?</td> 556 * <td></td> 557 * </tr> 558 * <tr> 559 * <td>ISO2022_CN_GB</td> 560 * <td>?</td> 561 * <td></td> 562 * </tr> 563 * <tr> 564 * <td>ISO8859_1</td> 565 * <td>ISO-8859-1</td> 566 * <td>ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1 </td> 567 * </tr> 568 * <tr> 569 * <td>ISO8859_13</td> 570 * <td>ISO-8859-13</td> 571 * <td></td> 572 * </tr> 573 * <tr> 574 * <td>ISO8859_15</td> 575 * <td>ISO-8859-15</td> 576 * <td>ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS </td> 577 * </tr> 578 * <tr> 579 * <td>ISO8859_2</td> 580 * <td>ISO-8859-2</td> 581 * <td>ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2 </td> 582 * </tr> 583 * <tr> 584 * <td>ISO8859_3</td> 585 * <td>ISO-8859-3</td> 586 * <td>ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3 </td> 587 * </tr> 588 * <tr> 589 * <td>ISO8859_4</td> 590 * <td>ISO-8859-4</td> 591 * <td>ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4 </td> 592 * </tr> 593 * <tr> 594 * <td>ISO8859_5</td> 595 * <td>ISO-8859-5</td> 596 * <td>ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5 </td> 597 * </tr> 598 * <tr> 599 * <td>ISO8859_6</td> 600 * <td>ISO-8859-6</td> 601 * <td>ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6 </td> 602 * </tr> 603 * <tr> 604 * <td>ISO8859_7</td> 605 * <td>ISO-8859-7</td> 606 * <td>ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek </td> 607 * </tr> 608 * <tr> 609 * <td>ISO8859_8</td> 610 * <td>ISO-8859-8</td> 611 * <td>ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8 </td> 612 * </tr> 613 * <tr> 614 * <td>ISO8859_9</td> 615 * <td>ISO-8859-9</td> 616 * <td>ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9 </td> 617 * </tr> 618 * <tr> 619 * <td>JISAutoDetect</td> 620 * <td>?</td> 621 * <td></td> 622 * </tr> 623 * <tr> 624 * <td>JIS_C6626-1983</td> 625 * <td>JIS_C6626-1983</td> 626 * <td>x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87 </td> 627 * </tr> 628 * <tr> 629 * <td>JIS_X0201</td> 630 * <td>JIS_X0201</td> 631 * <td>X0201 JIS0201 csHalfWidthKatakana </td> 632 * </tr> 633 * <tr> 634 * <td>JIS_X0212-1990</td> 635 * <td>JIS_X0212-1990</td> 636 * <td>iso-ir-159 x0212 JIS0212 csISO159JISX02121990 </td> 637 * </tr> 638 * <tr> 639 * <td>KOI8_R</td> 640 * <td>KOI8-R</td> 641 * <td>csKOI8R koi8 </td> 642 * </tr> 643 * <tr> 644 * <td>MS874</td> 645 * <td>windows-874</td> 646 * <td>cp874 </td> 647 * </tr> 648 * <tr> 649 * <td>MS932</td> 650 * <td>Windows-31J</td> 651 * <td>windows-932 csWindows31J x-ms-cp932 </td> 652 * </tr> 653 * <tr> 654 * <td>MS949</td> 655 * <td>windows-949</td> 656 * <td>windows949 ms_949 x-windows-949 </td> 657 * </tr> 658 * <tr> 659 * <td>MS950</td> 660 * <td>windows-950</td> 661 * <td>x-windows-950 </td> 662 * </tr> 663 * <tr> 664 * <td>MS950_HKSCS</td> 665 * <td></td> 666 * <td></td> 667 * </tr> 668 * <tr> 669 * <td>MacArabic</td> 670 * <td>?</td> 671 * <td></td> 672 * </tr> 673 * <tr> 674 * <td>MacCentralEurope</td> 675 * <td>?</td> 676 * <td></td> 677 * </tr> 678 * <tr> 679 * <td>MacCroatian</td> 680 * <td>?</td> 681 * <td></td> 682 * </tr> 683 * <tr> 684 * <td>MacCyrillic</td> 685 * <td>?</td> 686 * <td></td> 687 * </tr> 688 * <tr> 689 * <td>MacDingbat</td> 690 * <td>?</td> 691 * <td></td> 692 * </tr> 693 * <tr> 694 * <td>MacGreek</td> 695 * <td>MacGreek</td> 696 * <td></td> 697 * </tr> 698 * <tr> 699 * <td>MacHebrew</td> 700 * <td>?</td> 701 * <td></td> 702 * </tr> 703 * <tr> 704 * <td>MacIceland</td> 705 * <td>?</td> 706 * <td></td> 707 * </tr> 708 * <tr> 709 * <td>MacRoman</td> 710 * <td>MacRoman</td> 711 * <td>Macintosh MAC csMacintosh </td> 712 * </tr> 713 * <tr> 714 * <td>MacRomania</td> 715 * <td>?</td> 716 * <td></td> 717 * </tr> 718 * <tr> 719 * <td>MacSymbol</td> 720 * <td>?</td> 721 * <td></td> 722 * </tr> 723 * <tr> 724 * <td>MacThai</td> 725 * <td>?</td> 726 * <td></td> 727 * </tr> 728 * <tr> 729 * <td>MacTurkish</td> 730 * <td>?</td> 731 * <td></td> 732 * </tr> 733 * <tr> 734 * <td>MacUkraine</td> 735 * <td>?</td> 736 * <td></td> 737 * </tr> 738 * <tr> 739 * <td>SJIS</td> 740 * <td>Shift_JIS</td> 741 * <td>MS_Kanji csShiftJIS shift-jis x-sjis pck </td> 742 * </tr> 743 * <tr> 744 * <td>TIS620</td> 745 * <td>TIS-620</td> 746 * <td></td> 747 * </tr> 748 * <tr> 749 * <td>UTF-16</td> 750 * <td>UTF-16</td> 751 * <td>UTF_16 </td> 752 * </tr> 753 * <tr> 754 * <td>UTF8</td> 755 * <td>UTF-8</td> 756 * <td></td> 757 * </tr> 758 * <tr> 759 * <td>UnicodeBig</td> 760 * <td>?</td> 761 * <td></td> 762 * </tr> 763 * <tr> 764 * <td>UnicodeBigUnmarked</td> 765 * <td>UTF-16BE</td> 766 * <td>X-UTF-16BE UTF_16BE ISO-10646-UCS-2 </td> 767 * </tr> 768 * <tr> 769 * <td>UnicodeLittle</td> 770 * <td>?</td> 771 * <td></td> 772 * </tr> 773 * <tr> 774 * <td>UnicodeLittleUnmarked</td> 775 * <td>UTF-16LE</td> 776 * <td>UTF_16LE X-UTF-16LE </td> 777 * </tr> 778 * <tr> 779 * <td>x-Johab</td> 780 * <td>johab</td> 781 * <td>johab cp1361 ms1361 ksc5601-1992 ksc5601_1992 </td> 782 * </tr> 783 * <tr> 784 * <td>x-iso-8859-11</td> 785 * <td>?</td> 786 * <td></td> 787 * </tr> 788 * </table> 789 */ 790 public class CharsetUtil { 791 private static Log log = LogFactory.getLog(CharsetUtil.class); 792 793 private static class Charset implements Comparable<Charset> { 794 private String canonical = null; 795 private String mime = null; 796 private String[] aliases = null; 797 798 private Charset(String canonical, String mime, String[] aliases) { 799 this.canonical = canonical; 800 this.mime = mime; 801 this.aliases = aliases; 802 } 803 804 public int compareTo(Charset c) { 805 return this.canonical.compareTo(c.canonical); 806 } 807 } 808 809 private static Charset[] JAVA_CHARSETS = { 810 new Charset("ISO8859_1", "ISO-8859-1", 811 new String[] {"ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", 812 "latin1", "l1", "IBM819", "CP819", 813 "csISOLatin1", "8859_1", "819", "IBM-819", 814 "ISO8859-1", "ISO_8859_1"}), 815 new Charset("ISO8859_2", "ISO-8859-2", 816 new String[] {"ISO_8859-2:1987", "iso-ir-101", "ISO_8859-2", 817 "latin2", "l2", "csISOLatin2", "8859_2", 818 "iso8859_2"}), 819 new Charset("ISO8859_3", "ISO-8859-3", new String[] {"ISO_8859-3:1988", "iso-ir-109", "ISO_8859-3", "latin3", "l3", "csISOLatin3", "8859_3"}), 820 new Charset("ISO8859_4", "ISO-8859-4", 821 new String[] {"ISO_8859-4:1988", "iso-ir-110", "ISO_8859-4", 822 "latin4", "l4", "csISOLatin4", "8859_4"}), 823 new Charset("ISO8859_5", "ISO-8859-5", 824 new String[] {"ISO_8859-5:1988", "iso-ir-144", "ISO_8859-5", 825 "cyrillic", "csISOLatinCyrillic", "8859_5"}), 826 new Charset("ISO8859_6", "ISO-8859-6", new String[] {"ISO_8859-6:1987", "iso-ir-127", "ISO_8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic", "8859_6"}), 827 new Charset("ISO8859_7", "ISO-8859-7", 828 new String[] {"ISO_8859-7:1987", "iso-ir-126", "ISO_8859-7", 829 "ELOT_928", "ECMA-118", "greek", "greek8", 830 "csISOLatinGreek", "8859_7", "sun_eu_greek"}), 831 new Charset("ISO8859_8", "ISO-8859-8", new String[] {"ISO_8859-8:1988", "iso-ir-138", "ISO_8859-8", "hebrew", "csISOLatinHebrew", "8859_8"}), 832 new Charset("ISO8859_9", "ISO-8859-9", 833 new String[] {"ISO_8859-9:1989", "iso-ir-148", "ISO_8859-9", 834 "latin5", "l5", "csISOLatin5", "8859_9"}), 835 836 new Charset("ISO8859_13", "ISO-8859-13", new String[] {}), 837 new Charset("ISO8859_15", "ISO-8859-15", 838 new String[] {"ISO_8859-15", "Latin-9", "8859_15", 839 "csISOlatin9", "IBM923", "cp923", "923", "L9", 840 "IBM-923", "ISO8859-15", "LATIN9", "LATIN0", 841 "csISOlatin0", "ISO8859_15_FDIS"}), 842 new Charset("KOI8_R", "KOI8-R", new String[] {"csKOI8R", "koi8"}), 843 new Charset("ASCII", "US-ASCII", 844 new String[] {"ANSI_X3.4-1968", "iso-ir-6", 845 "ANSI_X3.4-1986", "ISO_646.irv:1991", 846 "ISO646-US", "us", "IBM367", "cp367", 847 "csASCII", "ascii7", "646", "iso_646.irv:1983"}), 848 new Charset("UTF8", "UTF-8", new String[] {}), 849 new Charset("UTF-16", "UTF-16", new String[] {"UTF_16"}), 850 new Charset("UnicodeBigUnmarked", "UTF-16BE", new String[] {"X-UTF-16BE", "UTF_16BE", "ISO-10646-UCS-2"}), 851 new Charset("UnicodeLittleUnmarked", "UTF-16LE", new String[] {"UTF_16LE", "X-UTF-16LE"}), 852 new Charset("Big5", "Big5", new String[] {"csBig5", "CN-Big5", "BIG-FIVE", "BIGFIVE"}), 853 new Charset("Big5_HKSCS", "Big5-HKSCS", new String[] {"big5hkscs"}), 854 new Charset("EUC_JP", "EUC-JP", 855 new String[] {"csEUCPkdFmtJapanese", 856 "Extended_UNIX_Code_Packed_Format_for_Japanese", 857 "eucjis", "x-eucjp", "eucjp", "x-euc-jp"}), 858 new Charset("EUC_KR", "EUC-KR", 859 new String[] {"csEUCKR", "ksc5601", "5601", "ksc5601_1987", 860 "ksc_5601", "ksc5601-1987", "ks_c_5601-1987", 861 "euckr"}), 862 new Charset("GB18030", "GB18030", new String[] {"gb18030-2000"}), 863 new Charset("EUC_CN", "GB2312", new String[] {"x-EUC-CN", "csGB2312", "euccn", "euc-cn", "gb2312-80", "gb2312-1980", "CN-GB", "CN-GB-ISOIR165"}), 864 new Charset("GBK", "windows-936", new String[] {"CP936", "MS936", "ms_936", "x-mswin-936"}), 865 866 new Charset("Cp037", "IBM037", new String[] {"ebcdic-cp-us", "ebcdic-cp-ca", "ebcdic-cp-wt", "ebcdic-cp-nl", "csIBM037"}), 867 new Charset("Cp273", "IBM273", new String[] {"csIBM273"}), 868 new Charset("Cp277", "IBM277", new String[] {"EBCDIC-CP-DK", "EBCDIC-CP-NO", "csIBM277"}), 869 new Charset("Cp278", "IBM278", new String[] {"CP278", "ebcdic-cp-fi", "ebcdic-cp-se", "csIBM278"}), 870 new Charset("Cp280", "IBM280", new String[] {"ebcdic-cp-it", "csIBM280"}), 871 new Charset("Cp284", "IBM284", new String[] {"ebcdic-cp-es", "csIBM284"}), 872 new Charset("Cp285", "IBM285", new String[] {"ebcdic-cp-gb", "csIBM285"}), 873 new Charset("Cp297", "IBM297", new String[] {"ebcdic-cp-fr", "csIBM297"}), 874 new Charset("Cp420", "IBM420", new String[] {"ebcdic-cp-ar1", "csIBM420"}), 875 new Charset("Cp424", "IBM424", new String[] {"ebcdic-cp-he", "csIBM424"}), 876 new Charset("Cp437", "IBM437", new String[] {"437", "csPC8CodePage437"}), 877 new Charset("Cp500", "IBM500", new String[] {"ebcdic-cp-be", "ebcdic-cp-ch", "csIBM500"}), 878 new Charset("Cp775", "IBM775", new String[] {"csPC775Baltic"}), 879 new Charset("Cp838", "IBM-Thai", new String[] {}), 880 new Charset("Cp850", "IBM850", new String[] {"850", "csPC850Multilingual"}), 881 new Charset("Cp852", "IBM852", new String[] {"852", "csPCp852"}), 882 new Charset("Cp855", "IBM855", new String[] {"855", "csIBM855"}), 883 new Charset("Cp857", "IBM857", new String[] {"857", "csIBM857"}), 884 new Charset("Cp858", "IBM00858", 885 new String[] {"CCSID00858", "CP00858", 886 "PC-Multilingual-850+euro"}), 887 new Charset("Cp860", "IBM860", new String[] {"860", "csIBM860"}), 888 new Charset("Cp861", "IBM861", new String[] {"861", "cp-is", "csIBM861"}), 889 new Charset("Cp862", "IBM862", new String[] {"862", "csPC862LatinHebrew"}), 890 new Charset("Cp863", "IBM863", new String[] {"863", "csIBM863"}), 891 new Charset("Cp864", "IBM864", new String[] {"cp864", "csIBM864"}), 892 new Charset("Cp865", "IBM865", new String[] {"865", "csIBM865"}), 893 new Charset("Cp866", "IBM866", new String[] {"866", "csIBM866"}), 894 new Charset("Cp868", "IBM868", new String[] {"cp-ar", "csIBM868"}), 895 new Charset("Cp869", "IBM869", new String[] {"cp-gr", "csIBM869"}), 896 new Charset("Cp870", "IBM870", new String[] {"ebcdic-cp-roece", "ebcdic-cp-yu", "csIBM870"}), 897 new Charset("Cp871", "IBM871", new String[] {"ebcdic-cp-is", "csIBM871"}), 898 new Charset("Cp918", "IBM918", new String[] {"ebcdic-cp-ar2", "csIBM918"}), 899 new Charset("Cp1026", "IBM1026", new String[] {"csIBM1026"}), 900 new Charset("Cp1047", "IBM1047", new String[] {"IBM-1047"}), 901 new Charset("Cp1140", "IBM01140", 902 new String[] {"CCSID01140", "CP01140", 903 "ebcdic-us-37+euro"}), 904 new Charset("Cp1141", "IBM01141", 905 new String[] {"CCSID01141", "CP01141", 906 "ebcdic-de-273+euro"}), 907 new Charset("Cp1142", "IBM01142", new String[] {"CCSID01142", "CP01142", "ebcdic-dk-277+euro", "ebcdic-no-277+euro"}), 908 new Charset("Cp1143", "IBM01143", new String[] {"CCSID01143", "CP01143", "ebcdic-fi-278+euro", "ebcdic-se-278+euro"}), 909 new Charset("Cp1144", "IBM01144", new String[] {"CCSID01144", "CP01144", "ebcdic-it-280+euro"}), 910 new Charset("Cp1145", "IBM01145", new String[] {"CCSID01145", "CP01145", "ebcdic-es-284+euro"}), 911 new Charset("Cp1146", "IBM01146", new String[] {"CCSID01146", "CP01146", "ebcdic-gb-285+euro"}), 912 new Charset("Cp1147", "IBM01147", new String[] {"CCSID01147", "CP01147", "ebcdic-fr-297+euro"}), 913 new Charset("Cp1148", "IBM01148", new String[] {"CCSID01148", "CP01148", "ebcdic-international-500+euro"}), 914 new Charset("Cp1149", "IBM01149", new String[] {"CCSID01149", "CP01149", "ebcdic-is-871+euro"}), 915 new Charset("Cp1250", "windows-1250", new String[] {}), 916 new Charset("Cp1251", "windows-1251", new String[] {}), 917 new Charset("Cp1252", "windows-1252", new String[] {}), 918 new Charset("Cp1253", "windows-1253", new String[] {}), 919 new Charset("Cp1254", "windows-1254", new String[] {}), 920 new Charset("Cp1255", "windows-1255", new String[] {}), 921 new Charset("Cp1256", "windows-1256", new String[] {}), 922 new Charset("Cp1257", "windows-1257", new String[] {}), 923 new Charset("Cp1258", "windows-1258", new String[] {}), 924 new Charset("ISO2022CN", "ISO-2022-CN", new String[] {}), 925 new Charset("ISO2022JP", "ISO-2022-JP", new String[] {"csISO2022JP", "JIS", "jis_encoding", "csjisencoding"}), 926 new Charset("ISO2022KR", "ISO-2022-KR", new String[] {"csISO2022KR"}), 927 new Charset("JIS_X0201", "JIS_X0201", new String[] {"X0201", "JIS0201", "csHalfWidthKatakana"}), 928 new Charset("JIS_X0212-1990", "JIS_X0212-1990", new String[] {"iso-ir-159", "x0212", "JIS0212", "csISO159JISX02121990"}), 929 new Charset("JIS_C6626-1983", "JIS_C6626-1983", new String[] {"x-JIS0208", "JIS0208", "csISO87JISX0208", "x0208", "JIS_X0208-1983", "iso-ir-87"}), 930 new Charset("SJIS", "Shift_JIS", new String[] {"MS_Kanji", "csShiftJIS", "shift-jis", "x-sjis", "pck"}), 931 new Charset("TIS620", "TIS-620", new String[] {}), 932 new Charset("MS932", "Windows-31J", new String[] {"windows-932", "csWindows31J", "x-ms-cp932"}), 933 new Charset("EUC_TW", "EUC-TW", new String[] {"x-EUC-TW", "cns11643", "euctw"}), 934 new Charset("x-Johab", "johab", new String[] {"johab", "cp1361", "ms1361", "ksc5601-1992", "ksc5601_1992"}), 935 new Charset("MS950_HKSCS", "", new String[] {}), 936 new Charset("MS874", "windows-874", new String[] {"cp874"}), 937 new Charset("MS949", "windows-949", new String[] {"windows949", "ms_949", "x-windows-949"}), 938 new Charset("MS950", "windows-950", new String[] {"x-windows-950"}), 939 940 new Charset("Cp737", null, new String[] {}), 941 new Charset("Cp856", null, new String[] {}), 942 new Charset("Cp875", null, new String[] {}), 943 new Charset("Cp921", null, new String[] {}), 944 new Charset("Cp922", null, new String[] {}), 945 new Charset("Cp930", null, new String[] {}), 946 new Charset("Cp933", null, new String[] {}), 947 new Charset("Cp935", null, new String[] {}), 948 new Charset("Cp937", null, new String[] {}), 949 new Charset("Cp939", null, new String[] {}), 950 new Charset("Cp942", null, new String[] {}), 951 new Charset("Cp942C", null, new String[] {}), 952 new Charset("Cp943", null, new String[] {}), 953 new Charset("Cp943C", null, new String[] {}), 954 new Charset("Cp948", null, new String[] {}), 955 new Charset("Cp949", null, new String[] {}), 956 new Charset("Cp949C", null, new String[] {}), 957 new Charset("Cp950", null, new String[] {}), 958 new Charset("Cp964", null, new String[] {}), 959 new Charset("Cp970", null, new String[] {}), 960 new Charset("Cp1006", null, new String[] {}), 961 new Charset("Cp1025", null, new String[] {}), 962 new Charset("Cp1046", null, new String[] {}), 963 new Charset("Cp1097", null, new String[] {}), 964 new Charset("Cp1098", null, new String[] {}), 965 new Charset("Cp1112", null, new String[] {}), 966 new Charset("Cp1122", null, new String[] {}), 967 new Charset("Cp1123", null, new String[] {}), 968 new Charset("Cp1124", null, new String[] {}), 969 new Charset("Cp1381", null, new String[] {}), 970 new Charset("Cp1383", null, new String[] {}), 971 new Charset("Cp33722", null, new String[] {}), 972 new Charset("Big5_Solaris", null, new String[] {}), 973 new Charset("EUC_JP_LINUX", null, new String[] {}), 974 new Charset("EUC_JP_Solaris", null, new String[] {}), 975 new Charset("ISCII91", null, new String[] {"x-ISCII91", "iscii"}), 976 new Charset("ISO2022_CN_CNS", null, new String[] {}), 977 new Charset("ISO2022_CN_GB", null, new String[] {}), 978 new Charset("x-iso-8859-11", null, new String[] {}), 979 new Charset("JISAutoDetect", null, new String[] {}), 980 new Charset("MacArabic", null, new String[] {}), 981 new Charset("MacCentralEurope", null, new String[] {}), 982 new Charset("MacCroatian", null, new String[] {}), 983 new Charset("MacCyrillic", null, new String[] {}), 984 new Charset("MacDingbat", null, new String[] {}), 985 new Charset("MacGreek", "MacGreek", new String[] {}), 986 new Charset("MacHebrew", null, new String[] {}), 987 new Charset("MacIceland", null, new String[] {}), 988 new Charset("MacRoman", "MacRoman", new String[] {"Macintosh", "MAC", "csMacintosh"}), 989 new Charset("MacRomania", null, new String[] {}), 990 new Charset("MacSymbol", null, new String[] {}), 991 new Charset("MacThai", null, new String[] {}), 992 new Charset("MacTurkish", null, new String[] {}), 993 new Charset("MacUkraine", null, new String[] {}), 994 new Charset("UnicodeBig", null, new String[] {}), 995 new Charset("UnicodeLittle", null, new String[] {}) 996 }; 997 998 /** 999 * Contains the canonical names of character sets which can be used to 1000 * decode bytes into Java chars. 1001 */ 1002 private static SortedSet<String> decodingSupported = null; 1003 1004 /** 1005 * Contains the canonical names of character sets which can be used to 1006 * encode Java chars into bytes. 1007 */ 1008 private static SortedSet<String> encodingSupported = null; 1009 1010 /** 1011 * Maps character set names to Charset objects. All possible names of 1012 * a charset will be mapped to the Charset. 1013 */ 1014 private static Map<String, Charset> charsetMap = null; 1015 1016 static { 1017 decodingSupported = new TreeSet<String>(); 1018 encodingSupported = new TreeSet<String>(); 1019 byte[] dummy = new byte[] {'d', 'u', 'm', 'm', 'y'}; 1020 for (Charset c : JAVA_CHARSETS) { 1021 try { 1022 new String(dummy, c.canonical); 1023 decodingSupported.add(c.canonical.toLowerCase()); 1024 } catch (UnsupportedOperationException e) { 1025 } catch (UnsupportedEncodingException e) { 1026 } 1027 try { 1028 "dummy".getBytes(c.canonical); 1029 encodingSupported.add(c.canonical.toLowerCase()); 1030 } catch (UnsupportedOperationException e) { 1031 } catch (UnsupportedEncodingException e) { 1032 } 1033 } 1034 1035 charsetMap = new HashMap<String, Charset>(); 1036 for (Charset c : JAVA_CHARSETS) { 1037 charsetMap.put(c.canonical.toLowerCase(), c); 1038 if (c.mime != null) { 1039 charsetMap.put(c.mime.toLowerCase(), c); 1040 } 1041 if (c.aliases != null) { 1042 for (String str : c.aliases) { 1043 charsetMap.put(str.toLowerCase(), c); 1044 } 1045 } 1046 } 1047 1048 if (log.isDebugEnabled()) { 1049 log.debug("Character sets which support decoding: " 1050 + decodingSupported); 1051 log.debug("Character sets which support encoding: " 1052 + encodingSupported); 1053 } 1054 } 1055 1056 /** carriage return - line feed sequence */ 1057 public static final String CRLF = "\r\n"; 1058 1059 /** US-ASCII CR, carriage return (13) */ 1060 public static final int CR = '\r'; 1061 1062 /** US-ASCII LF, line feed (10) */ 1063 public static final int LF = '\n'; 1064 1065 /** US-ASCII SP, space (32) */ 1066 public static final int SP = ' '; 1067 1068 /** US-ASCII HT, horizontal-tab (9) */ 1069 public static final int HT = '\t'; 1070 1071 public static final java.nio.charset.Charset US_ASCII = java.nio.charset.Charset 1072 .forName("US-ASCII"); 1073 1074 public static final java.nio.charset.Charset ISO_8859_1 = java.nio.charset.Charset 1075 .forName("ISO-8859-1"); 1076 1077 public static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset 1078 .forName("UTF-8"); 1079 1080 public static final java.nio.charset.Charset DEFAULT_CHARSET = US_ASCII; 1081 1082 /** 1083 * Returns <code>true</code> if the specified character falls into the US 1084 * ASCII character set (Unicode range 0000 to 007f). 1085 * 1086 * @param ch 1087 * character to test. 1088 * @return <code>true</code> if the specified character falls into the US 1089 * ASCII character set, <code>false</code> otherwise. 1090 */ 1091 public static boolean isASCII(char ch) { 1092 return (0xFF80 & ch) == 0; 1093 } 1094 1095 /** 1096 * Returns <code>true</code> if the specified string consists entirely of 1097 * US ASCII characters. 1098 * 1099 * @param s 1100 * string to test. 1101 * @return <code>true</code> if the specified string consists entirely of 1102 * US ASCII characters, <code>false</code> otherwise. 1103 */ 1104 public static boolean isASCII(final String s) { 1105 if (s == null) { 1106 throw new IllegalArgumentException("String may not be null"); 1107 } 1108 final int len = s.length(); 1109 for (int i = 0; i < len; i++) { 1110 if (!isASCII(s.charAt(i))) { 1111 return false; 1112 } 1113 } 1114 return true; 1115 } 1116 1117 /** 1118 * Returns <code>true</code> if the specified character is a whitespace 1119 * character (CR, LF, SP or HT). 1120 * 1121 * @param ch 1122 * character to test. 1123 * @return <code>true</code> if the specified character is a whitespace 1124 * character, <code>false</code> otherwise. 1125 */ 1126 public static boolean isWhitespace(char ch) { 1127 return ch == SP || ch == HT || ch == CR || ch == LF; 1128 } 1129 1130 /** 1131 * Returns <code>true</code> if the specified string consists entirely of 1132 * whitespace characters. 1133 * 1134 * @param s 1135 * string to test. 1136 * @return <code>true</code> if the specified string consists entirely of 1137 * whitespace characters, <code>false</code> otherwise. 1138 */ 1139 public static boolean isWhitespace(final String s) { 1140 if (s == null) { 1141 throw new IllegalArgumentException("String may not be null"); 1142 } 1143 final int len = s.length(); 1144 for (int i = 0; i < len; i++) { 1145 if (!isWhitespace(s.charAt(i))) { 1146 return false; 1147 } 1148 } 1149 return true; 1150 } 1151 1152 /** 1153 * Determines if the VM supports encoding (chars to bytes) the 1154 * specified character set. NOTE: the given character set name may 1155 * not be known to the VM even if this method returns <code>true</code>. 1156 * Use {@link #toJavaCharset(String)} to get the canonical Java character 1157 * set name. 1158 * 1159 * @param charsetName the characters set name. 1160 * @return <code>true</code> if encoding is supported, <code>false</code> 1161 * otherwise. 1162 */ 1163 public static boolean isEncodingSupported(String charsetName) { 1164 return encodingSupported.contains(charsetName.toLowerCase()); 1165 } 1166 1167 /** 1168 * Determines if the VM supports decoding (bytes to chars) the 1169 * specified character set. NOTE: the given character set name may 1170 * not be known to the VM even if this method returns <code>true</code>. 1171 * Use {@link #toJavaCharset(String)} to get the canonical Java character 1172 * set name. 1173 * 1174 * @param charsetName the characters set name. 1175 * @return <code>true</code> if decoding is supported, <code>false</code> 1176 * otherwise. 1177 */ 1178 public static boolean isDecodingSupported(String charsetName) { 1179 return decodingSupported.contains(charsetName.toLowerCase()); 1180 } 1181 1182 /** 1183 * Gets the preferred MIME character set name for the specified 1184 * character set or <code>null</code> if not known. 1185 * 1186 * @param charsetName the character set name to look for. 1187 * @return the MIME preferred name or <code>null</code> if not known. 1188 */ 1189 public static String toMimeCharset(String charsetName) { 1190 Charset c = charsetMap.get(charsetName.toLowerCase()); 1191 if (c != null) { 1192 return c.mime; 1193 } 1194 return null; 1195 } 1196 1197 /** 1198 * Gets the canonical Java character set name for the specified 1199 * character set or <code>null</code> if not known. This should be 1200 * called before doing any conversions using the Java API. NOTE: 1201 * you must use {@link #isEncodingSupported(String)} or 1202 * {@link #isDecodingSupported(String)} to make sure the returned 1203 * Java character set is supported by the current VM. 1204 * 1205 * @param charsetName the character set name to look for. 1206 * @return the canonical Java name or <code>null</code> if not known. 1207 */ 1208 public static String toJavaCharset(String charsetName) { 1209 Charset c = charsetMap.get(charsetName.toLowerCase()); 1210 if (c != null) { 1211 return c.canonical; 1212 } 1213 return null; 1214 } 1215 1216 public static java.nio.charset.Charset getCharset(String charsetName) { 1217 String defaultCharset = "ISO-8859-1"; 1218 1219 // Use the default chareset if given charset is null 1220 if(charsetName == null) charsetName = defaultCharset; 1221 1222 try { 1223 return java.nio.charset.Charset.forName(charsetName); 1224 } catch (IllegalCharsetNameException e) { 1225 log.info("Illegal charset " + charsetName + ", fallback to " + defaultCharset + ": " + e); 1226 // Use default charset on exception 1227 return java.nio.charset.Charset.forName(defaultCharset); 1228 } catch (UnsupportedCharsetException ex) { 1229 log.info("Unsupported charset " + charsetName + ", fallback to " + defaultCharset + ": " + ex); 1230 // Use default charset on exception 1231 return java.nio.charset.Charset.forName(defaultCharset); 1232 } 1233 1234 } 1235 /* 1236 * Uncomment the code below and run the main method to regenerate the 1237 * Javadoc table above when the known charsets change. 1238 */ 1239 1240 /* 1241 private static String dumpHtmlTable() { 1242 List<Charset> l = new LinkedList<Charset>(Arrays.asList(JAVA_CHARSETS)); 1243 Collections.sort(l); 1244 StringBuilder sb = new StringBuilder(); 1245 sb.append(" * <table>\n"); 1246 sb.append(" * <tr>\n"); 1247 sb.append(" * <td>Canonical (Java) name</td>\n"); 1248 sb.append(" * <td>MIME preferred</td>\n"); 1249 sb.append(" * <td>Aliases</td>\n"); 1250 sb.append(" * </tr>\n"); 1251 1252 for (Charset c : l) { 1253 sb.append(" * <tr>\n"); 1254 sb.append(" * <td>" + c.canonical + "</td>\n"); 1255 sb.append(" * <td>" + (c.mime == null ? "?" : c.mime)+ "</td>\n"); 1256 sb.append(" * <td>"); 1257 for (int i = 0; c.aliases != null && i < c.aliases.length; i++) { 1258 sb.append(c.aliases[i] + " "); 1259 } 1260 sb.append("</td>\n"); 1261 sb.append(" * </tr>\n"); 1262 } 1263 sb.append(" * </table>\n"); 1264 return sb.toString(); 1265 } 1266 1267 public static void main(String[] args) { 1268 System.out.println(dumpHtmlTable()); 1269 } 1270 */ 1271 }