001    /****************************************************************
002     * Licensed to the Apache Software Foundation (ASF) under one   *
003     * or more contributor license agreements.  See the NOTICE file *
004     * distributed with this work for additional information        *
005     * regarding copyright ownership.  The ASF licenses this file   *
006     * to you under the Apache License, Version 2.0 (the            *
007     * "License"); you may not use this file except in compliance   *
008     * with the License.  You may obtain a copy of the License at   *
009     *                                                              *
010     *   http://www.apache.org/licenses/LICENSE-2.0                 *
011     *                                                              *
012     * Unless required by applicable law or agreed to in writing,   *
013     * software distributed under the License is distributed on an  *
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015     * KIND, either express or implied.  See the License for the    *
016     * specific language governing permissions and limitations      *
017     * under the License.                                           *
018     ****************************************************************/
019    
020    package org.apache.james.mime4j.util;
021    
022    import java.io.UnsupportedEncodingException;
023    import java.nio.charset.IllegalCharsetNameException;
024    import java.nio.charset.UnsupportedCharsetException;
025    import java.util.HashMap;
026    import java.util.Map;
027    import java.util.SortedSet;
028    import java.util.TreeSet;
029    
030    import org.apache.commons.logging.Log;
031    import org.apache.commons.logging.LogFactory;
032    
033    /**
034     * Utility class for working with character sets. It is somewhat similar to
035     * the Java 1.4 <code>java.nio.charset.Charset</code> class but knows many
036     * more aliases and is compatible with Java 1.3. It will use a simple detection
037     * mechanism to detect what character sets the current VM supports. This will
038     * be a subset of the character sets listed in the
039     * <a href="http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html">
040     * Java 1.5 (J2SE5.0) Supported Encodings</a> document.
041     * <p>
042     * The <a href="http://www.iana.org/assignments/character-sets">
043     * IANA Character Sets</a> document has been used to determine the preferred
044     * MIME character set names and to get a list of known aliases.
045     * <p>
046     * This is a complete list of the character sets known to this class:
047     * <table>
048     *     <tr>
049     *         <td>Canonical (Java) name</td>
050     *         <td>MIME preferred</td>
051     *         <td>Aliases</td>
052     *     </tr>
053     *     <tr>
054     *         <td>ASCII</td>
055     *         <td>US-ASCII</td>
056     *         <td>ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983 </td>
057     *     </tr>
058     *     <tr>
059     *         <td>Big5</td>
060     *         <td>Big5</td>
061     *         <td>csBig5 CN-Big5 BIG-FIVE BIGFIVE </td>
062     *     </tr>
063     *     <tr>
064     *         <td>Big5_HKSCS</td>
065     *         <td>Big5-HKSCS</td>
066     *         <td>big5hkscs </td>
067     *     </tr>
068     *     <tr>
069     *         <td>Big5_Solaris</td>
070     *         <td>?</td>
071     *         <td></td>
072     *     </tr>
073     *     <tr>
074     *         <td>Cp037</td>
075     *         <td>IBM037</td>
076     *         <td>ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 </td>
077     *     </tr>
078     *     <tr>
079     *         <td>Cp1006</td>
080     *         <td>?</td>
081     *         <td></td>
082     *     </tr>
083     *     <tr>
084     *         <td>Cp1025</td>
085     *         <td>?</td>
086     *         <td></td>
087     *     </tr>
088     *     <tr>
089     *         <td>Cp1026</td>
090     *         <td>IBM1026</td>
091     *         <td>csIBM1026 </td>
092     *     </tr>
093     *     <tr>
094     *         <td>Cp1046</td>
095     *         <td>?</td>
096     *         <td></td>
097     *     </tr>
098     *     <tr>
099     *         <td>Cp1047</td>
100     *         <td>IBM1047</td>
101     *         <td>IBM-1047 </td>
102     *     </tr>
103     *     <tr>
104     *         <td>Cp1097</td>
105     *         <td>?</td>
106     *         <td></td>
107     *     </tr>
108     *     <tr>
109     *         <td>Cp1098</td>
110     *         <td>?</td>
111     *         <td></td>
112     *     </tr>
113     *     <tr>
114     *         <td>Cp1112</td>
115     *         <td>?</td>
116     *         <td></td>
117     *     </tr>
118     *     <tr>
119     *         <td>Cp1122</td>
120     *         <td>?</td>
121     *         <td></td>
122     *     </tr>
123     *     <tr>
124     *         <td>Cp1123</td>
125     *         <td>?</td>
126     *         <td></td>
127     *     </tr>
128     *     <tr>
129     *         <td>Cp1124</td>
130     *         <td>?</td>
131     *         <td></td>
132     *     </tr>
133     *     <tr>
134     *         <td>Cp1140</td>
135     *         <td>IBM01140</td>
136     *         <td>CCSID01140 CP01140 ebcdic-us-37+euro </td>
137     *     </tr>
138     *     <tr>
139     *         <td>Cp1141</td>
140     *         <td>IBM01141</td>
141     *         <td>CCSID01141 CP01141 ebcdic-de-273+euro </td>
142     *     </tr>
143     *     <tr>
144     *         <td>Cp1142</td>
145     *         <td>IBM01142</td>
146     *         <td>CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro </td>
147     *     </tr>
148     *     <tr>
149     *         <td>Cp1143</td>
150     *         <td>IBM01143</td>
151     *         <td>CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro </td>
152     *     </tr>
153     *     <tr>
154     *         <td>Cp1144</td>
155     *         <td>IBM01144</td>
156     *         <td>CCSID01144 CP01144 ebcdic-it-280+euro </td>
157     *     </tr>
158     *     <tr>
159     *         <td>Cp1145</td>
160     *         <td>IBM01145</td>
161     *         <td>CCSID01145 CP01145 ebcdic-es-284+euro </td>
162     *     </tr>
163     *     <tr>
164     *         <td>Cp1146</td>
165     *         <td>IBM01146</td>
166     *         <td>CCSID01146 CP01146 ebcdic-gb-285+euro </td>
167     *     </tr>
168     *     <tr>
169     *         <td>Cp1147</td>
170     *         <td>IBM01147</td>
171     *         <td>CCSID01147 CP01147 ebcdic-fr-297+euro </td>
172     *     </tr>
173     *     <tr>
174     *         <td>Cp1148</td>
175     *         <td>IBM01148</td>
176     *         <td>CCSID01148 CP01148 ebcdic-international-500+euro </td>
177     *     </tr>
178     *     <tr>
179     *         <td>Cp1149</td>
180     *         <td>IBM01149</td>
181     *         <td>CCSID01149 CP01149 ebcdic-is-871+euro </td>
182     *     </tr>
183     *     <tr>
184     *         <td>Cp1250</td>
185     *         <td>windows-1250</td>
186     *         <td></td>
187     *     </tr>
188     *     <tr>
189     *         <td>Cp1251</td>
190     *         <td>windows-1251</td>
191     *         <td></td>
192     *     </tr>
193     *     <tr>
194     *         <td>Cp1252</td>
195     *         <td>windows-1252</td>
196     *         <td></td>
197     *     </tr>
198     *     <tr>
199     *         <td>Cp1253</td>
200     *         <td>windows-1253</td>
201     *         <td></td>
202     *     </tr>
203     *     <tr>
204     *         <td>Cp1254</td>
205     *         <td>windows-1254</td>
206     *         <td></td>
207     *     </tr>
208     *     <tr>
209     *         <td>Cp1255</td>
210     *         <td>windows-1255</td>
211     *         <td></td>
212     *     </tr>
213     *     <tr>
214     *         <td>Cp1256</td>
215     *         <td>windows-1256</td>
216     *         <td></td>
217     *     </tr>
218     *     <tr>
219     *         <td>Cp1257</td>
220     *         <td>windows-1257</td>
221     *         <td></td>
222     *     </tr>
223     *     <tr>
224     *         <td>Cp1258</td>
225     *         <td>windows-1258</td>
226     *         <td></td>
227     *     </tr>
228     *     <tr>
229     *         <td>Cp1381</td>
230     *         <td>?</td>
231     *         <td></td>
232     *     </tr>
233     *     <tr>
234     *         <td>Cp1383</td>
235     *         <td>?</td>
236     *         <td></td>
237     *     </tr>
238     *     <tr>
239     *         <td>Cp273</td>
240     *         <td>IBM273</td>
241     *         <td>csIBM273 </td>
242     *     </tr>
243     *     <tr>
244     *         <td>Cp277</td>
245     *         <td>IBM277</td>
246     *         <td>EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 </td>
247     *     </tr>
248     *     <tr>
249     *         <td>Cp278</td>
250     *         <td>IBM278</td>
251     *         <td>CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 </td>
252     *     </tr>
253     *     <tr>
254     *         <td>Cp280</td>
255     *         <td>IBM280</td>
256     *         <td>ebcdic-cp-it csIBM280 </td>
257     *     </tr>
258     *     <tr>
259     *         <td>Cp284</td>
260     *         <td>IBM284</td>
261     *         <td>ebcdic-cp-es csIBM284 </td>
262     *     </tr>
263     *     <tr>
264     *         <td>Cp285</td>
265     *         <td>IBM285</td>
266     *         <td>ebcdic-cp-gb csIBM285 </td>
267     *     </tr>
268     *     <tr>
269     *         <td>Cp297</td>
270     *         <td>IBM297</td>
271     *         <td>ebcdic-cp-fr csIBM297 </td>
272     *     </tr>
273     *     <tr>
274     *         <td>Cp33722</td>
275     *         <td>?</td>
276     *         <td></td>
277     *     </tr>
278     *     <tr>
279     *         <td>Cp420</td>
280     *         <td>IBM420</td>
281     *         <td>ebcdic-cp-ar1 csIBM420 </td>
282     *     </tr>
283     *     <tr>
284     *         <td>Cp424</td>
285     *         <td>IBM424</td>
286     *         <td>ebcdic-cp-he csIBM424 </td>
287     *     </tr>
288     *     <tr>
289     *         <td>Cp437</td>
290     *         <td>IBM437</td>
291     *         <td>437 csPC8CodePage437 </td>
292     *     </tr>
293     *     <tr>
294     *         <td>Cp500</td>
295     *         <td>IBM500</td>
296     *         <td>ebcdic-cp-be ebcdic-cp-ch csIBM500 </td>
297     *     </tr>
298     *     <tr>
299     *         <td>Cp737</td>
300     *         <td>?</td>
301     *         <td></td>
302     *     </tr>
303     *     <tr>
304     *         <td>Cp775</td>
305     *         <td>IBM775</td>
306     *         <td>csPC775Baltic </td>
307     *     </tr>
308     *     <tr>
309     *         <td>Cp838</td>
310     *         <td>IBM-Thai</td>
311     *         <td></td>
312     *     </tr>
313     *     <tr>
314     *         <td>Cp850</td>
315     *         <td>IBM850</td>
316     *         <td>850 csPC850Multilingual </td>
317     *     </tr>
318     *     <tr>
319     *         <td>Cp852</td>
320     *         <td>IBM852</td>
321     *         <td>852 csPCp852 </td>
322     *     </tr>
323     *     <tr>
324     *         <td>Cp855</td>
325     *         <td>IBM855</td>
326     *         <td>855 csIBM855 </td>
327     *     </tr>
328     *     <tr>
329     *         <td>Cp856</td>
330     *         <td>?</td>
331     *         <td></td>
332     *     </tr>
333     *     <tr>
334     *         <td>Cp857</td>
335     *         <td>IBM857</td>
336     *         <td>857 csIBM857 </td>
337     *     </tr>
338     *     <tr>
339     *         <td>Cp858</td>
340     *         <td>IBM00858</td>
341     *         <td>CCSID00858 CP00858 PC-Multilingual-850+euro </td>
342     *     </tr>
343     *     <tr>
344     *         <td>Cp860</td>
345     *         <td>IBM860</td>
346     *         <td>860 csIBM860 </td>
347     *     </tr>
348     *     <tr>
349     *         <td>Cp861</td>
350     *         <td>IBM861</td>
351     *         <td>861 cp-is csIBM861 </td>
352     *     </tr>
353     *     <tr>
354     *         <td>Cp862</td>
355     *         <td>IBM862</td>
356     *         <td>862 csPC862LatinHebrew </td>
357     *     </tr>
358     *     <tr>
359     *         <td>Cp863</td>
360     *         <td>IBM863</td>
361     *         <td>863 csIBM863 </td>
362     *     </tr>
363     *     <tr>
364     *         <td>Cp864</td>
365     *         <td>IBM864</td>
366     *         <td>cp864 csIBM864 </td>
367     *     </tr>
368     *     <tr>
369     *         <td>Cp865</td>
370     *         <td>IBM865</td>
371     *         <td>865 csIBM865 </td>
372     *     </tr>
373     *     <tr>
374     *         <td>Cp866</td>
375     *         <td>IBM866</td>
376     *         <td>866 csIBM866 </td>
377     *     </tr>
378     *     <tr>
379     *         <td>Cp868</td>
380     *         <td>IBM868</td>
381     *         <td>cp-ar csIBM868 </td>
382     *     </tr>
383     *     <tr>
384     *         <td>Cp869</td>
385     *         <td>IBM869</td>
386     *         <td>cp-gr csIBM869 </td>
387     *     </tr>
388     *     <tr>
389     *         <td>Cp870</td>
390     *         <td>IBM870</td>
391     *         <td>ebcdic-cp-roece ebcdic-cp-yu csIBM870 </td>
392     *     </tr>
393     *     <tr>
394     *         <td>Cp871</td>
395     *         <td>IBM871</td>
396     *         <td>ebcdic-cp-is csIBM871 </td>
397     *     </tr>
398     *     <tr>
399     *         <td>Cp875</td>
400     *         <td>?</td>
401     *         <td></td>
402     *     </tr>
403     *     <tr>
404     *         <td>Cp918</td>
405     *         <td>IBM918</td>
406     *         <td>ebcdic-cp-ar2 csIBM918 </td>
407     *     </tr>
408     *     <tr>
409     *         <td>Cp921</td>
410     *         <td>?</td>
411     *         <td></td>
412     *     </tr>
413     *     <tr>
414     *         <td>Cp922</td>
415     *         <td>?</td>
416     *         <td></td>
417     *     </tr>
418     *     <tr>
419     *         <td>Cp930</td>
420     *         <td>?</td>
421     *         <td></td>
422     *     </tr>
423     *     <tr>
424     *         <td>Cp933</td>
425     *         <td>?</td>
426     *         <td></td>
427     *     </tr>
428     *     <tr>
429     *         <td>Cp935</td>
430     *         <td>?</td>
431     *         <td></td>
432     *     </tr>
433     *     <tr>
434     *         <td>Cp937</td>
435     *         <td>?</td>
436     *         <td></td>
437     *     </tr>
438     *     <tr>
439     *         <td>Cp939</td>
440     *         <td>?</td>
441     *         <td></td>
442     *     </tr>
443     *     <tr>
444     *         <td>Cp942</td>
445     *         <td>?</td>
446     *         <td></td>
447     *     </tr>
448     *     <tr>
449     *         <td>Cp942C</td>
450     *         <td>?</td>
451     *         <td></td>
452     *     </tr>
453     *     <tr>
454     *         <td>Cp943</td>
455     *         <td>?</td>
456     *         <td></td>
457     *     </tr>
458     *     <tr>
459     *         <td>Cp943C</td>
460     *         <td>?</td>
461     *         <td></td>
462     *     </tr>
463     *     <tr>
464     *         <td>Cp948</td>
465     *         <td>?</td>
466     *         <td></td>
467     *     </tr>
468     *     <tr>
469     *         <td>Cp949</td>
470     *         <td>?</td>
471     *         <td></td>
472     *     </tr>
473     *     <tr>
474     *         <td>Cp949C</td>
475     *         <td>?</td>
476     *         <td></td>
477     *     </tr>
478     *     <tr>
479     *         <td>Cp950</td>
480     *         <td>?</td>
481     *         <td></td>
482     *     </tr>
483     *     <tr>
484     *         <td>Cp964</td>
485     *         <td>?</td>
486     *         <td></td>
487     *     </tr>
488     *     <tr>
489     *         <td>Cp970</td>
490     *         <td>?</td>
491     *         <td></td>
492     *     </tr>
493     *     <tr>
494     *         <td>EUC_CN</td>
495     *         <td>GB2312</td>
496     *         <td>x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165 </td>
497     *     </tr>
498     *     <tr>
499     *         <td>EUC_JP</td>
500     *         <td>EUC-JP</td>
501     *         <td>csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp </td>
502     *     </tr>
503     *     <tr>
504     *         <td>EUC_JP_LINUX</td>
505     *         <td>?</td>
506     *         <td></td>
507     *     </tr>
508     *     <tr>
509     *         <td>EUC_JP_Solaris</td>
510     *         <td>?</td>
511     *         <td></td>
512     *     </tr>
513     *     <tr>
514     *         <td>EUC_KR</td>
515     *         <td>EUC-KR</td>
516     *         <td>csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr </td>
517     *     </tr>
518     *     <tr>
519     *         <td>EUC_TW</td>
520     *         <td>EUC-TW</td>
521     *         <td>x-EUC-TW cns11643 euctw </td>
522     *     </tr>
523     *     <tr>
524     *         <td>GB18030</td>
525     *         <td>GB18030</td>
526     *         <td>gb18030-2000 </td>
527     *     </tr>
528     *     <tr>
529     *         <td>GBK</td>
530     *         <td>windows-936</td>
531     *         <td>CP936 MS936 ms_936 x-mswin-936 </td>
532     *     </tr>
533     *     <tr>
534     *         <td>ISCII91</td>
535     *         <td>?</td>
536     *         <td>x-ISCII91 iscii </td>
537     *     </tr>
538     *     <tr>
539     *         <td>ISO2022CN</td>
540     *         <td>ISO-2022-CN</td>
541     *         <td></td>
542     *     </tr>
543     *     <tr>
544     *         <td>ISO2022JP</td>
545     *         <td>ISO-2022-JP</td>
546     *         <td>csISO2022JP JIS jis_encoding csjisencoding </td>
547     *     </tr>
548     *     <tr>
549     *         <td>ISO2022KR</td>
550     *         <td>ISO-2022-KR</td>
551     *         <td>csISO2022KR </td>
552     *     </tr>
553     *     <tr>
554     *         <td>ISO2022_CN_CNS</td>
555     *         <td>?</td>
556     *         <td></td>
557     *     </tr>
558     *     <tr>
559     *         <td>ISO2022_CN_GB</td>
560     *         <td>?</td>
561     *         <td></td>
562     *     </tr>
563     *     <tr>
564     *         <td>ISO8859_1</td>
565     *         <td>ISO-8859-1</td>
566     *         <td>ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1 </td>
567     *     </tr>
568     *     <tr>
569     *         <td>ISO8859_13</td>
570     *         <td>ISO-8859-13</td>
571     *         <td></td>
572     *     </tr>
573     *     <tr>
574     *         <td>ISO8859_15</td>
575     *         <td>ISO-8859-15</td>
576     *         <td>ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS </td>
577     *     </tr>
578     *     <tr>
579     *         <td>ISO8859_2</td>
580     *         <td>ISO-8859-2</td>
581     *         <td>ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2 </td>
582     *     </tr>
583     *     <tr>
584     *         <td>ISO8859_3</td>
585     *         <td>ISO-8859-3</td>
586     *         <td>ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3 </td>
587     *     </tr>
588     *     <tr>
589     *         <td>ISO8859_4</td>
590     *         <td>ISO-8859-4</td>
591     *         <td>ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4 </td>
592     *     </tr>
593     *     <tr>
594     *         <td>ISO8859_5</td>
595     *         <td>ISO-8859-5</td>
596     *         <td>ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5 </td>
597     *     </tr>
598     *     <tr>
599     *         <td>ISO8859_6</td>
600     *         <td>ISO-8859-6</td>
601     *         <td>ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6 </td>
602     *     </tr>
603     *     <tr>
604     *         <td>ISO8859_7</td>
605     *         <td>ISO-8859-7</td>
606     *         <td>ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek </td>
607     *     </tr>
608     *     <tr>
609     *         <td>ISO8859_8</td>
610     *         <td>ISO-8859-8</td>
611     *         <td>ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8 </td>
612     *     </tr>
613     *     <tr>
614     *         <td>ISO8859_9</td>
615     *         <td>ISO-8859-9</td>
616     *         <td>ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9 </td>
617     *     </tr>
618     *     <tr>
619     *         <td>JISAutoDetect</td>
620     *         <td>?</td>
621     *         <td></td>
622     *     </tr>
623     *     <tr>
624     *         <td>JIS_C6626-1983</td>
625     *         <td>JIS_C6626-1983</td>
626     *         <td>x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87 </td>
627     *     </tr>
628     *     <tr>
629     *         <td>JIS_X0201</td>
630     *         <td>JIS_X0201</td>
631     *         <td>X0201 JIS0201 csHalfWidthKatakana </td>
632     *     </tr>
633     *     <tr>
634     *         <td>JIS_X0212-1990</td>
635     *         <td>JIS_X0212-1990</td>
636     *         <td>iso-ir-159 x0212 JIS0212 csISO159JISX02121990 </td>
637     *     </tr>
638     *     <tr>
639     *         <td>KOI8_R</td>
640     *         <td>KOI8-R</td>
641     *         <td>csKOI8R koi8 </td>
642     *     </tr>
643     *     <tr>
644     *         <td>MS874</td>
645     *         <td>windows-874</td>
646     *         <td>cp874 </td>
647     *     </tr>
648     *     <tr>
649     *         <td>MS932</td>
650     *         <td>Windows-31J</td>
651     *         <td>windows-932 csWindows31J x-ms-cp932 </td>
652     *     </tr>
653     *     <tr>
654     *         <td>MS949</td>
655     *         <td>windows-949</td>
656     *         <td>windows949 ms_949 x-windows-949 </td>
657     *     </tr>
658     *     <tr>
659     *         <td>MS950</td>
660     *         <td>windows-950</td>
661     *         <td>x-windows-950 </td>
662     *     </tr>
663     *     <tr>
664     *         <td>MS950_HKSCS</td>
665     *         <td>?</td>
666     *         <td></td>
667     *     </tr>
668     *     <tr>
669     *         <td>MacArabic</td>
670     *         <td>?</td>
671     *         <td></td>
672     *     </tr>
673     *     <tr>
674     *         <td>MacCentralEurope</td>
675     *         <td>?</td>
676     *         <td></td>
677     *     </tr>
678     *     <tr>
679     *         <td>MacCroatian</td>
680     *         <td>?</td>
681     *         <td></td>
682     *     </tr>
683     *     <tr>
684     *         <td>MacCyrillic</td>
685     *         <td>?</td>
686     *         <td></td>
687     *     </tr>
688     *     <tr>
689     *         <td>MacDingbat</td>
690     *         <td>?</td>
691     *         <td></td>
692     *     </tr>
693     *     <tr>
694     *         <td>MacGreek</td>
695     *         <td>MacGreek</td>
696     *         <td></td>
697     *     </tr>
698     *     <tr>
699     *         <td>MacHebrew</td>
700     *         <td>?</td>
701     *         <td></td>
702     *     </tr>
703     *     <tr>
704     *         <td>MacIceland</td>
705     *         <td>?</td>
706     *         <td></td>
707     *     </tr>
708     *     <tr>
709     *         <td>MacRoman</td>
710     *         <td>MacRoman</td>
711     *         <td>Macintosh MAC csMacintosh </td>
712     *     </tr>
713     *     <tr>
714     *         <td>MacRomania</td>
715     *         <td>?</td>
716     *         <td></td>
717     *     </tr>
718     *     <tr>
719     *         <td>MacSymbol</td>
720     *         <td>?</td>
721     *         <td></td>
722     *     </tr>
723     *     <tr>
724     *         <td>MacThai</td>
725     *         <td>?</td>
726     *         <td></td>
727     *     </tr>
728     *     <tr>
729     *         <td>MacTurkish</td>
730     *         <td>?</td>
731     *         <td></td>
732     *     </tr>
733     *     <tr>
734     *         <td>MacUkraine</td>
735     *         <td>?</td>
736     *         <td></td>
737     *     </tr>
738     *     <tr>
739     *         <td>SJIS</td>
740     *         <td>Shift_JIS</td>
741     *         <td>MS_Kanji csShiftJIS shift-jis x-sjis pck </td>
742     *     </tr>
743     *     <tr>
744     *         <td>TIS620</td>
745     *         <td>TIS-620</td>
746     *         <td></td>
747     *     </tr>
748     *     <tr>
749     *         <td>UTF-16</td>
750     *         <td>UTF-16</td>
751     *         <td>UTF_16 </td>
752     *     </tr>
753     *     <tr>
754     *         <td>UTF8</td>
755     *         <td>UTF-8</td>
756     *         <td></td>
757     *     </tr>
758     *     <tr>
759     *         <td>UnicodeBig</td>
760     *         <td>?</td>
761     *         <td></td>
762     *     </tr>
763     *     <tr>
764     *         <td>UnicodeBigUnmarked</td>
765     *         <td>UTF-16BE</td>
766     *         <td>X-UTF-16BE UTF_16BE ISO-10646-UCS-2 </td>
767     *     </tr>
768     *     <tr>
769     *         <td>UnicodeLittle</td>
770     *         <td>?</td>
771     *         <td></td>
772     *     </tr>
773     *     <tr>
774     *         <td>UnicodeLittleUnmarked</td>
775     *         <td>UTF-16LE</td>
776     *         <td>UTF_16LE X-UTF-16LE </td>
777     *     </tr>
778     *     <tr>
779     *         <td>x-Johab</td>
780     *         <td>johab</td>
781     *         <td>johab cp1361 ms1361 ksc5601-1992 ksc5601_1992 </td>
782     *     </tr>
783     *     <tr>
784     *         <td>x-iso-8859-11</td>
785     *         <td>?</td>
786     *         <td></td>
787     *     </tr>
788     * </table>
789     */
790    public class CharsetUtil {
791        private static Log log = LogFactory.getLog(CharsetUtil.class);
792        
793        private static class Charset implements Comparable<Charset> {
794            private String canonical = null;
795            private String mime = null;
796            private String[] aliases = null;
797            
798            private Charset(String canonical, String mime, String[] aliases) {
799                this.canonical = canonical;
800                this.mime = mime;
801                this.aliases = aliases;
802            }
803    
804            public int compareTo(Charset c) {
805                return this.canonical.compareTo(c.canonical);
806            }
807        }
808        
809        private static Charset[] JAVA_CHARSETS = {
810            new Charset("ISO8859_1", "ISO-8859-1", 
811                        new String[] {"ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", 
812                                      "latin1", "l1", "IBM819", "CP819", 
813                                      "csISOLatin1", "8859_1", "819", "IBM-819", 
814                                      "ISO8859-1", "ISO_8859_1"}),
815            new Charset("ISO8859_2", "ISO-8859-2", 
816                        new String[] {"ISO_8859-2:1987", "iso-ir-101", "ISO_8859-2",  
817                                      "latin2", "l2", "csISOLatin2", "8859_2", 
818                                      "iso8859_2"}),
819            new Charset("ISO8859_3", "ISO-8859-3", new String[] {"ISO_8859-3:1988", "iso-ir-109", "ISO_8859-3", "latin3", "l3", "csISOLatin3", "8859_3"}),
820            new Charset("ISO8859_4", "ISO-8859-4", 
821                        new String[] {"ISO_8859-4:1988", "iso-ir-110", "ISO_8859-4",
822                                      "latin4", "l4", "csISOLatin4", "8859_4"}),
823            new Charset("ISO8859_5", "ISO-8859-5", 
824                        new String[] {"ISO_8859-5:1988", "iso-ir-144", "ISO_8859-5", 
825                                      "cyrillic", "csISOLatinCyrillic", "8859_5"}),
826            new Charset("ISO8859_6", "ISO-8859-6", new String[] {"ISO_8859-6:1987", "iso-ir-127", "ISO_8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic", "8859_6"}),
827            new Charset("ISO8859_7", "ISO-8859-7", 
828                        new String[] {"ISO_8859-7:1987", "iso-ir-126", "ISO_8859-7", 
829                                      "ELOT_928", "ECMA-118", "greek", "greek8", 
830                                      "csISOLatinGreek", "8859_7", "sun_eu_greek"}),
831            new Charset("ISO8859_8", "ISO-8859-8", new String[] {"ISO_8859-8:1988", "iso-ir-138", "ISO_8859-8", "hebrew", "csISOLatinHebrew", "8859_8"}),
832            new Charset("ISO8859_9", "ISO-8859-9", 
833                        new String[] {"ISO_8859-9:1989", "iso-ir-148", "ISO_8859-9",  
834                                      "latin5", "l5", "csISOLatin5", "8859_9"}),
835    
836            new Charset("ISO8859_13", "ISO-8859-13", new String[] {}),
837            new Charset("ISO8859_15", "ISO-8859-15", 
838                        new String[] {"ISO_8859-15", "Latin-9", "8859_15", 
839                                      "csISOlatin9", "IBM923", "cp923", "923", "L9",
840                                      "IBM-923", "ISO8859-15", "LATIN9", "LATIN0", 
841                                      "csISOlatin0", "ISO8859_15_FDIS"}),
842            new Charset("KOI8_R", "KOI8-R", new String[] {"csKOI8R", "koi8"}),
843            new Charset("ASCII", "US-ASCII", 
844                        new String[] {"ANSI_X3.4-1968", "iso-ir-6", 
845                                      "ANSI_X3.4-1986", "ISO_646.irv:1991", 
846                                      "ISO646-US", "us", "IBM367", "cp367", 
847                                      "csASCII", "ascii7", "646", "iso_646.irv:1983"}),
848            new Charset("UTF8", "UTF-8", new String[] {}),
849            new Charset("UTF-16", "UTF-16", new String[] {"UTF_16"}),
850            new Charset("UnicodeBigUnmarked", "UTF-16BE", new String[] {"X-UTF-16BE", "UTF_16BE", "ISO-10646-UCS-2"}),
851            new Charset("UnicodeLittleUnmarked", "UTF-16LE", new String[] {"UTF_16LE", "X-UTF-16LE"}),
852            new Charset("Big5", "Big5", new String[] {"csBig5", "CN-Big5", "BIG-FIVE", "BIGFIVE"}),
853            new Charset("Big5_HKSCS", "Big5-HKSCS", new String[] {"big5hkscs"}),
854            new Charset("EUC_JP", "EUC-JP", 
855                        new String[] {"csEUCPkdFmtJapanese", 
856                                  "Extended_UNIX_Code_Packed_Format_for_Japanese",
857                                  "eucjis", "x-eucjp", "eucjp", "x-euc-jp"}),
858            new Charset("EUC_KR", "EUC-KR", 
859                        new String[] {"csEUCKR", "ksc5601", "5601", "ksc5601_1987", 
860                                      "ksc_5601", "ksc5601-1987", "ks_c_5601-1987", 
861                                      "euckr"}),
862            new Charset("GB18030", "GB18030", new String[] {"gb18030-2000"}),
863            new Charset("EUC_CN", "GB2312", new String[] {"x-EUC-CN", "csGB2312", "euccn", "euc-cn", "gb2312-80", "gb2312-1980", "CN-GB", "CN-GB-ISOIR165"}),
864            new Charset("GBK", "windows-936", new String[] {"CP936", "MS936", "ms_936", "x-mswin-936"}),
865    
866            new Charset("Cp037", "IBM037", new String[] {"ebcdic-cp-us", "ebcdic-cp-ca", "ebcdic-cp-wt", "ebcdic-cp-nl", "csIBM037"}),
867            new Charset("Cp273", "IBM273", new String[] {"csIBM273"}),
868            new Charset("Cp277", "IBM277", new String[] {"EBCDIC-CP-DK", "EBCDIC-CP-NO", "csIBM277"}),
869            new Charset("Cp278", "IBM278", new String[] {"CP278", "ebcdic-cp-fi", "ebcdic-cp-se", "csIBM278"}),
870            new Charset("Cp280", "IBM280", new String[] {"ebcdic-cp-it", "csIBM280"}),
871            new Charset("Cp284", "IBM284", new String[] {"ebcdic-cp-es", "csIBM284"}),
872            new Charset("Cp285", "IBM285", new String[] {"ebcdic-cp-gb", "csIBM285"}),
873            new Charset("Cp297", "IBM297", new String[] {"ebcdic-cp-fr", "csIBM297"}),
874            new Charset("Cp420", "IBM420", new String[] {"ebcdic-cp-ar1", "csIBM420"}),
875            new Charset("Cp424", "IBM424", new String[] {"ebcdic-cp-he", "csIBM424"}),
876            new Charset("Cp437", "IBM437", new String[] {"437", "csPC8CodePage437"}),
877            new Charset("Cp500", "IBM500", new String[] {"ebcdic-cp-be", "ebcdic-cp-ch", "csIBM500"}),
878            new Charset("Cp775", "IBM775", new String[] {"csPC775Baltic"}),
879            new Charset("Cp838", "IBM-Thai", new String[] {}),
880            new Charset("Cp850", "IBM850", new String[] {"850", "csPC850Multilingual"}),
881            new Charset("Cp852", "IBM852", new String[] {"852", "csPCp852"}),
882            new Charset("Cp855", "IBM855", new String[] {"855", "csIBM855"}),
883            new Charset("Cp857", "IBM857", new String[] {"857", "csIBM857"}),
884            new Charset("Cp858", "IBM00858", 
885                    new String[] {"CCSID00858", "CP00858", 
886                                  "PC-Multilingual-850+euro"}),
887            new Charset("Cp860", "IBM860", new String[] {"860", "csIBM860"}),
888            new Charset("Cp861", "IBM861", new String[] {"861", "cp-is", "csIBM861"}),
889            new Charset("Cp862", "IBM862", new String[] {"862", "csPC862LatinHebrew"}),
890            new Charset("Cp863", "IBM863", new String[] {"863", "csIBM863"}),
891            new Charset("Cp864", "IBM864", new String[] {"cp864", "csIBM864"}),
892            new Charset("Cp865", "IBM865", new String[] {"865", "csIBM865"}),
893            new Charset("Cp866", "IBM866", new String[] {"866", "csIBM866"}),
894            new Charset("Cp868", "IBM868", new String[] {"cp-ar", "csIBM868"}),
895            new Charset("Cp869", "IBM869", new String[] {"cp-gr", "csIBM869"}),
896            new Charset("Cp870", "IBM870", new String[] {"ebcdic-cp-roece", "ebcdic-cp-yu", "csIBM870"}),
897            new Charset("Cp871", "IBM871", new String[] {"ebcdic-cp-is", "csIBM871"}),
898            new Charset("Cp918", "IBM918", new String[] {"ebcdic-cp-ar2", "csIBM918"}),
899            new Charset("Cp1026", "IBM1026", new String[] {"csIBM1026"}),
900            new Charset("Cp1047", "IBM1047", new String[] {"IBM-1047"}),
901            new Charset("Cp1140", "IBM01140", 
902                        new String[] {"CCSID01140", "CP01140", 
903                                      "ebcdic-us-37+euro"}),
904            new Charset("Cp1141", "IBM01141", 
905                        new String[] {"CCSID01141", "CP01141", 
906                                      "ebcdic-de-273+euro"}),
907            new Charset("Cp1142", "IBM01142", new String[] {"CCSID01142", "CP01142", "ebcdic-dk-277+euro", "ebcdic-no-277+euro"}),
908            new Charset("Cp1143", "IBM01143", new String[] {"CCSID01143", "CP01143", "ebcdic-fi-278+euro", "ebcdic-se-278+euro"}),
909            new Charset("Cp1144", "IBM01144", new String[] {"CCSID01144", "CP01144", "ebcdic-it-280+euro"}),
910            new Charset("Cp1145", "IBM01145", new String[] {"CCSID01145", "CP01145", "ebcdic-es-284+euro"}),
911            new Charset("Cp1146", "IBM01146", new String[] {"CCSID01146", "CP01146", "ebcdic-gb-285+euro"}),
912            new Charset("Cp1147", "IBM01147", new String[] {"CCSID01147", "CP01147", "ebcdic-fr-297+euro"}),
913            new Charset("Cp1148", "IBM01148", new String[] {"CCSID01148", "CP01148", "ebcdic-international-500+euro"}),
914            new Charset("Cp1149", "IBM01149", new String[] {"CCSID01149", "CP01149", "ebcdic-is-871+euro"}),
915            new Charset("Cp1250", "windows-1250", new String[] {}),
916            new Charset("Cp1251", "windows-1251", new String[] {}),
917            new Charset("Cp1252", "windows-1252", new String[] {}),
918            new Charset("Cp1253", "windows-1253", new String[] {}),
919            new Charset("Cp1254", "windows-1254", new String[] {}),
920            new Charset("Cp1255", "windows-1255", new String[] {}),
921            new Charset("Cp1256", "windows-1256", new String[] {}),
922            new Charset("Cp1257", "windows-1257", new String[] {}),
923            new Charset("Cp1258", "windows-1258", new String[] {}),
924            new Charset("ISO2022CN", "ISO-2022-CN", new String[] {}),
925            new Charset("ISO2022JP", "ISO-2022-JP", new String[] {"csISO2022JP", "JIS", "jis_encoding", "csjisencoding"}),
926            new Charset("ISO2022KR", "ISO-2022-KR", new String[] {"csISO2022KR"}),
927            new Charset("JIS_X0201", "JIS_X0201", new String[] {"X0201", "JIS0201", "csHalfWidthKatakana"}),
928            new Charset("JIS_X0212-1990", "JIS_X0212-1990", new String[] {"iso-ir-159", "x0212", "JIS0212", "csISO159JISX02121990"}),
929            new Charset("JIS_C6626-1983", "JIS_C6626-1983", new String[] {"x-JIS0208", "JIS0208", "csISO87JISX0208", "x0208", "JIS_X0208-1983", "iso-ir-87"}),
930            new Charset("SJIS", "Shift_JIS", new String[] {"MS_Kanji", "csShiftJIS", "shift-jis", "x-sjis", "pck"}),
931            new Charset("TIS620", "TIS-620", new String[] {}),
932            new Charset("MS932", "Windows-31J", new String[] {"windows-932", "csWindows31J", "x-ms-cp932"}),
933            new Charset("EUC_TW", "EUC-TW", new String[] {"x-EUC-TW", "cns11643", "euctw"}),
934            new Charset("x-Johab", "johab", new String[] {"johab", "cp1361", "ms1361", "ksc5601-1992", "ksc5601_1992"}),
935            new Charset("MS950_HKSCS", "", new String[] {}),
936            new Charset("MS874", "windows-874", new String[] {"cp874"}),
937            new Charset("MS949", "windows-949", new String[] {"windows949", "ms_949", "x-windows-949"}),
938            new Charset("MS950", "windows-950", new String[] {"x-windows-950"}),
939    
940            new Charset("Cp737", null, new String[] {}),
941            new Charset("Cp856", null, new String[] {}),
942            new Charset("Cp875", null, new String[] {}),
943            new Charset("Cp921", null, new String[] {}),
944            new Charset("Cp922", null, new String[] {}),
945            new Charset("Cp930", null, new String[] {}),
946            new Charset("Cp933", null, new String[] {}),
947            new Charset("Cp935", null, new String[] {}),
948            new Charset("Cp937", null, new String[] {}),
949            new Charset("Cp939", null, new String[] {}),
950            new Charset("Cp942", null, new String[] {}),
951            new Charset("Cp942C", null, new String[] {}),
952            new Charset("Cp943", null, new String[] {}),
953            new Charset("Cp943C", null, new String[] {}),
954            new Charset("Cp948", null, new String[] {}),
955            new Charset("Cp949", null, new String[] {}),
956            new Charset("Cp949C", null, new String[] {}),
957            new Charset("Cp950", null, new String[] {}),
958            new Charset("Cp964", null, new String[] {}),
959            new Charset("Cp970", null, new String[] {}),
960            new Charset("Cp1006", null, new String[] {}),
961            new Charset("Cp1025", null, new String[] {}),    
962            new Charset("Cp1046", null, new String[] {}),
963            new Charset("Cp1097", null, new String[] {}),
964            new Charset("Cp1098", null, new String[] {}),
965            new Charset("Cp1112", null, new String[] {}),
966            new Charset("Cp1122", null, new String[] {}),
967            new Charset("Cp1123", null, new String[] {}),
968            new Charset("Cp1124", null, new String[] {}),
969            new Charset("Cp1381", null, new String[] {}),
970            new Charset("Cp1383", null, new String[] {}),
971            new Charset("Cp33722", null, new String[] {}),
972            new Charset("Big5_Solaris", null, new String[] {}),
973            new Charset("EUC_JP_LINUX", null, new String[] {}),
974            new Charset("EUC_JP_Solaris", null, new String[] {}),
975            new Charset("ISCII91", null, new String[] {"x-ISCII91", "iscii"}),
976            new Charset("ISO2022_CN_CNS", null, new String[] {}),
977            new Charset("ISO2022_CN_GB", null, new String[] {}),
978            new Charset("x-iso-8859-11", null, new String[] {}),
979            new Charset("JISAutoDetect", null, new String[] {}),
980            new Charset("MacArabic", null, new String[] {}),
981            new Charset("MacCentralEurope", null, new String[] {}),
982            new Charset("MacCroatian", null, new String[] {}),
983            new Charset("MacCyrillic", null, new String[] {}),
984            new Charset("MacDingbat", null, new String[] {}),
985            new Charset("MacGreek", "MacGreek", new String[] {}),
986            new Charset("MacHebrew", null, new String[] {}),
987            new Charset("MacIceland", null, new String[] {}),
988            new Charset("MacRoman", "MacRoman", new String[] {"Macintosh", "MAC", "csMacintosh"}),
989            new Charset("MacRomania", null, new String[] {}),
990            new Charset("MacSymbol", null, new String[] {}),
991            new Charset("MacThai", null, new String[] {}),
992            new Charset("MacTurkish", null, new String[] {}),
993            new Charset("MacUkraine", null, new String[] {}),
994            new Charset("UnicodeBig", null, new String[] {}),
995            new Charset("UnicodeLittle", null, new String[] {})
996        };
997    
998        /**
999         * Contains the canonical names of character sets which can be used to 
1000         * decode bytes into Java chars.
1001         */
1002        private static SortedSet<String> decodingSupported = null;
1003        
1004        /**
1005         * Contains the canonical names of character sets which can be used to 
1006         * encode Java chars into bytes.
1007         */
1008        private static SortedSet<String> encodingSupported = null;
1009        
1010        /**
1011         * Maps character set names to Charset objects. All possible names of
1012         * a charset will be mapped to the Charset.
1013         */
1014        private static Map<String, Charset> charsetMap = null;
1015        
1016        static {
1017            decodingSupported = new TreeSet<String>();
1018            encodingSupported = new TreeSet<String>();
1019            byte[] dummy = new byte[] {'d', 'u', 'm', 'm', 'y'};
1020            for (Charset c : JAVA_CHARSETS) {
1021                try {
1022                    new String(dummy, c.canonical);
1023                    decodingSupported.add(c.canonical.toLowerCase());
1024                } catch (UnsupportedOperationException e) {
1025                } catch (UnsupportedEncodingException e) {
1026                }
1027                try {
1028                    "dummy".getBytes(c.canonical);
1029                    encodingSupported.add(c.canonical.toLowerCase());
1030                } catch (UnsupportedOperationException e) {
1031                } catch (UnsupportedEncodingException e) {
1032                }
1033            }
1034            
1035            charsetMap = new HashMap<String, Charset>();
1036            for (Charset c : JAVA_CHARSETS) {
1037                charsetMap.put(c.canonical.toLowerCase(), c);
1038                if (c.mime != null) {
1039                    charsetMap.put(c.mime.toLowerCase(), c);
1040                }
1041                if (c.aliases != null) {
1042                    for (String str : c.aliases) {
1043                        charsetMap.put(str.toLowerCase(), c);
1044                    }
1045                }
1046            }
1047            
1048            if (log.isDebugEnabled()) {
1049                log.debug("Character sets which support decoding: " 
1050                            + decodingSupported);
1051                log.debug("Character sets which support encoding: " 
1052                            + encodingSupported);
1053            }
1054        }
1055    
1056        /** carriage return - line feed sequence */
1057        public static final String CRLF = "\r\n";
1058    
1059        /** US-ASCII CR, carriage return (13) */
1060        public static final int CR = '\r';
1061    
1062        /** US-ASCII LF, line feed (10) */
1063        public static final int LF = '\n';
1064    
1065        /** US-ASCII SP, space (32) */
1066        public static final int SP = ' ';
1067    
1068        /** US-ASCII HT, horizontal-tab (9) */
1069        public static final int HT = '\t';
1070    
1071        public static final java.nio.charset.Charset US_ASCII = java.nio.charset.Charset
1072                .forName("US-ASCII");
1073    
1074        public static final java.nio.charset.Charset ISO_8859_1 = java.nio.charset.Charset
1075                .forName("ISO-8859-1");
1076    
1077        public static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset
1078                .forName("UTF-8");
1079    
1080        public static final java.nio.charset.Charset DEFAULT_CHARSET = US_ASCII;
1081    
1082        /**
1083         * Returns <code>true</code> if the specified character falls into the US
1084         * ASCII character set (Unicode range 0000 to 007f).
1085         * 
1086         * @param ch
1087         *            character to test.
1088         * @return <code>true</code> if the specified character falls into the US
1089         *         ASCII character set, <code>false</code> otherwise.
1090         */
1091        public static boolean isASCII(char ch) {
1092            return (0xFF80 & ch) == 0;
1093        }
1094    
1095        /**
1096         * Returns <code>true</code> if the specified string consists entirely of
1097         * US ASCII characters.
1098         * 
1099         * @param s
1100         *            string to test.
1101         * @return <code>true</code> if the specified string consists entirely of
1102         *         US ASCII characters, <code>false</code> otherwise.
1103         */
1104        public static boolean isASCII(final String s) {
1105            if (s == null) {
1106                throw new IllegalArgumentException("String may not be null");
1107            }
1108            final int len = s.length();
1109            for (int i = 0; i < len; i++) {
1110                if (!isASCII(s.charAt(i))) {
1111                    return false;
1112                }
1113            }
1114            return true;
1115        }
1116    
1117        /**
1118         * Returns <code>true</code> if the specified character is a whitespace
1119         * character (CR, LF, SP or HT).
1120         * 
1121         * @param ch
1122         *            character to test.
1123         * @return <code>true</code> if the specified character is a whitespace
1124         *         character, <code>false</code> otherwise.
1125         */
1126        public static boolean isWhitespace(char ch) {
1127            return ch == SP || ch == HT || ch == CR || ch == LF;
1128        }
1129    
1130        /**
1131         * Returns <code>true</code> if the specified string consists entirely of
1132         * whitespace characters.
1133         * 
1134         * @param s
1135         *            string to test.
1136         * @return <code>true</code> if the specified string consists entirely of
1137         *         whitespace characters, <code>false</code> otherwise.
1138         */
1139        public static boolean isWhitespace(final String s) {
1140            if (s == null) {
1141                throw new IllegalArgumentException("String may not be null");
1142            }
1143            final int len = s.length();
1144            for (int i = 0; i < len; i++) {
1145                if (!isWhitespace(s.charAt(i))) {
1146                    return false;
1147                }
1148            }
1149            return true;
1150        }
1151        
1152        /**
1153         * Determines if the VM supports encoding (chars to bytes) the 
1154         * specified character set. NOTE: the given character set name may 
1155         * not be known to the VM even if this method returns <code>true</code>.
1156         * Use {@link #toJavaCharset(String)} to get the canonical Java character
1157         * set name.
1158         * 
1159         * @param charsetName the characters set name.
1160         * @return <code>true</code> if encoding is supported, <code>false</code>
1161         *         otherwise.
1162         */
1163        public static boolean isEncodingSupported(String charsetName) {
1164            return encodingSupported.contains(charsetName.toLowerCase());
1165        }
1166        
1167        /**
1168         * Determines if the VM supports decoding (bytes to chars) the 
1169         * specified character set. NOTE: the given character set name may 
1170         * not be known to the VM even if this method returns <code>true</code>.
1171         * Use {@link #toJavaCharset(String)} to get the canonical Java character
1172         * set name.
1173         * 
1174         * @param charsetName the characters set name.
1175         * @return <code>true</code> if decoding is supported, <code>false</code>
1176         *         otherwise.
1177         */
1178        public static boolean isDecodingSupported(String charsetName) {
1179            return decodingSupported.contains(charsetName.toLowerCase());
1180        }
1181        
1182        /**
1183         * Gets the preferred MIME character set name for the specified
1184         * character set or <code>null</code> if not known.
1185         * 
1186         * @param charsetName the character set name to look for.
1187         * @return the MIME preferred name or <code>null</code> if not known.
1188         */
1189        public static String toMimeCharset(String charsetName) {
1190            Charset c = charsetMap.get(charsetName.toLowerCase());
1191            if (c != null) {
1192                return c.mime;
1193            }
1194            return null;
1195        }
1196        
1197        /**
1198         * Gets the canonical Java character set name for the specified
1199         * character set or <code>null</code> if not known. This should be
1200         * called before doing any conversions using the Java API. NOTE:
1201         * you must use {@link #isEncodingSupported(String)} or
1202         * {@link #isDecodingSupported(String)} to make sure the returned
1203         * Java character set is supported by the current VM.
1204         * 
1205         * @param charsetName the character set name to look for.
1206         * @return the canonical Java name or <code>null</code> if not known.
1207         */
1208        public static String toJavaCharset(String charsetName) {
1209            Charset c = charsetMap.get(charsetName.toLowerCase());
1210            if (c != null) {
1211                return c.canonical;
1212            }
1213            return null;
1214        }
1215    
1216        public static java.nio.charset.Charset getCharset(String charsetName) {
1217            String defaultCharset = "ISO-8859-1";
1218            
1219            // Use the default chareset if given charset is null
1220            if(charsetName == null) charsetName = defaultCharset;
1221                
1222            try {
1223                return java.nio.charset.Charset.forName(charsetName);
1224            } catch (IllegalCharsetNameException e) {
1225                log.info("Illegal charset " + charsetName + ", fallback to " + defaultCharset + ": " + e);
1226                // Use default charset on exception 
1227                return java.nio.charset.Charset.forName(defaultCharset);
1228            } catch (UnsupportedCharsetException ex) {
1229                log.info("Unsupported charset " + charsetName + ", fallback to " + defaultCharset + ": " + ex);
1230                // Use default charset on exception
1231                return java.nio.charset.Charset.forName(defaultCharset);
1232            }
1233            
1234        }
1235        /*
1236         * Uncomment the code below and run the main method to regenerate the
1237         * Javadoc table above when the known charsets change. 
1238         */
1239        
1240        /*
1241        private static String dumpHtmlTable() {
1242            List<Charset> l = new LinkedList<Charset>(Arrays.asList(JAVA_CHARSETS));
1243            Collections.sort(l);
1244            StringBuilder sb = new StringBuilder();
1245            sb.append(" * <table>\n");
1246            sb.append(" *     <tr>\n");
1247            sb.append(" *         <td>Canonical (Java) name</td>\n");
1248            sb.append(" *         <td>MIME preferred</td>\n");
1249            sb.append(" *         <td>Aliases</td>\n");
1250            sb.append(" *     </tr>\n");
1251    
1252            for (Charset c : l) {
1253                sb.append(" *     <tr>\n");
1254                sb.append(" *         <td>" + c.canonical + "</td>\n");
1255                sb.append(" *         <td>" + (c.mime == null ? "?" : c.mime)+ "</td>\n");
1256                sb.append(" *         <td>");
1257                for (int i = 0; c.aliases != null && i < c.aliases.length; i++) {
1258                    sb.append(c.aliases[i] + " ");
1259                }
1260                sb.append("</td>\n");
1261                sb.append(" *     </tr>\n");
1262            }
1263            sb.append(" * </table>\n");
1264            return sb.toString();
1265        }
1266        
1267        public static void main(String[] args) {
1268            System.out.println(dumpHtmlTable());
1269        }
1270        */
1271    }