天天看點

使用Java将中文字元轉換成Unicode編碼

這兩天操作xml使用到了jdom,在建立xml檔案并輸出到硬碟的時候遇到一個中文編碼的問題:jdom預設輸出的xml編碼是utf-8,但是文檔中如果出現中文字元那麼該中文字元就會變成亂碼,造成xml檔案無法被正确解析。

UTF-8應該是可以用來表示中文的吧?我不知道這是不是jdom的一個bug(jdom 1.0,beta了10次的産物哦!)。我google了一下,大家解決這個問題的辦法無非是把jdom的輸出字元集改為GBK或者GB2312,但是這樣就會有一些副作用:如果在沒有特定字元集(GBK或者GB2312)的作業系統上,不是依然不能正确解析嗎?一個比較好的解決辦法是先把中文轉換成Unicode編碼再直接輸出,程式解析xml的時候再把Unicode編碼轉回中文就沒有問題了。

于是我檢視了JDK的文檔,截至Java 5好像都沒有做類似轉換的類可以直接使用,但是我發現一個類 java.util.Properties,它的源代碼裡有兩個私有(private)方法 loadConvert(char[] in, int off, int len, char[] convtBuf) 和 saveConvert(String theString, boolean escapeSpace),其實就是做特殊字元和Unicode編碼字元間轉換的,我把它們提取出來,單獨包裝到一個類裡就可以使用了。

下面是我包裝的類 charactersettoolkit

/*

* charactersettoolkit.java

*

* created on 2006年10月27日, 下午2:06

* to change this template, choose tools | template manager

* and open the template in the editor.

*/

package mobi.chenwei.lang;

/**
 * Utility class for character-level string conversion.
 *
 * <p>It escapes text into a pure-ASCII form, in which every character outside
 * the printable ASCII range is written as a backslash, the letter {@code u}
 * and four hexadecimal digits, and it decodes such text back again.
 *
 * @author chen wei ([email protected])
 */
public class charactersettoolkit {

    /** Lookup table mapping a 4-bit value to its lowercase hexadecimal digit. */
    private static final char[] HEX_DIGITS = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
    };

    /** Message used for every malformed hexadecimal escape. */
    private static final String MALFORMED_ESCAPE = "malformed \\uxxxx encoding.";

    /** Creates a new instance; only {@link #fromunicode} needs one. */
    public charactersettoolkit() {
    }

    /**
     * Converts the low four bits of {@code nibble} to a hexadecimal digit.
     *
     * @param nibble value whose low four bits are converted
     * @return the matching character from {@code 0-9a-f}
     */
    private static char toHex(int nibble) {
        return HEX_DIGITS[nibble & 0xf];
    }

    /**
     * Returns the numeric value of a single hexadecimal digit.
     *
     * @param digit a character in {@code 0-9}, {@code a-f} or {@code A-F}
     * @return the value of the digit, 0 to 15
     * @throws IllegalArgumentException if {@code digit} is not a hexadecimal digit
     */
    private static int hexValue(char digit) {
        if (digit >= '0' && digit <= '9') {
            return digit - '0';
        }
        if (digit >= 'a' && digit <= 'f') {
            return 10 + digit - 'a';
        }
        if (digit >= 'A' && digit <= 'F') {
            return 10 + digit - 'A';
        }
        throw new IllegalArgumentException(MALFORMED_ESCAPE);
    }

    /**
     * Encodes a string into its escaped, ASCII-only form.
     *
     * <p>Backslash, tab, newline, carriage return and form feed are written as
     * two-character backslash escapes; {@code = : # !} are prefixed with a
     * backslash; every character below 0x20 or above 0x7e is written as a
     * backslash, {@code u} and four lowercase hexadecimal digits.
     *
     * @param thestring the string to encode; must not be {@code null}
     * @param escapespace if {@code true} every space is prefixed with a
     *        backslash; if {@code false} only a space at index 0 is
     * @return the encoded string
     */
    public static String tounicode(String thestring, boolean escapespace) {
        int len = thestring.length();
        // Escapes lengthen the text, so start with twice the input length,
        // falling back to the plain length if doubling would overflow int.
        int capacity = (len <= Integer.MAX_VALUE / 2) ? len * 2 : len;
        StringBuilder out = new StringBuilder(capacity);

        for (int i = 0; i < len; i++) {
            char c = thestring.charAt(i);

            // Fast path: everything after '=' (61) and before DEL (127) is
            // copied verbatim, except the backslash which must be doubled.
            if (c > 61 && c < 127) {
                if (c == '\\') {
                    out.append('\\');
                }
                out.append(c);
                continue;
            }

            switch (c) {
                case ' ':
                    if (i == 0 || escapespace) {
                        out.append('\\');
                    }
                    out.append(' ');
                    break;
                case '\t':
                    out.append('\\').append('t');
                    break;
                case '\n':
                    out.append('\\').append('n');
                    break;
                case '\r':
                    out.append('\\').append('r');
                    break;
                case '\f':
                    out.append('\\').append('f');
                    break;
                case '=':
                case ':':
                case '#':
                case '!':
                    out.append('\\').append(c);
                    break;
                default:
                    if (c < 0x0020 || c > 0x007e) {
                        // The leading backslash is what lets fromunicode
                        // recognise the escape again.
                        out.append('\\').append('u');
                        out.append(toHex((c >> 12) & 0xf));
                        out.append(toHex((c >> 8) & 0xf));
                        out.append(toHex((c >> 4) & 0xf));
                        out.append(toHex(c & 0xf));
                    } else {
                        out.append(c);
                    }
                    break;
            }
        }
        return out.toString();
    }

    /**
     * Decodes escaped text, as produced by {@link #tounicode}, back into the
     * original characters.
     *
     * <p>A backslash followed by {@code u} and four hexadecimal digits (either
     * case) becomes the character with that code; a backslash followed by
     * {@code t}, {@code r}, {@code n} or {@code f} becomes the matching control
     * character; a backslash followed by any other character yields that
     * character.
     *
     * @param in the characters holding the escaped text
     * @param off index in {@code in} of the first character to decode
     * @param len number of characters to decode
     * @param convtbuf scratch buffer used for the output when it holds at
     *        least {@code len} characters; otherwise (or when {@code null}) a
     *        new buffer is allocated
     * @return the decoded string
     * @throws IllegalArgumentException if a hexadecimal escape is truncated or
     *         contains a non-hexadecimal digit, or if the range ends with a
     *         lone backslash
     */
    public String fromunicode(char[] in, int off, int len, char[] convtbuf) {
        // Decoding never lengthens the text, so len characters always suffice.
        char[] out = (convtbuf != null && convtbuf.length >= len)
                ? convtbuf
                : new char[Math.max(len, 0)];
        int outLen = 0;
        int end = off + len;

        while (off < end) {
            char c = in[off++];
            if (c != '\\') {
                out[outLen++] = c;
                continue;
            }

            if (off >= end) {
                throw new IllegalArgumentException("dangling backslash at end of input");
            }
            c = in[off++];

            if (c == 'u') {
                // All four hex digits must lie inside the requested range.
                if (end - off < 4) {
                    throw new IllegalArgumentException(MALFORMED_ESCAPE);
                }
                int value = 0;
                for (int i = 0; i < 4; i++) {
                    value = (value << 4) + hexValue(in[off++]);
                }
                out[outLen++] = (char) value;
            } else {
                switch (c) {
                    case 't':
                        c = '\t';
                        break;
                    case 'r':
                        c = '\r';
                        break;
                    case 'n':
                        c = '\n';
                        break;
                    case 'f':
                        c = '\f';
                        break;
                    default:
                        // Any other escaped character stands for itself.
                        break;
                }
                out[outLen++] = c;
            }
        }
        return new String(out, 0, outLen);
    }
}