J2U is a little Java tool for converting text in almost any encoding to Unicode character entities. See Using Unicode on E2 for why you might want to do this.

In order to use J2U, you will have to cut'n'paste the source code below into a file called "j2u.java" and compile it with a Java compiler like javac. An applet version is in the works.

J2U's default encoding is EUC-JP, the Unix standard for Japanese. Note that J2U handles almost any 8-bit encoding (like EUC and Shift-JIS), but cannot deal with purely 7-bit encodings like JIS since it can't tell where the "special" characters start.


/*
 * j2u.java by gn0sis
 * syntax: java j2u [input-encoding] <input >output
 * converts random local encoding (default EUC-JP) to
 * HTML Unicode character entities
 */
import java.io.*;

/**
 * Command-line filter that reads text from standard input in a given
 * character encoding and writes it to standard output with every character
 * outside the Basic Latin block replaced by an HTML hexadecimal character
 * reference ({@code &#x....;}).
 *
 * <p>Usage: {@code java j2u [input-encoding] <input >output}
 */
public class j2u {
  /** Encoding assumed when no command-line argument is supplied. */
  private static final String DEFAULT_ENCODING = "EUC-JP";

  /** Lookup table mapping a 4-bit value to its lowercase hex digit. */
  private static final char[] HEX_DIGITS = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
  };

  /**
   * Program entry point.
   *
   * @param args optional; {@code args[0]} names the input encoding
   *             (defaults to EUC-JP when absent)
   */
  public static void main(String args[]) {
    try {
      String encoding = DEFAULT_ENCODING;
      // The encoding is the FIRST argument, as the usage line documents.
      if (args.length > 0) {
        encoding = args[0];
      }
      BufferedReader in = new BufferedReader
        (new InputStreamReader(System.in, encoding));
      String buf;
      while ((buf = in.readLine()) != null) {
        System.out.println(toEntities(buf));
      }
    } catch (IOException ioe) {
      // Also covers UnsupportedEncodingException for an unknown encoding name.
      ioe.printStackTrace();
    }
  }

  /**
   * Converts one line of text, leaving Basic Latin characters untouched and
   * replacing everything else with a hexadecimal character reference.
   *
   * @param line the decoded input line (without line terminator)
   * @return the line with non-Basic-Latin code points written as entities
   */
  private static String toEntities(String line) {
    StringBuilder out = new StringBuilder(line.length());
    int i = 0;
    while (i < line.length()) {
      // Walk by code point so a surrogate pair yields ONE entity rather
      // than two (invalid) surrogate references.
      int cp = line.codePointAt(i);
      i += Character.charCount(cp);
      if (Character.UnicodeBlock.of(cp) ==
          Character.UnicodeBlock.BASIC_LATIN) {
        out.append((char) cp);
      } else if (cp <= 0xffff) {
        // BMP characters keep the fixed four-digit form.
        out.append("&#x")
           .append(byteToHex((byte) (cp >>> 8)))
           .append(byteToHex((byte) (cp & 0xff)))
           .append(';');
      } else {
        // Supplementary characters need five or six hex digits.
        out.append("&#x").append(Integer.toHexString(cp)).append(';');
      }
    }
    return out.toString();
  }

  /**
   * Returns the two-digit lowercase hexadecimal representation of a byte.
   *
   * @param b the byte to format
   * @return a two-character String, high nibble first
   */
  static public String byteToHex(byte b) {
    char[] array = { HEX_DIGITS[(b >> 4) & 0x0f], HEX_DIGITS[b & 0x0f] };
    return new String(array);
  }
}

Log in or register to write something here or to contact authors.