J2U is a little
Java tool for converting
text in almost any encoding
to Unicode character entities. See
Using Unicode on E2 for why you might want to do this.
In order to use J2U, you will have to cut'n'paste the source code
below into a file called "j2u.java" and compile it with a Java compiler like javac.
An applet version is in the works.
J2U's default encoding is EUC-JP, the Unix standard for Japanese.
Note that J2U handles almost any 8-bit encoding (like EUC
and Shift-JIS), but cannot deal with purely 7-bit encodings like
JIS since it can't tell where the "special" characters start.
/*
* j2u.java by gn0sis
* syntax: java j2u [input-encoding] <input >output
* converts random local encoding (default EUC-JP) to
* HTML Unicode character entities
*/
import java.io.*;
/**
 * Command-line filter that reads text from standard input in a given
 * character encoding and writes it to standard output with every character
 * outside the Basic Latin range (U+0000..U+007F) replaced by an HTML
 * hexadecimal numeric character reference, e.g. {@code &#x3042;}.
 *
 * <p>Usage: {@code java j2u [input-encoding] <input >output}
 */
public class j2u {

    /** Encoding assumed for the input when none is given on the command line. */
    private static final String DEFAULT_ENCODING = "EUC-JP";

    /** Lower-case hexadecimal digits, indexed by nibble value. */
    private static final char[] HEX_DIGITS = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
    };

    /**
     * Converts standard input to standard output line by line.
     *
     * @param args optional; {@code args[0]} is the name of the input encoding
     *             (defaults to EUC-JP). Extra arguments are ignored.
     */
    public static void main(String args[]) {
        // The encoding is the FIRST argument (index 0), matching the
        // documented syntax "java j2u [input-encoding]".
        String encoding = DEFAULT_ENCODING;
        if (args.length > 0) {
            encoding = args[0];
        }
        try {
            BufferedReader in = new BufferedReader(
                new InputStreamReader(System.in, encoding));
            String buf;
            // readLine() strips the line terminator; println() writes the
            // platform line separator back, so every input line (including a
            // final unterminated one) produces exactly one output line.
            while ((buf = in.readLine()) != null) {
                System.out.println(toEntities(buf));
            }
        } catch (UnsupportedEncodingException uee) {
            // Must be caught before IOException, of which it is a subclass.
            System.err.println("j2u: unsupported encoding: " + encoding);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Returns {@code line} with every code point above U+007F replaced by
     * a hexadecimal numeric character reference.
     */
    private static String toEntities(String line) {
        StringBuilder out = new StringBuilder(line.length());
        int i = 0;
        while (i < line.length()) {
            // Work on whole code points so that a surrogate pair becomes ONE
            // reference instead of two surrogate references.
            int cp = line.codePointAt(i);
            i += Character.charCount(cp);
            if (cp < 0x80) {
                // Basic Latin passes through untouched.
                out.append((char) cp);
            } else if (cp <= 0xffff) {
                // BMP characters keep the fixed four-digit form.
                out.append("&#x")
                   .append(byteToHex((byte) (cp >>> 8)))
                   .append(byteToHex((byte) (cp & 0xff)))
                   .append(';');
            } else {
                // Supplementary characters need five or six hex digits.
                out.append("&#x").append(Integer.toHexString(cp)).append(';');
            }
        }
        return out.toString();
    }

    /**
     * Returns the two-digit, lower-case hexadecimal representation of a byte.
     *
     * @param b the byte to format; treated as unsigned (0x00..0xff)
     * @return a string of exactly two hex digits, high nibble first
     */
    static public String byteToHex(byte b) {
        // Masking after the (sign-extending) shift keeps only the nibble.
        char[] array = { HEX_DIGITS[(b >> 4) & 0x0f], HEX_DIGITS[b & 0x0f] };
        return new String(array);
    }
}