* auto-detect encoding if necessary
This commit is contained in:
parent
5b59ad3ad1
commit
545224396a
@ -5,11 +5,14 @@ import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
|
||||
public class UnicodeReader extends Reader {
|
||||
|
||||
private static final int BOM_SIZE = 4;
|
||||
|
||||
private InputStreamReader reader = null;
|
||||
private final Reader reader;
|
||||
|
||||
public UnicodeReader(InputStream stream) throws IOException {
|
||||
if (!stream.markSupported())
|
||||
@ -19,31 +22,38 @@ public class UnicodeReader extends Reader {
|
||||
byte bom[] = new byte[BOM_SIZE];
|
||||
stream.read(bom, 0, bom.length);
|
||||
|
||||
Charset charset = StandardCharsets.UTF_8;
|
||||
Charset bomEncoding = null;
|
||||
int skip = 0;
|
||||
|
||||
if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
|
||||
charset = StandardCharsets.UTF_8;
|
||||
bomEncoding = StandardCharsets.UTF_8;
|
||||
skip = 3;
|
||||
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
|
||||
charset = StandardCharsets.UTF_16BE;
|
||||
bomEncoding = StandardCharsets.UTF_16BE;
|
||||
skip = 2;
|
||||
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
|
||||
charset = StandardCharsets.UTF_16LE;
|
||||
bomEncoding = StandardCharsets.UTF_16LE;
|
||||
skip = 2;
|
||||
} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
|
||||
charset = Charset.forName("UTF-32BE");
|
||||
bomEncoding = Charset.forName("UTF-32BE");
|
||||
skip = 4;
|
||||
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
|
||||
charset = Charset.forName("UTF-32LE");
|
||||
bomEncoding = Charset.forName("UTF-32LE");
|
||||
skip = 4;
|
||||
}
|
||||
|
||||
// rewind and skip BOM
|
||||
stream.reset();
|
||||
stream.skip(skip);
|
||||
|
||||
// initialize reader
|
||||
reader = new InputStreamReader(stream, charset);
|
||||
// guess character encoding if necessary
|
||||
if (bomEncoding == null) {
|
||||
// auto-detect encoding
|
||||
reader = new CharsetDetector().getReader(stream, "UTF-8");
|
||||
} else {
|
||||
// initialize reader via BOM
|
||||
reader = new InputStreamReader(stream, bomEncoding);
|
||||
}
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
@ -62,10 +72,6 @@ public class UnicodeReader extends Reader {
|
||||
return reader.read(cbuf);
|
||||
}
|
||||
|
||||
public String getEncoding() {
|
||||
return reader.getEncoding();
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
return reader.read();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user