* fix Unicode BOM issues
This commit is contained in:
parent
1c99bd44a2
commit
5b59ad3ad1
@ -10,7 +10,7 @@ import static net.filebot.util.FileUtilities.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
@ -39,6 +39,8 @@ import net.filebot.similarity.MetricCascade;
|
||||
import net.filebot.similarity.NameSimilarityMetric;
|
||||
import net.filebot.similarity.SequenceMatchSimilarity;
|
||||
import net.filebot.similarity.SimilarityMetric;
|
||||
import net.filebot.util.ByteBufferInputStream;
|
||||
import net.filebot.util.UnicodeReader;
|
||||
import net.filebot.vfs.ArchiveType;
|
||||
import net.filebot.vfs.MemoryFile;
|
||||
import net.filebot.web.Movie;
|
||||
@ -270,13 +272,13 @@ public final class SubtitleUtilities {
|
||||
likelyFormats.addLast(format);
|
||||
}
|
||||
|
||||
// decode bytes
|
||||
String textfile = getText(file.getData());
|
||||
// decode bytes and beware of byte-order marks
|
||||
Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData()));
|
||||
|
||||
// decode subtitle file with the first reader that seems to work
|
||||
for (SubtitleFormat format : likelyFormats) {
|
||||
// reset reader to position 0
|
||||
SubtitleReader parser = format.newReader(new StringReader(textfile));
|
||||
SubtitleReader parser = format.newReader(reader);
|
||||
|
||||
if (parser.hasNext()) {
|
||||
// correct format found
|
||||
|
101
source/net/filebot/util/UnicodeReader.java
Normal file
101
source/net/filebot/util/UnicodeReader.java
Normal file
@ -0,0 +1,101 @@
|
||||
package net.filebot.util;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
/**
 * A {@link Reader} that decodes a byte stream as Unicode text and honours a
 * leading byte-order mark (BOM).
 *
 * <p>
 * The first bytes of the stream are inspected for a UTF-8, UTF-16 (BE/LE) or
 * UTF-32 (BE/LE) byte-order mark. When one is found, the matching charset is
 * used and the mark itself is consumed so that it never appears in the decoded
 * text. When no mark is found, the stream is decoded as UTF-8 from its first
 * byte.
 *
 * <p>
 * All {@code Reader} operations are delegated to an internal
 * {@link InputStreamReader}. Object identity is used for {@code equals} and
 * {@code hashCode}.
 */
public class UnicodeReader extends Reader {

    /** Length in bytes of the longest byte-order mark that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    private final InputStreamReader reader;

    /**
     * Creates a reader on top of the given stream, selecting the charset from
     * the byte-order mark at the current stream position, if there is one.
     *
     * @param stream the bytes to decode; must support {@link InputStream#mark(int)}
     * @throws IllegalArgumentException if the stream does not support mark
     * @throws IOException if the head of the stream cannot be read or skipped
     */
    public UnicodeReader(InputStream stream) throws IOException {
        if (!stream.markSupported()) {
            throw new IllegalArgumentException("stream must support mark");
        }

        // peek at the head of the stream; a single read() call may deliver fewer
        // bytes than requested, so keep reading until the buffer is full or EOF
        stream.mark(BOM_SIZE);
        byte[] bom = new byte[BOM_SIZE];
        int length = 0;
        while (length < bom.length) {
            int count = stream.read(bom, length, bom.length - length);
            if (count <= 0) {
                break;
            }
            length += count;
        }

        Charset charset = StandardCharsets.UTF_8;
        int skip = 0;

        // the UTF-32LE mark (FF FE 00 00) starts with the UTF-16LE mark (FF FE),
        // so the 4-byte marks must be tested before the 2-byte marks
        if (startsWith(bom, length, 0x00, 0x00, 0xFE, 0xFF)) {
            charset = Charset.forName("UTF-32BE");
            skip = 4;
        } else if (startsWith(bom, length, 0xFF, 0xFE, 0x00, 0x00)) {
            charset = Charset.forName("UTF-32LE");
            skip = 4;
        } else if (startsWith(bom, length, 0xEF, 0xBB, 0xBF)) {
            charset = StandardCharsets.UTF_8;
            skip = 3;
        } else if (startsWith(bom, length, 0xFE, 0xFF)) {
            charset = StandardCharsets.UTF_16BE;
            skip = 2;
        } else if (startsWith(bom, length, 0xFF, 0xFE)) {
            charset = StandardCharsets.UTF_16LE;
            skip = 2;
        }

        // rewind to where we started and step over the byte-order mark only
        stream.reset();
        skipFully(stream, skip);

        // initialize reader
        reader = new InputStreamReader(stream, charset);
    }

    /**
     * Tells whether the first {@code length} valid bytes of {@code data} begin
     * with the given byte values. Bytes beyond {@code length} are never looked at.
     */
    private static boolean startsWith(byte[] data, int length, int... prefix) {
        if (length < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (data[i] != (byte) prefix[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Skips exactly {@code count} bytes. {@link InputStream#skip(long)} is allowed
     * to skip fewer bytes than requested, so fall back to single-byte reads.
     */
    private static void skipFully(InputStream stream, int count) throws IOException {
        int remaining = count;
        while (remaining > 0) {
            long skipped = stream.skip(remaining);
            if (skipped > 0) {
                remaining -= (int) skipped;
            } else if (stream.read() >= 0) {
                remaining--;
            } else {
                throw new IOException("unexpected end of stream while skipping byte-order mark");
            }
        }
    }

    /**
     * Returns the name of the character encoding used to decode the stream, as
     * reported by the underlying {@link InputStreamReader}.
     */
    public String getEncoding() {
        return reader.getEncoding();
    }

    @Override
    public int read(CharBuffer target) throws IOException {
        return reader.read(target);
    }

    @Override
    public int read(char[] cbuf) throws IOException {
        return reader.read(cbuf);
    }

    @Override
    public int read() throws IOException {
        return reader.read();
    }

    @Override
    public int read(char[] cbuf, int offset, int length) throws IOException {
        return reader.read(cbuf, offset, length);
    }

    @Override
    public long skip(long n) throws IOException {
        return reader.skip(n);
    }

    @Override
    public boolean ready() throws IOException {
        return reader.ready();
    }

    @Override
    public void close() throws IOException {
        reader.close();
    }

    @Override
    public boolean markSupported() {
        return reader.markSupported();
    }

    @Override
    public void mark(int readAheadLimit) throws IOException {
        reader.mark(readAheadLimit);
    }

    @Override
    public void reset() throws IOException {
        reader.reset();
    }

}
|
Loading…
Reference in New Issue
Block a user