Refactor UnicodeReader and BOM detection
This commit is contained in:
parent
53376c3de6
commit
8932eb0b2a
|
@ -9,7 +9,7 @@
|
|||
<classpathentry kind="lib" path="lib/jars/xmlrpc.jar"/>
|
||||
<classpathentry kind="lib" path="lib/ivy/jar/ehcache.jar" sourcepath="lib/ivy/source/ehcache.jar"/>
|
||||
<classpathentry kind="lib" path="lib/ivy/jar/glazedlists_java15.jar" sourcepath="lib/ivy/source/glazedlists_java15.jar"/>
|
||||
<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar"/>
|
||||
<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar" sourcepath="lib/ivy/source/icu4j.jar"/>
|
||||
<classpathentry kind="lib" path="lib/ivy/jar/jna.jar" sourcepath="lib/ivy/source/jna.jar"/>
|
||||
<classpathentry kind="lib" path="lib/ivy/jar/junit.jar"/>
|
||||
<classpathentry kind="lib" path="lib/ivy/jar/miglayout-core.jar"/>
|
||||
|
|
|
@ -1219,7 +1219,7 @@ public class MediaDetection {
|
|||
// parse ids from nfo files
|
||||
for (File nfo : nfoFiles) {
|
||||
try {
|
||||
String text = new String(readFile(nfo), "UTF-8");
|
||||
String text = readTextFile(nfo);
|
||||
collection.addAll(grepImdbId(text));
|
||||
} catch (Exception e) {
|
||||
debug.warning("Failed to read nfo: " + e.getMessage());
|
||||
|
@ -1246,7 +1246,7 @@ public class MediaDetection {
|
|||
continue;
|
||||
|
||||
for (File nfo : getChildren(folder, NFO_FILES)) {
|
||||
String text = new String(readFile(nfo), "UTF-8");
|
||||
String text = readTextFile(nfo);
|
||||
|
||||
for (int imdbid : grepImdbId(text)) {
|
||||
SearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language);
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package net.filebot.subtitle;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.*;
|
||||
import static java.util.Collections.*;
|
||||
import static java.util.stream.Collectors.*;
|
||||
import static net.filebot.Logging.*;
|
||||
|
@ -10,11 +11,10 @@ import static net.filebot.util.FileUtilities.*;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Reader;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
|
@ -33,6 +33,8 @@ import java.util.function.Predicate;
|
|||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import com.optimaize.langdetect.DetectedLanguage;
|
||||
import com.optimaize.langdetect.LanguageDetector;
|
||||
import com.optimaize.langdetect.LanguageDetectorBuilder;
|
||||
|
@ -53,7 +55,7 @@ import net.filebot.similarity.SequenceMatchSimilarity;
|
|||
import net.filebot.similarity.SimilarityComparator;
|
||||
import net.filebot.similarity.SimilarityMetric;
|
||||
import net.filebot.util.ByteBufferInputStream;
|
||||
import net.filebot.util.UnicodeReader;
|
||||
import net.filebot.util.ByteBufferOutputStream;
|
||||
import net.filebot.vfs.ArchiveType;
|
||||
import net.filebot.vfs.MemoryFile;
|
||||
import net.filebot.web.Movie;
|
||||
|
@ -325,7 +327,7 @@ public final class SubtitleUtilities {
|
|||
// decode subtitle file with the first reader that seems to work
|
||||
for (SubtitleFormat format : likelyFormats) {
|
||||
// decode bytes and beware of byte-order marks
|
||||
Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData()), true, StandardCharsets.UTF_8);
|
||||
Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);
|
||||
|
||||
// reset reader to position 0
|
||||
SubtitleReader parser = format.newReader(reader);
|
||||
|
@ -347,29 +349,31 @@ public final class SubtitleUtilities {
|
|||
throw new IOException("Subtitle format not supported");
|
||||
}
|
||||
|
||||
public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
|
||||
public static ByteBuffer exportSubtitles(MemoryFile file, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
|
||||
if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) {
|
||||
throw new IllegalArgumentException("Format not supported");
|
||||
}
|
||||
|
||||
// convert to target format and target encoding
|
||||
ByteBufferOutputStream buffer = new ByteBufferOutputStream(file.size());
|
||||
OutputStreamWriter writer = new OutputStreamWriter(buffer, outputEncoding);
|
||||
|
||||
if (outputFormat == SubtitleFormat.SubRip) {
|
||||
// output buffer
|
||||
StringBuilder buffer = new StringBuilder(4 * 1024);
|
||||
try (SubRipWriter out = new SubRipWriter(buffer)) {
|
||||
for (SubtitleElement it : decodeSubtitles(data)) {
|
||||
// convert to target format and target encoding
|
||||
try (SubRipWriter out = new SubRipWriter(writer)) {
|
||||
for (SubtitleElement it : decodeSubtitles(file)) {
|
||||
if (outputTimingOffset != 0) {
|
||||
it = new SubtitleElement(Math.max(0, it.getStart() + outputTimingOffset), Math.max(0, it.getEnd() + outputTimingOffset), it.getText());
|
||||
}
|
||||
out.write(it);
|
||||
}
|
||||
}
|
||||
|
||||
return outputEncoding.encode(CharBuffer.wrap(buffer));
|
||||
} else {
|
||||
// convert only text encoding
|
||||
Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);
|
||||
IOUtils.copy(reader, writer);
|
||||
}
|
||||
|
||||
// only change encoding
|
||||
return outputEncoding.encode(getText(data.getData()));
|
||||
return buffer.getByteBuffer();
|
||||
}
|
||||
|
||||
public static SubtitleFormat getSubtitleFormat(File file) {
|
||||
|
|
|
@ -3,6 +3,7 @@ package net.filebot.ui.subtitle.upload;
|
|||
import static java.util.Collections.*;
|
||||
import static net.filebot.Logging.*;
|
||||
import static net.filebot.media.MediaDetection.*;
|
||||
import static net.filebot.util.FileUtilities.*;
|
||||
import static net.filebot.util.ui.SwingUI.*;
|
||||
|
||||
import java.awt.Color;
|
||||
|
@ -32,7 +33,6 @@ import net.filebot.Language;
|
|||
import net.filebot.ResourceManager;
|
||||
import net.filebot.WebServices;
|
||||
import net.filebot.media.MediaDetection;
|
||||
import net.filebot.util.FileUtilities;
|
||||
import net.filebot.util.ui.EmptySelectionModel;
|
||||
import net.filebot.web.Movie;
|
||||
import net.filebot.web.OpenSubtitlesClient;
|
||||
|
@ -151,7 +151,7 @@ public class SubtitleUploadDialog extends JDialog {
|
|||
if (mapping.getLanguage() == null) {
|
||||
mapping.setState(Status.Identifying);
|
||||
try {
|
||||
Locale locale = database.detectLanguage(FileUtilities.readFile(mapping.getSubtitle()));
|
||||
Locale locale = database.detectLanguage(readFile(mapping.getSubtitle()));
|
||||
mapping.setLanguage(Language.getLanguage(locale));
|
||||
} catch (Exception e) {
|
||||
debug.log(Level.WARNING, "Failed to auto-detect language: " + e.getMessage());
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
package net.filebot.util;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
/**
 * Byte-order marks that may prefix encoded text, each paired with the charset it announces.
 */
public enum BOM {

	UTF_8((byte) 0xEF, (byte) 0xBB, (byte) 0xBF),

	UTF_16BE((byte) 0xFE, (byte) 0xFF),

	UTF_16LE((byte) 0xFF, (byte) 0xFE),

	UTF_32BE((byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF),

	UTF_32LE((byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00),

	GB_18030((byte) 0x84, (byte) 0x31, (byte) 0x95, (byte) 0x33);

	/** Number of bytes in the longest mark declared above. */
	public static final int SIZE = 4;

	private final byte[] bom;

	BOM(byte... bom) {
		this.bom = bom;
	}

	/**
	 * @return length of this mark in bytes
	 */
	public int size() {
		return bom.length;
	}

	/**
	 * Checks whether the given data starts with this mark.
	 *
	 * @param bytes
	 *            leading bytes of the data; may be {@code null} or shorter than the mark
	 * @return {@code true} if {@code bytes} begins with exactly the bytes of this mark
	 */
	public boolean matches(byte[] bytes) {
		if (bytes == null || bytes.length < bom.length) {
			return false;
		}

		for (int i = 0; i < bom.length; i++) {
			if (bom[i] != bytes[i]) {
				return false;
			}
		}

		return true;
	}

	/**
	 * @return the charset announced by this mark
	 */
	public Charset getCharset() {
		switch (this) {
		case UTF_8:
			return StandardCharsets.UTF_8;
		case UTF_16BE:
			return StandardCharsets.UTF_16BE;
		case UTF_16LE:
			return StandardCharsets.UTF_16LE;
		case UTF_32BE:
			return Charset.forName("UTF-32BE");
		case UTF_32LE:
			return Charset.forName("UTF-32LE");
		case GB_18030:
			return Charset.forName("GB18030");
		}
		return null;
	}

	/**
	 * Finds the mark that the given data starts with.
	 *
	 * The longest matching mark wins: the UTF-16LE mark (FF FE) is a prefix of the UTF-32LE mark
	 * (FF FE 00 00), so returning the first match in declaration order would never report UTF-32LE.
	 *
	 * @param bytes
	 *            leading bytes of the data; may be {@code null}
	 * @return the longest matching mark, or {@code null} if the data starts with none of them
	 */
	public static BOM detect(byte[] bytes) {
		BOM best = null;
		for (BOM candidate : values()) {
			if (candidate.matches(bytes) && (best == null || candidate.size() > best.size())) {
				best = candidate;
			}
		}
		return best;
	}

}
|
|
@ -4,23 +4,22 @@ import static java.nio.charset.StandardCharsets.*;
|
|||
import static java.util.Arrays.*;
|
||||
import static java.util.Collections.*;
|
||||
import static java.util.Comparator.*;
|
||||
import static java.util.stream.Collectors.*;
|
||||
import static net.filebot.Logging.*;
|
||||
import static net.filebot.util.RegularExpressions.*;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileFilter;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FilenameFilter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.math.BigInteger;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.AtomicMoveNotSupportedException;
|
||||
import java.nio.file.FileVisitOption;
|
||||
|
@ -48,7 +47,6 @@ import java.util.TreeMap;
|
|||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collector;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
|
@ -196,18 +194,21 @@ public final class FileUtilities {
|
|||
return Files.readAllBytes(file.toPath());
|
||||
}
|
||||
|
||||
public static <R, A> R readLines(File file, Collector<? super String, A, R> collector) throws IOException {
|
||||
try (BufferedReader reader = new BufferedReader(new UnicodeReader(new ByteArrayInputStream(readFile(file)), false, UTF_8))) {
|
||||
return reader.lines().collect(collector);
|
||||
public static String readTextFile(File file) throws IOException {
|
||||
byte[] bytes = readFile(file);
|
||||
|
||||
// check BOM
|
||||
BOM bom = BOM.detect(bytes);
|
||||
|
||||
if (bom != null) {
|
||||
return new String(bytes, bom.size(), bytes.length - bom.size(), bom.getCharset());
|
||||
} else {
|
||||
return new String(bytes, UTF_8);
|
||||
}
|
||||
}
|
||||
|
||||
public static List<String> readLines(File file) throws IOException {
|
||||
return readLines(file, toList());
|
||||
}
|
||||
|
||||
public static String readTextFile(File file) throws IOException {
|
||||
return readLines(file, joining(System.lineSeparator()));
|
||||
return asList(NEWLINE.split(readTextFile(file)));
|
||||
}
|
||||
|
||||
public static File writeFile(ByteBuffer data, File destination) throws IOException {
|
||||
|
@ -217,35 +218,37 @@ public final class FileUtilities {
|
|||
return destination;
|
||||
}
|
||||
|
||||
public static Reader createTextReader(InputStream in, boolean guess, Charset declaredEncoding) throws IOException {
|
||||
byte head[] = new byte[BOM.SIZE];
|
||||
in.mark(head.length);
|
||||
in.read(head);
|
||||
in.reset(); // rewind
|
||||
|
||||
// check BOM
|
||||
BOM bom = BOM.detect(head);
|
||||
|
||||
if (bom != null) {
|
||||
in.skip(bom.size()); // skip BOM
|
||||
return new InputStreamReader(in, bom.getCharset());
|
||||
}
|
||||
|
||||
// auto-detect character encoding
|
||||
if (guess) {
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setDeclaredEncoding(declaredEncoding.name());
|
||||
detector.setText(in);
|
||||
CharsetMatch match = detector.detect();
|
||||
if (match != null) {
|
||||
return match.getReader();
|
||||
}
|
||||
}
|
||||
|
||||
// default to declared encoding
|
||||
return new InputStreamReader(in, declaredEncoding);
|
||||
}
|
||||
|
||||
public static Reader createTextReader(File file) throws IOException {
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding
|
||||
detector.setText(new BufferedInputStream(new FileInputStream(file)));
|
||||
|
||||
CharsetMatch charset = detector.detect();
|
||||
if (charset != null)
|
||||
return charset.getReader();
|
||||
|
||||
// assume UTF-8 by default
|
||||
return new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
public static String getText(ByteBuffer data) throws IOException {
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding
|
||||
detector.setText(new ByteBufferInputStream(data));
|
||||
|
||||
CharsetMatch charset = detector.detect();
|
||||
if (charset != null) {
|
||||
try {
|
||||
return charset.getString();
|
||||
} catch (RuntimeException e) {
|
||||
throw new IOException("Failed to read text", e);
|
||||
}
|
||||
}
|
||||
|
||||
// assume UTF-8 by default
|
||||
return UTF_8.decode(data).toString();
|
||||
return createTextReader(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE), true, UTF_8);
|
||||
}
|
||||
|
||||
public static boolean equalsCaseSensitive(File a, File b) {
|
||||
|
|
|
@ -1,125 +0,0 @@
|
|||
package net.filebot.util;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
|
||||
public class UnicodeReader extends Reader {
|
||||
|
||||
private static final int BOM_SIZE = 4;
|
||||
|
||||
private final Reader reader;
|
||||
|
||||
public UnicodeReader(InputStream stream, boolean guessCharset, Charset defaultCharset) throws IOException {
|
||||
if (!stream.markSupported()) {
|
||||
throw new IllegalArgumentException("stream must support mark");
|
||||
}
|
||||
|
||||
stream.mark(BOM_SIZE);
|
||||
byte bom[] = new byte[BOM_SIZE];
|
||||
stream.read(bom, 0, bom.length);
|
||||
|
||||
Charset bomEncoding = null;
|
||||
int skip = 0;
|
||||
|
||||
if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
|
||||
bomEncoding = StandardCharsets.UTF_8;
|
||||
skip = 3;
|
||||
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
|
||||
bomEncoding = StandardCharsets.UTF_16BE;
|
||||
skip = 2;
|
||||
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
|
||||
bomEncoding = StandardCharsets.UTF_16LE;
|
||||
skip = 2;
|
||||
} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
|
||||
bomEncoding = Charset.forName("UTF-32BE");
|
||||
skip = 4;
|
||||
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
|
||||
bomEncoding = Charset.forName("UTF-32LE");
|
||||
skip = 4;
|
||||
}
|
||||
|
||||
// rewind and skip BOM
|
||||
stream.reset();
|
||||
stream.skip(skip);
|
||||
|
||||
// guess character encoding if necessary
|
||||
if (bomEncoding != null) {
|
||||
// initialize reader via BOM
|
||||
reader = new InputStreamReader(stream, bomEncoding);
|
||||
} else if (bomEncoding == null && guessCharset) {
|
||||
// auto-detect encoding
|
||||
reader = new CharsetDetector().getReader(stream, defaultCharset.name());
|
||||
} else {
|
||||
// use default
|
||||
reader = new InputStreamReader(stream, defaultCharset);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return reader.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(CharBuffer target) throws IOException {
|
||||
return reader.read(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return reader.equals(obj);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf) throws IOException {
|
||||
return reader.read(cbuf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
return reader.read();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int offset, int length) throws IOException {
|
||||
return reader.read(cbuf, offset, length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long skip(long n) throws IOException {
|
||||
return reader.skip(n);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean ready() throws IOException {
|
||||
return reader.ready();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
reader.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean markSupported() {
|
||||
return reader.markSupported();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mark(int readAheadLimit) throws IOException {
|
||||
reader.mark(readAheadLimit);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
reader.reset();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,39 +1,36 @@
|
|||
|
||||
package net.filebot.vfs;
|
||||
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
|
||||
public class MemoryFile {
|
||||
|
||||
private final String path;
|
||||
|
||||
private final ByteBuffer data;
|
||||
|
||||
|
||||
public MemoryFile(String path, ByteBuffer data) {
|
||||
// normalize folder separator
|
||||
this.path = path.replace('\\', '/');
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
|
||||
public String getName() {
|
||||
return path.substring(path.lastIndexOf("/") + 1);
|
||||
}
|
||||
|
||||
|
||||
public String getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return data.remaining();
|
||||
}
|
||||
|
||||
public ByteBuffer getData() {
|
||||
return data.duplicate();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return path;
|
||||
|
|
Loading…
Reference in New Issue