Refactor UnicodeReader and BOM detection

This commit is contained in:
Reinhard Pointner 2016-11-21 01:56:43 +08:00
parent 53376c3de6
commit 8932eb0b2a
8 changed files with 137 additions and 185 deletions

View File

@ -9,7 +9,7 @@
<classpathentry kind="lib" path="lib/jars/xmlrpc.jar"/> <classpathentry kind="lib" path="lib/jars/xmlrpc.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/ehcache.jar" sourcepath="lib/ivy/source/ehcache.jar"/> <classpathentry kind="lib" path="lib/ivy/jar/ehcache.jar" sourcepath="lib/ivy/source/ehcache.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/glazedlists_java15.jar" sourcepath="lib/ivy/source/glazedlists_java15.jar"/> <classpathentry kind="lib" path="lib/ivy/jar/glazedlists_java15.jar" sourcepath="lib/ivy/source/glazedlists_java15.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar"/> <classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar" sourcepath="lib/ivy/source/icu4j.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/jna.jar" sourcepath="lib/ivy/source/jna.jar"/> <classpathentry kind="lib" path="lib/ivy/jar/jna.jar" sourcepath="lib/ivy/source/jna.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/junit.jar"/> <classpathentry kind="lib" path="lib/ivy/jar/junit.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/miglayout-core.jar"/> <classpathentry kind="lib" path="lib/ivy/jar/miglayout-core.jar"/>

View File

@ -1219,7 +1219,7 @@ public class MediaDetection {
// parse ids from nfo files // parse ids from nfo files
for (File nfo : nfoFiles) { for (File nfo : nfoFiles) {
try { try {
String text = new String(readFile(nfo), "UTF-8"); String text = readTextFile(nfo);
collection.addAll(grepImdbId(text)); collection.addAll(grepImdbId(text));
} catch (Exception e) { } catch (Exception e) {
debug.warning("Failed to read nfo: " + e.getMessage()); debug.warning("Failed to read nfo: " + e.getMessage());
@ -1246,7 +1246,7 @@ public class MediaDetection {
continue; continue;
for (File nfo : getChildren(folder, NFO_FILES)) { for (File nfo : getChildren(folder, NFO_FILES)) {
String text = new String(readFile(nfo), "UTF-8"); String text = readTextFile(nfo);
for (int imdbid : grepImdbId(text)) { for (int imdbid : grepImdbId(text)) {
SearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language); SearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language);

View File

@ -1,5 +1,6 @@
package net.filebot.subtitle; package net.filebot.subtitle;
import static java.nio.charset.StandardCharsets.*;
import static java.util.Collections.*; import static java.util.Collections.*;
import static java.util.stream.Collectors.*; import static java.util.stream.Collectors.*;
import static net.filebot.Logging.*; import static net.filebot.Logging.*;
@ -10,11 +11,10 @@ import static net.filebot.util.FileUtilities.*;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Reader; import java.io.Reader;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
@ -33,6 +33,8 @@ import java.util.function.Predicate;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import com.optimaize.langdetect.DetectedLanguage; import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetector; import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder; import com.optimaize.langdetect.LanguageDetectorBuilder;
@ -53,7 +55,7 @@ import net.filebot.similarity.SequenceMatchSimilarity;
import net.filebot.similarity.SimilarityComparator; import net.filebot.similarity.SimilarityComparator;
import net.filebot.similarity.SimilarityMetric; import net.filebot.similarity.SimilarityMetric;
import net.filebot.util.ByteBufferInputStream; import net.filebot.util.ByteBufferInputStream;
import net.filebot.util.UnicodeReader; import net.filebot.util.ByteBufferOutputStream;
import net.filebot.vfs.ArchiveType; import net.filebot.vfs.ArchiveType;
import net.filebot.vfs.MemoryFile; import net.filebot.vfs.MemoryFile;
import net.filebot.web.Movie; import net.filebot.web.Movie;
@ -325,7 +327,7 @@ public final class SubtitleUtilities {
// decode subtitle file with the first reader that seems to work // decode subtitle file with the first reader that seems to work
for (SubtitleFormat format : likelyFormats) { for (SubtitleFormat format : likelyFormats) {
// decode bytes and beware of byte-order marks // decode bytes and beware of byte-order marks
Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData()), true, StandardCharsets.UTF_8); Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);
// reset reader to position 0 // reset reader to position 0
SubtitleReader parser = format.newReader(reader); SubtitleReader parser = format.newReader(reader);
@ -347,29 +349,31 @@ public final class SubtitleUtilities {
throw new IOException("Subtitle format not supported"); throw new IOException("Subtitle format not supported");
} }
public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException { public static ByteBuffer exportSubtitles(MemoryFile file, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) { if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) {
throw new IllegalArgumentException("Format not supported"); throw new IllegalArgumentException("Format not supported");
} }
// convert to target format and target encoding ByteBufferOutputStream buffer = new ByteBufferOutputStream(file.size());
OutputStreamWriter writer = new OutputStreamWriter(buffer, outputEncoding);
if (outputFormat == SubtitleFormat.SubRip) { if (outputFormat == SubtitleFormat.SubRip) {
// output buffer // convert to target format and target encoding
StringBuilder buffer = new StringBuilder(4 * 1024); try (SubRipWriter out = new SubRipWriter(writer)) {
try (SubRipWriter out = new SubRipWriter(buffer)) { for (SubtitleElement it : decodeSubtitles(file)) {
for (SubtitleElement it : decodeSubtitles(data)) {
if (outputTimingOffset != 0) { if (outputTimingOffset != 0) {
it = new SubtitleElement(Math.max(0, it.getStart() + outputTimingOffset), Math.max(0, it.getEnd() + outputTimingOffset), it.getText()); it = new SubtitleElement(Math.max(0, it.getStart() + outputTimingOffset), Math.max(0, it.getEnd() + outputTimingOffset), it.getText());
} }
out.write(it); out.write(it);
} }
} }
} else {
return outputEncoding.encode(CharBuffer.wrap(buffer)); // convert only text encoding
Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);
IOUtils.copy(reader, writer);
} }
// only change encoding return buffer.getByteBuffer();
return outputEncoding.encode(getText(data.getData()));
} }
public static SubtitleFormat getSubtitleFormat(File file) { public static SubtitleFormat getSubtitleFormat(File file) {

View File

@ -3,6 +3,7 @@ package net.filebot.ui.subtitle.upload;
import static java.util.Collections.*; import static java.util.Collections.*;
import static net.filebot.Logging.*; import static net.filebot.Logging.*;
import static net.filebot.media.MediaDetection.*; import static net.filebot.media.MediaDetection.*;
import static net.filebot.util.FileUtilities.*;
import static net.filebot.util.ui.SwingUI.*; import static net.filebot.util.ui.SwingUI.*;
import java.awt.Color; import java.awt.Color;
@ -32,7 +33,6 @@ import net.filebot.Language;
import net.filebot.ResourceManager; import net.filebot.ResourceManager;
import net.filebot.WebServices; import net.filebot.WebServices;
import net.filebot.media.MediaDetection; import net.filebot.media.MediaDetection;
import net.filebot.util.FileUtilities;
import net.filebot.util.ui.EmptySelectionModel; import net.filebot.util.ui.EmptySelectionModel;
import net.filebot.web.Movie; import net.filebot.web.Movie;
import net.filebot.web.OpenSubtitlesClient; import net.filebot.web.OpenSubtitlesClient;
@ -151,7 +151,7 @@ public class SubtitleUploadDialog extends JDialog {
if (mapping.getLanguage() == null) { if (mapping.getLanguage() == null) {
mapping.setState(Status.Identifying); mapping.setState(Status.Identifying);
try { try {
Locale locale = database.detectLanguage(FileUtilities.readFile(mapping.getSubtitle())); Locale locale = database.detectLanguage(readFile(mapping.getSubtitle()));
mapping.setLanguage(Language.getLanguage(locale)); mapping.setLanguage(Language.getLanguage(locale));
} catch (Exception e) { } catch (Exception e) {
debug.log(Level.WARNING, "Failed to auto-detect language: " + e.getMessage()); debug.log(Level.WARNING, "Failed to auto-detect language: " + e.getMessage());

View File

@ -0,0 +1,73 @@
package net.filebot.util;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
 * Byte-order marks (BOM) that may prefix an encoded text stream, together with the
 * charset each mark identifies.
 *
 * Note that some marks are prefixes of others: the UTF-32LE mark (FF FE 00 00) starts
 * with the UTF-16LE mark (FF FE). {@link #detect(byte[])} therefore prefers the longest
 * matching mark instead of the first one in declaration order.
 */
public enum BOM {

	UTF_8((byte) 0xEF, (byte) 0xBB, (byte) 0xBF),

	UTF_16BE((byte) 0xFE, (byte) 0xFF),

	UTF_16LE((byte) 0xFF, (byte) 0xFE),

	UTF_32BE((byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF),

	UTF_32LE((byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00),

	GB_18030((byte) 0x84, (byte) 0x31, (byte) 0x95, (byte) 0x33);

	/** Length in bytes of the longest mark declared above; enough look-ahead to detect any of them. */
	public static final int SIZE = 4;

	/** The raw byte sequence of this mark. */
	private final byte[] bom;

	BOM(byte... bom) {
		this.bom = bom;
	}

	/**
	 * @return the number of bytes this mark occupies at the start of the data
	 */
	public int size() {
		return bom.length;
	}

	/**
	 * Checks whether the given data starts with this mark.
	 *
	 * @param bytes the leading bytes of the data; may be longer than the mark
	 * @return {@code true} if {@code bytes} begins with this mark, {@code false} if it does not,
	 *         is shorter than the mark, or is {@code null}
	 */
	public boolean matches(byte[] bytes) {
		if (bytes == null || bytes.length < bom.length) {
			return false;
		}

		for (int i = 0; i < bom.length; i++) {
			if (bom[i] != bytes[i]) {
				return false;
			}
		}

		return true;
	}

	/**
	 * @return the charset identified by this mark
	 */
	public Charset getCharset() {
		switch (this) {
		case UTF_8:
			return StandardCharsets.UTF_8;
		case UTF_16BE:
			return StandardCharsets.UTF_16BE;
		case UTF_16LE:
			return StandardCharsets.UTF_16LE;
		case UTF_32BE:
			return Charset.forName("UTF-32BE");
		case UTF_32LE:
			return Charset.forName("UTF-32LE");
		case GB_18030:
			return Charset.forName("GB18030");
		}

		// every constant is handled above; reaching this point means a constant was added without a mapping
		throw new AssertionError(this);
	}

	/**
	 * Finds the mark that the given data starts with.
	 *
	 * When several marks match (FF FE 00 00 matches both UTF-16LE and UTF-32LE) the longest
	 * one is returned. Callers should pass only bytes that were actually read: trailing zero
	 * padding after a 2-byte FF FE would be indistinguishable from a UTF-32LE mark.
	 *
	 * @param bytes the leading bytes of the data
	 * @return the longest matching mark, or {@code null} if the data does not start with a known mark
	 */
	public static BOM detect(byte[] bytes) {
		BOM best = null;

		for (BOM candidate : values()) {
			if (candidate.matches(bytes) && (best == null || candidate.size() > best.size())) {
				best = candidate;
			}
		}

		return best;
	}

}

View File

@ -4,23 +4,22 @@ import static java.nio.charset.StandardCharsets.*;
import static java.util.Arrays.*; import static java.util.Arrays.*;
import static java.util.Collections.*; import static java.util.Collections.*;
import static java.util.Comparator.*; import static java.util.Comparator.*;
import static java.util.stream.Collectors.*;
import static net.filebot.Logging.*; import static net.filebot.Logging.*;
import static net.filebot.util.RegularExpressions.*; import static net.filebot.util.RegularExpressions.*;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.FileFilter; import java.io.FileFilter;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FilenameFilter; import java.io.FilenameFilter;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.math.BigInteger; import java.math.BigInteger;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.nio.file.AtomicMoveNotSupportedException; import java.nio.file.AtomicMoveNotSupportedException;
import java.nio.file.FileVisitOption; import java.nio.file.FileVisitOption;
@ -48,7 +47,6 @@ import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collector;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
@ -196,18 +194,21 @@ public final class FileUtilities {
return Files.readAllBytes(file.toPath()); return Files.readAllBytes(file.toPath());
} }
public static <R, A> R readLines(File file, Collector<? super String, A, R> collector) throws IOException { public static String readTextFile(File file) throws IOException {
try (BufferedReader reader = new BufferedReader(new UnicodeReader(new ByteArrayInputStream(readFile(file)), false, UTF_8))) { byte[] bytes = readFile(file);
return reader.lines().collect(collector);
// check BOM
BOM bom = BOM.detect(bytes);
if (bom != null) {
return new String(bytes, bom.size(), bytes.length - bom.size(), bom.getCharset());
} else {
return new String(bytes, UTF_8);
} }
} }
public static List<String> readLines(File file) throws IOException { public static List<String> readLines(File file) throws IOException {
return readLines(file, toList()); return asList(NEWLINE.split(readTextFile(file)));
}
public static String readTextFile(File file) throws IOException {
return readLines(file, joining(System.lineSeparator()));
} }
public static File writeFile(ByteBuffer data, File destination) throws IOException { public static File writeFile(ByteBuffer data, File destination) throws IOException {
@ -217,35 +218,37 @@ public final class FileUtilities {
return destination; return destination;
} }
public static Reader createTextReader(File file) throws IOException { public static Reader createTextReader(InputStream in, boolean guess, Charset declaredEncoding) throws IOException {
CharsetDetector detector = new CharsetDetector(); byte head[] = new byte[BOM.SIZE];
detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding in.mark(head.length);
detector.setText(new BufferedInputStream(new FileInputStream(file))); in.read(head);
in.reset(); // rewind
CharsetMatch charset = detector.detect(); // check BOM
if (charset != null) BOM bom = BOM.detect(head);
return charset.getReader();
// assume UTF-8 by default if (bom != null) {
return new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8); in.skip(bom.size()); // skip BOM
} return new InputStreamReader(in, bom.getCharset());
}
public static String getText(ByteBuffer data) throws IOException { // auto-detect character encoding
CharsetDetector detector = new CharsetDetector(); if (guess) {
detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding CharsetDetector detector = new CharsetDetector();
detector.setText(new ByteBufferInputStream(data)); detector.setDeclaredEncoding(declaredEncoding.name());
detector.setText(in);
CharsetMatch charset = detector.detect(); CharsetMatch match = detector.detect();
if (charset != null) { if (match != null) {
try { return match.getReader();
return charset.getString();
} catch (RuntimeException e) {
throw new IOException("Failed to read text", e);
} }
} }
// assume UTF-8 by default // default to declared encoding
return UTF_8.decode(data).toString(); return new InputStreamReader(in, declaredEncoding);
}
public static Reader createTextReader(File file) throws IOException {
return createTextReader(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE), true, UTF_8);
} }
public static boolean equalsCaseSensitive(File a, File b) { public static boolean equalsCaseSensitive(File a, File b) {

View File

@ -1,125 +0,0 @@
package net.filebot.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import com.ibm.icu.text.CharsetDetector;
/**
 * Reader that inspects the start of a byte stream for a Unicode byte-order mark (BOM),
 * skips it, and decodes the rest of the stream with the charset the mark identifies.
 * Without a mark the charset is either auto-detected or taken from the given default.
 * All {@link Reader} operations are delegated to the decoding reader chosen in the constructor.
 */
public class UnicodeReader extends Reader {

	/** Length in bytes of the longest supported mark (UTF-32). */
	private static final int BOM_SIZE = 4;

	/** The decoding reader that all operations are delegated to. */
	private final Reader reader;

	/**
	 * @param stream         the byte stream to decode; must support mark/reset
	 * @param guessCharset   if {@code true} and no mark is present, auto-detect the charset
	 * @param defaultCharset charset used when no mark is present and detection is disabled
	 *                       (also passed to the detector as the declared encoding)
	 * @throws IllegalArgumentException if the stream does not support mark
	 * @throws IOException              if reading or rewinding the stream fails
	 */
	public UnicodeReader(InputStream stream, boolean guessCharset, Charset defaultCharset) throws IOException {
		if (!stream.markSupported()) {
			throw new IllegalArgumentException("stream must support mark");
		}

		stream.mark(BOM_SIZE);
		byte[] bom = new byte[BOM_SIZE];

		// a single read may return fewer bytes than requested, so fill the look-ahead buffer in a loop
		// and remember how many bytes are real data (the rest of the array is zero padding)
		int length = 0;
		while (length < bom.length) {
			int n = stream.read(bom, length, bom.length - length);
			if (n < 0) {
				break;
			}
			length += n;
		}

		Charset bomEncoding = null;
		int skip = 0;

		// 4-byte marks are checked first because the UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark (FF FE)
		if (length >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
			bomEncoding = Charset.forName("UTF-32BE");
			skip = 4;
		} else if (length >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
			bomEncoding = Charset.forName("UTF-32LE");
			skip = 4;
		} else if (length >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
			bomEncoding = StandardCharsets.UTF_8;
			skip = 3;
		} else if (length >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
			bomEncoding = StandardCharsets.UTF_16BE;
			skip = 2;
		} else if (length >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
			bomEncoding = StandardCharsets.UTF_16LE;
			skip = 2;
		}

		// rewind and consume exactly the BOM bytes (skip() is allowed to skip fewer bytes than asked)
		stream.reset();
		for (int i = 0; i < skip; i++) {
			if (stream.read() < 0) {
				break;
			}
		}

		Reader decoder;
		if (bomEncoding != null) {
			// initialize reader via BOM
			decoder = new InputStreamReader(stream, bomEncoding);
		} else if (guessCharset) {
			// auto-detect encoding
			decoder = new CharsetDetector().getReader(stream, defaultCharset.name());

			// NOTE(review): the detector appears to return null when no charset matches — confirm against ICU4J;
			// fall back to the default charset rather than leaving the delegate null
			if (decoder == null) {
				decoder = new InputStreamReader(stream, defaultCharset);
			}
		} else {
			// use default
			decoder = new InputStreamReader(stream, defaultCharset);
		}
		reader = decoder;
	}

	// equals/hashCode are intentionally not overridden: delegating equals to the private
	// inner reader made x.equals(x) false, so identity semantics from Object are used instead

	@Override
	public int read(CharBuffer target) throws IOException {
		return reader.read(target);
	}

	@Override
	public int read(char[] cbuf) throws IOException {
		return reader.read(cbuf);
	}

	@Override
	public int read() throws IOException {
		return reader.read();
	}

	@Override
	public int read(char[] cbuf, int offset, int length) throws IOException {
		return reader.read(cbuf, offset, length);
	}

	@Override
	public long skip(long n) throws IOException {
		return reader.skip(n);
	}

	@Override
	public boolean ready() throws IOException {
		return reader.ready();
	}

	@Override
	public void close() throws IOException {
		reader.close();
	}

	@Override
	public boolean markSupported() {
		return reader.markSupported();
	}

	@Override
	public void mark(int readAheadLimit) throws IOException {
		reader.mark(readAheadLimit);
	}

	@Override
	public void reset() throws IOException {
		reader.reset();
	}

}

View File

@ -1,39 +1,36 @@
package net.filebot.vfs; package net.filebot.vfs;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
public class MemoryFile { public class MemoryFile {
private final String path; private final String path;
private final ByteBuffer data; private final ByteBuffer data;
public MemoryFile(String path, ByteBuffer data) { public MemoryFile(String path, ByteBuffer data) {
// normalize folder separator // normalize folder separator
this.path = path.replace('\\', '/'); this.path = path.replace('\\', '/');
this.data = data; this.data = data;
} }
public String getName() { public String getName() {
return path.substring(path.lastIndexOf("/") + 1); return path.substring(path.lastIndexOf("/") + 1);
} }
public String getPath() { public String getPath() {
return path; return path;
} }
public int size() {
return data.remaining();
}
public ByteBuffer getData() { public ByteBuffer getData() {
return data.duplicate(); return data.duplicate();
} }
@Override @Override
public String toString() { public String toString() {
return path; return path;