Refactor UnicodeReader and BOM detection

2016-11-21 01:56:43 +08:00 · 2016-11-21 01:56:43 +08:00 · 8932eb0b2a
parent 53376c3de6
commit 8932eb0b2a
8 changed files with 137 additions and 185 deletions
--- a/.classpath
+++ b/.classpath
@ -9,7 +9,7 @@
 	<classpathentry kind="lib" path="lib/jars/xmlrpc.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/ehcache.jar" sourcepath="lib/ivy/source/ehcache.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/glazedlists_java15.jar" sourcepath="lib/ivy/source/glazedlists_java15.jar"/>
-	<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar"/>
+	<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar" sourcepath="lib/ivy/source/icu4j.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/jna.jar" sourcepath="lib/ivy/source/jna.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/junit.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/miglayout-core.jar"/>
--- a/source/net/filebot/media/MediaDetection.java
+++ b/source/net/filebot/media/MediaDetection.java
@ -1219,7 +1219,7 @@ public class MediaDetection {
 		// parse ids from nfo files
 		for (File nfo : nfoFiles) {
 			try {
-				String text = new String(readFile(nfo), "UTF-8");
+				String text = readTextFile(nfo);
 				collection.addAll(grepImdbId(text));
 			} catch (Exception e) {
 				debug.warning("Failed to read nfo: " + e.getMessage());
@ -1246,7 +1246,7 @@ public class MediaDetection {
 				continue;
 			for (File nfo : getChildren(folder, NFO_FILES)) {
-				String text = new String(readFile(nfo), "UTF-8");
+				String text = readTextFile(nfo);
 				for (int imdbid : grepImdbId(text)) {
 					SearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language);
--- a/source/net/filebot/subtitle/SubtitleUtilities.java
+++ b/source/net/filebot/subtitle/SubtitleUtilities.java
@ -1,5 +1,6 @@
 package net.filebot.subtitle;
 import static java.nio.charset.StandardCharsets.*;
 import static java.util.Collections.*;
 import static java.util.stream.Collectors.*;
 import static net.filebot.Logging.*;
@ -10,11 +11,10 @@ import static net.filebot.util.FileUtilities.*;
 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Reader;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@ -33,6 +33,8 @@ import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.io.IOUtils;
 import com.optimaize.langdetect.DetectedLanguage;
 import com.optimaize.langdetect.LanguageDetector;
 import com.optimaize.langdetect.LanguageDetectorBuilder;
@ -53,7 +55,7 @@ import net.filebot.similarity.SequenceMatchSimilarity;
 import net.filebot.similarity.SimilarityComparator;
 import net.filebot.similarity.SimilarityMetric;
 import net.filebot.util.ByteBufferInputStream;
-import net.filebot.util.UnicodeReader;
+import net.filebot.util.ByteBufferOutputStream;
 import net.filebot.vfs.ArchiveType;
 import net.filebot.vfs.MemoryFile;
 import net.filebot.web.Movie;
@ -325,7 +327,7 @@ public final class SubtitleUtilities {
 		// decode subtitle file with the first reader that seems to work
 		for (SubtitleFormat format : likelyFormats) {
 			// decode bytes and beware of byte-order marks
-			Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData()), true, StandardCharsets.UTF_8);
+			Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);
 			// reset reader to position 0
 			SubtitleReader parser = format.newReader(reader);
@ -347,29 +349,31 @@ public final class SubtitleUtilities {
 		throw new IOException("Subtitle format not supported");
 	}
-	public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
+	public static ByteBuffer exportSubtitles(MemoryFile file, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
 		if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) {
 			throw new IllegalArgumentException("Format not supported");
 		}
-		// convert to target format and target encoding
+		ByteBufferOutputStream buffer = new ByteBufferOutputStream(file.size());
 		OutputStreamWriter writer = new OutputStreamWriter(buffer, outputEncoding);
 		if (outputFormat == SubtitleFormat.SubRip) {
-			// output buffer
+			// convert to target format and target encoding
-			StringBuilder buffer = new StringBuilder(4 * 1024);
+			try (SubRipWriter out = new SubRipWriter(writer)) {
-			try (SubRipWriter out = new SubRipWriter(buffer)) {
+				for (SubtitleElement it : decodeSubtitles(file)) {
 				for (SubtitleElement it : decodeSubtitles(data)) {
 					if (outputTimingOffset != 0) {
 						it = new SubtitleElement(Math.max(0, it.getStart() + outputTimingOffset), Math.max(0, it.getEnd() + outputTimingOffset), it.getText());
 					}
 					out.write(it);
 				}
 			}
-
+		} else {
-			return outputEncoding.encode(CharBuffer.wrap(buffer));
+			// convert only text encoding
 			Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);
 			IOUtils.copy(reader, writer);
 		}
-		// only change encoding
+		return buffer.getByteBuffer();
 		return outputEncoding.encode(getText(data.getData()));
 	}
 	public static SubtitleFormat getSubtitleFormat(File file) {
--- a/source/net/filebot/ui/subtitle/upload/SubtitleUploadDialog.java
+++ b/source/net/filebot/ui/subtitle/upload/SubtitleUploadDialog.java
@ -3,6 +3,7 @@ package net.filebot.ui.subtitle.upload;
 import static java.util.Collections.*;
 import static net.filebot.Logging.*;
 import static net.filebot.media.MediaDetection.*;
 import static net.filebot.util.FileUtilities.*;
 import static net.filebot.util.ui.SwingUI.*;
 import java.awt.Color;
@ -32,7 +33,6 @@ import net.filebot.Language;
 import net.filebot.ResourceManager;
 import net.filebot.WebServices;
 import net.filebot.media.MediaDetection;
 import net.filebot.util.FileUtilities;
 import net.filebot.util.ui.EmptySelectionModel;
 import net.filebot.web.Movie;
 import net.filebot.web.OpenSubtitlesClient;
@ -151,7 +151,7 @@ public class SubtitleUploadDialog extends JDialog {
 			if (mapping.getLanguage() == null) {
 				mapping.setState(Status.Identifying);
 				try {
-					Locale locale = database.detectLanguage(FileUtilities.readFile(mapping.getSubtitle()));
+					Locale locale = database.detectLanguage(readFile(mapping.getSubtitle()));
 					mapping.setLanguage(Language.getLanguage(locale));
 				} catch (Exception e) {
 					debug.log(Level.WARNING, "Failed to auto-detect language: " + e.getMessage());
--- a/source/net/filebot/util/BOM.java
+++ b/source/net/filebot/util/BOM.java
@ -0,0 +1,73 @@
 package net.filebot.util;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 /**
  * Known byte-order marks (BOM) and the character encodings they announce.
  *
  * Each constant stores the exact byte signature that may prefix encoded text.
  * Use {@link #detect(byte[])} to identify the signature at the start of a byte
  * array and {@link #getCharset()} to obtain the matching {@link Charset}.
  */
 public enum BOM {
 	UTF_8((byte) 0xEF, (byte) 0xBB, (byte) 0xBF),
 	UTF_16BE((byte) 0xFE, (byte) 0xFF),
 	UTF_16LE((byte) 0xFF, (byte) 0xFE),
 	UTF_32BE((byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF),
 	UTF_32LE((byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00),
 	GB_18030((byte) 0x84, (byte) 0x31, (byte) 0x95, (byte) 0x33);
 	/** Length in bytes of the longest signature declared above; read this many bytes before calling {@link #detect(byte[])}. */
 	public static final int SIZE = 4;
 	// signature bytes, never modified after construction
 	private final byte[] bom;
 	BOM(byte... bom) {
 		this.bom = bom;
 	}
 	/**
 	 * @return number of bytes in this byte-order mark
 	 */
 	public int size() {
 		return bom.length;
 	}
 	/**
 	 * Check whether the given bytes start with this byte-order mark.
 	 *
 	 * @param bytes leading bytes of the data (may be longer than the mark)
 	 * @return {@code true} if {@code bytes} begins with this signature, {@code false} if it is {@code null}, too short, or different
 	 */
 	public boolean matches(byte[] bytes) {
 		if (bytes == null || bytes.length < bom.length) {
 			return false;
 		}
 		for (int i = 0; i < bom.length; i++) {
 			if (bom[i] != bytes[i]) {
 				return false;
 			}
 		}
 		return true;
 	}
 	/**
 	 * @return the character encoding announced by this byte-order mark
 	 */
 	public Charset getCharset() {
 		switch (this) {
 		case UTF_8:
 			return StandardCharsets.UTF_8;
 		case UTF_16BE:
 			return StandardCharsets.UTF_16BE;
 		case UTF_16LE:
 			return StandardCharsets.UTF_16LE;
 		case UTF_32BE:
 			return Charset.forName("UTF-32BE");
 		case UTF_32LE:
 			return Charset.forName("UTF-32LE");
 		case GB_18030:
 			return Charset.forName("GB18030");
 		}
 		// not reachable while every constant has a case above
 		return null;
 	}
 	/**
 	 * Find the byte-order mark at the start of the given bytes.
 	 *
 	 * The longest matching signature wins: the UTF-16LE mark (FF FE) is a prefix
 	 * of the UTF-32LE mark (FF FE 00 00), so returning the first match in
 	 * declaration order would never report UTF-32LE.
 	 *
 	 * @param bytes leading bytes of the data, ideally at least {@link #SIZE} bytes
 	 * @return the matching byte-order mark, or {@code null} if none matches
 	 */
 	public static BOM detect(byte[] bytes) {
 		BOM best = null;
 		for (BOM candidate : values()) {
 			if (candidate.matches(bytes) && (best == null || candidate.size() > best.size())) {
 				best = candidate;
 			}
 		}
 		return best;
 	}
 }
--- a/source/net/filebot/util/FileUtilities.java
+++ b/source/net/filebot/util/FileUtilities.java
@ -4,23 +4,22 @@ import static java.nio.charset.StandardCharsets.*;
 import static java.util.Arrays.*;
 import static java.util.Collections.*;
 import static java.util.Comparator.*;
 import static java.util.stream.Collectors.*;
 import static net.filebot.Logging.*;
 import static net.filebot.util.RegularExpressions.*;
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.AtomicMoveNotSupportedException;
 import java.nio.file.FileVisitOption;
@ -48,7 +47,6 @@ import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collector;
 import java.util.stream.Stream;
 import org.apache.commons.io.FileUtils;
@ -196,18 +194,21 @@ public final class FileUtilities {
 		return Files.readAllBytes(file.toPath());
 	}
-	public static <R, A> R readLines(File file, Collector<? super String, A, R> collector) throws IOException {
+	public static String readTextFile(File file) throws IOException {
-		try (BufferedReader reader = new BufferedReader(new UnicodeReader(new ByteArrayInputStream(readFile(file)), false, UTF_8))) {
+		byte[] bytes = readFile(file);
-			return reader.lines().collect(collector);
+
 		// check BOM
 		BOM bom = BOM.detect(bytes);
 		if (bom != null) {
 			return new String(bytes, bom.size(), bytes.length - bom.size(), bom.getCharset());
 		} else {
 			return new String(bytes, UTF_8);
 		}
 	}
 	public static List<String> readLines(File file) throws IOException {
-		return readLines(file, toList());
+		return asList(NEWLINE.split(readTextFile(file)));
 	}
 	public static String readTextFile(File file) throws IOException {
 		return readLines(file, joining(System.lineSeparator()));
 	}
 	public static File writeFile(ByteBuffer data, File destination) throws IOException {
@ -217,35 +218,37 @@ public final class FileUtilities {
 		return destination;
 	}
-	public static Reader createTextReader(File file) throws IOException {
+	public static Reader createTextReader(InputStream in, boolean guess, Charset declaredEncoding) throws IOException {
-		CharsetDetector detector = new CharsetDetector();
+		byte head[] = new byte[BOM.SIZE];
-		detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding
+		in.mark(head.length);
-		detector.setText(new BufferedInputStream(new FileInputStream(file)));
+		in.read(head);
 		in.reset(); // rewind
-		CharsetMatch charset = detector.detect();
+		// check BOM
-		if (charset != null)
+		BOM bom = BOM.detect(head);
 			return charset.getReader();
-		// assume UTF-8 by default
+		if (bom != null) {
-		return new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);
+			in.skip(bom.size()); // skip BOM
-	}
+			return new InputStreamReader(in, bom.getCharset());
 		}
-	public static String getText(ByteBuffer data) throws IOException {
+		// auto-detect character encoding
-		CharsetDetector detector = new CharsetDetector();
+		if (guess) {
-		detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding
+			CharsetDetector detector = new CharsetDetector();
-		detector.setText(new ByteBufferInputStream(data));
+			detector.setDeclaredEncoding(declaredEncoding.name());
-
+			detector.setText(in);
-		CharsetMatch charset = detector.detect();
+			CharsetMatch match = detector.detect();
-		if (charset != null) {
+			if (match != null) {
-			try {
+				return match.getReader();
 				return charset.getString();
 			} catch (RuntimeException e) {
 				throw new IOException("Failed to read text", e);
 			}
 		}
-		// assume UTF-8 by default
+		// default to declared encoding
-		return UTF_8.decode(data).toString();
+		return new InputStreamReader(in, declaredEncoding);
 	}
 	public static Reader createTextReader(File file) throws IOException {
 		return createTextReader(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE), true, UTF_8);
 	}
 	public static boolean equalsCaseSensitive(File a, File b) {
--- a/source/net/filebot/util/UnicodeReader.java
+++ b/source/net/filebot/util/UnicodeReader.java
@ -1,125 +0,0 @@
 package net.filebot.util;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import com.ibm.icu.text.CharsetDetector;
 /**
  * Reader that chooses the character encoding of a byte stream from its
  * byte-order mark (BOM). Without a BOM it either lets ICU's
  * {@code CharsetDetector} guess the encoding or uses the given default.
  * All {@link Reader} operations are delegated to the reader chosen in the
  * constructor.
  */
 public class UnicodeReader extends Reader {
 	// length of the longest BOM checked below (UTF-32)
 	private static final int BOM_SIZE = 4;
 	private final Reader reader;
 	/**
 	 * @param stream input bytes; must support {@link InputStream#mark(int)}
 	 * @param guessCharset if {@code true} and no BOM is present, auto-detect the encoding
 	 * @param defaultCharset encoding used when no BOM is present and guessing is disabled (also passed to the detector as declared encoding)
 	 * @throws IllegalArgumentException if the stream does not support mark
 	 * @throws IOException if reading from the stream fails
 	 */
 	public UnicodeReader(InputStream stream, boolean guessCharset, Charset defaultCharset) throws IOException {
 		if (!stream.markSupported()) {
 			throw new IllegalArgumentException("stream must support mark");
 		}
 		stream.mark(BOM_SIZE);
 		byte bom[] = new byte[BOM_SIZE];
 		// a single read may deliver fewer bytes than requested, so keep reading until the buffer is full or the stream ends
 		int length = 0;
 		while (length < bom.length) {
 			int n = stream.read(bom, length, bom.length - length);
 			if (n <= 0) {
 				break;
 			}
 			length += n;
 		}
 		Charset bomEncoding = null;
 		int skip = 0;
 		// 4-byte UTF-32 marks are tested before 2-byte UTF-16 marks because FF FE is a prefix of FF FE 00 00
 		if (length >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
 			bomEncoding = StandardCharsets.UTF_8;
 			skip = 3;
 		} else if (length >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
 			bomEncoding = Charset.forName("UTF-32BE");
 			skip = 4;
 		} else if (length >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
 			bomEncoding = Charset.forName("UTF-32LE");
 			skip = 4;
 		} else if (length >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
 			bomEncoding = StandardCharsets.UTF_16BE;
 			skip = 2;
 		} else if (length >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
 			bomEncoding = StandardCharsets.UTF_16LE;
 			skip = 2;
 		}
 		// rewind and skip BOM (skip may advance less than requested, so repeat until done)
 		stream.reset();
 		long remaining = skip;
 		while (remaining > 0) {
 			long skipped = stream.skip(remaining);
 			if (skipped <= 0) {
 				break;
 			}
 			remaining -= skipped;
 		}
 		Reader selected;
 		if (bomEncoding != null) {
 			// initialize reader via BOM
 			selected = new InputStreamReader(stream, bomEncoding);
 		} else if (guessCharset) {
 			// auto-detect encoding
 			selected = new CharsetDetector().getReader(stream, defaultCharset.name());
 			// NOTE(review): getReader is documented to return null when no charset can be determined; fall back to the default instead of failing later with a NullPointerException - confirm against the ICU version in use
 			if (selected == null) {
 				selected = new InputStreamReader(stream, defaultCharset);
 			}
 		} else {
 			// use default
 			selected = new InputStreamReader(stream, defaultCharset);
 		}
 		reader = selected;
 	}
 	// identity (hashCode / equals) is delegated to the wrapped reader
 	@Override
 	public int hashCode() {
 		return reader.hashCode();
 	}
 	@Override
 	public int read(CharBuffer target) throws IOException {
 		return reader.read(target);
 	}
 	@Override
 	public boolean equals(Object obj) {
 		return reader.equals(obj);
 	}
 	@Override
 	public int read(char[] cbuf) throws IOException {
 		return reader.read(cbuf);
 	}
 	@Override
 	public int read() throws IOException {
 		return reader.read();
 	}
 	@Override
 	public int read(char[] cbuf, int offset, int length) throws IOException {
 		return reader.read(cbuf, offset, length);
 	}
 	@Override
 	public long skip(long n) throws IOException {
 		return reader.skip(n);
 	}
 	@Override
 	public boolean ready() throws IOException {
 		return reader.ready();
 	}
 	@Override
 	public void close() throws IOException {
 		reader.close();
 	}
 	@Override
 	public boolean markSupported() {
 		return reader.markSupported();
 	}
 	@Override
 	public void mark(int readAheadLimit) throws IOException {
 		reader.mark(readAheadLimit);
 	}
 	@Override
 	public void reset() throws IOException {
 		reader.reset();
 	}
 }
--- a/source/net/filebot/vfs/MemoryFile.java
+++ b/source/net/filebot/vfs/MemoryFile.java
@ -1,39 +1,36 @@
 package net.filebot.vfs;
 import java.nio.ByteBuffer;
 public class MemoryFile {
 	private final String path;
 	private final ByteBuffer data;
 	public MemoryFile(String path, ByteBuffer data) {
 		// normalize folder separator
 		this.path = path.replace('\\', '/');
 		this.data = data;
 	}
 	public String getName() {
 		return path.substring(path.lastIndexOf("/") + 1);
 	}
 	public String getPath() {
 		return path;
 	}
 	public int size() {
 		return data.remaining();
 	}
 	public ByteBuffer getData() {
 		return data.duplicate();
 	}
 	@Override
 	public String toString() {
 		return path;