Refactor UnicodeReader and BOM detection

2016-11-21 01:56:43 +08:00 · 2016-11-21 01:56:43 +08:00 · 8932eb0b2a
parent 53376c3de6
commit 8932eb0b2a
8 changed files with 137 additions and 185 deletions
--- a/.classpath
+++ b/.classpath
@ -9,7 +9,7 @@
 	<classpathentry kind="lib" path="lib/jars/xmlrpc.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/ehcache.jar" sourcepath="lib/ivy/source/ehcache.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/glazedlists_java15.jar" sourcepath="lib/ivy/source/glazedlists_java15.jar"/>
-	<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar"/>
+	<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar" sourcepath="lib/ivy/source/icu4j.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/jna.jar" sourcepath="lib/ivy/source/jna.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/junit.jar"/>
 	<classpathentry kind="lib" path="lib/ivy/jar/miglayout-core.jar"/>
--- a/source/net/filebot/media/MediaDetection.java
+++ b/source/net/filebot/media/MediaDetection.java
@ -1219,7 +1219,7 @@ public class MediaDetection {
 		// parse ids from nfo files
 		for (File nfo : nfoFiles) {
 			try {
-				String text = new String(readFile(nfo), "UTF-8");
+				String text = readTextFile(nfo);
 				collection.addAll(grepImdbId(text));
 			} catch (Exception e) {
 				debug.warning("Failed to read nfo: " + e.getMessage());
@ -1246,7 +1246,7 @@ public class MediaDetection {
 				continue;

 			for (File nfo : getChildren(folder, NFO_FILES)) {
-				String text = new String(readFile(nfo), "UTF-8");
+				String text = readTextFile(nfo);

 				for (int imdbid : grepImdbId(text)) {
 					SearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language);
--- a/source/net/filebot/subtitle/SubtitleUtilities.java
+++ b/source/net/filebot/subtitle/SubtitleUtilities.java
@ -1,5 +1,6 @@
 package net.filebot.subtitle;

+import static java.nio.charset.StandardCharsets.*;
 import static java.util.Collections.*;
 import static java.util.stream.Collectors.*;
 import static net.filebot.Logging.*;
@ -10,11 +11,10 @@ import static net.filebot.util.FileUtilities.*;

 import java.io.File;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.io.Reader;
 import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
 import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@ -33,6 +33,8 @@ import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;

+import org.apache.commons.io.IOUtils;
+
 import com.optimaize.langdetect.DetectedLanguage;
 import com.optimaize.langdetect.LanguageDetector;
 import com.optimaize.langdetect.LanguageDetectorBuilder;
@ -53,7 +55,7 @@ import net.filebot.similarity.SequenceMatchSimilarity;
 import net.filebot.similarity.SimilarityComparator;
 import net.filebot.similarity.SimilarityMetric;
 import net.filebot.util.ByteBufferInputStream;
-import net.filebot.util.UnicodeReader;
+import net.filebot.util.ByteBufferOutputStream;
 import net.filebot.vfs.ArchiveType;
 import net.filebot.vfs.MemoryFile;
 import net.filebot.web.Movie;
@ -325,7 +327,7 @@ public final class SubtitleUtilities {
 		// decode subtitle file with the first reader that seems to work
 		for (SubtitleFormat format : likelyFormats) {
 			// decode bytes and beware of byte-order marks
-			Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData()), true, StandardCharsets.UTF_8);
+			Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);

 			// reset reader to position 0
 			SubtitleReader parser = format.newReader(reader);
@ -347,29 +349,31 @@ public final class SubtitleUtilities {
 		throw new IOException("Subtitle format not supported");
 	}

-	public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
+	/** Exports subtitle data in the given encoding: converted to SubRip (timings shifted by the offset, clamped at 0), or, if outputFormat is null, with only the text re-encoded; any other format is rejected. */
+	public static ByteBuffer exportSubtitles(MemoryFile file, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
 		if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) {
 			throw new IllegalArgumentException("Format not supported");
 		}

-		// convert to target format and target encoding
+		ByteBufferOutputStream buffer = new ByteBufferOutputStream(file.size());
+		OutputStreamWriter writer = new OutputStreamWriter(buffer, outputEncoding);
+
 		if (outputFormat == SubtitleFormat.SubRip) {
-			// output buffer
-			StringBuilder buffer = new StringBuilder(4 * 1024);
-			try (SubRipWriter out = new SubRipWriter(buffer)) {
-				for (SubtitleElement it : decodeSubtitles(data)) {
+			try (SubRipWriter out = new SubRipWriter(writer)) { // convert to target format and encoding; NOTE(review): assumes out.close() flushes writer - confirm
+				for (SubtitleElement it : decodeSubtitles(file)) {
 					if (outputTimingOffset != 0) {
 						it = new SubtitleElement(Math.max(0, it.getStart() + outputTimingOffset), Math.max(0, it.getEnd() + outputTimingOffset), it.getText());
 					}
 					out.write(it);
 				}
 			}
-
-			return outputEncoding.encode(CharBuffer.wrap(buffer));
+		} else {
+			// convert only text encoding
+			IOUtils.copy(createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8), writer);
+			writer.flush(); // OutputStreamWriter holds encoded bytes back until flushed; without this the returned buffer is empty or truncated
 		}

-		// only change encoding
-		return outputEncoding.encode(getText(data.getData()));
+		return buffer.getByteBuffer();
 	}

 	public static SubtitleFormat getSubtitleFormat(File file) {
--- a/source/net/filebot/ui/subtitle/upload/SubtitleUploadDialog.java
+++ b/source/net/filebot/ui/subtitle/upload/SubtitleUploadDialog.java
@ -3,6 +3,7 @@ package net.filebot.ui.subtitle.upload;
 import static java.util.Collections.*;
 import static net.filebot.Logging.*;
 import static net.filebot.media.MediaDetection.*;
+import static net.filebot.util.FileUtilities.*;
 import static net.filebot.util.ui.SwingUI.*;

 import java.awt.Color;
@ -32,7 +33,6 @@ import net.filebot.Language;
 import net.filebot.ResourceManager;
 import net.filebot.WebServices;
 import net.filebot.media.MediaDetection;
-import net.filebot.util.FileUtilities;
 import net.filebot.util.ui.EmptySelectionModel;
 import net.filebot.web.Movie;
 import net.filebot.web.OpenSubtitlesClient;
@ -151,7 +151,7 @@ public class SubtitleUploadDialog extends JDialog {
 			if (mapping.getLanguage() == null) {
 				mapping.setState(Status.Identifying);
 				try {
-					Locale locale = database.detectLanguage(FileUtilities.readFile(mapping.getSubtitle()));
+					Locale locale = database.detectLanguage(readFile(mapping.getSubtitle()));
 					mapping.setLanguage(Language.getLanguage(locale));
 				} catch (Exception e) {
 					debug.log(Level.WARNING, "Failed to auto-detect language: " + e.getMessage());
--- a/source/net/filebot/util/BOM.java
+++ b/source/net/filebot/util/BOM.java
@ -0,0 +1,73 @@
+package net.filebot.util;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+/** Byte-order marks that may open text data, each paired with the charset it announces. */
+public enum BOM {
+	UTF_8((byte) 0xEF, (byte) 0xBB, (byte) 0xBF),
+	UTF_16BE((byte) 0xFE, (byte) 0xFF),
+	UTF_16LE((byte) 0xFF, (byte) 0xFE),
+	UTF_32BE((byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF),
+	UTF_32LE((byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00),
+	GB_18030((byte) 0x84, (byte) 0x31, (byte) 0x95, (byte) 0x33);
+
+	/** Length in bytes of the longest mark; this many leading bytes are enough for {@link #detect(byte[])}. */
+	public static final int SIZE = 4;
+
+	private final byte[] bom;
+
+	BOM(byte... bom) {
+		this.bom = bom;
+	}
+
+	/** Returns the length of this mark in bytes. */
+	public int size() {
+		return bom.length;
+	}
+
+	/** Returns true if the given bytes start with this mark; input shorter than the mark never matches. */
+	public boolean matches(byte[] bytes) {
+		if (bytes.length < bom.length) {
+			return false;
+		}
+		for (int i = 0; i < bom.length; i++) {
+			if (bom[i] != bytes[i]) {
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+	/** Returns the charset announced by this mark. */
+	public Charset getCharset() {
+		switch (this) {
+		case UTF_8:
+			return StandardCharsets.UTF_8;
+		case UTF_16BE:
+			return StandardCharsets.UTF_16BE;
+		case UTF_16LE:
+			return StandardCharsets.UTF_16LE;
+		case UTF_32BE:
+			return Charset.forName("UTF-32BE");
+		case UTF_32LE:
+			return Charset.forName("UTF-32LE");
+		case GB_18030:
+			return Charset.forName("GB18030");
+		}
+		return null;
+	}
+
+	/** Returns the longest mark that the given bytes start with, or null if none matches. */
+	public static BOM detect(byte[] bytes) {
+		BOM best = null;
+		for (BOM bom : values()) {
+			// longest match wins: the UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark (FF FE)
+			if (bom.matches(bytes) && (best == null || bom.size() > best.size())) {
+				best = bom;
+			}
+		}
+		return best;
+	}
+}
--- a/source/net/filebot/util/FileUtilities.java
+++ b/source/net/filebot/util/FileUtilities.java
@ -4,23 +4,22 @@ import static java.nio.charset.StandardCharsets.*;
 import static java.util.Arrays.*;
 import static java.util.Collections.*;
 import static java.util.Comparator.*;
-import static java.util.stream.Collectors.*;
 import static net.filebot.Logging.*;
 import static net.filebot.util.RegularExpressions.*;

 import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.AtomicMoveNotSupportedException;
 import java.nio.file.FileVisitOption;
@ -48,7 +47,6 @@ import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import java.util.stream.Collector;
 import java.util.stream.Stream;

 import org.apache.commons.io.FileUtils;
@ -196,18 +194,21 @@ public final class FileUtilities {
 		return Files.readAllBytes(file.toPath());
 	}

-	public static <R, A> R readLines(File file, Collector<? super String, A, R> collector) throws IOException {
-		try (BufferedReader reader = new BufferedReader(new UnicodeReader(new ByteArrayInputStream(readFile(file)), false, UTF_8))) {
-			return reader.lines().collect(collector);
+	/** Reads the whole file and decodes it with the charset of its leading byte-order mark, or as UTF-8 if no mark is found. */
+	public static String readTextFile(File file) throws IOException {
+		byte[] content = readFile(file);
+		BOM marker = BOM.detect(content);
+
+		if (marker == null) {
+			return new String(content, UTF_8); // no BOM: decode everything as UTF-8
+		} else {
+			int offset = marker.size(); // the mark itself is not part of the text
+			return new String(content, offset, content.length - offset, marker.getCharset());
 		}
 	}

 	public static List<String> readLines(File file) throws IOException {
-		return readLines(file, toList());
-	}
-
-	public static String readTextFile(File file) throws IOException {
-		return readLines(file, joining(System.lineSeparator()));
+		return asList(NEWLINE.split(readTextFile(file))); // decoded file text split at every NEWLINE match, as a fixed-size list
 	}

 	public static File writeFile(ByteBuffer data, File destination) throws IOException {
@ -217,35 +218,37 @@ public final class FileUtilities {
 		return destination;
 	}

-	public static Reader createTextReader(File file) throws IOException {
-		CharsetDetector detector = new CharsetDetector();
-		detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding
-		detector.setText(new BufferedInputStream(new FileInputStream(file)));
+	/** Opens a reader on a stream that supports mark/reset: a leading BOM decides the charset and is skipped; otherwise the charset is guessed (if guess is set), falling back to the declared encoding. */
+	public static Reader createTextReader(InputStream in, boolean guess, Charset declaredEncoding) throws IOException {
+		byte head[] = new byte[BOM.SIZE];
+		in.mark(head.length);
+		in.read(head); // a short read leaves the rest of head zeroed
+		in.reset(); // rewind

-		CharsetMatch charset = detector.detect();
-		if (charset != null)
-			return charset.getReader();
+		BOM bom = BOM.detect(head); // check BOM

-		// assume UTF-8 by default
-		return new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);
-	}
+		if (bom != null) {
+			in.skip(bom.size()); // skip BOM
+			return new InputStreamReader(in, bom.getCharset());
+		}

-	public static String getText(ByteBuffer data) throws IOException {
-		CharsetDetector detector = new CharsetDetector();
-		detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding
-		detector.setText(new ByteBufferInputStream(data));
-
-		CharsetMatch charset = detector.detect();
-		if (charset != null) {
-			try {
-				return charset.getString();
-			} catch (RuntimeException e) {
-				throw new IOException("Failed to read text", e);
+		if (guess) { // auto-detect character encoding
+			CharsetDetector detector = new CharsetDetector();
+			detector.setDeclaredEncoding(declaredEncoding.name());
+			detector.setText(in);
+			CharsetMatch match = detector.detect();
+			Reader reader = match == null ? null : match.getReader();
+			if (reader != null) { // getReader() can return null (ICU swallows the IOException, e.g. charset unknown to the JVM); never hand null to the caller
+				return reader;
 			}
 		}

-		// assume UTF-8 by default
-		return UTF_8.decode(data).toString();
+		// default to declared encoding
+		return new InputStreamReader(in, declaredEncoding);
+	}
+
+	public static Reader createTextReader(File file) throws IOException {
+		return createTextReader(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE), true, UTF_8);
 	}

 	public static boolean equalsCaseSensitive(File a, File b) {
--- a/source/net/filebot/util/UnicodeReader.java
+++ b/source/net/filebot/util/UnicodeReader.java
@ -1,125 +0,0 @@
-package net.filebot.util;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-
-import com.ibm.icu.text.CharsetDetector;
-
-public class UnicodeReader extends Reader {
-
-	private static final int BOM_SIZE = 4;
-
-	private final Reader reader;
-
-	public UnicodeReader(InputStream stream, boolean guessCharset, Charset defaultCharset) throws IOException {
-		if (!stream.markSupported()) {
-			throw new IllegalArgumentException("stream must support mark");
-		}
-
-		stream.mark(BOM_SIZE);
-		byte bom[] = new byte[BOM_SIZE];
-		stream.read(bom, 0, bom.length);
-
-		Charset bomEncoding = null;
-		int skip = 0;
-
-		if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
-			bomEncoding = StandardCharsets.UTF_8;
-			skip = 3;
-		} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
-			bomEncoding = StandardCharsets.UTF_16BE;
-			skip = 2;
-		} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
-			bomEncoding = StandardCharsets.UTF_16LE;
-			skip = 2;
-		} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
-			bomEncoding = Charset.forName("UTF-32BE");
-			skip = 4;
-		} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
-			bomEncoding = Charset.forName("UTF-32LE");
-			skip = 4;
-		}
-
-		// rewind and skip BOM
-		stream.reset();
-		stream.skip(skip);
-
-		// guess character encoding if necessary
-		if (bomEncoding != null) {
-			// initialize reader via BOM
-			reader = new InputStreamReader(stream, bomEncoding);
-		} else if (bomEncoding == null && guessCharset) {
-			// auto-detect encoding
-			reader = new CharsetDetector().getReader(stream, defaultCharset.name());
-		} else {
-			// use default
-			reader = new InputStreamReader(stream, defaultCharset);
-		}
-	}
-
-	@Override
-	public int hashCode() {
-		return reader.hashCode();
-	}
-
-	@Override
-	public int read(CharBuffer target) throws IOException {
-		return reader.read(target);
-	}
-
-	@Override
-	public boolean equals(Object obj) {
-		return reader.equals(obj);
-	}
-
-	@Override
-	public int read(char[] cbuf) throws IOException {
-		return reader.read(cbuf);
-	}
-
-	@Override
-	public int read() throws IOException {
-		return reader.read();
-	}
-
-	@Override
-	public int read(char[] cbuf, int offset, int length) throws IOException {
-		return reader.read(cbuf, offset, length);
-	}
-
-	@Override
-	public long skip(long n) throws IOException {
-		return reader.skip(n);
-	}
-
-	@Override
-	public boolean ready() throws IOException {
-		return reader.ready();
-	}
-
-	@Override
-	public void close() throws IOException {
-		reader.close();
-	}
-
-	@Override
-	public boolean markSupported() {
-		return reader.markSupported();
-	}
-
-	@Override
-	public void mark(int readAheadLimit) throws IOException {
-		reader.mark(readAheadLimit);
-	}
-
-	@Override
-	public void reset() throws IOException {
-		reader.reset();
-	}
-
-}
--- a/source/net/filebot/vfs/MemoryFile.java
+++ b/source/net/filebot/vfs/MemoryFile.java
@ -1,39 +1,36 @@

 package net.filebot.vfs;

-
 import java.nio.ByteBuffer;

-
 public class MemoryFile {

 	private final String path;

 	private final ByteBuffer data;

-
 	public MemoryFile(String path, ByteBuffer data) {
 		// normalize folder separator
 		this.path = path.replace('\\', '/');
 		this.data = data;
 	}

-
 	public String getName() {
 		return path.substring(path.lastIndexOf("/") + 1);
 	}

-
 	public String getPath() {
 		return path;
 	}

+	public int size() {
+		return data.remaining();
+	}

 	public ByteBuffer getData() {
 		return data.duplicate();
 	}

-
 	@Override
 	public String toString() {
 		return path;