From 8932eb0b2a1037cd53e806d9d0e385e15b564f0d Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Mon, 21 Nov 2016 01:56:43 +0800 Subject: [PATCH] Refactor UnicodeReader and BOM detection --- .classpath | 2 +- source/net/filebot/media/MediaDetection.java | 4 +- .../filebot/subtitle/SubtitleUtilities.java | 32 +++-- .../subtitle/upload/SubtitleUploadDialog.java | 4 +- source/net/filebot/util/BOM.java | 73 ++++++++++ source/net/filebot/util/FileUtilities.java | 73 +++++----- source/net/filebot/util/UnicodeReader.java | 125 ------------------ source/net/filebot/vfs/MemoryFile.java | 9 +- 8 files changed, 137 insertions(+), 185 deletions(-) create mode 100644 source/net/filebot/util/BOM.java delete mode 100644 source/net/filebot/util/UnicodeReader.java diff --git a/.classpath b/.classpath index f612db73..6a484a24 100644 --- a/.classpath +++ b/.classpath @@ -9,7 +9,7 @@ - + diff --git a/source/net/filebot/media/MediaDetection.java b/source/net/filebot/media/MediaDetection.java index 26cd43a5..3b8fc351 100644 --- a/source/net/filebot/media/MediaDetection.java +++ b/source/net/filebot/media/MediaDetection.java @@ -1219,7 +1219,7 @@ public class MediaDetection { // parse ids from nfo files for (File nfo : nfoFiles) { try { - String text = new String(readFile(nfo), "UTF-8"); + String text = readTextFile(nfo); collection.addAll(grepImdbId(text)); } catch (Exception e) { debug.warning("Failed to read nfo: " + e.getMessage()); @@ -1246,7 +1246,7 @@ public class MediaDetection { continue; for (File nfo : getChildren(folder, NFO_FILES)) { - String text = new String(readFile(nfo), "UTF-8"); + String text = readTextFile(nfo); for (int imdbid : grepImdbId(text)) { SearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language); diff --git a/source/net/filebot/subtitle/SubtitleUtilities.java b/source/net/filebot/subtitle/SubtitleUtilities.java index f08b480f..6878d99a 100644 --- a/source/net/filebot/subtitle/SubtitleUtilities.java +++ b/source/net/filebot/subtitle/SubtitleUtilities.java @@ -1,5 +1,6 @@ package net.filebot.subtitle; +import static java.nio.charset.StandardCharsets.*; import static java.util.Collections.*; import static java.util.stream.Collectors.*; import static net.filebot.Logging.*; @@ -10,11 +11,10 @@ import static net.filebot.util.FileUtilities.*; import java.io.File; import java.io.IOException; +import java.io.OutputStreamWriter; import java.io.Reader; import java.nio.ByteBuffer; -import java.nio.CharBuffer; import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -33,6 +33,8 @@ import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.commons.io.IOUtils; + import com.optimaize.langdetect.DetectedLanguage; import com.optimaize.langdetect.LanguageDetector; import com.optimaize.langdetect.LanguageDetectorBuilder; @@ -53,7 +55,7 @@ import net.filebot.similarity.SequenceMatchSimilarity; import net.filebot.similarity.SimilarityComparator; import net.filebot.similarity.SimilarityMetric; import net.filebot.util.ByteBufferInputStream; -import net.filebot.util.UnicodeReader; +import net.filebot.util.ByteBufferOutputStream; import net.filebot.vfs.ArchiveType; import net.filebot.vfs.MemoryFile; import net.filebot.web.Movie; @@ -325,7 +327,7 @@ public final class SubtitleUtilities { // decode subtitle file with the first reader that seems to work for (SubtitleFormat format : likelyFormats) { // decode bytes and beware of byte-order marks - Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData()), true, StandardCharsets.UTF_8); + Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8); // reset reader to position 0 SubtitleReader parser = format.newReader(reader); @@ -347,29 +349,31 @@ public final class SubtitleUtilities { throw new IOException("Subtitle format not supported"); } - public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException { + public static ByteBuffer exportSubtitles(MemoryFile file, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException { if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) { throw new IllegalArgumentException("Format not supported"); } - // convert to target format and target encoding + ByteBufferOutputStream buffer = new ByteBufferOutputStream(file.size()); + OutputStreamWriter writer = new OutputStreamWriter(buffer, outputEncoding); + if (outputFormat == SubtitleFormat.SubRip) { - // output buffer - StringBuilder buffer = new StringBuilder(4 * 1024); - try (SubRipWriter out = new SubRipWriter(buffer)) { - for (SubtitleElement it : decodeSubtitles(data)) { + // convert to target format and target encoding + try (SubRipWriter out = new SubRipWriter(writer)) { + for (SubtitleElement it : decodeSubtitles(file)) { if (outputTimingOffset != 0) { it = new SubtitleElement(Math.max(0, it.getStart() + outputTimingOffset), Math.max(0, it.getEnd() + outputTimingOffset), it.getText()); } out.write(it); } } - - return outputEncoding.encode(CharBuffer.wrap(buffer)); + } else { + // convert only text encoding + Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8); + IOUtils.copy(reader, writer); } - // only change encoding - return outputEncoding.encode(getText(data.getData())); + return buffer.getByteBuffer(); } public static SubtitleFormat getSubtitleFormat(File file) { diff --git a/source/net/filebot/ui/subtitle/upload/SubtitleUploadDialog.java b/source/net/filebot/ui/subtitle/upload/SubtitleUploadDialog.java index 0a27992d..df80b1fc 100644 --- a/source/net/filebot/ui/subtitle/upload/SubtitleUploadDialog.java +++ b/source/net/filebot/ui/subtitle/upload/SubtitleUploadDialog.java @@ -3,6 +3,7 @@ package net.filebot.ui.subtitle.upload; import static java.util.Collections.*; import static net.filebot.Logging.*; import static net.filebot.media.MediaDetection.*; +import static net.filebot.util.FileUtilities.*; import static net.filebot.util.ui.SwingUI.*; import java.awt.Color; @@ -32,7 +33,6 @@ import net.filebot.Language; import net.filebot.ResourceManager; import net.filebot.WebServices; import net.filebot.media.MediaDetection; -import net.filebot.util.FileUtilities; import net.filebot.util.ui.EmptySelectionModel; import net.filebot.web.Movie; import net.filebot.web.OpenSubtitlesClient; @@ -151,7 +151,7 @@ public class SubtitleUploadDialog extends JDialog { if (mapping.getLanguage() == null) { mapping.setState(Status.Identifying); try { - Locale locale = database.detectLanguage(FileUtilities.readFile(mapping.getSubtitle())); + Locale locale = database.detectLanguage(readFile(mapping.getSubtitle())); mapping.setLanguage(Language.getLanguage(locale)); } catch (Exception e) { debug.log(Level.WARNING, "Failed to auto-detect language: " + e.getMessage()); diff --git a/source/net/filebot/util/BOM.java b/source/net/filebot/util/BOM.java new file mode 100644 index 00000000..3d0d2b4a --- /dev/null +++ b/source/net/filebot/util/BOM.java @@ -0,0 +1,73 @@ +package net.filebot.util; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +public enum BOM { + + UTF_8((byte) 0xEF, (byte) 0xBB, (byte) 0xBF), + + UTF_16BE((byte) 0xFE, (byte) 0xFF), + + UTF_16LE((byte) 0xFF, (byte) 0xFE), + + UTF_32BE((byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF), + + UTF_32LE((byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00), + + GB_18030((byte) 0x84, (byte) 0x31, (byte) 0x95, (byte) 0x33); + + public static final int SIZE = 4; + + private byte[] bom; + + BOM(byte... bom) { + this.bom = bom; + } + + public int size() { + return bom.length; + } + + public boolean matches(byte[] bytes) { + if (bytes.length < bom.length) { + return false; + } + + for (int i = 0; i < bom.length; i++) { + if (bom[i] != bytes[i]) { + return false; + } + } + + return true; + } + + public Charset getCharset() { + switch (this) { + case UTF_8: + return StandardCharsets.UTF_8; + case UTF_16BE: + return StandardCharsets.UTF_16BE; + case UTF_16LE: + return StandardCharsets.UTF_16LE; + case UTF_32BE: + return Charset.forName("UTF-32BE"); + case UTF_32LE: + return Charset.forName("UTF-32LE"); + case GB_18030: + return Charset.forName("GB18030"); + } + return null; + } + + public static BOM detect(byte[] bytes) { + for (BOM bom : values()) { + if (bom.matches(bytes)) { + return bom; + } + } + return null; + } + +} diff --git a/source/net/filebot/util/FileUtilities.java b/source/net/filebot/util/FileUtilities.java index 1fd2e40f..d112e788 100644 --- a/source/net/filebot/util/FileUtilities.java +++ b/source/net/filebot/util/FileUtilities.java @@ -4,23 +4,22 @@ import static java.nio.charset.StandardCharsets.*; import static java.util.Arrays.*; import static java.util.Collections.*; import static java.util.Comparator.*; -import static java.util.stream.Collectors.*; import static net.filebot.Logging.*; import static net.filebot.util.RegularExpressions.*; import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.AtomicMoveNotSupportedException; import java.nio.file.FileVisitOption; @@ -48,7 +47,6 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collector; import java.util.stream.Stream; import org.apache.commons.io.FileUtils; @@ -196,18 +194,21 @@ public final class FileUtilities { return Files.readAllBytes(file.toPath()); } - public static R readLines(File file, Collector collector) throws IOException { - try (BufferedReader reader = new BufferedReader(new UnicodeReader(new ByteArrayInputStream(readFile(file)), false, UTF_8))) { - return reader.lines().collect(collector); + public static String readTextFile(File file) throws IOException { + byte[] bytes = readFile(file); + + // check BOM + BOM bom = BOM.detect(bytes); + + if (bom != null) { + return new String(bytes, bom.size(), bytes.length - bom.size(), bom.getCharset()); + } else { + return new String(bytes, UTF_8); } } public static List readLines(File file) throws IOException { - return readLines(file, toList()); - } - - public static String readTextFile(File file) throws IOException { - return readLines(file, joining(System.lineSeparator())); + return asList(NEWLINE.split(readTextFile(file))); } public static File writeFile(ByteBuffer data, File destination) throws IOException { @@ -217,35 +218,37 @@ public final class FileUtilities { return destination; } - public static Reader createTextReader(File file) throws IOException { - CharsetDetector detector = new CharsetDetector(); - detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding - detector.setText(new BufferedInputStream(new FileInputStream(file))); + public static Reader createTextReader(InputStream in, boolean guess, Charset declaredEncoding) throws IOException { + byte head[] = new byte[BOM.SIZE]; + in.mark(head.length); + in.read(head); + in.reset(); // rewind - CharsetMatch charset = detector.detect(); - if (charset != null) - return charset.getReader(); + // check BOM + BOM bom = BOM.detect(head); - // assume UTF-8 by default - return new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8); - } + if (bom != null) { + in.skip(bom.size()); // skip BOM + return new InputStreamReader(in, bom.getCharset()); + } - public static String getText(ByteBuffer data) throws IOException { - CharsetDetector detector = new CharsetDetector(); - detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding - detector.setText(new ByteBufferInputStream(data)); - - CharsetMatch charset = detector.detect(); - if (charset != null) { - try { - return charset.getString(); - } catch (RuntimeException e) { - throw new IOException("Failed to read text", e); + // auto-detect character encoding + if (guess) { + CharsetDetector detector = new CharsetDetector(); + detector.setDeclaredEncoding(declaredEncoding.name()); + detector.setText(in); + CharsetMatch match = detector.detect(); + if (match != null) { + return match.getReader(); } } - // assume UTF-8 by default - return UTF_8.decode(data).toString(); + // default to declared encoding + return new InputStreamReader(in, declaredEncoding); + } + + public static Reader createTextReader(File file) throws IOException { + return createTextReader(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE), true, UTF_8); } public static boolean equalsCaseSensitive(File a, File b) { diff --git a/source/net/filebot/util/UnicodeReader.java b/source/net/filebot/util/UnicodeReader.java deleted file mode 100644 index 213e6c7d..00000000 --- a/source/net/filebot/util/UnicodeReader.java +++ /dev/null @@ -1,125 +0,0 @@ -package net.filebot.util; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; - -import com.ibm.icu.text.CharsetDetector; - -public class UnicodeReader extends Reader { - - private static final int BOM_SIZE = 4; - - private final Reader reader; - - public UnicodeReader(InputStream stream, boolean guessCharset, Charset defaultCharset) throws IOException { - if (!stream.markSupported()) { - throw new IllegalArgumentException("stream must support mark"); - } - - stream.mark(BOM_SIZE); - byte bom[] = new byte[BOM_SIZE]; - stream.read(bom, 0, bom.length); - - Charset bomEncoding = null; - int skip = 0; - - if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { - bomEncoding = StandardCharsets.UTF_8; - skip = 3; - } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) { - bomEncoding = StandardCharsets.UTF_16BE; - skip = 2; - } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) { - bomEncoding = StandardCharsets.UTF_16LE; - skip = 2; - } else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) { - bomEncoding = Charset.forName("UTF-32BE"); - skip = 4; - } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) { - bomEncoding = Charset.forName("UTF-32LE"); - skip = 4; - } - - // rewind and skip BOM - stream.reset(); - stream.skip(skip); - - // guess character encoding if necessary - if (bomEncoding != null) { - // initialize reader via BOM - reader = new InputStreamReader(stream, bomEncoding); - } else if (bomEncoding == null && guessCharset) { - // auto-detect encoding - reader = new CharsetDetector().getReader(stream, defaultCharset.name()); - } else { - // use default - reader = new InputStreamReader(stream, defaultCharset); - } - } - - @Override - public int hashCode() { - return reader.hashCode(); - } - - @Override - public int read(CharBuffer target) throws IOException { - return reader.read(target); - } - - @Override - public boolean equals(Object obj) { - return reader.equals(obj); - } - - @Override - public int read(char[] cbuf) throws IOException { - return reader.read(cbuf); - } - - @Override - public int read() throws IOException { - return reader.read(); - } - - @Override - public int read(char[] cbuf, int offset, int length) throws IOException { - return reader.read(cbuf, offset, length); - } - - @Override - public long skip(long n) throws IOException { - return reader.skip(n); - } - - @Override - public boolean ready() throws IOException { - return reader.ready(); - } - - @Override - public void close() throws IOException { - reader.close(); - } - - @Override - public boolean markSupported() { - return reader.markSupported(); - } - - @Override - public void mark(int readAheadLimit) throws IOException { - reader.mark(readAheadLimit); - } - - @Override - public void reset() throws IOException { - reader.reset(); - } - -} diff --git a/source/net/filebot/vfs/MemoryFile.java b/source/net/filebot/vfs/MemoryFile.java index 9fc1def7..5c5b02f9 100644 --- a/source/net/filebot/vfs/MemoryFile.java +++ b/source/net/filebot/vfs/MemoryFile.java @@ -1,39 +1,36 @@ package net.filebot.vfs; - import java.nio.ByteBuffer; - public class MemoryFile { private final String path; private final ByteBuffer data; - public MemoryFile(String path, ByteBuffer data) { // normalize folder separator this.path = path.replace('\\', '/'); this.data = data; } - public String getName() { return path.substring(path.lastIndexOf("/") + 1); } - public String getPath() { return path; } + public int size() { + return data.remaining(); + } public ByteBuffer getData() { return data.duplicate(); } - @Override public String toString() { return path;