* detect charset when parsing subtitles instead of assuming it's UTF-8

* add the ICU4J library to the build (provides the CharsetDetector used for charset detection)
This commit is contained in:
Reinhard Pointner 2011-09-03 09:30:38 +00:00
parent 5c6e879f6c
commit 7c2c574940
4 changed files with 28 additions and 19 deletions

View File

@ -119,6 +119,10 @@
<include name="org/codehaus/groovy/**" /> <include name="org/codehaus/groovy/**" />
<include name="META-INF/dgminfo" /> <include name="META-INF/dgminfo" />
</zipfileset> </zipfileset>
<zipfileset src="${dir.lib}/icu4j.jar">
<include name="com/ibm/icu/text/**" />
</zipfileset>
<zipfileset src="${dir.lib}/sublight-ws.jar"> <zipfileset src="${dir.lib}/sublight-ws.jar">
<include name="net/sublight/webservice/**" /> <include name="net/sublight/webservice/**" />

View File

@ -25,6 +25,7 @@
<jar href="sublight-ws.jar" /> <jar href="sublight-ws.jar" />
<jar href="xmlrpc.jar" /> <jar href="xmlrpc.jar" />
<jar href="json-simple.jar" /> <jar href="json-simple.jar" />
<jar href="icu4j.jar" />
</resources> </resources>
<component-desc /> <component-desc />

BIN
lib/icu4j.jar Normal file

Binary file not shown.

View File

@ -5,14 +5,15 @@ package net.sourceforge.filebot.ui.panel.subtitle;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.StringReader;
import java.io.InputStreamReader;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import com.ibm.icu.text.CharsetDetector;
import net.sourceforge.filebot.subtitle.SubtitleElement; import net.sourceforge.filebot.subtitle.SubtitleElement;
import net.sourceforge.filebot.subtitle.SubtitleFormat; import net.sourceforge.filebot.subtitle.SubtitleFormat;
import net.sourceforge.filebot.subtitle.SubtitleReader; import net.sourceforge.filebot.subtitle.SubtitleReader;
@ -22,12 +23,19 @@ import net.sourceforge.tuned.ByteBufferInputStream;
final class SubtitleUtilities { final class SubtitleUtilities {
/** /**
* Decode subtitle file even if extension is invalid. * Detect charset and parse subtitle file even if extension is invalid
*/ */
public static List<SubtitleElement> decode(MemoryFile file) throws IOException { public static List<SubtitleElement> decode(MemoryFile file) throws IOException {
LinkedList<SubtitleFormat> priorityList = new LinkedList<SubtitleFormat>(); // detect charset and read text content
CharsetDetector detector = new CharsetDetector();
detector.enableInputFilter(true);
detector.setText(new ByteBufferInputStream(file.getData()));
String textfile = detector.detect().getString();
// gather all formats, put likely formats first // gather all formats, put likely formats first
LinkedList<SubtitleFormat> priorityList = new LinkedList<SubtitleFormat>();
for (SubtitleFormat format : SubtitleFormat.values()) { for (SubtitleFormat format : SubtitleFormat.values()) {
if (format.getFilter().accept(file.getName())) { if (format.getFilter().accept(file.getName())) {
priorityList.addFirst(format); priorityList.addFirst(format);
@ -38,23 +46,19 @@ final class SubtitleUtilities {
// decode subtitle file with the first reader that seems to work // decode subtitle file with the first reader that seems to work
for (SubtitleFormat format : priorityList) { for (SubtitleFormat format : priorityList) {
InputStream data = new ByteBufferInputStream(file.getData()); // reset reader to position 0
SubtitleReader reader = format.newReader(new InputStreamReader(data, "UTF-8")); SubtitleReader parser = format.newReader(new StringReader(textfile));
try { if (parser.hasNext()) {
if (reader.hasNext()) { // correct format found
// correct format found List<SubtitleElement> list = new ArrayList<SubtitleElement>(500);
List<SubtitleElement> list = new ArrayList<SubtitleElement>(500);
// read subtitle file
// read subtitle file while (parser.hasNext()) {
while (reader.hasNext()) { list.add(parser.next());
list.add(reader.next());
}
return list;
} }
} finally {
reader.close(); return list;
} }
} }