* optimize subtitle collection
This commit is contained in:
parent
060229757a
commit
9e4b38ea9a
|
@ -828,7 +828,7 @@ public class CmdlineOperations implements CmdlineInterface {
|
|||
|
||||
private Map<File, SubtitleDescriptor> lookupSubtitleByFileName(SubtitleProvider service, Collection<String> querySet, Language language, Collection<File> videoFiles, boolean strict) throws Exception {
|
||||
// search for subtitles
|
||||
List<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, language.getName());
|
||||
Set<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, language.getName());
|
||||
|
||||
// match subtitle files to video files
|
||||
if (subtitles.size() > 0) {
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
|
||||
package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
import static java.lang.Math.*;
|
||||
|
||||
|
||||
public class MetricCascade implements SimilarityMetric {
|
||||
|
||||
|
||||
private final SimilarityMetric[] cascade;
|
||||
|
||||
private final boolean shortCircuit;
|
||||
|
||||
/**
 * Creates a cascade over the given metrics with the short-circuit flag set to {@code true}.
 *
 * @param cascade the metrics that make up this cascade
 */
public MetricCascade(SimilarityMetric... cascade) {
    // Delegate to the two-argument constructor. The explicit constructor call has to be the
    // first statement, and it is the one place where the final fields get assigned.
    this(true, cascade);
}
|
||||
|
||||
/**
 * Creates a cascade over the given metrics.
 *
 * @param shortCircuit stored as this cascade's short-circuit flag
 * @param cascade the metrics that make up this cascade (the array is stored as passed, not copied)
 */
public MetricCascade(boolean shortCircuit, SimilarityMetric... cascade) {
    this.shortCircuit = shortCircuit;
    this.cascade = cascade;
}
|
||||
|
||||
|
||||
@Override
|
||||
public float getSimilarity(Object o1, Object o2) {
|
||||
|
@ -22,16 +23,20 @@ public class MetricCascade implements SimilarityMetric {
|
|||
float similarity = metric.getSimilarity(o1, o2);
|
||||
if (abs(similarity) >= abs(f)) {
|
||||
// perfect match, ignore remaining metrics
|
||||
if (similarity >= 1) {
|
||||
return similarity;
|
||||
if (shortCircuit) {
|
||||
if (similarity >= 1) {
|
||||
return similarity;
|
||||
} else if (similarity <= -1) {
|
||||
return similarity;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// possible match or perfect negative match
|
||||
f = similarity;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
|
||||
package net.sourceforge.filebot.subtitle;
|
||||
|
||||
|
||||
import static java.lang.Math.*;
|
||||
import static net.sourceforge.filebot.MediaTypes.*;
|
||||
import static net.sourceforge.filebot.media.MediaDetection.*;
|
||||
|
@ -26,6 +24,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import net.sourceforge.filebot.Language;
|
||||
import net.sourceforge.filebot.similarity.Match;
|
||||
import net.sourceforge.filebot.similarity.Matcher;
|
||||
import net.sourceforge.filebot.similarity.MetricAvg;
|
||||
|
@ -33,25 +32,23 @@ import net.sourceforge.filebot.similarity.MetricCascade;
|
|||
import net.sourceforge.filebot.similarity.NameSimilarityMetric;
|
||||
import net.sourceforge.filebot.similarity.SequenceMatchSimilarity;
|
||||
import net.sourceforge.filebot.similarity.SimilarityMetric;
|
||||
import net.sourceforge.filebot.Language;
|
||||
import net.sourceforge.filebot.vfs.ArchiveType;
|
||||
import net.sourceforge.filebot.vfs.MemoryFile;
|
||||
import net.sourceforge.filebot.web.SearchResult;
|
||||
import net.sourceforge.filebot.web.SubtitleDescriptor;
|
||||
import net.sourceforge.filebot.web.SubtitleProvider;
|
||||
|
||||
|
||||
public final class SubtitleUtilities {
|
||||
|
||||
|
||||
public static Map<File, SubtitleDescriptor> matchSubtitles(Collection<File> files, Collection<SubtitleDescriptor> subtitles, boolean strict) throws InterruptedException {
|
||||
Map<File, SubtitleDescriptor> subtitleByVideo = new LinkedHashMap<File, SubtitleDescriptor>();
|
||||
|
||||
|
||||
// optimize for generic media <-> subtitle matching
|
||||
SimilarityMetric[] metrics = new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, NameSubstringSequence, new MetricCascade(NameSubstringSequence, Name), Numeric, new NameSimilarityMetric() };
|
||||
|
||||
// subtitle verification metric specifically excluding SxE mismatches
|
||||
|
||||
// subtitle verification metric specifically excluding SxE mismatches
|
||||
SimilarityMetric absoluteSeasonEpisode = new SimilarityMetric() {
|
||||
|
||||
|
||||
@Override
|
||||
public float getSimilarity(Object o1, Object o2) {
|
||||
float f = SeasonEpisode.getSimilarity(o1, o2);
|
||||
|
@ -62,175 +59,167 @@ public final class SubtitleUtilities {
|
|||
}
|
||||
};
|
||||
SimilarityMetric sanity = new MetricCascade(absoluteSeasonEpisode, AirDate, new MetricAvg(NameSubstringSequence, Name), getMovieMatchMetric());
|
||||
|
||||
|
||||
// first match everything as best as possible, then filter possibly bad matches
|
||||
Matcher<File, SubtitleDescriptor> matcher = new Matcher<File, SubtitleDescriptor>(files, subtitles, false, metrics);
|
||||
|
||||
|
||||
for (Match<File, SubtitleDescriptor> it : matcher.match()) {
|
||||
if (sanity.getSimilarity(it.getValue(), it.getCandidate()) >= (strict ? 0.9f : 0.6f)) {
|
||||
subtitleByVideo.put(it.getValue(), it.getCandidate());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return subtitleByVideo;
|
||||
}
|
||||
|
||||
|
||||
public static List<SubtitleDescriptor> findSubtitles(SubtitleProvider service, Collection<String> querySet, String languageName) throws Exception {
|
||||
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>();
|
||||
|
||||
|
||||
public static Set<SubtitleDescriptor> findSubtitles(SubtitleProvider service, Collection<String> querySet, String languageName) throws Exception {
|
||||
Set<SubtitleDescriptor> subtitles = new LinkedHashSet<SubtitleDescriptor>();
|
||||
|
||||
// search for and automatically select movie / show entry
|
||||
Set<SearchResult> resultSet = new HashSet<SearchResult>();
|
||||
for (String query : querySet) {
|
||||
resultSet.addAll(findProbableSearchResults(query, service.search(query)));
|
||||
}
|
||||
|
||||
|
||||
// fetch subtitles for all search results
|
||||
for (SearchResult it : resultSet) {
|
||||
subtitles.addAll(service.getSubtitleList(it, languageName));
|
||||
}
|
||||
|
||||
|
||||
return subtitles;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected static Collection<SearchResult> findProbableSearchResults(String query, Iterable<? extends SearchResult> searchResults) {
|
||||
// auto-select most probable search result
|
||||
Set<SearchResult> probableMatches = new LinkedHashSet<SearchResult>();
|
||||
|
||||
|
||||
// use name similarity metric
|
||||
SimilarityMetric metric = new MetricAvg(new SequenceMatchSimilarity(), new NameSimilarityMetric());
|
||||
|
||||
|
||||
// find probable matches using name similarity > threshold
|
||||
for (SearchResult result : searchResults) {
|
||||
if (metric.getSimilarity(query, removeTrailingBrackets(result.getName())) > 0.8f || result.getName().toLowerCase().startsWith(query.toLowerCase())) {
|
||||
probableMatches.add(result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return probableMatches;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Detect charset and parse subtitle file even if extension is invalid
|
||||
*/
|
||||
public static List<SubtitleElement> decodeSubtitles(MemoryFile file) throws IOException {
|
||||
// gather all formats, put likely formats first
|
||||
LinkedList<SubtitleFormat> likelyFormats = new LinkedList<SubtitleFormat>();
|
||||
|
||||
|
||||
for (SubtitleFormat format : SubtitleFormat.values()) {
|
||||
if (format.getFilter().accept(file.getName()))
|
||||
likelyFormats.addFirst(format);
|
||||
else
|
||||
likelyFormats.addLast(format);
|
||||
}
|
||||
|
||||
|
||||
// decode bytes
|
||||
String textfile = getText(file.getData());
|
||||
|
||||
|
||||
// decode subtitle file with the first reader that seems to work
|
||||
for (SubtitleFormat format : likelyFormats) {
|
||||
// reset reader to position 0
|
||||
SubtitleReader parser = format.newReader(new StringReader(textfile));
|
||||
|
||||
|
||||
if (parser.hasNext()) {
|
||||
// correct format found
|
||||
List<SubtitleElement> list = new ArrayList<SubtitleElement>(500);
|
||||
|
||||
|
||||
// read subtitle file
|
||||
while (parser.hasNext()) {
|
||||
list.add(parser.next());
|
||||
}
|
||||
|
||||
|
||||
return list;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// unsupported subtitle format
|
||||
throw new IOException("Cannot read subtitle format");
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
|
||||
if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) {
|
||||
throw new IllegalArgumentException("Format not supported");
|
||||
}
|
||||
|
||||
|
||||
// convert to target format and target encoding
|
||||
if (outputFormat == SubtitleFormat.SubRip) {
|
||||
// output buffer
|
||||
StringBuilder buffer = new StringBuilder(4 * 1024);
|
||||
SubRipWriter out = new SubRipWriter(buffer);
|
||||
|
||||
|
||||
for (SubtitleElement it : decodeSubtitles(data)) {
|
||||
if (outputTimingOffset != 0)
|
||||
it = new SubtitleElement(max(0, it.getStart() + outputTimingOffset), max(0, it.getEnd() + outputTimingOffset), it.getText());
|
||||
|
||||
|
||||
out.write(it);
|
||||
}
|
||||
|
||||
|
||||
return outputEncoding.encode(CharBuffer.wrap(buffer));
|
||||
}
|
||||
|
||||
|
||||
// only change encoding
|
||||
return outputEncoding.encode(getText(data.getData()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static SubtitleFormat getSubtitleFormat(File file) {
|
||||
for (SubtitleFormat it : SubtitleFormat.values()) {
|
||||
if (it.getFilter().accept(file))
|
||||
return it;
|
||||
}
|
||||
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static SubtitleFormat getSubtitleFormatByName(String name) {
|
||||
for (SubtitleFormat it : SubtitleFormat.values()) {
|
||||
// check by name
|
||||
if (it.name().equalsIgnoreCase(name))
|
||||
return it;
|
||||
|
||||
|
||||
// check by extension
|
||||
if (it.getFilter().acceptExtension(name))
|
||||
return it;
|
||||
}
|
||||
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static String formatSubtitle(String name, String languageName, String type) {
|
||||
StringBuilder sb = new StringBuilder(name);
|
||||
|
||||
|
||||
if (languageName != null) {
|
||||
String lang = Language.getISO3LanguageCodeByName(languageName);
|
||||
|
||||
|
||||
if (lang == null) {
|
||||
// we probably won't get here, but just in case
|
||||
lang = languageName.replaceAll("\\W", "");
|
||||
}
|
||||
|
||||
|
||||
sb.append('.').append(lang);
|
||||
}
|
||||
|
||||
|
||||
if (type != null) {
|
||||
sb.append('.').append(type);
|
||||
}
|
||||
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static MemoryFile fetchSubtitle(SubtitleDescriptor descriptor) throws Exception {
|
||||
ByteBuffer data = descriptor.fetch();
|
||||
|
||||
|
||||
// extract subtitles from archive
|
||||
ArchiveType type = ArchiveType.forName(descriptor.getType());
|
||||
|
||||
|
||||
if (type != ArchiveType.UNKOWN) {
|
||||
// extract subtitle from archive
|
||||
Iterator<MemoryFile> it = type.fromData(data).iterator();
|
||||
|
@ -241,17 +230,16 @@ public final class SubtitleUtilities {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// assume that the fetched data is the subtitle
|
||||
return new MemoryFile(descriptor.getPath(), data);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Private constructor that always throws, so this class cannot be instantiated.
 *
 * @throws UnsupportedOperationException always
 */
private SubtitleUtilities() {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -892,7 +892,7 @@ class SubtitleAutoMatchDialog extends JDialog {
|
|||
|
||||
@Override
|
||||
public String getName() {
|
||||
return String.format("%s (by hash)", service.getName());
|
||||
return String.format("%s [via hash]", service.getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -919,7 +919,7 @@ class SubtitleAutoMatchDialog extends JDialog {
|
|||
|
||||
@Override
|
||||
public String getName() {
|
||||
return String.format("%s (by name)", service.getName());
|
||||
return String.format("%s [via name]", service.getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -946,7 +946,7 @@ class SubtitleAutoMatchDialog extends JDialog {
|
|||
}
|
||||
}
|
||||
|
||||
List<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, languageName);
|
||||
Set<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, languageName);
|
||||
|
||||
// dialog may have been cancelled by now
|
||||
if (Thread.interrupted()) {
|
||||
|
@ -978,6 +978,7 @@ class SubtitleAutoMatchDialog extends JDialog {
|
|||
SimilarityMetric sanity = EpisodeMetrics.verificationMetric();
|
||||
float minMatchSimilarity = 0.5f;
|
||||
|
||||
// this could be very slow, lets hope at this point there is not much left due to positive hash matches
|
||||
for (File file : files) {
|
||||
// add matching subtitles
|
||||
for (SubtitleDescriptor it : subtitles) {
|
||||
|
|
Loading…
Reference in New Issue