* make local movie/series lookup as blazing fast as possible

This commit is contained in:
Reinhard Pointner 2014-01-07 12:26:44 +00:00
parent dc58ae1954
commit 36747c4ea6
4 changed files with 159 additions and 109 deletions

View File

@ -11,11 +11,11 @@ import static net.sourceforge.tuned.FileUtilities.*;
import java.io.File; import java.io.File;
import java.io.FileFilter; import java.io.FileFilter;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.text.CollationKey; import java.text.CollationKey;
import java.text.Collator; import java.text.Collator;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
@ -400,43 +400,45 @@ public class MediaDetection {
return matches; return matches;
} }
private static List<Entry<String, SearchResult>> seriesIndex = new ArrayList<Entry<String, SearchResult>>(75000); private static final List<IndexEntry<SearchResult>> seriesIndex = new ArrayList<IndexEntry<SearchResult>>(100000);
public static synchronized List<Entry<String, SearchResult>> getSeriesIndex() throws IOException { public static List<IndexEntry<SearchResult>> getSeriesIndex() throws IOException {
if (seriesIndex.isEmpty()) { synchronized (seriesIndex) {
try { if (seriesIndex.isEmpty()) {
for (SearchResult[] index : new SearchResult[][] { releaseInfo.getTheTVDBIndex(), releaseInfo.getAnidbIndex() }) { try {
for (SearchResult item : index) { for (SearchResult[] index : new SearchResult[][] { releaseInfo.getTheTVDBIndex(), releaseInfo.getAnidbIndex() }) {
for (String name : item.getEffectiveNames()) { for (SearchResult it : index) {
seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(name).toLowerCase(), item)); seriesIndex.addAll(HighPerformanceMatcher.prepare(it));
} }
} }
} } catch (Exception e) {
} catch (Exception e) { // can't load movie index, just try again next time
// can't load movie index, just try again next time Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load series index: " + e.getMessage(), e);
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load series index: " + e.getMessage(), e);
return emptyList();
}
}
return seriesIndex; // rely on online search
return emptyList();
}
}
return seriesIndex;
}
} }
public static List<String> matchSeriesByName(Collection<String> names, int maxStartIndex) throws Exception { public static List<String> matchSeriesByName(Collection<String> files, int maxStartIndex) throws Exception {
HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex); HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex);
List<String> matches = new ArrayList<String>(); List<String> matches = new ArrayList<String>();
for (String name : names) { List<CollationKey[]> names = HighPerformanceMatcher.prepare(files);
String bestMatch = "";
for (Entry<String, SearchResult> it : getSeriesIndex()) { for (CollationKey[] name : names) {
String identifier = it.getKey(); IndexEntry<SearchResult> bestMatch = null;
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier); for (IndexEntry<SearchResult> it : getSeriesIndex()) {
if (commonName != null && commonName.length() >= identifier.length() && commonName.length() > bestMatch.length()) { CollationKey[] commonName = nameMatcher.matchFirstCommonSequence(name, it.lenientKey);
bestMatch = commonName; if (commonName != null && commonName.length >= it.lenientKey.length && (bestMatch == null || commonName.length > bestMatch.lenientKey.length)) {
bestMatch = it;
} }
} }
if (bestMatch.length() > 0) { if (bestMatch != null) {
matches.add(bestMatch); matches.add(bestMatch.lenientName);
} }
} }
@ -469,12 +471,12 @@ public class MediaDetection {
float similarityThreshold = strict ? 0.75f : 0.5f; float similarityThreshold = strict ? 0.75f : 0.5f;
List<SearchResult> seriesList = new ArrayList<SearchResult>(); List<SearchResult> seriesList = new ArrayList<SearchResult>();
for (Entry<String, SearchResult> it : getSeriesIndex()) { for (IndexEntry<SearchResult> it : getSeriesIndex()) {
String name = spacing.matcher(it.getKey()).replaceAll("").toLowerCase(); String name = spacing.matcher(it.lenientName).replaceAll("").toLowerCase();
for (String term : terms) { for (String term : terms) {
if (term.contains(name)) { if (term.contains(name)) {
if (metric.getSimilarity(term, name) >= similarityThreshold) { if (metric.getSimilarity(term, name) >= similarityThreshold) {
seriesList.add(it.getValue()); seriesList.add(it.object);
} }
break; break;
} }
@ -753,24 +755,25 @@ public class MediaDetection {
return matches != null && matches.size() > 0 ? matches.get(0) : null; return matches != null && matches.size() > 0 ? matches.get(0) : null;
} }
private static List<Entry<String, Movie>> movieIndex = new ArrayList<Entry<String, Movie>>(100000); private static final List<IndexEntry<Movie>> movieIndex = new ArrayList<IndexEntry<Movie>>(100000);
public static synchronized List<Entry<String, Movie>> getMovieIndex() throws IOException { public static List<IndexEntry<Movie>> getMovieIndex() throws IOException {
if (movieIndex.isEmpty()) { synchronized (movieIndex) {
try { if (movieIndex.isEmpty()) {
for (Movie movie : releaseInfo.getMovieList()) { try {
for (String name : movie.getEffectiveNamesWithoutYear()) { for (Movie it : releaseInfo.getMovieList()) {
movieIndex.add(new SimpleEntry<String, Movie>(normalizePunctuation(name).toLowerCase(), movie)); movieIndex.addAll(HighPerformanceMatcher.prepare(it));
} }
} } catch (Exception e) {
} catch (Exception e) { // can't load movie index, just try again next time
// can't load movie index, just try again next time Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load movie index: " + e.getMessage(), e);
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load movie index: " + e.getMessage(), e);
return emptyList();
}
}
return movieIndex; // if we can't use internal index we can only rely on online search
return emptyList();
}
}
return movieIndex;
}
} }
public static List<Movie> matchMovieName(final Collection<String> files, boolean strict, int maxStartIndex) throws Exception { public static List<Movie> matchMovieName(final Collection<String> files, boolean strict, int maxStartIndex) throws Exception {
@ -778,19 +781,19 @@ public class MediaDetection {
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex); final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex);
final Map<Movie, String> matchMap = new HashMap<Movie, String>(); final Map<Movie, String> matchMap = new HashMap<Movie, String>();
for (Entry<String, Movie> movie : getMovieIndex()) { List<CollationKey[]> names = HighPerformanceMatcher.prepare(files);
for (String name : files) {
String movieIdentifier = movie.getKey(); for (IndexEntry<Movie> movie : getMovieIndex()) {
String commonName = nameMatcher.matchFirstCommonSequence(name, movieIdentifier); for (CollationKey[] name : names) {
if (commonName != null && commonName.length() >= movieIdentifier.length()) { CollationKey[] commonName = nameMatcher.matchFirstCommonSequence(name, movie.lenientKey);
String strictMovieIdentifier = movie.getKey() + " " + movie.getValue().getYear(); if (commonName != null && commonName.length >= movie.lenientKey.length) {
String strictCommonName = nameMatcher.matchFirstCommonSequence(name, strictMovieIdentifier); CollationKey[] strictCommonName = nameMatcher.matchFirstCommonSequence(name, movie.strictKey);
if (strictCommonName != null && strictCommonName.length() >= strictMovieIdentifier.length()) { if (strictCommonName != null && strictCommonName.length >= movie.strictKey.length) {
// prefer strict match // prefer strict match
matchMap.put(movie.getValue(), strictCommonName); matchMap.put(movie.object, movie.strictName);
} else if (!strict) { } else if (!strict) {
// make sure the common identifier is not just the year // make sure the common identifier is not just the year
matchMap.put(movie.getValue(), commonName); matchMap.put(movie.object, movie.lenientName);
} }
} }
} }
@ -826,21 +829,20 @@ public class MediaDetection {
float similarityThreshold = strict ? 0.9f : 0.5f; float similarityThreshold = strict ? 0.9f : 0.5f;
LinkedList<Movie> movies = new LinkedList<Movie>(); LinkedList<Movie> movies = new LinkedList<Movie>();
for (Entry<String, Movie> it : getMovieIndex()) { for (IndexEntry<Movie> it : getMovieIndex()) {
String name = spacing.matcher(it.getKey()).replaceAll("").toLowerCase(); String name = spacing.matcher(it.lenientName).replaceAll("").toLowerCase();
for (String term : terms) { for (String term : terms) {
if (term.contains(name)) { if (term.contains(name)) {
String year = String.valueOf(it.getValue().getYear()); String year = String.valueOf(it.object.getYear());
if (term.contains(year) && metric.getSimilarity(term, name + year) > similarityThreshold) { if (term.contains(year) && metric.getSimilarity(term, name + year) > similarityThreshold) {
movies.addFirst(it.getValue()); movies.addFirst(it.object);
} else if (metric.getSimilarity(term, name) > similarityThreshold) { } else if (metric.getSimilarity(term, name) > similarityThreshold) {
movies.addLast(it.getValue()); movies.addLast(it.object);
} }
break; break;
} }
} }
} }
return new ArrayList<Movie>(movies); return new ArrayList<Movie>(movies);
} }
@ -1082,31 +1084,79 @@ public class MediaDetection {
return probableMatches; return probableMatches;
} }
/**
 * Immutable index record: one indexed object together with its lenient and
 * strict names and the collation keys that were computed for those names
 * ahead of time.
 *
 * <p>NOTE(review): the class is declared {@link Serializable}, but
 * {@link CollationKey} itself does not implement {@code Serializable}, so
 * writing an entry with non-null keys through Java serialization presumably
 * fails -- confirm whether entries are ever actually serialized.
 *
 * @param <T> type of the indexed object
 */
public static class IndexEntry<T> implements Serializable {

	// explicit version id so the serialized form does not depend on compiler-generated values
	private static final long serialVersionUID = 1L;

	private final T object;
	private final String lenientName;
	private final String strictName;
	private final CollationKey[] lenientKey;
	private final CollationKey[] strictKey;

	/**
	 * @param object      the indexed object
	 * @param lenientName name stored as the lenient name of this entry
	 * @param strictName  name stored as the strict name of this entry (may be {@code null})
	 * @param lenientKey  collation keys stored for the lenient name
	 * @param strictKey   collation keys stored for the strict name (may be {@code null})
	 */
	public IndexEntry(T object, String lenientName, String strictName, CollationKey[] lenientKey, CollationKey[] strictKey) {
		this.object = object;
		this.lenientName = lenientName;
		this.strictName = strictName;
		this.lenientKey = lenientKey;
		this.strictKey = strictKey;
	}
}
/* /*
* Heavy-duty name matcher used for matching a file to one or more movies (out of a list of ~50k) * Heavy-duty name matcher used for matching a file to one or more movies (out of a list of ~50k)
*/ */
private static class HighPerformanceMatcher extends CommonSequenceMatcher { private static class HighPerformanceMatcher extends CommonSequenceMatcher {
private static final Collator collator = getLenientCollator(Locale.ENGLISH); private static final Collator collator = getLenientCollator(Locale.ENGLISH);
private static final Pattern space = Pattern.compile("\\s+");
private static final Map<String, CollationKey[]> transformCache = synchronizedMap(new HashMap<String, CollationKey[]>(65536)); public static CollationKey[] prepare(String sequence) {
String[] words = space.split(normalizePunctuation(sequence));
CollationKey[] keys = new CollationKey[words.length];
for (int i = 0; i < words.length; i++) {
keys[i] = collator.getCollationKey(words[i]);
}
return keys;
}
/**
 * Converts every given sequence into its collation key array by delegating
 * to {@code prepare(String)}.
 *
 * @param sequences the sequences to convert
 * @return one key array per sequence, in the iteration order of the given collection
 */
public static List<CollationKey[]> prepare(Collection<String> sequences) {
	List<CollationKey[]> keys = new ArrayList<CollationKey[]>(sequences.size());
	for (String sequence : sequences) {
		CollationKey[] sequenceKeys = prepare(sequence);
		keys.add(sequenceKeys);
	}
	return keys;
}
/**
 * Builds the index entries for a movie: one entry per effective name, pairing
 * the name without year (lenient) with the full effective name (strict) at
 * the same position, each with its precomputed collation keys.
 *
 * @param m the movie to index
 * @return the index entries for the movie
 */
public static List<IndexEntry<Movie>> prepare(Movie m) {
	List<String> effectiveNamesWithoutYear = m.getEffectiveNamesWithoutYear();
	List<String> effectiveNames = m.getEffectiveNames();

	// both lists are read in parallel; bound the loop by the shorter one so that a
	// length mismatch cannot raise an IndexOutOfBoundsException while building the index
	int count = Math.min(effectiveNames.size(), effectiveNamesWithoutYear.size());

	List<IndexEntry<Movie>> index = new ArrayList<IndexEntry<Movie>>(count);
	for (int i = 0; i < count; i++) {
		String lenientName = normalizePunctuation(effectiveNamesWithoutYear.get(i));
		String strictName = normalizePunctuation(effectiveNames.get(i));
		index.add(new IndexEntry<Movie>(m, lenientName, strictName, prepare(lenientName), prepare(strictName)));
	}
	return index;
}
/**
 * Builds the index entries for a search result: one entry per effective name,
 * holding the punctuation-normalized name and its precomputed collation keys.
 * The strict name and strict keys are left {@code null}.
 *
 * @param r the search result to index
 * @return the index entries for the search result
 */
public static List<IndexEntry<SearchResult>> prepare(SearchResult r) {
	List<String> names = r.getEffectiveNames();
	List<IndexEntry<SearchResult>> entries = new ArrayList<IndexEntry<SearchResult>>(names.size());
	for (String rawName : names) {
		String lenientName = normalizePunctuation(rawName);
		CollationKey[] lenientKey = prepare(lenientName);
		entries.add(new IndexEntry<SearchResult>(r, lenientName, null, lenientKey, null));
	}
	return entries;
}
public HighPerformanceMatcher(int maxStartIndex) { public HighPerformanceMatcher(int maxStartIndex) {
super(collator, maxStartIndex, true); super(collator, maxStartIndex, true);
} }
@Override @Override
protected CollationKey[] split(String sequence) { public CollationKey[] split(String sequence) {
CollationKey[] value = transformCache.get(sequence); throw new UnsupportedOperationException("requires ahead-of-time collation");
if (value == null) {
value = super.split(normalize(sequence));
transformCache.put(sequence, value);
}
return value;
}
public String normalize(String sequence) {
return normalizePunctuation(sequence); // only normalize punctuation, make sure we keep the year (important for movie matching)
} }
} }

View File

@ -1,7 +1,5 @@
package net.sourceforge.filebot.similarity; package net.sourceforge.filebot.similarity;
import static java.util.Arrays.*; import static java.util.Arrays.*;
import static java.util.Collections.*; import static java.util.Collections.*;
@ -11,9 +9,8 @@ import java.util.HashMap;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
public class CommonSequenceMatcher { public class CommonSequenceMatcher {
public static Collator getLenientCollator(Locale locale) { public static Collator getLenientCollator(Locale locale) {
// use maximum strength collator by default // use maximum strength collator by default
Collator collator = Collator.getInstance(locale); Collator collator = Collator.getInstance(locale);
@ -21,52 +18,54 @@ public class CommonSequenceMatcher {
collator.setStrength(Collator.PRIMARY); collator.setStrength(Collator.PRIMARY);
return collator; return collator;
} }
protected final Collator collator; protected final Collator collator;
protected final int commonSequenceMaxStartIndex; protected final int commonSequenceMaxStartIndex;
protected final boolean returnFirstMatch; protected final boolean returnFirstMatch;
public CommonSequenceMatcher(Collator collator, int commonSequenceMaxStartIndex, boolean returnFirstMatch) { public CommonSequenceMatcher(Collator collator, int commonSequenceMaxStartIndex, boolean returnFirstMatch) {
this.collator = collator; this.collator = collator;
this.commonSequenceMaxStartIndex = commonSequenceMaxStartIndex; this.commonSequenceMaxStartIndex = commonSequenceMaxStartIndex;
this.returnFirstMatch = returnFirstMatch; this.returnFirstMatch = returnFirstMatch;
} }
public Collator getCollator() { public Collator getCollator() {
return collator; return collator;
} }
public String matchFirstCommonSequence(String... names) { public String matchFirstCommonSequence(String... names) {
CollationKey[] common = null; CollationKey[][] words = new CollationKey[names.length][];
for (int i = 0; i < names.length; i++) {
for (String it : names) { words[i] = split(names[i]);
CollationKey[] words = split(it); }
return synth(matchFirstCommonSequence(words));
}
public <E extends Comparable<E>> E[] matchFirstCommonSequence(E[]... names) {
E[] common = null;
for (E[] words : names) {
if (common == null) { if (common == null) {
// initialize common with current word array // initialize common with current word array
common = words; common = words;
} else { } else {
// find common sequence // find common sequence
common = firstCommonSequence(common, words, commonSequenceMaxStartIndex, returnFirstMatch); common = firstCommonSequence(common, words, commonSequenceMaxStartIndex, returnFirstMatch);
if (common == null) { if (common == null) {
// no common sequence // no common sequence
return null; return null;
} }
} }
} }
return common;
if (common == null)
return null;
return synth(common);
} }
protected String synth(CollationKey[] keys) { protected String synth(CollationKey[] keys) {
if (keys == null) {
return null;
}
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (CollationKey it : keys) { for (CollationKey it : keys) {
if (sb.length() > 0) { if (sb.length() > 0) {
@ -76,15 +75,13 @@ public class CommonSequenceMatcher {
} }
return sb.toString(); return sb.toString();
} }
public CollationKey[] split(String sequence) {
protected CollationKey[] split(String sequence) {
return getCollationKeys(sequence.split("\\s+")); return getCollationKeys(sequence.split("\\s+"));
} }
private final Map<String, CollationKey> collationKeyDictionary = synchronizedMap(new HashMap<String, CollationKey>(256)); private final Map<String, CollationKey> collationKeyDictionary = synchronizedMap(new HashMap<String, CollationKey>(256));
protected CollationKey[] getCollationKeys(String[] words) { protected CollationKey[] getCollationKeys(String[] words) {
CollationKey[] keys = new CollationKey[words.length]; CollationKey[] keys = new CollationKey[words.length];
for (int i = 0; i < keys.length; i++) { for (int i = 0; i < keys.length; i++) {
@ -96,24 +93,23 @@ public class CommonSequenceMatcher {
} }
return keys; return keys;
} }
protected <E extends Comparable<E>> E[] firstCommonSequence(E[] seq1, E[] seq2, int maxStartIndex, boolean returnFirstMatch) { protected <E extends Comparable<E>> E[] firstCommonSequence(E[] seq1, E[] seq2, int maxStartIndex, boolean returnFirstMatch) {
E[] matchSeq = null; E[] matchSeq = null;
for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) { for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) { for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
// common sequence length // common sequence length
int len = 0; int len = 0;
// iterate over common sequence // iterate over common sequence
while ((i + len < seq1.length) && (j + len < seq2.length) && (seq1[i + len].compareTo(seq2[j + len]) == 0)) { while ((i + len < seq1.length) && (j + len < seq2.length) && (seq1[i + len].compareTo(seq2[j + len]) == 0)) {
len++; len++;
} }
// check if a common sequence was found // check if a common sequence was found
if (len > (matchSeq == null ? 0 : matchSeq.length)) { if (len > (matchSeq == null ? 0 : matchSeq.length)) {
matchSeq = copyOfRange(seq1, i, i + len); matchSeq = copyOfRange(seq1, i, i + len);
// look for first match // look for first match
if (returnFirstMatch) { if (returnFirstMatch) {
return matchSeq; return matchSeq;

View File

@ -48,7 +48,7 @@ public class SeriesNameMatcher {
commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3, true) { commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3, true) {
@Override @Override
protected CollationKey[] split(String sequence) { public CollationKey[] split(String sequence) {
return super.split(normalize(sequence)); return super.split(normalize(sequence));
} }
}; };

View File

@ -43,6 +43,10 @@ public class Movie extends SearchResult {
return tmdbId; return tmdbId;
} }
/**
 * Returns the result of {@code toString(name, year)} for this movie.
 * NOTE(review): presumably the movie name combined with its year -- the exact
 * format is defined by the {@code toString} helper, which is not visible here; confirm.
 */
public String getNameWithYear() {
return toString(name, year);
}
@Override @Override
public List<String> getEffectiveNames() { public List<String> getEffectiveNames() {
if (aliasNames == null || aliasNames.length == 0) { if (aliasNames == null || aliasNames.length == 0) {