* make local movie/series lookup as blazing fast as possible
This commit is contained in:
parent
dc58ae1954
commit
36747c4ea6
|
@ -11,11 +11,11 @@ import static net.sourceforge.tuned.FileUtilities.*;
|
|||
import java.io.File;
|
||||
import java.io.FileFilter;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.text.CollationKey;
|
||||
import java.text.Collator;
|
||||
import java.util.AbstractMap.SimpleEntry;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
|
@ -400,43 +400,45 @@ public class MediaDetection {
|
|||
return matches;
|
||||
}
|
||||
|
||||
private static List<Entry<String, SearchResult>> seriesIndex = new ArrayList<Entry<String, SearchResult>>(75000);
|
||||
private static final List<IndexEntry<SearchResult>> seriesIndex = new ArrayList<IndexEntry<SearchResult>>(100000);
|
||||
|
||||
public static synchronized List<Entry<String, SearchResult>> getSeriesIndex() throws IOException {
|
||||
if (seriesIndex.isEmpty()) {
|
||||
try {
|
||||
for (SearchResult[] index : new SearchResult[][] { releaseInfo.getTheTVDBIndex(), releaseInfo.getAnidbIndex() }) {
|
||||
for (SearchResult item : index) {
|
||||
for (String name : item.getEffectiveNames()) {
|
||||
seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(name).toLowerCase(), item));
|
||||
public static List<IndexEntry<SearchResult>> getSeriesIndex() throws IOException {
|
||||
synchronized (seriesIndex) {
|
||||
if (seriesIndex.isEmpty()) {
|
||||
try {
|
||||
for (SearchResult[] index : new SearchResult[][] { releaseInfo.getTheTVDBIndex(), releaseInfo.getAnidbIndex() }) {
|
||||
for (SearchResult it : index) {
|
||||
seriesIndex.addAll(HighPerformanceMatcher.prepare(it));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// can't load movie index, just try again next time
|
||||
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load series index: " + e.getMessage(), e);
|
||||
return emptyList();
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// can't load movie index, just try again next time
|
||||
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load series index: " + e.getMessage(), e);
|
||||
|
||||
return seriesIndex;
|
||||
// rely on online search
|
||||
return emptyList();
|
||||
}
|
||||
}
|
||||
return seriesIndex;
|
||||
}
|
||||
}
|
||||
|
||||
public static List<String> matchSeriesByName(Collection<String> names, int maxStartIndex) throws Exception {
|
||||
public static List<String> matchSeriesByName(Collection<String> files, int maxStartIndex) throws Exception {
|
||||
HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex);
|
||||
List<String> matches = new ArrayList<String>();
|
||||
|
||||
for (String name : names) {
|
||||
String bestMatch = "";
|
||||
for (Entry<String, SearchResult> it : getSeriesIndex()) {
|
||||
String identifier = it.getKey();
|
||||
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
|
||||
if (commonName != null && commonName.length() >= identifier.length() && commonName.length() > bestMatch.length()) {
|
||||
bestMatch = commonName;
|
||||
List<CollationKey[]> names = HighPerformanceMatcher.prepare(files);
|
||||
|
||||
for (CollationKey[] name : names) {
|
||||
IndexEntry<SearchResult> bestMatch = null;
|
||||
for (IndexEntry<SearchResult> it : getSeriesIndex()) {
|
||||
CollationKey[] commonName = nameMatcher.matchFirstCommonSequence(name, it.lenientKey);
|
||||
if (commonName != null && commonName.length >= it.lenientKey.length && (bestMatch == null || commonName.length > bestMatch.lenientKey.length)) {
|
||||
bestMatch = it;
|
||||
}
|
||||
}
|
||||
if (bestMatch.length() > 0) {
|
||||
matches.add(bestMatch);
|
||||
if (bestMatch != null) {
|
||||
matches.add(bestMatch.lenientName);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -469,12 +471,12 @@ public class MediaDetection {
|
|||
float similarityThreshold = strict ? 0.75f : 0.5f;
|
||||
|
||||
List<SearchResult> seriesList = new ArrayList<SearchResult>();
|
||||
for (Entry<String, SearchResult> it : getSeriesIndex()) {
|
||||
String name = spacing.matcher(it.getKey()).replaceAll("").toLowerCase();
|
||||
for (IndexEntry<SearchResult> it : getSeriesIndex()) {
|
||||
String name = spacing.matcher(it.lenientName).replaceAll("").toLowerCase();
|
||||
for (String term : terms) {
|
||||
if (term.contains(name)) {
|
||||
if (metric.getSimilarity(term, name) >= similarityThreshold) {
|
||||
seriesList.add(it.getValue());
|
||||
seriesList.add(it.object);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -753,24 +755,25 @@ public class MediaDetection {
|
|||
return matches != null && matches.size() > 0 ? matches.get(0) : null;
|
||||
}
|
||||
|
||||
private static List<Entry<String, Movie>> movieIndex = new ArrayList<Entry<String, Movie>>(100000);
|
||||
private static final List<IndexEntry<Movie>> movieIndex = new ArrayList<IndexEntry<Movie>>(100000);
|
||||
|
||||
public static synchronized List<Entry<String, Movie>> getMovieIndex() throws IOException {
|
||||
if (movieIndex.isEmpty()) {
|
||||
try {
|
||||
for (Movie movie : releaseInfo.getMovieList()) {
|
||||
for (String name : movie.getEffectiveNamesWithoutYear()) {
|
||||
movieIndex.add(new SimpleEntry<String, Movie>(normalizePunctuation(name).toLowerCase(), movie));
|
||||
public static List<IndexEntry<Movie>> getMovieIndex() throws IOException {
|
||||
synchronized (movieIndex) {
|
||||
if (movieIndex.isEmpty()) {
|
||||
try {
|
||||
for (Movie it : releaseInfo.getMovieList()) {
|
||||
movieIndex.addAll(HighPerformanceMatcher.prepare(it));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// can't load movie index, just try again next time
|
||||
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load movie index: " + e.getMessage(), e);
|
||||
return emptyList();
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// can't load movie index, just try again next time
|
||||
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load movie index: " + e.getMessage(), e);
|
||||
|
||||
return movieIndex;
|
||||
// if we can't use internal index we can only rely on online search
|
||||
return emptyList();
|
||||
}
|
||||
}
|
||||
return movieIndex;
|
||||
}
|
||||
}
|
||||
|
||||
public static List<Movie> matchMovieName(final Collection<String> files, boolean strict, int maxStartIndex) throws Exception {
|
||||
|
@ -778,19 +781,19 @@ public class MediaDetection {
|
|||
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex);
|
||||
final Map<Movie, String> matchMap = new HashMap<Movie, String>();
|
||||
|
||||
for (Entry<String, Movie> movie : getMovieIndex()) {
|
||||
for (String name : files) {
|
||||
String movieIdentifier = movie.getKey();
|
||||
String commonName = nameMatcher.matchFirstCommonSequence(name, movieIdentifier);
|
||||
if (commonName != null && commonName.length() >= movieIdentifier.length()) {
|
||||
String strictMovieIdentifier = movie.getKey() + " " + movie.getValue().getYear();
|
||||
String strictCommonName = nameMatcher.matchFirstCommonSequence(name, strictMovieIdentifier);
|
||||
if (strictCommonName != null && strictCommonName.length() >= strictMovieIdentifier.length()) {
|
||||
List<CollationKey[]> names = HighPerformanceMatcher.prepare(files);
|
||||
|
||||
for (IndexEntry<Movie> movie : getMovieIndex()) {
|
||||
for (CollationKey[] name : names) {
|
||||
CollationKey[] commonName = nameMatcher.matchFirstCommonSequence(name, movie.lenientKey);
|
||||
if (commonName != null && commonName.length >= movie.lenientKey.length) {
|
||||
CollationKey[] strictCommonName = nameMatcher.matchFirstCommonSequence(name, movie.strictKey);
|
||||
if (strictCommonName != null && strictCommonName.length >= movie.strictKey.length) {
|
||||
// prefer strict match
|
||||
matchMap.put(movie.getValue(), strictCommonName);
|
||||
matchMap.put(movie.object, movie.strictName);
|
||||
} else if (!strict) {
|
||||
// make sure the common identifier is not just the year
|
||||
matchMap.put(movie.getValue(), commonName);
|
||||
matchMap.put(movie.object, movie.lenientName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -826,21 +829,20 @@ public class MediaDetection {
|
|||
float similarityThreshold = strict ? 0.9f : 0.5f;
|
||||
|
||||
LinkedList<Movie> movies = new LinkedList<Movie>();
|
||||
for (Entry<String, Movie> it : getMovieIndex()) {
|
||||
String name = spacing.matcher(it.getKey()).replaceAll("").toLowerCase();
|
||||
for (IndexEntry<Movie> it : getMovieIndex()) {
|
||||
String name = spacing.matcher(it.lenientName).replaceAll("").toLowerCase();
|
||||
for (String term : terms) {
|
||||
if (term.contains(name)) {
|
||||
String year = String.valueOf(it.getValue().getYear());
|
||||
String year = String.valueOf(it.object.getYear());
|
||||
if (term.contains(year) && metric.getSimilarity(term, name + year) > similarityThreshold) {
|
||||
movies.addFirst(it.getValue());
|
||||
movies.addFirst(it.object);
|
||||
} else if (metric.getSimilarity(term, name) > similarityThreshold) {
|
||||
movies.addLast(it.getValue());
|
||||
movies.addLast(it.object);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new ArrayList<Movie>(movies);
|
||||
}
|
||||
|
||||
|
@ -1082,31 +1084,79 @@ public class MediaDetection {
|
|||
return probableMatches;
|
||||
}
|
||||
|
||||
public static class IndexEntry<T> implements Serializable {
|
||||
|
||||
private final T object;
|
||||
private final String lenientName;
|
||||
private final String strictName;
|
||||
private final CollationKey[] lenientKey;
|
||||
private final CollationKey[] strictKey;
|
||||
|
||||
public IndexEntry(T object, String lenientName, String strictName, CollationKey[] lenientKey, CollationKey[] strictKey) {
|
||||
this.object = object;
|
||||
this.lenientName = lenientName;
|
||||
this.strictName = strictName;
|
||||
this.lenientKey = lenientKey;
|
||||
this.strictKey = strictKey;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Heavy-duty name matcher used for matching a file to or more movies (out of a list of ~50k)
|
||||
*/
|
||||
private static class HighPerformanceMatcher extends CommonSequenceMatcher {
|
||||
|
||||
private static final Collator collator = getLenientCollator(Locale.ENGLISH);
|
||||
private static final Pattern space = Pattern.compile("\\s+");
|
||||
|
||||
private static final Map<String, CollationKey[]> transformCache = synchronizedMap(new HashMap<String, CollationKey[]>(65536));
|
||||
public static CollationKey[] prepare(String sequence) {
|
||||
String[] words = space.split(normalizePunctuation(sequence));
|
||||
CollationKey[] keys = new CollationKey[words.length];
|
||||
for (int i = 0; i < words.length; i++) {
|
||||
keys[i] = collator.getCollationKey(words[i]);
|
||||
}
|
||||
return keys;
|
||||
}
|
||||
|
||||
public static List<CollationKey[]> prepare(Collection<String> sequences) {
|
||||
List<CollationKey[]> result = new ArrayList<CollationKey[]>(sequences.size());
|
||||
for (String it : sequences) {
|
||||
result.add(prepare(it));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static List<IndexEntry<Movie>> prepare(Movie m) {
|
||||
List<String> effectiveNamesWithoutYear = m.getEffectiveNamesWithoutYear();
|
||||
List<String> effectiveNames = m.getEffectiveNames();
|
||||
List<IndexEntry<Movie>> index = new ArrayList<IndexEntry<Movie>>(effectiveNames.size());
|
||||
|
||||
for (int i = 0; i < effectiveNames.size(); i++) {
|
||||
String lenientName = normalizePunctuation(effectiveNamesWithoutYear.get(i));
|
||||
String strictName = normalizePunctuation(effectiveNames.get(i));
|
||||
index.add(new IndexEntry<Movie>(m, lenientName, strictName, prepare(lenientName), prepare(strictName)));
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
public static List<IndexEntry<SearchResult>> prepare(SearchResult r) {
|
||||
List<String> effectiveNames = r.getEffectiveNames();
|
||||
List<IndexEntry<SearchResult>> index = new ArrayList<IndexEntry<SearchResult>>(effectiveNames.size());
|
||||
|
||||
for (int i = 0; i < effectiveNames.size(); i++) {
|
||||
String lenientName = normalizePunctuation(effectiveNames.get(i));
|
||||
index.add(new IndexEntry<SearchResult>(r, lenientName, null, prepare(lenientName), null));
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
public HighPerformanceMatcher(int maxStartIndex) {
|
||||
super(collator, maxStartIndex, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CollationKey[] split(String sequence) {
|
||||
CollationKey[] value = transformCache.get(sequence);
|
||||
if (value == null) {
|
||||
value = super.split(normalize(sequence));
|
||||
transformCache.put(sequence, value);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
public String normalize(String sequence) {
|
||||
return normalizePunctuation(sequence); // only normalize punctuation, make sure we keep the year (important for movie matching)
|
||||
public CollationKey[] split(String sequence) {
|
||||
throw new UnsupportedOperationException("requires ahead-of-time collation");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
|
||||
package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
import static java.util.Arrays.*;
|
||||
import static java.util.Collections.*;
|
||||
|
||||
|
@ -11,9 +9,8 @@ import java.util.HashMap;
|
|||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class CommonSequenceMatcher {
|
||||
|
||||
|
||||
public static Collator getLenientCollator(Locale locale) {
|
||||
// use maximum strength collator by default
|
||||
Collator collator = Collator.getInstance(locale);
|
||||
|
@ -21,52 +18,54 @@ public class CommonSequenceMatcher {
|
|||
collator.setStrength(Collator.PRIMARY);
|
||||
return collator;
|
||||
}
|
||||
|
||||
|
||||
protected final Collator collator;
|
||||
protected final int commonSequenceMaxStartIndex;
|
||||
protected final boolean returnFirstMatch;
|
||||
|
||||
|
||||
|
||||
public CommonSequenceMatcher(Collator collator, int commonSequenceMaxStartIndex, boolean returnFirstMatch) {
|
||||
this.collator = collator;
|
||||
this.commonSequenceMaxStartIndex = commonSequenceMaxStartIndex;
|
||||
this.returnFirstMatch = returnFirstMatch;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public Collator getCollator() {
|
||||
return collator;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public String matchFirstCommonSequence(String... names) {
|
||||
CollationKey[] common = null;
|
||||
|
||||
for (String it : names) {
|
||||
CollationKey[] words = split(it);
|
||||
|
||||
CollationKey[][] words = new CollationKey[names.length][];
|
||||
for (int i = 0; i < names.length; i++) {
|
||||
words[i] = split(names[i]);
|
||||
}
|
||||
return synth(matchFirstCommonSequence(words));
|
||||
}
|
||||
|
||||
public <E extends Comparable<E>> E[] matchFirstCommonSequence(E[]... names) {
|
||||
E[] common = null;
|
||||
|
||||
for (E[] words : names) {
|
||||
if (common == null) {
|
||||
// initialize common with current word array
|
||||
common = words;
|
||||
} else {
|
||||
// find common sequence
|
||||
common = firstCommonSequence(common, words, commonSequenceMaxStartIndex, returnFirstMatch);
|
||||
|
||||
|
||||
if (common == null) {
|
||||
// no common sequence
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (common == null)
|
||||
return null;
|
||||
|
||||
return synth(common);
|
||||
return common;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected String synth(CollationKey[] keys) {
|
||||
if (keys == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (CollationKey it : keys) {
|
||||
if (sb.length() > 0) {
|
||||
|
@ -76,15 +75,13 @@ public class CommonSequenceMatcher {
|
|||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
protected CollationKey[] split(String sequence) {
|
||||
|
||||
public CollationKey[] split(String sequence) {
|
||||
return getCollationKeys(sequence.split("\\s+"));
|
||||
}
|
||||
|
||||
|
||||
private final Map<String, CollationKey> collationKeyDictionary = synchronizedMap(new HashMap<String, CollationKey>(256));
|
||||
|
||||
|
||||
|
||||
protected CollationKey[] getCollationKeys(String[] words) {
|
||||
CollationKey[] keys = new CollationKey[words.length];
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
|
@ -96,24 +93,23 @@ public class CommonSequenceMatcher {
|
|||
}
|
||||
return keys;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected <E extends Comparable<E>> E[] firstCommonSequence(E[] seq1, E[] seq2, int maxStartIndex, boolean returnFirstMatch) {
|
||||
E[] matchSeq = null;
|
||||
for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
|
||||
for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
|
||||
// common sequence length
|
||||
int len = 0;
|
||||
|
||||
|
||||
// iterate over common sequence
|
||||
while ((i + len < seq1.length) && (j + len < seq2.length) && (seq1[i + len].compareTo(seq2[j + len]) == 0)) {
|
||||
len++;
|
||||
}
|
||||
|
||||
|
||||
// check if a common sequence was found
|
||||
if (len > (matchSeq == null ? 0 : matchSeq.length)) {
|
||||
matchSeq = copyOfRange(seq1, i, i + len);
|
||||
|
||||
|
||||
// look for first match
|
||||
if (returnFirstMatch) {
|
||||
return matchSeq;
|
||||
|
|
|
@ -48,7 +48,7 @@ public class SeriesNameMatcher {
|
|||
commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3, true) {
|
||||
|
||||
@Override
|
||||
protected CollationKey[] split(String sequence) {
|
||||
public CollationKey[] split(String sequence) {
|
||||
return super.split(normalize(sequence));
|
||||
}
|
||||
};
|
||||
|
|
|
@ -43,6 +43,10 @@ public class Movie extends SearchResult {
|
|||
return tmdbId;
|
||||
}
|
||||
|
||||
public String getNameWithYear() {
|
||||
return toString(name, year);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getEffectiveNames() {
|
||||
if (aliasNames == null || aliasNames.length == 0) {
|
||||
|
|
Loading…
Reference in New Issue