* high-performance locale-aware common-sequence-matching via CollationKey

* boost series name detection speed
This commit is contained in:
Reinhard Pointner 2012-02-22 17:15:23 +00:00
parent b2681508ef
commit 0f2468fc5f
7 changed files with 210 additions and 111 deletions

View File

@ -102,7 +102,7 @@ public class CmdlineOperations implements CmdlineInterface {
int cws = 0; // common word sequence
double max = mediaFiles.size();
SeriesNameMatcher nameMatcher = new SeriesNameMatcher(getLenientCollator(locale));
SeriesNameMatcher nameMatcher = new SeriesNameMatcher(locale);
Collection<String> cwsList = emptySet();
if (max >= 5) {
cwsList = nameMatcher.matchAll(mediaFiles.toArray(new File[0]));

View File

@ -4,6 +4,7 @@ package net.sourceforge.filebot.media;
import static java.util.Collections.*;
import static net.sourceforge.filebot.MediaTypes.*;
import static net.sourceforge.filebot.similarity.CommonSequenceMatcher.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.FileUtilities.*;
@ -11,6 +12,7 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.CollationKey;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Collection;
@ -34,6 +36,7 @@ import java.util.regex.Pattern;
import net.sourceforge.filebot.MediaTypes;
import net.sourceforge.filebot.WebServices;
import net.sourceforge.filebot.similarity.CommonSequenceMatcher;
import net.sourceforge.filebot.similarity.NameSimilarityMetric;
import net.sourceforge.filebot.similarity.SeriesNameMatcher;
import net.sourceforge.filebot.similarity.SimilarityComparator;
@ -144,22 +147,15 @@ public class MediaDetection {
// cross-reference known series names against file structure
try {
Set<String> folders = new LinkedHashSet<String>();
Set<String> filenames = new LinkedHashSet<String>();
for (File f : files) {
for (int i = 0; i < 3 && f != null; i++, f = f.getParentFile()) {
if (i != 0) {
folders.add(f.getName());
}
filenames.add(f.getName());
}
}
// match known names from the filename if there is not enough context for CWS matching
if (files.size() == 1) {
folders.add(files.iterator().next().getName());
}
// match folder names against known series names
for (TheTVDBSearchResult match : matchSeriesByName(folders.toArray(new String[0]))) {
for (TheTVDBSearchResult match : matchSeriesByName(filenames.toArray(new String[0]))) {
names.put(match.getName().toLowerCase(), match.getName());
}
} catch (Exception e) {
@ -167,8 +163,7 @@ public class MediaDetection {
}
// match common word sequence and clean detected word sequence from unwanted elements
SeriesNameMatcher matcher = new SeriesNameMatcher(getLenientCollator(locale));
Collection<String> matches = matcher.matchAll(files.toArray(new File[files.size()]));
Collection<String> matches = new SeriesNameMatcher(locale).matchAll(files.toArray(new File[files.size()]));
try {
matches = stripReleaseInfo(matches, true);
} catch (Exception e) {
@ -182,16 +177,27 @@ public class MediaDetection {
}
public static Collection<TheTVDBSearchResult> matchSeriesByName(String... names) throws Exception {
private static final HashMap<TheTVDBSearchResult, String> seriesNameIndex = new HashMap<TheTVDBSearchResult, String>(32768);
public static List<TheTVDBSearchResult> matchSeriesByName(String... names) throws Exception {
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
final Map<TheTVDBSearchResult, String> matchMap = new HashMap<TheTVDBSearchResult, String>();
for (final TheTVDBSearchResult entry : releaseInfo.getSeriesList()) {
synchronized (seriesNameIndex) {
if (seriesNameIndex.isEmpty()) {
for (TheTVDBSearchResult entry : releaseInfo.getSeriesList()) {
seriesNameIndex.put(entry, nameMatcher.normalize(entry.getName()));
}
}
}
for (Entry<TheTVDBSearchResult, String> it : seriesNameIndex.entrySet()) {
for (String name : names) {
String identifier = nameMatcher.normalize(entry.getName());
String commonName = nameMatcher.matchByFirstCommonWordSequence(name, identifier);
String identifier = it.getValue();
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
if (commonName != null && commonName.length() >= identifier.length()) {
matchMap.put(entry, commonName);
matchMap.put(it.getKey(), commonName);
}
}
}
@ -215,13 +221,13 @@ public class MediaDetection {
final Map<AnidbSearchResult, String> matchMap = new HashMap<AnidbSearchResult, String>();
for (final AnidbSearchResult entry : WebServices.AniDB.getAnimeTitles()) {
for (String name : names) {
for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) {
if (identifier == null || identifier.isEmpty())
continue;
identifier = nameMatcher.normalize(entry.getName());
String commonName = nameMatcher.matchByFirstCommonWordSequence(name, identifier);
for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) {
if (identifier == null || identifier.isEmpty())
continue;
identifier = nameMatcher.normalize(identifier);
for (String name : names) {
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
if (commonName != null && commonName.length() >= identifier.length()) {
matchMap.put(entry, commonName);
}
@ -302,10 +308,10 @@ public class MediaDetection {
for (final Movie movie : releaseInfo.getMovieList()) {
for (String name : files) {
String movieIdentifier = movie.getName();
String commonName = nameMatcher.matchByFirstCommonWordSequence(name, movieIdentifier);
String commonName = nameMatcher.matchFirstCommonSequence(name, movieIdentifier);
if (commonName != null && commonName.length() >= movieIdentifier.length()) {
String strictMovieIdentifier = movie.getName() + " " + movie.getYear();
String strictCommonName = nameMatcher.matchByFirstCommonWordSequence(name, strictMovieIdentifier);
String strictCommonName = nameMatcher.matchFirstCommonSequence(name, strictMovieIdentifier);
if (strictCommonName != null && strictCommonName.length() >= strictMovieIdentifier.length()) {
// prefer strict match
matchMap.put(movie, strictCommonName);
@ -453,38 +459,34 @@ public class MediaDetection {
}
@SuppressWarnings("unchecked")
public static Comparator<String> getLenientCollator(Locale locale) {
// use maximum strength collator by default
final Collator collator = Collator.getInstance(locale);
collator.setDecomposition(Collator.FULL_DECOMPOSITION);
collator.setStrength(Collator.PRIMARY);
return (Comparator) collator;
}
/*
* Heavy-duty name matcher used for matching a file to one or more movies (out of a list of ~50k)
*/
private static class HighPerformanceMatcher extends SeriesNameMatcher {
private static class HighPerformanceMatcher extends CommonSequenceMatcher {
private static final Map<String, String> transformCache = synchronizedMap(new WeakHashMap<String, String>(65536));
private static final Collator collator = getLenientCollator(Locale.ENGLISH);
private static final Map<String, CollationKey[]> transformCache = synchronizedMap(new WeakHashMap<String, CollationKey[]>(65536));
public HighPerformanceMatcher(int commonWordSequenceMaxStartIndex) {
super(String.CASE_INSENSITIVE_ORDER, commonWordSequenceMaxStartIndex); // 3-4x faster than a Collator
super(collator, commonWordSequenceMaxStartIndex);
}
@Override
protected String normalize(String source) {
String value = transformCache.get(source);
protected CollationKey[] split(String sequence) {
CollationKey[] value = transformCache.get(sequence);
if (value == null) {
value = normalizePunctuation(source); // only normalize punctuation, make sure we keep the year (important for movie matching)
transformCache.put(source, value);
value = super.split(normalize(sequence));
transformCache.put(sequence, value);
}
return transformCache.get(source);
return value;
}
public String normalize(String sequence) {
return normalizePunctuation(sequence).toLowerCase(); // only normalize punctuation, make sure we keep the year (important for movie matching)
}
}

View File

@ -0,0 +1,125 @@
package net.sourceforge.filebot.similarity;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import java.text.CollationKey;
import java.text.Collator;
import java.util.Locale;
import java.util.Map;
import java.util.WeakHashMap;
/**
 * Finds the first sequence of whitespace-separated words that a set of names
 * have in common, comparing words by their {@link CollationKey} so that the
 * comparison follows the rules of the configured {@link Collator}.
 */
public class CommonSequenceMatcher {

	/**
	 * Create a lenient collator for the given locale.
	 * 
	 * The collator uses {@link Collator#PRIMARY} strength (the weakest, most
	 * forgiving strength, which typically ignores case and accent differences)
	 * together with {@link Collator#FULL_DECOMPOSITION}.
	 * 
	 * @param locale locale whose collation rules should be used
	 * @return a newly configured collator
	 */
	public static Collator getLenientCollator(Locale locale) {
		Collator collator = Collator.getInstance(locale);
		collator.setDecomposition(Collator.FULL_DECOMPOSITION);
		collator.setStrength(Collator.PRIMARY);
		return collator;
	}

	/** Collator used to create the collation keys that words are compared by. */
	protected final Collator collator;

	/** Highest word index (inclusive) at which a common sequence may start. */
	protected final int commonSequenceMaxStartIndex;

	/**
	 * @param collator collator used to compare words
	 * @param commonSequenceMaxStartIndex highest word index (inclusive), in either
	 *            name, at which a common sequence is allowed to start
	 */
	public CommonSequenceMatcher(Collator collator, int commonSequenceMaxStartIndex) {
		this.collator = collator;
		this.commonSequenceMaxStartIndex = commonSequenceMaxStartIndex;
	}

	/**
	 * @return the collator used by this matcher
	 */
	public Collator getCollator() {
		return collator;
	}

	/**
	 * Reduce the given names to the first word sequence they all share.
	 * 
	 * The first name is used as the initial candidate; each following name
	 * narrows the candidate down to the first sequence it has in common with it.
	 * 
	 * @param names names to compare
	 * @return the common words (as spelled in the first name) joined by single
	 *         spaces, or <code>null</code> if no names were given or the names have
	 *         no common sequence
	 */
	public String matchFirstCommonSequence(String... names) {
		CollationKey[] common = null;

		for (String it : names) {
			CollationKey[] words = split(it);

			if (common == null) {
				// initialize common with current word array
				common = words;
			} else {
				// narrow down to the sequence shared with the current name
				common = firstCommonSequence(common, words, commonSequenceMaxStartIndex);

				if (common == null) {
					// no common sequence
					return null;
				}
			}
		}

		if (common == null)
			return null;

		return synth(common);
	}

	/**
	 * Join the source strings of the given keys with single spaces.
	 */
	protected String synth(CollationKey[] keys) {
		StringBuilder sb = new StringBuilder();
		for (CollationKey it : keys) {
			if (sb.length() > 0) {
				sb.append(' ');
			}
			sb.append(it.getSourceString());
		}
		return sb.toString();
	}

	/**
	 * Split the given sequence into words and map each word to its collation key.
	 * 
	 * Surrounding whitespace is ignored, so it never produces an empty word. A
	 * plain <code>split("\\s+")</code> would yield a leading empty token for
	 * <code>" foo"</code> and a single empty token for <code>""</code>, which
	 * shifts word indexes and lets blank names "match" each other.
	 * 
	 * @param sequence text to split
	 * @return one key per word, or an empty array if the sequence is blank
	 */
	protected CollationKey[] split(String sequence) {
		String trimmed = sequence.trim();
		if (trimmed.isEmpty()) {
			return new CollationKey[0];
		}
		return getCollationKeys(trimmed.split("\\s+"));
	}

	// Per-instance cache of word -> collation key.
	// NOTE(review): each CollationKey exposes its word via getSourceString(), so a value
	// presumably keeps its own key strongly reachable and entries are unlikely to be
	// evicted before this matcher itself becomes unreachable — confirm before relying
	// on weak-key eviction for long-lived instances.
	private final Map<String, CollationKey> collationKeyDictionary = synchronizedMap(new WeakHashMap<String, CollationKey>(256));

	/**
	 * Look up (or create and cache) the collation key for each of the given words.
	 * 
	 * @param words words to convert
	 * @return collation keys, in the same order as the given words
	 */
	protected CollationKey[] getCollationKeys(String[] words) {
		CollationKey[] keys = new CollationKey[words.length];
		for (int i = 0; i < keys.length; i++) {
			CollationKey key = collationKeyDictionary.get(words[i]);
			if (key == null) {
				key = collator.getCollationKey(words[i]);
				collationKeyDictionary.put(words[i], key);
			}
			keys[i] = key;
		}
		return keys;
	}

	/**
	 * Find the first run of elements that both sequences have in common.
	 * 
	 * Start positions are tried in order (outer loop over <code>seq1</code>, inner
	 * loop over <code>seq2</code>) and the first position pair with at least one
	 * equal element wins, extended for as long as the elements keep matching.
	 * 
	 * @param seq1 first sequence; the result is taken from this one
	 * @param seq2 second sequence
	 * @param maxStartIndex highest start index (inclusive) considered in either sequence
	 * @return <code>seq1</code> itself if it is matched entirely from index 0, a copy
	 *         of the matching range of <code>seq1</code> otherwise, or
	 *         <code>null</code> if there is no common element within the allowed
	 *         start positions
	 */
	protected <E extends Comparable<E>> E[] firstCommonSequence(E[] seq1, E[] seq2, int maxStartIndex) {
		for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
			for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
				// common sequence length
				int len = 0;

				// iterate over common sequence
				while ((i + len < seq1.length) && (j + len < seq2.length) && (seq1[i + len].compareTo(seq2[j + len]) == 0)) {
					len++;
				}

				// check if a common sequence was found
				if (len > 0) {
					// the whole of seq1 matched, no need to copy
					if (i == 0 && len == seq1.length)
						return seq1;

					return copyOfRange(seq1, i, i + len);
				}
			}
		}

		// no intersection at all
		return null;
	}

}

View File

@ -6,7 +6,7 @@ public class Normalization {
public static String normalizePunctuation(String name) {
// remove/normalize special characters
name = name.replaceAll("['`´]+", "");
name = name.replaceAll("[`´ʻ]+", "");
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
return name.trim();

View File

@ -3,15 +3,17 @@ package net.sourceforge.filebot.similarity;
import static java.lang.Math.*;
import static net.sourceforge.filebot.similarity.CommonSequenceMatcher.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import java.text.Collator;
import java.util.Comparator;
import java.util.Locale;
public class SequenceMatchSimilarity implements SimilarityMetric {
private final CommonSequenceMatcher commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(Locale.ROOT), 10);
@Override
public float getSimilarity(Object o1, Object o2) {
String s1 = normalize(o1);
@ -39,20 +41,7 @@ public class SequenceMatchSimilarity implements SimilarityMetric {
protected String match(String s1, String s2) {
// use maximum strength collator by default
Collator collator = Collator.getInstance(Locale.ROOT);
collator.setDecomposition(Collator.FULL_DECOMPOSITION);
collator.setStrength(Collator.TERTIARY);
@SuppressWarnings("unchecked")
SeriesNameMatcher matcher = new SeriesNameMatcher((Comparator) collator, 10) {
@Override
protected String normalize(String name) {
return name; // assume normalization has been done, no need to do that here again
};
};
return matcher.matchByFirstCommonWordSequence(s1, s2);
return commonSequenceMatcher.matchFirstCommonSequence(s1, s2);
}
}

View File

@ -4,10 +4,12 @@ package net.sourceforge.filebot.similarity;
import static java.util.Collections.*;
import static java.util.regex.Pattern.*;
import static net.sourceforge.filebot.similarity.CommonSequenceMatcher.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File;
import java.text.CollationKey;
import java.util.AbstractCollection;
import java.util.ArrayList;
import java.util.Arrays;
@ -16,6 +18,7 @@ import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
@ -31,25 +34,25 @@ public class SeriesNameMatcher {
protected SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(SeasonEpisodeMatcher.DEFAULT_SANITY, true);
protected DateMatcher dateMatcher = new DateMatcher();
protected NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric();
protected int commonWordSequenceMaxStartIndex;
protected Comparator<String> commonWordComparator;
protected CommonSequenceMatcher commonSequenceMatcher;
public SeriesNameMatcher() {
this(String.CASE_INSENSITIVE_ORDER, 3);
this(Locale.ROOT);
}
public SeriesNameMatcher(Comparator<String> comparator) {
this(comparator, 3);
}
public SeriesNameMatcher(Comparator<String> commonWordComparator, int commonWordSequenceMaxStartIndex) {
this.commonWordSequenceMaxStartIndex = commonWordSequenceMaxStartIndex;
this.commonWordComparator = commonWordComparator;
public SeriesNameMatcher(Locale locale) {
commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3) {
@Override
protected CollationKey[] split(String sequence) {
return super.split(normalize(sequence));
}
};
}
@ -62,7 +65,7 @@ public class SeriesNameMatcher {
String[] names = entry.getValue();
for (String nameMatch : matchAll(names)) {
String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent);
String commonMatch = commonSequenceMatcher.matchFirstCommonSequence(nameMatch, parent);
float similarity = commonMatch == null ? 0 : nameSimilarityMetric.getSimilarity(commonMatch, nameMatch);
// prefer common match, but only if it's very similar to the original match
@ -116,7 +119,9 @@ public class SeriesNameMatcher {
* threshold
*/
private Collection<String> flatMatchAll(String[] names, Pattern prefixPattern, int threshold, boolean strict) {
ThresholdCollection<String> thresholdCollection = new ThresholdCollection<String>(threshold, commonWordComparator);
@SuppressWarnings("unchecked")
Comparator<String> wordComparator = (Comparator) commonSequenceMatcher.getCollator();
ThresholdCollection<String> thresholdCollection = new ThresholdCollection<String>(threshold, wordComparator);
for (String name : names) {
// use normalized name
@ -163,7 +168,7 @@ public class SeriesNameMatcher {
return emptySet();
}
String common = matchByFirstCommonWordSequence(names);
String common = commonSequenceMatcher.matchFirstCommonSequence(names);
if (common != null) {
// common word sequence found
@ -218,29 +223,7 @@ public class SeriesNameMatcher {
throw new IllegalArgumentException("Can't match common sequence from less than two names");
}
String[] common = null;
for (String name : names) {
String[] words = normalize(name).split("\\s+");
if (common == null) {
// initialize common with current word array
common = words;
} else {
// find common sequence
common = firstCommonSequence(common, words, commonWordSequenceMaxStartIndex, commonWordComparator);
if (common == null) {
// no common sequence
return null;
}
}
}
if (common == null)
return null;
return join(common, " ");
return commonSequenceMatcher.matchFirstCommonSequence(names);
}

View File

@ -180,13 +180,14 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
// detect series names and create episode list fetch tasks
for (Entry<Set<File>, Set<String>> sameSeriesGroup : mapSeriesNamesByFiles(mediaFiles, locale).entrySet()) {
List<List<File>> batchSets = new ArrayList<List<File>>();
final List<List<File>> batchSets = new ArrayList<List<File>>();
final Collection<String> queries = sameSeriesGroup.getValue();
if (sameSeriesGroup.getValue() != null && sameSeriesGroup.getValue().size() > 0) {
// handle series name batch set all at once
if (queries != null && queries.size() > 0) {
// handle series name batch set all at once -> only 1 batch set
batchSets.add(new ArrayList<File>(sameSeriesGroup.getKey()));
} else {
// these files don't seem to belong to any series -> handle folder per folder
// these files don't seem to belong to any series -> handle folder per folder -> multiple batch sets
batchSets.addAll(mapByFolder(sameSeriesGroup.getKey()).values());
}
@ -195,7 +196,7 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
@Override
public List<Match<File, ?>> call() throws Exception {
return matchEpisodeSet(batchSet, sortOrder, locale, autodetection, parent);
return matchEpisodeSet(batchSet, queries, sortOrder, locale, autodetection, parent);
}
});
}
@ -246,23 +247,22 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
}
public List<Match<File, ?>> matchEpisodeSet(final List<File> files, SortOrder sortOrder, Locale locale, boolean autodetection, Component parent) throws Exception {
public List<Match<File, ?>> matchEpisodeSet(final List<File> files, Collection<String> queries, SortOrder sortOrder, Locale locale, boolean autodetection, Component parent) throws Exception {
Set<Episode> episodes = emptySet();
// detect series name and fetch episode list
if (autodetection) {
Collection<String> names = detectSeriesNames(files, locale);
if (names.size() > 0) {
if (queries != null && queries.size() > 0) {
// only allow one fetch session at a time so later requests can make use of cached results
synchronized (providerLock) {
episodes = fetchEpisodeSet(names, sortOrder, locale, parent);
episodes = fetchEpisodeSet(queries, sortOrder, locale, parent);
}
}
}
// require user input if auto-detection has failed or has been disabled
if (episodes.isEmpty()) {
String suggestion = new SeriesNameMatcher().matchByEpisodeIdentifier(getName(files.get(0)));
String suggestion = new SeriesNameMatcher(locale).matchByEpisodeIdentifier(getName(files.get(0)));
if (suggestion != null) {
// clean media info / release group info / etc
suggestion = stripReleaseInfo(suggestion);