* high-performance locale-aware common-sequence-matching via CollationKey

* boost series name detection speed
2012-02-22 17:15:23 +00:00 · 2012-02-22 17:15:23 +00:00 · 0f2468fc5f
commit 0f2468fc5f
parent b2681508ef
7 changed files with 210 additions and 111 deletions
--- a/source/net/sourceforge/filebot/cli/CmdlineOperations.java
+++ b/source/net/sourceforge/filebot/cli/CmdlineOperations.java
@ -102,7 +102,7 @@ public class CmdlineOperations implements CmdlineInterface {
 		int cws = 0; // common word sequence
 		double max = mediaFiles.size();
 		
-		SeriesNameMatcher nameMatcher = new SeriesNameMatcher(getLenientCollator(locale));
+		SeriesNameMatcher nameMatcher = new SeriesNameMatcher(locale);
 		Collection<String> cwsList = emptySet();
 		if (max >= 5) {
 			cwsList = nameMatcher.matchAll(mediaFiles.toArray(new File[0]));
--- a/source/net/sourceforge/filebot/media/MediaDetection.java
+++ b/source/net/sourceforge/filebot/media/MediaDetection.java
@ -4,6 +4,7 @@ package net.sourceforge.filebot.media;

 import static java.util.Collections.*;
 import static net.sourceforge.filebot.MediaTypes.*;
+import static net.sourceforge.filebot.similarity.CommonSequenceMatcher.*;
 import static net.sourceforge.filebot.similarity.Normalization.*;
 import static net.sourceforge.tuned.FileUtilities.*;

@ -11,6 +12,7 @@ import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.text.CollationKey;
 import java.text.Collator;
 import java.util.ArrayList;
 import java.util.Collection;
@ -34,6 +36,7 @@ import java.util.regex.Pattern;

 import net.sourceforge.filebot.MediaTypes;
 import net.sourceforge.filebot.WebServices;
+import net.sourceforge.filebot.similarity.CommonSequenceMatcher;
 import net.sourceforge.filebot.similarity.NameSimilarityMetric;
 import net.sourceforge.filebot.similarity.SeriesNameMatcher;
 import net.sourceforge.filebot.similarity.SimilarityComparator;
@ -144,22 +147,15 @@ public class MediaDetection {
 		
 		// cross-reference known series names against file structure
 		try {
-			Set<String> folders = new LinkedHashSet<String>();
+			Set<String> filenames = new LinkedHashSet<String>();
 			for (File f : files) {
 				for (int i = 0; i < 3 && f != null; i++, f = f.getParentFile()) {
-					if (i != 0) {
-						folders.add(f.getName());
-					}
+					filenames.add(f.getName());
 				}
 			}
 			
-			// match know name from filename if there is not enough context for CWS matching
-			if (files.size() == 1) {
-				folders.add(files.iterator().next().getName());
-			}
-			
 			// match folder names against known series names
-			for (TheTVDBSearchResult match : matchSeriesByName(folders.toArray(new String[0]))) {
+			for (TheTVDBSearchResult match : matchSeriesByName(filenames.toArray(new String[0]))) {
 				names.put(match.getName().toLowerCase(), match.getName());
 			}
 		} catch (Exception e) {
@ -167,8 +163,7 @@ public class MediaDetection {
 		}
 		
 		// match common word sequence and clean detected word sequence from unwanted elements
-		SeriesNameMatcher matcher = new SeriesNameMatcher(getLenientCollator(locale));
-		Collection<String> matches = matcher.matchAll(files.toArray(new File[files.size()]));
+		Collection<String> matches = new SeriesNameMatcher(locale).matchAll(files.toArray(new File[files.size()]));
 		try {
 			matches = stripReleaseInfo(matches, true);
 		} catch (Exception e) {
@ -182,16 +177,27 @@ public class MediaDetection {
 	}
 	
 	
-	public static Collection<TheTVDBSearchResult> matchSeriesByName(String... names) throws Exception {
+	private static final HashMap<TheTVDBSearchResult, String> seriesNameIndex = new HashMap<TheTVDBSearchResult, String>(32768);
+	
+	
+	public static List<TheTVDBSearchResult> matchSeriesByName(String... names) throws Exception {
 		final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
 		final Map<TheTVDBSearchResult, String> matchMap = new HashMap<TheTVDBSearchResult, String>();
 		
-		for (final TheTVDBSearchResult entry : releaseInfo.getSeriesList()) {
+		synchronized (seriesNameIndex) {
+			if (seriesNameIndex.isEmpty()) {
+				for (TheTVDBSearchResult entry : releaseInfo.getSeriesList()) {
+					seriesNameIndex.put(entry, nameMatcher.normalize(entry.getName()));
+				}
+			}
+		}
+		
+		for (Entry<TheTVDBSearchResult, String> it : seriesNameIndex.entrySet()) {
 			for (String name : names) {
-				String identifier = nameMatcher.normalize(entry.getName());
-				String commonName = nameMatcher.matchByFirstCommonWordSequence(name, identifier);
+				String identifier = it.getValue();
+				String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
 				if (commonName != null && commonName.length() >= identifier.length()) {
-					matchMap.put(entry, commonName);
+					matchMap.put(it.getKey(), commonName);
 				}
 			}
 		}
@ -215,13 +221,13 @@ public class MediaDetection {
 		final Map<AnidbSearchResult, String> matchMap = new HashMap<AnidbSearchResult, String>();
 		
 		for (final AnidbSearchResult entry : WebServices.AniDB.getAnimeTitles()) {
-			for (String name : names) {
-				for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) {
-					if (identifier == null || identifier.isEmpty())
-						continue;
-					
-					identifier = nameMatcher.normalize(entry.getName());
-					String commonName = nameMatcher.matchByFirstCommonWordSequence(name, identifier);
+			for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) {
+				if (identifier == null || identifier.isEmpty())
+					continue;
+				
+				identifier = nameMatcher.normalize(identifier);
+				for (String name : names) {
+					String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
 					if (commonName != null && commonName.length() >= identifier.length()) {
 						matchMap.put(entry, commonName);
 					}
@ -302,10 +308,10 @@ public class MediaDetection {
 		for (final Movie movie : releaseInfo.getMovieList()) {
 			for (String name : files) {
 				String movieIdentifier = movie.getName();
-				String commonName = nameMatcher.matchByFirstCommonWordSequence(name, movieIdentifier);
+				String commonName = nameMatcher.matchFirstCommonSequence(name, movieIdentifier);
 				if (commonName != null && commonName.length() >= movieIdentifier.length()) {
 					String strictMovieIdentifier = movie.getName() + " " + movie.getYear();
-					String strictCommonName = nameMatcher.matchByFirstCommonWordSequence(name, strictMovieIdentifier);
+					String strictCommonName = nameMatcher.matchFirstCommonSequence(name, strictMovieIdentifier);
 					if (strictCommonName != null && strictCommonName.length() >= strictMovieIdentifier.length()) {
 						// prefer strict match
 						matchMap.put(movie, strictCommonName);
@ -453,38 +459,34 @@ public class MediaDetection {
 	}
 	
 	
-	@SuppressWarnings("unchecked")
-	public static Comparator<String> getLenientCollator(Locale locale) {
-		// use maximum strength collator by default
-		final Collator collator = Collator.getInstance(locale);
-		collator.setDecomposition(Collator.FULL_DECOMPOSITION);
-		collator.setStrength(Collator.PRIMARY);
-		
-		return (Comparator) collator;
-	}
-	
-	
 	/*
 	 * Heavy-duty name matcher used for matching a file to one or more movies (out of a list of ~50k)
 	 */
-	private static class HighPerformanceMatcher extends SeriesNameMatcher {
+	private static class HighPerformanceMatcher extends CommonSequenceMatcher {
 		
-		private static final Map<String, String> transformCache = synchronizedMap(new WeakHashMap<String, String>(65536));
+		private static final Collator collator = getLenientCollator(Locale.ENGLISH);
+		
+		private static final Map<String, CollationKey[]> transformCache = synchronizedMap(new WeakHashMap<String, CollationKey[]>(65536));
 		
 		
 		public HighPerformanceMatcher(int commonWordSequenceMaxStartIndex) {
-			super(String.CASE_INSENSITIVE_ORDER, commonWordSequenceMaxStartIndex); // 3-4x faster than a Collator 
+			super(collator, commonWordSequenceMaxStartIndex);
 		}
 		
 		
 		@Override
-		protected String normalize(String source) {
-			String value = transformCache.get(source);
+		protected CollationKey[] split(String sequence) {
+			CollationKey[] value = transformCache.get(sequence);
 			if (value == null) {
-				value = normalizePunctuation(source); // only normalize punctuation, make sure we keep the year (important for movie matching)
-				transformCache.put(source, value);
+				value = super.split(normalize(sequence));
+				transformCache.put(sequence, value);
 			}
-			return transformCache.get(source);
+			return value;
+		}
+		
+		
+		public String normalize(String sequence) {
+			return normalizePunctuation(sequence).toLowerCase(); // only normalize punctuation, make sure we keep the year (important for movie matching)
 		}
 	}
 	
--- a/source/net/sourceforge/filebot/similarity/CommonSequenceMatcher.java
+++ b/source/net/sourceforge/filebot/similarity/CommonSequenceMatcher.java
@ -0,0 +1,125 @@
+
+package net.sourceforge.filebot.similarity;
+
+
+import static java.util.Arrays.*;
+import static java.util.Collections.*;
+
+import java.text.CollationKey;
+import java.text.Collator;
+import java.util.Locale;
+import java.util.Map;
+import java.util.WeakHashMap;
+
+
+public class CommonSequenceMatcher {
+	
+	public static Collator getLenientCollator(Locale locale) {
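+		// use a lenient collator: full decomposition with PRIMARY strength, so only primary differences (typically base letters, not case or accents) are significant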
+		Collator collator = Collator.getInstance(locale);
+		collator.setDecomposition(Collator.FULL_DECOMPOSITION);
+		collator.setStrength(Collator.PRIMARY);
+		return collator;
+	}
+	
+	
+	protected final Collator collator;
+	protected final int commonSequenceMaxStartIndex;
+	
+	
+	public CommonSequenceMatcher(Collator collator, int commonSequenceMaxStartIndex) {
+		this.collator = collator;
+		this.commonSequenceMaxStartIndex = commonSequenceMaxStartIndex;
+	}
+	
+	
+	public Collator getCollator() {
+		return collator;
+	}
+	
+	
+	public String matchFirstCommonSequence(String... names) {
+		CollationKey[] common = null;
+		
+		for (String it : names) {
+			CollationKey[] words = split(it);
+			
+			if (common == null) {
+				// initialize common with current word array
+				common = words;
+			} else {
+				// find common sequence
+				common = firstCommonSequence(common, words, commonSequenceMaxStartIndex);
+				
+				if (common == null) {
+					// no common sequence
+					return null;
+				}
+			}
+		}
+		
+		if (common == null)
+			return null;
+		
+		return synth(common);
+	}
+	
+	
+	protected String synth(CollationKey[] keys) {
+		StringBuilder sb = new StringBuilder();
+		for (CollationKey it : keys) {
+			if (sb.length() > 0) {
+				sb.append(' ');
+			}
+			sb.append(it.getSourceString());
+		}
+		return sb.toString();
+	}
+	
+	
+	protected CollationKey[] split(String sequence) {
+		return getCollationKeys(sequence.split("\\s+"));
+	}
+	
+	
+	private final Map<String, CollationKey> collationKeyDictionary = synchronizedMap(new WeakHashMap<String, CollationKey>(256));
+	
+	
+	protected CollationKey[] getCollationKeys(String[] words) {
+		CollationKey[] keys = new CollationKey[words.length];
+		for (int i = 0; i < keys.length; i++) {
+			keys[i] = collationKeyDictionary.get(words[i]);
+			if (keys[i] == null) {
+				keys[i] = collator.getCollationKey(words[i]);
+				collationKeyDictionary.put(words[i], keys[i]);
+			}
+		}
+		return keys;
+	}
+	
+	
+	protected <E extends Comparable<E>> E[] firstCommonSequence(E[] seq1, E[] seq2, int maxStartIndex) {
+		for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
+			for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
+				// common sequence length
+				int len = 0;
+				
+				// iterate over common sequence
+				while ((i + len < seq1.length) && (j + len < seq2.length) && (seq1[i + len].compareTo(seq2[j + len]) == 0)) {
+					len++;
+				}
+				
+				// check if a common sequence was found
+				if (len > 0) {
+					if (i == 0 && len == seq1.length)
+						return seq1;
+					
+					return copyOfRange(seq1, i, i + len);
+				}
+			}
+		}
+		
+		// no intersection at all
+		return null;
+	}
+}
--- a/source/net/sourceforge/filebot/similarity/Normalization.java
+++ b/source/net/sourceforge/filebot/similarity/Normalization.java
@ -6,7 +6,7 @@ public class Normalization {
 	
 	public static String normalizePunctuation(String name) {
 		// remove/normalize special characters
-		name = name.replaceAll("['`´]+", "");
+		name = name.replaceAll("[`´‘’ʻ]+", "");
 		name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
 		
 		return name.trim();
--- a/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java
+++ b/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java
@ -3,15 +3,17 @@ package net.sourceforge.filebot.similarity;


 import static java.lang.Math.*;
+import static net.sourceforge.filebot.similarity.CommonSequenceMatcher.*;
 import static net.sourceforge.filebot.similarity.Normalization.*;

-import java.text.Collator;
-import java.util.Comparator;
 import java.util.Locale;


 public class SequenceMatchSimilarity implements SimilarityMetric {
 	
+	private final CommonSequenceMatcher commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(Locale.ROOT), 10);
+	
+	
 	@Override
 	public float getSimilarity(Object o1, Object o2) {
 		String s1 = normalize(o1);
@ -39,20 +41,7 @@ public class SequenceMatchSimilarity implements SimilarityMetric {
 	
 	
 	protected String match(String s1, String s2) {
-		// use maximum strength collator by default
-		Collator collator = Collator.getInstance(Locale.ROOT);
-		collator.setDecomposition(Collator.FULL_DECOMPOSITION);
-		collator.setStrength(Collator.TERTIARY);
-		
-		@SuppressWarnings("unchecked")
-		SeriesNameMatcher matcher = new SeriesNameMatcher((Comparator) collator, 10) {
-			
-			@Override
-			protected String normalize(String name) {
-				return name; // assume normalization has been done, no need to do that here again
-			};
-		};
-		
-		return matcher.matchByFirstCommonWordSequence(s1, s2);
+		return commonSequenceMatcher.matchFirstCommonSequence(s1, s2);
 	}
+	
 }
--- a/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java
+++ b/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java
@ -4,10 +4,12 @@ package net.sourceforge.filebot.similarity;

 import static java.util.Collections.*;
 import static java.util.regex.Pattern.*;
+import static net.sourceforge.filebot.similarity.CommonSequenceMatcher.*;
 import static net.sourceforge.filebot.similarity.Normalization.*;
 import static net.sourceforge.tuned.StringUtilities.*;

 import java.io.File;
+import java.text.CollationKey;
 import java.util.AbstractCollection;
 import java.util.ArrayList;
 import java.util.Arrays;
@ -16,6 +18,7 @@ import java.util.Comparator;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Scanner;
@ -31,25 +34,25 @@ public class SeriesNameMatcher {
 	
 	protected SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(SeasonEpisodeMatcher.DEFAULT_SANITY, true);
 	protected DateMatcher dateMatcher = new DateMatcher();
+	
 	protected NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric();
 	
-	protected int commonWordSequenceMaxStartIndex;
-	protected Comparator<String> commonWordComparator;
+	protected CommonSequenceMatcher commonSequenceMatcher;
 	
 	
 	public SeriesNameMatcher() {
-		this(String.CASE_INSENSITIVE_ORDER, 3);
+		this(Locale.ROOT);
 	}
 	
 	
-	public SeriesNameMatcher(Comparator<String> comparator) {
-		this(comparator, 3);
-	}
-	
-	
-	public SeriesNameMatcher(Comparator<String> commonWordComparator, int commonWordSequenceMaxStartIndex) {
-		this.commonWordSequenceMaxStartIndex = commonWordSequenceMaxStartIndex;
-		this.commonWordComparator = commonWordComparator;
+	public SeriesNameMatcher(Locale locale) {
+		commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3) {
+			
+			@Override
+			protected CollationKey[] split(String sequence) {
+				return super.split(normalize(sequence));
+			}
+		};
 	}
 	
 	
@ -62,7 +65,7 @@ public class SeriesNameMatcher {
 			String[] names = entry.getValue();
 			
 			for (String nameMatch : matchAll(names)) {
-				String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent);
+				String commonMatch = commonSequenceMatcher.matchFirstCommonSequence(nameMatch, parent);
 				float similarity = commonMatch == null ? 0 : nameSimilarityMetric.getSimilarity(commonMatch, nameMatch);
 				
 				// prefer common match, but only if it's very similar to the original match
@ -116,7 +119,9 @@ public class SeriesNameMatcher {
 	 *         threshold
 	 */
 	private Collection<String> flatMatchAll(String[] names, Pattern prefixPattern, int threshold, boolean strict) {
-		ThresholdCollection<String> thresholdCollection = new ThresholdCollection<String>(threshold, commonWordComparator);
+		@SuppressWarnings("unchecked")
+		Comparator<String> wordComparator = (Comparator) commonSequenceMatcher.getCollator();
+		ThresholdCollection<String> thresholdCollection = new ThresholdCollection<String>(threshold, wordComparator);
 		
 		for (String name : names) {
 			// use normalized name
@ -163,7 +168,7 @@ public class SeriesNameMatcher {
 			return emptySet();
 		}
 		
-		String common = matchByFirstCommonWordSequence(names);
+		String common = commonSequenceMatcher.matchFirstCommonSequence(names);
 		
 		if (common != null) {
 			// common word sequence found
@ -218,29 +223,7 @@ public class SeriesNameMatcher {
 			throw new IllegalArgumentException("Can't match common sequence from less than two names");
 		}
 		
-		String[] common = null;
-		
-		for (String name : names) {
-			String[] words = normalize(name).split("\\s+");
-			
-			if (common == null) {
-				// initialize common with current word array
-				common = words;
-			} else {
-				// find common sequence
-				common = firstCommonSequence(common, words, commonWordSequenceMaxStartIndex, commonWordComparator);
-				
-				if (common == null) {
-					// no common sequence
-					return null;
-				}
-			}
-		}
-		
-		if (common == null)
-			return null;
-		
-		return join(common, " ");
+		return commonSequenceMatcher.matchFirstCommonSequence(names);
 	}
 	
 	
--- a/source/net/sourceforge/filebot/ui/rename/EpisodeListMatcher.java
+++ b/source/net/sourceforge/filebot/ui/rename/EpisodeListMatcher.java
@ -180,13 +180,14 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
 		
 		// detect series names and create episode list fetch tasks
 		for (Entry<Set<File>, Set<String>> sameSeriesGroup : mapSeriesNamesByFiles(mediaFiles, locale).entrySet()) {
-			List<List<File>> batchSets = new ArrayList<List<File>>();
+			final List<List<File>> batchSets = new ArrayList<List<File>>();
+			final Collection<String> queries = sameSeriesGroup.getValue();
 			
-			if (sameSeriesGroup.getValue() != null && sameSeriesGroup.getValue().size() > 0) {
-				// handle series name batch set all at once
+			if (queries != null && queries.size() > 0) {
+				// handle series name batch set all at once -> only 1 batch set
 				batchSets.add(new ArrayList<File>(sameSeriesGroup.getKey()));
 			} else {
-				// these files don't seem to belong to any series -> handle folder per folder
+				// these files don't seem to belong to any series -> handle folder per folder -> multiple batch sets
 				batchSets.addAll(mapByFolder(sameSeriesGroup.getKey()).values());
 			}
 			
@ -195,7 +196,7 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
 					
 					@Override
 					public List<Match<File, ?>> call() throws Exception {
-						return matchEpisodeSet(batchSet, sortOrder, locale, autodetection, parent);
+						return matchEpisodeSet(batchSet, queries, sortOrder, locale, autodetection, parent);
 					}
 				});
 			}
@ -246,23 +247,22 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
 	}
 	
 	
-	public List<Match<File, ?>> matchEpisodeSet(final List<File> files, SortOrder sortOrder, Locale locale, boolean autodetection, Component parent) throws Exception {
+	public List<Match<File, ?>> matchEpisodeSet(final List<File> files, Collection<String> queries, SortOrder sortOrder, Locale locale, boolean autodetection, Component parent) throws Exception {
 		Set<Episode> episodes = emptySet();
 		
 		// detect series name and fetch episode list
 		if (autodetection) {
-			Collection<String> names = detectSeriesNames(files, locale);
-			if (names.size() > 0) {
+			if (queries != null && queries.size() > 0) {
 				// only allow one fetch session at a time so later requests can make use of cached results
 				synchronized (providerLock) {
-					episodes = fetchEpisodeSet(names, sortOrder, locale, parent);
+					episodes = fetchEpisodeSet(queries, sortOrder, locale, parent);
 				}
 			}
 		}
 		
 		// require user input if auto-detection has failed or has been disabled 
 		if (episodes.isEmpty()) {
-			String suggestion = new SeriesNameMatcher().matchByEpisodeIdentifier(getName(files.get(0)));
+			String suggestion = new SeriesNameMatcher(locale).matchByEpisodeIdentifier(getName(files.get(0)));
 			if (suggestion != null) {
 				// clean media info / release group info / etc 
 				suggestion = stripReleaseInfo(suggestion);