diff --git a/source/net/sourceforge/filebot/media/MediaDetection.java b/source/net/sourceforge/filebot/media/MediaDetection.java index 8947c428..d288e7f9 100644 --- a/source/net/sourceforge/filebot/media/MediaDetection.java +++ b/source/net/sourceforge/filebot/media/MediaDetection.java @@ -11,11 +11,11 @@ import static net.sourceforge.tuned.FileUtilities.*; import java.io.File; import java.io.FileFilter; import java.io.IOException; +import java.io.Serializable; import java.net.MalformedURLException; import java.net.URL; import java.text.CollationKey; import java.text.Collator; -import java.util.AbstractMap.SimpleEntry; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; @@ -400,43 +400,45 @@ public class MediaDetection { return matches; } - private static List> seriesIndex = new ArrayList>(75000); + private static final List> seriesIndex = new ArrayList>(100000); - public static synchronized List> getSeriesIndex() throws IOException { - if (seriesIndex.isEmpty()) { - try { - for (SearchResult[] index : new SearchResult[][] { releaseInfo.getTheTVDBIndex(), releaseInfo.getAnidbIndex() }) { - for (SearchResult item : index) { - for (String name : item.getEffectiveNames()) { - seriesIndex.add(new SimpleEntry(normalizePunctuation(name).toLowerCase(), item)); + public static List> getSeriesIndex() throws IOException { + synchronized (seriesIndex) { + if (seriesIndex.isEmpty()) { + try { + for (SearchResult[] index : new SearchResult[][] { releaseInfo.getTheTVDBIndex(), releaseInfo.getAnidbIndex() }) { + for (SearchResult it : index) { + seriesIndex.addAll(HighPerformanceMatcher.prepare(it)); } } - } - } catch (Exception e) { - // can't load movie index, just try again next time - Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load series index: " + e.getMessage(), e); - return emptyList(); - } - } + } catch (Exception e) { + // can't load series index, just try again next time + 
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load series index: " + e.getMessage(), e); - return seriesIndex; + // rely on online search + return emptyList(); + } + } + return seriesIndex; + } } - public static List matchSeriesByName(Collection names, int maxStartIndex) throws Exception { + public static List matchSeriesByName(Collection files, int maxStartIndex) throws Exception { HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex); List matches = new ArrayList(); - for (String name : names) { - String bestMatch = ""; - for (Entry it : getSeriesIndex()) { - String identifier = it.getKey(); - String commonName = nameMatcher.matchFirstCommonSequence(name, identifier); - if (commonName != null && commonName.length() >= identifier.length() && commonName.length() > bestMatch.length()) { - bestMatch = commonName; + List names = HighPerformanceMatcher.prepare(files); + + for (CollationKey[] name : names) { + IndexEntry bestMatch = null; + for (IndexEntry it : getSeriesIndex()) { + CollationKey[] commonName = nameMatcher.matchFirstCommonSequence(name, it.lenientKey); + if (commonName != null && commonName.length >= it.lenientKey.length && (bestMatch == null || commonName.length > bestMatch.lenientKey.length)) { + bestMatch = it; } } - if (bestMatch.length() > 0) { - matches.add(bestMatch); + if (bestMatch != null) { + matches.add(bestMatch.lenientName); } } @@ -469,12 +471,12 @@ public class MediaDetection { float similarityThreshold = strict ? 
0.75f : 0.5f; List seriesList = new ArrayList(); - for (Entry it : getSeriesIndex()) { - String name = spacing.matcher(it.getKey()).replaceAll("").toLowerCase(); + for (IndexEntry it : getSeriesIndex()) { + String name = spacing.matcher(it.lenientName).replaceAll("").toLowerCase(); for (String term : terms) { if (term.contains(name)) { if (metric.getSimilarity(term, name) >= similarityThreshold) { - seriesList.add(it.getValue()); + seriesList.add(it.object); } break; } @@ -753,24 +755,25 @@ public class MediaDetection { return matches != null && matches.size() > 0 ? matches.get(0) : null; } - private static List> movieIndex = new ArrayList>(100000); + private static final List> movieIndex = new ArrayList>(100000); - public static synchronized List> getMovieIndex() throws IOException { - if (movieIndex.isEmpty()) { - try { - for (Movie movie : releaseInfo.getMovieList()) { - for (String name : movie.getEffectiveNamesWithoutYear()) { - movieIndex.add(new SimpleEntry(normalizePunctuation(name).toLowerCase(), movie)); + public static List> getMovieIndex() throws IOException { + synchronized (movieIndex) { + if (movieIndex.isEmpty()) { + try { + for (Movie it : releaseInfo.getMovieList()) { + movieIndex.addAll(HighPerformanceMatcher.prepare(it)); } - } - } catch (Exception e) { - // can't load movie index, just try again next time - Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load movie index: " + e.getMessage(), e); - return emptyList(); - } - } + } catch (Exception e) { + // can't load movie index, just try again next time + Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load movie index: " + e.getMessage(), e); - return movieIndex; + // if we can't use internal index we can only rely on online search + return emptyList(); + } + } + return movieIndex; + } } public static List matchMovieName(final Collection files, boolean strict, int maxStartIndex) throws Exception { @@ -778,19 
+781,19 @@ public class MediaDetection { final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex); final Map matchMap = new HashMap(); - for (Entry movie : getMovieIndex()) { - for (String name : files) { - String movieIdentifier = movie.getKey(); - String commonName = nameMatcher.matchFirstCommonSequence(name, movieIdentifier); - if (commonName != null && commonName.length() >= movieIdentifier.length()) { - String strictMovieIdentifier = movie.getKey() + " " + movie.getValue().getYear(); - String strictCommonName = nameMatcher.matchFirstCommonSequence(name, strictMovieIdentifier); - if (strictCommonName != null && strictCommonName.length() >= strictMovieIdentifier.length()) { + List names = HighPerformanceMatcher.prepare(files); + + for (IndexEntry movie : getMovieIndex()) { + for (CollationKey[] name : names) { + CollationKey[] commonName = nameMatcher.matchFirstCommonSequence(name, movie.lenientKey); + if (commonName != null && commonName.length >= movie.lenientKey.length) { + CollationKey[] strictCommonName = nameMatcher.matchFirstCommonSequence(name, movie.strictKey); + if (strictCommonName != null && strictCommonName.length >= movie.strictKey.length) { // prefer strict match - matchMap.put(movie.getValue(), strictCommonName); + matchMap.put(movie.object, movie.strictName); } else if (!strict) { // make sure the common identifier is not just the year - matchMap.put(movie.getValue(), commonName); + matchMap.put(movie.object, movie.lenientName); } } } @@ -826,21 +829,20 @@ public class MediaDetection { float similarityThreshold = strict ? 
0.9f : 0.5f; LinkedList movies = new LinkedList(); - for (Entry it : getMovieIndex()) { - String name = spacing.matcher(it.getKey()).replaceAll("").toLowerCase(); + for (IndexEntry it : getMovieIndex()) { + String name = spacing.matcher(it.lenientName).replaceAll("").toLowerCase(); for (String term : terms) { if (term.contains(name)) { - String year = String.valueOf(it.getValue().getYear()); + String year = String.valueOf(it.object.getYear()); if (term.contains(year) && metric.getSimilarity(term, name + year) > similarityThreshold) { - movies.addFirst(it.getValue()); + movies.addFirst(it.object); } else if (metric.getSimilarity(term, name) > similarityThreshold) { - movies.addLast(it.getValue()); + movies.addLast(it.object); } break; } } } - return new ArrayList(movies); } @@ -1082,31 +1084,79 @@ public class MediaDetection { return probableMatches; } + public static class IndexEntry implements Serializable { + + private final T object; + private final String lenientName; + private final String strictName; + private final CollationKey[] lenientKey; + private final CollationKey[] strictKey; + + public IndexEntry(T object, String lenientName, String strictName, CollationKey[] lenientKey, CollationKey[] strictKey) { + this.object = object; + this.lenientName = lenientName; + this.strictName = strictName; + this.lenientKey = lenientKey; + this.strictKey = strictKey; + } + } + /* * Heavy-duty name matcher used for matching a file to or more movies (out of a list of ~50k) */ private static class HighPerformanceMatcher extends CommonSequenceMatcher { private static final Collator collator = getLenientCollator(Locale.ENGLISH); + private static final Pattern space = Pattern.compile("\\s+"); - private static final Map transformCache = synchronizedMap(new HashMap(65536)); + public static CollationKey[] prepare(String sequence) { + String[] words = space.split(normalizePunctuation(sequence)); + CollationKey[] keys = new CollationKey[words.length]; + for (int i = 0; i < 
words.length; i++) { + keys[i] = collator.getCollationKey(words[i]); + } + return keys; + } + + public static List prepare(Collection sequences) { + List result = new ArrayList(sequences.size()); + for (String it : sequences) { + result.add(prepare(it)); + } + return result; + } + + public static List> prepare(Movie m) { + List effectiveNamesWithoutYear = m.getEffectiveNamesWithoutYear(); + List effectiveNames = m.getEffectiveNames(); + List> index = new ArrayList>(effectiveNames.size()); + + for (int i = 0; i < effectiveNames.size(); i++) { + String lenientName = normalizePunctuation(effectiveNamesWithoutYear.get(i)); + String strictName = normalizePunctuation(effectiveNames.get(i)); + index.add(new IndexEntry(m, lenientName, strictName, prepare(lenientName), prepare(strictName))); + } + return index; + } + + public static List> prepare(SearchResult r) { + List effectiveNames = r.getEffectiveNames(); + List> index = new ArrayList>(effectiveNames.size()); + + for (int i = 0; i < effectiveNames.size(); i++) { + String lenientName = normalizePunctuation(effectiveNames.get(i)); + index.add(new IndexEntry(r, lenientName, null, prepare(lenientName), null)); + } + return index; + } public HighPerformanceMatcher(int maxStartIndex) { super(collator, maxStartIndex, true); } @Override - protected CollationKey[] split(String sequence) { - CollationKey[] value = transformCache.get(sequence); - if (value == null) { - value = super.split(normalize(sequence)); - transformCache.put(sequence, value); - } - return value; - } - - public String normalize(String sequence) { - return normalizePunctuation(sequence); // only normalize punctuation, make sure we keep the year (important for movie matching) + public CollationKey[] split(String sequence) { + throw new UnsupportedOperationException("requires ahead-of-time collation"); } } diff --git a/source/net/sourceforge/filebot/similarity/CommonSequenceMatcher.java b/source/net/sourceforge/filebot/similarity/CommonSequenceMatcher.java 
index 3b865d0e..848b5efe 100644 --- a/source/net/sourceforge/filebot/similarity/CommonSequenceMatcher.java +++ b/source/net/sourceforge/filebot/similarity/CommonSequenceMatcher.java @@ -1,7 +1,5 @@ - package net.sourceforge.filebot.similarity; - import static java.util.Arrays.*; import static java.util.Collections.*; @@ -11,9 +9,8 @@ import java.util.HashMap; import java.util.Locale; import java.util.Map; - public class CommonSequenceMatcher { - + public static Collator getLenientCollator(Locale locale) { // use maximum strength collator by default Collator collator = Collator.getInstance(locale); @@ -21,52 +18,54 @@ public class CommonSequenceMatcher { collator.setStrength(Collator.PRIMARY); return collator; } - + protected final Collator collator; protected final int commonSequenceMaxStartIndex; protected final boolean returnFirstMatch; - - + public CommonSequenceMatcher(Collator collator, int commonSequenceMaxStartIndex, boolean returnFirstMatch) { this.collator = collator; this.commonSequenceMaxStartIndex = commonSequenceMaxStartIndex; this.returnFirstMatch = returnFirstMatch; } - - + public Collator getCollator() { return collator; } - - + public String matchFirstCommonSequence(String... names) { - CollationKey[] common = null; - - for (String it : names) { - CollationKey[] words = split(it); - + CollationKey[][] words = new CollationKey[names.length][]; + for (int i = 0; i < names.length; i++) { + words[i] = split(names[i]); + } + return synth(matchFirstCommonSequence(words)); + } + + public > E[] matchFirstCommonSequence(E[]... 
names) { + E[] common = null; + + for (E[] words : names) { if (common == null) { // initialize common with current word array common = words; } else { // find common sequence common = firstCommonSequence(common, words, commonSequenceMaxStartIndex, returnFirstMatch); - + if (common == null) { // no common sequence return null; } } } - - if (common == null) - return null; - - return synth(common); + return common; } - - + protected String synth(CollationKey[] keys) { + if (keys == null) { + return null; + } + StringBuilder sb = new StringBuilder(); for (CollationKey it : keys) { if (sb.length() > 0) { @@ -76,15 +75,13 @@ public class CommonSequenceMatcher { } return sb.toString(); } - - - protected CollationKey[] split(String sequence) { + + public CollationKey[] split(String sequence) { return getCollationKeys(sequence.split("\\s+")); } - + private final Map collationKeyDictionary = synchronizedMap(new HashMap(256)); - - + protected CollationKey[] getCollationKeys(String[] words) { CollationKey[] keys = new CollationKey[words.length]; for (int i = 0; i < keys.length; i++) { @@ -96,24 +93,23 @@ public class CommonSequenceMatcher { } return keys; } - - + protected > E[] firstCommonSequence(E[] seq1, E[] seq2, int maxStartIndex, boolean returnFirstMatch) { E[] matchSeq = null; for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) { for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) { // common sequence length int len = 0; - + // iterate over common sequence while ((i + len < seq1.length) && (j + len < seq2.length) && (seq1[i + len].compareTo(seq2[j + len]) == 0)) { len++; } - + // check if a common sequence was found if (len > (matchSeq == null ? 
0 : matchSeq.length)) { matchSeq = copyOfRange(seq1, i, i + len); - + // look for first match if (returnFirstMatch) { return matchSeq; diff --git a/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java b/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java index d0b14a3a..7414b700 100644 --- a/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java +++ b/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java @@ -48,7 +48,7 @@ public class SeriesNameMatcher { commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3, true) { @Override - protected CollationKey[] split(String sequence) { + public CollationKey[] split(String sequence) { return super.split(normalize(sequence)); } }; diff --git a/source/net/sourceforge/filebot/web/Movie.java b/source/net/sourceforge/filebot/web/Movie.java index 0b80df3a..7da3804b 100644 --- a/source/net/sourceforge/filebot/web/Movie.java +++ b/source/net/sourceforge/filebot/web/Movie.java @@ -43,6 +43,10 @@ public class Movie extends SearchResult { return tmdbId; } + public String getNameWithYear() { + return toString(name, year); + } + @Override public List getEffectiveNames() { if (aliasNames == null || aliasNames.length == 0) {