From 18df1820a75b2591bc3a0a015e45c89030272768 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Mon, 1 Apr 2013 09:39:31 +0000 Subject: [PATCH] * improved series lookup for series-name-without-spacing type naming, the worst of all naming styles... e.g. isd-thewalkingdead-s03e12.avi --- .../filebot/media/MediaDetection.java | 91 +++++++++++++++++-- .../filebot/media/ReleaseInfo.java | 22 ----- .../filebot/media/ReleaseInfo.properties | 3 - .../sourceforge/filebot/web/AnidbClient.java | 7 +- website/data/release-groups.txt | 1 + 5 files changed, 89 insertions(+), 35 deletions(-) diff --git a/source/net/sourceforge/filebot/media/MediaDetection.java b/source/net/sourceforge/filebot/media/MediaDetection.java index 899ab3f1..315572ac 100644 --- a/source/net/sourceforge/filebot/media/MediaDetection.java +++ b/source/net/sourceforge/filebot/media/MediaDetection.java @@ -2,6 +2,7 @@ package net.sourceforge.filebot.media; +import static java.util.Arrays.*; import static java.util.Collections.*; import static java.util.regex.Pattern.*; import static net.sourceforge.filebot.MediaTypes.*; @@ -51,6 +52,7 @@ import net.sourceforge.filebot.similarity.SequenceMatchSimilarity; import net.sourceforge.filebot.similarity.SeriesNameMatcher; import net.sourceforge.filebot.similarity.SimilarityComparator; import net.sourceforge.filebot.similarity.SimilarityMetric; +import net.sourceforge.filebot.web.AnidbClient.AnidbSearchResult; import net.sourceforge.filebot.web.Date; import net.sourceforge.filebot.web.Episode; import net.sourceforge.filebot.web.Movie; @@ -283,7 +285,7 @@ public class MediaDetection { Set filenames = new LinkedHashSet(); for (File f : files) { for (int i = 0; i < 3 && f != null; i++, f = f.getParentFile()) { - (i == 0 ? filenames : folders).add(normalizeBrackets(f.getName())); + (i == 0 ? filenames : folders).add(normalizeBrackets(getName(f))); } } @@ -302,6 +304,22 @@ public class MediaDetection { matches.addAll(matchSeriesByName(filenames, 3)); } + // assume name without spacing will mess up any lookup + if (matches.isEmpty()) { + // try to narrow down file to series name as best as possible + SeriesNameMatcher snm = new SeriesNameMatcher(); + String[] sns = filenames.toArray(new String[0]); + for (int i = 0; i < sns.length; i++) { + String sn = snm.matchByEpisodeIdentifier(sns[i]); + if (sn != null) { + sns[i] = sn; + } + } + for (SearchResult it : matchSeriesFromStringWithoutSpacing(stripReleaseInfo(asList(sns), false), true)) { + matches.add(it.getName()); + } + } + // pass along only valid terms names.addAll(stripBlacklistedTerms(matches)); } catch (Exception e) { @@ -341,15 +359,40 @@ public class MediaDetection { return matches; } + private static List> seriesIndex = new ArrayList>(75000); + + + public static synchronized List> getSeriesIndex() throws IOException { + if (seriesIndex.isEmpty()) { + try { + for (TheTVDBSearchResult it : releaseInfo.getTheTVDBIndex()) { + seriesIndex.add(new SimpleEntry(normalizePunctuation(it.getName()).toLowerCase(), it)); + } + for (AnidbSearchResult it : releaseInfo.getAnidbIndex()) { + seriesIndex.add(new SimpleEntry(normalizePunctuation(it.getPrimaryTitle()).toLowerCase(), it)); + if (it.getEnglishTitle() != null) { + seriesIndex.add(new SimpleEntry(normalizePunctuation(it.getEnglishTitle()).toLowerCase(), it)); + } + } + } catch (Exception e) { + // can't load movie index, just try again next time + Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.SEVERE, "Failed to load series index: " + e.getMessage(), e); + return emptyList(); + } + } + + return seriesIndex; + } + public static List matchSeriesByName(Collection names, int maxStartIndex) throws Exception { HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex); List matches = new ArrayList(); - String[] seriesIndex = releaseInfo.getSeriesList(); for (String name : names) { String bestMatch = ""; - for (String identifier : seriesIndex) { + for (Entry it : getSeriesIndex()) { + String identifier = it.getKey(); String commonName = nameMatcher.matchFirstCommonSequence(name, identifier); if (commonName != null && commonName.length() >= identifier.length() && commonName.length() > bestMatch.length()) { bestMatch = commonName; @@ -373,6 +416,38 @@ public class MediaDetection { } + public static List matchSeriesFromStringWithoutSpacing(Collection names, boolean strict) throws IOException { + // clear name of punctuation, spacing, and leading 'The' or 'A' that are common causes for word-lookup to fail + Pattern spacing = Pattern.compile("(^(?i)(The|A)\\b)|[\\p{Punct}\\p{Space}]+"); + + List terms = new ArrayList(names.size()); + for (String it : names) { + String term = spacing.matcher(it).replaceAll("").toLowerCase(); + if (term.length() >= 3) { + terms.add(term); // only consider words, not just random letters + } + } + + // similarity threshold based on strict/non-strict + SimilarityMetric metric = new NameSimilarityMetric(); + float similarityThreshold = strict ? 0.75f : 0.5f; + + List seriesList = new ArrayList(); + for (Entry it : getSeriesIndex()) { + String name = spacing.matcher(it.getKey()).replaceAll("").toLowerCase(); + for (String term : terms) { + if (term.contains(name)) { + if (metric.getSimilarity(term, name) >= similarityThreshold) { + seriesList.add(it.getValue()); + } + break; + } + } + } + return seriesList; + } + + public static Collection detectMovie(File movieFile, MovieIdentificationService hashLookupService, MovieIdentificationService queryLookupService, Locale locale, boolean strict) throws Exception { Set options = new LinkedHashSet(); @@ -563,15 +638,13 @@ public class MediaDetection { return matches != null && matches.size() > 0 ? matches.get(0) : null; } - private static List> movieIndex; + private static List> movieIndex = new ArrayList>(100000); - private static synchronized List> getMovieIndex() throws IOException { - if (movieIndex == null) { + public static synchronized List> getMovieIndex() throws IOException { + if (movieIndex.isEmpty()) { try { - Movie[] movies = releaseInfo.getMovieList(); - movieIndex = new ArrayList>(movies.length); - for (Movie movie : movies) { + for (Movie movie : releaseInfo.getMovieList()) { movieIndex.add(new SimpleEntry(normalizePunctuation(movie.getName()).toLowerCase(), movie)); } } catch (Exception e) { diff --git a/source/net/sourceforge/filebot/media/ReleaseInfo.java b/source/net/sourceforge/filebot/media/ReleaseInfo.java index a8419e31..bbd9fe71 100644 --- a/source/net/sourceforge/filebot/media/ReleaseInfo.java +++ b/source/net/sourceforge/filebot/media/ReleaseInfo.java @@ -7,13 +7,11 @@ import static java.util.Collections.*; import static java.util.ResourceBundle.*; import static java.util.regex.Pattern.*; import static net.sourceforge.filebot.similarity.Normalization.*; -import static net.sourceforge.tuned.FileUtilities.*; import static net.sourceforge.tuned.StringUtilities.*; import java.io.File; import java.io.FileFilter; import java.io.IOException; -import java.io.InputStreamReader; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.text.Collator; @@ -238,11 +236,6 @@ public class ReleaseInfo { } - public String[] getSeriesList() throws IOException { - return seriesListResource.get(); - } - - public TheTVDBSearchResult[] getTheTVDBIndex() throws IOException { return tvdbIndexResource.get(); } @@ -279,7 +272,6 @@ public class ReleaseInfo { protected final CachedResource queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist")); protected final CachedResource excludeBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.exclude-blacklist")); protected final CachedResource movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list")); - protected final CachedResource seriesListResource = new SeriesListResource(getBundle(getClass().getName()).getString("url.series-list")); protected final CachedResource seriesDirectMappingsResource = new PatternResource(getBundle(getClass().getName()).getString("url.series-mappings")); protected final CachedResource tvdbIndexResource = new TheTVDBIndexResource(getBundle(getClass().getName()).getString("url.thetvdb-index")); protected final CachedResource anidbIndexResource = new AnidbIndexResource(getBundle(getClass().getName()).getString("url.anidb-index")); @@ -323,20 +315,6 @@ public class ReleaseInfo { } - protected static class SeriesListResource extends CachedResource { - - public SeriesListResource(String resource) { - super(resource, String[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week - } - - - @Override - public String[] process(ByteBuffer data) throws IOException { - return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8")).split("\\n"); - } - } - - protected static class TheTVDBIndexResource extends CachedResource { public TheTVDBIndexResource(String resource) { diff --git a/source/net/sourceforge/filebot/media/ReleaseInfo.properties b/source/net/sourceforge/filebot/media/ReleaseInfo.properties index 19653326..84789bdd 100644 --- a/source/net/sourceforge/filebot/media/ReleaseInfo.properties +++ b/source/net/sourceforge/filebot/media/ReleaseInfo.properties @@ -19,9 +19,6 @@ url.series-mappings: http://filebot.net/data/series-mappings.txt # list of all movies (id, name, year) url.movie-list: http://filebot.net/data/movies.txt.gz -# list of tv show and anime names -url.series-list: http://filebot.net/data/series.list.gz - # TheTVDB index url.thetvdb-index: http://filebot.net/data/thetvdb.txt.gz diff --git a/source/net/sourceforge/filebot/web/AnidbClient.java b/source/net/sourceforge/filebot/web/AnidbClient.java index 7c389430..d19eaf71 100644 --- a/source/net/sourceforge/filebot/web/AnidbClient.java +++ b/source/net/sourceforge/filebot/web/AnidbClient.java @@ -90,7 +90,7 @@ public class AnidbClient extends AbstractEpisodeListProvider { @Override protected Set getFields(AnidbSearchResult anime) { - return set(anime.getPrimaryTitle(), anime.getOfficialTitle("en")); + return set(anime.getPrimaryTitle(), anime.getEnglishTitle()); } }; @@ -263,6 +263,11 @@ public class AnidbClient extends AbstractEpisodeListProvider { } + public String getEnglishTitle() { + return officialTitle != null ? officialTitle.get("en") : null; + } + + public String getOfficialTitle(String key) { return officialTitle != null ? officialTitle.get(key) : null; } diff --git a/website/data/release-groups.txt b/website/data/release-groups.txt index 8a55ead3..d138d072 100644 --- a/website/data/release-groups.txt +++ b/website/data/release-groups.txt @@ -793,6 +793,7 @@ iNVANDRAREN iON iRB iRoNiCs +iSD iSG iSRAELiTE iTA