* Improved series lookup for file names that run the series name together without any spacing (the worst of all naming styles), e.g. isd-thewalkingdead-s03e12.avi

This commit is contained in:
Reinhard Pointner 2013-04-01 09:39:31 +00:00
parent 2793321715
commit 18df1820a7
5 changed files with 89 additions and 35 deletions

View File

@ -2,6 +2,7 @@
package net.sourceforge.filebot.media; package net.sourceforge.filebot.media;
import static java.util.Arrays.*;
import static java.util.Collections.*; import static java.util.Collections.*;
import static java.util.regex.Pattern.*; import static java.util.regex.Pattern.*;
import static net.sourceforge.filebot.MediaTypes.*; import static net.sourceforge.filebot.MediaTypes.*;
@ -51,6 +52,7 @@ import net.sourceforge.filebot.similarity.SequenceMatchSimilarity;
import net.sourceforge.filebot.similarity.SeriesNameMatcher; import net.sourceforge.filebot.similarity.SeriesNameMatcher;
import net.sourceforge.filebot.similarity.SimilarityComparator; import net.sourceforge.filebot.similarity.SimilarityComparator;
import net.sourceforge.filebot.similarity.SimilarityMetric; import net.sourceforge.filebot.similarity.SimilarityMetric;
import net.sourceforge.filebot.web.AnidbClient.AnidbSearchResult;
import net.sourceforge.filebot.web.Date; import net.sourceforge.filebot.web.Date;
import net.sourceforge.filebot.web.Episode; import net.sourceforge.filebot.web.Episode;
import net.sourceforge.filebot.web.Movie; import net.sourceforge.filebot.web.Movie;
@ -283,7 +285,7 @@ public class MediaDetection {
Set<String> filenames = new LinkedHashSet<String>(); Set<String> filenames = new LinkedHashSet<String>();
for (File f : files) { for (File f : files) {
for (int i = 0; i < 3 && f != null; i++, f = f.getParentFile()) { for (int i = 0; i < 3 && f != null; i++, f = f.getParentFile()) {
(i == 0 ? filenames : folders).add(normalizeBrackets(f.getName())); (i == 0 ? filenames : folders).add(normalizeBrackets(getName(f)));
} }
} }
@ -302,6 +304,22 @@ public class MediaDetection {
matches.addAll(matchSeriesByName(filenames, 3)); matches.addAll(matchSeriesByName(filenames, 3));
} }
// assume name without spacing will mess up any lookup
if (matches.isEmpty()) {
// try to narrow down file to series name as best as possible
SeriesNameMatcher snm = new SeriesNameMatcher();
String[] sns = filenames.toArray(new String[0]);
for (int i = 0; i < sns.length; i++) {
String sn = snm.matchByEpisodeIdentifier(sns[i]);
if (sn != null) {
sns[i] = sn;
}
}
for (SearchResult it : matchSeriesFromStringWithoutSpacing(stripReleaseInfo(asList(sns), false), true)) {
matches.add(it.getName());
}
}
// pass along only valid terms // pass along only valid terms
names.addAll(stripBlacklistedTerms(matches)); names.addAll(stripBlacklistedTerms(matches));
} catch (Exception e) { } catch (Exception e) {
@ -341,15 +359,40 @@ public class MediaDetection {
return matches; return matches;
} }
// Lazily filled list of (normalized, lower-cased series name -> search result) entries.
// It is only ever left non-empty after a load that completed without an exception.
private static List<Entry<String, SearchResult>> seriesIndex = new ArrayList<Entry<String, SearchResult>>(75000);

/**
 * Get the combined series index, loading it on first use.
 * 
 * The index is built from the TheTVDB index and the AniDB index provided by releaseInfo. Each entry is keyed by the
 * punctuation-normalized, lower-cased name; AniDB entries are added under their primary title and, when present,
 * under their English title as well. The method is synchronized, so the load itself is not run concurrently.
 * 
 * @return the shared index list, or an empty list if loading failed (the failure is logged, and the next call
 *         attempts the load again)
 * @throws IOException
 *             declared for callers; failures while loading are caught and logged inside this method
 */
public static synchronized List<Entry<String, SearchResult>> getSeriesIndex() throws IOException {
	if (seriesIndex.isEmpty()) {
		try {
			for (TheTVDBSearchResult it : releaseInfo.getTheTVDBIndex()) {
				seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(it.getName()).toLowerCase(), it));
			}
			for (AnidbSearchResult it : releaseInfo.getAnidbIndex()) {
				seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(it.getPrimaryTitle()).toLowerCase(), it));
				if (it.getEnglishTitle() != null) {
					seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(it.getEnglishTitle()).toLowerCase(), it));
				}
			}
		} catch (Exception e) {
			// can't load series index, just try again next time
			// drop whatever was added before the failure, otherwise the isEmpty() check above would
			// treat a half-loaded index as complete and never retry
			seriesIndex.clear();
			Logger.getLogger(MediaDetection.class.getName()).log(Level.SEVERE, "Failed to load series index: " + e.getMessage(), e);
			return emptyList();
		}
	}
	return seriesIndex;
}
public static List<String> matchSeriesByName(Collection<String> names, int maxStartIndex) throws Exception { public static List<String> matchSeriesByName(Collection<String> names, int maxStartIndex) throws Exception {
HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex); HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(maxStartIndex);
List<String> matches = new ArrayList<String>(); List<String> matches = new ArrayList<String>();
String[] seriesIndex = releaseInfo.getSeriesList();
for (String name : names) { for (String name : names) {
String bestMatch = ""; String bestMatch = "";
for (String identifier : seriesIndex) { for (Entry<String, SearchResult> it : getSeriesIndex()) {
String identifier = it.getKey();
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier); String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
if (commonName != null && commonName.length() >= identifier.length() && commonName.length() > bestMatch.length()) { if (commonName != null && commonName.length() >= identifier.length() && commonName.length() > bestMatch.length()) {
bestMatch = commonName; bestMatch = commonName;
@ -373,6 +416,38 @@ public class MediaDetection {
} }
/**
 * Find series whose name appears inside one of the given names when spacing and punctuation are ignored,
 * e.g. for file names that run the series name together without any separators.
 * 
 * Both the given names and the index keys are reduced the same way: a leading "The" or "A" is removed, all
 * punctuation and whitespace is removed, and the result is lower-cased. Reduced names shorter than 3 characters
 * are not used as search terms. An index entry is accepted when some term contains its reduced name and the
 * NameSimilarityMetric score of (term, name) reaches the threshold.
 * 
 * @param names
 *            candidate strings to search in
 * @param strict
 *            true uses a similarity threshold of 0.75, false uses 0.5
 * @return the search results of all accepted index entries, in index order; each index entry contributes at most once
 * @throws IOException
 *             declared by getSeriesIndex()
 */
public static List<SearchResult> matchSeriesFromStringWithoutSpacing(Collection<String> names, boolean strict) throws IOException {
	// clear name of punctuation, spacing, and leading 'The' or 'A' that are common causes for word-lookup to fail
	Pattern spacing = Pattern.compile("(^(?i)(The|A)\\b)|[\\p{Punct}\\p{Space}]+");
	
	List<String> terms = new ArrayList<String>(names.size());
	for (String it : names) {
		String term = spacing.matcher(it).replaceAll("").toLowerCase();
		if (term.length() >= 3) {
			terms.add(term); // only consider words, not just random letters
		}
	}
	
	// similarity threshold based on strict/non-strict
	SimilarityMetric metric = new NameSimilarityMetric();
	float similarityThreshold = strict ? 0.75f : 0.5f;
	
	List<SearchResult> seriesList = new ArrayList<SearchResult>();
	for (Entry<String, SearchResult> it : getSeriesIndex()) {
		String name = spacing.matcher(it.getKey()).replaceAll("").toLowerCase();
		if (name.isEmpty()) {
			// a key made up only of stripped characters is trivially "contained" in every term and carries no information
			continue;
		}
		
		for (String term : terms) {
			if (term.contains(name) && metric.getSimilarity(term, name) >= similarityThreshold) {
				seriesList.add(it.getValue());
				// stop at the first accepted term so this entry is added only once; a term that contains the name
				// but is not similar enough must not prevent the remaining terms from being checked
				break;
			}
		}
	}
	
	return seriesList;
}
public static Collection<Movie> detectMovie(File movieFile, MovieIdentificationService hashLookupService, MovieIdentificationService queryLookupService, Locale locale, boolean strict) throws Exception { public static Collection<Movie> detectMovie(File movieFile, MovieIdentificationService hashLookupService, MovieIdentificationService queryLookupService, Locale locale, boolean strict) throws Exception {
Set<Movie> options = new LinkedHashSet<Movie>(); Set<Movie> options = new LinkedHashSet<Movie>();
@ -563,15 +638,13 @@ public class MediaDetection {
return matches != null && matches.size() > 0 ? matches.get(0) : null; return matches != null && matches.size() > 0 ? matches.get(0) : null;
} }
private static List<Entry<String, Movie>> movieIndex; private static List<Entry<String, Movie>> movieIndex = new ArrayList<Entry<String, Movie>>(100000);
private static synchronized List<Entry<String, Movie>> getMovieIndex() throws IOException { public static synchronized List<Entry<String, Movie>> getMovieIndex() throws IOException {
if (movieIndex == null) { if (movieIndex.isEmpty()) {
try { try {
Movie[] movies = releaseInfo.getMovieList(); for (Movie movie : releaseInfo.getMovieList()) {
movieIndex = new ArrayList<Entry<String, Movie>>(movies.length);
for (Movie movie : movies) {
movieIndex.add(new SimpleEntry<String, Movie>(normalizePunctuation(movie.getName()).toLowerCase(), movie)); movieIndex.add(new SimpleEntry<String, Movie>(normalizePunctuation(movie.getName()).toLowerCase(), movie));
} }
} catch (Exception e) { } catch (Exception e) {

View File

@ -7,13 +7,11 @@ import static java.util.Collections.*;
import static java.util.ResourceBundle.*; import static java.util.ResourceBundle.*;
import static java.util.regex.Pattern.*; import static java.util.regex.Pattern.*;
import static net.sourceforge.filebot.similarity.Normalization.*; import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.FileUtilities.*;
import static net.sourceforge.tuned.StringUtilities.*; import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File; import java.io.File;
import java.io.FileFilter; import java.io.FileFilter;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.text.Collator; import java.text.Collator;
@ -238,11 +236,6 @@ public class ReleaseInfo {
} }
/**
 * Get the series name list held by seriesListResource.
 * 
 * @return the array produced by seriesListResource.get()
 * @throws IOException
 *             if the resource cannot be retrieved
 */
public String[] getSeriesList() throws IOException {
	String[] seriesNames = seriesListResource.get();
	return seriesNames;
}
public TheTVDBSearchResult[] getTheTVDBIndex() throws IOException { public TheTVDBSearchResult[] getTheTVDBIndex() throws IOException {
return tvdbIndexResource.get(); return tvdbIndexResource.get();
} }
@ -279,7 +272,6 @@ public class ReleaseInfo {
protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist")); protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist"));
protected final CachedResource<String[]> excludeBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.exclude-blacklist")); protected final CachedResource<String[]> excludeBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.exclude-blacklist"));
protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list")); protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list"));
protected final CachedResource<String[]> seriesListResource = new SeriesListResource(getBundle(getClass().getName()).getString("url.series-list"));
protected final CachedResource<String[]> seriesDirectMappingsResource = new PatternResource(getBundle(getClass().getName()).getString("url.series-mappings")); protected final CachedResource<String[]> seriesDirectMappingsResource = new PatternResource(getBundle(getClass().getName()).getString("url.series-mappings"));
protected final CachedResource<TheTVDBSearchResult[]> tvdbIndexResource = new TheTVDBIndexResource(getBundle(getClass().getName()).getString("url.thetvdb-index")); protected final CachedResource<TheTVDBSearchResult[]> tvdbIndexResource = new TheTVDBIndexResource(getBundle(getClass().getName()).getString("url.thetvdb-index"));
protected final CachedResource<AnidbSearchResult[]> anidbIndexResource = new AnidbIndexResource(getBundle(getClass().getName()).getString("url.anidb-index")); protected final CachedResource<AnidbSearchResult[]> anidbIndexResource = new AnidbIndexResource(getBundle(getClass().getName()).getString("url.anidb-index"));
@ -323,20 +315,6 @@ public class ReleaseInfo {
} }
/**
 * Cached resource whose payload is gzip-compressed UTF-8 text, exposed as an array with one element per line.
 */
protected static class SeriesListResource extends CachedResource<String[]> {
	
	public SeriesListResource(String resource) {
		// check for updates once a week (third argument is presumably the expiration time in milliseconds)
		super(resource, String[].class, 7 * 24 * 60 * 60 * 1000);
	}
	
	
	/**
	 * Decompress the raw data, decode it as UTF-8 and split the text at every line feed.
	 */
	@Override
	public String[] process(ByteBuffer data) throws IOException {
		GZIPInputStream uncompressed = new GZIPInputStream(new ByteBufferInputStream(data));
		InputStreamReader reader = new InputStreamReader(uncompressed, "UTF-8");
		String text = readAll(reader);
		return text.split("\\n");
	}
}
protected static class TheTVDBIndexResource extends CachedResource<TheTVDBSearchResult[]> { protected static class TheTVDBIndexResource extends CachedResource<TheTVDBSearchResult[]> {
public TheTVDBIndexResource(String resource) { public TheTVDBIndexResource(String resource) {

View File

@ -19,9 +19,6 @@ url.series-mappings: http://filebot.net/data/series-mappings.txt
# list of all movies (id, name, year) # list of all movies (id, name, year)
url.movie-list: http://filebot.net/data/movies.txt.gz url.movie-list: http://filebot.net/data/movies.txt.gz
# list of tv show and anime names
url.series-list: http://filebot.net/data/series.list.gz
# TheTVDB index # TheTVDB index
url.thetvdb-index: http://filebot.net/data/thetvdb.txt.gz url.thetvdb-index: http://filebot.net/data/thetvdb.txt.gz

View File

@ -90,7 +90,7 @@ public class AnidbClient extends AbstractEpisodeListProvider {
@Override @Override
protected Set<String> getFields(AnidbSearchResult anime) { protected Set<String> getFields(AnidbSearchResult anime) {
return set(anime.getPrimaryTitle(), anime.getOfficialTitle("en")); return set(anime.getPrimaryTitle(), anime.getEnglishTitle());
} }
}; };
@ -263,6 +263,11 @@ public class AnidbClient extends AbstractEpisodeListProvider {
} }
/**
 * Look up the official title stored under the "en" key.
 * 
 * @return the value officialTitle holds for "en", or null when officialTitle itself is null
 */
public String getEnglishTitle() {
	if (officialTitle == null) {
		return null;
	}
	return officialTitle.get("en");
}
public String getOfficialTitle(String key) { public String getOfficialTitle(String key) {
return officialTitle != null ? officialTitle.get(key) : null; return officialTitle != null ? officialTitle.get(key) : null;
} }

View File

@ -793,6 +793,7 @@ iNVANDRAREN
iON iON
iRB iRB
iRoNiCs iRoNiCs
iSD
iSG iSG
iSRAELiTE iSRAELiTE
iTA iTA