* performance improvements / switch to series.list.gz

* use a substring-before-stopword rule (keep only the text preceding the first matched stopword) when cleaning up tokens from movie filenames
* added series.list.gz script
This commit is contained in:
Reinhard Pointner 2012-02-23 18:48:35 +00:00
parent 4d3c2c6f55
commit 806ffdc91d
7 changed files with 105 additions and 104 deletions

22
BuildData.groovy Normal file
View File

@ -0,0 +1,22 @@
// Builds the series name list: collects English series names from the TheTVDB listing page
// and anime titles from AniDB, normalizes them, and writes the sorted, de-duplicated names
// as gzip-compressed utf-8 text (one name per line) to args[0].
// NOTE(review): args[0] is presumably the output file handed in by the script runner -- confirm.

// TheTVDB: parse the listing page and pick the table whose id attribute is "listtable"
def listing = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search').fetch().getHtml('utf-8')
def listTable = listing.depthFirst().TABLE.find{ table -> table['@id'] == "listtable" }

// keep only rows with exactly 3 cells whose second cell reads 'English', then take the link text of the first cell
def englishRows = listTable.depthFirst().TR.findAll{ row -> row.TD.size() == 3 && row.TD[1].text() == 'English' }
def names = englishRows.findResults{ row -> row.TD[0].A.text() }

// AniDB: append primary titles followed by official 'en' titles (findResults drops null results)
def anime = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles()
def primaryTitles = anime.findResults{ title -> title.getPrimaryTitle() }
def englishTitles = anime.findResults{ title -> title.getOfficialTitle('en') }
names = names + primaryTitles + englishTitles

// keep names that start with A-Z and contain 3 consecutive \p{Alpha} characters, then normalize punctuation
def leadingCapital = /^[A-Z]/
def threeLetters = /[\p{Alpha}]{3}/
names = names.findAll{ name -> (name =~ leadingCapital) && (name =~ threeLetters) }
names = names.findResults{ name -> net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(name) }

// sort first, then remove duplicates
names = names.sort()
names = names.unique()

// write one name per line through a gzip stream
args[0].withOutputStream{ stream ->
    def gzip = new java.util.zip.GZIPOutputStream(stream)
    gzip.withWriter('utf-8'){ writer ->
        for (name in names) {
            writer.append(name).append('\n')
        }
    }
}

println "Series Count: ${names.size()}"

View File

@ -63,8 +63,8 @@ import static net.sourceforge.filebot.web.WebRequest.*
URL.metaClass.get = { readAll(getReader(delegate.openConnection())) } URL.metaClass.get = { readAll(getReader(delegate.openConnection())) }
URL.metaClass.fetch = { fetch(delegate) } URL.metaClass.fetch = { fetch(delegate) }
URL.metaClass.getHtml = { new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(delegate))) } URL.metaClass.getHtml = { new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(readAll(getReader(delegate.openConnection()))) }
ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(new StringReader(Charset.forName(csn).decode(delegate.duplicate()).toString())))) } ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(Charset.forName(csn).decode(delegate.duplicate()).toString()) }
URL.metaClass.post = { Map parameters -> post(delegate.openConnection(), parameters) } URL.metaClass.post = { Map parameters -> post(delegate.openConnection(), parameters) }
URL.metaClass.post = { byte[] data, contentType = 'application/octet-stream' -> post(delegate.openConnection(), data, contentType) } URL.metaClass.post = { byte[] data, contentType = 'application/octet-stream' -> post(delegate.openConnection(), data, contentType) }

View File

@ -25,6 +25,7 @@ import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set; import java.util.Set;
import java.util.SortedMap; import java.util.SortedMap;
import java.util.TreeSet; import java.util.TreeSet;
@ -41,7 +42,6 @@ import net.sourceforge.filebot.similarity.NameSimilarityMetric;
import net.sourceforge.filebot.similarity.SeriesNameMatcher; import net.sourceforge.filebot.similarity.SeriesNameMatcher;
import net.sourceforge.filebot.similarity.SimilarityComparator; import net.sourceforge.filebot.similarity.SimilarityComparator;
import net.sourceforge.filebot.similarity.SimilarityMetric; import net.sourceforge.filebot.similarity.SimilarityMetric;
import net.sourceforge.filebot.web.AnidbClient.AnidbSearchResult;
import net.sourceforge.filebot.web.Movie; import net.sourceforge.filebot.web.Movie;
import net.sourceforge.filebot.web.MovieIdentificationService; import net.sourceforge.filebot.web.MovieIdentificationService;
import net.sourceforge.filebot.web.SearchResult; import net.sourceforge.filebot.web.SearchResult;
@ -155,8 +155,8 @@ public class MediaDetection {
} }
// match folder names against known series names // match folder names against known series names
for (TheTVDBSearchResult match : matchSeriesByName(filenames.toArray(new String[0]))) { for (String match : matchSeriesByName(filenames.toArray(new String[0]))) {
names.put(match.getName().toLowerCase(), match.getName()); names.put(match.toLowerCase(), match);
} }
} catch (Exception e) { } catch (Exception e) {
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.WARNING, "Failed to match folder structure: " + e.getMessage(), e); Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.WARNING, "Failed to match folder structure: " + e.getMessage(), e);
@ -177,75 +177,29 @@ public class MediaDetection {
} }
private static final HashMap<TheTVDBSearchResult, String> seriesNameIndex = new HashMap<TheTVDBSearchResult, String>(32768); public static List<String> matchSeriesByName(String... names) throws Exception {
HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
List<String> matches = new ArrayList<String>();
for (String identifier : releaseInfo.getSeriesList()) {
public static List<TheTVDBSearchResult> matchSeriesByName(String... names) throws Exception {
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
final Map<TheTVDBSearchResult, String> matchMap = new HashMap<TheTVDBSearchResult, String>();
synchronized (seriesNameIndex) {
if (seriesNameIndex.isEmpty()) {
for (TheTVDBSearchResult entry : releaseInfo.getSeriesList()) {
seriesNameIndex.put(entry, nameMatcher.normalize(entry.getName()));
}
}
}
for (Entry<TheTVDBSearchResult, String> it : seriesNameIndex.entrySet()) {
for (String name : names) { for (String name : names) {
String identifier = it.getValue();
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier); String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
if (commonName != null && commonName.length() >= identifier.length()) { if (commonName != null && commonName.length() >= identifier.length()) {
matchMap.put(it.getKey(), commonName); matches.add(commonName);
} }
} }
} }
// sort by length of name match (descending) // sort by length of name match (descending)
List<TheTVDBSearchResult> results = new ArrayList<TheTVDBSearchResult>(matchMap.keySet()); sort(matches, new Comparator<String>() {
sort(results, new Comparator<TheTVDBSearchResult>() {
@Override @Override
public int compare(TheTVDBSearchResult a, TheTVDBSearchResult b) { public int compare(String a, String b) {
return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length())); return Integer.valueOf(b.length()).compareTo(Integer.valueOf(a.length()));
} }
}); });
return results; return matches;
}
public static Collection<AnidbSearchResult> matchAnimeByName(String... names) throws Exception {
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
final Map<AnidbSearchResult, String> matchMap = new HashMap<AnidbSearchResult, String>();
for (final AnidbSearchResult entry : WebServices.AniDB.getAnimeTitles()) {
for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) {
if (identifier == null || identifier.isEmpty())
continue;
identifier = nameMatcher.normalize(identifier);
for (String name : names) {
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
if (commonName != null && commonName.length() >= identifier.length()) {
matchMap.put(entry, commonName);
}
}
}
}
// sort by length of name match (descending)
List<AnidbSearchResult> results = new ArrayList<AnidbSearchResult>(matchMap.keySet());
sort(results, new Comparator<AnidbSearchResult>() {
@Override
public int compare(AnidbSearchResult a, AnidbSearchResult b) {
return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length()));
}
});
return results;
} }
@ -366,7 +320,11 @@ public class MediaDetection {
public static String stripReleaseInfo(String name) throws IOException { public static String stripReleaseInfo(String name) throws IOException {
return releaseInfo.cleanRelease(name, true); try {
return releaseInfo.cleanRelease(singleton(name), true).iterator().next();
} catch (NoSuchElementException e) {
return ""; // default value in case all tokens are stripped away
}
} }

View File

@ -6,11 +6,13 @@ import static java.util.Arrays.*;
import static java.util.ResourceBundle.*; import static java.util.ResourceBundle.*;
import static java.util.regex.Pattern.*; import static java.util.regex.Pattern.*;
import static net.sourceforge.filebot.similarity.Normalization.*; import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.FileUtilities.*;
import static net.sourceforge.tuned.StringUtilities.*; import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File; import java.io.File;
import java.io.FileFilter; import java.io.FileFilter;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.text.Collator; import java.text.Collator;
@ -32,7 +34,6 @@ import java.util.zip.GZIPInputStream;
import net.sourceforge.filebot.web.CachedResource; import net.sourceforge.filebot.web.CachedResource;
import net.sourceforge.filebot.web.Movie; import net.sourceforge.filebot.web.Movie;
import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
import net.sourceforge.tuned.ByteBufferInputStream; import net.sourceforge.tuned.ByteBufferInputStream;
@ -89,28 +90,32 @@ public class ReleaseInfo {
} }
public List<String> cleanRelease(Iterable<String> items, boolean strict) throws IOException { public List<String> cleanRelease(Collection<String> items, boolean strict) throws IOException {
Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet(); Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet();
return clean(items, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages));
}
Pattern releaseGroup = getReleaseGroupPattern(strict);
Pattern languageSuffix = getLanguageSuffixPattern(languages);
Pattern languageTag = getLanguageTagPattern(languages);
Pattern videoSource = getVideoSourcePattern();
Pattern videoFormat = getVideoFormatPattern();
Pattern resolution = getResolutionPattern();
Pattern queryBlacklist = getBlacklistPattern();
public String cleanRelease(String item, boolean strict) throws IOException { Pattern[] blacklist = new Pattern[] { releaseGroup, languageSuffix, languageTag, videoSource, videoFormat, resolution, queryBlacklist };
Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet(); Pattern[] stopwords = new Pattern[] { getReleaseGroupPattern(true), languageSuffix, languageTag, videoSource, videoFormat, resolution };
return clean(item, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages));
}
List<String> output = new ArrayList<String>(items.size());
public List<String> clean(Iterable<String> items, Pattern... blacklisted) {
List<String> cleanedItems = new ArrayList<String>();
for (String it : items) { for (String it : items) {
String cleanedItem = clean(it, blacklisted); it = substringBefore(it, stopwords);
if (cleanedItem.length() > 0) { it = clean(it, blacklist);
cleanedItems.add(cleanedItem);
// ignore empty values
if (it.length() > 0) {
output.add(it);
} }
} }
return cleanedItems; return output;
} }
@ -123,7 +128,20 @@ public class ReleaseInfo {
} }
public Pattern getLanguageOptionPattern(Collection<String> languages) { public String substringBefore(String item, Pattern... stopwords) {
for (Pattern it : stopwords) {
Matcher matcher = it.matcher(item);
if (matcher.find()) {
return item.substring(0, matcher.start()); // use substring before the matched stopword
}
}
// no stopword found, keep original string
return item;
}
public Pattern getLanguageTagPattern(Collection<String> languages) {
// [en] // [en]
return compile("(?<=[-\\[{(])(" + join(quoteAll(languages), "|") + ")(?=\\p{Punct})", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ); return compile("(?<=[-\\[{(])(" + join(quoteAll(languages), "|") + ")(?=\\p{Punct})", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ);
} }
@ -172,7 +190,7 @@ public class ReleaseInfo {
} }
public synchronized TheTVDBSearchResult[] getSeriesList() throws IOException { public synchronized String[] getSeriesList() throws IOException {
return seriesListResource.get(); return seriesListResource.get();
} }
@ -186,7 +204,7 @@ public class ReleaseInfo {
protected final CachedResource<String[]> releaseGroupResource = new PatternResource(getBundle(getClass().getName()).getString("url.release-groups")); protected final CachedResource<String[]> releaseGroupResource = new PatternResource(getBundle(getClass().getName()).getString("url.release-groups"));
protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist")); protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist"));
protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list")); protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list"));
protected final CachedResource<TheTVDBSearchResult[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list")); protected final CachedResource<String[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list"));
protected static class PatternResource extends CachedResource<String[]> { protected static class PatternResource extends CachedResource<String[]> {
@ -206,7 +224,7 @@ public class ReleaseInfo {
protected static class MovieResource extends CachedResource<Movie[]> { protected static class MovieResource extends CachedResource<Movie[]> {
public MovieResource(String resource) { public MovieResource(String resource) {
super(resource, Movie[].class, 24 * 60 * 60 * 1000); // 24h update interval super(resource, Movie[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
} }
@ -227,25 +245,16 @@ public class ReleaseInfo {
} }
protected static class SeriesResource extends CachedResource<TheTVDBSearchResult[]> { protected static class SeriesResource extends CachedResource<String[]> {
public SeriesResource(String resource) { public SeriesResource(String resource) {
super(resource, TheTVDBSearchResult[].class, 24 * 60 * 60 * 1000); // 24h update interval super(resource, String[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
} }
@Override @Override
public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException { public String[] process(ByteBuffer data) throws IOException {
Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n"); return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "utf-8")).split("\\n");
List<TheTVDBSearchResult> tvshows = new ArrayList<TheTVDBSearchResult>();
while (scanner.hasNext()) {
int sid = scanner.nextInt();
String name = scanner.next();
tvshows.add(new TheTVDBSearchResult(name, sid));
}
return tvshows.toArray(new TheTVDBSearchResult[0]);
} }
} }

View File

@ -12,7 +12,7 @@ url.query-blacklist: http://filebot.sourceforge.net/data/query-blacklist.txt
# list of all movies (id, name, year) # list of all movies (id, name, year)
url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz
url.series-list: http://filebot.sourceforge.net/data/tvshows.txt.gz url.series-list: http://filebot.sourceforge.net/data/series.list.gz
# disk folder matcher # disk folder matcher
pattern.diskfolder.entry: ^BDMV$|^HVDVD_TS$|^VIDEO_TS$|^AUDIO_TS$|^VCD$ pattern.diskfolder.entry: ^BDMV$|^HVDVD_TS$|^VIDEO_TS$|^AUDIO_TS$|^VCD$

View File

@ -2,22 +2,34 @@
package net.sourceforge.filebot.similarity; package net.sourceforge.filebot.similarity;
import static java.util.regex.Pattern.*;
import java.util.regex.Pattern;
public class Normalization { public class Normalization {
private static final Pattern apostrophe = compile("['`´ʻ]+");
private static final Pattern punctuation = compile("[\\p{Punct}\\p{Space}]+");
private static final Pattern[] brackets = new Pattern[] { compile("\\([^\\(]*\\)"), compile("\\[[^\\[]*\\]"), compile("\\{[^\\{]*\\}") };
private static final Pattern checksum = compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");
public static String normalizePunctuation(String name) { public static String normalizePunctuation(String name) {
// remove/normalize special characters // remove/normalize special characters
name = name.replaceAll("[`´ʻ]+", ""); name = apostrophe.matcher(name).replaceAll("");
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " "); name = punctuation.matcher(name).replaceAll(" ");
return name.trim(); return name.trim();
} }
public static String normalizeBrackets(String name) { public static String normalizeBrackets(String name) {
// remove group names and checksums, any [...] or (...) // remove group names and checksums, any [...] or (...)
name = name.replaceAll("\\([^\\(]*\\)", " "); for (Pattern it : brackets) {
name = name.replaceAll("\\[[^\\[]*\\]", " "); name = it.matcher(name).replaceAll(" ");
name = name.replaceAll("\\{[^\\{]*\\}", " "); }
return name; return name;
} }
@ -25,7 +37,7 @@ public class Normalization {
public static String removeEmbeddedChecksum(String string) { public static String removeEmbeddedChecksum(String string) {
// match embedded checksum and surrounding brackets // match embedded checksum and surrounding brackets
return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", ""); return checksum.matcher(string).replaceAll("");
} }
} }

BIN
website/data/series.list.gz Normal file

Binary file not shown.