* performance improvements / switch to series.list.gz

* use before-rule when cleaning up tokens from movie filenames
* added series.list.gz script
This commit is contained in:
Reinhard Pointner 2012-02-23 18:48:35 +00:00
parent 4d3c2c6f55
commit 806ffdc91d
7 changed files with 105 additions and 104 deletions

22
BuildData.groovy Normal file
View File

@ -0,0 +1,22 @@
// Builds a gzipped, newline-separated list of series names and writes it to args[0].
// Names are collected from the TheTVDB series listing page and from the AniDB title list.

// fetch the TheTVDB listing page and locate the table with id "listtable"
def listing = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
def html = listing.fetch().getHtml('utf-8')
def table = html.depthFirst().TABLE.find{ node -> node['@id'] == "listtable" }

// keep only rows with exactly three cells whose second cell reads 'English',
// and take the link text of the first cell as the series name
def rows = table.depthFirst().TR.findAll{ row -> row.TD.size() == 3 && row.TD[1].text() == 'English' }
def names = rows.findResults{ row -> row.TD[0].A.text() }

// add primary and English official titles from AniDB (findResults skips null titles)
def anime = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles()
def primaryTitles = anime.findResults{ entry -> entry.getPrimaryTitle() }
def officialTitles = anime.findResults{ entry -> entry.getOfficialTitle('en') }
names = names + primaryTitles + officialTitles

// only accept names that start with an upper-case ASCII letter and contain
// at least three consecutive alphabetic characters, then normalize punctuation
names = names.findAll{ name -> name =~ /^[A-Z]/ && name =~ /[\p{Alpha}]{3}/ }
names = names.findResults{ name -> net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(name) }

// sorted, duplicate-free output
names = names.sort()
names = names.unique()

// write one name per line, UTF-8 encoded and gzip compressed
args[0].withOutputStream{ stream ->
    def gzip = new java.util.zip.GZIPOutputStream(stream)
    gzip.withWriter('utf-8'){ sink ->
        for (name in names) {
            sink.append(name).append('\n')
        }
    }
}

println "Series Count: " + names.size()

View File

@ -63,8 +63,8 @@ import static net.sourceforge.filebot.web.WebRequest.*
URL.metaClass.get = { readAll(getReader(delegate.openConnection())) }
URL.metaClass.fetch = { fetch(delegate) }
URL.metaClass.getHtml = { new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(delegate))) }
ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(new StringReader(Charset.forName(csn).decode(delegate.duplicate()).toString())))) }
URL.metaClass.getHtml = { new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(readAll(getReader(delegate.openConnection()))) }
ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(Charset.forName(csn).decode(delegate.duplicate()).toString()) }
URL.metaClass.post = { Map parameters -> post(delegate.openConnection(), parameters) }
URL.metaClass.post = { byte[] data, contentType = 'application/octet-stream' -> post(delegate.openConnection(), data, contentType) }

View File

@ -25,6 +25,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeSet;
@ -41,7 +42,6 @@ import net.sourceforge.filebot.similarity.NameSimilarityMetric;
import net.sourceforge.filebot.similarity.SeriesNameMatcher;
import net.sourceforge.filebot.similarity.SimilarityComparator;
import net.sourceforge.filebot.similarity.SimilarityMetric;
import net.sourceforge.filebot.web.AnidbClient.AnidbSearchResult;
import net.sourceforge.filebot.web.Movie;
import net.sourceforge.filebot.web.MovieIdentificationService;
import net.sourceforge.filebot.web.SearchResult;
@ -155,8 +155,8 @@ public class MediaDetection {
}
// match folder names against known series names
for (TheTVDBSearchResult match : matchSeriesByName(filenames.toArray(new String[0]))) {
names.put(match.getName().toLowerCase(), match.getName());
for (String match : matchSeriesByName(filenames.toArray(new String[0]))) {
names.put(match.toLowerCase(), match);
}
} catch (Exception e) {
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.WARNING, "Failed to match folder structure: " + e.getMessage(), e);
@ -177,75 +177,29 @@ public class MediaDetection {
}
private static final HashMap<TheTVDBSearchResult, String> seriesNameIndex = new HashMap<TheTVDBSearchResult, String>(32768);
public static List<TheTVDBSearchResult> matchSeriesByName(String... names) throws Exception {
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
final Map<TheTVDBSearchResult, String> matchMap = new HashMap<TheTVDBSearchResult, String>();
public static List<String> matchSeriesByName(String... names) throws Exception {
HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
List<String> matches = new ArrayList<String>();
synchronized (seriesNameIndex) {
if (seriesNameIndex.isEmpty()) {
for (TheTVDBSearchResult entry : releaseInfo.getSeriesList()) {
seriesNameIndex.put(entry, nameMatcher.normalize(entry.getName()));
}
}
}
for (Entry<TheTVDBSearchResult, String> it : seriesNameIndex.entrySet()) {
for (String identifier : releaseInfo.getSeriesList()) {
for (String name : names) {
String identifier = it.getValue();
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
if (commonName != null && commonName.length() >= identifier.length()) {
matchMap.put(it.getKey(), commonName);
matches.add(commonName);
}
}
}
// sort by length of name match (descending)
List<TheTVDBSearchResult> results = new ArrayList<TheTVDBSearchResult>(matchMap.keySet());
sort(results, new Comparator<TheTVDBSearchResult>() {
sort(matches, new Comparator<String>() {
@Override
public int compare(TheTVDBSearchResult a, TheTVDBSearchResult b) {
return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length()));
public int compare(String a, String b) {
return Integer.valueOf(b.length()).compareTo(Integer.valueOf(a.length()));
}
});
return results;
}
public static Collection<AnidbSearchResult> matchAnimeByName(String... names) throws Exception {
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
final Map<AnidbSearchResult, String> matchMap = new HashMap<AnidbSearchResult, String>();
for (final AnidbSearchResult entry : WebServices.AniDB.getAnimeTitles()) {
for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) {
if (identifier == null || identifier.isEmpty())
continue;
identifier = nameMatcher.normalize(identifier);
for (String name : names) {
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
if (commonName != null && commonName.length() >= identifier.length()) {
matchMap.put(entry, commonName);
}
}
}
}
// sort by length of name match (descending)
List<AnidbSearchResult> results = new ArrayList<AnidbSearchResult>(matchMap.keySet());
sort(results, new Comparator<AnidbSearchResult>() {
@Override
public int compare(AnidbSearchResult a, AnidbSearchResult b) {
return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length()));
}
});
return results;
return matches;
}
@ -366,7 +320,11 @@ public class MediaDetection {
public static String stripReleaseInfo(String name) throws IOException {
return releaseInfo.cleanRelease(name, true);
try {
return releaseInfo.cleanRelease(singleton(name), true).iterator().next();
} catch (NoSuchElementException e) {
return ""; // default value in case all tokens are stripped away
}
}

View File

@ -6,11 +6,13 @@ import static java.util.Arrays.*;
import static java.util.ResourceBundle.*;
import static java.util.regex.Pattern.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.FileUtilities.*;
import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.text.Collator;
@ -32,7 +34,6 @@ import java.util.zip.GZIPInputStream;
import net.sourceforge.filebot.web.CachedResource;
import net.sourceforge.filebot.web.Movie;
import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
import net.sourceforge.tuned.ByteBufferInputStream;
@ -89,28 +90,32 @@ public class ReleaseInfo {
}
public List<String> cleanRelease(Iterable<String> items, boolean strict) throws IOException {
public List<String> cleanRelease(Collection<String> items, boolean strict) throws IOException {
Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet();
return clean(items, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages));
}
public String cleanRelease(String item, boolean strict) throws IOException {
Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet();
return clean(item, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages));
}
public List<String> clean(Iterable<String> items, Pattern... blacklisted) {
List<String> cleanedItems = new ArrayList<String>();
Pattern releaseGroup = getReleaseGroupPattern(strict);
Pattern languageSuffix = getLanguageSuffixPattern(languages);
Pattern languageTag = getLanguageTagPattern(languages);
Pattern videoSource = getVideoSourcePattern();
Pattern videoFormat = getVideoFormatPattern();
Pattern resolution = getResolutionPattern();
Pattern queryBlacklist = getBlacklistPattern();
Pattern[] blacklist = new Pattern[] { releaseGroup, languageSuffix, languageTag, videoSource, videoFormat, resolution, queryBlacklist };
Pattern[] stopwords = new Pattern[] { getReleaseGroupPattern(true), languageSuffix, languageTag, videoSource, videoFormat, resolution };
List<String> output = new ArrayList<String>(items.size());
for (String it : items) {
String cleanedItem = clean(it, blacklisted);
if (cleanedItem.length() > 0) {
cleanedItems.add(cleanedItem);
it = substringBefore(it, stopwords);
it = clean(it, blacklist);
// ignore empty values
if (it.length() > 0) {
output.add(it);
}
}
return cleanedItems;
return output;
}
@ -123,7 +128,20 @@ public class ReleaseInfo {
}
public Pattern getLanguageOptionPattern(Collection<String> languages) {
/**
 * Cuts the given string off at the first stopword that can be found in it.
 * <p>
 * The patterns are tried in the order in which they are passed in; the first
 * pattern that matches anywhere in the string decides the cut position, even
 * if a later pattern would have matched further to the left.
 *
 * @param item      the string to truncate
 * @param stopwords the stopword patterns, in order of precedence
 * @return the part of {@code item} before the matched stopword, or {@code item}
 *         itself if none of the patterns match
 */
public String substringBefore(String item, Pattern... stopwords) {
	for (int i = 0; i < stopwords.length; i++) {
		Matcher hit = stopwords[i].matcher(item);
		if (!hit.find()) {
			continue; // this stopword does not occur, try the next one
		}

		// keep only what precedes the matched stopword
		return item.substring(0, hit.start());
	}

	// none of the stopwords occur, so the string is kept as is
	return item;
}
/**
 * Builds the pattern used to find language tags such as {@code [en]}.
 * <p>
 * A language only matches when it is directly preceded by one of
 * {@code -}, {@code [}, <code>{</code> or {@code (} and directly followed by a
 * punctuation character. Matching is case-insensitive (Unicode-aware) and
 * uses canonical equivalence.
 *
 * @param languages the language names to look for; they are passed through
 *                  {@code quoteAll} before being joined into an alternation
 *                  (presumably regex-quoting each entry, confirm against that helper)
 * @return the compiled language tag pattern
 */
public Pattern getLanguageTagPattern(Collection<String> languages) {
	// e.g. [en]
	int flags = CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ;

	StringBuilder regex = new StringBuilder();
	regex.append("(?<=[-\\[{(])(");
	regex.append(join(quoteAll(languages), "|"));
	regex.append(")(?=\\p{Punct})");

	return compile(regex.toString(), flags);
}
@ -172,7 +190,7 @@ public class ReleaseInfo {
}
public synchronized TheTVDBSearchResult[] getSeriesList() throws IOException {
public synchronized String[] getSeriesList() throws IOException {
return seriesListResource.get();
}
@ -186,7 +204,7 @@ public class ReleaseInfo {
protected final CachedResource<String[]> releaseGroupResource = new PatternResource(getBundle(getClass().getName()).getString("url.release-groups"));
protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist"));
protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list"));
protected final CachedResource<TheTVDBSearchResult[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list"));
protected final CachedResource<String[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list"));
protected static class PatternResource extends CachedResource<String[]> {
@ -206,7 +224,7 @@ public class ReleaseInfo {
protected static class MovieResource extends CachedResource<Movie[]> {
public MovieResource(String resource) {
super(resource, Movie[].class, 24 * 60 * 60 * 1000); // 24h update interval
super(resource, Movie[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
}
@ -227,25 +245,16 @@ public class ReleaseInfo {
}
protected static class SeriesResource extends CachedResource<TheTVDBSearchResult[]> {
protected static class SeriesResource extends CachedResource<String[]> {
public SeriesResource(String resource) {
super(resource, TheTVDBSearchResult[].class, 24 * 60 * 60 * 1000); // 24h update interval
super(resource, String[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
}
@Override
public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException {
Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
List<TheTVDBSearchResult> tvshows = new ArrayList<TheTVDBSearchResult>();
while (scanner.hasNext()) {
int sid = scanner.nextInt();
String name = scanner.next();
tvshows.add(new TheTVDBSearchResult(name, sid));
}
return tvshows.toArray(new TheTVDBSearchResult[0]);
public String[] process(ByteBuffer data) throws IOException {
return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "utf-8")).split("\\n");
}
}

View File

@ -12,7 +12,7 @@ url.query-blacklist: http://filebot.sourceforge.net/data/query-blacklist.txt
# list of all movies (id, name, year)
url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz
url.series-list: http://filebot.sourceforge.net/data/tvshows.txt.gz
url.series-list: http://filebot.sourceforge.net/data/series.list.gz
# disk folder matcher
pattern.diskfolder.entry: ^BDMV$|^HVDVD_TS$|^VIDEO_TS$|^AUDIO_TS$|^VCD$

View File

@ -2,22 +2,34 @@
package net.sourceforge.filebot.similarity;
import static java.util.regex.Pattern.*;
import java.util.regex.Pattern;
public class Normalization {
private static final Pattern apostrophe = compile("['`´ʻ]+");
private static final Pattern punctuation = compile("[\\p{Punct}\\p{Space}]+");
private static final Pattern[] brackets = new Pattern[] { compile("\\([^\\(]*\\)"), compile("\\[[^\\[]*\\]"), compile("\\{[^\\{]*\\}") };
private static final Pattern checksum = compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");
public static String normalizePunctuation(String name) {
// remove/normalize special characters
name = name.replaceAll("[`´ʻ]+", "");
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
name = apostrophe.matcher(name).replaceAll("");
name = punctuation.matcher(name).replaceAll(" ");
return name.trim();
}
public static String normalizeBrackets(String name) {
// remove group names and checksums, any [...] or (...)
name = name.replaceAll("\\([^\\(]*\\)", " ");
name = name.replaceAll("\\[[^\\[]*\\]", " ");
name = name.replaceAll("\\{[^\\{]*\\}", " ");
for (Pattern it : brackets) {
name = it.matcher(name).replaceAll(" ");
}
return name;
}
@ -25,7 +37,7 @@ public class Normalization {
public static String removeEmbeddedChecksum(String string) {
// match embedded checksum and surrounding brackets
return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", "");
return checksum.matcher(string).replaceAll("");
}
}

BIN
website/data/series.list.gz Normal file

Binary file not shown.