* performance improvements / switch to series.list.gz

* use before-rule when cleaning up tokens from movie filenames
* added series.list.gz script
2012-02-23 18:48:35 +00:00 · 2012-02-23 18:48:35 +00:00 · 806ffdc91d
parent 4d3c2c6f55
commit 806ffdc91d
7 changed files with 105 additions and 104 deletions
--- a/BuildData.groovy
+++ b/BuildData.groovy
@@ -0,0 +1,22 @@
+def page = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
+
+def names = page.fetch().getHtml('utf-8')
+.depthFirst().TABLE.find{it['@id'] == "listtable"}
+.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English'}
+.findResults{ it.TD[0].A.text() }
+
+def anime = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles()
+names += anime.findResults{ it.getPrimaryTitle() }
+names += anime.findResults{ it.getOfficialTitle('en') }
+
+names = names.findAll{ it =~ /^[A-Z]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) }
+names = names.sort().unique()
+
+
+args[0].withOutputStream{ out ->
+	new java.util.zip.GZIPOutputStream(out).withWriter('utf-8'){ writer ->
+		names.each{ writer.append(it).append('\n') }
+	}
+}
+
+println "Series Count: " + names.size()
--- a/source/net/sourceforge/filebot/cli/ScriptShell.lib.groovy
+++ b/source/net/sourceforge/filebot/cli/ScriptShell.lib.groovy
@@ -63,8 +63,8 @@ import static net.sourceforge.filebot.web.WebRequest.*

 URL.metaClass.get = { readAll(getReader(delegate.openConnection())) }
 URL.metaClass.fetch = { fetch(delegate) }
-URL.metaClass.getHtml = { new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(delegate))) }
-ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(new StringReader(Charset.forName(csn).decode(delegate.duplicate()).toString())))) }
+URL.metaClass.getHtml = { new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(readAll(getReader(delegate.openConnection()))) }
+ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(Charset.forName(csn).decode(delegate.duplicate()).toString()) }

 URL.metaClass.post = { Map parameters -> post(delegate.openConnection(), parameters) }
 URL.metaClass.post = { byte[] data, contentType = 'application/octet-stream' -> post(delegate.openConnection(), data, contentType) }
--- a/source/net/sourceforge/filebot/media/MediaDetection.java
+++ b/source/net/sourceforge/filebot/media/MediaDetection.java
@@ -25,6 +25,7 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.NoSuchElementException;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeSet;
@@ -41,7 +42,6 @@ import net.sourceforge.filebot.similarity.NameSimilarityMetric;
 import net.sourceforge.filebot.similarity.SeriesNameMatcher;
 import net.sourceforge.filebot.similarity.SimilarityComparator;
 import net.sourceforge.filebot.similarity.SimilarityMetric;
-import net.sourceforge.filebot.web.AnidbClient.AnidbSearchResult;
 import net.sourceforge.filebot.web.Movie;
 import net.sourceforge.filebot.web.MovieIdentificationService;
 import net.sourceforge.filebot.web.SearchResult;
@@ -155,8 +155,8 @@ public class MediaDetection {
 			}
 			
 			// match folder names against known series names
-			for (TheTVDBSearchResult match : matchSeriesByName(filenames.toArray(new String[0]))) {
-				names.put(match.getName().toLowerCase(), match.getName());
+			for (String match : matchSeriesByName(filenames.toArray(new String[0]))) {
+				names.put(match.toLowerCase(), match);
 			}
 		} catch (Exception e) {
 			Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.WARNING, "Failed to match folder structure: " + e.getMessage(), e);
@@ -177,75 +177,29 @@ public class MediaDetection {
 	}
 	
 	
-	private static final HashMap<TheTVDBSearchResult, String> seriesNameIndex = new HashMap<TheTVDBSearchResult, String>(32768);
+	public static List<String> matchSeriesByName(String... names) throws Exception {
+		HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
+		List<String> matches = new ArrayList<String>();
 		
-	
-	public static List<TheTVDBSearchResult> matchSeriesByName(String... names) throws Exception {
-		final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
-		final Map<TheTVDBSearchResult, String> matchMap = new HashMap<TheTVDBSearchResult, String>();
-		
-		synchronized (seriesNameIndex) {
-			if (seriesNameIndex.isEmpty()) {
-				for (TheTVDBSearchResult entry : releaseInfo.getSeriesList()) {
-					seriesNameIndex.put(entry, nameMatcher.normalize(entry.getName()));
-				}
-			}
-		}
-		
-		for (Entry<TheTVDBSearchResult, String> it : seriesNameIndex.entrySet()) {
+		for (String identifier : releaseInfo.getSeriesList()) {
 			for (String name : names) {
-				String identifier = it.getValue();
 				String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
 				if (commonName != null && commonName.length() >= identifier.length()) {
-					matchMap.put(it.getKey(), commonName);
+					matches.add(commonName);
 				}
 			}
 		}
 		
 		// sort by length of name match (descending)
-		List<TheTVDBSearchResult> results = new ArrayList<TheTVDBSearchResult>(matchMap.keySet());
-		sort(results, new Comparator<TheTVDBSearchResult>() {
+		sort(matches, new Comparator<String>() {
 			
 			@Override
-			public int compare(TheTVDBSearchResult a, TheTVDBSearchResult b) {
-				return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length()));
+			public int compare(String a, String b) {
+				return Integer.valueOf(b.length()).compareTo(Integer.valueOf(a.length()));
 			}
 		});
 		
-		return results;
-	}
-	
-	
-	public static Collection<AnidbSearchResult> matchAnimeByName(String... names) throws Exception {
-		final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
-		final Map<AnidbSearchResult, String> matchMap = new HashMap<AnidbSearchResult, String>();
-		
-		for (final AnidbSearchResult entry : WebServices.AniDB.getAnimeTitles()) {
-			for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) {
-				if (identifier == null || identifier.isEmpty())
-					continue;
-				
-				identifier = nameMatcher.normalize(identifier);
-				for (String name : names) {
-					String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
-					if (commonName != null && commonName.length() >= identifier.length()) {
-						matchMap.put(entry, commonName);
-					}
-				}
-			}
-		}
-		
-		// sort by length of name match (descending)
-		List<AnidbSearchResult> results = new ArrayList<AnidbSearchResult>(matchMap.keySet());
-		sort(results, new Comparator<AnidbSearchResult>() {
-			
-			@Override
-			public int compare(AnidbSearchResult a, AnidbSearchResult b) {
-				return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length()));
-			}
-		});
-		
-		return results;
+		return matches;
 	}
 	
 	
@@ -366,7 +320,11 @@ public class MediaDetection {
 	
 	
 	public static String stripReleaseInfo(String name) throws IOException {
-		return releaseInfo.cleanRelease(name, true);
+		try {
+			return releaseInfo.cleanRelease(singleton(name), true).iterator().next();
+		} catch (NoSuchElementException e) {
+			return ""; // default value in case all tokens are stripped away
+		}
 	}
 	
 	
--- a/source/net/sourceforge/filebot/media/ReleaseInfo.java
+++ b/source/net/sourceforge/filebot/media/ReleaseInfo.java
@@ -6,11 +6,13 @@ import static java.util.Arrays.*;
 import static java.util.ResourceBundle.*;
 import static java.util.regex.Pattern.*;
 import static net.sourceforge.filebot.similarity.Normalization.*;
+import static net.sourceforge.tuned.FileUtilities.*;
 import static net.sourceforge.tuned.StringUtilities.*;

 import java.io.File;
 import java.io.FileFilter;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.text.Collator;
@@ -32,7 +34,6 @@ import java.util.zip.GZIPInputStream;

 import net.sourceforge.filebot.web.CachedResource;
 import net.sourceforge.filebot.web.Movie;
-import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
 import net.sourceforge.tuned.ByteBufferInputStream;


@@ -89,28 +90,32 @@ public class ReleaseInfo {
 	}
 	
 	
-	public List<String> cleanRelease(Iterable<String> items, boolean strict) throws IOException {
+	public List<String> cleanRelease(Collection<String> items, boolean strict) throws IOException {
 		Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet();
-		return clean(items, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages));
-	}
 		
+		Pattern releaseGroup = getReleaseGroupPattern(strict);
+		Pattern languageSuffix = getLanguageSuffixPattern(languages);
+		Pattern languageTag = getLanguageTagPattern(languages);
+		Pattern videoSource = getVideoSourcePattern();
+		Pattern videoFormat = getVideoFormatPattern();
+		Pattern resolution = getResolutionPattern();
+		Pattern queryBlacklist = getBlacklistPattern();
 		
-	public String cleanRelease(String item, boolean strict) throws IOException {
-		Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet();
-		return clean(item, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages));
-	}
+		Pattern[] blacklist = new Pattern[] { releaseGroup, languageSuffix, languageTag, videoSource, videoFormat, resolution, queryBlacklist };
+		Pattern[] stopwords = new Pattern[] { getReleaseGroupPattern(true), languageSuffix, languageTag, videoSource, videoFormat, resolution };
 		
-	
-	public List<String> clean(Iterable<String> items, Pattern... blacklisted) {
-		List<String> cleanedItems = new ArrayList<String>();
+		List<String> output = new ArrayList<String>(items.size());
 		for (String it : items) {
-			String cleanedItem = clean(it, blacklisted);
-			if (cleanedItem.length() > 0) {
-				cleanedItems.add(cleanedItem);
+			it = substringBefore(it, stopwords);
+			it = clean(it, blacklist);
+			
+			// ignore empty values
+			if (it.length() > 0) {
+				output.add(it);
 			}
 		}
 		
-		return cleanedItems;
+		return output;
 	}
 	
 	
@@ -123,7 +128,20 @@ public class ReleaseInfo {
 	}
 	
 	
-	public Pattern getLanguageOptionPattern(Collection<String> languages) {
+	public String substringBefore(String item, Pattern... stopwords) {
+		for (Pattern it : stopwords) {
+			Matcher matcher = it.matcher(item);
+			if (matcher.find()) {
+				return item.substring(0, matcher.start()); // use substring before the matched stopword
+			}
+		}
+		
+		// no stopword found, keep original string
+		return item;
+	}
+	
+	
+	public Pattern getLanguageTagPattern(Collection<String> languages) {
 		// [en]
 		return compile("(?<=[-\\[{(])(" + join(quoteAll(languages), "|") + ")(?=\\p{Punct})", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ);
 	}
@@ -172,7 +190,7 @@ public class ReleaseInfo {
 	}
 	
 	
-	public synchronized TheTVDBSearchResult[] getSeriesList() throws IOException {
+	public synchronized String[] getSeriesList() throws IOException {
 		return seriesListResource.get();
 	}
 	
@@ -186,7 +204,7 @@ public class ReleaseInfo {
 	protected final CachedResource<String[]> releaseGroupResource = new PatternResource(getBundle(getClass().getName()).getString("url.release-groups"));
 	protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist"));
 	protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list"));
-	protected final CachedResource<TheTVDBSearchResult[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list"));
+	protected final CachedResource<String[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list"));
 	
 	
 	protected static class PatternResource extends CachedResource<String[]> {
@@ -206,7 +224,7 @@ public class ReleaseInfo {
 	protected static class MovieResource extends CachedResource<Movie[]> {
 		
 		public MovieResource(String resource) {
-			super(resource, Movie[].class, 24 * 60 * 60 * 1000); // 24h update interval
+			super(resource, Movie[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
 		}
 		
 		
@@ -227,25 +245,16 @@ public class ReleaseInfo {
 	}
 	
 	
-	protected static class SeriesResource extends CachedResource<TheTVDBSearchResult[]> {
+	protected static class SeriesResource extends CachedResource<String[]> {
 		
 		public SeriesResource(String resource) {
-			super(resource, TheTVDBSearchResult[].class, 24 * 60 * 60 * 1000); // 24h update interval
+			super(resource, String[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
 		}
 		
 		
 		@Override
-		public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException {
-			Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
-			
-			List<TheTVDBSearchResult> tvshows = new ArrayList<TheTVDBSearchResult>();
-			while (scanner.hasNext()) {
-				int sid = scanner.nextInt();
-				String name = scanner.next();
-				tvshows.add(new TheTVDBSearchResult(name, sid));
-			}
-			
-			return tvshows.toArray(new TheTVDBSearchResult[0]);
+		public String[] process(ByteBuffer data) throws IOException {
+			return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "utf-8")).split("\\n");
 		}
 	}
 	
--- a/source/net/sourceforge/filebot/media/ReleaseInfo.properties
+++ b/source/net/sourceforge/filebot/media/ReleaseInfo.properties
@@ -12,7 +12,7 @@ url.query-blacklist: http://filebot.sourceforge.net/data/query-blacklist.txt

 # list of all movies (id, name, year)
 url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz
-url.series-list: http://filebot.sourceforge.net/data/tvshows.txt.gz
+url.series-list: http://filebot.sourceforge.net/data/series.list.gz

 # disk folder matcher
 pattern.diskfolder.entry: ^BDMV$|^HVDVD_TS$|^VIDEO_TS$|^AUDIO_TS$|^VCD$
--- a/source/net/sourceforge/filebot/similarity/Normalization.java
+++ b/source/net/sourceforge/filebot/similarity/Normalization.java
@@ -2,22 +2,34 @@
 package net.sourceforge.filebot.similarity;


+import static java.util.regex.Pattern.*;
+
+import java.util.regex.Pattern;
+
+
 public class Normalization {
 	
+	private static final Pattern apostrophe = compile("['`´‘’ʻ]+");
+	private static final Pattern punctuation = compile("[\\p{Punct}\\p{Space}]+");
+	
+	private static final Pattern[] brackets = new Pattern[] { compile("\\([^\\(]*\\)"), compile("\\[[^\\[]*\\]"), compile("\\{[^\\{]*\\}") };
+	
+	private static final Pattern checksum = compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");
+	
+	
 	public static String normalizePunctuation(String name) {
 		// remove/normalize special characters
-		name = name.replaceAll("[`´‘’ʻ]+", "");
-		name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
-		
+		name = apostrophe.matcher(name).replaceAll("");
+		name = punctuation.matcher(name).replaceAll(" ");
 		return name.trim();
 	}
 	
 	
 	public static String normalizeBrackets(String name) {
 		// remove group names and checksums, any [...] or (...)
-		name = name.replaceAll("\\([^\\(]*\\)", " ");
-		name = name.replaceAll("\\[[^\\[]*\\]", " ");
-		name = name.replaceAll("\\{[^\\{]*\\}", " ");
+		for (Pattern it : brackets) {
+			name = it.matcher(name).replaceAll(" ");
+		}
 		
 		return name;
 	}
@@ -25,7 +37,7 @@ public class Normalization {
 	
 	public static String removeEmbeddedChecksum(String string) {
 		// match embedded checksum and surrounding brackets 
-		return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", "");
+		return checksum.matcher(string).replaceAll("");
 	}
 	
 }
--- a/website/data/series.list.gz
+++ b/website/data/series.list.gz