diff --git a/BuildData.groovy b/BuildData.groovy new file mode 100644 index 00000000..60f8c104 --- /dev/null +++ b/BuildData.groovy @@ -0,0 +1,22 @@ +def page = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search') + +def names = page.fetch().getHtml('utf-8') +.depthFirst().TABLE.find{it['@id'] == "listtable"} +.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English'} +.findResults{ it.TD[0].A.text() } + +def anime = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles() +names += anime.findResults{ it.getPrimaryTitle() } +names += anime.findResults{ it.getOfficialTitle('en') } + +names = names.findAll{ it =~ /^[A-Z]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } +names = names.sort().unique() + + +args[0].withOutputStream{ out -> + new java.util.zip.GZIPOutputStream(out).withWriter('utf-8'){ writer -> + names.each{ writer.append(it).append('\n') } + } +} + +println "Series Count: " + names.size() diff --git a/source/net/sourceforge/filebot/cli/ScriptShell.lib.groovy b/source/net/sourceforge/filebot/cli/ScriptShell.lib.groovy index deb649a5..be84f2af 100644 --- a/source/net/sourceforge/filebot/cli/ScriptShell.lib.groovy +++ b/source/net/sourceforge/filebot/cli/ScriptShell.lib.groovy @@ -63,8 +63,8 @@ import static net.sourceforge.filebot.web.WebRequest.* URL.metaClass.get = { readAll(getReader(delegate.openConnection())) } URL.metaClass.fetch = { fetch(delegate) } -URL.metaClass.getHtml = { new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(delegate))) } -ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(new StringReader(Charset.forName(csn).decode(delegate.duplicate()).toString())))) } +URL.metaClass.getHtml = { new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(readAll(getReader(delegate.openConnection()))) } +ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(Charset.forName(csn).decode(delegate.duplicate()).toString()) } URL.metaClass.post = { Map parameters -> post(delegate.openConnection(), parameters) } URL.metaClass.post = { byte[] data, contentType = 'application/octet-stream' -> post(delegate.openConnection(), data, contentType) } diff --git a/source/net/sourceforge/filebot/media/MediaDetection.java b/source/net/sourceforge/filebot/media/MediaDetection.java index 89485381..3dfd8b06 100644 --- a/source/net/sourceforge/filebot/media/MediaDetection.java +++ b/source/net/sourceforge/filebot/media/MediaDetection.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Map.Entry; +import java.util.NoSuchElementException; import java.util.Set; import java.util.SortedMap; import java.util.TreeSet; @@ -41,7 +42,6 @@ import net.sourceforge.filebot.similarity.NameSimilarityMetric; import net.sourceforge.filebot.similarity.SeriesNameMatcher; import net.sourceforge.filebot.similarity.SimilarityComparator; import net.sourceforge.filebot.similarity.SimilarityMetric; -import net.sourceforge.filebot.web.AnidbClient.AnidbSearchResult; import net.sourceforge.filebot.web.Movie; import net.sourceforge.filebot.web.MovieIdentificationService; import net.sourceforge.filebot.web.SearchResult; @@ -155,8 +155,8 @@ public class MediaDetection { } // match folder names against known series names - for (TheTVDBSearchResult match : matchSeriesByName(filenames.toArray(new String[0]))) { - names.put(match.getName().toLowerCase(), match.getName()); + for (String match : matchSeriesByName(filenames.toArray(new String[0]))) { + names.put(match.toLowerCase(), match); } } catch (Exception e) { Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.WARNING, "Failed to match folder structure: " + e.getMessage(), e); @@ -177,75 +177,29 @@ public class MediaDetection { } - private static final HashMap seriesNameIndex = new HashMap(32768); - - - public static List matchSeriesByName(String... names) throws Exception { - final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0); - final Map matchMap = new HashMap(); + public static List matchSeriesByName(String... names) throws Exception { + HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0); + List matches = new ArrayList(); - synchronized (seriesNameIndex) { - if (seriesNameIndex.isEmpty()) { - for (TheTVDBSearchResult entry : releaseInfo.getSeriesList()) { - seriesNameIndex.put(entry, nameMatcher.normalize(entry.getName())); - } - } - } - - for (Entry it : seriesNameIndex.entrySet()) { + for (String identifier : releaseInfo.getSeriesList()) { for (String name : names) { - String identifier = it.getValue(); String commonName = nameMatcher.matchFirstCommonSequence(name, identifier); if (commonName != null && commonName.length() >= identifier.length()) { - matchMap.put(it.getKey(), commonName); + matches.add(commonName); } } } // sort by length of name match (descending) - List results = new ArrayList(matchMap.keySet()); - sort(results, new Comparator() { + sort(matches, new Comparator() { @Override - public int compare(TheTVDBSearchResult a, TheTVDBSearchResult b) { - return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length())); + public int compare(String a, String b) { + return Integer.valueOf(b.length()).compareTo(Integer.valueOf(a.length())); } }); - return results; - } - - - public static Collection matchAnimeByName(String... names) throws Exception { - final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0); - final Map matchMap = new HashMap(); - - for (final AnidbSearchResult entry : WebServices.AniDB.getAnimeTitles()) { - for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) { - if (identifier == null || identifier.isEmpty()) - continue; - - identifier = nameMatcher.normalize(identifier); - for (String name : names) { - String commonName = nameMatcher.matchFirstCommonSequence(name, identifier); - if (commonName != null && commonName.length() >= identifier.length()) { - matchMap.put(entry, commonName); - } - } - } - } - - // sort by length of name match (descending) - List results = new ArrayList(matchMap.keySet()); - sort(results, new Comparator() { - - @Override - public int compare(AnidbSearchResult a, AnidbSearchResult b) { - return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length())); - } - }); - - return results; + return matches; } @@ -366,7 +320,11 @@ public class MediaDetection { public static String stripReleaseInfo(String name) throws IOException { - return releaseInfo.cleanRelease(name, true); + try { + return releaseInfo.cleanRelease(singleton(name), true).iterator().next(); + } catch (NoSuchElementException e) { + return ""; // default value in case all tokens are stripped away + } } diff --git a/source/net/sourceforge/filebot/media/ReleaseInfo.java b/source/net/sourceforge/filebot/media/ReleaseInfo.java index aa63e528..17fde5ab 100644 --- a/source/net/sourceforge/filebot/media/ReleaseInfo.java +++ b/source/net/sourceforge/filebot/media/ReleaseInfo.java @@ -6,11 +6,13 @@ import static java.util.Arrays.*; import static java.util.ResourceBundle.*; import static java.util.regex.Pattern.*; import static net.sourceforge.filebot.similarity.Normalization.*; +import static net.sourceforge.tuned.FileUtilities.*; import static net.sourceforge.tuned.StringUtilities.*; import java.io.File; import java.io.FileFilter; import java.io.IOException; +import java.io.InputStreamReader; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.text.Collator; @@ -32,7 +34,6 @@ import java.util.zip.GZIPInputStream; import net.sourceforge.filebot.web.CachedResource; import net.sourceforge.filebot.web.Movie; -import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult; import net.sourceforge.tuned.ByteBufferInputStream; @@ -89,28 +90,32 @@ public class ReleaseInfo { } - public List cleanRelease(Iterable items, boolean strict) throws IOException { + public List cleanRelease(Collection items, boolean strict) throws IOException { Set languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet(); - return clean(items, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages)); - } - - - public String cleanRelease(String item, boolean strict) throws IOException { - Set languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet(); - return clean(item, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages)); - } - - - public List clean(Iterable items, Pattern... blacklisted) { - List cleanedItems = new ArrayList(); + + Pattern releaseGroup = getReleaseGroupPattern(strict); + Pattern languageSuffix = getLanguageSuffixPattern(languages); + Pattern languageTag = getLanguageTagPattern(languages); + Pattern videoSource = getVideoSourcePattern(); + Pattern videoFormat = getVideoFormatPattern(); + Pattern resolution = getResolutionPattern(); + Pattern queryBlacklist = getBlacklistPattern(); + + Pattern[] blacklist = new Pattern[] { releaseGroup, languageSuffix, languageTag, videoSource, videoFormat, resolution, queryBlacklist }; + Pattern[] stopwords = new Pattern[] { getReleaseGroupPattern(true), languageSuffix, languageTag, videoSource, videoFormat, resolution }; + + List output = new ArrayList(items.size()); for (String it : items) { - String cleanedItem = clean(it, blacklisted); - if (cleanedItem.length() > 0) { - cleanedItems.add(cleanedItem); + it = substringBefore(it, stopwords); + it = clean(it, blacklist); + + // ignore empty values + if (it.length() > 0) { + output.add(it); } } - return cleanedItems; + return output; } @@ -123,7 +128,20 @@ public class ReleaseInfo { } - public Pattern getLanguageOptionPattern(Collection languages) { + public String substringBefore(String item, Pattern... stopwords) { + for (Pattern it : stopwords) { + Matcher matcher = it.matcher(item); + if (matcher.find()) { + return item.substring(0, matcher.start()); // use substring before the matched stopword + } + } + + // no stopword found, keep original string + return item; + } + + + public Pattern getLanguageTagPattern(Collection languages) { // [en] return compile("(?<=[-\\[{(])(" + join(quoteAll(languages), "|") + ")(?=\\p{Punct})", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ); } @@ -172,7 +190,7 @@ public class ReleaseInfo { } - public synchronized TheTVDBSearchResult[] getSeriesList() throws IOException { + public synchronized String[] getSeriesList() throws IOException { return seriesListResource.get(); } @@ -186,7 +204,7 @@ public class ReleaseInfo { protected final CachedResource releaseGroupResource = new PatternResource(getBundle(getClass().getName()).getString("url.release-groups")); protected final CachedResource queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist")); protected final CachedResource movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list")); - protected final CachedResource seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list")); + protected final CachedResource seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list")); protected static class PatternResource extends CachedResource { @@ -206,7 +224,7 @@ public class ReleaseInfo { protected static class MovieResource extends CachedResource { public MovieResource(String resource) { - super(resource, Movie[].class, 24 * 60 * 60 * 1000); // 24h update interval + super(resource, Movie[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week } @@ -227,25 +245,16 @@ public class ReleaseInfo { } - protected static class SeriesResource extends CachedResource { + protected static class SeriesResource extends CachedResource { public SeriesResource(String resource) { - super(resource, TheTVDBSearchResult[].class, 24 * 60 * 60 * 1000); // 24h update interval + super(resource, String[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week } @Override - public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException { - Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n"); - - List tvshows = new ArrayList(); - while (scanner.hasNext()) { - int sid = scanner.nextInt(); - String name = scanner.next(); - tvshows.add(new TheTVDBSearchResult(name, sid)); - } - - return tvshows.toArray(new TheTVDBSearchResult[0]); + public String[] process(ByteBuffer data) throws IOException { + return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "utf-8")).split("\\n"); } } diff --git a/source/net/sourceforge/filebot/media/ReleaseInfo.properties b/source/net/sourceforge/filebot/media/ReleaseInfo.properties index 273739b6..1aa83835 100644 --- a/source/net/sourceforge/filebot/media/ReleaseInfo.properties +++ b/source/net/sourceforge/filebot/media/ReleaseInfo.properties @@ -12,7 +12,7 @@ url.query-blacklist: http://filebot.sourceforge.net/data/query-blacklist.txt # list of all movies (id, name, year) url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz -url.series-list: http://filebot.sourceforge.net/data/tvshows.txt.gz +url.series-list: http://filebot.sourceforge.net/data/series.list.gz # disk folder matcher pattern.diskfolder.entry: ^BDMV$|^HVDVD_TS$|^VIDEO_TS$|^AUDIO_TS$|^VCD$ diff --git a/source/net/sourceforge/filebot/similarity/Normalization.java b/source/net/sourceforge/filebot/similarity/Normalization.java index 8b456858..e8806865 100644 --- a/source/net/sourceforge/filebot/similarity/Normalization.java +++ b/source/net/sourceforge/filebot/similarity/Normalization.java @@ -2,22 +2,34 @@ package net.sourceforge.filebot.similarity; +import static java.util.regex.Pattern.*; + +import java.util.regex.Pattern; + + public class Normalization { + private static final Pattern apostrophe = compile("['`´‘’ʻ]+"); + private static final Pattern punctuation = compile("[\\p{Punct}\\p{Space}]+"); + + private static final Pattern[] brackets = new Pattern[] { compile("\\([^\\(]*\\)"), compile("\\[[^\\[]*\\]"), compile("\\{[^\\{]*\\}") }; + + private static final Pattern checksum = compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]"); + + public static String normalizePunctuation(String name) { // remove/normalize special characters - name = name.replaceAll("[`´‘’ʻ]+", ""); - name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " "); - + name = apostrophe.matcher(name).replaceAll(""); + name = punctuation.matcher(name).replaceAll(" "); return name.trim(); } public static String normalizeBrackets(String name) { // remove group names and checksums, any [...] or (...) - name = name.replaceAll("\\([^\\(]*\\)", " "); - name = name.replaceAll("\\[[^\\[]*\\]", " "); - name = name.replaceAll("\\{[^\\{]*\\}", " "); + for (Pattern it : brackets) { + name = it.matcher(name).replaceAll(" "); + } return name; } @@ -25,7 +37,7 @@ public class Normalization { public static String removeEmbeddedChecksum(String string) { // match embedded checksum and surrounding brackets - return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", ""); + return checksum.matcher(string).replaceAll(""); } } diff --git a/website/data/series.list.gz b/website/data/series.list.gz new file mode 100644 index 00000000..5df7b2f5 Binary files /dev/null and b/website/data/series.list.gz differ