* use XZ Utils for packing online database files

2013-08-10 07:56:11 +00:00 · 2013-08-10 07:56:11 +00:00 · 09d2dc24af
parent c0498185d1
commit 09d2dc24af
7 changed files with 118 additions and 153 deletions
--- a/BuildData.groovy
+++ b/BuildData.groovy
@ -1,3 +1,4 @@
+import  org.tukaani.xz.*

 // ------------------------------------------------------------------------- //

@ -37,14 +38,13 @@ println "Reviews: " + reviews.size()
 // ------------------------------------------------------------------------- //


-def series_out  = new File("website/data/series.list.gz")
-def movies_out  = new File("website/data/movies.txt.gz")
-def thetvdb_out = new File("website/data/thetvdb.txt.gz")
-def anidb_out   = new File("website/data/anidb.txt.gz")
+def movies_out  = new File("website/data/movies.txt")
+def thetvdb_out = new File("website/data/thetvdb.txt")
+def anidb_out   = new File("website/data/anidb.txt")

-def gz(file, lines) {
-	file.withOutputStream{ out ->
-		new java.util.zip.GZIPOutputStream(out).withWriter('UTF-8'){ writer ->
+def pack(file, lines) {
+	new File(file.parentFile, file.name + '.xz').withOutputStream{ out ->
+		new XZOutputStream(out, new LZMA2Options(LZMA2Options.PRESET_DEFAULT)).withWriter('UTF-8'){ writer ->
 			lines.each{ writer.append(it).append('\n') }
 		}
 	}
@ -80,7 +80,7 @@ def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
 movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), it) }
 movies = movieSorter.values().collect{ it.join('\t') }

-gz(movies_out, movies)
+pack(movies_out, movies)
 println "Movie Count: " + movies.size()

 // sanity check
@ -146,7 +146,7 @@ thetvdb_index = thetvdb_index.sort(new Comparator() {

 // join and sort
 def thetvdb_txt = thetvdb_index.findResults{ [it[0].pad(6), it[1].trim()].join('\t') }
-gz(thetvdb_out, thetvdb_txt)
+pack(thetvdb_out, thetvdb_txt)
 println "TheTVDB Index: " + thetvdb_txt.size()

 // sanity check
@ -160,8 +160,8 @@ def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles(
 def anidb_index = anidb.findResults{ [it.getAnimeId(), it.getPrimaryTitle(), it.getEnglishTitle()] }

 // join and sort
-def anidb_txt = anidb_index.findResults{ [it[0].pad(5), it[1] ?: '', it[2] ?: ''].join('\t').replaceAll(/['`´‘’ʻ]+/, /'/) }.sort().unique()
-gz(anidb_out, anidb_txt)
+def anidb_txt = anidb_index.findResults{ [it[0].pad(5), it[1] ?: '', it[2] == null || it[2].equals(it[1]) ? '' : it[2]]*.replaceAll(/\s+/, ' ')*.trim().join('\t').replaceAll(/['`´‘’ʻ]+/, /'/) }.sort().unique()
+pack(anidb_out, anidb_txt)
 println "AniDB Index: " + anidb_txt.size()

 // sanity check
--- a/build.xml
+++ b/build.xml
@ -109,6 +109,10 @@
 				<include name="org/kohsuke/args4j/**" />
 			</zipfileset>
 			
+			<zipfileset src="${dir.lib}/xz.jar">
+				<include name="org/tukaani/xz/**" />
+			</zipfileset>
+
 			<zipfileset src="${dir.lib}/ehcache.jar">
 				<include name="net/sf/ehcache/**" />
 				<include name="ehcache-failsafe.xml" />
@ -632,6 +636,7 @@
 	<target name="upload-data-frs" depends="login">
 		<scp todir="${sf.user}:${sf.password}@${deploy.data.frs}" trust="yes" verbose="true" sftp="true">
 			<fileset dir="${dir.website}/data">
+				<include name="*.xz" />
 				<include name="*.gz" />
 			</fileset>
 		</scp>
--- a/installer/webstart/filebot.jnlp
+++ b/installer/webstart/filebot.jnlp
@ -27,13 +27,13 @@
 		<property name="application.warmup"        value="false"    />
 		<property name="unixfs"                    value="false"    />
 		<property name="useNativeShell"            value="false"    />
-		<property name="useExtendedFileAttributes" value="false"    />
+		<property name="useExtendedFileAttributes" value="true"    />
 		<property name="java.net.useSystemProxies" value="true"     />
 		<property name="sun.net.client.defaultConnectTimeout" value="10000" />
 		<property name="sun.net.client.defaultReadTimeout"    value="60000" />
 		
 		
-		<java version="1.6+" />
+		<java version="1.7+" />
 		<property name="jnlp.packEnabled"       value="true" />
 		
 		<jar href="filebot.jar"       download="eager" main="true" />
@ -50,11 +50,12 @@
 		<jar href="slf4j-jdk.jar"     download="eager" />
 		<jar href="jgat-custom.jar"   download="eager" />
 		<jar href="xmlrpc.jar"        download="eager" />
-		<jar href="sublight-ws.jar"   download="eager" />
+		<jar href="xz.jar"            download="eager" />
 		<jar href="json-simple.jar"   download="lazy" />
 		<jar href="json-io.jar"       download="lazy" />
 		<jar href="junrar-custom.jar" download="lazy" />
 		<jar href="jacksum.jar"       download="lazy" />
+		<jar href="jsoup.jar"         download="lazy" />
 		<jar href="nekohtml.jar"      download="lazy" part="scraper" />
 		<jar href="xercesImpl.jar"    download="lazy" part="scraper" />
 		<jar href="mediainfo.jar"        download="lazy" part="native"  />
--- a/lib/xz.jar
+++ b/lib/xz.jar
--- a/source/net/sourceforge/filebot/media/ReleaseInfo.java
+++ b/source/net/sourceforge/filebot/media/ReleaseInfo.java
@ -1,13 +1,13 @@
-
 package net.sourceforge.filebot.media;

-
-import static java.util.Arrays.*;
-import static java.util.Collections.*;
-import static java.util.ResourceBundle.*;
-import static java.util.regex.Pattern.*;
-import static net.sourceforge.filebot.similarity.Normalization.*;
-import static net.sourceforge.tuned.StringUtilities.*;
+import static java.util.Arrays.asList;
+import static java.util.Collections.unmodifiableMap;
+import static java.util.ResourceBundle.getBundle;
+import static java.util.regex.Pattern.CASE_INSENSITIVE;
+import static java.util.regex.Pattern.UNICODE_CASE;
+import static java.util.regex.Pattern.compile;
+import static net.sourceforge.filebot.similarity.Normalization.normalizePunctuation;
+import static net.sourceforge.tuned.StringUtilities.join;

 import java.io.File;
 import java.io.FileFilter;
@ -31,7 +31,6 @@ import java.util.Set;
 import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import java.util.zip.GZIPInputStream;

 import net.sourceforge.filebot.web.AnidbSearchResult;
 import net.sourceforge.filebot.web.CachedResource;
@ -39,6 +38,7 @@ import net.sourceforge.filebot.web.Movie;
 import net.sourceforge.filebot.web.TheTVDBSearchResult;
 import net.sourceforge.tuned.ByteBufferInputStream;

+import org.tukaani.xz.XZInputStream;

 public class ReleaseInfo {

@ -47,7 +47,6 @@ public class ReleaseInfo {
 		return matchLast(getVideoSourcePattern(), getBundle(getClass().getName()).getString("pattern.video.source").split("[|]"), strings);
 	}

-	
 	public String getReleaseGroup(String... strings) throws IOException {
 		// check file and folder for release group names
 		String[] groups = releaseGroupResource.get();
@ -63,7 +62,6 @@ public class ReleaseInfo {
 		return match;
 	}

-	
 	public Locale getLanguageSuffix(String name) {
 		// match locale identifier and lookup Locale object
 		Map<String, Locale> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault());
@ -75,7 +73,6 @@ public class ReleaseInfo {
 		return languages.get(lang);
 	}

-	
 	protected String matchLast(Pattern pattern, String[] standardValues, CharSequence... sequence) {
 		String lastMatch = null;

@ -106,7 +103,6 @@ public class ReleaseInfo {
 	private final Map<Boolean, Pattern[]> stopwords = new HashMap<Boolean, Pattern[]>(2);
 	private final Map<Boolean, Pattern[]> blacklist = new HashMap<Boolean, Pattern[]>(2);

-	
 	public List<String> cleanRelease(Collection<String> items, boolean strict) throws IOException {
 		Pattern[] stopwords;
 		Pattern[] blacklist;
@ -150,7 +146,6 @@ public class ReleaseInfo {
 		return output;
 	}

-	
 	public String clean(String item, Pattern... blacklisted) {
 		for (Pattern it : blacklisted) {
 			item = it.matcher(item).replaceAll("");
@ -158,7 +153,6 @@ public class ReleaseInfo {
 		return item;
 	}

-	
 	public String substringBefore(String item, Pattern... stopwords) {
 		for (Pattern it : stopwords) {
 			Matcher matcher = it.matcher(item);
@ -172,79 +166,66 @@ public class ReleaseInfo {
 		return item;
 	}

-	
 	/**
 	 * Builds a pattern that finds language tags such as the "en" in "[en]".
 	 *
 	 * A match is one of the given languages, directly preceded by one of the
 	 * characters - [ { ( and directly followed by a punctuation character.
 	 * Matching ignores case (CASE_INSENSITIVE | UNICODE_CASE).
 	 *
 	 * @param languages language names to look for; each entry goes through
 	 *                  quoteAll before the entries are joined with "|"
 	 * @return the compiled pattern; group 1 holds the matched language
 	 */
 	public Pattern getLanguageTagPattern(Collection<String> languages) {
 		// [en]
 		return compile("(?<=[-\\[{(])(" + join(quoteAll(languages), "|") + ")(?=\\p{Punct})", CASE_INSENSITIVE | UNICODE_CASE);
 	}

-	
 	/**
 	 * Builds a pattern that finds a trailing language suffix.
 	 *
 	 * A match is one of the given languages, directly preceded by a dot and
 	 * followed by nothing but dots, underscores or spaces up to the end of the
 	 * input.
 	 *
 	 * @param languages language names to look for; each entry goes through
 	 *                  quoteAll before the entries are joined with "|"
 	 * @param strict    if true the language must match case-sensitively,
 	 *                  otherwise CASE_INSENSITIVE is added to the flags
 	 * @return the compiled pattern; group 1 holds the matched language
 	 */
 	public Pattern getLanguageSuffixPattern(Collection<String> languages, boolean strict) {
 		// .en.srt
 		return compile("(?<=[.])(" + join(quoteAll(languages), "|") + ")(?=[._ ]*$)", (strict ? 0 : CASE_INSENSITIVE) | UNICODE_CASE);
 	}

-	
 	/**
 	 * Builds a pattern that finds screen resolutions written as WIDTHxHEIGHT.
 	 *
 	 * The width is either any four digits or a three-digit number from 600 to
 	 * 999; the height is either any four digits or a three-digit number from
 	 * 400 to 999. The separator must be a lower-case "x" (no case-insensitive
 	 * flag is set) and the match must not touch an alphanumeric character on
 	 * either side.
 	 *
 	 * @return the compiled pattern; group 1 is the width, group 2 the height
 	 */
 	public Pattern getResolutionPattern() {
 		// match screen resolutions 640x480, 1280x720, etc
 		return compile("(?<!\\p{Alnum})(\\d{4}|[6-9]\\d{2})x(\\d{4}|[4-9]\\d{2})(?!\\p{Alnum})");
 	}

-	
 	/**
 	 * Builds a case-insensitive pattern from the "pattern.video.format" entry
 	 * of the resource bundle named after the runtime class.
 	 *
 	 * The bundle value is used as a regex fragment as-is and is only matched
 	 * when it is not directly adjacent to an alphanumeric character.
 	 *
 	 * @return the compiled pattern; group 1 holds the matched format
 	 */
 	public Pattern getVideoFormatPattern() {
 		// pattern matching any video format name
 		String pattern = getBundle(getClass().getName()).getString("pattern.video.format");
 		return compile("(?<!\\p{Alnum})(" + pattern + ")(?!\\p{Alnum})", CASE_INSENSITIVE);
 	}

-	
 	/**
 	 * Builds a case-insensitive pattern from the "pattern.video.source" entry
 	 * of the resource bundle named after the runtime class.
 	 *
 	 * The bundle value is used as a regex fragment as-is and is only matched
 	 * when it is not directly adjacent to an alphanumeric character.
 	 *
 	 * @return the compiled pattern; group 1 holds the matched source
 	 */
 	public Pattern getVideoSourcePattern() {
 		// pattern matching any video source name
 		String pattern = getBundle(getClass().getName()).getString("pattern.video.source");
 		return compile("(?<!\\p{Alnum})(" + pattern + ")(?!\\p{Alnum})", CASE_INSENSITIVE);
 	}

-	
 	/**
 	 * Builds a pattern that finds bracketed clutter in square brackets, curly
 	 * braces or parentheses.
 	 *
 	 * The bracket content must not contain a bracket of the same kind and must
 	 * contain at least one "filter" character with at least one character on
 	 * each side of it.
 	 *
 	 * @param strict if true the filter character must be whitespace or
 	 *               punctuation (square brackets excluded); if false it must
 	 *               be a letter
 	 * @return the compiled pattern; groups 1, 2 and 3 hold the content of a
 	 *         square-bracket, curly-brace or parenthesis match respectively
 	 */
 	public Pattern getClutterBracketPattern(boolean strict) {
 		// match patterns like [Action, Drama] or {ENG-XViD-MP3-DVDRiP} etc
 		String contentFilter = strict ? "[\\p{Space}\\p{Punct}&&[^\\[\\]]]" : "\\p{Alpha}";
 		return compile("(?:\\[([^\\[\\]]+?" + contentFilter + "[^\\[\\]]+?)\\])|(?:\\{([^\\{\\}]+?" + contentFilter + "[^\\{\\}]+?)\\})|(?:\\(([^\\(\\)]+?" + contentFilter + "[^\\(\\)]+?)\\))");
 	}

-	
 	/**
 	 * Builds a pattern from the entries of releaseGroupResource.
 	 *
 	 * The entries are joined with "|" without quoting, so each entry acts as a
 	 * regex fragment. A match must not be directly adjacent to an alphanumeric
 	 * character.
 	 *
 	 * @param strict if true no flags are set (case-sensitive match); if false
 	 *               CASE_INSENSITIVE | UNICODE_CASE is used
 	 * @return the compiled pattern; group 1 holds the matched entry
 	 * @throws IOException declared because releaseGroupResource.get() is called
 	 */
 	public Pattern getReleaseGroupPattern(boolean strict) throws IOException {
 		// pattern matching any release group name enclosed in separators
 		// note: "|" binds tighter than "?:", so strict selects 0 and non-strict selects both flags
 		return compile("(?<!\\p{Alnum})(" + join(releaseGroupResource.get(), "|") + ")(?!\\p{Alnum})", strict ? 0 : CASE_INSENSITIVE | UNICODE_CASE);
 	}

-	
 	/**
 	 * Builds a case-insensitive pattern from the entries of
 	 * queryBlacklistResource.
 	 *
 	 * The entries are joined with "|" without quoting, so each entry acts as a
 	 * regex fragment. A match must not be directly adjacent to an alphanumeric
 	 * character.
 	 *
 	 * @return the compiled pattern; group 1 holds the matched entry
 	 * @throws IOException declared because queryBlacklistResource.get() is called
 	 */
 	public Pattern getBlacklistPattern() throws IOException {
 		// pattern matching any query blacklist entry enclosed in separators
 		return compile("(?<!\\p{Alnum})(" + join(queryBlacklistResource.get(), "|") + ")(?!\\p{Alnum})", CASE_INSENSITIVE | UNICODE_CASE);
 	}

-	
 	/**
 	 * Builds a case-insensitive pattern from the entries of
 	 * excludeBlacklistResource.
 	 *
 	 * The entries are joined with "|" without quoting and without any boundary
 	 * look-arounds, so an entry may match anywhere in the input.
 	 *
 	 * @return the compiled pattern
 	 * @throws IOException declared because excludeBlacklistResource.get() is called
 	 */
 	public Pattern getExcludePattern() throws IOException {
 		// pattern matching any exclude blacklist entry (no separator check is applied here)
 		return compile(join(excludeBlacklistResource.get(), "|"), CASE_INSENSITIVE | UNICODE_CASE);
 	}

-	
 	/**
 	 * Returns whatever movieListResource currently provides.
 	 *
 	 * NOTE(review): the field declaration is not visible in this excerpt;
 	 * presumably it is backed by the "url.movie-list" bundle entry - confirm.
 	 *
 	 * @return the movie entries
 	 * @throws IOException declared because movieListResource.get() is called
 	 */
 	public Movie[] getMovieList() throws IOException {
 		return movieListResource.get();
 	}

-	
 	/**
 	 * Returns the TheTVDB index provided by tvdbIndexResource, which is
 	 * created from the "url.thetvdb-index" bundle entry.
 	 *
 	 * @return the index entries
 	 * @throws IOException declared because tvdbIndexResource.get() is called
 	 */
 	public TheTVDBSearchResult[] getTheTVDBIndex() throws IOException {
 		return tvdbIndexResource.get();
 	}

-	
 	/**
 	 * Returns the AniDB index provided by anidbIndexResource, which is created
 	 * from the "url.anidb-index" bundle entry.
 	 *
 	 * @return the index entries
 	 * @throws IOException declared because anidbIndexResource.get() is called
 	 */
 	public AnidbSearchResult[] getAnidbIndex() throws IOException {
 		return anidbIndexResource.get();
 	}

-	
 	public Map<Pattern, String> getSeriesDirectMappings() throws IOException {
 		Map<Pattern, String> mappings = new LinkedHashMap<Pattern, String>();
 		for (String line : seriesDirectMappingsResource.get()) {
@ -256,12 +237,10 @@ public class ReleaseInfo {
 		return mappings;
 	}

-	
 	/**
 	 * Creates a FolderEntryFilter whose entry pattern is the
 	 * "pattern.diskfolder.entry" bundle value compiled as a regex.
 	 *
 	 * The bundle is looked up under the runtime class name.
 	 *
 	 * @return a new filter instance
 	 */
 	public FileFilter getDiskFolderFilter() {
 		return new FolderEntryFilter(compile(getBundle(getClass().getName()).getString("pattern.diskfolder.entry")));
 	}

-	
 	/**
 	 * Creates a ClutterFileFilter that combines the exclude pattern with a
 	 * file size limit of 262144000 bytes (250 * 1024 * 1024).
 	 *
 	 * @return a new filter instance
 	 * @throws IOException if building the exclude pattern fails
 	 */
 	public FileFilter getClutterFileFilter() throws IOException {
 		return new ClutterFileFilter(getExcludePattern(), 262144000); // only files smaller than 250 MB may be considered clutter
 	}
@ -275,31 +254,27 @@ public class ReleaseInfo {
 	protected final CachedResource<TheTVDBSearchResult[]> tvdbIndexResource = new TheTVDBIndexResource(getBundle(getClass().getName()).getString("url.thetvdb-index"));
 	protected final CachedResource<AnidbSearchResult[]> anidbIndexResource = new AnidbIndexResource(getBundle(getClass().getName()).getString("url.anidb-index"));

-	
 	/**
 	 * Cached resource whose raw content is decoded as UTF-8 text and split
 	 * into one string per line.
 	 */
 	protected static class PatternResource extends CachedResource<String[]> {

 		/**
 		 * @param resource resource location, handed to CachedResource unchanged
 		 */
 		public PatternResource(String resource) {
 			super(resource, String[].class, 24 * 60 * 60 * 1000); // 24h update interval
 		}

-		
 		/**
 		 * Decodes the bytes as UTF-8 and splits the text at every line feed.
 		 *
 		 * Only the line feed is treated as a separator, so a carriage return in
 		 * front of it would remain part of the entry.
 		 *
 		 * @param data raw resource content
 		 * @return the individual lines
 		 */
 		@Override
 		public String[] process(ByteBuffer data) {
 			return compile("\\n").split(Charset.forName("UTF-8").decode(data));
 		}
 	}

-	
 	protected static class MovieResource extends CachedResource<Movie[]> {

 		public MovieResource(String resource) {
 			super(resource, Movie[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
 		}

-		
 		@Override
 		public Movie[] process(ByteBuffer data) throws IOException {
-			Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
+			Scanner scanner = new Scanner(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");

 			List<Movie> movies = new ArrayList<Movie>();
 			while (scanner.hasNext()) {
@ -313,17 +288,15 @@ public class ReleaseInfo {
 		}
 	}

-	
 	protected static class TheTVDBIndexResource extends CachedResource<TheTVDBSearchResult[]> {

 		public TheTVDBIndexResource(String resource) {
 			super(resource, TheTVDBSearchResult[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
 		}

-		
 		@Override
 		public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException {
-			Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
+			Scanner scanner = new Scanner(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");

 			List<TheTVDBSearchResult> tvshows = new ArrayList<TheTVDBSearchResult>();
 			while (scanner.hasNext() && scanner.hasNextInt()) {
@ -336,17 +309,15 @@ public class ReleaseInfo {
 		}
 	}

-	
 	protected static class AnidbIndexResource extends CachedResource<AnidbSearchResult[]> {

 		public AnidbIndexResource(String resource) {
 			super(resource, AnidbSearchResult[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
 		}

-		
 		@Override
 		public AnidbSearchResult[] process(ByteBuffer data) throws IOException {
-			Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
+			Scanner scanner = new Scanner(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");

 			List<AnidbSearchResult> anime = new ArrayList<AnidbSearchResult>();
 			while (scanner.hasNext() && scanner.hasNextInt()) {
@ -354,28 +325,21 @@ public class ReleaseInfo {
 				String primaryTitle = scanner.next().trim();
 				String englishTitle = scanner.next().trim();

-				if (englishTitle.isEmpty() || englishTitle.equals(primaryTitle)) {
-					anime.add(new AnidbSearchResult(aid, primaryTitle, null));
-				} else {
-					anime.add(new AnidbSearchResult(aid, primaryTitle, englishTitle));
-				}
+				anime.add(new AnidbSearchResult(aid, primaryTitle, englishTitle.isEmpty() ? null : englishTitle));
 			}

 			return anime.toArray(new AnidbSearchResult[0]);
 		}
 	}

-	
 	protected static class FolderEntryFilter implements FileFilter {

 		private final Pattern entryPattern;

-		
 		public FolderEntryFilter(Pattern entryPattern) {
 			this.entryPattern = entryPattern;
 		}

-		
 		@Override
 		public boolean accept(File dir) {
 			if (dir.isDirectory()) {
@ -389,42 +353,35 @@ public class ReleaseInfo {
 		}
 	}

-	
 	/**
 	 * File filter that accepts entries by name: either the entry's own name or,
 	 * for regular files, the name of the containing folder must contain a match
 	 * of the given pattern.
 	 */
 	public static class FileFolderNameFilter implements FileFilter {

 		// pattern searched for (find, not full match) in file and parent folder names
 		private final Pattern namePattern;

-		
 		/**
 		 * @param namePattern pattern to search for in file and folder names
 		 */
 		public FileFolderNameFilter(Pattern namePattern) {
 			this.namePattern = namePattern;
 		}

-		
 		/**
 		 * @param file file or folder to test
 		 * @return true if the pattern is found in the entry's name, or if the
 		 *         entry is a regular file and the pattern is found in its
 		 *         parent folder's name
 		 */
 		@Override
 		public boolean accept(File file) {
 			// NOTE(review): getParentFile() can return null for a path without a parent - confirm callers never pass one
 			return (namePattern.matcher(file.getName()).find() || (file.isFile() && namePattern.matcher(file.getParentFile().getName()).find()));
 		}
 	}

-	
 	/**
 	 * Name-based filter that additionally requires the entry to be a regular
 	 * file below a given size.
 	 */
 	public static class ClutterFileFilter extends FileFolderNameFilter {

 		// exclusive upper bound, compared against File.length()
 		private long maxFileSize;

-		
 		/**
 		 * @param namePattern pattern passed on to FileFolderNameFilter
 		 * @param maxFileSize files of this length or larger are rejected
 		 */
 		public ClutterFileFilter(Pattern namePattern, long maxFileSize) {
 			super(namePattern);
 			this.maxFileSize = maxFileSize;
 		}

-		
 		/**
 		 * @param file file or folder to test
 		 * @return true only if the name check of the superclass passes, the
 		 *         entry is a regular file and its length is below maxFileSize
 		 */
 		@Override
 		public boolean accept(File file) {
 			return super.accept(file) && file.isFile() && file.length() < maxFileSize;
 		}
 	}

-	
 	private Collection<String> quoteAll(Collection<String> strings) {
 		List<String> patterns = new ArrayList<String>(strings.size());
 		for (String it : strings) {
@ -433,7 +390,6 @@ public class ReleaseInfo {
 		return patterns;
 	}

-	
 	private Map<String, Locale> getLanguageMap(Locale... supportedDisplayLocale) {
 		// use maximum strength collator by default
 		Collator collator = Collator.getInstance(Locale.ROOT);
--- a/source/net/sourceforge/filebot/media/ReleaseInfo.properties
+++ b/source/net/sourceforge/filebot/media/ReleaseInfo.properties
@ -17,13 +17,13 @@ url.exclude-blacklist: http://filebot.net/data/exclude-blacklist.txt
 url.series-mappings: http://filebot.net/data/series-mappings.txt

 # list of all movies (id, name, year)
-url.movie-list: http://filebot.net/data/movies.txt.gz
+url.movie-list: http://filebot.net/data/movies.txt.xz

 # TheTVDB index
-url.thetvdb-index: http://filebot.net/data/thetvdb.txt.gz
+url.thetvdb-index: http://filebot.net/data/thetvdb.txt.xz

 # AniDB index
-url.anidb-index: http://filebot.net/data/anidb.txt.gz
+url.anidb-index: http://filebot.net/data/anidb.txt.xz

 # disk folder matcher
 pattern.diskfolder.entry: BDMV|HVDVD_TS|VIDEO_TS|AUDIO_TS|VCD|movie.nfo
--- a/website/data/.htaccess
+++ b/website/data/.htaccess
@ -1,6 +1,9 @@
 options +indexes

+redirect 301 /data/movies.txt.xz http://sourceforge.net/projects/filebot/files/data/movies.txt.xz/download
+redirect 301 /data/thetvdb.txt.xz http://sourceforge.net/projects/filebot/files/data/thetvdb.txt.xz/download
+redirect 301 /data/anidb.txt.xz http://sourceforge.net/projects/filebot/files/data/anidb.txt.xz/download
+
 redirect 301 /data/movies.txt.gz http://sourceforge.net/projects/filebot/files/data/movies.txt.gz/download
-redirect 301 /data/series.list.gz http://sourceforge.net/projects/filebot/files/data/series.list.gz/download
 redirect 301 /data/thetvdb.txt.gz http://sourceforge.net/projects/filebot/files/data/thetvdb.txt.gz/download
 redirect 301 /data/anidb.txt.gz http://sourceforge.net/projects/filebot/files/data/anidb.txt.gz/download