+ rebuild movie index with imdb AND tmdb IDs

2013-11-20 10:07:25 +00:00 · 2013-11-20 10:07:25 +00:00 · 75c897bae5
parent 28df8ff69a
commit 75c897bae5
4 changed files with 120 additions and 52 deletions
--- a/BuildData.groovy
+++ b/BuildData.groovy
@ -1,6 +1,7 @@
 import  org.tukaani.xz.*

-// ------------------------------------------------------------------------- //
+
+/* ------------------------------------------------------------------------- */


 def sortRegexList(path) {
@ -9,7 +10,6 @@ def sortRegexList(path) {
 		// check if regex compiles
 		set += java.util.regex.Pattern.compile(it.trim()).pattern()
 	}
-	
 	def out = set.join('\n').saveAs(path)
 	println "$out\n$out.text\n"
 }
@ -22,11 +22,14 @@ sortRegexList("website/data/exclude-blacklist.txt")
 sortRegexList("website/data/series-mappings.txt")


-// ------------------------------------------------------------------------- //
+/* ------------------------------------------------------------------------- */


 def reviews = []
-new File('reviews.csv').eachLine('UTF-8'){ def s = it.split(';', 3); reviews << [user: s[0], date: s[1], text: s[2].replaceAll(/^["]|["]$/, '').replaceAll(/["]{2}/, '"')] }
+new File('reviews.csv').eachLine('UTF-8'){
+	def s = it.split(';', 3) // fields are user;date;text -- the limit of 3 keeps any further semicolons inside the text field
+	reviews << [user: s[0], date: s[1], text: s[2].replaceAll(/^\"|\"$/, '').replaceAll(/["]{2}/, '"') ] // drop one enclosing quote at each end, then collapse doubled quotes into single ones
+}
 reviews = reviews.sort{ it.date }

 def json = new groovy.json.JsonBuilder()
@ -35,10 +38,10 @@ json.toPrettyString().saveAs('website/reviews.json')
 println "Reviews: " + reviews.size()


-// ------------------------------------------------------------------------- //
+/* ------------------------------------------------------------------------- */


-def movies_out  = new File("website/data/movies.txt")
+def movies_out  = new File("website/data/moviedb.txt")
 def thetvdb_out = new File("website/data/thetvdb.txt")
 def anidb_out   = new File("website/data/anidb.txt")

@ -51,9 +54,19 @@ def pack(file, lines) {
 }


-// ------------------------------------------------------------------------- //
+/* ------------------------------------------------------------------------- */
+
+
+// BUILD moviedb index
+/**
+ * Sort items case-insensitively by the String key computed by keyFunction.
+ *
+ * Items whose keys are equal (ignoring case) are ALL kept, in their input order,
+ * rather than the later item overwriting the earlier one.
+ *
+ * @param list items to sort
+ * @param keyFunction closure mapping an item to its String sort key
+ * @return a new List of all items in key order
+ */
+def treeSort(list, keyFunction) {
+	def sorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
+	list.each{
+		// Map.get(key, default) stores the default when the key is absent, so equal keys share one bucket
+		sorter.get(keyFunction(it), []) << it
+	}
+	// concatenate the buckets in key order; collectMany flattens exactly one level, so list-valued items stay intact
+	return sorter.values().collectMany{ it }
+}
+

-// BUILD movies.txt.gz
 def omdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator)
 new File('omdb.txt').eachLine('Windows-1252'){
 	def line = it.split(/\t/)
@ -67,27 +80,57 @@ new File('omdb.txt').eachLine('Windows-1252'){
 		def votes = tryQuietly{ line[12].replaceAll(/\D/, '').toInteger() } ?: 0
 		
 		if ((year >= 1970 && (runtime =~ /h/ || votes >= 200) && rating >= 1 && votes >= 50) || (year >= 1950 && votes >= 5000)) {
-			omdb << [imdbid, name, year]
+			omdb << [imdbid.pad(7), name, year]
 		}
 	}
 }
-omdb = omdb.findAll{ it[0] <= 9999999 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ && it[1].length() >= 4}.collect{ [it[0].pad(7), it[1], it[2]] }
+def isValidMovieName = { s -> s =~ /^[A-Z0-9]/ && s =~ /[\p{Alpha}]{3}/ }
+omdb = omdb.findAll{ (it[0] as int) <= 9999999 && isValidMovieName(it[1]) }

-// save movie data
-def movies = omdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }
-def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
-movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), [it[0], it[2], it[1]]) } // ORDER => ID, YEAR, NAME
-movies = movieSorter.values().collect{ it.join('\t') }

-pack(movies_out, movies)
+def tmdb_txt = new File('tmdb.txt')
+def tmdb_index = csv(tmdb_txt, '\t', 1, [0..-1]) // cached rows keyed by column 1 (imdb id); each value is the entire row
+
+def tmdb = omdb.findResults{ m -> // m is an omdb entry: [imdb id, name, year]
+	if (tmdb_index.containsKey(m[0])) {
+		return tmdb_index[m[0]] // already cached => reuse the stored row without a web request
+	}
+	
+	def sync = System.currentTimeMillis()
+	def row = [sync, m[0].pad(7), 0, m[2], m[1]] // default row: [sync time, imdb id, tmdb id 0, year, name]
+	try {
+		def info = net.sourceforge.filebot.WebServices.TMDb.getMovieInfo("tt${m[0]}", Locale.ENGLISH, false)
+		def names = [info.name, info.originalName, m[1]]
+		row = [sync, m[0].pad(7), info.id.pad(7), info.released?.year ?: m[2]] + names.findResults{ it ?: '' } // missing names become empty strings, the omdb year is the fallback
+	} catch(FileNotFoundException e) { // NOTE(review): presumably thrown when TMDb has no entry for this id; the default row is kept -- confirm
+	}
+	
+	println row
+	tmdb_txt << row.join('\t') << '\n' // append to the cache file that tmdb_index is loaded from at the top of this section
+	return row
+}
+
+movies = tmdb.findResults{ // builds one output row per tmdb row; a null result drops the entry
+	def ity = it[1..3] // imdb id, tmdb id, year
+	def names = it[4..-1].findAll{ isValidMovieName(it) }.unique{ it.toLowerCase().normalizePunctuation() } // valid names only, de-duplicated on the lower-cased name passed through normalizePunctuation()
+	if (ity[1].toInteger() > 0 && names.size() > 0) // requires a positive tmdb id and at least one usable name
+		return ity + names
+	else
+		return null
+}
+movies = treeSort(movies, { it[3, 2].join(' ') }) // sort key is the first name followed by the year
+
+pack(movies_out, movies.findResults{ it.join('\t') })
 println "Movie Count: " + movies.size()

 // sanity check
-if (movies.size() < 50000) { throw new Exception('Movie index sanity failed') }
+if (movies.size() < 40000) { throw new Exception('Movie index sanity failed') }

-// ------------------------------------------------------------------------- //

-// BUILD thetvdb-index.gz
+/* ------------------------------------------------------------------------- */
+
+
+// BUILD tvdb index
 def tvdb = new HashMap()
 def tvdb_txt = new File('tvdb.txt')
 new File('tvdb.txt').eachLine{
@ -139,9 +182,7 @@ tvdb.values().each{


 thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) }
-thetvdb_index = thetvdb_index.sort(new Comparator() {
-	int compare(a, b) { a[0] <=> b[0] }
-})
+thetvdb_index = thetvdb_index.sort(new Comparator() { int compare(a, b) { a[0] <=> b[0] } })

 // join and sort
 def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique()).join('\t') }
@ -152,8 +193,10 @@ println "TheTVDB Index: " + thetvdb_txt.size()
 if (thetvdb_txt.size() < 30000) { throw new Exception('TheTVDB index sanity failed') }


+/* ------------------------------------------------------------------------- */

-// BUILD anidb-index.gz
+
+// BUILD anidb index
 def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles()

 def anidb_index = anidb.findResults{
--- a/source/net/sourceforge/filebot/format/ExpressionFormat.lib.groovy
+++ b/source/net/sourceforge/filebot/format/ExpressionFormat.lib.groovy
@ -3,17 +3,11 @@ import static net.sourceforge.tuned.FileUtilities.*
 import java.util.regex.Pattern


-// simplified switch/case pattern matching
-def c(c) { try { c.call() } catch (Throwable e) { null } }
-def csv(path, delim = ';', keyIndex = 0, valueIndex = 1) { def f = path as File; def values = [:]; f.splitEachLine(delim) { line -> values.put(line[keyIndex], valueIndex < line.size() ? line[valueIndex] : null) }; return values }
-Object.metaClass.match = { Map cases -> def val = delegate; cases.findResult { switch(val) { case it.key: return it.value} } }
-
-
 /**
-* Allow getAt() for File paths
-*
-* e.g. file[0] -> "F:"
-*/
+ * Allow getAt() for File paths
+ *
+ * e.g. file[0] -> "F:"
+ */
 File.metaClass.getAt = { Range range -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(range).join(File.separator) }
 File.metaClass.getAt = { int index -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(index) }
 File.metaClass.getRoot = { listPath(delegate)[0] }
@ -22,7 +16,7 @@ File.metaClass.getDiskSpace = { listPath(delegate).reverse().find{ it.exists() }


 /**
- * Convenience methods for String.toLowerCase()and String.toUpperCase()
+ * Convenience methods for String.toLowerCase() and String.toUpperCase()
 */
 String.metaClass.lower = { toLowerCase() }
 String.metaClass.upper = { toUpperCase() }
@ -93,10 +87,10 @@ String.metaClass.upperInitial = { replaceAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[a-


 /**
-* Get acronym, i.e. first letter of each word.
-*
-* e.g. "Deep Space 9" -> "DS9"
-*/
+ * Get acronym, i.e. first letter of each word.
+ *
+ * e.g. "Deep Space 9" -> "DS9"
+ */
 String.metaClass.acronym = { delegate.sortName('$2').findAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[\p{Alnum}]/).join().toUpperCase() }
 String.metaClass.sortName = { replacement = '$2, $1' -> delegate.replaceFirst(/^(?i)(The|A|An)\s(.+)/, replacement).trim() }

@ -165,19 +159,49 @@ String.metaClass.transliterate = { transformIdentifier -> com.ibm.icu.text.Trans


 /**
-* Convert Unicode to ASCII as best as possible. Works with most alphabets/scripts used in the world.
-*
-* e.g. "Österreich" -> "Osterreich"
-*      "カタカナ" -> "katakana"
-*/
+ * Convert Unicode to ASCII as best as possible. Works with most alphabets/scripts used in the world.
+ *
+ * e.g. "Österreich" -> "Osterreich"
+ *      "カタカナ" -> "katakana"
+ */
 String.metaClass.ascii = { fallback = ' ' -> delegate.transliterate("Any-Latin;Latin-ASCII;[:Diacritic:]remove").replaceAll("[^\\p{ASCII}]+", fallback) }



+/**
+ * General helpers and utilities
+ */
+/**
+ * Evaluate the given closure and return its result, or null if it throws anything.
+ */
+def c(c) {
+	def outcome = null
+	try {
+		outcome = c.call()
+	} catch (Throwable e) {
+		// any failure is deliberately ignored and reported as null
+	}
+	return outcome
+}
+
+/**
+ * Read a delimited text file into a map.
+ *
+ * Each line is split by delim; the column at keyIndex becomes the key and the
+ * column(s) at valueIndex become the value (null if that lookup fails).
+ * Returns an empty map if path is not an existing regular file.
+ */
+def csv(path, delim = ';', keyIndex = 0, valueIndex = 1) {
+	def table = [:]
+	def source = path as File
+	if (!source.isFile()) {
+		return table
+	}
+	source.splitEachLine(delim) { columns ->
+		def cell = c{ columns[valueIndex] }
+		table.put(columns[keyIndex], cell)
+	}
+	return table
+}
+
+Object.metaClass.match = { Map cases -> // simplified switch/case pattern matching: each map key is a case, each map value the result for that case
+	def val = delegate; // the object that match() was called on
+	cases.findResult { // yields the first non-null closure result, or null if there is none
+		switch(val) { case it.key: return it.value} // the entry key is used as the switch case for val; no match means a null result for this entry
+	}
+}
+
+

 /**
-* Web and File IO helpers
-*/
+ * Web and File IO helpers
+ */
 import net.sourceforge.filebot.web.WebRequest
 import net.sourceforge.tuned.FileUtilities
 import net.sourceforge.tuned.XPathUtilities
@ -190,8 +214,8 @@ URL.metaClass.scrapeAll = { xpath -> XPathUtilities.selectNodes(xpath, WebReques


 /**
-* XML / XPath utility functions
-*/
+ * XML / XPath utility functions
+ */
 import javax.xml.xpath.XPathFactory
 import javax.xml.xpath.XPathConstants

--- a/source/net/sourceforge/filebot/media/ReleaseInfo.java
+++ b/source/net/sourceforge/filebot/media/ReleaseInfo.java
@ -328,10 +328,11 @@ public class ReleaseInfo {

 			for (String[] row : rows) {
 				int imdbid = parseInt(row[0]);
-				int year = parseInt(row[1]);
-				String name = row[2];
-				String[] aliasNames = copyOfRange(row, 3, row.length);
-				movies.add(new Movie(name, aliasNames, year, imdbid, -1));
+				int tmdbid = parseInt(row[1]);
+				int year = parseInt(row[2]);
+				String name = row[3];
+				String[] aliasNames = copyOfRange(row, 4, row.length);
+				movies.add(new Movie(name, aliasNames, year, imdbid > 0 ? imdbid : -1, tmdbid > 0 ? tmdbid : -1));
 			}

 			return movies.toArray(new Movie[0]);
--- a/source/net/sourceforge/filebot/media/ReleaseInfo.properties
+++ b/source/net/sourceforge/filebot/media/ReleaseInfo.properties
@ -20,7 +20,7 @@ number.clutter.maxfilesize: 262144000
 url.series-mappings: http://filebot.net/data/series-mappings.txt

 # list of all movies (imdb id, tmdb id, year, name, alias names...)
-url.movie-list: http://filebot.net/data/movies.txt.xz
+url.movie-list: http://filebot.net/data/moviedb.txt.xz

 # TheTVDB index
 url.thetvdb-index: http://filebot.net/data/thetvdb.txt.xz