+ rebuild movie index with imdb AND tmdb IDs

This commit is contained in:
Reinhard Pointner 2013-11-20 10:07:25 +00:00
parent 28df8ff69a
commit 75c897bae5
4 changed files with 120 additions and 52 deletions

View File

@ -1,6 +1,7 @@
import org.tukaani.xz.* import org.tukaani.xz.*
// ------------------------------------------------------------------------- //
/* ------------------------------------------------------------------------- */
def sortRegexList(path) { def sortRegexList(path) {
@ -9,7 +10,6 @@ def sortRegexList(path) {
// check if regex compiles // check if regex compiles
set += java.util.regex.Pattern.compile(it.trim()).pattern() set += java.util.regex.Pattern.compile(it.trim()).pattern()
} }
def out = set.join('\n').saveAs(path) def out = set.join('\n').saveAs(path)
println "$out\n$out.text\n" println "$out\n$out.text\n"
} }
@ -22,11 +22,14 @@ sortRegexList("website/data/exclude-blacklist.txt")
sortRegexList("website/data/series-mappings.txt") sortRegexList("website/data/series-mappings.txt")
// ------------------------------------------------------------------------- // /* ------------------------------------------------------------------------- */
def reviews = [] def reviews = []
new File('reviews.csv').eachLine('UTF-8'){ def s = it.split(';', 3); reviews << [user: s[0], date: s[1], text: s[2].replaceAll(/^["]|["]$/, '').replaceAll(/["]{2}/, '"')] } new File('reviews.csv').eachLine('UTF-8'){
def s = it.split(';', 3)
reviews << [user: s[0], date: s[1], text: s[2].replaceAll(/^\"|\"$/, '').replaceAll(/["]{2}/, '"') ]
}
reviews = reviews.sort{ it.date } reviews = reviews.sort{ it.date }
def json = new groovy.json.JsonBuilder() def json = new groovy.json.JsonBuilder()
@ -35,10 +38,10 @@ json.toPrettyString().saveAs('website/reviews.json')
println "Reviews: " + reviews.size() println "Reviews: " + reviews.size()
// ------------------------------------------------------------------------- // /* ------------------------------------------------------------------------- */
def movies_out = new File("website/data/movies.txt") def movies_out = new File("website/data/moviedb.txt")
def thetvdb_out = new File("website/data/thetvdb.txt") def thetvdb_out = new File("website/data/thetvdb.txt")
def anidb_out = new File("website/data/anidb.txt") def anidb_out = new File("website/data/anidb.txt")
@ -51,9 +54,19 @@ def pack(file, lines) {
} }
// ------------------------------------------------------------------------- // /* ------------------------------------------------------------------------- */
// BUILD moviedb index
// Order the given items by a String key, ignoring case.
//   list        : the items to order
//   keyFunction : closure that maps an item to its String sort key
// Returns the items as the value view of a TreeMap, i.e. in ascending key order.
// NOTE: items whose keys are equal (ignoring case) share one map slot, so only
// the last such item is kept.
def treeSort(list, keyFunction) {
	def byKey = new TreeMap(String.CASE_INSENSITIVE_ORDER)
	for (item in list) {
		byKey.put(keyFunction(item), item)
	}
	return byKey.values()
}
// BUILD movies.txt.gz
def omdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator) def omdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator)
new File('omdb.txt').eachLine('Windows-1252'){ new File('omdb.txt').eachLine('Windows-1252'){
def line = it.split(/\t/) def line = it.split(/\t/)
@ -67,27 +80,57 @@ new File('omdb.txt').eachLine('Windows-1252'){
def votes = tryQuietly{ line[12].replaceAll(/\D/, '').toInteger() } ?: 0 def votes = tryQuietly{ line[12].replaceAll(/\D/, '').toInteger() } ?: 0
if ((year >= 1970 && (runtime =~ /h/ || votes >= 200) && rating >= 1 && votes >= 50) || (year >= 1950 && votes >= 5000)) { if ((year >= 1970 && (runtime =~ /h/ || votes >= 200) && rating >= 1 && votes >= 50) || (year >= 1950 && votes >= 5000)) {
omdb << [imdbid, name, year] omdb << [imdbid.pad(7), name, year]
} }
} }
} }
omdb = omdb.findAll{ it[0] <= 9999999 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ && it[1].length() >= 4}.collect{ [it[0].pad(7), it[1], it[2]] } def isValidMovieName = { s -> s =~ /^[A-Z0-9]/ && s =~ /[\p{Alpha}]{3}/ }
omdb = omdb.findAll{ (it[0] as int) <= 9999999 && isValidMovieName(it[1]) }
// save movie data
def movies = omdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }
def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), [it[0], it[2], it[1]]) } // ORDER => ID, YEAR, NAME
movies = movieSorter.values().collect{ it.join('\t') }
pack(movies_out, movies) def tmdb_txt = new File('tmdb.txt')
// Load previously resolved rows from tmdb.txt, keyed by column 1 (the padded imdb id
// that is written as row[1] below). The [0..-1] value index presumably selects the
// whole row as the map value -- TODO confirm against csv().
def tmdb_index = csv(tmdb_txt, '\t', 1, [0..-1])
// Resolve every omdb entry (m = [imdb id, name, year]) to a row of the form
// [sync timestamp, imdb id, tmdb id, year, names...].
def tmdb = omdb.findResults{ m ->
// reuse the cached row if this imdb id has been looked up before
if (tmdb_index.containsKey(m[0])) {
return tmdb_index[m[0]]
}
def sync = System.currentTimeMillis()
// placeholder row with tmdb id 0, used when the lookup below fails
def row = [sync, m[0].pad(7), 0, m[2], m[1]]
try {
def info = net.sourceforge.filebot.WebServices.TMDb.getMovieInfo("tt${m[0]}", Locale.ENGLISH, false)
def names = [info.name, info.originalName, m[1]]
// missing names become '' so that each of the three name columns is always present
row = [sync, m[0].pad(7), info.id.pad(7), info.released?.year ?: m[2]] + names.findResults{ it ?: '' }
} catch(FileNotFoundException e) {
// deliberately ignored: the placeholder row (tmdb id 0) is kept and still appended
// to tmdb.txt below, so the failed lookup is recorded like any other row
}
println row
// append the new row to tmdb.txt
tmdb_txt << row.join('\t') << '\n'
return row
}
// Reduce each row to [imdb id, tmdb id, year, names...]; rows without a positive
// tmdb id or without any valid name are dropped (findResults skips null).
movies = tmdb.findResults{
def ity = it[1..3] // imdb id, tmdb id, year
// keep valid names only, and drop names that are equal after lower-casing and normalizePunctuation()
def names = it[4..-1].findAll{ isValidMovieName(it) }.unique{ it.toLowerCase().normalizePunctuation() }
if (ity[1].toInteger() > 0 && names.size() > 0)
return ity + names
else
return null
}
// sort by "<first name> <year>"; NOTE(review): treeSort keeps only one entry per
// case-insensitive key, so movies sharing first name and year collapse -- confirm this is intended
movies = treeSort(movies, { it[3, 2].join(' ') })
// write one tab-separated line per movie
pack(movies_out, movies.findResults{ it.join('\t') })
println "Movie Count: " + movies.size() println "Movie Count: " + movies.size()
// sanity check // sanity check
if (movies.size() < 50000) { throw new Exception('Movie index sanity failed') } if (movies.size() < 40000) { throw new Exception('Movie index sanity failed') }
// ------------------------------------------------------------------------- //
// BUILD thetvdb-index.gz /* ------------------------------------------------------------------------- */
// BUILD tvdb index
def tvdb = new HashMap() def tvdb = new HashMap()
def tvdb_txt = new File('tvdb.txt') def tvdb_txt = new File('tvdb.txt')
new File('tvdb.txt').eachLine{ new File('tvdb.txt').eachLine{
@ -139,9 +182,7 @@ tvdb.values().each{
thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) } thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) }
thetvdb_index = thetvdb_index.sort(new Comparator() { thetvdb_index = thetvdb_index.sort(new Comparator() { int compare(a, b) { a[0] <=> b[0] } })
int compare(a, b) { a[0] <=> b[0] }
})
// join and sort // join and sort
def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique()).join('\t') } def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique()).join('\t') }
@ -152,8 +193,10 @@ println "TheTVDB Index: " + thetvdb_txt.size()
if (thetvdb_txt.size() < 30000) { throw new Exception('TheTVDB index sanity failed') } if (thetvdb_txt.size() < 30000) { throw new Exception('TheTVDB index sanity failed') }
/* ------------------------------------------------------------------------- */
// BUILD anidb-index.gz
// BUILD anidb index
def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles() def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles()
def anidb_index = anidb.findResults{ def anidb_index = anidb.findResults{

View File

@ -3,17 +3,11 @@ import static net.sourceforge.tuned.FileUtilities.*
import java.util.regex.Pattern import java.util.regex.Pattern
// simplified switch/case pattern matching
def c(c) { try { c.call() } catch (Throwable e) { null } }
def csv(path, delim = ';', keyIndex = 0, valueIndex = 1) { def f = path as File; def values = [:]; f.splitEachLine(delim) { line -> values.put(line[keyIndex], valueIndex < line.size() ? line[valueIndex] : null) }; return values }
Object.metaClass.match = { Map cases -> def val = delegate; cases.findResult { switch(val) { case it.key: return it.value} } }
/** /**
* Allow getAt() for File paths * Allow getAt() for File paths
* *
* e.g. file[0] -> "F:" * e.g. file[0] -> "F:"
*/ */
File.metaClass.getAt = { Range range -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(range).join(File.separator) } File.metaClass.getAt = { Range range -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(range).join(File.separator) }
File.metaClass.getAt = { int index -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(index) } File.metaClass.getAt = { int index -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(index) }
File.metaClass.getRoot = { listPath(delegate)[0] } File.metaClass.getRoot = { listPath(delegate)[0] }
@ -22,7 +16,7 @@ File.metaClass.getDiskSpace = { listPath(delegate).reverse().find{ it.exists() }
/** /**
* Convenience methods for String.toLowerCase()and String.toUpperCase() * Convenience methods for String.toLowerCase() and String.toUpperCase()
*/ */
String.metaClass.lower = { toLowerCase() } String.metaClass.lower = { toLowerCase() }
String.metaClass.upper = { toUpperCase() } String.metaClass.upper = { toUpperCase() }
@ -93,10 +87,10 @@ String.metaClass.upperInitial = { replaceAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[a-
/** /**
* Get acronym, i.e. first letter of each word. * Get acronym, i.e. first letter of each word.
* *
* e.g. "Deep Space 9" -> "DS9" * e.g. "Deep Space 9" -> "DS9"
*/ */
String.metaClass.acronym = { delegate.sortName('$2').findAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[\p{Alnum}]/).join().toUpperCase() } String.metaClass.acronym = { delegate.sortName('$2').findAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[\p{Alnum}]/).join().toUpperCase() }
String.metaClass.sortName = { replacement = '$2, $1' -> delegate.replaceFirst(/^(?i)(The|A|An)\s(.+)/, replacement).trim() } String.metaClass.sortName = { replacement = '$2, $1' -> delegate.replaceFirst(/^(?i)(The|A|An)\s(.+)/, replacement).trim() }
@ -165,19 +159,49 @@ String.metaClass.transliterate = { transformIdentifier -> com.ibm.icu.text.Trans
/** /**
* Convert Unicode to ASCII as best as possible. Works with most alphabets/scripts used in the world. * Convert Unicode to ASCII as best as possible. Works with most alphabets/scripts used in the world.
* *
* e.g. "Österreich" -> "Osterreich" * e.g. "Österreich" -> "Osterreich"
* "カタカナ" -> "katakana" * "カタカナ" -> "katakana"
*/ */
String.metaClass.ascii = { fallback = ' ' -> delegate.transliterate("Any-Latin;Latin-ASCII;[:Diacritic:]remove").replaceAll("[^\\p{ASCII}]+", fallback) } String.metaClass.ascii = { fallback = ' ' -> delegate.transliterate("Any-Latin;Latin-ASCII;[:Diacritic:]remove").replaceAll("[^\\p{ASCII}]+", fallback) }
/**
* General helpers and utilities
*/
/**
 * Evaluate the given closure and return its result.
 * Any Throwable raised by the closure is swallowed and null is returned instead.
 */
def c(c) {
	def result = null
	try {
		result = c.call()
	} catch (Throwable ignored) {
		// best-effort evaluation: a failure simply yields null
		result = null
	}
	return result
}
/**
 * Read a delimited text file into a map.
 *
 * path       : anything that can be coerced to a File
 * delim      : delimiter passed to File.splitEachLine (default ';')
 * keyIndex   : index of the column used as map key (default 0)
 * valueIndex : index of the column used as map value (default 1);
 *              if reading the value fails, null is stored instead
 *
 * Returns an empty map if the path is not an existing regular file.
 */
def csv(path, delim = ';', keyIndex = 0, valueIndex = 1) {
	def source = path as File
	def table = [:]
	if (!source.isFile()) {
		return table
	}
	source.splitEachLine(delim) { columns ->
		def key = columns[keyIndex]
		def value = c{ columns[valueIndex] }
		table.put(key, value)
	}
	return table
}
/**
 * Simplified switch/case pattern matching.
 *
 * Each key of the given map is used as a switch case label against the receiver;
 * the value of the first matching key whose value is non-null is returned,
 * or null if there is none.
 */
Object.metaClass.match = { Map cases ->
	def subject = delegate
	cases.findResult { candidate, outcome ->
		switch (subject) {
			case candidate: return outcome
		}
	}
}
/** /**
* Web and File IO helpers * Web and File IO helpers
*/ */
import net.sourceforge.filebot.web.WebRequest import net.sourceforge.filebot.web.WebRequest
import net.sourceforge.tuned.FileUtilities import net.sourceforge.tuned.FileUtilities
import net.sourceforge.tuned.XPathUtilities import net.sourceforge.tuned.XPathUtilities
@ -190,8 +214,8 @@ URL.metaClass.scrapeAll = { xpath -> XPathUtilities.selectNodes(xpath, WebReques
/** /**
* XML / XPath utility functions * XML / XPath utility functions
*/ */
import javax.xml.xpath.XPathFactory import javax.xml.xpath.XPathFactory
import javax.xml.xpath.XPathConstants import javax.xml.xpath.XPathConstants

View File

@ -328,10 +328,11 @@ public class ReleaseInfo {
for (String[] row : rows) { for (String[] row : rows) {
int imdbid = parseInt(row[0]); int imdbid = parseInt(row[0]);
int year = parseInt(row[1]); int tmdbid = parseInt(row[1]);
String name = row[2]; int year = parseInt(row[2]);
String[] aliasNames = copyOfRange(row, 3, row.length); String name = row[3];
movies.add(new Movie(name, aliasNames, year, imdbid, -1)); String[] aliasNames = copyOfRange(row, 4, row.length);
movies.add(new Movie(name, aliasNames, year, imdbid > 0 ? imdbid : -1, tmdbid > 0 ? tmdbid : -1));
} }
return movies.toArray(new Movie[0]); return movies.toArray(new Movie[0]);

View File

@ -20,7 +20,7 @@ number.clutter.maxfilesize: 262144000
url.series-mappings: http://filebot.net/data/series-mappings.txt url.series-mappings: http://filebot.net/data/series-mappings.txt
# list of all movies (id, name, year) # list of all movies (imdb id, tmdb id, year, name, alias names)
url.movie-list: http://filebot.net/data/movies.txt.xz url.movie-list: http://filebot.net/data/moviedb.txt.xz
# TheTVDB index # TheTVDB index
url.thetvdb-index: http://filebot.net/data/thetvdb.txt.xz url.thetvdb-index: http://filebot.net/data/thetvdb.txt.xz