+ rebuild movie index with imdb AND tmdb IDs
This commit is contained in:
parent
28df8ff69a
commit
75c897bae5
|
@ -1,6 +1,7 @@
|
|||
import org.tukaani.xz.*
|
||||
|
||||
// ------------------------------------------------------------------------- //
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
def sortRegexList(path) {
|
||||
|
@ -9,7 +10,6 @@ def sortRegexList(path) {
|
|||
// check if regex compiles
|
||||
set += java.util.regex.Pattern.compile(it.trim()).pattern()
|
||||
}
|
||||
|
||||
def out = set.join('\n').saveAs(path)
|
||||
println "$out\n$out.text\n"
|
||||
}
|
||||
|
@ -22,11 +22,14 @@ sortRegexList("website/data/exclude-blacklist.txt")
|
|||
sortRegexList("website/data/series-mappings.txt")
|
||||
|
||||
|
||||
// ------------------------------------------------------------------------- //
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
def reviews = []
|
||||
new File('reviews.csv').eachLine('UTF-8'){ def s = it.split(';', 3); reviews << [user: s[0], date: s[1], text: s[2].replaceAll(/^["]|["]$/, '').replaceAll(/["]{2}/, '"')] }
|
||||
new File('reviews.csv').eachLine('UTF-8'){
|
||||
def s = it.split(';', 3)
|
||||
reviews << [user: s[0], date: s[1], text: s[2].replaceAll(/^\"|\"$/, '').replaceAll(/["]{2}/, '"') ]
|
||||
}
|
||||
reviews = reviews.sort{ it.date }
|
||||
|
||||
def json = new groovy.json.JsonBuilder()
|
||||
|
@ -35,10 +38,10 @@ json.toPrettyString().saveAs('website/reviews.json')
|
|||
println "Reviews: " + reviews.size()
|
||||
|
||||
|
||||
// ------------------------------------------------------------------------- //
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
def movies_out = new File("website/data/movies.txt")
|
||||
def movies_out = new File("website/data/moviedb.txt")
|
||||
def thetvdb_out = new File("website/data/thetvdb.txt")
|
||||
def anidb_out = new File("website/data/anidb.txt")
|
||||
|
||||
|
@ -51,9 +54,19 @@ def pack(file, lines) {
|
|||
}
|
||||
|
||||
|
||||
// ------------------------------------------------------------------------- //
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
// BUILD moviedb index
|
||||
/**
 * Sort items by a derived String key, ignoring case.
 *
 * Items are collected in a TreeMap ordered by String.CASE_INSENSITIVE_ORDER,
 * so when two items produce keys that are equal ignoring case, the later item
 * replaces the earlier one.
 *
 * @param list items to sort
 * @param keyFunction closure mapping an item to its sort key
 * @return the values of the sorted map, i.e. the retained items in key order
 */
def treeSort(list, keyFunction) {
    def itemsByKey = new TreeMap(String.CASE_INSENSITIVE_ORDER)
    for (item in list) {
        itemsByKey.put(keyFunction(item), item)
    }
    return itemsByKey.values()
}
|
||||
|
||||
|
||||
// BUILD movies.txt.gz
|
||||
def omdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator)
|
||||
new File('omdb.txt').eachLine('Windows-1252'){
|
||||
def line = it.split(/\t/)
|
||||
|
@ -67,27 +80,57 @@ new File('omdb.txt').eachLine('Windows-1252'){
|
|||
def votes = tryQuietly{ line[12].replaceAll(/\D/, '').toInteger() } ?: 0
|
||||
|
||||
if ((year >= 1970 && (runtime =~ /h/ || votes >= 200) && rating >= 1 && votes >= 50) || (year >= 1950 && votes >= 5000)) {
|
||||
omdb << [imdbid, name, year]
|
||||
omdb << [imdbid.pad(7), name, year]
|
||||
}
|
||||
}
|
||||
}
|
||||
omdb = omdb.findAll{ it[0] <= 9999999 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ && it[1].length() >= 4}.collect{ [it[0].pad(7), it[1], it[2]] }
|
||||
def isValidMovieName = { s -> s =~ /^[A-Z0-9]/ && s =~ /[\p{Alpha}]{3}/ }
|
||||
omdb = omdb.findAll{ (it[0] as int) <= 9999999 && isValidMovieName(it[1]) }
|
||||
|
||||
// save movie data
|
||||
def movies = omdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }
|
||||
def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
|
||||
movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), [it[0], it[2], it[1]]) } // ORDER => ID, YEAR, NAME
|
||||
movies = movieSorter.values().collect{ it.join('\t') }
|
||||
|
||||
pack(movies_out, movies)
|
||||
def tmdb_txt = new File('tmdb.txt')
|
||||
def tmdb_index = csv(tmdb_txt, '\t', 1, [0..-1])
|
||||
|
||||
def tmdb = omdb.findResults{ m ->
|
||||
if (tmdb_index.containsKey(m[0])) {
|
||||
return tmdb_index[m[0]]
|
||||
}
|
||||
|
||||
def sync = System.currentTimeMillis()
|
||||
def row = [sync, m[0].pad(7), 0, m[2], m[1]]
|
||||
try {
|
||||
def info = net.sourceforge.filebot.WebServices.TMDb.getMovieInfo("tt${m[0]}", Locale.ENGLISH, false)
|
||||
def names = [info.name, info.originalName, m[1]]
|
||||
row = [sync, m[0].pad(7), info.id.pad(7), info.released?.year ?: m[2]] + names.findResults{ it ?: '' }
|
||||
} catch(FileNotFoundException e) {
|
||||
}
|
||||
|
||||
println row
|
||||
tmdb_txt << row.join('\t') << '\n'
|
||||
return row
|
||||
}
|
||||
|
||||
movies = tmdb.findResults{
|
||||
def ity = it[1..3] // imdb id, tmdb id, year
|
||||
def names = it[4..-1].findAll{ isValidMovieName(it) }.unique{ it.toLowerCase().normalizePunctuation() }
|
||||
if (ity[1].toInteger() > 0 && names.size() > 0)
|
||||
return ity + names
|
||||
else
|
||||
return null
|
||||
}
|
||||
movies = treeSort(movies, { it[3, 2].join(' ') })
|
||||
|
||||
pack(movies_out, movies.findResults{ it.join('\t') })
|
||||
println "Movie Count: " + movies.size()
|
||||
|
||||
// sanity check
|
||||
if (movies.size() < 50000) { throw new Exception('Movie index sanity failed') }
|
||||
if (movies.size() < 40000) { throw new Exception('Movie index sanity failed') }
|
||||
|
||||
// ------------------------------------------------------------------------- //
|
||||
|
||||
// BUILD thetvdb-index.gz
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
// BUILD tvdb index
|
||||
def tvdb = new HashMap()
|
||||
def tvdb_txt = new File('tvdb.txt')
|
||||
new File('tvdb.txt').eachLine{
|
||||
|
@ -139,9 +182,7 @@ tvdb.values().each{
|
|||
|
||||
|
||||
thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) }
|
||||
thetvdb_index = thetvdb_index.sort(new Comparator() {
|
||||
int compare(a, b) { a[0] <=> b[0] }
|
||||
})
|
||||
thetvdb_index = thetvdb_index.sort(new Comparator() { int compare(a, b) { a[0] <=> b[0] } })
|
||||
|
||||
// join and sort
|
||||
def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique()).join('\t') }
|
||||
|
@ -152,8 +193,10 @@ println "TheTVDB Index: " + thetvdb_txt.size()
|
|||
if (thetvdb_txt.size() < 30000) { throw new Exception('TheTVDB index sanity failed') }
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
// BUILD anidb-index.gz
|
||||
|
||||
// BUILD anidb index
|
||||
def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles()
|
||||
|
||||
def anidb_index = anidb.findResults{
|
||||
|
|
|
@ -3,17 +3,11 @@ import static net.sourceforge.tuned.FileUtilities.*
|
|||
import java.util.regex.Pattern
|
||||
|
||||
|
||||
// simplified switch/case pattern matching
|
||||
// call the given closure and return its result, or null if the call throws any Throwable
def c(c) { try { c.call() } catch (Throwable e) { null } }
|
||||
// read a delimited file into a map of key column -> value column; the value is null when the line has no column at valueIndex
def csv(path, delim = ';', keyIndex = 0, valueIndex = 1) { def f = path as File; def values = [:]; f.splitEachLine(delim) { line -> values.put(line[keyIndex], valueIndex < line.size() ? line[valueIndex] : null) }; return values }
|
||||
// return the value of the first entry in `cases` whose key matches the receiver in a switch/case test (and whose value is not null), or null if none does
Object.metaClass.match = { Map cases -> def val = delegate; cases.findResult { switch(val) { case it.key: return it.value} } }
|
||||
|
||||
|
||||
/**
|
||||
* Allow getAt() for File paths
|
||||
*
|
||||
* e.g. file[0] -> "F:"
|
||||
*/
|
||||
* Allow getAt() for File paths
|
||||
*
|
||||
* e.g. file[0] -> "F:"
|
||||
*/
|
||||
File.metaClass.getAt = { Range range -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(range).join(File.separator) }
|
||||
File.metaClass.getAt = { int index -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(index) }
|
||||
File.metaClass.getRoot = { listPath(delegate)[0] }
|
||||
|
@ -22,7 +16,7 @@ File.metaClass.getDiskSpace = { listPath(delegate).reverse().find{ it.exists() }
|
|||
|
||||
|
||||
/**
|
||||
* Convenience methods for String.toLowerCase()and String.toUpperCase()
|
||||
* Convenience methods for String.toLowerCase() and String.toUpperCase()
|
||||
*/
|
||||
String.metaClass.lower = { toLowerCase() }
|
||||
String.metaClass.upper = { toUpperCase() }
|
||||
|
@ -93,10 +87,10 @@ String.metaClass.upperInitial = { replaceAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[a-
|
|||
|
||||
|
||||
/**
|
||||
* Get acronym, i.e. first letter of each word.
|
||||
*
|
||||
* e.g. "Deep Space 9" -> "DS9"
|
||||
*/
|
||||
* Get acronym, i.e. first letter of each word.
|
||||
*
|
||||
* e.g. "Deep Space 9" -> "DS9"
|
||||
*/
|
||||
String.metaClass.acronym = { delegate.sortName('$2').findAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[\p{Alnum}]/).join().toUpperCase() }
|
||||
String.metaClass.sortName = { replacement = '$2, $1' -> delegate.replaceFirst(/^(?i)(The|A|An)\s(.+)/, replacement).trim() }
|
||||
|
||||
|
@ -165,19 +159,49 @@ String.metaClass.transliterate = { transformIdentifier -> com.ibm.icu.text.Trans
|
|||
|
||||
|
||||
/**
|
||||
* Convert Unicode to ASCII as best as possible. Works with most alphabets/scripts used in the world.
|
||||
*
|
||||
* e.g. "Österreich" -> "Osterreich"
|
||||
* "カタカナ" -> "katakana"
|
||||
*/
|
||||
* Convert Unicode to ASCII as best as possible. Works with most alphabets/scripts used in the world.
|
||||
*
|
||||
* e.g. "Österreich" -> "Osterreich"
|
||||
* "カタカナ" -> "katakana"
|
||||
*/
|
||||
String.metaClass.ascii = { fallback = ' ' -> delegate.transliterate("Any-Latin;Latin-ASCII;[:Diacritic:]remove").replaceAll("[^\\p{ASCII}]+", fallback) }
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* General helpers and utilities
|
||||
*/
|
||||
/**
 * Evaluate the given closure and return its result.
 *
 * Anything thrown by the closure (any Throwable) is deliberately discarded
 * and reported to the caller as null.
 *
 * @param c closure to call
 * @return the closure's result, or null if the call threw
 */
def c(c) {
    def result = null
    try {
        result = c.call()
    } catch (Throwable ignored) {
        // best-effort evaluation: a failure simply yields null
        result = null
    }
    return result
}
|
||||
|
||||
/**
 * Read a delimited text file into a map.
 *
 * Each line is split with splitEachLine(delim); the column at keyIndex becomes
 * the map key and the column(s) at valueIndex become the map value. Looking up
 * the value goes through c{ ... }, so a failing lookup yields a null value.
 *
 * @param path anything that can be coerced to a File
 * @param delim delimiter handed to splitEachLine
 * @param keyIndex index of the key column
 * @param valueIndex index of the value column
 * @return map of key to value; empty if path is not an existing regular file
 */
def csv(path, delim = ';', keyIndex = 0, valueIndex = 1) {
    def table = [:]
    def source = path as File
    if (!source.isFile()) {
        return table
    }
    source.splitEachLine(delim) { columns ->
        def key = columns[keyIndex]
        def value = c{ columns[valueIndex] }
        table.put(key, value)
    }
    return table
}
|
||||
|
||||
/**
 * Simplified switch/case pattern matching as a method on any object.
 *
 * Walks the entries of the given map in order and tests the receiver against
 * each entry key using a switch/case test; the value of the first matching
 * entry with a non-null value is returned, or null if there is none.
 */
Object.metaClass.match = { Map cases ->
    def subject = delegate
    return cases.findResult { candidate ->
        switch (subject) {
            case candidate.key:
                return candidate.value
            default:
                return null
        }
    }
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Web and File IO helpers
|
||||
*/
|
||||
* Web and File IO helpers
|
||||
*/
|
||||
import net.sourceforge.filebot.web.WebRequest
|
||||
import net.sourceforge.tuned.FileUtilities
|
||||
import net.sourceforge.tuned.XPathUtilities
|
||||
|
@ -190,8 +214,8 @@ URL.metaClass.scrapeAll = { xpath -> XPathUtilities.selectNodes(xpath, WebReques
|
|||
|
||||
|
||||
/**
|
||||
* XML / XPath utility functions
|
||||
*/
|
||||
* XML / XPath utility functions
|
||||
*/
|
||||
import javax.xml.xpath.XPathFactory
|
||||
import javax.xml.xpath.XPathConstants
|
||||
|
||||
|
|
|
@ -328,10 +328,11 @@ public class ReleaseInfo {
|
|||
|
||||
for (String[] row : rows) {
|
||||
int imdbid = parseInt(row[0]);
|
||||
int year = parseInt(row[1]);
|
||||
String name = row[2];
|
||||
String[] aliasNames = copyOfRange(row, 3, row.length);
|
||||
movies.add(new Movie(name, aliasNames, year, imdbid, -1));
|
||||
int tmdbid = parseInt(row[1]);
|
||||
int year = parseInt(row[2]);
|
||||
String name = row[3];
|
||||
String[] aliasNames = copyOfRange(row, 4, row.length);
|
||||
movies.add(new Movie(name, aliasNames, year, imdbid > 0 ? imdbid : -1, tmdbid > 0 ? tmdbid : -1));
|
||||
}
|
||||
|
||||
return movies.toArray(new Movie[0]);
|
||||
|
|
|
@ -20,7 +20,7 @@ number.clutter.maxfilesize: 262144000
|
|||
url.series-mappings: http://filebot.net/data/series-mappings.txt
|
||||
|
||||
# list of all movies (imdbid, tmdbid, year, name, alias names)
|
||||
url.movie-list: http://filebot.net/data/movies.txt.xz
|
||||
url.movie-list: http://filebot.net/data/moviedb.txt.xz
|
||||
|
||||
# TheTVDB index
|
||||
url.thetvdb-index: http://filebot.net/data/thetvdb.txt.xz
|
||||
|
|
Loading…
Reference in New Issue