filebot/build-data/BuildData.groovy

396 lines
13 KiB
Groovy
Raw Normal View History

2015-05-11 10:35:45 +00:00
#!/usr/bin/env filebot -script
2016-02-24 19:33:04 +00:00
import java.util.regex.*
2014-01-08 08:36:32 +00:00
import org.tukaani.xz.*
2016-02-24 19:27:31 +00:00
/* ------------------------------------------------------------------------- */
2016-02-24 19:27:31 +00:00
2015-05-11 13:57:04 +00:00
def dir_root = ".."
def dir_website = "${dir_root}/website"
2015-05-11 10:35:45 +00:00
def dir_data = "${dir_website}/data"
2016-02-24 19:27:31 +00:00
// sort and check shared regex collections
['add-series-alias.txt',
'exclude-blacklist.txt',
'query-blacklist.txt',
'release-groups.txt',
'series-mappings.txt'
].each{
def input = new URL("https://raw.githubusercontent.com/filebot/data/master/${it}")
def output = new File("${dir_data}/${it}")
def lines = new TreeSet(String.CASE_INSENSITIVE_ORDER)
2016-02-24 19:38:03 +00:00
input.getText('UTF-8').split(/\R/)*.trim().findAll{ it.length() > 0 }.each{
lines += Pattern.compile(it).pattern()
}
2016-03-10 03:53:49 +00:00
println input
lines.each{ println it }
pack(output, lines)
2016-02-24 19:27:31 +00:00
}
/* ------------------------------------------------------------------------- */
def reviews = []
2015-05-27 16:47:17 +00:00
new File("${dir_root}/reviews.tsv").eachLine('UTF-8'){
def s = it.split(/\t/, 3)*.trim()*.replaceAll('["]{2}', '"')
2015-05-27 16:47:17 +00:00
reviews << [user: s[0], date: s[1], text: s[2]]
}
reviews = reviews.sort{ it.date }
def json = new groovy.json.JsonBuilder()
json.call(reviews as List)
2015-05-11 10:35:45 +00:00
json.toPrettyString().saveAs("${dir_website}/reviews.json")
println "Reviews: " + reviews.size()
/* ------------------------------------------------------------------------- */
2015-05-11 10:35:45 +00:00
def moviedb_out = new File("${dir_data}/moviedb.txt")
def thetvdb_out = new File("${dir_data}/thetvdb.txt")
def anidb_out = new File("${dir_data}/anidb.txt")
def osdb_out = new File("${dir_data}/osdb.txt")
def pack(file, lines) {
2016-03-10 03:53:49 +00:00
file.withOutputStream{ out ->
out.withWriter('UTF-8'){ writer ->
lines.each{ writer.append(it).append('\n') }
}
}
new File(file.parentFile, file.name + '.xz').withOutputStream{ out ->
new XZOutputStream(out, new LZMA2Options(LZMA2Options.PRESET_DEFAULT)).withWriter('UTF-8'){ writer ->
lines.each{ writer.append(it).append('\n') }
}
}
2014-01-07 15:21:38 +00:00
def rows = lines.size()
def columns = lines.collect{ it.split(/\t/).length }.max()
2015-05-11 13:57:04 +00:00
println "${file.canonicalFile} ($rows rows, $columns columns)"
}
/* ------------------------------------------------------------------------- */
def isValidMovieName(s) {
2014-03-06 15:50:14 +00:00
return (s.normalizePunctuation().length() >= 4) || (s=~ /^[A-Z0-9]/ && s =~ /[\p{Alnum}]{3}/)
}
def getNamePermutations(names) {
def normalize = { s -> s.toLowerCase().normalizePunctuation() }.memoize()
def fn1 = { s -> def n = s.replaceAll(/(?i)(^(The|A)\s)|([,]\s(The|A)$)/, ''); s =~ /^(?i:The|A)/ && n ==~ /\w+/ ? s : n } // e.g. The Walking Dead => Walking Dead, The Voice => The Voice
def fn2 = { s -> s.replaceAll(/\s&\s/, ' and ') }
def fn3 = { s -> s.replaceAll(/\([^\)]*\)$/, '') }
2014-09-11 20:04:24 +00:00
def out = names*.trim().unique().collectMany{ original ->
def simplified = original
[fn1, fn2, fn3].each{ fn -> simplified = fn(simplified).trim() }
return [original, simplified]
}.unique{ normalize(it) }.findAll{ it.length() > 0 }
2014-03-18 20:08:06 +00:00
out = out.findAll{ it.length() >= 2 && !(it ==~ /[1][0-9][1-9]/) && !(it =~ /^[a-z]/) && it =~ /^[@.\p{L}\p{Digit}]/ } // MUST START WITH UNICODE LETTER
2014-01-11 09:04:49 +00:00
out = out.findAll{ !MediaDetection.releaseInfo.structureRootPattern.matcher(it).matches() } // IGNORE NAMES THAT OVERLAP WITH MEDIA FOLDER NAMES
// out = out.findAll{ a -> names.take(1).contains(a) || out.findAll{ b -> normalize(a).startsWith(normalize(b) + ' ') }.size() == 0 } // TRY TO EXCLUDE REDUNDANT SUBSTRING DUPLICATES
return out
}
def treeSort(list, keyFunction) {
def sorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
list.each{
sorter.put(keyFunction(it), it)
}
return sorter.values()
}
def csv(f, delim, keyIndex, valueIndex) {
def values = [:]
if (f.isFile()) {
f.splitEachLine(delim, 'UTF-8') { line ->
values.put(line[keyIndex], tryQuietly{ line[valueIndex] })
}
}
return values
}
/* ------------------------------------------------------------------------- */
// BUILD moviedb index
def omdb = []
2015-05-11 13:57:04 +00:00
new File('omdbMovies.txt').eachLine('Windows-1252'){
def line = it.split(/\t/)
if (line.length > 11 && line[0] ==~ /\d+/ && line[3] ==~ /\d{4}/) {
def imdbid = line[1].substring(2).toInteger()
def name = line[2].replaceAll(/\s+/, ' ').trim()
def year = line[3].toInteger()
def runtime = line[5]
def genres = line[6]
2014-02-18 06:55:45 +00:00
def rating = tryQuietly{ line[12].toFloat() } ?: 0
def votes = tryQuietly{ line[13].replaceAll(/\D/, '').toInteger() } ?: 0
2015-11-12 12:20:25 +00:00
if (!(genres =~ /Short/ || votes <= 100 || rating <= 2) && ((year >= 1970 && (runtime =~ /(\d.h)|(\d{2,3}.min)/ || votes >= 1000)) || (year >= 1950 && votes >= 20000))) {
omdb << [imdbid.pad(7), name, year]
}
}
}
omdb = omdb.findAll{ (it[0] as int) <= 9999999 && isValidMovieName(it[1]) }
def tmdb_txt = new File('tmdb.txt')
def tmdb_index = csv(tmdb_txt, '\t', 1, [0..-1])
def tmdb = []
omdb.each{ m ->
def sync = System.currentTimeMillis()
if (tmdb_index.containsKey(m[0]) && (sync - tmdb_index[m[0]][0].toLong()) < ((m[2].toInteger() < 2000 ? 360 : 120) * 24 * 60 * 60 * 1000L) ) {
tmdb << tmdb_index[m[0]]
return
}
try {
2014-08-22 06:59:30 +00:00
def info = WebServices.TheMovieDB.getMovieInfo("tt${m[0]}", Locale.ENGLISH, true)
2014-10-31 10:36:18 +00:00
if (info.votes <= 1 || info.rating <= 2)
2015-11-12 11:43:13 +00:00
throw new IllegalArgumentException('Insufficient movie data: ' + info)
2014-10-31 10:36:18 +00:00
def names = [info.name, info.originalName] + info.alternativeTitles
[info?.released?.year, m[2]].findResults{ it?.toInteger() }.unique().each{ y ->
def row = [sync, m[0].pad(7), info.id.pad(7), y.pad(4)] + names
println row
tmdb << row
}
2015-11-12 11:43:13 +00:00
} catch(IllegalArgumentException | FileNotFoundException e) {
2015-11-12 11:30:49 +00:00
printException(e, false)
def row = [sync, m[0].pad(7), 0, m[2], m[1]]
println row
tmdb << row
}
}
tmdb*.join('\t').join('\n').saveAs(tmdb_txt)
movies = tmdb.findResults{
def ity = it[1..3] // imdb id, tmdb id, year
def names = getNamePermutations(it[4..-1]).findAll{ isValidMovieName(it) }
2013-11-21 16:31:09 +00:00
if (ity[0].toInteger() > 0 && ity[1].toInteger() > 0 && names.size() > 0)
return ity + names
else
return null
}
movies = treeSort(movies, { it[3, 2].join(' ') })
// sanity check
2014-10-31 15:59:16 +00:00
if (movies.size() < 20000) { die('Movie index sanity failed:' + movies.size()) }
pack(moviedb_out, movies*.join('\t'))
/* ------------------------------------------------------------------------- */
// BUILD tvdb index
def tvdb_txt = new File('tvdb.txt')
def tvdb = [:]
if (tvdb_txt.exists()) {
2014-08-14 17:29:34 +00:00
tvdb_txt.eachLine('UTF-8'){
def line = it.split('\t').toList()
2014-08-16 16:28:40 +00:00
def names = line.subList(5, line.size())
tvdb.put(line[1] as Integer, [line[0] as Long, line[1] as Integer, line[2], line[3] as Float, line[4] as Float] + names)
}
}
2014-08-16 03:07:51 +00:00
def tvdb_updates = [:] as TreeMap
2014-08-14 17:29:34 +00:00
new File('updates_all.xml').eachLine('UTF-8'){
def m = (it =~ '<Series><id>(\\d+)</id><time>(\\d+)</time></Series>')
2014-08-16 02:40:39 +00:00
while(m.find()) {
def id = m.group(1) as Integer
def time = m.group(2) as Integer
tvdb_updates[id] = [id: id, time: time]
2014-08-14 17:29:34 +00:00
}
}
2014-12-05 16:21:13 +00:00
// blacklist crap entries
2014-12-05 17:28:56 +00:00
tvdb_updates.remove(219901)
tvdb_updates.remove(256135)
2014-12-05 16:21:13 +00:00
2014-08-14 17:29:34 +00:00
2014-08-16 02:40:39 +00:00
tvdb_updates.values().each{ update ->
2014-03-10 05:34:53 +00:00
if (tvdb[update.id] == null || update.time > tvdb[update.id][0]) {
try {
retry(2, 60000) {
2014-08-15 09:58:42 +00:00
def seriesNames = []
def xml = new XmlSlurper().parse("http://thetvdb.com/api/BA864DEE427E384A/series/${update.id}/en.xml")
def imdbid = xml.Series.IMDB_ID.text()
2014-08-15 09:58:42 +00:00
seriesNames += xml.Series.SeriesName.text()
def rating = tryQuietly{ xml.Series.Rating.text().toFloat() }
2014-05-15 08:18:50 +00:00
def votes = tryQuietly{ xml.Series.RatingCount.text().toFloat() }
2014-08-15 09:58:42 +00:00
// only retrieve additional data for reasonably popular shows
if (votes >= 5 && rating >= 4) {
tryLogCatch{
if (imdbid =~ /tt(\d+)/) {
2014-09-15 20:41:51 +00:00
seriesNames += OMDb.getMovieDescriptor(new Movie(null, 0, imdbid.match(/tt(\d+)/) as int, -1), Locale.ENGLISH).getName()
2014-08-15 09:58:42 +00:00
}
}
2014-08-15 09:58:42 +00:00
// scrape extra alias titles from webpage (not supported yet by API)
def jsoup = org.jsoup.Jsoup.connect("http://thetvdb.com/?tab=series&id=${update.id}").get()
def akaseries = jsoup.select('#akaseries table tr table tr')
.findAll{ it.select('td').any{ it.text() ==~ /en/ } }
.findResults{ it.select('td').first().text() }
.findAll{ it?.length() > 0 }
2014-11-28 15:59:47 +00:00
def intlseries = jsoup.select('#seriesform input')
.findAll{ it.attr('name') =~ /SeriesName/ }
.sort{ it.attr('name').match(/\d+/) as int }
.collect{ it.attr('value') }
.findAll{ it?.length() > 0 }
println "Scraped data $akaseries and $intlseries for series $seriesNames"
seriesNames += akaseries
seriesNames += intlseries
}
2014-08-15 09:58:42 +00:00
def data = [update.time, update.id, imdbid, rating ?: 0, votes ?: 0] + seriesNames.findAll{ it != null && it.length() > 0 }
tvdb.put(update.id, data)
println "Update $update => $data"
}
}
catch(Throwable e) {
2014-08-16 02:40:39 +00:00
printException(e, false)
2014-08-15 09:58:42 +00:00
def data = [update.time, update.id, '', 0, 0]
tvdb.put(update.id, data)
println "Update $update => $data"
}
}
}
// remove entries that have become invalid
tvdb.keySet().toList().each{ id ->
2014-08-16 02:40:39 +00:00
if (tvdb_updates[id] == null) {
println "Invalid ID found: ${tvdb[id]}"
tvdb.remove(id)
}
}
2014-05-12 19:18:31 +00:00
tvdb.values().findResults{ it.collect{ it.toString().replace('\t', '').trim() }.join('\t') }.join('\n').saveAs(tvdb_txt)
def thetvdb_index = []
tvdb.values().each{ r ->
def tvdb_id = r[1]
2014-08-15 09:58:42 +00:00
def rating = r[3]
def votes = r[4]
2014-08-16 16:28:40 +00:00
def names = r.subList(5, r.size())
2016-02-25 19:30:41 +00:00
if ((votes >= 5 && rating >= 4) || (votes >= 2 && rating >= 6) || (votes >= 1 && rating >= 10)) {
2014-08-15 09:58:42 +00:00
getNamePermutations(names).each{ n ->
thetvdb_index << [tvdb_id, n]
}
}
}
// additional custom mappings
2015-05-11 10:35:45 +00:00
new File("${dir_data}/add-series-alias.txt").splitEachLine(/\t+/, 'UTF-8') { row ->
def se = thetvdb_index.find{ row[0] == it[1] && !it.contains(row[1]) }
if (se == null) die("Unabled to find series '${row[0]}': '${row[1]}'")
2015-03-24 08:01:27 +00:00
thetvdb_index << [se[0], row[1]]
}
2015-11-03 03:43:54 +00:00
thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate|Series.Not.Permitted)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) }
2014-05-12 19:18:31 +00:00
thetvdb_index = thetvdb_index.sort{ a, b -> a[0] <=> b[0] }
// join and sort
def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique{ it.toLowerCase() }).join('\t') }
// sanity check
if (thetvdb_txt.size() < 4000) { die('TheTVDB index sanity failed: ' + thetvdb_txt.size()) }
pack(thetvdb_out, thetvdb_txt)
/* ------------------------------------------------------------------------- */
// BUILD osdb index
def osdb = []
new File('osdb.txt').eachLine('UTF-8'){
def fields = it.split(/\t/)*.trim()
// 0 IDMovie, 1 IDMovieImdb, 2 MovieName, 3 MovieYear, 4 MovieKind, 5 MoviePriority
if (fields.size() == 6 && fields[1] ==~ /\d+/ && fields[3] ==~ /\d{4}/) {
if (fields[4] ==~ /movie|tv.series/ && isValidMovieName(fields[2]) && (fields[3] as int) >= 1970 && (fields[5] as int) >= 500) {
// 0 imdbid, 1 name, 2 year, 3 kind, 4 priority
osdb << [fields[1] as int, fields[2], fields[3] as int, fields[4] == /movie/ ? 'm' : fields[4] == /tv series/ ? 's' : '?', fields[5] as int]
}
}
}
// sort reverse by score
osdb.sort{ a, b -> b[4] <=> a[4] }
// reset score/priority because it's currently not used
osdb*.set(4, 0)
// map by imdbid
def tvdb_index = tvdb.values().findAll{ it[2] =~ /tt(\d+)/ }.collectEntries{ [it[2].substring(2).pad(7), it] }
// collect final output data
osdb = osdb.findResults{
def names = [it[1]]
if (it[3] == 'm') {
def tmdb_entry = tmdb_index[it[0].pad(7)]
if (tmdb_entry != null && tmdb_entry.size() > 4) {
names += tmdb_entry[4..-1]
}
} else if (it[3] == 's') {
def tvdb_entry = tvdb_index[it[0].pad(7)]
if (tvdb_entry != null && tvdb_entry.size() > 5) {
names += tvdb_entry[5..-1]
}
}
// 0 kind, 1 score, 2 imdbid, 3 year, 4-n names
return [it[3], it[4], it[0], it[2]] + names.unique()
}
// sanity check
2015-06-13 05:03:55 +00:00
if (osdb.size() < 15000) { die('OSDB index sanity failed:' + osdb.size()) }
pack(osdb_out, osdb*.join('\t'))
2015-05-25 08:41:33 +00:00
/* ------------------------------------------------------------------------- */
// BUILD anidb index
2016-02-05 16:31:53 +00:00
def anidb = new AnidbClient('filebot', 6).getAnimeTitles()
def animeExcludes = new HashSet()
// exclude anime movies from anime index
new File('anime-list.xml').eachLine('UTF-8') {
2016-02-06 12:23:53 +00:00
if (it =~ /tvdbid="movie"/ || it =~ /imdbid="ttd\+"/) {
2016-02-05 16:31:53 +00:00
animeExcludes << it.match(/anidbid="(\d+)"/).toInteger()
}
}
2015-05-25 08:41:33 +00:00
def anidb_index = anidb.findResults{
2016-02-05 16:31:53 +00:00
if (animeExcludes.contains(it.animeId))
return null
2015-05-25 08:41:33 +00:00
def names = it.effectiveNames*.replaceAll(/\s+/, ' ')*.trim()*.replaceAll(/['`´ʻ]+/, /'/)
names = getNamePermutations(names)
names = names.findAll{ stripReleaseInfo(it)?.length() > 0 }
2016-02-05 16:31:53 +00:00
return names.empty ? null : [it.animeId.pad(5)] + names.take(4)
2015-05-25 08:41:33 +00:00
}
// join and sort
def anidb_txt = anidb_index.findResults{ row -> row.join('\t') }.sort().unique()
// sanity check
2016-02-05 16:31:53 +00:00
if (anidb_txt.size() < 8000 || animeExcludes.size() < 500) { die('AniDB index sanity failed:' + anidb_txt.size()) }
2015-05-25 08:41:33 +00:00
pack(anidb_out, anidb_txt)