* build my own IMDb index, taking the ids from the OSDB movie data and using my IMDb scraper to get the original AKA names

* lots of extra release group (RG) names and blacklisted terms (especially useful for documentaries)
* updated cleaner script to handle video clutter like samples etc
This commit is contained in:
Reinhard Pointner 2012-07-13 11:41:50 +00:00
parent 7cf02bb235
commit d29fe49390
5 changed files with 243 additions and 44 deletions

View File

@ -1,7 +1,28 @@
// filebot -script BuildData.groovy -trust-script
// filebot -script BuildData.groovy
def s_out = new File("website/data/series.list.gz")
def m_out = new File("website/data/movies.txt.gz")
/*
 * Validate and sort a regex list file in place: every line must compile as a
 * regular expression, lines that are equal ignoring case are collapsed into one,
 * and the result is written back to the same path in case-insensitive order
 */
def sortRegexList(path) {
	def set = new TreeSet(String.CASE_INSENSITIVE_ORDER)
	new File(path).eachLine{
		// check if regex compiles (Pattern.compile throws on an invalid line)
		// add in place: `set += x` would copy the whole set for every line read
		set.add(java.util.regex.Pattern.compile(it).pattern())
	}
	def out = set.join('\n').saveAs(path)
	// echo the written file and its content for manual review
	println "$out\n$out.text\n"
}
// sort and check shared regex collections
sortRegexList("website/data/release-groups.txt")
sortRegexList("website/data/query-blacklist.txt")
// ------------------------------------------------------------------------- //
def series_out = new File("website/data/series.list.gz")
def movies_out = new File("website/data/movies.txt.gz")
def gz(file, lines) {
file.withOutputStream{ out ->
@ -15,20 +36,54 @@ def gz(file, lines) {
// ------------------------------------------------------------------------- //
// BUILD movies.txt.gz
def tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php")
def movies = []
// LOAD osdb-imdb.txt (already verified data)
def imdb_tsv = new File("website/data/osdb-imdb.txt")
def imdb = [].asSynchronized() // thread-safe list
tsv.text.eachLine{
imdb_tsv.getText('UTF-8').eachLine{
imdb << it.split(/\t/)
}
imdb_ids = new HashSet(imdb.collect{ it[0] })
// BUILD movies.txt.gz
def osdb_tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php")
def osdb = []
osdb_tsv.getText('UTF-8').eachLine{
def line = it.split(/\t/)*.replaceAll(/\s+/, ' ')*.trim()
if (line.size() == 4 && line[0] =~ /\d+/) {
movies.add([line[1].toInteger(), line[2], line[3].toInteger()])
osdb << [line[1].toInteger(), line[2], line[3].toInteger()]
}
}
osdb = osdb.findAll{ it[0] <= 9999999 && it[2] >= 1930 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.collect{ [it[0].pad(7), it[1], it[2]] }
movies = movies.findAll{ it[0] <= 9999999 && it[2] >= 1930 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.sort{ it[1] }
gz(m_out, movies.collect{ [it[0].pad(7), it[1], it[2]].join('\t') })
parallel(osdb.collect{ row ->
return {
// update new data
if (!imdb_ids.contains(row[0])) {
def mov = net.sourceforge.filebot.WebServices.IMDb.getMovieDescriptor(row[0] as int, null)
if (mov != null && mov.name.length() > 0 && mov.year > 0) {
println "Adding $mov"
imdb << [row[0], mov.name, mov.year]
} else {
println "Blacklisting $row"
imdb << [row[0], null]
}
}
}
}, 20)
// save updated imdb data
imdb.collect{ it.join('\t') }.join('\n').saveAs(imdb_tsv)
// save movie data
def movies = imdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }
def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
movies.each{ movieSorter.put(it[1], it) }
movies = movieSorter.values().collect{ it.join('\t') }
gz(movies_out, movies)
println "Movie Count: " + movies.size()
@ -36,27 +91,32 @@ println "Movie Count: " + movies.size()
// BUILD series.list.gz
def page = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
def names = page.fetch().getHtml('utf-8')
def thetvdb_index = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
def thetvdb_names = thetvdb_index.fetch().getHtml('UTF-8')
.depthFirst().TABLE.find{it['@id'] == "listtable"}
.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English'}
.findResults{ it.TD[0].A.text() }
if (names.size() == 0) {
throw new Exception("Failed to scrape series names")
def imdb_series_names = imdb.findAll{ it.size() >= 3 && it[1].startsWith('"') }.collect{ it[1] }
def anidb_names = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles().findResults{ [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten()
/*
def dokuwiki_index = new URL('http://docuwiki.net/postbot/getList.php?subject=Name')
def doku_names = []
dokuwiki_index.getText('UTF-8').eachLine{
doku_names << it.trim().replaceTrailingBrackets()
}
*/
def anime = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles()
names += anime.findResults{ it.getPrimaryTitle() }
names += anime.findResults{ it.getOfficialTitle('en') }
def names = [thetvdb_names, imdb_series_names, anidb_names]
names.each{ if (it.size() == 0) throw new Exception("Failed to scrape series names") } // sanity check
names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } // collect and normalize names
names = names.findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) }
def unique = new TreeSet(String.CASE_INSENSITIVE_ORDER)
unique.addAll(names)
names = unique as List
def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER)
seriesSorter.addAll(names)
names = seriesSorter as List
gz(s_out, names)
gz(series_out, names)
println "Series Count: " + names.size()

View File

@ -99,7 +99,7 @@ public class IMDbClient implements MovieIdentificationService {
if (header.toUpperCase().contains("(VG)")) // ignore video games
return null;
String name = selectString("//H1/A/text()", dom);
String name = selectString("//H1/A/text()", dom).replaceAll("\\s+", " ").trim();
String year = new Scanner(selectString("//H1/A/following::A/text()", dom)).useDelimiter("\\D+").next();
String url = selectString("//H1/A/@href", dom);
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, getImdbId(url));

View File

@ -2,6 +2,7 @@
(?-i:ENGLISH)
(?-i:FRENCH)
(?-i:GERMAN)
(?-i:LAB)
(?-i:LIMITED|LiMiTED)
(?-i:SPANISH)
(?-i:SWEDISH|SWEDiSH)
@ -32,16 +33,23 @@
^VCD$
^VIDEO_TS$
A.Release.Lounge
ABC
Anime[s]?
Arte
BBC
btarena.org
By.Cool.Release
CBC
CD[0]?[1-3]
Channel.4
Channel.5
CN
CVCD
DC
Demonoid
Director's.Cut
Directors.Cut
Discovery.Channel
docu
Dual.Audio
dubbed
@ -58,8 +66,10 @@ Fra
FRE
GER
Hard.Subbed
HBO
HDRip
Hindi
History.Channel
HQ
info
iNT
@ -69,6 +79,7 @@ ISO
iTA
iTALIA
jigaxx
k.tk.crew
KIDZCORNER
KOR
KORSUB
@ -78,9 +89,15 @@ mkvonly
Movies
MultiSub
MVGroup.org
National.Geographic
NFO
NG
NHK
NL
NL.Subs
NLT
o2.pl
PBS
Pre.?DVD
PROPER
PSP
@ -92,8 +109,8 @@ ReRip
RESYNC
RETAIL
RiffTrax
Sample
sample[s]?$
sample[s]?
SBS
Screenshot
ShareGo
ShareReactor
@ -121,10 +138,7 @@ UNCUT
unrated
unrated.edition
UsaBit.com
Video[s]?
www.speed.cd
www.torentz.3xforum.ro
www.Torrenting.com
www[.]
video[s]?
www[.][\w-.]+[.](com|net|tk|ro|cd)
xRipp
Zune

View File

@ -12,6 +12,7 @@
3LT0N
420RipZ
4HM
666
7SiNS
850105
a-S
@ -21,6 +22,8 @@ AaS
aBD
AbSurdity
aceford
ACF
AckTiv3
ADHD
AE
AEGiS
@ -47,15 +50,18 @@ ARiGOLD
ARROW
ArtSubs
ASAP
Atlas47
ATTENTATET
AVCHD
AVS720
AW
aWake
AXE
aXXo
AZuRRaY
babylonad
BAJSKORV
BaLD
BamHD
Barba
BaSS
@ -64,20 +70,25 @@ bc10
BDClub
BDiSC
beAst
BEEF.STEW
BeStDivX
BestHD
BiA
BiDA
Billman424
Blixten
BLOW
Blu-bits
BluDragon
BlueBird
blueF
Bluereaper
BlueTV
BLUEYES
blueZilla
BluWave
BMB
BoBo
BORGATA
bReAK
BrG
@ -86,13 +97,16 @@ BRMP
BRUTUS
BRZONE
BTSD
BTSFilms
BTT
BugZ
BULLDOZER
BUNNY
BurnFre
BWB
C4TV
CAMELOT
catflap
CBGB
CDD
CDDHD
@ -119,25 +133,31 @@ cntc
COALiTiON
Cocksure
COMPULSION
Connaz-AKA-MrPirate
cottage
COWiSO
CPtScene
CPY
CREEDANCE
CRF
CRIMSON
CRiSC
CROSSBOW
CRYS
CSHD
CTD
CtrlHD
CTU
CULTHD
CuMBuCKeTS
CYBERMEN
CyberTyger
D-Z0N3
D3Si
danger2u
danirl
Danny
Darkside.RG
DARM
DASH
DAW
@ -171,16 +191,22 @@ DiTa
DiVERSiTY
DivXNL
DivXNL-Team
DjRobo38
dmd
DMT
DnB
DNL
DNR
DOMiNO
DON
Donatello
DoNE
DOT
DOUBT
Dowcker
DOWN
DRHD
DrSn
DUPLI
DUQA
DutchReleaseTeam
@ -198,6 +224,7 @@ Ekolb
Electri4ka
ELECTRiC
Electrichka
ELiA
elizabethtga
EM0C0RE
EmC
@ -225,11 +252,13 @@ EXViD
eztv
FaNSuB
FASM
FEAR
FELONY
FFNDVD
FHD
FHM
FiCO
FiddleGoose
FiHTV
FilmHD
FiNaLe
@ -268,6 +297,7 @@ Gazdi
GB
GECKOS
GEHENNA
Genesis-RG
GFW
GFY
GiNJi
@ -276,6 +306,8 @@ Goblin10
Gogeta
GoLDSToNE
GOTHiC
greenbud1969
GREiD
GriOTS
Grond
gudhak
@ -286,6 +318,7 @@ HaB
HAGGiS
HAiDEAF
HALCYON
Hammer71
HANGOVER
hannibal
HCA
@ -317,10 +350,15 @@ HDX
HDxT
Helix
HHH
HHI
HIDD3N
HiDt
HiFi
HiGHTIMES
HiNT
Hivrolta
HLS
HNR
HoodBag
HORiZON
HOWL
@ -340,6 +378,8 @@ IGUANA
iKA
iLG
iLL
iLLUSiON
imacRuel1
iMAGiNE
iMBT
IMF
@ -352,7 +392,9 @@ iNGOT
InSaNiTy
iNSECTS
iNSPiRED
IntelliQ
iNTERNAL
iNTiMiD
INtL
iNVANDRAREN
iON
@ -361,46 +403,64 @@ ITZ
Japhson
JAVLiU
JCH
jedi
JENC
JFKXVID
JJH
JoLLyRoGeR
Jozzep
K-F
k2
KaKa
kamera
KEG
keltz
KiLT
KiNGS
kirklestat
KLAXXON
KlockreN
KNIGHTY1973
KOENiG
Koffe
Kole
KonzillaRG
KooKoo
KRaLiMaRKo
Kuth
KYR
Kyuubi
LamB
Larceny
LCHD
leetay
LEVERAGE
LEViTY
LiMiTED
LiPAN
LMAO
LMG
LoD
LOL
LOLCATS
LoneWolf
LOST
LP
lrc
LRH
LTRG
LTT
LUSO
M794
MACHD
macro
madeec
MAGiCAL
MAGiCViBE
MAiN
MainEvent
MARiNES
marioBombo
MAXSPEED
MC
MCR
@ -408,12 +468,16 @@ med
MEDiAMANiACS
MEDiEVAL
MELiTE
Mental.RG
MeTH
METiS
MHQ
Mikoto
MiND
MiNT
MiRAGETV
Mish
MissRipZ
MMI
MoF
MOMENTUM
@ -434,8 +498,11 @@ N-F
NaRB
Narutoverse
NBS
NDRT
NeDiVx
NEPTUNE
NERDHD
NeRoZ
NEW.SOURCE
NewArtRiot
NFHD
@ -444,6 +511,7 @@ NGXHD
NhaNc3
NiBURU
NiF
NikonXP
Nile
NiX
NL.Subs
@ -460,14 +528,19 @@ NPW
NSUBS
NT
NTb
NTF
NuMy
NUXX
NWO
NyTT
OAS
Occor
OEM
OEM1080
Omifast
OmU
ONYX
OPT!V!D
ORC
ORENJi
ORPHEUS
@ -476,9 +549,11 @@ OSiTV
OUTDATED
OZC
P0W4
P4DGE
Pa@Ph
PADDO
papi
PAROVOZ
PARTiCLE
PaYxXx
PeeWee
@ -502,9 +577,11 @@ PoTuS
PP
PPQ
PRECiOUS
prevail
Prime
PriMeHD
PRiNCE
prithwi
PRoDJi
PROGRESS
PROPHETS
@ -523,16 +600,19 @@ Purana
PURE
PUZZLE
PxHD
PZE
Q0S
QCF
QDP
QiX
QSP
Quali.SlaYer
QXE
R&C
rabomil
RANDi
RAP
Rare.Share
Razor1911
Reaperza
REAVERS
@ -541,6 +621,8 @@ REFiNED
RELOADED
Republic
REPULSiON
RETRO
Rets
REVEiLLE
REWARD
RightSiZE
@ -549,8 +631,10 @@ RiPTATORz
RiTALiX
RiVER
RMT
RoCK&BlueLadyRG
RoCKRioT
ROVERS
RS
RSG
RTA
RUBY
@ -568,18 +652,22 @@ SAMFD
SANTI
SAPHiRE
Sapphire
SATIVA
SChiZO
Scratch404
Scratched
SCREAM
ScWb
SecretMyth
SECTOR7
SEMTEX
SEPTiC
SEVcD
SEVENTWENTY
SexSh0p
SFM
SGKK
Shadow
Shadowman
SHAMNBOYZ
SHDXXX
@ -587,16 +675,22 @@ shortbrehd
SHS
SHUNPO
SiC
sickboy88
SiGHTHD
SiHD
SiLU
SINISTER
SiNNERS
SiRiUs.sHaRe
SiTV
SKALiWAGZ
SkipTowne
SKYLIGHT
SLM
SLO
SLOMO
SMoKeR
Smurfenlars
Sneak
SNUGGLER
SoCkS
@ -607,13 +701,16 @@ SPARKS
SPOOKY
sprinter
SSF
STAGEMAN
Stealthmaster
stieg
stoffinho17
Stranded
streetwars
STV
Subject16
SuBoXoNe
SUBZERO
SUNSPOT
SURFER
SVD
@ -628,16 +725,22 @@ TASTE
TASTETV
TB
TDF
TDR
TeamRV
TELEFLiX
TENEIGHTY
TeNNReeD
TERRA
terribleHD
terribleSD
TFE
THENiGHTMAREiNHD
TheWretched
Thizz
THOR
THORA
THUGLiNE
THUNDER
TiDE
TiMELORDS
TiMPE
@ -653,8 +756,10 @@ TOPAZ
TorrenTGui
tpz
trentalent
TRiMEDIA
TRiPS
TrollHD
trosa
TruCK
tRuE
TRUEFRENCH
@ -663,6 +768,7 @@ TsH
tsn
TURKiSO
TUSAHD
Tushar
TVA
TW
TWiZTED
@ -676,12 +782,15 @@ USELESS
UVall
VaAr3
VALiOMEDiA
VALKYRiA
VAMPS
Vanillapunk
VanRay
VCDVaULT
VeGaN
Vegapunk
VeggTeppe
Vex
ViCiOsO
ViKAT
ViNYL
@ -712,9 +821,11 @@ WiRE
WLM
WoLF
Wolky
WoRKZ
WPi
WRCR
WuSiWuG
XanaX
Xander
XiA
XOR
@ -735,4 +846,9 @@ Yibis
YoHo
YOUFORGOTTOREPACKTHIS
ZBS
ZEKTORM
ZEN
Zeus.Dias
ZMG
Zox
Zuzuu

View File

@ -1,18 +1,27 @@
// filebot -script "http://filebot.sf.net/scripts/cleaner.groovy" -trust-script /path/to/media/
// filebot -script "http://filebot.sf.net/scripts/cleaner.groovy" [--action test] /path/to/media/
/*
* Delete orphaned "clutter" files like nfo, jpg, etc
* Delete orphaned "clutter" files like nfo, jpg, etc and sample files
*/
def isClutter(file) {
return file.hasExtension("nfo", "txt", "jpg", "jpeg")
def isClutter(f) {
// clutter = the path contains one of these terms as a whole word (case-insensitive), or the file has one of the listed extensions
f.path =~ /\b(?i:sample|trailer|extras|deleted.scenes|music.video|scrapbook)\b/ || f.hasExtension("jpg", "jpeg", "png", "gif", "nfo", "xml", "htm", "html", "log", "srt", "sub", "idx", "md5", "sfv", "txt", "rtf", "url", "db", "dna")
}
/*
 * Announce the deletion of the given file or folder and, unless running with
 * --action test, actually delete it. Returns false on a dry run, otherwise
 * the result of the delete call.
 */
def clean(f) {
	println "Delete $f"
	
	// --action test means dry run: report only, leave everything on disk
	def dryRun = (_args.action == 'test')
	if (dryRun) {
		return false
	}
	
	// folders are removed recursively, plain files directly
	if (f.isDirectory()) {
		return f.deleteDir()
	}
	return f.delete()
}
// delete clutter files in orphaned media folders
args.getFiles{ isClutter(it) && !it.dir.hasFile{ it.isVideo() }}.each {
println "Delete file $it: " + it.delete()
}
args.getFiles{ isClutter(it) && !it.dir.hasFile{ (it.isVideo() || it.isAudio()) && !isClutter(it) }}.each { clean(it) }
// delete empty folders but exclude roots
args.getFolders{ it.getFiles().isEmpty() && !args.contains(it) }.each {
println "Delete dir $it: " + it.deleteDir()
}
// delete empty folders but exclude given args
args.getFolders{ it.listFiles().length == 0 && !args.contains(it) }.each { clean(it) }