From d29fe4939090f1e3905c6eed6ae69c3e96e3a423 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Fri, 13 Jul 2012 11:41:50 +0000 Subject: [PATCH] * build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc --- BuildData.groovy | 108 ++++++++++++---- .../sourceforge/filebot/web/IMDbClient.java | 2 +- website/data/query-blacklist.txt | 30 +++-- website/data/release-groups.txt | 116 ++++++++++++++++++ website/scripts/cleaner.groovy | 31 +++-- 5 files changed, 243 insertions(+), 44 deletions(-) diff --git a/BuildData.groovy b/BuildData.groovy index 6ff27c19..9e901eda 100644 --- a/BuildData.groovy +++ b/BuildData.groovy @@ -1,7 +1,28 @@ -// filebot -script BuildData.groovy -trust-script +// filebot -script BuildData.groovy -def s_out = new File("website/data/series.list.gz") -def m_out = new File("website/data/movies.txt.gz") + +def sortRegexList(path) { + def set = new TreeSet(String.CASE_INSENSITIVE_ORDER) + new File(path).eachLine{ + // check if regex compiles + set += java.util.regex.Pattern.compile(it).pattern() + } + + def out = set.join('\n').saveAs(path) + println "$out\n$out.text\n" +} + + +// sort and check shared regex collections +sortRegexList("website/data/release-groups.txt") +sortRegexList("website/data/query-blacklist.txt") + + +// ------------------------------------------------------------------------- // + + +def series_out = new File("website/data/series.list.gz") +def movies_out = new File("website/data/movies.txt.gz") def gz(file, lines) { file.withOutputStream{ out -> @@ -15,20 +36,54 @@ def gz(file, lines) { // ------------------------------------------------------------------------- // -// BUILD movies.txt.gz -def tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php") -def movies = [] +// LOAD osdb-imdb.txt (already verified data) +def imdb_tsv = new File("website/data/osdb-imdb.txt") +def imdb = [].asSynchronized() // thread-safe list -tsv.text.eachLine{ +imdb_tsv.getText('UTF-8').eachLine{ + imdb << it.split(/\t/) +} +imdb_ids = new HashSet(imdb.collect{ it[0] }) + +// BUILD movies.txt.gz +def osdb_tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php") +def osdb = [] +osdb_tsv.getText('UTF-8').eachLine{ def line = it.split(/\t/)*.replaceAll(/\s+/, ' ')*.trim() if (line.size() == 4 && line[0] =~ /\d+/) { - movies.add([line[1].toInteger(), line[2], line[3].toInteger()]) + osdb << [line[1].toInteger(), line[2], line[3].toInteger()] } } +osdb = osdb.findAll{ it[0] <= 9999999 && it[2] >= 1930 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.collect{ [it[0].pad(7), it[1], it[2]] } -movies = movies.findAll{ it[0] <= 9999999 && it[2] >= 1930 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.sort{ it[1] } -gz(m_out, movies.collect{ [it[0].pad(7), it[1], it[2]].join('\t') }) +parallel(osdb.collect{ row -> + return { + // update new data + if (!imdb_ids.contains(row[0])) { + def mov = net.sourceforge.filebot.WebServices.IMDb.getMovieDescriptor(row[0] as int, null) + if (mov != null && mov.name.length() > 0 && mov.year > 0) { + println "Adding $mov" + imdb << [row[0], mov.name, mov.year] + } else { + println "Blacklisting $row" + imdb << [row[0], null] + } + } + } +}, 20) + +// save updated imdb data +imdb.collect{ it.join('\t') }.join('\n').saveAs(imdb_tsv) + +// save movie data +def movies = imdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') } + +def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER) +movies.each{ movieSorter.put(it[1], it) } +movies = movieSorter.values().collect{ it.join('\t') } + +gz(movies_out, movies) println "Movie Count: " + movies.size() @@ -36,27 +91,32 @@ println "Movie Count: " + movies.size() // BUILD series.list.gz -def page = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search') - -def names = page.fetch().getHtml('utf-8') +def thetvdb_index = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search') +def thetvdb_names = thetvdb_index.fetch().getHtml('UTF-8') .depthFirst().TABLE.find{it['@id'] == "listtable"} .depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English'} .findResults{ it.TD[0].A.text() } -if (names.size() == 0) { - throw new Exception("Failed to scrape series names") + +def imdb_series_names = imdb.findAll{ it.size() >= 3 && it[1].startsWith('"') }.collect{ it[1] } +def anidb_names = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles().findResults{ [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten() + +/* +def dokuwiki_index = new URL('http://docuwiki.net/postbot/getList.php?subject=Name') +def doku_names = [] +dokuwiki_index.getText('UTF-8').eachLine{ + doku_names << it.trim().replaceTrailingBrackets() } +*/ -def anime = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles() -names += anime.findResults{ it.getPrimaryTitle() } -names += anime.findResults{ it.getOfficialTitle('en') } +def names = [thetvdb_names, imdb_series_names, anidb_names] +names.each{ if (it.size() == 0) throw new Exception("Failed to scrape series names") } // sanity check +names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } // collect and normalize names -names = names.findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } - -def unique = new TreeSet(String.CASE_INSENSITIVE_ORDER) -unique.addAll(names) -names = unique as List +def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER) +seriesSorter.addAll(names) +names = seriesSorter as List -gz(s_out, names) +gz(series_out, names) println "Series Count: " + names.size() diff --git a/source/net/sourceforge/filebot/web/IMDbClient.java b/source/net/sourceforge/filebot/web/IMDbClient.java index 2925c05a..cb14ff04 100644 --- a/source/net/sourceforge/filebot/web/IMDbClient.java +++ b/source/net/sourceforge/filebot/web/IMDbClient.java @@ -99,7 +99,7 @@ public class IMDbClient implements MovieIdentificationService { if (header.toUpperCase().contains("(VG)")) // ignore video games return null; - String name = selectString("//H1/A/text()", dom); + String name = selectString("//H1/A/text()", dom).replaceAll("\\s+", " ").trim(); String year = new Scanner(selectString("//H1/A/following::A/text()", dom)).useDelimiter("\\D+").next(); String url = selectString("//H1/A/@href", dom); return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, getImdbId(url)); diff --git a/website/data/query-blacklist.txt b/website/data/query-blacklist.txt index cd83f953..d50ed7d9 100644 --- a/website/data/query-blacklist.txt +++ b/website/data/query-blacklist.txt @@ -2,6 +2,7 @@ (?-i:ENGLISH) (?-i:FRENCH) (?-i:GERMAN) +(?-i:LAB) (?-i:LIMITED|LiMiTED) (?-i:SPANISH) (?-i:SWEDISH|SWEDiSH) @@ -32,16 +33,23 @@ ^VCD$ ^VIDEO_TS$ A.Release.Lounge +ABC Anime[s]? +Arte BBC +btarena.org By.Cool.Release +CBC CD[0]?[1-3] +Channel.4 +Channel.5 CN CVCD DC Demonoid Director's.Cut Directors.Cut +Discovery.Channel docu Dual.Audio dubbed @@ -58,8 +66,10 @@ Fra FRE GER Hard.Subbed +HBO HDRip Hindi +History.Channel HQ info iNT @@ -69,6 +79,7 @@ ISO iTA iTALIA jigaxx +k.tk.crew KIDZCORNER KOR KORSUB @@ -78,9 +89,15 @@ mkvonly Movies MultiSub MVGroup.org +National.Geographic +NFO +NG +NHK NL NL.Subs NLT +o2.pl +PBS Pre.?DVD PROPER PSP @@ -92,8 +109,8 @@ ReRip RESYNC RETAIL RiffTrax -Sample -sample[s]?$ +sample[s]? +SBS Screenshot ShareGo ShareReactor @@ -121,10 +138,7 @@ UNCUT unrated unrated.edition UsaBit.com -Video[s]? -www.speed.cd -www.torentz.3xforum.ro -www.Torrenting.com -www[.] +video[s]? +www[.][\w-.]+[.](com|net|tk|ro|cd) xRipp -Zune +Zune \ No newline at end of file diff --git a/website/data/release-groups.txt b/website/data/release-groups.txt index a19f4efc..153bb531 100644 --- a/website/data/release-groups.txt +++ b/website/data/release-groups.txt @@ -12,6 +12,7 @@ 3LT0N 420RipZ 4HM +666 7SiNS 850105 a-S @@ -21,6 +22,8 @@ AaS aBD AbSurdity aceford +ACF +AckTiv3 ADHD AE AEGiS @@ -47,15 +50,18 @@ ARiGOLD ARROW ArtSubs ASAP +Atlas47 ATTENTATET AVCHD AVS720 AW aWake +AXE aXXo AZuRRaY babylonad BAJSKORV +BaLD BamHD Barba BaSS @@ -64,20 +70,25 @@ bc10 BDClub BDiSC beAst +BEEF.STEW BeStDivX BestHD BiA BiDA +Billman424 +Blixten BLOW Blu-bits BluDragon BlueBird blueF +Bluereaper BlueTV BLUEYES blueZilla BluWave BMB +BoBo BORGATA bReAK BrG @@ -86,13 +97,16 @@ BRMP BRUTUS BRZONE BTSD +BTSFilms BTT BugZ BULLDOZER BUNNY +BurnFre BWB C4TV CAMELOT +catflap CBGB CDD CDDHD @@ -119,25 +133,31 @@ cntc COALiTiON Cocksure COMPULSION +Connaz-AKA-MrPirate cottage COWiSO CPtScene CPY +CREEDANCE CRF CRIMSON CRiSC CROSSBOW CRYS CSHD +CTD CtrlHD CTU CULTHD CuMBuCKeTS CYBERMEN +CyberTyger D-Z0N3 D3Si danger2u danirl +Danny +Darkside.RG DARM DASH DAW @@ -171,16 +191,22 @@ DiTa DiVERSiTY DivXNL DivXNL-Team +DjRobo38 +dmd DMT DnB DNL DNR +DOMiNO DON +Donatello DoNE DOT DOUBT +Dowcker DOWN DRHD +DrSn DUPLI DUQA DutchReleaseTeam @@ -198,6 +224,7 @@ Ekolb Electri4ka ELECTRiC Electrichka +ELiA elizabethtga EM0C0RE EmC @@ -225,11 +252,13 @@ EXViD eztv FaNSuB FASM +FEAR FELONY FFNDVD FHD FHM FiCO +FiddleGoose FiHTV FilmHD FiNaLe @@ -268,6 +297,7 @@ Gazdi GB GECKOS GEHENNA +Genesis-RG GFW GFY GiNJi @@ -276,6 +306,8 @@ Goblin10 Gogeta GoLDSToNE GOTHiC +greenbud1969 +GREiD GriOTS Grond gudhak @@ -286,6 +318,7 @@ HaB HAGGiS HAiDEAF HALCYON +Hammer71 HANGOVER hannibal HCA @@ -317,10 +350,15 @@ HDX HDxT Helix HHH +HHI +HIDD3N HiDt HiFi HiGHTIMES HiNT +Hivrolta +HLS +HNR HoodBag HORiZON HOWL @@ -340,6 +378,8 @@ IGUANA iKA iLG iLL +iLLUSiON +imacRuel1 iMAGiNE iMBT IMF @@ -352,7 +392,9 @@ iNGOT InSaNiTy iNSECTS iNSPiRED +IntelliQ iNTERNAL +iNTiMiD INtL iNVANDRAREN iON @@ -361,46 +403,64 @@ ITZ Japhson JAVLiU JCH +jedi JENC +JFKXVID JJH JoLLyRoGeR +Jozzep K-F k2 KaKa kamera +KEG keltz +KiLT KiNGS +kirklestat KLAXXON KlockreN +KNIGHTY1973 KOENiG +Koffe +Kole KonzillaRG +KooKoo KRaLiMaRKo +Kuth KYR Kyuubi LamB Larceny LCHD +leetay LEVERAGE LEViTY +LiMiTED LiPAN LMAO +LMG LoD LOL LOLCATS LoneWolf LOST LP +lrc +LRH LTRG LTT LUSO M794 MACHD macro +madeec MAGiCAL MAGiCViBE MAiN MainEvent MARiNES +marioBombo MAXSPEED MC MCR @@ -408,12 +468,16 @@ med MEDiAMANiACS MEDiEVAL MELiTE +Mental.RG MeTH METiS MHQ +Mikoto MiND MiNT MiRAGETV +Mish +MissRipZ MMI MoF MOMENTUM @@ -434,8 +498,11 @@ N-F NaRB Narutoverse NBS +NDRT NeDiVx +NEPTUNE NERDHD +NeRoZ NEW.SOURCE NewArtRiot NFHD @@ -444,6 +511,7 @@ NGXHD NhaNc3 NiBURU NiF +NikonXP Nile NiX NL.Subs @@ -460,14 +528,19 @@ NPW NSUBS NT NTb +NTF +NuMy +NUXX NWO NyTT OAS +Occor OEM OEM1080 Omifast OmU ONYX +OPT!V!D ORC ORENJi ORPHEUS @@ -476,9 +549,11 @@ OSiTV OUTDATED OZC P0W4 +P4DGE Pa@Ph PADDO papi +PAROVOZ PARTiCLE PaYxXx PeeWee @@ -502,9 +577,11 @@ PoTuS PP PPQ PRECiOUS +prevail Prime PriMeHD PRiNCE +prithwi PRoDJi PROGRESS PROPHETS @@ -523,16 +600,19 @@ Purana PURE PUZZLE PxHD +PZE Q0S QCF QDP QiX QSP +Quali.SlaYer QXE R&C rabomil RANDi RAP +Rare.Share Razor1911 Reaperza REAVERS @@ -541,6 +621,8 @@ REFiNED RELOADED Republic REPULSiON +RETRO +Rets REVEiLLE REWARD RightSiZE @@ -549,8 +631,10 @@ RiPTATORz RiTALiX RiVER RMT +RoCK&BlueLadyRG RoCKRioT ROVERS +RS RSG RTA RUBY @@ -568,18 +652,22 @@ SAMFD SANTI SAPHiRE Sapphire +SATIVA SChiZO Scratch404 Scratched +SCREAM ScWb SecretMyth SECTOR7 SEMTEX SEPTiC +SEVcD SEVENTWENTY SexSh0p SFM SGKK +Shadow Shadowman SHAMNBOYZ SHDXXX @@ -587,16 +675,22 @@ shortbrehd SHS SHUNPO SiC +sickboy88 SiGHTHD SiHD SiLU +SINISTER SiNNERS +SiRiUs.sHaRe SiTV SKALiWAGZ +SkipTowne +SKYLIGHT SLM SLO SLOMO SMoKeR +Smurfenlars Sneak SNUGGLER SoCkS @@ -607,13 +701,16 @@ SPARKS SPOOKY sprinter SSF +STAGEMAN Stealthmaster stieg +stoffinho17 Stranded streetwars STV Subject16 SuBoXoNe +SUBZERO SUNSPOT SURFER SVD @@ -628,16 +725,22 @@ TASTE TASTETV TB TDF +TDR +TeamRV TELEFLiX TENEIGHTY +TeNNReeD TERRA terribleHD terribleSD +TFE THENiGHTMAREiNHD TheWretched +Thizz THOR THORA THUGLiNE +THUNDER TiDE TiMELORDS TiMPE @@ -653,8 +756,10 @@ TOPAZ TorrenTGui tpz trentalent +TRiMEDIA TRiPS TrollHD +trosa TruCK tRuE TRUEFRENCH @@ -663,6 +768,7 @@ TsH tsn TURKiSO TUSAHD +Tushar TVA TW TWiZTED @@ -676,12 +782,15 @@ USELESS UVall VaAr3 VALiOMEDiA +VALKYRiA VAMPS Vanillapunk VanRay VCDVaULT VeGaN Vegapunk +VeggTeppe +Vex ViCiOsO ViKAT ViNYL @@ -712,9 +821,11 @@ WiRE WLM WoLF Wolky +WoRKZ WPi WRCR WuSiWuG +XanaX Xander XiA XOR @@ -735,4 +846,9 @@ Yibis YoHo YOUFORGOTTOREPACKTHIS ZBS +ZEKTORM +ZEN +Zeus.Dias ZMG +Zox +Zuzuu \ No newline at end of file diff --git a/website/scripts/cleaner.groovy b/website/scripts/cleaner.groovy index de7868a9..293a1b13 100644 --- a/website/scripts/cleaner.groovy +++ b/website/scripts/cleaner.groovy @@ -1,18 +1,27 @@ -// filebot -script "http://filebot.sf.net/scripts/cleaner.groovy" -trust-script /path/to/media/ +// filebot -script "http://filebot.sf.net/scripts/cleaner.groovy" [--action test] /path/to/media/ /* - * Delete orphaned "clutter" files like nfo, jpg, etc + * Delete orphaned "clutter" files like nfo, jpg, etc and sample files */ -def isClutter(file) { - return file.hasExtension("nfo", "txt", "jpg", "jpeg") +def isClutter(f) { + f.path =~ /\b(?i:sample|trailer|extras|deleted.scenes|music.video|scrapbook)\b/ || f.hasExtension("jpg", "jpeg", "png", "gif", "nfo", "xml", "htm", "html", "log", "srt", "sub", "idx", "md5", "sfv", "txt", "rtf", "url", "db", "dna") } + +def clean(f) { + println "Delete $f" + + // do a dry run via --action test + if (_args.action == 'test') { + return false + } + + return f.isDirectory() ? f.deleteDir() : f.delete() +} + + // delete clutter files in orphaned media folders -args.getFiles{ isClutter(it) && !it.dir.hasFile{ it.isVideo() }}.each { - println "Delete file $it: " + it.delete() -} +args.getFiles{ isClutter(it) && !it.dir.hasFile{ (it.isVideo() || it.isAudio()) && !isClutter(it) }}.each { clean(it) } -// delete empty folders but exclude roots -args.getFolders{ it.getFiles().isEmpty() && !args.contains(it) }.each { - println "Delete dir $it: " + it.deleteDir() -} +// delete empty folders but exclude given args +args.getFolders{ it.listFiles().length == 0 && !args.contains(it) }.each { clean(it) }