diff --git a/source/net/sourceforge/filebot/WebServices.java b/source/net/sourceforge/filebot/WebServices.java index 937fa043..8d6fb54c 100644 --- a/source/net/sourceforge/filebot/WebServices.java +++ b/source/net/sourceforge/filebot/WebServices.java @@ -39,27 +39,27 @@ public final class WebServices { // movie dbs public static final TMDbClient TMDb = new TMDbClient(getApplicationProperty("themoviedb.apikey")); - + public static EpisodeListProvider[] getEpisodeListProviders() { return new EpisodeListProvider[] { TVRage, AniDB, IMDb, TheTVDB, Serienjunkies }; } - + public static MovieIdentificationService[] getMovieIdentificationServices() { - return new MovieIdentificationService[] { OpenSubtitles, TMDb }; + return new MovieIdentificationService[] { OpenSubtitles, IMDb, TMDb }; } - + public static SubtitleProvider[] getSubtitleProviders() { return new SubtitleProvider[] { OpenSubtitles, Sublight, Subscene }; } - + public static VideoHashSubtitleService[] getVideoHashSubtitleServices() { return new VideoHashSubtitleService[] { OpenSubtitles, Sublight }; } - + public static EpisodeListProvider getEpisodeListProvider(String name) { for (EpisodeListProvider it : WebServices.getEpisodeListProviders()) { if (it.getName().equalsIgnoreCase(name)) @@ -69,7 +69,7 @@ public final class WebServices { return null; // default } - + public static MovieIdentificationService getMovieIdentificationService(String name) { for (MovieIdentificationService it : getMovieIdentificationServices()) { if (it.getName().equalsIgnoreCase(name)) @@ -79,7 +79,7 @@ public final class WebServices { return null; // default } - + /** * Dummy constructor to prevent instantiation. */ diff --git a/source/net/sourceforge/filebot/web/IMDbClient.java b/source/net/sourceforge/filebot/web/IMDbClient.java index 27bfe606..115967db 100644 --- a/source/net/sourceforge/filebot/web/IMDbClient.java +++ b/source/net/sourceforge/filebot/web/IMDbClient.java @@ -5,6 +5,7 @@ package net.sourceforge.filebot.web; import static net.sourceforge.filebot.web.WebRequest.*; import static net.sourceforge.tuned.XPathUtilities.*; +import java.io.File; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; @@ -27,36 +28,34 @@ import net.sf.ehcache.CacheManager; import net.sourceforge.filebot.ResourceManager; -public class IMDbClient extends AbstractEpisodeListProvider { +public class IMDbClient extends AbstractEpisodeListProvider implements MovieIdentificationService { private final String host = "www.imdb.com"; - + @Override public String getName() { return "IMDb"; } - + @Override public Icon getIcon() { return ResourceManager.getIcon("search.imdb"); } - + @Override public ResultCache getCache() { return new ResultCache(host, CacheManager.getInstance().getCache("web-datasource")); } - + @Override public List fetchSearchResult(String query, Locale locale) throws IOException, SAXException { - URL searchUrl = new URL("http", host, "/find?s=tt&q=" + encode(query)); - Document dom = getHtmlDocument(openConnection(searchUrl)); + Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query))); List nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'series')]]", dom); - List results = new ArrayList(nodes.size()); for (Node node : nodes) { @@ -69,25 +68,20 @@ public class IMDbClient extends AbstractEpisodeListProvider { // we might have been redirected to the movie page if (results.isEmpty()) { - try { - String name = normalizeName(selectString("//H1/text()", dom)); - String year = new Scanner(selectString("//H1//SPAN", dom)).useDelimiter("\\D+").next(); - String url = selectString("//LINK[@rel='canonical']/@href", dom); - - results.add(new Movie(name, Integer.parseInt(year), getImdbId(url))); - } catch (Exception e) { - // ignore, we probably got redirected to an error page + Movie movie = scrapeMovie(dom); + if (movie != null) { + results.add(movie); } } return results; } - + @Override public List fetchEpisodeList(SearchResult searchResult, Locale locale) throws IOException, SAXException { Movie movie = (Movie) searchResult; - Document dom = getHtmlDocument(openConnection(getEpisodeListLink(searchResult).toURL())); + Document dom = parsePage(getEpisodeListLink(searchResult).toURL()); String seriesName = normalizeName(selectString("//H1/A", dom)); Date year = new Date(movie.getYear(), 0, 0); @@ -111,23 +105,13 @@ public class IMDbClient extends AbstractEpisodeListProvider { return episodes; } - - protected URLConnection openConnection(URL url) throws IOException { - URLConnection connection = url.openConnection(); - - // IMDb refuses default user agent (Java/1.6.0_12) - connection.addRequestProperty("User-Agent", "Scraper"); - - return connection; - } - protected String normalizeName(String name) { // remove quotation marks return name.replaceAll("\"", ""); } - + protected int getImdbId(String link) { Matcher matcher = Pattern.compile("tt(\\d{7})").matcher(link); @@ -139,13 +123,13 @@ public class IMDbClient extends AbstractEpisodeListProvider { throw new IllegalArgumentException(String.format("Cannot find imdb id: %s", link)); } - + @Override public URI getEpisodeListLink(SearchResult searchResult) { return getEpisodeListLink(searchResult, 0); } - + @Override public URI getEpisodeListLink(SearchResult searchResult, int season) { try { @@ -154,4 +138,72 @@ public class IMDbClient extends AbstractEpisodeListProvider { throw new RuntimeException(e); } } + + + @Override + public List searchMovie(String query, Locale locale) throws Exception { + Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query))); + + // select movie links followed by year in parenthesis + List nodes = selectNodes("//TABLE//A[string-length(substring-after(substring-before(following::text(),')'),'(')) = 4 and count(following-sibling::SMALL) = 0]", dom); + List results = new ArrayList(nodes.size()); + + for (Node node : nodes) { + String name = node.getTextContent().trim(); + String year = node.getNextSibling().getTextContent().trim().replaceAll("[\\p{Punct}\\p{Space}]+", ""); // remove non-number characters + String href = getAttribute("href", node); + + try { + results.add(new Movie(name, Integer.parseInt(year), getImdbId(href))); + } catch (NumberFormatException e) { + // ignore illegal movies (TV Shows, Videos, Video Games, etc) + } + } + + // we might have been redirected to the movie page + if (results.isEmpty()) { + Movie movie = scrapeMovie(dom); + if (movie != null) { + results.add(movie); + } + } + + return results; + } + + + protected Movie scrapeMovie(Document dom) { + try { + String name = normalizeName(selectString("//H1/text()", dom)); + String year = new Scanner(selectString("//H1//SPAN", dom)).useDelimiter("\\D+").next(); + String url = selectString("//LINK[@rel='canonical']/@href", dom); + return new Movie(name, Integer.parseInt(year), getImdbId(url)); + } catch (Exception e) { + // ignore, we probably got redirected to an error page + return null; + } + } + + + @Override + public Movie getMovieDescriptor(int imdbid, Locale locale) throws Exception { + return scrapeMovie(parsePage(new URL("http", host, String.format("/title/tt%07d/", imdbid)))); + } + + + protected Document parsePage(URL url) throws IOException, SAXException { + URLConnection connection = url.openConnection(); + + // IMDb refuses default user agent (Java/1.6.0_12) + connection.addRequestProperty("User-Agent", "Mozilla"); + + return getHtmlDocument(connection); + } + + + @Override + public Movie[] getMovieDescriptors(File[] movieFiles, Locale locale) throws Exception { + return new Movie[movieFiles.length]; // UNSUPPORTED OPERATION => EMPTY RESULT + } + } diff --git a/source/net/sourceforge/filebot/web/WebRequest.java b/source/net/sourceforge/filebot/web/WebRequest.java index 1eca5123..921557c4 100644 --- a/source/net/sourceforge/filebot/web/WebRequest.java +++ b/source/net/sourceforge/filebot/web/WebRequest.java @@ -98,7 +98,11 @@ public final class WebRequest { public static Document getDocument(InputSource source) throws IOException, SAXException { try { - return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setValidating(false); + factory.setFeature("http://xml.org/sax/features/namespaces", false); + factory.setFeature("http://xml.org/sax/features/validation", false); + return factory.newDocumentBuilder().parse(source); } catch (ParserConfigurationException e) { // will never happen throw new RuntimeException(e); diff --git a/test/net/sourceforge/filebot/web/IMDbClientTest.java b/test/net/sourceforge/filebot/web/IMDbClientTest.java index 84871c28..809c3d29 100644 --- a/test/net/sourceforge/filebot/web/IMDbClientTest.java +++ b/test/net/sourceforge/filebot/web/IMDbClientTest.java @@ -13,7 +13,7 @@ public class IMDbClientTest { private final IMDbClient imdb = new IMDbClient(); - + @Test public void search() throws Exception { List results = imdb.search("battlestar"); @@ -27,7 +27,7 @@ public class IMDbClientTest { assertEquals(8, results.size(), 0); } - + @Test public void searchMiniSeries() throws Exception { List results = imdb.search("generation kill"); @@ -39,7 +39,7 @@ public class IMDbClientTest { assertEquals(995832, movie.getImdbId(), 0); } - + @Test public void searchNoMatch() throws Exception { List results = imdb.search("i will not find anything for this query string"); @@ -47,7 +47,7 @@ public class IMDbClientTest { assertTrue(results.isEmpty()); } - + @Test public void searchResultPageRedirect() throws Exception { List results = imdb.search("my name is earl"); @@ -61,7 +61,7 @@ public class IMDbClientTest { assertEquals(460091, movie.getImdbId(), 0); } - + @Test public void getEpisodeList() throws Exception { List list = imdb.getEpisodeList(new Movie("Buffy", 1997, 118276)); @@ -87,7 +87,7 @@ public class IMDbClientTest { assertEquals("2003-05-20", last.airdate().toString()); } - + @Test public void getEpisodeListWithUnknownSeason() throws Exception { List list = imdb.getEpisodeList(new Movie("Mushishi", 2005, 807832)); @@ -103,7 +103,43 @@ public class IMDbClientTest { assertEquals("1", first.getSeason().toString()); } - + + @Test + public void searchMovie() throws Exception { + List results = imdb.searchMovie("Avatar", null); + + assertEquals(26, results.size()); + Movie movie = (Movie) results.get(0); + + assertEquals("Avatar", movie.getName()); + assertEquals(2009, movie.getYear()); + assertEquals(499549, movie.getImdbId(), 0); + } + + + @Test + public void searchMovieRedirect() throws Exception { + List results = imdb.searchMovie("battle angel alita", null); + + assertEquals(1, results.size()); + Movie movie = (Movie) results.get(0); + + assertEquals("Battle Angel", movie.getName()); + assertEquals(1993, movie.getYear()); + assertEquals(107061, movie.getImdbId(), 0); + } + + + @Test + public void getMovieDescriptor() throws Exception { + Movie movie = imdb.getMovieDescriptor(499549, null); + + assertEquals("Avatar", movie.getName()); + assertEquals(2009, movie.getYear()); + assertEquals(499549, movie.getImdbId(), 0); + } + + @Test public void getEpisodeListLink() throws Exception { assertEquals("http://www.imdb.com/title/tt0407362/episodes", imdb.getEpisodeListLink(new Movie("Battlestar Galactica", 2004, 407362)).toString()); diff --git a/website/data/query-blacklist.txt b/website/data/query-blacklist.txt index 0de45429..f69cb495 100644 --- a/website/data/query-blacklist.txt +++ b/website/data/query-blacklist.txt @@ -12,6 +12,7 @@ PROPER READNFO REPACK RETAIL +HDRip sample[s]?$ ShareReactor ShareZONE