From d3347d19d9b5a87fb0ff5e4bcb597ebf1a53671e Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Sun, 2 Dec 2012 09:36:59 +0000 Subject: [PATCH] * scrape info from main movie page rather than releaseinfo * spoof googlebot http headers to trick imdb geo-localisation * fix imdb url encoding issues --- .../sourceforge/filebot/web/IMDbClient.java | 44 +++++-------------- .../filebot/web/SubsceneSubtitleClient.java | 2 +- .../sourceforge/filebot/web/TMDbClient.java | 2 +- .../sourceforge/filebot/web/TVRageClient.java | 2 +- .../filebot/web/TheTVDBClient.java | 2 +- .../sourceforge/filebot/web/WebRequest.java | 17 ++++--- .../net/sourceforge/tuned/DownloadTask.java | 36 +++++++-------- .../filebot/web/IMDbClientTest.java | 27 ++++++++++-- 8 files changed, 66 insertions(+), 66 deletions(-) diff --git a/source/net/sourceforge/filebot/web/IMDbClient.java b/source/net/sourceforge/filebot/web/IMDbClient.java index 6dfb261f..e3b7d3b0 100644 --- a/source/net/sourceforge/filebot/web/IMDbClient.java +++ b/source/net/sourceforge/filebot/web/IMDbClient.java @@ -22,8 +22,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Scanner; -import java.util.logging.Level; -import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -57,7 +55,7 @@ public class IMDbClient implements MovieIdentificationService { protected String getHost() { String host = System.getProperty("imdb.hostname"); // default to akas.imdb.com but allow override via -Dimdb.host - return host == null ? "akas.imdb.com" : host; + return host == null ? "imdb.com" : host; } @@ -75,7 +73,7 @@ public class IMDbClient implements MovieIdentificationService { @Override public List searchMovie(String query, Locale locale) throws Exception { - Document dom = parsePage(new URL("http", getHost(), "/find?s=tt&q=" + encode(query))); + Document dom = parsePage(new URL("http", getHost(), "/find?s=tt&q=" + encode(query, false))); // select movie links followed by year in parenthesis List nodes = selectNodes("//TABLE[@class='findList']//TD/A[substring-after(substring-before(following::text(),')'),'(')]", dom); @@ -119,32 +117,11 @@ public class IMDbClient implements MovieIdentificationService { if (header.contains("(VG)")) // ignore video games and videos return null; - String name = selectString("//H1/A/text()", dom).replaceAll("\\s+", " ").trim(); - String year = new Scanner(selectString("//H1/A/following::A/text()", dom)).useDelimiter("\\D+").next(); - String url = selectString("//H1/A/@href", dom); + String name = selectString("//H1/text()", dom).replaceAll("\\s+", " ").trim(); + String year = new Scanner(selectString("//H1//A/text()", dom)).useDelimiter("\\D+").next(); + int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom)); - // try to get localized name - if (locale != null && locale != Locale.ROOT) { - try { - String language = String.format("(%s title)", locale.getDisplayLanguage(Locale.ENGLISH).toLowerCase()); - List akaRows = selectNodes("//*[@name='akas']//following::TABLE[1]//TR", dom); - - for (Node aka : akaRows) { - List columns = getChildren("TD", aka); - String akaTitle = getTextContent(columns.get(0)); - String languageDesc = getTextContent(columns.get(1)).toLowerCase(); - - if (language.length() > 0 && languageDesc.contains(language) && languageDesc.contains("international")) { - name = akaTitle; - break; - } - } - } catch (Exception e) { - Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to grep localized name: " + name); - } - } - - return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, getImdbId(url), -1); + return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, imdbid, -1); } catch (Exception e) { // ignore, we probably got redirected to an error page return null; @@ -155,7 +132,7 @@ public class IMDbClient implements MovieIdentificationService { @Override public Movie getMovieDescriptor(int imdbid, Locale locale) throws Exception { try { - return scrapeMovie(parsePage(new URL("http", getHost(), String.format("/title/tt%07d/releaseinfo", imdbid))), locale); + return scrapeMovie(parsePage(new URL("http", getHost(), String.format("/title/tt%07d", imdbid))), locale); } catch (FileNotFoundException e) { return null; // illegal imdbid } @@ -169,8 +146,11 @@ public class IMDbClient implements MovieIdentificationService { protected Reader openConnection(URL url) throws IOException { URLConnection connection = url.openConnection(); - // IMDb refuses default user agent (Java/1.6.0_12) - connection.addRequestProperty("User-Agent", "Mozilla"); + // IMDb refuses default user agent (Java/1.6.0_12) => SPOOF GOOGLEBOT + connection.addRequestProperty("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"); + connection.addRequestProperty("From", "googlebot(at)googlebot.com"); + connection.addRequestProperty("Accept", "*/*"); + connection.addRequestProperty("X-Forwarded-For", "66.249.73.100"); // TRICK ANNOYING IMDB GEO-LOCATION LOCALIZATION return getReader(connection); } diff --git a/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java b/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java index d30f4095..c7430e11 100644 --- a/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java +++ b/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java @@ -53,7 +53,7 @@ public class SubsceneSubtitleClient implements SubtitleProvider { @Override public List search(String query) throws IOException, SAXException { - URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query)); + URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query, true)); Document dom = getHtmlDocument(searchUrl); List nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom); diff --git a/source/net/sourceforge/filebot/web/TMDbClient.java b/source/net/sourceforge/filebot/web/TMDbClient.java index 6f129896..65288a37 100644 --- a/source/net/sourceforge/filebot/web/TMDbClient.java +++ b/source/net/sourceforge/filebot/web/TMDbClient.java @@ -236,7 +236,7 @@ public class TMDbClient implements MovieIdentificationService { } data.put("api_key", apikey); - URL url = new URL("http", host, "/" + version + "/" + resource + "?" + encodeParameters(data)); + URL url = new URL("http", host, "/" + version + "/" + resource + "?" + encodeParameters(data, true)); CachedResource json = new CachedResource(url.toString(), String.class, 7 * 24 * 60 * 60 * 1000) { diff --git a/source/net/sourceforge/filebot/web/TVRageClient.java b/source/net/sourceforge/filebot/web/TVRageClient.java index 3a7549be..3619784e 100644 --- a/source/net/sourceforge/filebot/web/TVRageClient.java +++ b/source/net/sourceforge/filebot/web/TVRageClient.java @@ -48,7 +48,7 @@ public class TVRageClient extends AbstractEpisodeListProvider { @Override public List fetchSearchResult(String query, Locale locale) throws IOException, SAXException { - URL searchUrl = new URL("http", host, "/feeds/full_search.php?show=" + encode(query)); + URL searchUrl = new URL("http", host, "/feeds/full_search.php?show=" + encode(query, true)); Document dom = getDocument(searchUrl); List nodes = selectNodes("Results/show", dom); diff --git a/source/net/sourceforge/filebot/web/TheTVDBClient.java b/source/net/sourceforge/filebot/web/TheTVDBClient.java index 83a86628..278e6bb4 100644 --- a/source/net/sourceforge/filebot/web/TheTVDBClient.java +++ b/source/net/sourceforge/filebot/web/TheTVDBClient.java @@ -106,7 +106,7 @@ public class TheTVDBClient extends AbstractEpisodeListProvider { @Override public List fetchSearchResult(String query, Locale locale) throws Exception { // perform online search - URL url = getResource(null, "/api/GetSeries.php?seriesname=" + encode(query) + "&language=" + getLanguageCode(locale)); + URL url = getResource(null, "/api/GetSeries.php?seriesname=" + encode(query, true) + "&language=" + getLanguageCode(locale)); Document dom = getDocument(url); List nodes = selectNodes("Data/Series", dom); diff --git a/source/net/sourceforge/filebot/web/WebRequest.java b/source/net/sourceforge/filebot/web/WebRequest.java index 6f675a4c..31793d6a 100644 --- a/source/net/sourceforge/filebot/web/WebRequest.java +++ b/source/net/sourceforge/filebot/web/WebRequest.java @@ -35,13 +35,13 @@ import javax.net.ssl.X509TrustManager; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import net.sourceforge.tuned.ByteBufferOutputStream; + import org.cyberneko.html.parsers.DOMParser; import org.w3c.dom.Document; import org.xml.sax.InputSource; import org.xml.sax.SAXException; -import net.sourceforge.tuned.ByteBufferOutputStream; - public final class WebRequest { @@ -168,7 +168,7 @@ public final class WebRequest { public static ByteBuffer post(HttpURLConnection connection, Map parameters) throws IOException { - return post(connection, encodeParameters(parameters).getBytes("UTF-8"), "application/x-www-form-urlencoded"); + return post(connection, encodeParameters(parameters, true).getBytes("UTF-8"), "application/x-www-form-urlencoded"); } @@ -236,7 +236,7 @@ public final class WebRequest { } - public static String encodeParameters(Map parameters) { + public static String encodeParameters(Map parameters, boolean unicode) { StringBuilder sb = new StringBuilder(); for (Entry entry : parameters.entrySet()) { @@ -247,7 +247,7 @@ public final class WebRequest { sb.append(entry.getKey()); if (entry.getValue() != null) { sb.append("="); - sb.append(encode(entry.getValue().toString())); + sb.append(encode(entry.getValue().toString(), unicode)); } } @@ -255,9 +255,9 @@ public final class WebRequest { } - public static String encode(String string) { + public static String encode(String string, boolean unicode) { try { - return URLEncoder.encode(string, "UTF-8"); + return URLEncoder.encode(string, unicode ? "UTF-8" : "ISO-8859-1"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } @@ -268,15 +268,18 @@ public final class WebRequest { // create a trust manager that does not validate certificate chains TrustManager trustAnyCertificate = new X509TrustManager() { + @Override public X509Certificate[] getAcceptedIssuers() { return null; } + @Override public void checkClientTrusted(X509Certificate[] certs, String authType) { } + @Override public void checkServerTrusted(X509Certificate[] certs, String authType) { } }; diff --git a/source/net/sourceforge/tuned/DownloadTask.java b/source/net/sourceforge/tuned/DownloadTask.java index 733fa7ec..a82d93eb 100644 --- a/source/net/sourceforge/tuned/DownloadTask.java +++ b/source/net/sourceforge/tuned/DownloadTask.java @@ -25,15 +25,11 @@ public class DownloadTask extends SwingWorker { public static final String DOWNLOAD_STATE = "download state"; public static final String DOWNLOAD_PROGRESS = "download progress"; - + public static enum DownloadState { - PENDING, - CONNECTING, - DOWNLOADING, - DONE + PENDING, CONNECTING, DOWNLOADING, DONE } - private URL url; private long contentLength = -1; @@ -43,12 +39,12 @@ public class DownloadTask extends SwingWorker { private Map requestHeaders; private Map> responseHeaders; - + public DownloadTask(URL url) { this.url = url; } - + protected HttpURLConnection createConnection() throws Exception { HttpURLConnection connection = (HttpURLConnection) url.openConnection(); @@ -61,7 +57,7 @@ public class DownloadTask extends SwingWorker { return connection; } - + @Override protected ByteBuffer doInBackground() throws Exception { setDownloadState(DownloadState.CONNECTING); @@ -69,7 +65,7 @@ public class DownloadTask extends SwingWorker { HttpURLConnection connection = createConnection(); if (postParameters != null) { - ByteBuffer postData = Charset.forName("UTF-8").encode(encodeParameters(postParameters)); + ByteBuffer postData = Charset.forName("UTF-8").encode(encodeParameters(postParameters, true)); // add content type and content length headers connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded"); @@ -118,53 +114,53 @@ public class DownloadTask extends SwingWorker { return buffer.getByteBuffer(); } - + protected void setDownloadState(DownloadState state) { this.state = state; firePropertyChange(DOWNLOAD_STATE, null, state); } - + public DownloadState getDownloadState() { return state; } - + public URL getUrl() { return url; } - + public boolean isContentLengthKnown() { return contentLength >= 0; } - + public long getContentLength() { return contentLength; } - + public void setRequestHeaders(Map requestHeaders) { this.requestHeaders = new HashMap(requestHeaders); } - + public void setPostParameters(Map postParameters) { this.postParameters = new HashMap(postParameters); } - + public Map> getResponseHeaders() { return responseHeaders; } - + public Map getPostParameters() { return postParameters; } - + public Map getRequestHeaders() { return requestHeaders; } diff --git a/test/net/sourceforge/filebot/web/IMDbClientTest.java b/test/net/sourceforge/filebot/web/IMDbClientTest.java index 803eebd6..a326da3f 100644 --- a/test/net/sourceforge/filebot/web/IMDbClientTest.java +++ b/test/net/sourceforge/filebot/web/IMDbClientTest.java @@ -16,7 +16,7 @@ public class IMDbClientTest { @Test - public void searchMovie() throws Exception { + public void searchMovie1() throws Exception { List results = imdb.searchMovie("Avatar", null); Movie movie = results.get(0); @@ -28,7 +28,7 @@ public class IMDbClientTest { @Test public void searchMovie2() throws Exception { - List results = imdb.searchMovie("the illusionist", null); + List results = imdb.searchMovie("The Illusionist", null); Movie movie = results.get(0); assertEquals("The Illusionist", movie.getName()); @@ -37,6 +37,17 @@ public class IMDbClientTest { } + @Test + public void searchMovie3() throws Exception { + List results = imdb.searchMovie("Amélie", null); + Movie movie = results.get(0); + + assertEquals("Amélie", movie.getName()); + assertEquals(2001, movie.getYear()); + assertEquals(211915, movie.getImdbId(), 0); + } + + @Test public void searchMovieRedirect() throws Exception { List results = imdb.searchMovie("(500) Days of Summer (2009)", null); @@ -50,7 +61,7 @@ public class IMDbClientTest { @Test - public void getMovieDescriptor() throws Exception { + public void getMovieDescriptor1() throws Exception { Movie movie = imdb.getMovieDescriptor(499549, null); assertEquals("Avatar", movie.getName()); @@ -59,6 +70,16 @@ public class IMDbClientTest { } + @Test + public void getMovieDescriptor2() throws Exception { + Movie movie = imdb.getMovieDescriptor(211915, null); + + assertEquals("Amélie", movie.getName()); + assertEquals(2001, movie.getYear()); + assertEquals(211915, movie.getImdbId(), 0); + } + + @Test public void getAkaMovieDescriptor() throws Exception { Movie movie = imdb.getMovieDescriptor(106559, Locale.ENGLISH);