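* scrape movie name, year and IMDb ID from the main title page rather than the releaseinfo page

* spoof Googlebot HTTP headers to work around IMDb's geo-localisation
* fix IMDb URL encoding issues (encode IMDb search queries as ISO-8859-1 instead of UTF-8)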
This commit is contained in:
Reinhard Pointner 2012-12-02 09:36:59 +00:00
parent 55b4b26890
commit d3347d19d9
8 changed files with 66 additions and 66 deletions

View File

@ -22,8 +22,6 @@ import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Scanner; import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -57,7 +55,7 @@ public class IMDbClient implements MovieIdentificationService {
protected String getHost() { protected String getHost() {
String host = System.getProperty("imdb.hostname"); // default to akas.imdb.com but allow override via -Dimdb.host String host = System.getProperty("imdb.hostname"); // default to akas.imdb.com but allow override via -Dimdb.host
return host == null ? "akas.imdb.com" : host; return host == null ? "imdb.com" : host;
} }
@ -75,7 +73,7 @@ public class IMDbClient implements MovieIdentificationService {
@Override @Override
public List<Movie> searchMovie(String query, Locale locale) throws Exception { public List<Movie> searchMovie(String query, Locale locale) throws Exception {
Document dom = parsePage(new URL("http", getHost(), "/find?s=tt&q=" + encode(query))); Document dom = parsePage(new URL("http", getHost(), "/find?s=tt&q=" + encode(query, false)));
// select movie links followed by year in parenthesis // select movie links followed by year in parenthesis
List<Node> nodes = selectNodes("//TABLE[@class='findList']//TD/A[substring-after(substring-before(following::text(),')'),'(')]", dom); List<Node> nodes = selectNodes("//TABLE[@class='findList']//TD/A[substring-after(substring-before(following::text(),')'),'(')]", dom);
@ -119,32 +117,11 @@ public class IMDbClient implements MovieIdentificationService {
if (header.contains("(VG)")) // ignore video games and videos if (header.contains("(VG)")) // ignore video games and videos
return null; return null;
String name = selectString("//H1/A/text()", dom).replaceAll("\\s+", " ").trim(); String name = selectString("//H1/text()", dom).replaceAll("\\s+", " ").trim();
String year = new Scanner(selectString("//H1/A/following::A/text()", dom)).useDelimiter("\\D+").next(); String year = new Scanner(selectString("//H1//A/text()", dom)).useDelimiter("\\D+").next();
String url = selectString("//H1/A/@href", dom); int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
// try to get localized name return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, imdbid, -1);
if (locale != null && locale != Locale.ROOT) {
try {
String language = String.format("(%s title)", locale.getDisplayLanguage(Locale.ENGLISH).toLowerCase());
List<Node> akaRows = selectNodes("//*[@name='akas']//following::TABLE[1]//TR", dom);
for (Node aka : akaRows) {
List<Node> columns = getChildren("TD", aka);
String akaTitle = getTextContent(columns.get(0));
String languageDesc = getTextContent(columns.get(1)).toLowerCase();
if (language.length() > 0 && languageDesc.contains(language) && languageDesc.contains("international")) {
name = akaTitle;
break;
}
}
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to grep localized name: " + name);
}
}
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, getImdbId(url), -1);
} catch (Exception e) { } catch (Exception e) {
// ignore, we probably got redirected to an error page // ignore, we probably got redirected to an error page
return null; return null;
@ -155,7 +132,7 @@ public class IMDbClient implements MovieIdentificationService {
@Override @Override
public Movie getMovieDescriptor(int imdbid, Locale locale) throws Exception { public Movie getMovieDescriptor(int imdbid, Locale locale) throws Exception {
try { try {
return scrapeMovie(parsePage(new URL("http", getHost(), String.format("/title/tt%07d/releaseinfo", imdbid))), locale); return scrapeMovie(parsePage(new URL("http", getHost(), String.format("/title/tt%07d", imdbid))), locale);
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
return null; // illegal imdbid return null; // illegal imdbid
} }
@ -169,8 +146,11 @@ public class IMDbClient implements MovieIdentificationService {
protected Reader openConnection(URL url) throws IOException { protected Reader openConnection(URL url) throws IOException {
URLConnection connection = url.openConnection(); URLConnection connection = url.openConnection();
// IMDb refuses default user agent (Java/1.6.0_12) // IMDb refuses default user agent (Java/1.6.0_12) => SPOOF GOOGLEBOT
connection.addRequestProperty("User-Agent", "Mozilla"); connection.addRequestProperty("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");
connection.addRequestProperty("From", "googlebot(at)googlebot.com");
connection.addRequestProperty("Accept", "*/*");
connection.addRequestProperty("X-Forwarded-For", "66.249.73.100"); // TRICK ANNOYING IMDB GEO-LOCATION LOCALIZATION
return getReader(connection); return getReader(connection);
} }

View File

@ -53,7 +53,7 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
@Override @Override
public List<SearchResult> search(String query) throws IOException, SAXException { public List<SearchResult> search(String query) throws IOException, SAXException {
URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query)); URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query, true));
Document dom = getHtmlDocument(searchUrl); Document dom = getHtmlDocument(searchUrl);
List<Node> nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom); List<Node> nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom);

View File

@ -236,7 +236,7 @@ public class TMDbClient implements MovieIdentificationService {
} }
data.put("api_key", apikey); data.put("api_key", apikey);
URL url = new URL("http", host, "/" + version + "/" + resource + "?" + encodeParameters(data)); URL url = new URL("http", host, "/" + version + "/" + resource + "?" + encodeParameters(data, true));
CachedResource<String> json = new CachedResource<String>(url.toString(), String.class, 7 * 24 * 60 * 60 * 1000) { CachedResource<String> json = new CachedResource<String>(url.toString(), String.class, 7 * 24 * 60 * 60 * 1000) {

View File

@ -48,7 +48,7 @@ public class TVRageClient extends AbstractEpisodeListProvider {
@Override @Override
public List<SearchResult> fetchSearchResult(String query, Locale locale) throws IOException, SAXException { public List<SearchResult> fetchSearchResult(String query, Locale locale) throws IOException, SAXException {
URL searchUrl = new URL("http", host, "/feeds/full_search.php?show=" + encode(query)); URL searchUrl = new URL("http", host, "/feeds/full_search.php?show=" + encode(query, true));
Document dom = getDocument(searchUrl); Document dom = getDocument(searchUrl);
List<Node> nodes = selectNodes("Results/show", dom); List<Node> nodes = selectNodes("Results/show", dom);

View File

@ -106,7 +106,7 @@ public class TheTVDBClient extends AbstractEpisodeListProvider {
@Override @Override
public List<SearchResult> fetchSearchResult(String query, Locale locale) throws Exception { public List<SearchResult> fetchSearchResult(String query, Locale locale) throws Exception {
// perform online search // perform online search
URL url = getResource(null, "/api/GetSeries.php?seriesname=" + encode(query) + "&language=" + getLanguageCode(locale)); URL url = getResource(null, "/api/GetSeries.php?seriesname=" + encode(query, true) + "&language=" + getLanguageCode(locale));
Document dom = getDocument(url); Document dom = getDocument(url);
List<Node> nodes = selectNodes("Data/Series", dom); List<Node> nodes = selectNodes("Data/Series", dom);

View File

@ -35,13 +35,13 @@ import javax.net.ssl.X509TrustManager;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.ParserConfigurationException;
import net.sourceforge.tuned.ByteBufferOutputStream;
import org.cyberneko.html.parsers.DOMParser; import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import net.sourceforge.tuned.ByteBufferOutputStream;
public final class WebRequest { public final class WebRequest {
@ -168,7 +168,7 @@ public final class WebRequest {
public static ByteBuffer post(HttpURLConnection connection, Map<String, ?> parameters) throws IOException { public static ByteBuffer post(HttpURLConnection connection, Map<String, ?> parameters) throws IOException {
return post(connection, encodeParameters(parameters).getBytes("UTF-8"), "application/x-www-form-urlencoded"); return post(connection, encodeParameters(parameters, true).getBytes("UTF-8"), "application/x-www-form-urlencoded");
} }
@ -236,7 +236,7 @@ public final class WebRequest {
} }
public static String encodeParameters(Map<String, ?> parameters) { public static String encodeParameters(Map<String, ?> parameters, boolean unicode) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (Entry<String, ?> entry : parameters.entrySet()) { for (Entry<String, ?> entry : parameters.entrySet()) {
@ -247,7 +247,7 @@ public final class WebRequest {
sb.append(entry.getKey()); sb.append(entry.getKey());
if (entry.getValue() != null) { if (entry.getValue() != null) {
sb.append("="); sb.append("=");
sb.append(encode(entry.getValue().toString())); sb.append(encode(entry.getValue().toString(), unicode));
} }
} }
@ -255,9 +255,9 @@ public final class WebRequest {
} }
public static String encode(String string) { public static String encode(String string, boolean unicode) {
try { try {
return URLEncoder.encode(string, "UTF-8"); return URLEncoder.encode(string, unicode ? "UTF-8" : "ISO-8859-1");
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
@ -268,15 +268,18 @@ public final class WebRequest {
// create a trust manager that does not validate certificate chains // create a trust manager that does not validate certificate chains
TrustManager trustAnyCertificate = new X509TrustManager() { TrustManager trustAnyCertificate = new X509TrustManager() {
@Override
public X509Certificate[] getAcceptedIssuers() { public X509Certificate[] getAcceptedIssuers() {
return null; return null;
} }
@Override
public void checkClientTrusted(X509Certificate[] certs, String authType) { public void checkClientTrusted(X509Certificate[] certs, String authType) {
} }
@Override
public void checkServerTrusted(X509Certificate[] certs, String authType) { public void checkServerTrusted(X509Certificate[] certs, String authType) {
} }
}; };

View File

@ -27,13 +27,9 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
public static enum DownloadState { public static enum DownloadState {
PENDING, PENDING, CONNECTING, DOWNLOADING, DONE
CONNECTING,
DOWNLOADING,
DONE
} }
private URL url; private URL url;
private long contentLength = -1; private long contentLength = -1;
@ -69,7 +65,7 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
HttpURLConnection connection = createConnection(); HttpURLConnection connection = createConnection();
if (postParameters != null) { if (postParameters != null) {
ByteBuffer postData = Charset.forName("UTF-8").encode(encodeParameters(postParameters)); ByteBuffer postData = Charset.forName("UTF-8").encode(encodeParameters(postParameters, true));
// add content type and content length headers // add content type and content length headers
connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded"); connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");

View File

@ -16,7 +16,7 @@ public class IMDbClientTest {
@Test @Test
public void searchMovie() throws Exception { public void searchMovie1() throws Exception {
List<Movie> results = imdb.searchMovie("Avatar", null); List<Movie> results = imdb.searchMovie("Avatar", null);
Movie movie = results.get(0); Movie movie = results.get(0);
@ -28,7 +28,7 @@ public class IMDbClientTest {
@Test @Test
public void searchMovie2() throws Exception { public void searchMovie2() throws Exception {
List<Movie> results = imdb.searchMovie("the illusionist", null); List<Movie> results = imdb.searchMovie("The Illusionist", null);
Movie movie = results.get(0); Movie movie = results.get(0);
assertEquals("The Illusionist", movie.getName()); assertEquals("The Illusionist", movie.getName());
@ -37,6 +37,17 @@ public class IMDbClientTest {
} }
@Test
public void searchMovie3() throws Exception {
List<Movie> results = imdb.searchMovie("Amélie", null);
Movie movie = results.get(0);
assertEquals("Amélie", movie.getName());
assertEquals(2001, movie.getYear());
assertEquals(211915, movie.getImdbId(), 0);
}
@Test @Test
public void searchMovieRedirect() throws Exception { public void searchMovieRedirect() throws Exception {
List<Movie> results = imdb.searchMovie("(500) Days of Summer (2009)", null); List<Movie> results = imdb.searchMovie("(500) Days of Summer (2009)", null);
@ -50,7 +61,7 @@ public class IMDbClientTest {
@Test @Test
public void getMovieDescriptor() throws Exception { public void getMovieDescriptor1() throws Exception {
Movie movie = imdb.getMovieDescriptor(499549, null); Movie movie = imdb.getMovieDescriptor(499549, null);
assertEquals("Avatar", movie.getName()); assertEquals("Avatar", movie.getName());
@ -59,6 +70,16 @@ public class IMDbClientTest {
} }
@Test
public void getMovieDescriptor2() throws Exception {
Movie movie = imdb.getMovieDescriptor(211915, null);
assertEquals("Amélie", movie.getName());
assertEquals(2001, movie.getYear());
assertEquals(211915, movie.getImdbId(), 0);
}
@Test @Test
public void getAkaMovieDescriptor() throws Exception { public void getAkaMovieDescriptor() throws Exception {
Movie movie = imdb.getMovieDescriptor(106559, Locale.ENGLISH); Movie movie = imdb.getMovieDescriptor(106559, Locale.ENGLISH);