* scrape info from main movie page rather than releaseinfo

* spoof googlebot http headers to trick imdb geo-localisation
* fix imdb url encoding issues
This commit is contained in:
Reinhard Pointner 2012-12-02 09:36:59 +00:00
parent 55b4b26890
commit d3347d19d9
8 changed files with 66 additions and 66 deletions

View File

@ -22,8 +22,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -57,7 +55,7 @@ public class IMDbClient implements MovieIdentificationService {
/**
 * Returns the host name used for IMDb requests.
 *
 * @return the value of the {@code imdb.hostname} system property if it is set, otherwise the built-in default host
 */
protected String getHost() {
String host = System.getProperty("imdb.hostname"); // allow override via -Dimdb.hostname, otherwise fall back to the built-in default host
return host == null ? "akas.imdb.com" : host;
return host == null ? "imdb.com" : host;
}
@ -75,7 +73,7 @@ public class IMDbClient implements MovieIdentificationService {
@Override
public List<Movie> searchMovie(String query, Locale locale) throws Exception {
Document dom = parsePage(new URL("http", getHost(), "/find?s=tt&q=" + encode(query)));
Document dom = parsePage(new URL("http", getHost(), "/find?s=tt&q=" + encode(query, false)));
// select movie links followed by year in parenthesis
List<Node> nodes = selectNodes("//TABLE[@class='findList']//TD/A[substring-after(substring-before(following::text(),')'),'(')]", dom);
@ -119,32 +117,11 @@ public class IMDbClient implements MovieIdentificationService {
if (header.contains("(VG)")) // ignore video games and videos
return null;
String name = selectString("//H1/A/text()", dom).replaceAll("\\s+", " ").trim();
String year = new Scanner(selectString("//H1/A/following::A/text()", dom)).useDelimiter("\\D+").next();
String url = selectString("//H1/A/@href", dom);
String name = selectString("//H1/text()", dom).replaceAll("\\s+", " ").trim();
String year = new Scanner(selectString("//H1//A/text()", dom)).useDelimiter("\\D+").next();
int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
// try to get localized name
if (locale != null && locale != Locale.ROOT) {
try {
String language = String.format("(%s title)", locale.getDisplayLanguage(Locale.ENGLISH).toLowerCase());
List<Node> akaRows = selectNodes("//*[@name='akas']//following::TABLE[1]//TR", dom);
for (Node aka : akaRows) {
List<Node> columns = getChildren("TD", aka);
String akaTitle = getTextContent(columns.get(0));
String languageDesc = getTextContent(columns.get(1)).toLowerCase();
if (language.length() > 0 && languageDesc.contains(language) && languageDesc.contains("international")) {
name = akaTitle;
break;
}
}
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to grep localized name: " + name);
}
}
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, getImdbId(url), -1);
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, imdbid, -1);
} catch (Exception e) {
// ignore, we probably got redirected to an error page
return null;
@ -155,7 +132,7 @@ public class IMDbClient implements MovieIdentificationService {
@Override
public Movie getMovieDescriptor(int imdbid, Locale locale) throws Exception {
try {
return scrapeMovie(parsePage(new URL("http", getHost(), String.format("/title/tt%07d/releaseinfo", imdbid))), locale);
return scrapeMovie(parsePage(new URL("http", getHost(), String.format("/title/tt%07d", imdbid))), locale);
} catch (FileNotFoundException e) {
return null; // illegal imdbid
}
@ -169,8 +146,11 @@ public class IMDbClient implements MovieIdentificationService {
/**
 * Opens a connection to the given URL, adds the request headers set below and
 * returns the reader produced by {@code getReader(connection)}.
 *
 * @param url the page to request
 * @return the reader obtained from {@code getReader} for the configured connection
 * @throws IOException if the connection cannot be opened or read
 */
protected Reader openConnection(URL url) throws IOException {
URLConnection connection = url.openConnection();
// IMDb refuses the default Java user agent (e.g. Java/1.6.0_12)
connection.addRequestProperty("User-Agent", "Mozilla");
// IMDb refuses the default Java user agent (e.g. Java/1.6.0_12), so identify as Googlebot instead
connection.addRequestProperty("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");
connection.addRequestProperty("From", "googlebot(at)googlebot.com");
connection.addRequestProperty("Accept", "*/*");
connection.addRequestProperty("X-Forwarded-For", "66.249.73.100"); // fixed client address so IMDb does not localize the page by geo-location; NOTE(review): presumably a Googlebot address - confirm
return getReader(connection);
}

View File

@ -53,7 +53,7 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
@Override
public List<SearchResult> search(String query) throws IOException, SAXException {
URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query));
URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query, true));
Document dom = getHtmlDocument(searchUrl);
List<Node> nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom);

View File

@ -236,7 +236,7 @@ public class TMDbClient implements MovieIdentificationService {
}
data.put("api_key", apikey);
URL url = new URL("http", host, "/" + version + "/" + resource + "?" + encodeParameters(data));
URL url = new URL("http", host, "/" + version + "/" + resource + "?" + encodeParameters(data, true));
CachedResource<String> json = new CachedResource<String>(url.toString(), String.class, 7 * 24 * 60 * 60 * 1000) {

View File

@ -48,7 +48,7 @@ public class TVRageClient extends AbstractEpisodeListProvider {
@Override
public List<SearchResult> fetchSearchResult(String query, Locale locale) throws IOException, SAXException {
URL searchUrl = new URL("http", host, "/feeds/full_search.php?show=" + encode(query));
URL searchUrl = new URL("http", host, "/feeds/full_search.php?show=" + encode(query, true));
Document dom = getDocument(searchUrl);
List<Node> nodes = selectNodes("Results/show", dom);

View File

@ -106,7 +106,7 @@ public class TheTVDBClient extends AbstractEpisodeListProvider {
@Override
public List<SearchResult> fetchSearchResult(String query, Locale locale) throws Exception {
// perform online search
URL url = getResource(null, "/api/GetSeries.php?seriesname=" + encode(query) + "&language=" + getLanguageCode(locale));
URL url = getResource(null, "/api/GetSeries.php?seriesname=" + encode(query, true) + "&language=" + getLanguageCode(locale));
Document dom = getDocument(url);
List<Node> nodes = selectNodes("Data/Series", dom);

View File

@ -35,13 +35,13 @@ import javax.net.ssl.X509TrustManager;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import net.sourceforge.tuned.ByteBufferOutputStream;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import net.sourceforge.tuned.ByteBufferOutputStream;
public final class WebRequest {
@ -168,7 +168,7 @@ public final class WebRequest {
/**
 * Posts the given parameters as an {@code application/x-www-form-urlencoded} request body.
 * The parameters are serialized with {@code encodeParameters}, converted to UTF-8 bytes
 * and handed to the {@code post(connection, data, contentType)} overload.
 *
 * @param connection the connection to post to
 * @param parameters the form parameters to send
 * @return whatever the delegate {@code post} overload returns
 * @throws IOException if the delegate fails or UTF-8 conversion is not possible
 */
public static ByteBuffer post(HttpURLConnection connection, Map<String, ?> parameters) throws IOException {
return post(connection, encodeParameters(parameters).getBytes("UTF-8"), "application/x-www-form-urlencoded");
return post(connection, encodeParameters(parameters, true).getBytes("UTF-8"), "application/x-www-form-urlencoded");
}
@ -236,7 +236,7 @@ public final class WebRequest {
}
public static String encodeParameters(Map<String, ?> parameters) {
public static String encodeParameters(Map<String, ?> parameters, boolean unicode) {
StringBuilder sb = new StringBuilder();
for (Entry<String, ?> entry : parameters.entrySet()) {
@ -247,7 +247,7 @@ public final class WebRequest {
sb.append(entry.getKey());
if (entry.getValue() != null) {
sb.append("=");
sb.append(encode(entry.getValue().toString()));
sb.append(encode(entry.getValue().toString(), unicode));
}
}
@ -255,9 +255,9 @@ public final class WebRequest {
}
public static String encode(String string) {
public static String encode(String string, boolean unicode) {
try {
return URLEncoder.encode(string, "UTF-8");
return URLEncoder.encode(string, unicode ? "UTF-8" : "ISO-8859-1");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
@ -268,15 +268,18 @@ public final class WebRequest {
// create a trust manager that does not validate certificate chains
// WARNING: both check methods are empty, so ANY certificate chain is accepted;
// connections using this trust manager get no protection against man-in-the-middle attacks
TrustManager trustAnyCertificate = new X509TrustManager() {
@Override
public X509Certificate[] getAcceptedIssuers() {
// the X509TrustManager contract requires a non-null (possibly empty) array
return new X509Certificate[0];
}
@Override
public void checkClientTrusted(X509Certificate[] certs, String authType) {
// intentionally empty: never throws, so every client certificate chain is trusted
}
@Override
public void checkServerTrusted(X509Certificate[] certs, String authType) {
// intentionally empty: never throws, so every server certificate chain is trusted
}
};

View File

@ -25,15 +25,11 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
public static final String DOWNLOAD_STATE = "download state";
public static final String DOWNLOAD_PROGRESS = "download progress";
/**
 * States of a download task; the current state is announced to listeners
 * under the {@code DOWNLOAD_STATE} property name.
 */
public static enum DownloadState {
PENDING,
CONNECTING,
DOWNLOADING,
DONE
PENDING, CONNECTING, DOWNLOADING, DONE
}
private URL url;
private long contentLength = -1;
@ -43,12 +39,12 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
private Map<String, String> requestHeaders;
private Map<String, List<String>> responseHeaders;
/**
 * Creates a download task for the given URL.
 *
 * @param url the resource this task connects to; stored as-is
 */
public DownloadTask(URL url) {
this.url = url;
}
protected HttpURLConnection createConnection() throws Exception {
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
@ -61,7 +57,7 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
return connection;
}
@Override
protected ByteBuffer doInBackground() throws Exception {
setDownloadState(DownloadState.CONNECTING);
@ -69,7 +65,7 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
HttpURLConnection connection = createConnection();
if (postParameters != null) {
ByteBuffer postData = Charset.forName("UTF-8").encode(encodeParameters(postParameters));
ByteBuffer postData = Charset.forName("UTF-8").encode(encodeParameters(postParameters, true));
// add content type and content length headers
connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
@ -118,53 +114,53 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
return buffer.getByteBuffer();
}
/**
 * Stores the new download state and notifies property change listeners.
 *
 * @param state the state to store and announce under {@code DOWNLOAD_STATE}
 */
protected void setDownloadState(DownloadState state) {
this.state = state;
// old value is passed as null, so the event is not suppressed as an "unchanged" value
firePropertyChange(DOWNLOAD_STATE, null, state);
}
/**
 * @return the state most recently stored by {@code setDownloadState}
 */
public DownloadState getDownloadState() {
return state;
}
/**
 * @return the URL this task was created with
 */
public URL getUrl() {
return url;
}
/**
 * @return {@code true} if the stored content length is non-negative; the field starts out as -1
 */
public boolean isContentLengthKnown() {
return contentLength >= 0;
}
/**
 * @return the stored content length; negative while unknown (see {@code isContentLengthKnown})
 */
public long getContentLength() {
return contentLength;
}
/**
 * Replaces the request headers with a copy of the given map.
 *
 * @param requestHeaders header names mapped to values; must not be null because it is copied into a new {@code HashMap}
 */
public void setRequestHeaders(Map<String, String> requestHeaders) {
// copy, so later changes to the caller's map do not affect this task
this.requestHeaders = new HashMap<String, String>(requestHeaders);
}
/**
 * Replaces the POST parameters with a copy of the given map.
 *
 * @param postParameters parameter names mapped to values; must not be null because it is copied into a new {@code HashMap}
 */
public void setPostParameters(Map<String, String> postParameters) {
// copy, so later changes to the caller's map do not affect this task
this.postParameters = new HashMap<String, String>(postParameters);
}
/**
 * @return the response headers field as-is (no copy); NOTE(review): where it is assigned is not visible here, so it may be null
 */
public Map<String, List<String>> getResponseHeaders() {
return responseHeaders;
}
/**
 * @return the internal POST parameter map itself, not a copy
 */
public Map<String, String> getPostParameters() {
return postParameters;
}
/**
 * @return the internal request header map itself, not a copy
 */
public Map<String, String> getRequestHeaders() {
return requestHeaders;
}

View File

@ -16,7 +16,7 @@ public class IMDbClientTest {
@Test
public void searchMovie() throws Exception {
public void searchMovie1() throws Exception {
List<Movie> results = imdb.searchMovie("Avatar", null);
Movie movie = results.get(0);
@ -28,7 +28,7 @@ public class IMDbClientTest {
@Test
public void searchMovie2() throws Exception {
List<Movie> results = imdb.searchMovie("the illusionist", null);
List<Movie> results = imdb.searchMovie("The Illusionist", null);
Movie movie = results.get(0);
assertEquals("The Illusionist", movie.getName());
@ -37,6 +37,17 @@ public class IMDbClientTest {
}
/**
 * Searches for a title containing a non-ASCII character and checks name, year and id of the first result.
 */
@Test
public void searchMovie3() throws Exception {
// the query contains 'é', so it exercises encoding of non-ASCII characters in the search request
List<Movie> results = imdb.searchMovie("Amélie", null);
Movie movie = results.get(0);
assertEquals("Amélie", movie.getName());
assertEquals(2001, movie.getYear());
assertEquals(211915, movie.getImdbId(), 0);
}
@Test
public void searchMovieRedirect() throws Exception {
List<Movie> results = imdb.searchMovie("(500) Days of Summer (2009)", null);
@ -50,7 +61,7 @@ public class IMDbClientTest {
@Test
public void getMovieDescriptor() throws Exception {
public void getMovieDescriptor1() throws Exception {
Movie movie = imdb.getMovieDescriptor(499549, null);
assertEquals("Avatar", movie.getName());
@ -59,6 +70,16 @@ public class IMDbClientTest {
}
/**
 * Looks up id 211915 directly and checks that the returned name keeps its non-ASCII character.
 */
@Test
public void getMovieDescriptor2() throws Exception {
Movie movie = imdb.getMovieDescriptor(211915, null);
// the expected name contains 'é'
assertEquals("Amélie", movie.getName());
assertEquals(2001, movie.getYear());
assertEquals(211915, movie.getImdbId(), 0);
}
@Test
public void getAkaMovieDescriptor() throws Exception {
Movie movie = imdb.getMovieDescriptor(106559, Locale.ENGLISH);