* scrape info from main movie page rather than releaseinfo
* spoof Googlebot HTTP headers to trick IMDb geo-localisation
* fix IMDb URL encoding issues
This commit is contained in:
parent
55b4b26890
commit
d3347d19d9
|
@ -22,8 +22,6 @@ import java.util.List;
|
|||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Scanner;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -57,7 +55,7 @@ public class IMDbClient implements MovieIdentificationService {
|
|||
|
||||
protected String getHost() {
|
||||
String host = System.getProperty("imdb.hostname"); // default to akas.imdb.com but allow override via -Dimdb.host
|
||||
return host == null ? "akas.imdb.com" : host;
|
||||
return host == null ? "imdb.com" : host;
|
||||
}
|
||||
|
||||
|
||||
|
@ -75,7 +73,7 @@ public class IMDbClient implements MovieIdentificationService {
|
|||
|
||||
@Override
|
||||
public List<Movie> searchMovie(String query, Locale locale) throws Exception {
|
||||
Document dom = parsePage(new URL("http", getHost(), "/find?s=tt&q=" + encode(query)));
|
||||
Document dom = parsePage(new URL("http", getHost(), "/find?s=tt&q=" + encode(query, false)));
|
||||
|
||||
// select movie links followed by year in parenthesis
|
||||
List<Node> nodes = selectNodes("//TABLE[@class='findList']//TD/A[substring-after(substring-before(following::text(),')'),'(')]", dom);
|
||||
|
@ -119,32 +117,11 @@ public class IMDbClient implements MovieIdentificationService {
|
|||
if (header.contains("(VG)")) // ignore video games and videos
|
||||
return null;
|
||||
|
||||
String name = selectString("//H1/A/text()", dom).replaceAll("\\s+", " ").trim();
|
||||
String year = new Scanner(selectString("//H1/A/following::A/text()", dom)).useDelimiter("\\D+").next();
|
||||
String url = selectString("//H1/A/@href", dom);
|
||||
String name = selectString("//H1/text()", dom).replaceAll("\\s+", " ").trim();
|
||||
String year = new Scanner(selectString("//H1//A/text()", dom)).useDelimiter("\\D+").next();
|
||||
int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
|
||||
|
||||
// try to get localized name
|
||||
if (locale != null && locale != Locale.ROOT) {
|
||||
try {
|
||||
String language = String.format("(%s title)", locale.getDisplayLanguage(Locale.ENGLISH).toLowerCase());
|
||||
List<Node> akaRows = selectNodes("//*[@name='akas']//following::TABLE[1]//TR", dom);
|
||||
|
||||
for (Node aka : akaRows) {
|
||||
List<Node> columns = getChildren("TD", aka);
|
||||
String akaTitle = getTextContent(columns.get(0));
|
||||
String languageDesc = getTextContent(columns.get(1)).toLowerCase();
|
||||
|
||||
if (language.length() > 0 && languageDesc.contains(language) && languageDesc.contains("international")) {
|
||||
name = akaTitle;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to grep localized name: " + name);
|
||||
}
|
||||
}
|
||||
|
||||
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, getImdbId(url), -1);
|
||||
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, imdbid, -1);
|
||||
} catch (Exception e) {
|
||||
// ignore, we probably got redirected to an error page
|
||||
return null;
|
||||
|
@ -155,7 +132,7 @@ public class IMDbClient implements MovieIdentificationService {
|
|||
@Override
|
||||
public Movie getMovieDescriptor(int imdbid, Locale locale) throws Exception {
|
||||
try {
|
||||
return scrapeMovie(parsePage(new URL("http", getHost(), String.format("/title/tt%07d/releaseinfo", imdbid))), locale);
|
||||
return scrapeMovie(parsePage(new URL("http", getHost(), String.format("/title/tt%07d", imdbid))), locale);
|
||||
} catch (FileNotFoundException e) {
|
||||
return null; // illegal imdbid
|
||||
}
|
||||
|
@ -169,8 +146,11 @@ public class IMDbClient implements MovieIdentificationService {
|
|||
protected Reader openConnection(URL url) throws IOException {
|
||||
URLConnection connection = url.openConnection();
|
||||
|
||||
// IMDb refuses default user agent (Java/1.6.0_12)
|
||||
connection.addRequestProperty("User-Agent", "Mozilla");
|
||||
// IMDb refuses default user agent (Java/1.6.0_12) => SPOOF GOOGLEBOT
|
||||
connection.addRequestProperty("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");
|
||||
connection.addRequestProperty("From", "googlebot(at)googlebot.com");
|
||||
connection.addRequestProperty("Accept", "*/*");
|
||||
connection.addRequestProperty("X-Forwarded-For", "66.249.73.100"); // TRICK ANNOYING IMDB GEO-LOCATION LOCALIZATION
|
||||
|
||||
return getReader(connection);
|
||||
}
|
||||
|
|
|
@ -53,7 +53,7 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
|||
|
||||
@Override
|
||||
public List<SearchResult> search(String query) throws IOException, SAXException {
|
||||
URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query));
|
||||
URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query, true));
|
||||
Document dom = getHtmlDocument(searchUrl);
|
||||
|
||||
List<Node> nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom);
|
||||
|
|
|
@ -236,7 +236,7 @@ public class TMDbClient implements MovieIdentificationService {
|
|||
}
|
||||
data.put("api_key", apikey);
|
||||
|
||||
URL url = new URL("http", host, "/" + version + "/" + resource + "?" + encodeParameters(data));
|
||||
URL url = new URL("http", host, "/" + version + "/" + resource + "?" + encodeParameters(data, true));
|
||||
|
||||
CachedResource<String> json = new CachedResource<String>(url.toString(), String.class, 7 * 24 * 60 * 60 * 1000) {
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ public class TVRageClient extends AbstractEpisodeListProvider {
|
|||
|
||||
@Override
|
||||
public List<SearchResult> fetchSearchResult(String query, Locale locale) throws IOException, SAXException {
|
||||
URL searchUrl = new URL("http", host, "/feeds/full_search.php?show=" + encode(query));
|
||||
URL searchUrl = new URL("http", host, "/feeds/full_search.php?show=" + encode(query, true));
|
||||
Document dom = getDocument(searchUrl);
|
||||
|
||||
List<Node> nodes = selectNodes("Results/show", dom);
|
||||
|
|
|
@ -106,7 +106,7 @@ public class TheTVDBClient extends AbstractEpisodeListProvider {
|
|||
@Override
|
||||
public List<SearchResult> fetchSearchResult(String query, Locale locale) throws Exception {
|
||||
// perform online search
|
||||
URL url = getResource(null, "/api/GetSeries.php?seriesname=" + encode(query) + "&language=" + getLanguageCode(locale));
|
||||
URL url = getResource(null, "/api/GetSeries.php?seriesname=" + encode(query, true) + "&language=" + getLanguageCode(locale));
|
||||
Document dom = getDocument(url);
|
||||
|
||||
List<Node> nodes = selectNodes("Data/Series", dom);
|
||||
|
|
|
@ -35,13 +35,13 @@ import javax.net.ssl.X509TrustManager;
|
|||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import net.sourceforge.tuned.ByteBufferOutputStream;
|
||||
|
||||
import org.cyberneko.html.parsers.DOMParser;
|
||||
import org.w3c.dom.Document;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import net.sourceforge.tuned.ByteBufferOutputStream;
|
||||
|
||||
|
||||
public final class WebRequest {
|
||||
|
||||
|
@ -168,7 +168,7 @@ public final class WebRequest {
|
|||
|
||||
|
||||
public static ByteBuffer post(HttpURLConnection connection, Map<String, ?> parameters) throws IOException {
|
||||
return post(connection, encodeParameters(parameters).getBytes("UTF-8"), "application/x-www-form-urlencoded");
|
||||
return post(connection, encodeParameters(parameters, true).getBytes("UTF-8"), "application/x-www-form-urlencoded");
|
||||
}
|
||||
|
||||
|
||||
|
@ -236,7 +236,7 @@ public final class WebRequest {
|
|||
}
|
||||
|
||||
|
||||
public static String encodeParameters(Map<String, ?> parameters) {
|
||||
public static String encodeParameters(Map<String, ?> parameters, boolean unicode) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (Entry<String, ?> entry : parameters.entrySet()) {
|
||||
|
@ -247,7 +247,7 @@ public final class WebRequest {
|
|||
sb.append(entry.getKey());
|
||||
if (entry.getValue() != null) {
|
||||
sb.append("=");
|
||||
sb.append(encode(entry.getValue().toString()));
|
||||
sb.append(encode(entry.getValue().toString(), unicode));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -255,9 +255,9 @@ public final class WebRequest {
|
|||
}
|
||||
|
||||
|
||||
public static String encode(String string) {
|
||||
public static String encode(String string, boolean unicode) {
|
||||
try {
|
||||
return URLEncoder.encode(string, "UTF-8");
|
||||
return URLEncoder.encode(string, unicode ? "UTF-8" : "ISO-8859-1");
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
@ -268,15 +268,18 @@ public final class WebRequest {
|
|||
// create a trust manager that does not validate certificate chains
|
||||
TrustManager trustAnyCertificate = new X509TrustManager() {
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(X509Certificate[] certs, String authType) {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(X509Certificate[] certs, String authType) {
|
||||
}
|
||||
};
|
||||
|
|
|
@ -25,15 +25,11 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
|
|||
public static final String DOWNLOAD_STATE = "download state";
|
||||
public static final String DOWNLOAD_PROGRESS = "download progress";
|
||||
|
||||
|
||||
|
||||
public static enum DownloadState {
|
||||
PENDING,
|
||||
CONNECTING,
|
||||
DOWNLOADING,
|
||||
DONE
|
||||
PENDING, CONNECTING, DOWNLOADING, DONE
|
||||
}
|
||||
|
||||
|
||||
private URL url;
|
||||
|
||||
private long contentLength = -1;
|
||||
|
@ -43,12 +39,12 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
|
|||
private Map<String, String> requestHeaders;
|
||||
private Map<String, List<String>> responseHeaders;
|
||||
|
||||
|
||||
|
||||
public DownloadTask(URL url) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected HttpURLConnection createConnection() throws Exception {
|
||||
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
|
||||
|
||||
|
@ -61,7 +57,7 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
|
|||
return connection;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
protected ByteBuffer doInBackground() throws Exception {
|
||||
setDownloadState(DownloadState.CONNECTING);
|
||||
|
@ -69,7 +65,7 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
|
|||
HttpURLConnection connection = createConnection();
|
||||
|
||||
if (postParameters != null) {
|
||||
ByteBuffer postData = Charset.forName("UTF-8").encode(encodeParameters(postParameters));
|
||||
ByteBuffer postData = Charset.forName("UTF-8").encode(encodeParameters(postParameters, true));
|
||||
|
||||
// add content type and content length headers
|
||||
connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
|
||||
|
@ -118,53 +114,53 @@ public class DownloadTask extends SwingWorker<ByteBuffer, Void> {
|
|||
return buffer.getByteBuffer();
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected void setDownloadState(DownloadState state) {
|
||||
this.state = state;
|
||||
firePropertyChange(DOWNLOAD_STATE, null, state);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public DownloadState getDownloadState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public URL getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public boolean isContentLengthKnown() {
|
||||
return contentLength >= 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public long getContentLength() {
|
||||
return contentLength;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void setRequestHeaders(Map<String, String> requestHeaders) {
|
||||
this.requestHeaders = new HashMap<String, String>(requestHeaders);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void setPostParameters(Map<String, String> postParameters) {
|
||||
this.postParameters = new HashMap<String, String>(postParameters);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public Map<String, List<String>> getResponseHeaders() {
|
||||
return responseHeaders;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public Map<String, String> getPostParameters() {
|
||||
return postParameters;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public Map<String, String> getRequestHeaders() {
|
||||
return requestHeaders;
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@ public class IMDbClientTest {
|
|||
|
||||
|
||||
@Test
|
||||
public void searchMovie() throws Exception {
|
||||
public void searchMovie1() throws Exception {
|
||||
List<Movie> results = imdb.searchMovie("Avatar", null);
|
||||
Movie movie = results.get(0);
|
||||
|
||||
|
@ -28,7 +28,7 @@ public class IMDbClientTest {
|
|||
|
||||
@Test
|
||||
public void searchMovie2() throws Exception {
|
||||
List<Movie> results = imdb.searchMovie("the illusionist", null);
|
||||
List<Movie> results = imdb.searchMovie("The Illusionist", null);
|
||||
Movie movie = results.get(0);
|
||||
|
||||
assertEquals("The Illusionist", movie.getName());
|
||||
|
@ -37,6 +37,17 @@ public class IMDbClientTest {
|
|||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void searchMovie3() throws Exception {
|
||||
List<Movie> results = imdb.searchMovie("Amélie", null);
|
||||
Movie movie = results.get(0);
|
||||
|
||||
assertEquals("Amélie", movie.getName());
|
||||
assertEquals(2001, movie.getYear());
|
||||
assertEquals(211915, movie.getImdbId(), 0);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void searchMovieRedirect() throws Exception {
|
||||
List<Movie> results = imdb.searchMovie("(500) Days of Summer (2009)", null);
|
||||
|
@ -50,7 +61,7 @@ public class IMDbClientTest {
|
|||
|
||||
|
||||
@Test
|
||||
public void getMovieDescriptor() throws Exception {
|
||||
public void getMovieDescriptor1() throws Exception {
|
||||
Movie movie = imdb.getMovieDescriptor(499549, null);
|
||||
|
||||
assertEquals("Avatar", movie.getName());
|
||||
|
@ -59,6 +70,16 @@ public class IMDbClientTest {
|
|||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void getMovieDescriptor2() throws Exception {
|
||||
Movie movie = imdb.getMovieDescriptor(211915, null);
|
||||
|
||||
assertEquals("Amélie", movie.getName());
|
||||
assertEquals(2001, movie.getYear());
|
||||
assertEquals(211915, movie.getImdbId(), 0);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void getAkaMovieDescriptor() throws Exception {
|
||||
Movie movie = imdb.getMovieDescriptor(106559, Locale.ENGLISH);
|
||||
|
|
Loading…
Reference in New Issue