* fixed some corner case issues with the imdb scraper

This commit is contained in:
Reinhard Pointner 2012-07-23 16:15:42 +00:00
parent 09ec7c9dfb
commit 0861220aed
2 changed files with 21 additions and 6 deletions

View File

@ -2,6 +2,7 @@
package net.sourceforge.filebot.web; package net.sourceforge.filebot.web;
import static java.util.Arrays.*;
import static java.util.Collections.*; import static java.util.Collections.*;
import static net.sourceforge.filebot.web.WebRequest.*; import static net.sourceforge.filebot.web.WebRequest.*;
import static net.sourceforge.tuned.XPathUtilities.*; import static net.sourceforge.tuned.XPathUtilities.*;
@ -92,10 +93,15 @@ public class IMDbClient implements MovieIdentificationService {
// we might have been redirected to the movie page // we might have been redirected to the movie page
if (results.isEmpty()) { if (results.isEmpty()) {
Movie movie = scrapeMovie(dom, locale); try {
if (movie != null) { int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
Movie movie = getMovieDescriptor(imdbid, locale);
if (movie != null) {
results.add(movie); results.add(movie);
} }
} catch (Exception e) {
// ignore, can't find movie
}
} }
return results; return results;
@ -115,7 +121,7 @@ public class IMDbClient implements MovieIdentificationService {
// try to get localized name // try to get localized name
if (locale != null && locale != Locale.ROOT) { if (locale != null && locale != Locale.ROOT) {
try { try {
String languageName = locale.getDisplayLanguage(Locale.ENGLISH).toLowerCase(); String language = String.format("(%s title)", locale.getDisplayLanguage(Locale.ENGLISH).toLowerCase());
List<Node> akaRows = selectNodes("//*[@name='akas']//following::TABLE[1]//TR", dom); List<Node> akaRows = selectNodes("//*[@name='akas']//following::TABLE[1]//TR", dom);
for (Node aka : akaRows) { for (Node aka : akaRows) {
@ -123,7 +129,7 @@ public class IMDbClient implements MovieIdentificationService {
String akaTitle = getTextContent(columns.get(0)); String akaTitle = getTextContent(columns.get(0));
String languageDesc = getTextContent(columns.get(1)).toLowerCase(); String languageDesc = getTextContent(columns.get(1)).toLowerCase();
if (languageName.length() > 0 && languageDesc.contains(languageName)) { if (language.length() > 0 && languageDesc.contains(language) && frequency(asList(languageDesc.split("\\W")), "title") == 1) {
name = akaTitle; name = akaTitle;
break; break;
} }
@ -207,7 +213,6 @@ public class IMDbClient implements MovieIdentificationService {
Map<MovieProperty, String> fields = new EnumMap<MovieProperty, String>(MovieProperty.class); Map<MovieProperty, String> fields = new EnumMap<MovieProperty, String>(MovieProperty.class);
fields.put(MovieProperty.name, data.get("title")); fields.put(MovieProperty.name, data.get("title"));
fields.put(MovieProperty.certification, data.get("rated")); fields.put(MovieProperty.certification, data.get("rated"));
fields.put(MovieProperty.released, Date.parse(data.get("released"), "dd MMM yyyy").toString());
fields.put(MovieProperty.tagline, data.get("plot")); fields.put(MovieProperty.tagline, data.get("plot"));
fields.put(MovieProperty.rating, data.get("imdbRating")); fields.put(MovieProperty.rating, data.get("imdbRating"));
fields.put(MovieProperty.votes, data.get("imdbVotes").replaceAll("\\D", "")); fields.put(MovieProperty.votes, data.get("imdbVotes").replaceAll("\\D", ""));

View File

@ -59,4 +59,14 @@ public class IMDbClientTest {
assertEquals(106559, movie.getImdbId(), 0); assertEquals(106559, movie.getImdbId(), 0);
} }
@Test
public void getAkaMovieDescriptorExtra() throws Exception {
// fetch the descriptor for a fixed IMDb ID using the English locale
final int imdbId = 470761;
Movie descriptor = imdb.getMovieDescriptor(imdbId, Locale.ENGLISH);

// the returned name, year and ID must match the expected values
assertEquals("First Born", descriptor.getName());
assertEquals(2007, descriptor.getYear());
assertEquals(imdbId, descriptor.getImdbId(), 0);
}
} }