* Fix IMDb scraper

This commit is contained in:
Reinhard Pointner 2013-02-22 16:02:43 +00:00
parent 377506801b
commit 46de54e213
1 changed files with 6 additions and 8 deletions

View File

@ -21,7 +21,6 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Scanner;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -110,15 +109,14 @@ public class IMDbClient implements MovieIdentificationService {
protected Movie scrapeMovie(Document dom, Locale locale) { protected Movie scrapeMovie(Document dom, Locale locale) {
try { try {
String header = selectString("//H1", dom).toUpperCase(); int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
if (header.contains("(VG)")) // ignore video games and videos String title = selectString("//META[@property='og:title']/@content", dom);
Matcher titleMatcher = Pattern.compile("(.+)\\s\\((?i:TV\\s)?(\\d{4})\\)$").matcher(title);
if (!titleMatcher.matches())
return null; return null;
String name = selectString("//H1/text()", dom).replaceAll("\\s+", " ").trim(); return new Movie(titleMatcher.group(1), Integer.parseInt(titleMatcher.group(2)), imdbid, -1);
String year = new Scanner(selectNode("//H1/SPAN[@class='nobr']", dom).getTextContent()).useDelimiter("\\D+").next();
int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, imdbid, -1);
} catch (Exception e) { } catch (Exception e) {
// ignore, we probably got redirected to an error page // ignore, we probably got redirected to an error page
return null; return null;