* Fix IMDb scraper
This commit is contained in:
parent
377506801b
commit
46de54e213
|
@ -21,7 +21,6 @@ import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Scanner;
|
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -110,15 +109,14 @@ public class IMDbClient implements MovieIdentificationService {
|
||||||
|
|
||||||
protected Movie scrapeMovie(Document dom, Locale locale) {
|
protected Movie scrapeMovie(Document dom, Locale locale) {
|
||||||
try {
|
try {
|
||||||
String header = selectString("//H1", dom).toUpperCase();
|
int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
|
||||||
if (header.contains("(VG)")) // ignore video games and videos
|
String title = selectString("//META[@property='og:title']/@content", dom);
|
||||||
|
|
||||||
|
Matcher titleMatcher = Pattern.compile("(.+)\\s\\((?i:TV\\s)?(\\d{4})\\)$").matcher(title);
|
||||||
|
if (!titleMatcher.matches())
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
String name = selectString("//H1/text()", dom).replaceAll("\\s+", " ").trim();
|
return new Movie(titleMatcher.group(1), Integer.parseInt(titleMatcher.group(2)), imdbid, -1);
|
||||||
String year = new Scanner(selectNode("//H1/SPAN[@class='nobr']", dom).getTextContent()).useDelimiter("\\D+").next();
|
|
||||||
int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
|
|
||||||
|
|
||||||
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, imdbid, -1);
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// ignore, we probably got redirected to an error page
|
// ignore, we probably got redirected to an error page
|
||||||
return null;
|
return null;
|
||||||
|
|
Loading…
Reference in New Issue