* removed IMDB scraper
This commit is contained in:
parent
30a54c2cf4
commit
ff90a3f0df
|
@ -1,96 +0,0 @@
|
||||||
|
|
||||||
package net.sourceforge.filebot.web;
|
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.UnsupportedEncodingException;
|
|
||||||
import java.net.MalformedURLException;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.net.URLEncoder;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.logging.Level;
|
|
||||||
import java.util.logging.Logger;
|
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import javax.swing.ImageIcon;
|
|
||||||
|
|
||||||
import net.sourceforge.filebot.resources.ResourceManager;
|
|
||||||
import net.sourceforge.tuned.XPathUtil;
|
|
||||||
|
|
||||||
import org.w3c.dom.Document;
|
|
||||||
import org.w3c.dom.Node;
|
|
||||||
import org.xml.sax.SAXException;
|
|
||||||
|
|
||||||
|
|
||||||
public class ImdbSearchEngine {
|
|
||||||
|
|
||||||
private String host = "www.imdb.com";
|
|
||||||
|
|
||||||
|
|
||||||
public List<MovieDescriptor> search(String searchterm) throws IOException, SAXException {
|
|
||||||
|
|
||||||
Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
|
|
||||||
|
|
||||||
List<Node> nodes = XPathUtil.selectNodes("id('outerbody')//TABLE//P[position() >= 2 and position() <=3 ]//A[count(child::IMG) <= 0]/..", dom);
|
|
||||||
|
|
||||||
ArrayList<MovieDescriptor> movies = new ArrayList<MovieDescriptor>();
|
|
||||||
|
|
||||||
for (Node node : nodes) {
|
|
||||||
try {
|
|
||||||
movies.add(parseMovieNode(node));
|
|
||||||
} catch (Exception e) {
|
|
||||||
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Cannot parse movie node");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return movies;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private MovieDescriptor parseMovieNode(Node node) throws Exception {
|
|
||||||
// ignore javascript links
|
|
||||||
Node linkNode = XPathUtil.selectFirstNode("./A[count(@onclick) <= 0]", node);
|
|
||||||
|
|
||||||
String title = XPathUtil.selectString("text()", linkNode);
|
|
||||||
String href = XPathUtil.selectString("@href", linkNode);
|
|
||||||
|
|
||||||
// match /title/tt0379786/
|
|
||||||
Matcher idMatcher = Pattern.compile(".*/tt(\\d+)/.*").matcher(href);
|
|
||||||
int imdbId;
|
|
||||||
|
|
||||||
if (idMatcher.matches()) {
|
|
||||||
imdbId = new Integer(idMatcher.group(1));
|
|
||||||
} else
|
|
||||||
throw new IllegalArgumentException("Cannot match imdb id: " + href);
|
|
||||||
|
|
||||||
String yearString = XPathUtil.selectString("text()[1]", node);
|
|
||||||
|
|
||||||
// match (2005)
|
|
||||||
Matcher yearMatcher = Pattern.compile(".*\\((\\d+)\\).*").matcher(yearString);
|
|
||||||
Integer year = null;
|
|
||||||
|
|
||||||
if (yearMatcher.matches()) {
|
|
||||||
year = Integer.parseInt(yearMatcher.group(1));
|
|
||||||
} else
|
|
||||||
throw new IllegalArgumentException("Cannot match year: " + yearString);
|
|
||||||
|
|
||||||
URL imdbUrl = new URL("http", host, href);
|
|
||||||
|
|
||||||
return new MovieDescriptor(title, imdbId, year, imdbUrl);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private URL getSearchUrl(String searchterm) throws UnsupportedEncodingException, MalformedURLException {
|
|
||||||
String qs = URLEncoder.encode(searchterm, "UTF-8");
|
|
||||||
String file = "/find?q=" + qs + ";s=tt";
|
|
||||||
return new URL("http", host, file);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public ImageIcon getIcon() {
|
|
||||||
return ResourceManager.getIcon("search.imdb");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in New Issue