* use animetitles.dat.gz instead of scraping the anidb search interface

* use disk-persistent cache for all anidb data (expire after 30 days)
This commit is contained in:
Reinhard Pointner 2009-10-28 15:09:47 +00:00
parent d3331f3053
commit 0a2d323ac4
6 changed files with 227 additions and 156 deletions

View File

@ -132,6 +132,17 @@
memoryStoreEvictionPolicy="LRU"
/>
<!--
Very long-lived cache (one month!) for AniDB anime list and episode information.
-->
<cache name="anidb"
maxElementsInMemory="20"
timeToIdleSeconds="2628000"
timeToLiveSeconds="2628000"
diskPersistent="true"
memoryStoreEvictionPolicy="LRU"
/>
<!--
Simple memory cache for calculated checksums. Time to live is 2 hours. This cache is used in EpisodeFormatBindingBean

View File

@ -20,6 +20,7 @@ import javax.swing.UIManager;
import org.kohsuke.args4j.CmdLineParser;
import net.sf.ehcache.CacheManager;
import net.sourceforge.filebot.format.ExpressionFormat;
import net.sourceforge.filebot.ui.MainFrame;
import net.sourceforge.filebot.ui.NotificationLoggingHandler;
@ -35,6 +36,7 @@ public class Main {
public static void main(String... args) throws Exception {
// initialize this stuff before anything else
initializeLogging();
initializeCache();
initializeSecurityManager();
// parse arguments
@ -100,6 +102,20 @@ public class Main {
}
/**
 * Registers a JVM shutdown hook that shuts down the ehcache {@link CacheManager},
 * so that disk-persistent stores can actually be saved to disk on exit.
 */
private static void initializeCache() {
    Thread cacheShutdownHook = new Thread() {

        @Override
        public void run() {
            // the singleton is looked up when the hook fires, not when it is registered
            CacheManager.getInstance().shutdown();
        }
    };

    Runtime.getRuntime().addShutdownHook(cacheShutdownHook);
}
/**
* Initialize default SecurityManager and grant all permissions via security policy.
* Initialization is required in order to run {@link ExpressionFormat} in a secure sandbox.

View File

@ -6,16 +6,27 @@ import static net.sourceforge.filebot.web.WebRequest.*;
import static net.sourceforge.tuned.XPathUtilities.*;
import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Level;
import java.util.Map;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.AbstractMap.SimpleEntry;
import java.util.Map.Entry;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.swing.Icon;
@ -23,6 +34,12 @@ import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;
import net.sourceforge.filebot.ResourceManager;
@ -30,6 +47,8 @@ public class AnidbClient implements EpisodeListProvider {
private static final String host = "anidb.net";
private static final Cache cache = CacheManager.getInstance().getCache("anidb");
@Override
public String getName() {
@ -45,74 +64,81 @@ public class AnidbClient implements EpisodeListProvider {
@Override
public List<SearchResult> search(String query) throws IOException, SAXException {
// Air Status: ignore
// Anime Type: TV Series, TV Special, OVA
// Hide Synonyms: true
URL searchUrl = new URL("http", host, "/perl-bin/animedb.pl?type.tvspecial=1&type.tvseries=1&type.ova=1&show=animelist&orderby.name=0.1&noalias=1&do.update=update&adb.search=" + URLEncoder.encode(query, "UTF-8"));
// normalize
query = query.toLowerCase();
Document dom = getHtmlDocument(searchUrl);
AbstractStringMetric metric = new QGramsDistance();
List<Node> nodes = selectNodes("//TABLE[@class='animelist']//TR/TD/ancestor::TR", dom);
final List<Entry<SearchResult, Float>> resultSet = new ArrayList<Entry<SearchResult, Float>>();
List<SearchResult> results = new ArrayList<SearchResult>(nodes.size());
for (AnidbSearchResult anime : getAnimeTitles()) {
for (String name : new String[] { anime.getMainTitle(), anime.getEnglishTitle() }) {
if (name != null) {
float similarity = metric.getSimilarity(name.toLowerCase(), query);
for (Node node : nodes) {
Node link = selectNode("./TD[@class='name']/A", node);
if (similarity > 0.5 || name.toLowerCase().contains(query)) {
resultSet.add(new SimpleEntry<SearchResult, Float>(anime, similarity));
String title = getTextContent(link);
String href = getAttribute("href", link);
try {
results.add(new HyperLink(title, new URL("http", host, "/perl-bin/" + href)));
} catch (MalformedURLException e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Invalid href: " + href);
// add only once
break;
}
}
// we might have been redirected to the episode list page
if (results.isEmpty()) {
// get anime information from document
String link = selectString("//*[@class='data']//A[@class='short_link']/@href", dom);
// check if page is an anime page, or an empty search result page
if (!link.isEmpty()) {
try {
results.add(new HyperLink(selectTitle(dom), new URL(link)));
} catch (MalformedURLException e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Invalid location: " + link);
}
}
}
return results;
// sort by similarity descending (best matches first)
Collections.sort(resultSet, new Comparator<Entry<SearchResult, Float>>() {
@Override
public int compare(Entry<SearchResult, Float> o1, Entry<SearchResult, Float> o2) {
return o2.getValue().compareTo(o1.getValue());
}
});
// view for the first 20 search results
return new AbstractList<SearchResult>() {
@Override
public SearchResult get(int index) {
return resultSet.get(index).getKey();
}
protected String selectTitle(Document animePage) {
// extract name from header (e.g. "Anime: Naruto")
return selectString("//H1", animePage).replaceFirst("^Anime:\\s*", "");
@Override
public int size() {
return Math.min(20, resultSet.size());
}
};
}
@Override
public List<Episode> getEpisodeList(SearchResult searchResult) throws IOException, SAXException {
int aid = getAnimeID(getEpisodeListLink(searchResult));
int aid = ((AnidbSearchResult) searchResult).getAnimeId();
URL url = new URL("http", host, "/perl-bin/animedb.pl?show=xml&t=anime&aid=" + aid);
// try cache first
try {
return Arrays.asList((Episode[]) cache.get(url.toString()).getValue());
} catch (Exception e) {
// ignore
}
// get anime page as xml
Document dom = getDocument(new URL("http", host, "/perl-bin/animedb.pl?show=xml&t=anime&aid=" + aid));
Document dom = getDocument(url);
// select main title
String animeTitle = selectString("//anime/titles/title[@type='main']/text()", dom);
String animeTitle = selectString("//title[@type='main']", dom);
List<Episode> episodes = new ArrayList<Episode>(25);
for (Node node : selectNodes("//anime/eps/ep", dom)) {
for (Node node : selectNodes("//ep", dom)) {
String flags = getTextContent("flags", node);
// allow only normal and recap episodes
if (flags == null || flags.equals("2")) {
String number = getTextContent("epno", node);
String title = selectString("./titles/title[@lang='en']", node);
String title = selectString(".//title[@lang='en']", node);
// no seasons for anime
episodes.add(new Episode(animeTitle, null, number, title));
@ -120,72 +146,27 @@ public class AnidbClient implements EpisodeListProvider {
}
// sanity check
if (episodes.isEmpty()) {
if (episodes.size() > 0) {
// populate cache
cache.put(new Element(url.toString(), episodes.toArray(new Episode[0])));
} else {
// anime page xml doesn't work sometimes
Logger.getLogger(getClass().getName()).warning(String.format("Failed to parse episode data from xml: %s (%d)", searchResult, aid));
// fall back to good old page scraper
return scrapeEpisodeList(searchResult);
}
return episodes;
}
/**
 * Builds the episode list for an anime by scraping its HTML page.
 * Only rows whose episode number consists solely of digits are kept.
 *
 * @param searchResult the anime whose episode list page is fetched
 * @return the scraped episodes, in the order in which their rows appear on the page
 * @throws IOException if the page cannot be fetched
 * @throws SAXException if the page cannot be parsed
 */
protected List<Episode> scrapeEpisodeList(SearchResult searchResult) throws IOException, SAXException {
    URL animePageUrl = getEpisodeListLink(searchResult).toURL();
    Document animePage = getHtmlDocument(animePageUrl);

    // the series name is taken from the anime page itself
    String seriesName = selectTitle(animePage);

    List<Node> episodeRows = selectNodes("id('eplist')//TR/TD/SPAN/ancestor::TR", animePage);
    List<Episode> scraped = new ArrayList<Episode>(episodeRows.size());

    for (Node row : episodeRows) {
        List<Node> cells = getChildren("TD", row);
        String episodeNumber = getTextContent("A", cells.get(0));
        String episodeTitle = getTextContent("LABEL", cells.get(1));

        // anything that is not purely numeric is probably some kind of special (S1, S2, ...)
        if (!episodeNumber.matches("\\d+")) {
            continue;
        }

        // no seasons for anime
        scraped.add(new Episode(seriesName, null, episodeNumber, episodeTitle));
    }

    return scraped;
}
// compiled once; "aid" must be a complete parameter name, so "said=7" or "paid=7" is not mistaken for it
private static final Pattern aidQueryPattern = Pattern.compile("(?:^|&)aid=(\\d+)");

// short link form, where the id is the last path segment prefixed with 'a'
private static final Pattern aidPathPattern = Pattern.compile("/a(\\d+)$");

/**
 * Extracts the AniDB anime id from an anime page URI. Two forms are understood:
 * an {@code aid} query parameter (e.g. http://anidb.net/perl-bin/animedb.pl?show=anime&aid=26)
 * and the short link path (e.g. http://anidb.net/a26). The query parameter is checked first.
 *
 * @param uri the anime page URI
 * @return the anime id found in the URI
 * @throws IllegalArgumentException if the URI contains no anime id; this includes the
 *         {@link NumberFormatException} raised when the digits do not fit into an int
 */
protected int getAnimeID(URI uri) {
    // e.g. http://anidb.net/perl-bin/animedb.pl?show=anime&aid=26
    String query = uri.getQuery();

    if (query != null) {
        Matcher queryMatcher = aidQueryPattern.matcher(query);

        if (queryMatcher.find()) {
            return Integer.parseInt(queryMatcher.group(1));
        }
    }

    // e.g. http://anidb.net/a26
    String path = uri.getPath();

    if (path != null) {
        Matcher pathMatcher = aidPathPattern.matcher(path);

        if (pathMatcher.find()) {
            return Integer.parseInt(pathMatcher.group(1));
        }
    }

    // no aid found
    throw new IllegalArgumentException("URI does not contain an aid: " + uri);
}
@Override
public URI getEpisodeListLink(SearchResult searchResult) {
return ((HyperLink) searchResult).getURI();
int aid = ((AnidbSearchResult) searchResult).getAnimeId();
try {
return new URI("http", host, "/a" + aid, null);
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
}
@ -206,4 +187,95 @@ public class AnidbClient implements EpisodeListProvider {
return null;
}
/**
 * Returns the main title and the official English title of every anime listed in
 * AniDB's {@code animetitles.dat.gz} dump, ordered by ascending anime id.
 * <p>
 * The "anidb" cache is consulted first. On a miss the dump is downloaded and parsed,
 * and the result is stored in the cache, but only if it is non-empty.
 *
 * @return one search result per primary title found in the dump; never {@code null}
 * @throws MalformedURLException if the dump URL cannot be constructed
 * @throws IOException if the dump cannot be downloaded or read completely
 * @throws SAXException declared as part of the existing signature; not raised by this implementation
 */
private AnidbSearchResult[] getAnimeTitles() throws MalformedURLException, IOException, SAXException {
    URL url = new URL("http", host, "/api/animetitles.dat.gz");
    String cacheKey = url.toString();

    // try cache first
    try {
        Element cached = cache.get(cacheKey);

        if (cached != null) {
            return (AnidbSearchResult[]) cached.getValue();
        }
    } catch (Exception e) {
        // best-effort: a broken cache entry must not prevent a fresh download
        Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to read anime titles from cache", e);
    }

    // <aid>|<type>|<language>|<title>
    // type: 1=primary title (one per anime), 2=synonyms (multiple per anime), 3=shorttitles (multiple per anime), 4=official title (one per language)
    Pattern pattern = Pattern.compile("^(?!#)(\\d+)[|](\\d)[|]([\\w-]+)[|](.+)$");

    // TreeMap keeps the primary titles sorted by anime id
    Map<Integer, String> primaryTitleMap = new TreeMap<Integer, String>();
    Map<Integer, String> englishTitleMap = new HashMap<Integer, String>();

    // fetch data
    java.io.InputStream in = url.openStream();
    Scanner scanner = null;

    try {
        // the GZIP header is read here and may fail, in which case only the raw stream exists
        scanner = new Scanner(new GZIPInputStream(in), "UTF-8");

        while (scanner.hasNextLine()) {
            Matcher matcher = pattern.matcher(scanner.nextLine());

            if (matcher.matches()) {
                String type = matcher.group(2);

                if (type.equals("1")) {
                    primaryTitleMap.put(Integer.parseInt(matcher.group(1)), matcher.group(4));
                } else if (type.equals("4") && matcher.group(3).equals("en")) {
                    englishTitleMap.put(Integer.parseInt(matcher.group(1)), matcher.group(4));
                }
            }
        }

        // Scanner reports a read error as end-of-input; rethrow it so that a truncated
        // download is not mistaken for the complete list and cached
        IOException readError = scanner.ioException();

        if (readError != null) {
            throw readError;
        }
    } finally {
        if (scanner != null) {
            scanner.close();
        } else {
            in.close();
        }
    }

    List<AnidbSearchResult> anime = new ArrayList<AnidbSearchResult>(primaryTitleMap.size());

    for (Entry<Integer, String> entry : primaryTitleMap.entrySet()) {
        anime.add(new AnidbSearchResult(entry.getKey(), entry.getValue(), englishTitleMap.get(entry.getKey())));
    }

    AnidbSearchResult[] result = anime.toArray(new AnidbSearchResult[0]);

    // populate cache, but never with an empty list (same policy as the episode list cache)
    if (result.length > 0) {
        cache.put(new Element(cacheKey, result));
    }

    return result;
}
/**
 * Search result identifying an anime by its AniDB anime id, carrying the main title
 * and, where available, the official English title.
 * <p>
 * Instances are {@link Serializable} so that they can be stored in the cache.
 */
public static class AnidbSearchResult extends SearchResult implements Serializable {

    // explicit version id, so that unrelated recompilation does not invalidate stored instances
    private static final long serialVersionUID = 1L;

    protected int aid;
    protected String mainTitle;
    protected String englishTitle;


    protected AnidbSearchResult() {
        // used by serializer
    }


    /**
     * @param aid the AniDB anime id
     * @param mainTitle the primary title of the anime
     * @param englishTitle the official English title, or {@code null} if there is none
     */
    public AnidbSearchResult(int aid, String mainTitle, String englishTitle) {
        this.aid = aid;
        this.mainTitle = mainTitle;
        this.englishTitle = englishTitle;
    }


    /**
     * @return the AniDB anime id
     */
    public int getAnimeId() {
        return aid;
    }


    /**
     * @return the main title, which serves as the name of this search result
     */
    @Override
    public String getName() {
        return mainTitle;
    }


    /**
     * @return the primary title of the anime
     */
    public String getMainTitle() {
        return mainTitle;
    }


    /**
     * @return the official English title, or {@code null} if none was given
     */
    public String getEnglishTitle() {
        return englishTitle;
    }

}
}

View File

@ -8,10 +8,15 @@ import java.util.Arrays;
public class Episode implements Serializable {
private final String seriesName;
private final String season;
private final String episode;
private final String title;
private String seriesName;
private String season;
private String episode;
private String title;
protected Episode() {
// used by serializer
}
public Episode(String seriesName, int season, int episode, String title) {

View File

@ -2,13 +2,16 @@
package net.sourceforge.filebot.web;
public abstract class SearchResult {
protected final String name;
protected SearchResult() {
this.name = null;
}
public SearchResult(String name) {
this.name = name;
}
@ -21,7 +24,7 @@ public abstract class SearchResult {
@Override
public String toString() {
return name;
return getName();
}
}

View File

@ -2,39 +2,39 @@
package net.sourceforge.filebot.web;
import static net.sourceforge.filebot.web.WebRequest.*;
import static org.junit.Assert.*;
import java.net.URL;
import java.util.List;
import org.junit.BeforeClass;
import org.junit.Test;
import net.sourceforge.filebot.web.AnidbClient.AnidbSearchResult;
public class AnidbClientTest {
/**
* 74 episodes
*/
private static HyperLink monsterSearchResult;
private static AnidbSearchResult monsterSearchResult;
/**
* 45 episodes, direct result page (short_link)
* 45 episodes
*/
private static HyperLink twelvekingdomsSearchResult;
private static AnidbSearchResult twelvekingdomsSearchResult;
/**
* 38 episodes, lots of special characters
*/
private static HyperLink princessTutuSearchResult;
private static AnidbSearchResult princessTutuSearchResult;
@BeforeClass
public static void setUpBeforeClass() throws Exception {
monsterSearchResult = new HyperLink("Monster", new URL("http://anidb.net/perl-bin/animedb.pl?show=anime&aid=1539"));
twelvekingdomsSearchResult = new HyperLink("Juuni Kokuki", new URL("http://anidb.net/a26"));
princessTutuSearchResult = new HyperLink("Princess Tutu", new URL("http://anidb.net/a516"));
monsterSearchResult = new AnidbSearchResult(1539, "Monster", null);
twelvekingdomsSearchResult = new AnidbSearchResult(26, "Juuni Kokuki", "The Twelve Kingdoms");
princessTutuSearchResult = new AnidbSearchResult(516, "Princess Tutu", null);
}
@ -45,10 +45,10 @@ public class AnidbClientTest {
public void search() throws Exception {
List<SearchResult> results = anidb.search("one piece");
HyperLink result = (HyperLink) results.get(0);
AnidbSearchResult result = (AnidbSearchResult) results.get(0);
assertEquals("One Piece", result.getName());
assertEquals("http://anidb.net/perl-bin/animedb.pl?show=anime&aid=69", result.getURL().toString());
assertEquals(69, result.getAnimeId());
}
@ -60,23 +60,6 @@ public class AnidbClientTest {
}
/**
 * Searching for "one piece" must yield exactly one result whose name is "One Piece"
 * (compared case-insensitively).
 */
@Test
public void searchHideSynonyms() throws Exception {
    List<SearchResult> results = anidb.search("one piece");

    int count = 0;

    for (SearchResult result : results) {
        if ("one piece".equalsIgnoreCase(result.getName())) {
            count++;
        }
    }

    // must only occur once (integral comparison; no floating-point delta needed for a count)
    assertEquals(1, count);
}
@Test
public void searchTitleAlias() throws Exception {
// Seikai no Senki (main title), Banner of the Stars (official English title)
@ -88,19 +71,6 @@ public class AnidbClientTest {
}
/**
 * Searching for "twelve kingdoms" is expected to yield a single result:
 * "Juuni Kokuki", linked via its short URL.
 */
@Test
public void searchPageRedirect() throws Exception {
    List<SearchResult> matches = anidb.search("twelve kingdoms");
    int matchCount = matches.size();

    assertEquals(1, matchCount);

    HyperLink onlyMatch = (HyperLink) matches.get(0);

    String name = onlyMatch.getName();
    assertEquals("Juuni Kokuki", name);

    String location = onlyMatch.getURL().toString();
    assertEquals("http://anidb.net/a26", location);
}
@Test
public void getEpisodeListAll() throws Exception {
List<Episode> list = anidb.getEpisodeList(monsterSearchResult);
@ -143,15 +113,9 @@ public class AnidbClientTest {
}
/**
 * The title selected from the anime page at http://anidb.net/a4 must be "Seikai no Senki".
 */
@Test
public void selectTitle() throws Exception {
    URL animePage = new URL("http://anidb.net/a4");
    String title = anidb.selectTitle(getHtmlDocument(animePage));

    assertEquals("Seikai no Senki", title);
}
@Test
public void getEpisodeListLink() throws Exception {
assertEquals(monsterSearchResult.getURL().toString(), anidb.getEpisodeListLink(monsterSearchResult).toURL().toString());
assertEquals("http://anidb.net/a1539", anidb.getEpisodeListLink(monsterSearchResult).toURL().toString());
}
}