* heavily improved Subscene support (up to 35x faster)

This commit is contained in:
Reinhard Pointner 2008-07-06 18:31:04 +00:00
parent a94cedd601
commit 9eb74e8038
8 changed files with 248 additions and 89 deletions

View File

@ -6,7 +6,6 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL; import java.net.URL;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.text.NumberFormat; import java.text.NumberFormat;
@ -60,10 +59,8 @@ public class AnidbClient extends EpisodeListClient {
String path = "/perl-bin/" + href; String path = "/perl-bin/" + href;
try { try {
URI animeUrl = new URI("http", host, path, null); searchResults.add(new HyperLink(title, new URL("http", host, path)));
} catch (MalformedURLException e) {
searchResults.add(new HyperLink(title, animeUrl));
} catch (URISyntaxException e) {
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href); Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href);
} }
} }
@ -76,7 +73,7 @@ public class AnidbClient extends EpisodeListClient {
String header = XPathUtil.selectString("id('layout-content')//H1[1]", dom); String header = XPathUtil.selectString("id('layout-content')//H1[1]", dom);
String title = header.replaceFirst("Anime:\\s*", ""); String title = header.replaceFirst("Anime:\\s*", "");
searchResults.add(new HyperLink(title, URI.create(getSearchUrl(searchterm).toString()))); searchResults.add(new HyperLink(title, getSearchUrl(searchterm)));
} }
} }
@ -123,7 +120,7 @@ public class AnidbClient extends EpisodeListClient {
@Override @Override
public URI getEpisodeListLink(SearchResult searchResult) { public URI getEpisodeListLink(SearchResult searchResult) {
return ((HyperLink) searchResult).getURI(); return ((HyperLink) searchResult).toURI();
} }

View File

@ -10,6 +10,7 @@ import java.net.URI;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Map;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import java.util.regex.Matcher; import java.util.regex.Matcher;
@ -56,6 +57,17 @@ public class HtmlUtil {
} }
/**
 * Opens a connection to the given URL, adds the given request headers to it and
 * parses the response via {@link #getHtmlDocument(URLConnection)}.
 *
 * @param url location of the document to fetch
 * @param requestHeaders header names mapped to their values; every pair is added to the
 *            connection as a request property. May be {@code null} or empty when no
 *            additional headers are required
 * @return the document returned by {@link #getHtmlDocument(URLConnection)}
 * @throws IOException if the connection cannot be opened, or if the delegate fails with it
 * @throws SAXException if the delegate fails with it
 */
public static Document getHtmlDocument(URL url, Map<String, String> requestHeaders) throws IOException, SAXException {
    URLConnection connection = url.openConnection();

    if (requestHeaders != null) {
        // walk the entries directly instead of looking every key up a second time
        for (Map.Entry<String, String> header : requestHeaders.entrySet()) {
            connection.addRequestProperty(header.getKey(), header.getValue());
        }
    }

    return getHtmlDocument(connection);
}
public static Document getHtmlDocument(URLConnection connection) throws IOException, SAXException { public static Document getHtmlDocument(URLConnection connection) throws IOException, SAXException {
Charset charset = getCharset(connection.getContentType()); Charset charset = getCharset(connection.getContentType());
String encoding = connection.getContentEncoding(); String encoding = connection.getContentEncoding();

View File

@ -4,26 +4,31 @@ package net.sourceforge.filebot.web;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.net.URL;
public class HyperLink extends SearchResult { public class HyperLink extends SearchResult {
private final URI uri; private final URL url;
public HyperLink(String name, URI uri) { public HyperLink(String name, URL url) {
super(name); super(name);
this.uri = uri; this.url = url;
} }
public HyperLink(String name, String uri) throws URISyntaxException { public URL getURL() {
this(name, new URI(uri)); return url;
} }
public URI getURI() { public URI toURI() {
return uri; try {
return url.toURI();
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
} }
} }

View File

@ -6,15 +6,16 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL; import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import java.util.regex.Matcher; import java.util.regex.Matcher;
@ -33,6 +34,8 @@ public class SubsceneSubtitleClient extends SubtitleClient {
private final SearchResultCache cache = new SearchResultCache(); private final SearchResultCache cache = new SearchResultCache();
private final Map<String, Integer> languageFilterMap = new ConcurrentHashMap<String, Integer>(50);
private final String host = "subscene.com"; private final String host = "subscene.com";
@ -56,13 +59,14 @@ public class SubsceneSubtitleClient extends SubtitleClient {
for (Node node : nodes) { for (Node node : nodes) {
String title = XPathUtil.selectString("text()", node); String title = XPathUtil.selectString("text()", node);
String href = XPathUtil.selectString("@href", node); String href = XPathUtil.selectString("@href", node);
String count = XPathUtil.selectString("./DFN", node).replaceAll("\\D+", "");
try { try {
//TODO which exception? URL subtitleListUrl = new URL("http", host, href);
URI url = new URI("http", host, href); int subtitleCount = Integer.parseInt(count);
searchResults.add(new HyperLink(title, url)); searchResults.add(new SubsceneSearchResult(title, subtitleListUrl, subtitleCount));
} catch (URISyntaxException e) { } catch (MalformedURLException e) {
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href, e); Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href, e);
} }
} }
@ -72,85 +76,127 @@ public class SubsceneSubtitleClient extends SubtitleClient {
return searchResults; return searchResults;
} }
HashMap<String, String> languageIdCache;
/**
 * Fills {@code languageFilterMap} from the language list found in a subtitle list
 * document. For every entry of the list, the first run of digits in the INPUT
 * element's {@code onclick} attribute is stored as the filter value, keyed by the
 * lower-cased LABEL text. Entries without any digits are skipped.
 *
 * @param subtitleListDocument document that is searched for the language list
 */
private void updateLanguageFilterMap(Document subtitleListDocument) {
    for (Node languageNode : XPathUtil.selectNodes("//DIV[@class='languageList']/DIV", subtitleListDocument)) {
        String onClickHandler = XPathUtil.selectString("./INPUT/@onclick", languageNode);

        // the filter value is the first number that appears in the onclick handler
        String filterValue = new Scanner(onClickHandler).findInLine("\\d+");

        if (filterValue == null) {
            continue;
        }

        String languageLabel = XPathUtil.selectString("./LABEL/text()", languageNode);
        languageFilterMap.put(languageLabel.toLowerCase(), Integer.valueOf(filterValue));
    }
}
/**
 * Looks up the filter value that was recorded for the given language name.
 *
 * @param languageName language name; it is lower-cased before the lookup
 * @return the recorded filter value, or {@code null} if the name is {@code null}
 *         or no value is recorded for it
 */
private Integer getLanguageFilter(String languageName) {
    // without a language name there is nothing to look up
    return (languageName == null) ? null : languageFilterMap.get(languageName.toLowerCase());
}
public String getLanguageID(Locale language) {
return languageIdCache.get(language.getDisplayLanguage(Locale.ENGLISH).toLowerCase()); private String getLanguageName(Locale language) {
if (language == null || language == Locale.ROOT)
return null;
return language.getDisplayLanguage(Locale.ENGLISH);
} }
@Override @Override
public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, Locale language) throws Exception { public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, Locale language) throws Exception {
URL url = getSubtitleListLink(searchResult).toURL(); URL subtitleListUrl = getSubtitleListLink(searchResult).toURL();
String languageName = getLanguageName(language);
Integer languageFilter = getLanguageFilter(languageName);
Document dom = null; boolean reloadFilteredDocument = (languageFilter == null && useFilteredDocument(searchResult));
boolean forceReload = false;
if (languageIdCache != null) { if (reloadFilteredDocument && languageFilterMap.isEmpty()) {
URLConnection connection = url.openConnection(); // we don't know the filter values yet, so we request a document with an invalid filter,
// that will return a subtitle document very fast
languageFilter = -1;
forceReload = true;
}
Document subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter);
if (languageFilterMap.isEmpty()) {
updateLanguageFilterMap(subtitleListDocument);
}
// check if document is already filtered and if requesting a filtered document
// will result in a performance gain (Note: XPath can be very slow)
if (reloadFilteredDocument) {
languageFilter = getLanguageFilter(languageName);
if (language != null && language != Locale.ROOT) { // if language filter has become available, request a filtered document, or if first request was a dummy request
System.out.println(getLanguageID(language)); if (languageFilter != null || forceReload) {
connection.addRequestProperty("Cookie", "subscene_sLanguageIds=" + getLanguageID(language)); subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter);
}
dom = HtmlUtil.getHtmlDocument(connection);
} else {
URLConnection connection = url.openConnection();
dom = HtmlUtil.getHtmlDocument(connection);
List<Node> nodes = XPathUtil.selectNodes("//DIV[@class='languageList']/DIV", dom);
Pattern onClickPattern = Pattern.compile("selectLanguage\\((\\d+)\\);");
languageIdCache = new HashMap<String, String>();
for (Node node : nodes) {
Matcher matcher = onClickPattern.matcher(XPathUtil.selectString("./INPUT/@onclick", node));
if (matcher.matches()) {
String name = XPathUtil.selectString("./LABEL/text()", node);
String id = matcher.group(1);
//TODO sysout
System.out.println(name + " = " + id);
languageIdCache.put(name.toLowerCase(), id);
}
} }
} }
List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='filmSubtitleList']//A[@id]//ancestor::TR", dom); return getSubtitleList(subtitleListUrl, languageName, subtitleListDocument);
}
/**
 * Tells whether a language-filtered subtitle list should be requested for the
 * given search result, which is the case when it reports more than 100 subtitles.
 *
 * @param searchResult search result whose subtitle count is inspected
 * @return {@code true} if the result is a {@link SubsceneSearchResult} with more than
 *         100 subtitles; {@code false} otherwise, including for results that carry
 *         no subtitle count
 */
private boolean useFilteredDocument(SearchResult searchResult) {
    // only SubsceneSearchResult knows its subtitle count; for any other result
    // (e.g. a plain HyperLink) fall back to the unfiltered document instead of
    // failing with a ClassCastException
    if (!(searchResult instanceof SubsceneSearchResult)) {
        return false;
    }

    return ((SubsceneSearchResult) searchResult).getSubtitleCount() > 100;
}
private Document getSubtitleListDocument(URL subtitleListUrl, Integer languageFilter) throws IOException, SAXException {
Map<String, String> requestHeaders = new HashMap<String, String>(1);
Pattern hrefPattern = Pattern.compile("javascript:Subtitle\\((\\d+), '(\\w+)', '\\d+', '(\\d+)'\\);"); if (languageFilter != null) {
requestHeaders.put("Cookie", "subscene_sLanguageIds=" + languageFilter);
}
ArrayList<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size()); return HtmlUtil.getHtmlDocument(subtitleListUrl, requestHeaders);
}
private List<SubtitleDescriptor> getSubtitleList(URL subtitleListUrl, String languageName, Document subtitleListDocument) {
List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='filmSubtitleList']//A[@id]//ancestor::TR", subtitleListDocument);
Pattern hrefPattern = Pattern.compile("javascript:Subtitle\\((\\d+), '(\\w+)', .*");
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
for (Node node : nodes) { for (Node node : nodes) {
try { try {
Node linkNode = XPathUtil.selectFirstNode("./TD[1]/A", node); Node linkNode = XPathUtil.selectFirstNode("./TD[1]/A", node);
String lang = XPathUtil.selectString("./SPAN[1]", linkNode); String lang = XPathUtil.selectString("./SPAN[1]", linkNode);
String href = XPathUtil.selectString("@href", linkNode); if (languageName == null || languageName.equalsIgnoreCase(lang)) {
String name = XPathUtil.selectString("./SPAN[2]", linkNode); String href = XPathUtil.selectString("@href", linkNode);
String name = XPathUtil.selectString("./SPAN[2]", linkNode);
String author = XPathUtil.selectString("./TD[4]", node); String author = XPathUtil.selectString("./TD[4]", node);
Matcher matcher = hrefPattern.matcher(href); Matcher matcher = hrefPattern.matcher(href);
if (!matcher.matches()) if (!matcher.matches())
throw new IllegalArgumentException("Cannot extract download parameters: " + href); throw new IllegalArgumentException("Cannot extract download parameters: " + href);
String subtitleId = matcher.group(1); String subtitleId = matcher.group(1);
String typeId = matcher.group(2); String typeId = matcher.group(2);
URL downloadUrl = getDownloadUrl(url, subtitleId, typeId); URL downloadUrl = getDownloadUrl(subtitleListUrl, subtitleId, typeId);
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, author, typeId, downloadUrl, url)); subtitles.add(new SubsceneSubtitleDescriptor(name, lang, author, typeId, downloadUrl, subtitleListUrl));
}
} catch (Exception e) { } catch (Exception e) {
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Cannot parse subtitle node", e); Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Cannot parse subtitle node", e);
} }
@ -170,7 +216,7 @@ public class SubsceneSubtitleClient extends SubtitleClient {
@Override @Override
public URI getSubtitleListLink(SearchResult searchResult) { public URI getSubtitleListLink(SearchResult searchResult) {
return ((HyperLink) searchResult).getURI(); return ((HyperLink) searchResult).toURI();
} }
@ -180,4 +226,22 @@ public class SubsceneSubtitleClient extends SubtitleClient {
return new URL("http", host, file); return new URL("http", host, file);
} }
/**
 * A {@link HyperLink} search result that additionally remembers the number of
 * subtitles it was created with.
 */
protected static class SubsceneSearchResult extends HyperLink {

    /** Subtitle count handed to the constructor; never changes afterwards. */
    private final int subtitleCount;


    /**
     * @param name passed on to {@link HyperLink}
     * @param url passed on to {@link HyperLink}
     * @param subtitleCount value later returned by {@link #getSubtitleCount()}
     */
    public SubsceneSearchResult(String name, URL url, int subtitleCount) {
        super(name, url);

        this.subtitleCount = subtitleCount;
    }


    /**
     * @return the subtitle count this result was created with
     */
    public int getSubtitleCount() {
        return subtitleCount;
    }
}
} }

View File

@ -6,7 +6,6 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL; import java.net.URL;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.text.NumberFormat; import java.text.NumberFormat;
@ -65,10 +64,10 @@ public class TVDotComClient extends EpisodeListClient {
String href = XPathUtil.selectString("@href", node); String href = XPathUtil.selectString("@href", node);
try { try {
String episodeListingUrl = href.replaceFirst(Pattern.quote("summary.html?") + ".*", "episode_listings.html"); URL episodeListingUrl = new URL(href.replaceFirst(Pattern.quote("summary.html?") + ".*", "episode_listings.html"));
searchResults.add(new HyperLink(title, episodeListingUrl)); searchResults.add(new HyperLink(title, episodeListingUrl));
} catch (URISyntaxException e) { } catch (MalformedURLException e) {
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href, e); Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href, e);
} }
} }
@ -169,7 +168,7 @@ public class TVDotComClient extends EpisodeListClient {
@Override @Override
public URI getEpisodeListLink(SearchResult searchResult, int season) { public URI getEpisodeListLink(SearchResult searchResult, int season) {
String episodeListingUrl = ((HyperLink) searchResult).getURI().toString(); URL episodeListingUrl = ((HyperLink) searchResult).getURL();
return URI.create(episodeListingUrl + "?season=" + season); return URI.create(episodeListingUrl + "?season=" + season);
} }

View File

@ -0,0 +1,72 @@
package net.sourceforge.filebot.web;
import static org.junit.Assert.assertEquals;
import java.net.URL;
import java.util.List;
import java.util.Locale;
import net.sourceforge.filebot.ui.panel.subtitle.LanguageResolver;
import net.sourceforge.filebot.web.SubsceneSubtitleClient.SubsceneSearchResult;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests for {@link SubsceneSubtitleClient}: searching, listing subtitles for a
 * search result in a given language, and resolving the subtitle list link.
 *
 * NOTE(review): the expected values look like they are pinned to what subscene.com
 * returned when the test was written, and the client presumably performs live
 * requests - confirm before relying on these tests offline.
 */
public class SubsceneSubtitleClientTest {
// search result with a small subtitle count (17)
private static SubsceneSearchResult testResult;
// search result with a large subtitle count (420)
private static SubsceneSearchResult manySubtitlesTestResult;
// a fresh client instance is created for every test
private SubsceneSubtitleClient client = new SubsceneSubtitleClient();
// builds the two shared search results once for all tests
@BeforeClass
public static void setUpBeforeClass() throws Exception {
testResult = new SubsceneSearchResult("Twin Peaks - First Season (1990)", new URL("http://subscene.com/twin-peaks--first-season/subtitles-32482.aspx"), 17);
manySubtitlesTestResult = new SubsceneSearchResult("Lost - Fourth Season (2008)", new URL("http://subscene.com/Lost-Fourth-Season/subtitles-70963.aspx"), 420);
}
// searching for "twin peaks" must yield testResult (name, URL, subtitle count) as the second entry
@Test
public void search() throws Exception {
List<SearchResult> results = client.search("twin peaks");
// index 1: the expected entry is the second search result, not the first
SubsceneSearchResult result = (SubsceneSearchResult) results.get(1);
assertEquals(testResult.getName(), result.getName());
// URLs are compared by their string form
assertEquals(testResult.getURL().toString(), result.getURL().toString());
assertEquals(testResult.getSubtitleCount(), result.getSubtitleCount());
}
// requesting Italian subtitles for testResult must yield exactly one zip-archived subtitle
@Test
public void getSubtitleListSearchResult() throws Exception {
List<SubtitleDescriptor> subtitleList = client.getSubtitleList(testResult, Locale.ITALIAN);
assertEquals(1, subtitleList.size());
SubtitleDescriptor subtitle = subtitleList.get(0);
assertEquals("Twin Peaks - First Season", subtitle.getName());
assertEquals("Italian", subtitle.getLanguageName());
assertEquals("zip", subtitle.getArchiveType());
}
// requesting Vietnamese subtitles for the result with 420 subtitles must yield exactly one entry
@Test
public void getSubtitleListSearchResultMany() throws Exception {
// the locale for "Vietnamese" is obtained from LanguageResolver
List<SubtitleDescriptor> subtitleList = client.getSubtitleList(manySubtitlesTestResult, LanguageResolver.getDefault().getLocale("Vietnamese"));
assertEquals(1, subtitleList.size());
}
// the subtitle list link of a search result must match the URL the result was created with
@Test
public void getSubtitleListLink() throws Exception {
assertEquals(testResult.getURL().toString(), client.getSubtitleListLink(testResult).toURL().toString());
}
}

View File

@ -4,21 +4,30 @@ package net.sourceforge.filebot.web;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import java.net.URI; import java.net.URL;
import java.util.List; import java.util.List;
import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
public class TVDotComClientTest { public class TVDotComClientTest {
private static TVDotComClient tvdotcom = new TVDotComClient(); private static HyperLink testResult;
private static HyperLink singleSeasonTestResult;
private static HyperLink manySeasonsTestResult;
private static HyperLink testResult = new HyperLink("Buffy the Vampire Slayer", URI.create("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html")); private TVDotComClient tvdotcom = new TVDotComClient();
private static HyperLink singleSeasonTestResult = new HyperLink("Firefly", URI.create("http://www.tv.com/firefly/show/7097/episode_listings.html"));
private static HyperLink manySeasonsTestResult = new HyperLink("Doctor Who", URI.create("http://www.tv.com/doctor-who/show/355/episode_listings.html"));
@BeforeClass
public static void setUpBeforeClass() throws Exception {
testResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html"));
singleSeasonTestResult = new HyperLink("Firefly", new URL("http://www.tv.com/firefly/show/7097/episode_listings.html"));
manySeasonsTestResult = new HyperLink("Doctor Who", new URL("http://www.tv.com/doctor-who/show/355/episode_listings.html"));
}
@Test @Test
public void search() throws Exception { public void search() throws Exception {
List<SearchResult> results = tvdotcom.search("Buffy"); List<SearchResult> results = tvdotcom.search("Buffy");
@ -26,7 +35,7 @@ public class TVDotComClientTest {
HyperLink result = (HyperLink) results.get(0); HyperLink result = (HyperLink) results.get(0);
assertEquals(testResult.getName(), result.getName()); assertEquals(testResult.getName(), result.getName());
assertEquals(testResult.getURI(), result.getURI()); assertEquals(testResult.getURL().toString(), result.getURL().toString());
} }

View File

@ -13,9 +13,10 @@ import org.junit.Test;
public class TVRageClientTest { public class TVRageClientTest {
private static TVRageClient tvrage = new TVRageClient();
private static TVRageSearchResult testResult = new TVRageSearchResult("Buffy the Vampire Slayer", 2930, "http://www.tvrage.com/Buffy_The_Vampire_Slayer"); private static TVRageSearchResult testResult = new TVRageSearchResult("Buffy the Vampire Slayer", 2930, "http://www.tvrage.com/Buffy_The_Vampire_Slayer");
private TVRageClient tvrage = new TVRageClient();
@Test @Test
public void search() throws Exception { public void search() throws Exception {