From 9eb74e8038802ac14e36ca7f9e22bb2a0630b579 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Sun, 6 Jul 2008 18:31:04 +0000 Subject: [PATCH] * heavily improved Subscene support (up to 35x faster) --- .../sourceforge/filebot/web/AnidbClient.java | 11 +- .../net/sourceforge/filebot/web/HtmlUtil.java | 12 ++ .../sourceforge/filebot/web/HyperLink.java | 19 +- .../filebot/web/SubsceneSubtitleClient.java | 192 ++++++++++++------ .../filebot/web/TVDotComClient.java | 7 +- .../web/SubsceneSubtitleClientTest.java | 72 +++++++ .../filebot/web/TVDotComClientTest.java | 21 +- .../filebot/web/TVRageClientTest.java | 3 +- 8 files changed, 248 insertions(+), 89 deletions(-) create mode 100644 test/net/sourceforge/filebot/web/SubsceneSubtitleClientTest.java diff --git a/source/net/sourceforge/filebot/web/AnidbClient.java b/source/net/sourceforge/filebot/web/AnidbClient.java index 2406dabb..97319da0 100644 --- a/source/net/sourceforge/filebot/web/AnidbClient.java +++ b/source/net/sourceforge/filebot/web/AnidbClient.java @@ -6,7 +6,6 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; -import java.net.URISyntaxException; import java.net.URL; import java.net.URLEncoder; import java.text.NumberFormat; @@ -60,10 +59,8 @@ public class AnidbClient extends EpisodeListClient { String path = "/perl-bin/" + href; try { - URI animeUrl = new URI("http", host, path, null); - - searchResults.add(new HyperLink(title, animeUrl)); - } catch (URISyntaxException e) { + searchResults.add(new HyperLink(title, new URL("http", host, path))); + } catch (MalformedURLException e) { Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href); } } @@ -76,7 +73,7 @@ public class AnidbClient extends EpisodeListClient { String header = XPathUtil.selectString("id('layout-content')//H1[1]", dom); String title = header.replaceFirst("Anime:\\s*", ""); - searchResults.add(new HyperLink(title, URI.create(getSearchUrl(searchterm).toString()))); + searchResults.add(new HyperLink(title, getSearchUrl(searchterm))); } } @@ -123,7 +120,7 @@ public class AnidbClient extends EpisodeListClient { @Override public URI getEpisodeListLink(SearchResult searchResult) { - return ((HyperLink) searchResult).getURI(); + return ((HyperLink) searchResult).toURI(); } diff --git a/source/net/sourceforge/filebot/web/HtmlUtil.java b/source/net/sourceforge/filebot/web/HtmlUtil.java index 42359fc8..00717e9d 100644 --- a/source/net/sourceforge/filebot/web/HtmlUtil.java +++ b/source/net/sourceforge/filebot/web/HtmlUtil.java @@ -10,6 +10,7 @@ import java.net.URI; import java.net.URL; import java.net.URLConnection; import java.nio.charset.Charset; +import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; @@ -56,6 +57,17 @@ public class HtmlUtil { } + public static Document getHtmlDocument(URL url, Map requestHeaders) throws IOException, SAXException { + URLConnection connection = url.openConnection(); + + for (String key : requestHeaders.keySet()) { + connection.addRequestProperty(key, requestHeaders.get(key)); + } + + return getHtmlDocument(connection); + } + + public static Document getHtmlDocument(URLConnection connection) throws IOException, SAXException { Charset charset = getCharset(connection.getContentType()); String encoding = connection.getContentEncoding(); diff --git a/source/net/sourceforge/filebot/web/HyperLink.java b/source/net/sourceforge/filebot/web/HyperLink.java index 88c7cfc2..4327dd6b 100644 --- a/source/net/sourceforge/filebot/web/HyperLink.java +++ b/source/net/sourceforge/filebot/web/HyperLink.java @@ -4,26 +4,31 @@ package net.sourceforge.filebot.web; import java.net.URI; import java.net.URISyntaxException; +import java.net.URL; public class HyperLink extends SearchResult { - private final URI uri; + private final URL url; - public HyperLink(String name, URI uri) { + public HyperLink(String name, URL url) { super(name); - this.uri = uri; + this.url = url; } - public HyperLink(String name, String uri) throws URISyntaxException { - this(name, new URI(uri)); + public URL getURL() { + return url; } - public URI getURI() { - return uri; + public URI toURI() { + try { + return url.toURI(); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } } } diff --git a/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java b/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java index 035f68e4..24165fce 100644 --- a/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java +++ b/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java @@ -6,15 +6,16 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; -import java.net.URISyntaxException; import java.net.URL; -import java.net.URLConnection; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Locale; +import java.util.Map; +import java.util.Scanner; +import java.util.concurrent.ConcurrentHashMap; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; @@ -33,6 +34,8 @@ public class SubsceneSubtitleClient extends SubtitleClient { private final SearchResultCache cache = new SearchResultCache(); + private final Map languageFilterMap = new ConcurrentHashMap(50); + private final String host = "subscene.com"; @@ -56,13 +59,14 @@ public class SubsceneSubtitleClient extends SubtitleClient { for (Node node : nodes) { String title = XPathUtil.selectString("text()", node); String href = XPathUtil.selectString("@href", node); + String count = XPathUtil.selectString("./DFN", node).replaceAll("\\D+", ""); try { - //TODO which exception? - URI url = new URI("http", host, href); + URL subtitleListUrl = new URL("http", host, href); + int subtitleCount = Integer.parseInt(count); - searchResults.add(new HyperLink(title, url)); - } catch (URISyntaxException e) { + searchResults.add(new SubsceneSearchResult(title, subtitleListUrl, subtitleCount)); + } catch (MalformedURLException e) { Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href, e); } } @@ -72,85 +76,127 @@ public class SubsceneSubtitleClient extends SubtitleClient { return searchResults; } - HashMap languageIdCache; + + private void updateLanguageFilterMap(Document subtitleListDocument) { + + List nodes = XPathUtil.selectNodes("//DIV[@class='languageList']/DIV", subtitleListDocument); + + for (Node node : nodes) { + String onClick = XPathUtil.selectString("./INPUT/@onclick", node); + + String filter = new Scanner(onClick).findInLine("\\d+"); + + if (filter != null) { + String name = XPathUtil.selectString("./LABEL/text()", node); + + languageFilterMap.put(name.toLowerCase(), Integer.valueOf(filter)); + } + } + } + + private Integer getLanguageFilter(String languageName) { + if (languageName == null) + return null; + + return languageFilterMap.get(languageName.toLowerCase()); + } - public String getLanguageID(Locale language) { - return languageIdCache.get(language.getDisplayLanguage(Locale.ENGLISH).toLowerCase()); + + private String getLanguageName(Locale language) { + if (language == null || language == Locale.ROOT) + return null; + + return language.getDisplayLanguage(Locale.ENGLISH); } @Override public List getSubtitleList(SearchResult searchResult, Locale language) throws Exception { - URL url = getSubtitleListLink(searchResult).toURL(); + URL subtitleListUrl = getSubtitleListLink(searchResult).toURL(); + String languageName = getLanguageName(language); + Integer languageFilter = getLanguageFilter(languageName); - Document dom = null; + boolean reloadFilteredDocument = (languageFilter == null && useFilteredDocument(searchResult)); + boolean forceReload = false; - if (languageIdCache != null) { - URLConnection connection = url.openConnection(); + if (reloadFilteredDocument && languageFilterMap.isEmpty()) { + // we don't know the filter values yet, so we request a document with an invalid filter, + // that will return a subtitle document very fast + languageFilter = -1; + forceReload = true; + } + + Document subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter); + + if (languageFilterMap.isEmpty()) { + updateLanguageFilterMap(subtitleListDocument); + } + + // check if document is already filtered and if requesting a filtered document + // will result in a performance gain (Note: XPath can be very slow) + if (reloadFilteredDocument) { + languageFilter = getLanguageFilter(languageName); - if (language != null && language != Locale.ROOT) { - System.out.println(getLanguageID(language)); - connection.addRequestProperty("Cookie", "subscene_sLanguageIds=" + getLanguageID(language)); - } - - dom = HtmlUtil.getHtmlDocument(connection); - } else { - URLConnection connection = url.openConnection(); - - dom = HtmlUtil.getHtmlDocument(connection); - - List nodes = XPathUtil.selectNodes("//DIV[@class='languageList']/DIV", dom); - - Pattern onClickPattern = Pattern.compile("selectLanguage\\((\\d+)\\);"); - - languageIdCache = new HashMap(); - - for (Node node : nodes) { - Matcher matcher = onClickPattern.matcher(XPathUtil.selectString("./INPUT/@onclick", node)); - - if (matcher.matches()) { - String name = XPathUtil.selectString("./LABEL/text()", node); - String id = matcher.group(1); - - //TODO sysout - System.out.println(name + " = " + id); - - languageIdCache.put(name.toLowerCase(), id); - } + // if language filter has become available, request a filtered document, or if first request was a dummy request + if (languageFilter != null || forceReload) { + subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter); } } - List nodes = XPathUtil.selectNodes("//TABLE[@class='filmSubtitleList']//A[@id]//ancestor::TR", dom); + return getSubtitleList(subtitleListUrl, languageName, subtitleListDocument); + } + + + private boolean useFilteredDocument(SearchResult searchResult) { + SubsceneSearchResult sr = (SubsceneSearchResult) searchResult; + return sr.getSubtitleCount() > 100; + } + + + private Document getSubtitleListDocument(URL subtitleListUrl, Integer languageFilter) throws IOException, SAXException { + Map requestHeaders = new HashMap(1); - Pattern hrefPattern = Pattern.compile("javascript:Subtitle\\((\\d+), '(\\w+)', '\\d+', '(\\d+)'\\);"); + if (languageFilter != null) { + requestHeaders.put("Cookie", "subscene_sLanguageIds=" + languageFilter); + } - ArrayList subtitles = new ArrayList(nodes.size()); + return HtmlUtil.getHtmlDocument(subtitleListUrl, requestHeaders); + } + + + private List getSubtitleList(URL subtitleListUrl, String languageName, Document subtitleListDocument) { + + List nodes = XPathUtil.selectNodes("//TABLE[@class='filmSubtitleList']//A[@id]//ancestor::TR", subtitleListDocument); + + Pattern hrefPattern = Pattern.compile("javascript:Subtitle\\((\\d+), '(\\w+)', .*"); + + List subtitles = new ArrayList(nodes.size()); for (Node node : nodes) { try { Node linkNode = XPathUtil.selectFirstNode("./TD[1]/A", node); - String lang = XPathUtil.selectString("./SPAN[1]", linkNode); - String href = XPathUtil.selectString("@href", linkNode); - - String name = XPathUtil.selectString("./SPAN[2]", linkNode); - - String author = XPathUtil.selectString("./TD[4]", node); - - Matcher matcher = hrefPattern.matcher(href); - - if (!matcher.matches()) - throw new IllegalArgumentException("Cannot extract download parameters: " + href); - - String subtitleId = matcher.group(1); - String typeId = matcher.group(2); - - URL downloadUrl = getDownloadUrl(url, subtitleId, typeId); - - subtitles.add(new SubsceneSubtitleDescriptor(name, lang, author, typeId, downloadUrl, url)); + if (languageName == null || languageName.equalsIgnoreCase(lang)) { + + String href = XPathUtil.selectString("@href", linkNode); + String name = XPathUtil.selectString("./SPAN[2]", linkNode); + String author = XPathUtil.selectString("./TD[4]", node); + + Matcher matcher = hrefPattern.matcher(href); + + if (!matcher.matches()) + throw new IllegalArgumentException("Cannot extract download parameters: " + href); + + String subtitleId = matcher.group(1); + String typeId = matcher.group(2); + + URL downloadUrl = getDownloadUrl(subtitleListUrl, subtitleId, typeId); + + subtitles.add(new SubsceneSubtitleDescriptor(name, lang, author, typeId, downloadUrl, subtitleListUrl)); + } } catch (Exception e) { Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Cannot parse subtitle node", e); } @@ -170,7 +216,7 @@ public class SubsceneSubtitleClient extends SubtitleClient { @Override public URI getSubtitleListLink(SearchResult searchResult) { - return ((HyperLink) searchResult).getURI(); + return ((HyperLink) searchResult).toURI(); } @@ -180,4 +226,22 @@ public class SubsceneSubtitleClient extends SubtitleClient { return new URL("http", host, file); } + + protected static class SubsceneSearchResult extends HyperLink { + + private final int subtitleCount; + + + public SubsceneSearchResult(String name, URL url, int subtitleCount) { + super(name, url); + this.subtitleCount = subtitleCount; + } + + + public int getSubtitleCount() { + return subtitleCount; + } + + } + } diff --git a/source/net/sourceforge/filebot/web/TVDotComClient.java b/source/net/sourceforge/filebot/web/TVDotComClient.java index 68bc95d1..90d3e0e7 100644 --- a/source/net/sourceforge/filebot/web/TVDotComClient.java +++ b/source/net/sourceforge/filebot/web/TVDotComClient.java @@ -6,7 +6,6 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; -import java.net.URISyntaxException; import java.net.URL; import java.net.URLEncoder; import java.text.NumberFormat; @@ -65,10 +64,10 @@ public class TVDotComClient extends EpisodeListClient { String href = XPathUtil.selectString("@href", node); try { - String episodeListingUrl = href.replaceFirst(Pattern.quote("summary.html?") + ".*", "episode_listings.html"); + URL episodeListingUrl = new URL(href.replaceFirst(Pattern.quote("summary.html?") + ".*", "episode_listings.html")); searchResults.add(new HyperLink(title, episodeListingUrl)); - } catch (URISyntaxException e) { + } catch (MalformedURLException e) { Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href, e); } } @@ -169,7 +168,7 @@ public class TVDotComClient extends EpisodeListClient { @Override public URI getEpisodeListLink(SearchResult searchResult, int season) { - String episodeListingUrl = ((HyperLink) searchResult).getURI().toString(); + URL episodeListingUrl = ((HyperLink) searchResult).getURL(); return URI.create(episodeListingUrl + "?season=" + season); } diff --git a/test/net/sourceforge/filebot/web/SubsceneSubtitleClientTest.java b/test/net/sourceforge/filebot/web/SubsceneSubtitleClientTest.java new file mode 100644 index 00000000..d3f3467f --- /dev/null +++ b/test/net/sourceforge/filebot/web/SubsceneSubtitleClientTest.java @@ -0,0 +1,72 @@ + +package net.sourceforge.filebot.web; + + +import static org.junit.Assert.assertEquals; + +import java.net.URL; +import java.util.List; +import java.util.Locale; + +import net.sourceforge.filebot.ui.panel.subtitle.LanguageResolver; +import net.sourceforge.filebot.web.SubsceneSubtitleClient.SubsceneSearchResult; + +import org.junit.BeforeClass; +import org.junit.Test; + + +public class SubsceneSubtitleClientTest { + + private static SubsceneSearchResult testResult; + private static SubsceneSearchResult manySubtitlesTestResult; + + private SubsceneSubtitleClient client = new SubsceneSubtitleClient(); + + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + testResult = new SubsceneSearchResult("Twin Peaks - First Season (1990)", new URL("http://subscene.com/twin-peaks--first-season/subtitles-32482.aspx"), 17); + manySubtitlesTestResult = new SubsceneSearchResult("Lost - Fourth Season (2008)", new URL("http://subscene.com/Lost-Fourth-Season/subtitles-70963.aspx"), 420); + } + + + @Test + public void search() throws Exception { + List results = client.search("twin peaks"); + + SubsceneSearchResult result = (SubsceneSearchResult) results.get(1); + + assertEquals(testResult.getName(), result.getName()); + assertEquals(testResult.getURL().toString(), result.getURL().toString()); + assertEquals(testResult.getSubtitleCount(), result.getSubtitleCount()); + } + + + @Test + public void getSubtitleListSearchResult() throws Exception { + List subtitleList = client.getSubtitleList(testResult, Locale.ITALIAN); + + assertEquals(1, subtitleList.size()); + + SubtitleDescriptor subtitle = subtitleList.get(0); + + assertEquals("Twin Peaks - First Season", subtitle.getName()); + assertEquals("Italian", subtitle.getLanguageName()); + assertEquals("zip", subtitle.getArchiveType()); + } + + + @Test + public void getSubtitleListSearchResultMany() throws Exception { + List subtitleList = client.getSubtitleList(manySubtitlesTestResult, LanguageResolver.getDefault().getLocale("Vietnamese")); + + assertEquals(1, subtitleList.size()); + } + + + @Test + public void getSubtitleListLink() throws Exception { + assertEquals(testResult.getURL().toString(), client.getSubtitleListLink(testResult).toURL().toString()); + } + +} diff --git a/test/net/sourceforge/filebot/web/TVDotComClientTest.java b/test/net/sourceforge/filebot/web/TVDotComClientTest.java index 187586eb..ceffcaa7 100644 --- a/test/net/sourceforge/filebot/web/TVDotComClientTest.java +++ b/test/net/sourceforge/filebot/web/TVDotComClientTest.java @@ -4,21 +4,30 @@ package net.sourceforge.filebot.web; import static org.junit.Assert.assertEquals; -import java.net.URI; +import java.net.URL; import java.util.List; +import org.junit.BeforeClass; import org.junit.Test; public class TVDotComClientTest { - private static TVDotComClient tvdotcom = new TVDotComClient(); + private static HyperLink testResult; + private static HyperLink singleSeasonTestResult; + private static HyperLink manySeasonsTestResult; - private static HyperLink testResult = new HyperLink("Buffy the Vampire Slayer", URI.create("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html")); - private static HyperLink singleSeasonTestResult = new HyperLink("Firefly", URI.create("http://www.tv.com/firefly/show/7097/episode_listings.html")); - private static HyperLink manySeasonsTestResult = new HyperLink("Doctor Who", URI.create("http://www.tv.com/doctor-who/show/355/episode_listings.html")); + private TVDotComClient tvdotcom = new TVDotComClient(); + @BeforeClass + public static void setUpBeforeClass() throws Exception { + testResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html")); + singleSeasonTestResult = new HyperLink("Firefly", new URL("http://www.tv.com/firefly/show/7097/episode_listings.html")); + manySeasonsTestResult = new HyperLink("Doctor Who", new URL("http://www.tv.com/doctor-who/show/355/episode_listings.html")); + } + + @Test public void search() throws Exception { List results = tvdotcom.search("Buffy"); @@ -26,7 +35,7 @@ public class TVDotComClientTest { HyperLink result = (HyperLink) results.get(0); assertEquals(testResult.getName(), result.getName()); - assertEquals(testResult.getURI(), result.getURI()); + assertEquals(testResult.getURL().toString(), result.getURL().toString()); } diff --git a/test/net/sourceforge/filebot/web/TVRageClientTest.java b/test/net/sourceforge/filebot/web/TVRageClientTest.java index d9da8588..c2ae8650 100644 --- a/test/net/sourceforge/filebot/web/TVRageClientTest.java +++ b/test/net/sourceforge/filebot/web/TVRageClientTest.java @@ -13,9 +13,10 @@ import org.junit.Test; public class TVRageClientTest { - private static TVRageClient tvrage = new TVRageClient(); private static TVRageSearchResult testResult = new TVRageSearchResult("Buffy the Vampire Slayer", 2930, "http://www.tvrage.com/Buffy_The_Vampire_Slayer"); + private TVRageClient tvrage = new TVRageClient(); + @Test public void search() throws Exception {