* updated tv.com page scraper (it now targets the episode guide page instead of the episode list page)

* add season-links to IMDbClient
* don't resolve "&amp;" in TVRageClient episode titles
This commit is contained in:
Reinhard Pointner 2009-04-25 07:57:14 +00:00
parent 82bd7fa3b1
commit 09a1e0b731
5 changed files with 42 additions and 55 deletions

View File

@ -11,6 +11,7 @@ import static net.sourceforge.tuned.XPathUtilities.selectString;
import java.io.IOException; import java.io.IOException;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
import java.net.URLEncoder; import java.net.URLEncoder;
@ -130,13 +131,16 @@ public class IMDbClient implements EpisodeListProvider {
@Override @Override
public URI getEpisodeListLink(SearchResult searchResult) { public URI getEpisodeListLink(SearchResult searchResult) {
return URI.create("http://" + host + String.format("/title/tt%07d/episodes", ((MovieDescriptor) searchResult).getImdbId())); return getEpisodeListLink(searchResult, 0);
} }
@Override @Override
public URI getEpisodeListLink(SearchResult searchResult, int season) { public URI getEpisodeListLink(SearchResult searchResult, int season) {
return null; try {
return new URI("http", host, String.format("/title/tt%07d/episodes", ((MovieDescriptor) searchResult).getImdbId()), season > 0 ? String.format("season-%d", season) : null);
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
} }
} }

View File

@ -20,6 +20,8 @@ import java.util.concurrent.Executors;
import java.util.concurrent.Future; import java.util.concurrent.Future;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.Icon; import javax.swing.Icon;
@ -70,7 +72,7 @@ public class TVDotComClient implements EpisodeListProvider {
String href = getAttribute("href", node); String href = getAttribute("href", node);
try { try {
URL episodeListingUrl = new URL(href.replaceFirst("summary.html\\?.*", "episode_listings.html")); URL episodeListingUrl = new URL(href.replaceAll("summary\\.html\\?.*", "episode.html"));
searchResults.add(new HyperLink(title, episodeListingUrl)); searchResults.add(new HyperLink(title, episodeListingUrl));
} catch (Exception e) { } catch (Exception e) {
@ -118,7 +120,7 @@ public class TVDotComClient implements EpisodeListProvider {
List<Episode> episodes = new ArrayList<Episode>(25 * seasonCount); List<Episode> episodes = new ArrayList<Episode>(25 * seasonCount);
// get episode list from season 1 document // get episode list from season 1 document
episodes.addAll(getEpisodeList(searchResult, 1, dom)); episodes.addAll(getEpisodeList(searchResult, dom));
// get episodes from executor threads // get episodes from executor threads
for (Future<List<Episode>> future : futures) { for (Future<List<Episode>> future : futures) {
@ -134,37 +136,31 @@ public class TVDotComClient implements EpisodeListProvider {
Document dom = getHtmlDocument(getEpisodeListLink(searchResult, season).toURL()); Document dom = getHtmlDocument(getEpisodeListLink(searchResult, season).toURL());
return getEpisodeList(searchResult, season, dom); return getEpisodeList(searchResult, dom);
} }
private List<Episode> getEpisodeList(SearchResult searchResult, int season, Document dom) { private List<Episode> getEpisodeList(SearchResult searchResult, Document dom) {
List<Node> nodes = selectNodes("id('episode_listing')//*[@class='episode']", dom); List<Node> nodes = selectNodes("id('episode_guide_list')//*[@class='info']", dom);
Integer episodeOffset = null; Pattern seasonEpisodePattern = Pattern.compile("Season (\\d+), Episode (\\d+)");
List<Episode> episodes = new ArrayList<Episode>(nodes.size()); List<Episode> episodes = new ArrayList<Episode>(nodes.size());
for (Node node : nodes) { for (Node node : nodes) {
String episodeNumber = selectString("./*[@class='number']", node); String meta = selectString("./*[@class='meta']", node);
String title = selectString("./*[@class='title']", node);
String seasonNumber = String.valueOf(season);
try { // normalize space and then match season and episode numbers
// convert the absolute episode number to the season episode number Matcher matcher = seasonEpisodePattern.matcher(meta.replaceAll("\\p{Space}+", " "));
int n = Integer.parseInt(episodeNumber);
if (matcher.find()) {
String title = selectString("./H3/A/text()", node);
String season = matcher.group(1);
String episode = matcher.group(2);
if (episodeOffset == null) episodes.add(new Episode(searchResult.getName(), season, episode, title));
episodeOffset = (n <= 1) ? 0 : n - 1;
episodeNumber = String.valueOf(n - episodeOffset);
} catch (NumberFormatException e) {
// episode may be "Pilot", "Special", "TV Movie" ...
seasonNumber = null;
} }
episodes.add(new Episode(searchResult.getName(), seasonNumber, episodeNumber, title));
} }
return episodes; return episodes;

View File

@ -99,7 +99,7 @@ public class TVRageClient implements EpisodeListProvider {
List<Episode> episodes = new ArrayList<Episode>(nodes.size()); List<Episode> episodes = new ArrayList<Episode>(nodes.size());
for (Node node : nodes) { for (Node node : nodes) {
String title = getTextContent("title", node).replace("&amp;", "&"); String title = getTextContent("title", node);
String episodeNumber = getTextContent("seasonnum", node); String episodeNumber = getTextContent("seasonnum", node);
String seasonNumber = node.getParentNode().getAttributes().getNamedItem("no").getTextContent(); String seasonNumber = node.getParentNode().getAttributes().getNamedItem("no").getTextContent();

View File

@ -14,32 +14,17 @@ import org.junit.Test;
public class TVDotComClientTest { public class TVDotComClientTest {
/** private static TVDotComClient tvdotcom = new TVDotComClient();
* 145 episodes / 7 seasons
*/
private static HyperLink buffySearchResult; private static HyperLink buffySearchResult;
/**
* 13 episodes / 1 season only
*/
private static HyperLink fireflySearchResult;
/**
* more than 700 episodes / 26 seasons (on going)
*/
private static HyperLink doctorwhoTestResult;
@BeforeClass @BeforeClass
public static void setUpBeforeClass() throws Exception { public static void setUpBeforeClass() throws Exception {
buffySearchResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html")); buffySearchResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html"));
fireflySearchResult = new HyperLink("Firefly", new URL("http://www.tv.com/firefly/show/7097/episode_listings.html"));
doctorwhoTestResult = new HyperLink("Doctor Who", new URL("http://www.tv.com/doctor-who/show/355/episode_listings.html"));
} }
private TVDotComClient tvdotcom = new TVDotComClient();
@Test @Test
public void search() throws Exception { public void search() throws Exception {
List<SearchResult> results = tvdotcom.search("Buffy"); List<SearchResult> results = tvdotcom.search("Buffy");
@ -71,22 +56,24 @@ public class TVDotComClientTest {
@Test @Test
public void getEpisodeListAllMultiSeason() throws Exception { public void getEpisodeListAllMultiSeason() throws Exception {
// 144 episodes / 7 seasons
List<Episode> list = tvdotcom.getEpisodeList(buffySearchResult); List<Episode> list = tvdotcom.getEpisodeList(buffySearchResult);
assertEquals(145, list.size()); assertEquals(144, list.size());
Episode first = list.get(0); Episode first = list.get(0);
assertEquals("Buffy the Vampire Slayer", first.getSeriesName()); assertEquals("Buffy the Vampire Slayer", first.getSeriesName());
assertEquals("Unaired Pilot", first.getTitle()); assertEquals("Welcome to the Hellmouth (1)", first.getTitle());
assertEquals("0", first.getEpisodeNumber()); assertEquals("1", first.getEpisodeNumber());
assertEquals("1", first.getSeasonNumber()); assertEquals("1", first.getSeasonNumber());
} }
@Test @Test
public void getEpisodeListAllSingleSeason() throws Exception { public void getEpisodeListAllSingleSeason() throws Exception {
List<Episode> list = tvdotcom.getEpisodeList(fireflySearchResult); // 13 episodes / 1 season only
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Firefly").get(0));
assertEquals(15, list.size()); assertEquals(15, list.size());
@ -101,7 +88,8 @@ public class TVDotComClientTest {
@Test @Test
public void getEpisodeListAllManySeasons() throws Exception { public void getEpisodeListAllManySeasons() throws Exception {
List<Episode> list = tvdotcom.getEpisodeList(doctorwhoTestResult); // more than 700 episodes / 26 seasons (on going)
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Doctor Who").get(0));
// there are still new episodes coming out // there are still new episodes coming out
assertTrue(list.size() > 700); assertTrue(list.size() > 700);
@ -110,9 +98,7 @@ public class TVDotComClientTest {
@Test @Test
public void getEpisodeListEncoding() throws Exception { public void getEpisodeListEncoding() throws Exception {
HyperLink lostTestResult = new HyperLink("Lost", new URL("http://www.tv.com/lost/show/24313/episode_listings.html")); List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Lost").get(0), 3);
List<Episode> list = tvdotcom.getEpisodeList(lostTestResult, 3);
Episode episode = list.get(13); Episode episode = list.get(13);
@ -125,13 +111,13 @@ public class TVDotComClientTest {
@Test @Test
public void getEpisodeListLink() { public void getEpisodeListLink() {
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 1).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html?season=1"); assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 1).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=1");
} }
@Test @Test
public void getEpisodeListLinkAll() { public void getEpisodeListLinkAll() {
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 0).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html?season=0"); assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 0).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=0");
} }
} }

View File

@ -4,6 +4,7 @@ package net.sourceforge.filebot.web;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.List; import java.util.List;
@ -65,7 +66,7 @@ public class TheTVDBClientTest {
public void getEpisodeListAll() throws Exception { public void getEpisodeListAll() throws Exception {
List<Episode> list = thetvdb.getEpisodeList(new TheTVDBSearchResult("Buffy the Vampire Slayer", 70327)); List<Episode> list = thetvdb.getEpisodeList(new TheTVDBSearchResult("Buffy the Vampire Slayer", 70327));
assertEquals(147, list.size()); assertTrue(list.size() >= 144);
Episode first = list.get(0); Episode first = list.get(0);