* updated tv.com page scraper (will now target episode guide page, instead of episode list page)

* add season-links to IMDbClient
* don't resolve the &amp; entity in TVRageClient episode titles
This commit is contained in:
Reinhard Pointner 2009-04-25 07:57:14 +00:00
parent 82bd7fa3b1
commit 09a1e0b731
5 changed files with 42 additions and 55 deletions

View File

@ -11,6 +11,7 @@ import static net.sourceforge.tuned.XPathUtilities.selectString;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
@ -130,13 +131,16 @@ public class IMDbClient implements EpisodeListProvider {
@Override
public URI getEpisodeListLink(SearchResult searchResult) {
return URI.create("http://" + host + String.format("/title/tt%07d/episodes", ((MovieDescriptor) searchResult).getImdbId()));
return getEpisodeListLink(searchResult, 0);
}
@Override
public URI getEpisodeListLink(SearchResult searchResult, int season) {
return null;
try {
return new URI("http", host, String.format("/title/tt%07d/episodes", ((MovieDescriptor) searchResult).getImdbId()), season > 0 ? String.format("season-%d", season) : null);
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -20,6 +20,8 @@ import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.Icon;
@ -70,7 +72,7 @@ public class TVDotComClient implements EpisodeListProvider {
String href = getAttribute("href", node);
try {
URL episodeListingUrl = new URL(href.replaceFirst("summary.html\\?.*", "episode_listings.html"));
URL episodeListingUrl = new URL(href.replaceAll("summary\\.html\\?.*", "episode.html"));
searchResults.add(new HyperLink(title, episodeListingUrl));
} catch (Exception e) {
@ -118,7 +120,7 @@ public class TVDotComClient implements EpisodeListProvider {
List<Episode> episodes = new ArrayList<Episode>(25 * seasonCount);
// get episode list from season 1 document
episodes.addAll(getEpisodeList(searchResult, 1, dom));
episodes.addAll(getEpisodeList(searchResult, dom));
// get episodes from executor threads
for (Future<List<Episode>> future : futures) {
@ -134,37 +136,31 @@ public class TVDotComClient implements EpisodeListProvider {
Document dom = getHtmlDocument(getEpisodeListLink(searchResult, season).toURL());
return getEpisodeList(searchResult, season, dom);
return getEpisodeList(searchResult, dom);
}
private List<Episode> getEpisodeList(SearchResult searchResult, int season, Document dom) {
private List<Episode> getEpisodeList(SearchResult searchResult, Document dom) {
List<Node> nodes = selectNodes("id('episode_listing')//*[@class='episode']", dom);
List<Node> nodes = selectNodes("id('episode_guide_list')//*[@class='info']", dom);
Integer episodeOffset = null;
Pattern seasonEpisodePattern = Pattern.compile("Season (\\d+), Episode (\\d+)");
List<Episode> episodes = new ArrayList<Episode>(nodes.size());
for (Node node : nodes) {
String episodeNumber = selectString("./*[@class='number']", node);
String title = selectString("./*[@class='title']", node);
String seasonNumber = String.valueOf(season);
String meta = selectString("./*[@class='meta']", node);
try {
// convert the absolute episode number to the season episode number
int n = Integer.parseInt(episodeNumber);
// normalize space and then match season and episode numbers
Matcher matcher = seasonEpisodePattern.matcher(meta.replaceAll("\\p{Space}+", " "));
if (matcher.find()) {
String title = selectString("./H3/A/text()", node);
String season = matcher.group(1);
String episode = matcher.group(2);
if (episodeOffset == null)
episodeOffset = (n <= 1) ? 0 : n - 1;
episodeNumber = String.valueOf(n - episodeOffset);
} catch (NumberFormatException e) {
// episode may be "Pilot", "Special", "TV Movie" ...
seasonNumber = null;
episodes.add(new Episode(searchResult.getName(), season, episode, title));
}
episodes.add(new Episode(searchResult.getName(), seasonNumber, episodeNumber, title));
}
return episodes;

View File

@ -99,7 +99,7 @@ public class TVRageClient implements EpisodeListProvider {
List<Episode> episodes = new ArrayList<Episode>(nodes.size());
for (Node node : nodes) {
String title = getTextContent("title", node).replace("&amp;", "&");
String title = getTextContent("title", node);
String episodeNumber = getTextContent("seasonnum", node);
String seasonNumber = node.getParentNode().getAttributes().getNamedItem("no").getTextContent();

View File

@ -14,32 +14,17 @@ import org.junit.Test;
public class TVDotComClientTest {
/**
* 145 episodes / 7 seasons
*/
private static TVDotComClient tvdotcom = new TVDotComClient();
private static HyperLink buffySearchResult;
/**
* 13 episodes / 1 season only
*/
private static HyperLink fireflySearchResult;
/**
* more than 700 episodes / 26 seasons (on going)
*/
private static HyperLink doctorwhoTestResult;
@BeforeClass
public static void setUpBeforeClass() throws Exception {
buffySearchResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html"));
fireflySearchResult = new HyperLink("Firefly", new URL("http://www.tv.com/firefly/show/7097/episode_listings.html"));
doctorwhoTestResult = new HyperLink("Doctor Who", new URL("http://www.tv.com/doctor-who/show/355/episode_listings.html"));
buffySearchResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html"));
}
private TVDotComClient tvdotcom = new TVDotComClient();
@Test
public void search() throws Exception {
List<SearchResult> results = tvdotcom.search("Buffy");
@ -71,22 +56,24 @@ public class TVDotComClientTest {
@Test
public void getEpisodeListAllMultiSeason() throws Exception {
// 144 episodes / 7 seasons
List<Episode> list = tvdotcom.getEpisodeList(buffySearchResult);
assertEquals(145, list.size());
assertEquals(144, list.size());
Episode first = list.get(0);
assertEquals("Buffy the Vampire Slayer", first.getSeriesName());
assertEquals("Unaired Pilot", first.getTitle());
assertEquals("0", first.getEpisodeNumber());
assertEquals("Welcome to the Hellmouth (1)", first.getTitle());
assertEquals("1", first.getEpisodeNumber());
assertEquals("1", first.getSeasonNumber());
}
@Test
public void getEpisodeListAllSingleSeason() throws Exception {
List<Episode> list = tvdotcom.getEpisodeList(fireflySearchResult);
// 13 episodes / 1 season only
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Firefly").get(0));
assertEquals(15, list.size());
@ -101,7 +88,8 @@ public class TVDotComClientTest {
@Test
public void getEpisodeListAllManySeasons() throws Exception {
List<Episode> list = tvdotcom.getEpisodeList(doctorwhoTestResult);
// more than 700 episodes / 26 seasons (on going)
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Doctor Who").get(0));
// there are still new episodes coming out
assertTrue(list.size() > 700);
@ -110,9 +98,7 @@ public class TVDotComClientTest {
@Test
public void getEpisodeListEncoding() throws Exception {
HyperLink lostTestResult = new HyperLink("Lost", new URL("http://www.tv.com/lost/show/24313/episode_listings.html"));
List<Episode> list = tvdotcom.getEpisodeList(lostTestResult, 3);
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Lost").get(0), 3);
Episode episode = list.get(13);
@ -125,13 +111,13 @@ public class TVDotComClientTest {
@Test
public void getEpisodeListLink() {
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 1).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html?season=1");
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 1).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=1");
}
@Test
public void getEpisodeListLinkAll() {
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 0).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html?season=0");
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 0).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=0");
}
}

View File

@ -4,6 +4,7 @@ package net.sourceforge.filebot.web;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.util.EnumSet;
import java.util.List;
@ -65,7 +66,7 @@ public class TheTVDBClientTest {
public void getEpisodeListAll() throws Exception {
List<Episode> list = thetvdb.getEpisodeList(new TheTVDBSearchResult("Buffy the Vampire Slayer", 70327));
assertEquals(147, list.size());
assertTrue(list.size() >= 144);
Episode first = list.get(0);