* updated tv.com page scraper (will now target episode guide page, instead of episode list page)
* add season-links to IMDbClient
* don't resolve "&amp;" in TVRageClient episode title
This commit is contained in:
parent
82bd7fa3b1
commit
09a1e0b731
|
@ -11,6 +11,7 @@ import static net.sourceforge.tuned.XPathUtilities.selectString;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.net.URLEncoder;
|
||||
|
@ -130,13 +131,16 @@ public class IMDbClient implements EpisodeListProvider {
|
|||
|
||||
@Override
|
||||
public URI getEpisodeListLink(SearchResult searchResult) {
|
||||
return URI.create("http://" + host + String.format("/title/tt%07d/episodes", ((MovieDescriptor) searchResult).getImdbId()));
|
||||
return getEpisodeListLink(searchResult, 0);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public URI getEpisodeListLink(SearchResult searchResult, int season) {
|
||||
return null;
|
||||
try {
|
||||
return new URI("http", host, String.format("/title/tt%07d/episodes", ((MovieDescriptor) searchResult).getImdbId()), season > 0 ? String.format("season-%d", season) : null);
|
||||
} catch (URISyntaxException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,6 +20,8 @@ import java.util.concurrent.Executors;
|
|||
import java.util.concurrent.Future;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import javax.swing.Icon;
|
||||
|
||||
|
@ -70,7 +72,7 @@ public class TVDotComClient implements EpisodeListProvider {
|
|||
String href = getAttribute("href", node);
|
||||
|
||||
try {
|
||||
URL episodeListingUrl = new URL(href.replaceFirst("summary.html\\?.*", "episode_listings.html"));
|
||||
URL episodeListingUrl = new URL(href.replaceAll("summary\\.html\\?.*", "episode.html"));
|
||||
|
||||
searchResults.add(new HyperLink(title, episodeListingUrl));
|
||||
} catch (Exception e) {
|
||||
|
@ -118,7 +120,7 @@ public class TVDotComClient implements EpisodeListProvider {
|
|||
List<Episode> episodes = new ArrayList<Episode>(25 * seasonCount);
|
||||
|
||||
// get episode list from season 1 document
|
||||
episodes.addAll(getEpisodeList(searchResult, 1, dom));
|
||||
episodes.addAll(getEpisodeList(searchResult, dom));
|
||||
|
||||
// get episodes from executor threads
|
||||
for (Future<List<Episode>> future : futures) {
|
||||
|
@ -134,37 +136,31 @@ public class TVDotComClient implements EpisodeListProvider {
|
|||
|
||||
Document dom = getHtmlDocument(getEpisodeListLink(searchResult, season).toURL());
|
||||
|
||||
return getEpisodeList(searchResult, season, dom);
|
||||
return getEpisodeList(searchResult, dom);
|
||||
}
|
||||
|
||||
|
||||
private List<Episode> getEpisodeList(SearchResult searchResult, int season, Document dom) {
|
||||
private List<Episode> getEpisodeList(SearchResult searchResult, Document dom) {
|
||||
|
||||
List<Node> nodes = selectNodes("id('episode_listing')//*[@class='episode']", dom);
|
||||
List<Node> nodes = selectNodes("id('episode_guide_list')//*[@class='info']", dom);
|
||||
|
||||
Integer episodeOffset = null;
|
||||
Pattern seasonEpisodePattern = Pattern.compile("Season (\\d+), Episode (\\d+)");
|
||||
|
||||
List<Episode> episodes = new ArrayList<Episode>(nodes.size());
|
||||
|
||||
for (Node node : nodes) {
|
||||
String episodeNumber = selectString("./*[@class='number']", node);
|
||||
String title = selectString("./*[@class='title']", node);
|
||||
String seasonNumber = String.valueOf(season);
|
||||
String meta = selectString("./*[@class='meta']", node);
|
||||
|
||||
try {
|
||||
// convert the absolute episode number to the season episode number
|
||||
int n = Integer.parseInt(episodeNumber);
|
||||
// normalize space and then match season and episode numbers
|
||||
Matcher matcher = seasonEpisodePattern.matcher(meta.replaceAll("\\p{Space}+", " "));
|
||||
|
||||
if (matcher.find()) {
|
||||
String title = selectString("./H3/A/text()", node);
|
||||
String season = matcher.group(1);
|
||||
String episode = matcher.group(2);
|
||||
|
||||
if (episodeOffset == null)
|
||||
episodeOffset = (n <= 1) ? 0 : n - 1;
|
||||
|
||||
episodeNumber = String.valueOf(n - episodeOffset);
|
||||
} catch (NumberFormatException e) {
|
||||
// episode may be "Pilot", "Special", "TV Movie" ...
|
||||
seasonNumber = null;
|
||||
episodes.add(new Episode(searchResult.getName(), season, episode, title));
|
||||
}
|
||||
|
||||
episodes.add(new Episode(searchResult.getName(), seasonNumber, episodeNumber, title));
|
||||
}
|
||||
|
||||
return episodes;
|
||||
|
|
|
@ -99,7 +99,7 @@ public class TVRageClient implements EpisodeListProvider {
|
|||
List<Episode> episodes = new ArrayList<Episode>(nodes.size());
|
||||
|
||||
for (Node node : nodes) {
|
||||
String title = getTextContent("title", node).replace("&amp;", "&");
|
||||
String title = getTextContent("title", node);
|
||||
String episodeNumber = getTextContent("seasonnum", node);
|
||||
String seasonNumber = node.getParentNode().getAttributes().getNamedItem("no").getTextContent();
|
||||
|
||||
|
|
|
@ -14,32 +14,17 @@ import org.junit.Test;
|
|||
|
||||
public class TVDotComClientTest {
|
||||
|
||||
/**
|
||||
* 145 episodes / 7 seasons
|
||||
*/
|
||||
private static TVDotComClient tvdotcom = new TVDotComClient();
|
||||
|
||||
private static HyperLink buffySearchResult;
|
||||
|
||||
/**
|
||||
* 13 episodes / 1 season only
|
||||
*/
|
||||
private static HyperLink fireflySearchResult;
|
||||
|
||||
/**
|
||||
* more than 700 episodes / 26 seasons (on going)
|
||||
*/
|
||||
private static HyperLink doctorwhoTestResult;
|
||||
|
||||
|
||||
@BeforeClass
|
||||
public static void setUpBeforeClass() throws Exception {
|
||||
buffySearchResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html"));
|
||||
fireflySearchResult = new HyperLink("Firefly", new URL("http://www.tv.com/firefly/show/7097/episode_listings.html"));
|
||||
doctorwhoTestResult = new HyperLink("Doctor Who", new URL("http://www.tv.com/doctor-who/show/355/episode_listings.html"));
|
||||
buffySearchResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html"));
|
||||
}
|
||||
|
||||
private TVDotComClient tvdotcom = new TVDotComClient();
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void search() throws Exception {
|
||||
List<SearchResult> results = tvdotcom.search("Buffy");
|
||||
|
@ -71,22 +56,24 @@ public class TVDotComClientTest {
|
|||
|
||||
@Test
|
||||
public void getEpisodeListAllMultiSeason() throws Exception {
|
||||
// 144 episodes / 7 seasons
|
||||
List<Episode> list = tvdotcom.getEpisodeList(buffySearchResult);
|
||||
|
||||
assertEquals(145, list.size());
|
||||
assertEquals(144, list.size());
|
||||
|
||||
Episode first = list.get(0);
|
||||
|
||||
assertEquals("Buffy the Vampire Slayer", first.getSeriesName());
|
||||
assertEquals("Unaired Pilot", first.getTitle());
|
||||
assertEquals("0", first.getEpisodeNumber());
|
||||
assertEquals("Welcome to the Hellmouth (1)", first.getTitle());
|
||||
assertEquals("1", first.getEpisodeNumber());
|
||||
assertEquals("1", first.getSeasonNumber());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void getEpisodeListAllSingleSeason() throws Exception {
|
||||
List<Episode> list = tvdotcom.getEpisodeList(fireflySearchResult);
|
||||
// 13 episodes / 1 season only
|
||||
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Firefly").get(0));
|
||||
|
||||
assertEquals(15, list.size());
|
||||
|
||||
|
@ -101,7 +88,8 @@ public class TVDotComClientTest {
|
|||
|
||||
@Test
|
||||
public void getEpisodeListAllManySeasons() throws Exception {
|
||||
List<Episode> list = tvdotcom.getEpisodeList(doctorwhoTestResult);
|
||||
// more than 700 episodes / 26 seasons (on going)
|
||||
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Doctor Who").get(0));
|
||||
|
||||
// there are still new episodes coming out
|
||||
assertTrue(list.size() > 700);
|
||||
|
@ -110,9 +98,7 @@ public class TVDotComClientTest {
|
|||
|
||||
@Test
|
||||
public void getEpisodeListEncoding() throws Exception {
|
||||
HyperLink lostTestResult = new HyperLink("Lost", new URL("http://www.tv.com/lost/show/24313/episode_listings.html"));
|
||||
|
||||
List<Episode> list = tvdotcom.getEpisodeList(lostTestResult, 3);
|
||||
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Lost").get(0), 3);
|
||||
|
||||
Episode episode = list.get(13);
|
||||
|
||||
|
@ -125,13 +111,13 @@ public class TVDotComClientTest {
|
|||
|
||||
@Test
|
||||
public void getEpisodeListLink() {
|
||||
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 1).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html?season=1");
|
||||
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 1).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=1");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void getEpisodeListLinkAll() {
|
||||
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 0).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html?season=0");
|
||||
assertEquals(tvdotcom.getEpisodeListLink(buffySearchResult, 0).toString(), "http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=0");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ package net.sourceforge.filebot.web;
|
|||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
@ -65,7 +66,7 @@ public class TheTVDBClientTest {
|
|||
public void getEpisodeListAll() throws Exception {
|
||||
List<Episode> list = thetvdb.getEpisodeList(new TheTVDBSearchResult("Buffy the Vampire Slayer", 70327));
|
||||
|
||||
assertEquals(147, list.size());
|
||||
assertTrue(list.size() >= 144);
|
||||
|
||||
Episode first = list.get(0);
|
||||
|
||||
|
|
Loading…
Reference in New Issue