+ dropped support for TV.com scraper (broken due to site changes)

This commit is contained in:
Reinhard Pointner 2011-08-11 10:52:17 +00:00
parent 6f394dfadf
commit 6074680401
5 changed files with 3 additions and 339 deletions

View File

@ -12,7 +12,6 @@ import net.sourceforge.filebot.web.SerienjunkiesClient;
import net.sourceforge.filebot.web.SublightSubtitleClient;
import net.sourceforge.filebot.web.SubsceneSubtitleClient;
import net.sourceforge.filebot.web.SubtitleProvider;
import net.sourceforge.filebot.web.TVDotComClient;
import net.sourceforge.filebot.web.TVRageClient;
import net.sourceforge.filebot.web.TheTVDBClient;
import net.sourceforge.filebot.web.VideoHashSubtitleService;
@ -26,7 +25,6 @@ public final class WebServices {
// episode dbs
public static final TVRageClient TVRage = new TVRageClient();
public static final AnidbClient AniDB = new AnidbClient("filebot", 1);
public static final TVDotComClient TVDotCom = new TVDotComClient();
public static final IMDbClient IMDb = new IMDbClient();
public static final TheTVDBClient TheTVDB = new TheTVDBClient(getApplicationProperty("thetvdb.apikey"));
public static final SerienjunkiesClient Serienjunkies = new SerienjunkiesClient(getApplicationProperty("serienjunkies.apikey"));
@ -38,7 +36,7 @@ public final class WebServices {
public static EpisodeListProvider[] getEpisodeListProviders() {
return new EpisodeListProvider[] { TVRage, AniDB, TVDotCom, IMDb, TheTVDB, Serienjunkies };
return new EpisodeListProvider[] { TVRage, AniDB, IMDb, TheTVDB, Serienjunkies };
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 722 B

View File

@ -1,200 +0,0 @@
package net.sourceforge.filebot.web;
import static net.sourceforge.filebot.web.WebRequest.*;
import static net.sourceforge.tuned.XPathUtilities.*;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.Icon;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import net.sourceforge.filebot.ResourceManager;
/**
 * Episode list provider that scrapes search results and episode guides from
 * the TV.com web pages.
 *
 * <p>
 * Search results are returned as {@link HyperLink} objects pointing at the
 * episode guide of a show; the episode guide is then fetched per season by
 * appending a <code>?season=...</code> query to that link.
 */
public class TVDotComClient extends AbstractEpisodeListProvider {

	private static final String host = "www.tv.com";

	/** Upper bound for worker threads so we don't open too many concurrent connections. */
	private static final int MAX_THREADS = 12;

	/** Extracts season (group 1) and episode (group 2) numbers from an episode's meta text. */
	private static final Pattern EPISODE_PATTERN = Pattern.compile("Season.(\\d+).+Episode.(\\d+)");

	/** Finds an airdate such as <code>5/20/2003</code> in an episode's meta text. */
	private static final Pattern AIRDATE_PATTERN = Pattern.compile("\\d{1,2}.\\d{1,2}.\\d{4}");


	@Override
	public String getName() {
		return "TV.com";
	}


	@Override
	public Icon getIcon() {
		return ResourceManager.getIcon("search.tvdotcom");
	}


	/**
	 * Searches TV.com for shows matching the given query.
	 *
	 * @param query search text, URL-encoded as UTF-8 before it is sent
	 * @param locale not used by this provider
	 * @return one {@link HyperLink} per result whose link could be turned into
	 *         a valid URL; results with an invalid link are logged and skipped
	 */
	@Override
	public List<SearchResult> search(String query, Locale locale) throws IOException, SAXException {
		// use ajax search request, because we don't need the whole search result page
		URL searchUrl = new URL("http", host, "/search.php?type=Search&stype=ajax_search&search_type=program&qs=" + URLEncoder.encode(query, "UTF-8"));

		Document dom = getHtmlDocument(searchUrl);

		List<SearchResult> searchResults = new ArrayList<SearchResult>();

		for (Node node : selectNodes("//H2/A", dom)) {
			String title = getTextContent(node);
			String href = getAttribute("href", node);

			try {
				// search results link to the summary page, we want the episode guide instead
				URL episodeGuideLocation = new URL(href.replaceAll("summary[.]html[?].*", "episode.html"));
				searchResults.add(new HyperLink(title, episodeGuideLocation));
			} catch (MalformedURLException e) {
				Logger.getLogger(getClass().getName()).log(Level.WARNING, "Invalid href: " + href, e);
			}
		}

		return searchResults;
	}


	/**
	 * Fetches the episodes of all seasons of the given show.
	 *
	 * <p>
	 * The season 1 page is loaded first to find out how many seasons there
	 * are; the remaining seasons are then fetched concurrently on up to
	 * {@value #MAX_THREADS} threads.
	 *
	 * @param searchResult a result previously returned by {@link #search}
	 * @param locale passed through to the per-season lookup, which ignores it
	 * @return episodes of season 1 followed by the episodes of each further
	 *         season in ascending season order
	 * @throws Exception if a page can't be fetched or parsed, or if one of the
	 *         background downloads fails
	 */
	@Override
	public List<Episode> getEpisodeList(final SearchResult searchResult, final Locale locale) throws Exception {
		// get document for season 1
		Document dom = getHtmlDocument(getEpisodeListLink(searchResult, 1).toURL());

		// seasons are ordered in reverse, first element is latest season
		String latestSeasonString = selectString("id('episode_list_header')//*[contains(@class, 'number')]", dom);

		// strip unexpected characters from season string (e.g. "7...") BEFORE checking for
		// emptiness, so that a header without any digits doesn't end up as a NumberFormatException
		String seasonDigits = latestSeasonString.replaceAll("\\D+", "");

		// assume single season series if no season number could be found
		int seasonCount = seasonDigits.isEmpty() ? 1 : Integer.parseInt(seasonDigits);

		// we're going to fetch the episode list for each season on multiple threads
		List<Future<List<Episode>>> futures = new ArrayList<Future<List<Episode>>>(seasonCount);

		ExecutorService executor = null;

		try {
			if (seasonCount > 1) {
				// bounded thread pool so we don't get too many concurrent connections
				executor = Executors.newFixedThreadPool(Math.min(seasonCount - 1, MAX_THREADS));

				// we already have the document for season 1, start with season 2
				for (int i = 2; i <= seasonCount; i++) {
					// season used in anonymous class
					final int season = i;

					futures.add(executor.submit(new Callable<List<Episode>>() {

						@Override
						public List<Episode> call() throws Exception {
							return getEpisodeList(searchResult, season, locale);
						}
					}));
				}

				// no further tasks will be submitted, threads end once all tasks are done
				executor.shutdown();
			}

			List<Episode> episodes = new ArrayList<Episode>(25 * seasonCount);

			// get episode list from season 1 document
			episodes.addAll(getEpisodeList(searchResult, dom));

			// get episodes from executor threads
			for (Future<List<Episode>> future : futures) {
				episodes.addAll(future.get());
			}

			return episodes;
		} finally {
			if (executor != null) {
				// if we bail out early, drop the downloads that are still queued;
				// after normal completion there is nothing left to cancel
				executor.shutdownNow();
			}
		}
	}


	/**
	 * Fetches the episodes of a single season of the given show.
	 *
	 * @param searchResult a result previously returned by {@link #search}
	 * @param season season number used for the <code>season</code> query parameter
	 * @param locale not used by this provider
	 */
	@Override
	public List<Episode> getEpisodeList(SearchResult searchResult, int season, Locale locale) throws IOException, SAXException {
		Document dom = getHtmlDocument(getEpisodeListLink(searchResult, season).toURL());

		return getEpisodeList(searchResult, dom);
	}


	/**
	 * Extracts all episodes from an already loaded episode guide page.
	 *
	 * @param searchResult supplies the series name for the created episodes
	 * @param dom episode guide page
	 * @return the episodes found on the page, in reverse page order; entries
	 *         without season and episode number are left out
	 */
	private List<Episode> getEpisodeList(SearchResult searchResult, Document dom) {
		List<Node> nodes = selectNodes("id('episode_guide_list')//*[@class='info']", dom);

		List<Episode> episodes = new ArrayList<Episode>(nodes.size());

		for (Node node : nodes) {
			String title = selectString("./H3/A/text()", node);

			// collapse all whitespace so the patterns can match across line breaks
			String meta = selectString("./*[@class='meta']", node).replaceAll("\\p{Space}+", " ");

			Integer season = null;
			Integer episode = null;
			Date airdate = null;

			Matcher m;

			// try to match episode information
			if ((m = EPISODE_PATTERN.matcher(meta)).find()) {
				// matches episode
				season = Integer.valueOf(m.group(1));
				episode = Integer.valueOf(m.group(2));
			}

			// try to match airdate information
			if ((m = AIRDATE_PATTERN.matcher(meta)).find()) {
				airdate = Date.parse(m.group(), "MM/dd/yyyy"); // e.g. 5/20/2003
			}

			// add episode if SxE info has been found
			if (season != null && episode != null) {
				episodes.add(new Episode(searchResult.getName(), season, episode, title, null, null, airdate));
			}
		}

		// episodes are listed in reverse order
		Collections.reverse(episodes);

		return episodes;
	}


	/**
	 * Returns the link to the episode guide page that lists all seasons.
	 */
	@Override
	public URI getEpisodeListLink(SearchResult searchResult) {
		return getEpisodeListLink(searchResult, "All");
	}


	/**
	 * Returns the link to the episode guide page of the given season.
	 */
	@Override
	public URI getEpisodeListLink(SearchResult searchResult, int season) {
		return getEpisodeListLink(searchResult, Integer.toString(season));
	}


	/**
	 * Builds an episode guide link by appending <code>?season=</code> and the
	 * given value to the URL of the search result.
	 *
	 * @param searchResult must be a {@link HyperLink}, e.g. one returned by {@link #search}
	 * @param season value of the <code>season</code> query parameter, appended as is
	 */
	public URI getEpisodeListLink(SearchResult searchResult, String season) {
		URL episodeGuide = ((HyperLink) searchResult).getURL();

		return URI.create(episodeGuide + "?season=" + season);
	}

}

View File

@ -1,134 +0,0 @@
package net.sourceforge.filebot.web;
import static org.junit.Assert.*;
import java.net.URL;
import java.util.List;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests for {@link TVDotComClient}.
 *
 * <p>
 * Apart from the link tests, these tests go through the client's search and
 * episode list lookups, so the expected values below describe the data that
 * TV.com was serving when the tests were written.
 */
public class TVDotComClientTest {

	private static final TVDotComClient tvdotcom = new TVDotComClient();

	/** Known-good search result shared by the tests, created once in {@link #setUpBeforeClass()}. */
	private static HyperLink buffySearchResult;


	@BeforeClass
	public static void setUpBeforeClass() throws Exception {
		buffySearchResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html"));
	}


	@Test
	public void search() throws Exception {
		List<SearchResult> results = tvdotcom.search("buffy the vampire slayer");

		// if this fails, there is probably a problem with the xpath query
		assertEquals(4, results.size());

		HyperLink result = (HyperLink) results.get(0);

		assertEquals(buffySearchResult.getName(), result.getName());
		assertEquals(buffySearchResult.getURL().toString(), result.getURL().toString());
	}


	@Test
	public void searchNoMatch() throws Exception {
		List<SearchResult> results = tvdotcom.search("i will not find anything for this query string");

		assertTrue(results.isEmpty());
	}


	@Test
	public void getEpisodeList() throws Exception {
		List<Episode> list = tvdotcom.getEpisodeList(buffySearchResult, 7);

		assertEquals(22, list.size());

		// last episode of the season
		Episode chosen = list.get(21);

		assertEquals("Buffy the Vampire Slayer", chosen.getSeriesName());
		assertEquals("Chosen", chosen.getTitle());
		assertEquals("22", chosen.getEpisode().toString());
		assertEquals("7", chosen.getSeason().toString());
		assertEquals("2003-05-20", chosen.airdate().toString());
	}


	@Test
	public void getEpisodeListAllMultiSeason() throws Exception {
		// 144 episodes / 7 seasons
		List<Episode> list = tvdotcom.getEpisodeList(buffySearchResult);

		assertEquals(144, list.size());

		Episode first = list.get(0);

		assertEquals("Buffy the Vampire Slayer", first.getSeriesName());
		assertEquals("Welcome to the Hellmouth (1)", first.getTitle());
		assertEquals("1", first.getEpisode().toString());
		assertEquals("1", first.getSeason().toString());
		assertEquals("1997-03-10", first.airdate().toString());
	}


	@Test
	public void getEpisodeListAllSingleSeason() throws Exception {
		// 1 season only; the episode guide is expected to list 15 episodes
		List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Firefly").get(0));

		assertEquals(15, list.size());

		Episode fourth = list.get(3);

		assertEquals("Firefly", fourth.getSeriesName());
		assertEquals("Jaynestown", fourth.getTitle());
		assertEquals("4", fourth.getEpisode().toString());
		assertEquals("1", fourth.getSeason().toString());
		assertEquals("2002-10-18", fourth.airdate().toString());
	}


	@Test
	public void getEpisodeListAllManySeasons() throws Exception {
		// more than 700 episodes / 26 seasons
		List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Doctor Who (1963)").get(0));

		// there are still new episodes coming out
		assertTrue(list.size() > 700);
	}


	@Test
	public void getEpisodeListEncoding() throws Exception {
		List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Lost").get(0), 3);

		// title contains a non-ASCII character, so this checks the page encoding is handled
		Episode episode = list.get(13);

		assertEquals("Lost", episode.getSeriesName());
		assertEquals("Exposé", episode.getTitle());
		assertEquals("14", episode.getEpisode().toString());
		assertEquals("3", episode.getSeason().toString());
		assertEquals("2007-03-28", episode.airdate().toString());
	}


	@Test
	public void getEpisodeListLink() {
		// expected value first, so that failure messages are reported the right way round
		assertEquals("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=1", tvdotcom.getEpisodeListLink(buffySearchResult, 1).toString());
	}


	@Test
	public void getEpisodeListLinkAll() {
		// the season number is appended as is, even for season 0
		assertEquals("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=0", tvdotcom.getEpisodeListLink(buffySearchResult, 0).toString());

		// the link without explicit season refers to the guide for all seasons
		assertEquals("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode.html?season=All", tvdotcom.getEpisodeListLink(buffySearchResult).toString());
	}

}

View File

@ -8,8 +8,8 @@ import org.junit.runners.Suite.SuiteClasses;
@RunWith(Suite.class)
@SuiteClasses( { TVDotComClientTest.class, AnidbClientTest.class, TVRageClientTest.class, TheTVDBClientTest.class, SerienjunkiesClientTest.class, TMDbClientTest.class, IMDbClientTest.class, SubsceneSubtitleClientTest.class,
SublightSubtitleClientTest.class, OpenSubtitlesXmlRpcTest.class })
@SuiteClasses( { AnidbClientTest.class, TVRageClientTest.class, TheTVDBClientTest.class, SerienjunkiesClientTest.class, TMDbClientTest.class, IMDbClientTest.class, SubsceneSubtitleClientTest.class, SublightSubtitleClientTest.class,
OpenSubtitlesXmlRpcTest.class })
public class WebTestSuite {
}