* fixed/updated Subscene scraper

* fixed/updated Sublight webservice
* updated test cases related to various web resources
This commit is contained in:
Reinhard Pointner 2010-10-21 14:47:47 +00:00
parent 531b455da2
commit d3bdcf597e
13 changed files with 154 additions and 64 deletions

Binary file not shown.

View File

@ -73,7 +73,7 @@ public class IMDbClient implements EpisodeListProvider {
if (results.isEmpty()) { if (results.isEmpty()) {
try { try {
String name = normalizeName(selectString("//H1/text()", dom)); String name = normalizeName(selectString("//H1/text()", dom));
String year = selectString("//H1//A", dom); String year = new Scanner(selectString("//H1//SPAN", dom)).useDelimiter("\\D+").next();
String url = selectString("//LINK[@rel='canonical']/@href", dom); String url = selectString("//LINK[@rel='canonical']/@href", dom);
results.add(new MovieDescriptor(name, Integer.parseInt(year), getImdbId(url))); results.add(new MovieDescriptor(name, Integer.parseInt(year), getImdbId(url)));

View File

@ -29,10 +29,10 @@ import net.sublight.webservice.ClientInfo;
import net.sublight.webservice.Genre; import net.sublight.webservice.Genre;
import net.sublight.webservice.IMDB; import net.sublight.webservice.IMDB;
import net.sublight.webservice.Release; import net.sublight.webservice.Release;
import net.sublight.webservice.Sublight;
import net.sublight.webservice.SublightSoap;
import net.sublight.webservice.Subtitle; import net.sublight.webservice.Subtitle;
import net.sublight.webservice.SubtitleLanguage; import net.sublight.webservice.SubtitleLanguage;
import net.sublight.webservice.SubtitlesAPI2;
import net.sublight.webservice.SubtitlesAPI2Soap;
public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtitleService { public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtitleService {
@ -41,7 +41,7 @@ public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtit
private final ClientInfo clientInfo = new ClientInfo(); private final ClientInfo clientInfo = new ClientInfo();
private SubtitlesAPI2Soap webservice; private SublightSoap webservice;
private String session; private String session;
@ -273,19 +273,23 @@ public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtit
} }
protected byte[] getZipArchive(Subtitle subtitle) throws WebServiceException { protected byte[] getZipArchive(Subtitle subtitle) throws WebServiceException, InterruptedException {
// require login // require login
login(); login();
Holder<String> ticket = new Holder<String>(); Holder<String> ticket = new Holder<String>();
Holder<Short> que = new Holder<Short>();
Holder<byte[]> data = new Holder<byte[]>(); Holder<byte[]> data = new Holder<byte[]>();
Holder<String> error = new Holder<String>(); Holder<String> error = new Holder<String>();
webservice.getDownloadTicket(session, null, subtitle.getSubtitleID(), null, ticket, null, error); webservice.getDownloadTicket2(session, null, subtitle.getSubtitleID(), null, ticket, que, null, error);
// abort if something went wrong // abort if something went wrong
checkError(error); checkError(error);
// wait x seconds as specified by the download ticket response, download ticket is not valid until then
Thread.sleep(que.value * 1000);
webservice.downloadByID4(session, subtitle.getSubtitleID(), -1, false, ticket.value, null, data, null, error); webservice.downloadByID4(session, subtitle.getSubtitleID(), -1, false, ticket.value, null, data, null, error);
// abort if something went wrong // abort if something went wrong
@ -306,7 +310,7 @@ public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtit
protected synchronized void login() throws WebServiceException { protected synchronized void login() throws WebServiceException {
if (webservice == null) { if (webservice == null) {
// lazy initialize because all the JAX-WS class loading can take quite some time // lazy initialize because all the JAX-WS class loading can take quite some time
webservice = new SubtitlesAPI2().getSubtitlesAPI2Soap(); webservice = new Sublight().getSublightSoap();
} }
if (session == null) { if (session == null) {

View File

@ -17,8 +17,6 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.Icon; import javax.swing.Icon;
@ -28,7 +26,6 @@ import org.xml.sax.SAXException;
import net.sourceforge.filebot.ResourceManager; import net.sourceforge.filebot.ResourceManager;
import net.sourceforge.filebot.Settings; import net.sourceforge.filebot.Settings;
import net.sourceforge.tuned.FileUtilities;
public class SubsceneSubtitleClient implements SubtitleProvider { public class SubsceneSubtitleClient implements SubtitleProvider {
@ -128,9 +125,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
List<Node> nodes = selectNodes("//TABLE[@class='filmSubtitleList']//A[@class='a1']", subtitleListDocument); List<Node> nodes = selectNodes("//TABLE[@class='filmSubtitleList']//A[@class='a1']", subtitleListDocument);
// match subtitleId and typeId
Pattern hrefPattern = Pattern.compile("javascript:Subtitle\\((\\d+), '(\\w+)', .*");
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size()); List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
for (Node node : nodes) { for (Node node : nodes) {
@ -140,18 +134,9 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
if (languageName == null || languageName.equalsIgnoreCase(lang)) { if (languageName == null || languageName.equalsIgnoreCase(lang)) {
String name = getTextContent(getChildren("SPAN", node).get(1)); String name = getTextContent(getChildren("SPAN", node).get(1));
String href = getAttribute("href", node); String href = getAttribute("href", node);
URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href);
Matcher matcher = hrefPattern.matcher(href); subtitles.add(new SubsceneSubtitleDescriptor(name, lang, subtitlePage));
if (!matcher.matches())
throw new IllegalArgumentException("Cannot parse download parameters: " + href);
String subtitleId = matcher.group(1);
String archiveType = matcher.group(2);
URL downloadUrl = getDownloadLink(subtitleListUrl, subtitleId, archiveType);
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, archiveType, downloadUrl, subtitleListUrl));
} }
} catch (Exception e) { } catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e); Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e);
@ -199,14 +184,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
} }
protected URL getDownloadLink(URL referer, String subtitleId, String typeId) throws MalformedURLException {
String basePath = FileUtilities.getNameWithoutExtension(referer.getFile());
String path = String.format("%s-dlpath-%s/%s.zipx", basePath, subtitleId, typeId);
return new URL(referer.getProtocol(), referer.getHost(), path);
}
@Override @Override
public URI getSubtitleListLink(SearchResult searchResult, String languageName) { public URI getSubtitleListLink(SearchResult searchResult, String languageName) {
return ((HyperLink) searchResult).getURI(); return ((HyperLink) searchResult).getURI();

View File

@ -2,31 +2,33 @@
package net.sourceforge.filebot.web; package net.sourceforge.filebot.web;
import static java.util.Collections.*; import static net.sourceforge.filebot.web.WebRequest.*;
import static net.sourceforge.tuned.XPathUtilities.*;
import java.net.HttpURLConnection;
import java.net.URL; import java.net.URL;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Map;
import org.w3c.dom.Document;
import net.sourceforge.tuned.FileUtilities;
public class SubsceneSubtitleDescriptor implements SubtitleDescriptor { public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
private final String title; private String title;
private final String language; private String language;
private final String archiveType; private URL subtitlePage;
private Map<String, String> subtitleInfo;
private final URL downloadLink;
private final URL referer;
public SubsceneSubtitleDescriptor(String title, String language, String archiveType, URL downloadLink, URL referer) { public SubsceneSubtitleDescriptor(String title, String language, URL subtitlePage) {
this.title = title; this.title = title;
this.language = language; this.language = language;
this.subtitlePage = subtitlePage;
this.archiveType = archiveType;
this.downloadLink = downloadLink;
this.referer = referer;
} }
@ -44,13 +46,42 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
@Override @Override
public String getType() { public String getType() {
return archiveType; return getSubtitleInfo().get("typeId");
} }
@Override @Override
public ByteBuffer fetch() throws Exception { public ByteBuffer fetch() throws Exception {
return WebRequest.fetch(downloadLink, singletonMap("Referer", referer.toString())); // e.g. http://subscene.com/english/Firefly-The-Complete-Series/subtitle-40003-dlpath-20008/rar.zipx
String subtitlePagePath = FileUtilities.getNameWithoutExtension(subtitlePage.getFile());
String path = String.format("%s-dlpath-%s/%s.zipx", subtitlePagePath, getSubtitleInfo().get("filmId"), getSubtitleInfo().get("typeId"));
URL downloadLocator = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), path);
Map<String, String> downloadPostData = subtitleInfo;
HttpURLConnection connection = (HttpURLConnection) downloadLocator.openConnection();
connection.addRequestProperty("Referer", subtitlePage.toString());
return WebRequest.post(connection, downloadPostData);
}
/**
 * Returns the subtitle parameters scraped from the subtitle page, loading them on first use.
 *
 * On the first call the subtitle page is fetched and parsed, and the values of the
 * INPUT elements named subtitleId, typeId and filmId are stored in a map. Later calls
 * return the cached map. The method is synchronized, so concurrent callers on the same
 * descriptor are serialized.
 *
 * NOTE(review): the map is assigned to the field before it is populated. If one of the
 * selectString calls can throw (not visible from here), a partially filled map would stay
 * cached and be returned by later calls instead of retrying. Confirm and consider
 * populating a local map first.
 *
 * @return map with the keys subtitleId, typeId and filmId
 * @throws RuntimeException if the subtitle page cannot be fetched or parsed; the original
 *         exception is kept as the cause
 */
private synchronized Map<String, String> getSubtitleInfo() {
// extract subtitle information from subtitle page if necessary
if (subtitleInfo == null) {
try {
// fetch and parse the subtitle page (network access happens here)
Document dom = getHtmlDocument(subtitlePage);
subtitleInfo = new HashMap<String, String>();
// read the values of the correspondingly named INPUT elements of the page
subtitleInfo.put("subtitleId", selectString("//INPUT[@name='subtitleId']/@value", dom));
subtitleInfo.put("typeId", selectString("//INPUT[@name='typeId']/@value", dom));
subtitleInfo.put("filmId", selectString("//INPUT[@name='filmId']/@value", dom));
} catch (Exception e) {
// wrap in an unchecked exception, since this method declares no checked exceptions
throw new RuntimeException("Failed to extract subtitle info", e);
}
}
return subtitleInfo;
} }

View File

@ -5,9 +5,13 @@ package net.sourceforge.filebot.web;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader; import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Map; import java.util.Map;
@ -131,6 +135,44 @@ public final class WebRequest {
} }
/**
 * Sends the given parameters as a form-urlencoded HTTP POST request body over the
 * given connection and returns the complete response body.
 *
 * The connection must not have been connected yet; request headers, request method
 * and output mode are configured here. Both the request and the response stream are
 * closed before this method returns, even if writing or reading fails.
 *
 * @param connection unconnected HTTP connection to the target resource
 * @param parameters form parameters, encoded via {@link #encodeParameters(Map)}
 * @return buffer holding the response body
 * @throws IOException if sending the request or reading the response fails
 */
public static ByteBuffer post(HttpURLConnection connection, Map<String, String> parameters) throws IOException {
	byte[] postData = encodeParameters(parameters).getBytes("UTF-8");

	// add content type and content length headers
	connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
	connection.addRequestProperty("Content-Length", String.valueOf(postData.length));
	connection.setRequestMethod("POST");
	connection.setDoOutput(true);

	// write post data, releasing the request stream even if the write fails
	OutputStream out = connection.getOutputStream();
	try {
		out.write(postData);
	} finally {
		out.close();
	}

	// read response
	int contentLength = connection.getContentLength();
	InputStream in = connection.getInputStream();

	// use the announced content length as initial capacity, or 32 KiB if it is unknown
	ByteBufferOutputStream buffer = new ByteBufferOutputStream(contentLength >= 0 ? contentLength : 32 * 1024);

	try {
		// read all
		buffer.transferFully(in);
	} catch (IOException e) {
		// if the content length is not known in advance an IOException (Premature EOF)
		// is always thrown after all the data has been read
		if (contentLength >= 0) {
			throw e;
		}
	} finally {
		in.close();
	}

	return buffer.getByteBuffer();
}
private static Charset getCharset(String contentType) { private static Charset getCharset(String contentType) {
if (contentType != null) { if (contentType != null) {
// e.g. Content-Type: text/html; charset=iso-8859-1 // e.g. Content-Type: text/html; charset=iso-8859-1
@ -155,6 +197,28 @@ public final class WebRequest {
} }
/**
 * Encodes the given parameters as an application/x-www-form-urlencoded string,
 * e.g. <code>name1=value1&amp;name2=value2</code>.
 *
 * Both parameter names and values are URL-encoded as UTF-8, so reserved characters
 * such as '&amp;', '=' or spaces in either of them cannot corrupt the resulting string.
 * Parameters are emitted in the iteration order of the given map.
 *
 * @param parameters parameter names mapped to their values
 * @return the encoded parameter string, or an empty string if the map is empty
 */
public static String encodeParameters(Map<String, String> parameters) {
	StringBuilder sb = new StringBuilder();

	try {
		for (Entry<String, String> entry : parameters.entrySet()) {
			// separate this parameter from the previous one
			if (sb.length() > 0)
				sb.append('&');

			// the name needs the same escaping as the value
			sb.append(URLEncoder.encode(entry.getKey(), "UTF-8"));
			sb.append('=');
			sb.append(URLEncoder.encode(entry.getValue(), "UTF-8"));
		}
	} catch (UnsupportedEncodingException e) {
		// will never happen, every Java platform is required to support UTF-8
		throw new RuntimeException(e);
	}

	return sb.toString();
}
/** /**
* Dummy constructor to prevent instantiation. * Dummy constructor to prevent instantiation.
*/ */

View File

@ -7,7 +7,8 @@ import static org.junit.Assert.*;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.net.URL; import java.net.URL;
import java.util.LinkedList; import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import org.junit.Test; import org.junit.Test;
@ -17,7 +18,7 @@ public class SubRipReaderTest {
@Test @Test
public void parse() throws Exception { public void parse() throws Exception {
LinkedList<SubtitleElement> list = new LinkedList<SubtitleElement>(); List<SubtitleElement> list = new ArrayList<SubtitleElement>();
URL resource = new URL("http://www.opensubtitles.org/en/download/file/1951733951.gz"); URL resource = new URL("http://www.opensubtitles.org/en/download/file/1951733951.gz");
InputStream source = new GZIPInputStream(resource.openStream()); InputStream source = new GZIPInputStream(resource.openStream());
@ -32,12 +33,12 @@ public class SubRipReaderTest {
reader.close(); reader.close();
} }
assertEquals(499, list.size(), 0); assertEquals(501, list.size(), 0);
assertEquals(3455, list.getFirst().getStart(), 0); assertEquals(3455, list.get(0).getStart(), 0);
assertEquals(6799, list.getFirst().getEnd(), 0); assertEquals(6799, list.get(0).getEnd(), 0);
assertEquals("Come with me if you want to live.", list.get(253).getText()); assertEquals("Come with me if you want to live.", list.get(254).getText());
} }
} }

View File

@ -24,7 +24,7 @@ public class IMDbClientTest {
assertEquals(2004, movie.getYear()); assertEquals(2004, movie.getYear());
assertEquals(407362, movie.getImdbId(), 0); assertEquals(407362, movie.getImdbId(), 0);
assertEquals(7, results.size(), 0); assertEquals(8, results.size(), 0);
} }
@ -92,7 +92,7 @@ public class IMDbClientTest {
Episode first = list.get(0); Episode first = list.get(0);
assertEquals("Mushishi", first.getSeriesName()); assertEquals("Mushi-Shi", first.getSeriesName());
assertEquals("Midori no za", first.getTitle()); assertEquals("Midori no za", first.getTitle());
assertEquals("1", first.getEpisode()); assertEquals("1", first.getEpisode());
assertEquals("1", first.getSeason()); assertEquals("1", first.getSeason());

View File

@ -39,7 +39,7 @@ public class OpenSubtitlesXmlRpcTest {
MovieDescriptor sample = (MovieDescriptor) list.get(0); MovieDescriptor sample = (MovieDescriptor) list.get(0);
// check sample entry // check sample entry
assertEquals("\"Babylon 5\" (1994) (TV series)", sample.getName()); assertEquals("\"Babylon 5\" (1994)", sample.getName());
assertEquals(105946, sample.getImdbId()); assertEquals(105946, sample.getImdbId());
} }
@ -182,7 +182,7 @@ public class OpenSubtitlesXmlRpcTest {
ByteBuffer data = list.get(0).fetch(); ByteBuffer data = list.get(0).fetch();
// check size // check size
assertEquals(48550, data.remaining(), 0); assertEquals(48707, data.remaining(), 0);
} }

View File

@ -5,6 +5,7 @@ package net.sourceforge.filebot.web;
import static org.junit.Assert.*; import static org.junit.Assert.*;
import java.net.URL; import java.net.URL;
import java.nio.ByteBuffer;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -98,4 +99,16 @@ public class SubsceneSubtitleClientTest {
assertEquals(twinpeaksSearchResult.getURL().toString(), subscene.getSubtitleListLink(twinpeaksSearchResult, null).toURL().toString()); assertEquals(twinpeaksSearchResult.getURL().toString(), subscene.getSubtitleListLink(twinpeaksSearchResult, null).toURL().toString());
} }
/**
 * Searches Subscene for "firefly", picks the second English subtitle of the first
 * search result and downloads its archive, checking the name and the archive size.
 *
 * The expected name and size are fixed values tied to what the search and download
 * return, so this test will need updating whenever that data changes.
 */
@Test
public void downloadSubtitleArchive() throws Exception {
	SearchResult selectedResult = subscene.search("firefly").get(0);
	SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(1);

	// expected value first, actual value second, so a failure message reads correctly
	assertEquals("Firefly - The Complete Series", subtitleDescriptor.getName());

	ByteBuffer archive = subtitleDescriptor.fetch();
	assertEquals(254549, archive.remaining());
}
} }

View File

@ -17,8 +17,8 @@ public class TMDbClientTest {
@Test @Test
public void searchByName() throws Exception { public void searchByName() throws Exception {
List<MovieDescriptor> result = tmdb.searchMovie("transformers"); List<MovieDescriptor> result = tmdb.searchMovie("Transformers");
MovieDescriptor movie = result.get(0); MovieDescriptor movie = result.get(1);
assertEquals("Transformers", movie.getName()); assertEquals("Transformers", movie.getName());
assertEquals(2007, movie.getYear()); assertEquals(2007, movie.getYear());

View File

@ -107,7 +107,7 @@ public class TVDotComClientTest {
public void getEpisodeListEncoding() throws Exception { public void getEpisodeListEncoding() throws Exception {
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Lost").get(0), 3); List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Lost").get(0), 3);
Episode episode = list.get(16); Episode episode = list.get(13);
assertEquals("Lost", episode.getSeriesName()); assertEquals("Lost", episode.getSeriesName());
assertEquals("Exposé", episode.getTitle()); assertEquals("Exposé", episode.getTitle());

View File

@ -40,7 +40,7 @@ public class TheTVDBClientTest {
public void searchGerman() throws Exception { public void searchGerman() throws Exception {
List<SearchResult> results = thetvdb.search("buffy", Locale.GERMAN); List<SearchResult> results = thetvdb.search("buffy", Locale.GERMAN);
assertEquals(3, results.size()); assertEquals(4, results.size());
TheTVDBSearchResult first = (TheTVDBSearchResult) results.get(0); TheTVDBSearchResult first = (TheTVDBSearchResult) results.get(0);
@ -70,9 +70,9 @@ public class TheTVDBClientTest {
// check special episode // check special episode
Episode last = list.get(list.size() - 1); Episode last = list.get(list.size() - 1);
assertEquals("Buffy the Vampire Slayer", last.getSeriesName()); assertEquals("Buffy the Vampire Slayer", last.getSeriesName());
assertEquals("Season 5 Overview", last.getTitle()); assertEquals("Unaired Pilot", last.getTitle());
assertEquals("Special 17", last.getEpisode()); assertEquals("Special 1", last.getEpisode());
assertEquals("0", last.getSeason()); assertEquals("1", last.getSeason());
} }