* fixed/updated Subscene scraper

* fixed/updated Sublight webservice
* updated test cases related to various web resources
This commit is contained in:
Reinhard Pointner 2010-10-21 14:47:47 +00:00
parent 531b455da2
commit d3bdcf597e
13 changed files with 154 additions and 64 deletions

Binary file not shown.

View File

@ -73,7 +73,7 @@ public class IMDbClient implements EpisodeListProvider {
if (results.isEmpty()) {
try {
String name = normalizeName(selectString("//H1/text()", dom));
String year = selectString("//H1//A", dom);
String year = new Scanner(selectString("//H1//SPAN", dom)).useDelimiter("\\D+").next();
String url = selectString("//LINK[@rel='canonical']/@href", dom);
results.add(new MovieDescriptor(name, Integer.parseInt(year), getImdbId(url)));

View File

@ -29,10 +29,10 @@ import net.sublight.webservice.ClientInfo;
import net.sublight.webservice.Genre;
import net.sublight.webservice.IMDB;
import net.sublight.webservice.Release;
import net.sublight.webservice.Sublight;
import net.sublight.webservice.SublightSoap;
import net.sublight.webservice.Subtitle;
import net.sublight.webservice.SubtitleLanguage;
import net.sublight.webservice.SubtitlesAPI2;
import net.sublight.webservice.SubtitlesAPI2Soap;
public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtitleService {
@ -41,7 +41,7 @@ public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtit
private final ClientInfo clientInfo = new ClientInfo();
private SubtitlesAPI2Soap webservice;
private SublightSoap webservice;
private String session;
@ -273,19 +273,23 @@ public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtit
}
protected byte[] getZipArchive(Subtitle subtitle) throws WebServiceException {
protected byte[] getZipArchive(Subtitle subtitle) throws WebServiceException, InterruptedException {
// require login
login();
Holder<String> ticket = new Holder<String>();
Holder<Short> que = new Holder<Short>();
Holder<byte[]> data = new Holder<byte[]>();
Holder<String> error = new Holder<String>();
webservice.getDownloadTicket(session, null, subtitle.getSubtitleID(), null, ticket, null, error);
webservice.getDownloadTicket2(session, null, subtitle.getSubtitleID(), null, ticket, que, null, error);
// abort if something went wrong
checkError(error);
// wait x seconds as specified by the download ticket response, download ticket is not valid until then
Thread.sleep(que.value * 1000);
webservice.downloadByID4(session, subtitle.getSubtitleID(), -1, false, ticket.value, null, data, null, error);
// abort if something went wrong
@ -306,7 +310,7 @@ public class SublightSubtitleClient implements SubtitleProvider, VideoHashSubtit
protected synchronized void login() throws WebServiceException {
if (webservice == null) {
// lazy initialize because all the JAX-WS class loading can take quite some time
webservice = new SubtitlesAPI2().getSubtitlesAPI2Soap();
webservice = new Sublight().getSublightSoap();
}
if (session == null) {

View File

@ -17,8 +17,6 @@ import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.Icon;
@ -28,7 +26,6 @@ import org.xml.sax.SAXException;
import net.sourceforge.filebot.ResourceManager;
import net.sourceforge.filebot.Settings;
import net.sourceforge.tuned.FileUtilities;
public class SubsceneSubtitleClient implements SubtitleProvider {
@ -128,9 +125,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
List<Node> nodes = selectNodes("//TABLE[@class='filmSubtitleList']//A[@class='a1']", subtitleListDocument);
// match subtitleId and typeId
Pattern hrefPattern = Pattern.compile("javascript:Subtitle\\((\\d+), '(\\w+)', .*");
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
for (Node node : nodes) {
@ -140,18 +134,9 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
if (languageName == null || languageName.equalsIgnoreCase(lang)) {
String name = getTextContent(getChildren("SPAN", node).get(1));
String href = getAttribute("href", node);
URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href);
Matcher matcher = hrefPattern.matcher(href);
if (!matcher.matches())
throw new IllegalArgumentException("Cannot parse download parameters: " + href);
String subtitleId = matcher.group(1);
String archiveType = matcher.group(2);
URL downloadUrl = getDownloadLink(subtitleListUrl, subtitleId, archiveType);
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, archiveType, downloadUrl, subtitleListUrl));
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, subtitlePage));
}
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e);
@ -199,14 +184,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
}
/**
 * Builds the download URL for a subtitle archive on the referer's host.
 * <p>
 * The path has the form <code>{name}-dlpath-{subtitleId}/{typeId}.zipx</code>, where
 * <code>{name}</code> is whatever {@code FileUtilities.getNameWithoutExtension} returns
 * for the referer's file part.
 *
 * @param referer    page the download is linked from; supplies protocol, host and base path
 * @param subtitleId identifier placed after <code>-dlpath-</code>
 * @param typeId     identifier used as the file name of the <code>.zipx</code> resource
 * @return the assembled download URL
 * @throws MalformedURLException if the assembled parts do not form a valid URL
 */
protected URL getDownloadLink(URL referer, String subtitleId, String typeId) throws MalformedURLException {
	String refererName = FileUtilities.getNameWithoutExtension(referer.getFile());
	String downloadPath = String.format("%s-dlpath-%s/%s.zipx", refererName, subtitleId, typeId);

	String protocol = referer.getProtocol();
	String host = referer.getHost();
	return new URL(protocol, host, downloadPath);
}
@Override
public URI getSubtitleListLink(SearchResult searchResult, String languageName) {
return ((HyperLink) searchResult).getURI();

View File

@ -2,31 +2,33 @@
package net.sourceforge.filebot.web;
import static java.util.Collections.*;
import static net.sourceforge.filebot.web.WebRequest.*;
import static net.sourceforge.tuned.XPathUtilities.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Map;
import org.w3c.dom.Document;
import net.sourceforge.tuned.FileUtilities;
public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
private final String title;
private final String language;
private String title;
private String language;
private final String archiveType;
private final URL downloadLink;
private final URL referer;
private URL subtitlePage;
private Map<String, String> subtitleInfo;
public SubsceneSubtitleDescriptor(String title, String language, String archiveType, URL downloadLink, URL referer) {
public SubsceneSubtitleDescriptor(String title, String language, URL subtitlePage) {
this.title = title;
this.language = language;
this.archiveType = archiveType;
this.downloadLink = downloadLink;
this.referer = referer;
this.subtitlePage = subtitlePage;
}
@ -44,13 +46,42 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
@Override
public String getType() {
return archiveType;
return getSubtitleInfo().get("typeId");
}
@Override
public ByteBuffer fetch() throws Exception {
return WebRequest.fetch(downloadLink, singletonMap("Referer", referer.toString()));
// e.g. http://subscene.com/english/Firefly-The-Complete-Series/subtitle-40003-dlpath-20008/rar.zipx
String subtitlePagePath = FileUtilities.getNameWithoutExtension(subtitlePage.getFile());
String path = String.format("%s-dlpath-%s/%s.zipx", subtitlePagePath, getSubtitleInfo().get("filmId"), getSubtitleInfo().get("typeId"));
URL downloadLocator = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), path);
Map<String, String> downloadPostData = subtitleInfo;
HttpURLConnection connection = (HttpURLConnection) downloadLocator.openConnection();
connection.addRequestProperty("Referer", subtitlePage.toString());
return WebRequest.post(connection, downloadPostData);
}
/**
 * Returns the form values (<code>subtitleId</code>, <code>typeId</code>, <code>filmId</code>)
 * read from the subtitle page, fetching and parsing the page on first use and caching the
 * result afterwards.
 *
 * @return map keyed by the INPUT element names listed above
 * @throws RuntimeException if the page cannot be fetched or parsed; nothing is cached in
 *         that case, so the next call tries again
 */
private synchronized Map<String, String> getSubtitleInfo() {
	// extract subtitle information from subtitle page if necessary
	if (subtitleInfo == null) {
		try {
			Document dom = getHtmlDocument(subtitlePage);

			// fill a local map first and publish it only once it is complete, so that a
			// failure half-way through never leaves a partially filled map cached
			Map<String, String> info = new HashMap<String, String>();
			info.put("subtitleId", selectString("//INPUT[@name='subtitleId']/@value", dom));
			info.put("typeId", selectString("//INPUT[@name='typeId']/@value", dom));
			info.put("filmId", selectString("//INPUT[@name='filmId']/@value", dom));

			subtitleInfo = info;
		} catch (Exception e) {
			throw new RuntimeException("Failed to extract subtitle info", e);
		}
	}

	return subtitleInfo;
}

View File

@ -5,9 +5,13 @@ package net.sourceforge.filebot.web;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Map;
@ -131,6 +135,44 @@ public final class WebRequest {
}
/**
 * Sends the given parameters as an <code>application/x-www-form-urlencoded</code> POST
 * request over the given connection and reads the whole response body.
 *
 * @param connection connection to send the request on; request headers already set on it
 *        (e.g. Referer) are kept
 * @param parameters form fields, encoded with {@link #encodeParameters(Map)}
 * @return buffer containing the response body
 * @throws IOException if writing the request or reading the response fails
 */
public static ByteBuffer post(HttpURLConnection connection, Map<String, String> parameters) throws IOException {
	byte[] postData = encodeParameters(parameters).getBytes("UTF-8");

	// add content type and content length headers
	connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
	connection.addRequestProperty("Content-Length", String.valueOf(postData.length));
	connection.setRequestMethod("POST");
	connection.setDoOutput(true);

	// write post data, making sure the stream is closed even if the write fails
	OutputStream out = connection.getOutputStream();
	try {
		out.write(postData);
	} finally {
		out.close();
	}

	// read response
	int contentLength = connection.getContentLength();
	InputStream in = connection.getInputStream();
	ByteBufferOutputStream buffer = new ByteBufferOutputStream(contentLength >= 0 ? contentLength : 32 * 1024);

	try {
		// read all
		buffer.transferFully(in);
	} catch (IOException e) {
		// if the content length is not known in advance an IOException (Premature EOF)
		// is always thrown after all the data has been read
		if (contentLength >= 0) {
			throw e;
		}
	} finally {
		in.close();
	}

	return buffer.getByteBuffer();
}
private static Charset getCharset(String contentType) {
if (contentType != null) {
// e.g. Content-Type: text/html; charset=iso-8859-1
@ -155,6 +197,28 @@ public final class WebRequest {
}
/**
 * Encodes the given parameters as an <code>application/x-www-form-urlencoded</code> string,
 * i.e. <code>key1=value1&amp;key2=value2</code>, in the map's iteration order.
 * <p>
 * Both keys and values are URL-encoded as UTF-8, so reserved characters such as
 * <code>&amp;</code>, <code>=</code> or spaces in either cannot corrupt the result.
 *
 * @param parameters names and values to encode; keys and values must not be null
 * @return the encoded string, empty if the map is empty
 */
public static String encodeParameters(Map<String, String> parameters) {
	StringBuilder sb = new StringBuilder();

	try {
		for (Entry<String, String> entry : parameters.entrySet()) {
			if (sb.length() > 0)
				sb.append("&");

			// the key needs the same escaping as the value
			sb.append(URLEncoder.encode(entry.getKey(), "UTF-8"));
			sb.append("=");
			sb.append(URLEncoder.encode(entry.getValue(), "UTF-8"));
		}
	} catch (UnsupportedEncodingException e) {
		// will never happen
		throw new RuntimeException(e);
	}

	return sb.toString();
}
/**
* Dummy constructor to prevent instantiation.
*/

View File

@ -7,7 +7,8 @@ import static org.junit.Assert.*;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.LinkedList;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.junit.Test;
@ -17,7 +18,7 @@ public class SubRipReaderTest {
@Test
public void parse() throws Exception {
LinkedList<SubtitleElement> list = new LinkedList<SubtitleElement>();
List<SubtitleElement> list = new ArrayList<SubtitleElement>();
URL resource = new URL("http://www.opensubtitles.org/en/download/file/1951733951.gz");
InputStream source = new GZIPInputStream(resource.openStream());
@ -32,12 +33,12 @@ public class SubRipReaderTest {
reader.close();
}
assertEquals(499, list.size(), 0);
assertEquals(501, list.size(), 0);
assertEquals(3455, list.getFirst().getStart(), 0);
assertEquals(6799, list.getFirst().getEnd(), 0);
assertEquals(3455, list.get(0).getStart(), 0);
assertEquals(6799, list.get(0).getEnd(), 0);
assertEquals("Come with me if you want to live.", list.get(253).getText());
assertEquals("Come with me if you want to live.", list.get(254).getText());
}
}

View File

@ -24,7 +24,7 @@ public class IMDbClientTest {
assertEquals(2004, movie.getYear());
assertEquals(407362, movie.getImdbId(), 0);
assertEquals(7, results.size(), 0);
assertEquals(8, results.size(), 0);
}
@ -92,7 +92,7 @@ public class IMDbClientTest {
Episode first = list.get(0);
assertEquals("Mushishi", first.getSeriesName());
assertEquals("Mushi-Shi", first.getSeriesName());
assertEquals("Midori no za", first.getTitle());
assertEquals("1", first.getEpisode());
assertEquals("1", first.getSeason());

View File

@ -39,7 +39,7 @@ public class OpenSubtitlesXmlRpcTest {
MovieDescriptor sample = (MovieDescriptor) list.get(0);
// check sample entry
assertEquals("\"Babylon 5\" (1994) (TV series)", sample.getName());
assertEquals("\"Babylon 5\" (1994)", sample.getName());
assertEquals(105946, sample.getImdbId());
}
@ -182,7 +182,7 @@ public class OpenSubtitlesXmlRpcTest {
ByteBuffer data = list.get(0).fetch();
// check size
assertEquals(48550, data.remaining(), 0);
assertEquals(48707, data.remaining(), 0);
}

View File

@ -5,6 +5,7 @@ package net.sourceforge.filebot.web;
import static org.junit.Assert.*;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
@ -98,4 +99,16 @@ public class SubsceneSubtitleClientTest {
assertEquals(twinpeaksSearchResult.getURL().toString(), subscene.getSubtitleListLink(twinpeaksSearchResult, null).toURL().toString());
}
/**
 * Searches for "firefly", takes the second English subtitle of the first search result
 * and checks its name and the size of the fetched archive.
 */
@Test
public void downloadSubtitleArchive() throws Exception {
	SearchResult selectedResult = subscene.search("firefly").get(0);
	SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(1);

	// expected value goes first so that a failure message reads correctly
	assertEquals("Firefly - The Complete Series", subtitleDescriptor.getName());

	ByteBuffer archive = subtitleDescriptor.fetch();
	assertEquals(254549, archive.remaining());
}
}

View File

@ -17,8 +17,8 @@ public class TMDbClientTest {
@Test
public void searchByName() throws Exception {
List<MovieDescriptor> result = tmdb.searchMovie("transformers");
MovieDescriptor movie = result.get(0);
List<MovieDescriptor> result = tmdb.searchMovie("Transformers");
MovieDescriptor movie = result.get(1);
assertEquals("Transformers", movie.getName());
assertEquals(2007, movie.getYear());

View File

@ -107,7 +107,7 @@ public class TVDotComClientTest {
public void getEpisodeListEncoding() throws Exception {
List<Episode> list = tvdotcom.getEpisodeList(tvdotcom.search("Lost").get(0), 3);
Episode episode = list.get(16);
Episode episode = list.get(13);
assertEquals("Lost", episode.getSeriesName());
assertEquals("Exposé", episode.getTitle());

View File

@ -40,7 +40,7 @@ public class TheTVDBClientTest {
public void searchGerman() throws Exception {
List<SearchResult> results = thetvdb.search("buffy", Locale.GERMAN);
assertEquals(3, results.size());
assertEquals(4, results.size());
TheTVDBSearchResult first = (TheTVDBSearchResult) results.get(0);
@ -70,9 +70,9 @@ public class TheTVDBClientTest {
// check special episode
Episode last = list.get(list.size() - 1);
assertEquals("Buffy the Vampire Slayer", last.getSeriesName());
assertEquals("Season 5 Overview", last.getTitle());
assertEquals("Special 17", last.getEpisode());
assertEquals("0", last.getSeason());
assertEquals("Unaired Pilot", last.getTitle());
assertEquals("Special 1", last.getEpisode());
assertEquals("1", last.getSeason());
}