* fix subscene scraper

This commit is contained in:
Reinhard Pointner 2013-01-15 11:28:19 +00:00
parent 6384e97b64
commit e3ba7b79e3
3 changed files with 19 additions and 32 deletions

View File

@ -2,17 +2,15 @@
package net.sourceforge.filebot.web;
import static net.sourceforge.filebot.web.WebRequest.*;
import static java.util.Collections.*;
import static net.sourceforge.tuned.XPathUtilities.*;
import java.net.HttpURLConnection;
import java.io.IOException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Map;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
@ -21,7 +19,6 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
private String language;
private URL subtitlePage;
private Map<String, String> subtitleInfo;
public SubsceneSubtitleDescriptor(String title, String language, URL subtitlePage) {
@ -51,30 +48,14 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
@Override
public ByteBuffer fetch() throws Exception {
URL downloadLink = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), "/subtitle/download");
HttpURLConnection connection = (HttpURLConnection) downloadLink.openConnection();
connection.addRequestProperty("Referer", subtitlePage.toString());
return WebRequest.post(connection, getSubtitleInfo());
return WebRequest.fetch(getDownloadLink(), 0, singletonMap("Referer", subtitlePage.toString()));
}
private synchronized Map<String, String> getSubtitleInfo() {
// extract subtitle information from subtitle page if necessary
if (subtitleInfo == null) {
subtitleInfo = new HashMap<String, String>();
try {
Document dom = getHtmlDocument(subtitlePage);
for (Node input : selectNodes("id('dl')//INPUT[@name]", dom)) {
subtitleInfo.put(getAttribute("name", input), getAttribute("value", input));
}
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Failed to extract subtitle info", e);
}
}
return subtitleInfo;
/**
 * Resolves the download URL for this subtitle by scraping the subtitle page.
 *
 * The subtitle page is fetched as an HTML document and the {@code href}
 * attribute of the element with id {@code downloadButton} is read. The link is
 * then resolved against the subtitle page URL, so that relative paths,
 * host-absolute paths and fully qualified URLs are all handled, and a
 * non-default port of the subtitle page is preserved.
 *
 * @return the URL the subtitle archive can be downloaded from
 * @throws IOException if the page cannot be retrieved, the download link is
 *         missing from the page, or the link is not a valid URL
 * @throws SAXException if the page cannot be parsed
 */
private URL getDownloadLink() throws IOException, SAXException {
	Document page = WebRequest.getHtmlDocument(subtitlePage);
	String href = selectString("id('downloadButton')/@href", page);

	// fail loudly instead of silently building a link to the site root
	// when the page layout has changed and the button cannot be found
	if (href == null || href.trim().isEmpty()) {
		throw new IOException("Download link not found: " + subtitlePage);
	}

	// resolve against the page URL rather than concatenating protocol/host/file
	return new URL(subtitlePage, href.trim());
}

View File

@ -133,7 +133,9 @@ public final class WebRequest {
public static ByteBuffer fetch(URL url, long ifModifiedSince, Map<String, String> requestParameters) throws IOException {
URLConnection connection = url.openConnection();
connection.setIfModifiedSince(ifModifiedSince);
if (ifModifiedSince > 0) {
connection.setIfModifiedSince(ifModifiedSince);
}
if (requestParameters != null) {
for (Entry<String, String> parameter : requestParameters.entrySet()) {

View File

@ -9,6 +9,8 @@ import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import net.sourceforge.filebot.vfs.ArchiveType;
import net.sourceforge.filebot.vfs.MemoryFile;
import net.sourceforge.filebot.web.SubsceneSubtitleClient.SubsceneSearchResult;
import org.junit.BeforeClass;
@ -50,7 +52,7 @@ public class SubsceneSubtitleClientTest {
@Test
public void search2() throws Exception {
List<SearchResult> results = subscene.search("Avatar 2009");
List<SearchResult> results = subscene.search("firefly");
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
assertEquals("Firefly - The Complete Series (2002)", result.toString());
@ -100,10 +102,12 @@ public class SubsceneSubtitleClientTest {
public void downloadSubtitleArchive() throws Exception {
SearchResult selectedResult = subscene.search("firefly").get(0);
SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(0);
assertEquals("Firefly.S01E00-13.DVDRip-Rogue.eng-RETAIL", subtitleDescriptor.getName());
assertEquals("Firefly The Complete Series", subtitleDescriptor.getName());
ByteBuffer archive = subtitleDescriptor.fetch();
assertEquals(254549, archive.remaining());
ByteBuffer data = subtitleDescriptor.fetch();
Iterable<MemoryFile> archive = ArchiveType.RAR.fromData(data);
MemoryFile file = archive.iterator().next();
assertEquals("Firefly - 1x01 - Serenity.srt", file.getName());
}
}