+ support n-alias data files

This commit is contained in:
Reinhard Pointner 2013-09-07 15:48:24 +00:00
parent 3a7769ea2f
commit c227ec4bd9
11 changed files with 222 additions and 240 deletions

View File

@ -157,10 +157,15 @@ if (thetvdb_txt.size() < 30000) { throw new Exception('TheTVDB index sanity fail
// BUILD anidb-index.gz
def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles()
def anidb_index = anidb.findResults{ [it.getAnimeId(), it.getPrimaryTitle(), it.getEnglishTitle()] }
def anidb_index = anidb.findResults{
def row = []
row += it.getAnimeId().pad(5)
row += it.names*.replaceAll(/\s+/, ' ')*.replaceAll(/['`´ʻ]+/, /'/)*.trim().unique()
return row
}
// join and sort
def anidb_txt = anidb_index.findResults{ [it[0].pad(5), it[1] ?: '', it[2] == null || it[2].equals(it[1]) ? '' : it[2]]*.replaceAll(/\s+/, ' ')*.trim().join('\t').replaceAll(/['`´ʻ]+/, /'/) }.sort().unique()
def anidb_txt = anidb_index.findResults{ row -> row.join('\t') }.sort().unique()
pack(anidb_out, anidb_txt)
println "AniDB Index: " + anidb_txt.size()

View File

@ -2,9 +2,11 @@
package net.sourceforge.filebot;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.sourceforge.filebot.Settings.*;
import static java.util.Arrays.asList;
import static java.util.Collections.emptyList;
import static net.sourceforge.filebot.Settings.getApplicationName;
import static net.sourceforge.filebot.Settings.getApplicationProperty;
import static net.sourceforge.filebot.Settings.getApplicationVersion;
import java.io.IOException;
import java.util.ArrayList;
@ -142,7 +144,7 @@ public final class WebServices {
@Override
protected Set<String> getFields(SearchResult object) {
return set(object.getName());
return set(object.getNames());
}
};

View File

@ -61,7 +61,6 @@ import net.sourceforge.filebot.similarity.SequenceMatchSimilarity;
import net.sourceforge.filebot.similarity.SeriesNameMatcher;
import net.sourceforge.filebot.similarity.SimilarityComparator;
import net.sourceforge.filebot.similarity.SimilarityMetric;
import net.sourceforge.filebot.web.AnidbSearchResult;
import net.sourceforge.filebot.web.Date;
import net.sourceforge.filebot.web.Episode;
import net.sourceforge.filebot.web.Movie;
@ -377,13 +376,11 @@ public class MediaDetection {
public static synchronized List<Entry<String, SearchResult>> getSeriesIndex() throws IOException {
if (seriesIndex.isEmpty()) {
try {
for (TheTVDBSearchResult it : releaseInfo.getTheTVDBIndex()) {
seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(it.getName()).toLowerCase(), it));
for (SearchResult[] index : new SearchResult[][] { releaseInfo.getTheTVDBIndex(), releaseInfo.getAnidbIndex() }) {
for (SearchResult item : index) {
for (String name : item.getNames()) {
seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(name).toLowerCase(), item));
}
for (AnidbSearchResult it : releaseInfo.getAnidbIndex()) {
seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(it.getPrimaryTitle()).toLowerCase(), it));
if (it.getEnglishTitle() != null) {
seriesIndex.add(new SimpleEntry<String, SearchResult>(normalizePunctuation(it.getEnglishTitle()).toLowerCase(), it));
}
}
} catch (Exception e) {

View File

@ -1,12 +1,15 @@
package net.sourceforge.filebot.media;
import static java.lang.Integer.parseInt;
import static java.util.Arrays.asList;
import static java.util.Arrays.copyOfRange;
import static java.util.Collections.unmodifiableMap;
import static java.util.ResourceBundle.getBundle;
import static java.util.regex.Pattern.CASE_INSENSITIVE;
import static java.util.regex.Pattern.UNICODE_CASE;
import static java.util.regex.Pattern.compile;
import static net.sourceforge.filebot.similarity.Normalization.normalizePunctuation;
import static net.sourceforge.tuned.FileUtilities.readCSV;
import static net.sourceforge.tuned.StringUtilities.join;
import java.io.File;
@ -26,7 +29,6 @@ import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
@ -274,14 +276,15 @@ public class ReleaseInfo {
@Override
public Movie[] process(ByteBuffer data) throws IOException {
Scanner scanner = new Scanner(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
List<String[]> rows = readCSV(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8", "\t");
List<Movie> movies = new ArrayList<Movie>(rows.size());
List<Movie> movies = new ArrayList<Movie>();
while (scanner.hasNext()) {
int imdbid = scanner.nextInt();
String name = scanner.next().trim();
int year = scanner.nextInt();
movies.add(new Movie(name, year, imdbid, -1));
for (String[] row : rows) {
int imdbid = parseInt(row[0]);
int year = parseInt(row[1]);
String name = row[2];
String[] aliasNames = copyOfRange(row, 3, row.length);
movies.add(new Movie(name, aliasNames, year, imdbid, -1));
}
return movies.toArray(new Movie[0]);
@ -296,13 +299,14 @@ public class ReleaseInfo {
@Override
public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException {
Scanner scanner = new Scanner(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
List<String[]> rows = readCSV(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8", "\t");
List<TheTVDBSearchResult> tvshows = new ArrayList<TheTVDBSearchResult>(rows.size());
List<TheTVDBSearchResult> tvshows = new ArrayList<TheTVDBSearchResult>();
while (scanner.hasNext() && scanner.hasNextInt()) {
int id = scanner.nextInt();
String name = scanner.next().trim();
tvshows.add(new TheTVDBSearchResult(name, id));
for (String[] row : rows) {
int id = parseInt(row[0]);
String name = row[1];
String[] aliasNames = copyOfRange(row, 2, row.length);
tvshows.add(new TheTVDBSearchResult(name, aliasNames, id));
}
return tvshows.toArray(new TheTVDBSearchResult[0]);
@ -317,15 +321,14 @@ public class ReleaseInfo {
@Override
public AnidbSearchResult[] process(ByteBuffer data) throws IOException {
Scanner scanner = new Scanner(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
List<String[]> rows = readCSV(new XZInputStream(new ByteBufferInputStream(data)), "UTF-8", "\t");
List<AnidbSearchResult> anime = new ArrayList<AnidbSearchResult>(rows.size());
List<AnidbSearchResult> anime = new ArrayList<AnidbSearchResult>();
while (scanner.hasNext() && scanner.hasNextInt()) {
int aid = scanner.nextInt();
String primaryTitle = scanner.next().trim();
String englishTitle = scanner.next().trim();
anime.add(new AnidbSearchResult(aid, primaryTitle, englishTitle.isEmpty() ? null : englishTitle));
for (String[] row : rows) {
int aid = parseInt(row[0]);
String primaryTitle = row[1];
String[] aliasNames = copyOfRange(row, 2, row.length);
anime.add(new AnidbSearchResult(aid, primaryTitle, aliasNames));
}
return anime.toArray(new AnidbSearchResult[0]);

View File

@ -1,10 +1,12 @@
package net.sourceforge.filebot.web;
import static net.sourceforge.filebot.web.EpisodeUtilities.*;
import static net.sourceforge.filebot.web.WebRequest.*;
import static net.sourceforge.tuned.XPathUtilities.*;
import static net.sourceforge.filebot.web.EpisodeUtilities.sortEpisodes;
import static net.sourceforge.filebot.web.WebRequest.getDocument;
import static net.sourceforge.tuned.XPathUtilities.getAttribute;
import static net.sourceforge.tuned.XPathUtilities.getChild;
import static net.sourceforge.tuned.XPathUtilities.getTextContent;
import static net.sourceforge.tuned.XPathUtilities.selectNodes;
import static net.sourceforge.tuned.XPathUtilities.selectString;
import java.net.URI;
import java.net.URISyntaxException;
@ -30,7 +32,6 @@ import net.sourceforge.filebot.ResourceManager;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
public class AnidbClient extends AbstractEpisodeListProvider {
private static final FloodLimit REQUEST_LIMIT = new FloodLimit(5, 12, TimeUnit.SECONDS); // no more than 5 requests within a 10 second window (+2 seconds for good measure)
@ -40,64 +41,55 @@ public class AnidbClient extends AbstractEpisodeListProvider {
private final String client;
private final int clientver;
public AnidbClient(String client, int clientver) {
this.client = client;
this.clientver = clientver;
}
@Override
public String getName() {
return "AniDB";
}
@Override
public Icon getIcon() {
return ResourceManager.getIcon("search.anidb");
}
@Override
public boolean hasSingleSeasonSupport() {
return false;
}
@Override
public boolean hasLocaleSupport() {
return true;
}
@Override
public ResultCache getCache() {
return new ResultCache(host, Cache.getCache("web-datasource-lv2"));
}
/**
 * Search for anime matching the given query.
 *
 * <p>
 * This override forwards straight to {@link #fetchSearchResult(String, Locale)} and returns
 * its result unchanged.
 *
 * @param query
 *            search text, passed on as is
 * @param locale
 *            passed on as is
 * @return whatever {@code fetchSearchResult} returns for the query
 * @throws Exception
 *             if {@code fetchSearchResult} fails
 */
@Override
public List<SearchResult> search(String query, final Locale locale) throws Exception {
// bypass automatic caching since search is based on locally cached data anyway
return fetchSearchResult(query, locale);
}
@Override
public List<SearchResult> fetchSearchResult(String query, final Locale locale) throws Exception {
LocalSearch<AnidbSearchResult> index = new LocalSearch<AnidbSearchResult>(getAnimeTitles()) {
LocalSearch<SearchResult> index = new LocalSearch<SearchResult>(getAnimeTitles()) {
@Override
protected Set<String> getFields(AnidbSearchResult anime) {
return set(anime.getPrimaryTitle(), anime.getEnglishTitle());
protected Set<String> getFields(SearchResult it) {
return set(it.getNames());
}
};
return new ArrayList<SearchResult>(index.search(query));
}
@Override
public List<Episode> fetchEpisodeList(SearchResult searchResult, SortOrder sortOrder, Locale language) throws Exception {
AnidbSearchResult anime = (AnidbSearchResult) searchResult;
@ -152,7 +144,6 @@ public class AnidbClient extends AbstractEpisodeListProvider {
return episodes;
}
@Override
public URI getEpisodeListLink(SearchResult searchResult) {
try {
@ -162,7 +153,6 @@ public class AnidbClient extends AbstractEpisodeListProvider {
}
}
public synchronized List<AnidbSearchResult> getAnimeTitles() throws Exception {
URL url = new URL("http", host, "/api/anime-titles.dat.gz");
ResultCache cache = getCache();
@ -224,7 +214,8 @@ public class AnidbClient extends AbstractEpisodeListProvider {
localizedTitles.putAll(officialTitleMap.get(entry.getKey())); // primarily use official title if available
}
anime.add(new AnidbSearchResult(entry.getKey(), entry.getValue(), localizedTitles.get("en")));
String englishTitle = localizedTitles.get("en"); // ONLY SUPPORT ENGLISH LOCALIZATION
anime.add(new AnidbSearchResult(entry.getKey(), entry.getValue(), englishTitle == null || englishTitle.isEmpty() ? new String[] {} : new String[] { englishTitle }));
}
// populate cache

View File

@ -8,8 +8,8 @@ public class AnidbSearchResult extends SearchResult {
// used by serializer
}
public AnidbSearchResult(int aid, String primaryTitle, String englishTitle) {
super(primaryTitle, englishTitle);
public AnidbSearchResult(int aid, String primaryTitle, String[] localizedTitles) {
super(primaryTitle, localizedTitles);
this.aid = aid;
}
@ -30,10 +30,6 @@ public class AnidbSearchResult extends SearchResult {
return name;
}
public String getEnglishTitle() {
return aliasNames.length > 0 ? aliasNames[0] : null;
}
@Override
public int hashCode() {
return aid;

View File

@ -1,9 +1,8 @@
package net.sourceforge.filebot.web;
import static java.util.Collections.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static java.util.Collections.singleton;
import static java.util.Collections.sort;
import static net.sourceforge.filebot.similarity.Normalization.normalizePunctuation;
import java.util.AbstractList;
import java.util.AbstractMap.SimpleEntry;
@ -25,7 +24,6 @@ import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import com.ibm.icu.text.Transliterator;
public class LocalSearch<T> {
private final AbstractStringMetric metric = new QGramsDistance();
@ -37,7 +35,6 @@ public class LocalSearch<T> {
private final List<T> objects;
private final List<Set<String>> fields;
public LocalSearch(Collection<? extends T> data) {
objects = new ArrayList<T>(data);
fields = new ArrayList<Set<String>>(objects.size());
@ -47,7 +44,6 @@ public class LocalSearch<T> {
}
}
public List<T> search(String query) throws ExecutionException, InterruptedException {
final String q = normalize(query);
List<Callable<Entry<T, Float>>> tasks = new ArrayList<Callable<Entry<T, Float>>>(objects.size());
@ -105,7 +101,6 @@ public class LocalSearch<T> {
return resultSet.get(index).getKey();
}
@Override
public int size() {
return Math.min(resultSetSize, resultSet.size());
@ -113,24 +108,20 @@ public class LocalSearch<T> {
};
}
public void setResultMinimumSimilarity(float resultMinimumSimilarity) {
this.resultMinimumSimilarity = resultMinimumSimilarity;
}
public void setResultSetSize(int resultSetSize) {
this.resultSetSize = resultSetSize;
}
protected Set<String> getFields(T object) {
return set(object.toString());
return set(singleton(object.toString()));
}
protected Set<String> set(String... values) {
Set<String> set = new HashSet<String>(values.length);
protected Set<String> set(Collection<String> values) {
Set<String> set = new HashSet<String>(values.size());
for (String value : values) {
if (value != null) {
set.add(normalize(value));
@ -139,7 +130,6 @@ public class LocalSearch<T> {
return set;
}
protected String normalize(String value) {
// normalize separator, normalize case and trim
return normalizePunctuation(transliterator.transform(value)).toLowerCase();

View File

@ -1,6 +1,8 @@
package net.sourceforge.filebot.web;
import java.io.Serializable;
import java.util.AbstractList;
import java.util.List;
public abstract class SearchResult implements Serializable {
@ -24,6 +26,21 @@ public abstract class SearchResult implements Serializable {
return aliasNames.clone();
}
/**
 * Get all names of this search result as a single list: the primary name at index 0, followed
 * by the alias names in their stored order.
 *
 * <p>
 * The returned list is a lightweight view that reads {@code name} and {@code aliasNames} on
 * every access instead of copying them. It does not override any mutating operation of
 * {@link AbstractList}.
 *
 * @return list view of the primary name plus all alias names
 */
public List<String> getNames() {
    return new AbstractList<String>() {

        @Override
        public int size() {
            // primary name + all aliases
            return aliasNames.length + 1;
        }

        @Override
        public String get(int index) {
            if (index == 0) {
                return name;
            }
            // aliases are shifted by one to make room for the primary name
            return aliasNames[index - 1];
        }
    };
}
@Override
public String toString() {
return name;

View File

@ -1,15 +1,15 @@
package net.sourceforge.filebot.web;
import static net.sourceforge.filebot.web.EpisodeUtilities.*;
import static net.sourceforge.filebot.web.WebRequest.*;
import static net.sourceforge.filebot.web.EpisodeUtilities.sortEpisodes;
import static net.sourceforge.filebot.web.WebRequest.createIgnoreCertificateSocketFactory;
import static net.sourceforge.filebot.web.WebRequest.getReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
@ -24,64 +24,55 @@ import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
public class SerienjunkiesClient extends AbstractEpisodeListProvider {
private final String host = "api.serienjunkies.de";
private final String apikey;
public SerienjunkiesClient(String apikey) {
this.apikey = apikey;
}
@Override
public String getName() {
return "Serienjunkies";
}
@Override
public Icon getIcon() {
return ResourceManager.getIcon("search.serienjunkies");
}
@Override
public Locale getDefaultLocale() {
return Locale.GERMAN;
}
@Override
public ResultCache getCache() {
return new ResultCache(host, Cache.getCache("web-datasource"));
}
/**
 * Search for series matching the given query.
 *
 * <p>
 * This override forwards straight to {@link #fetchSearchResult(String, Locale)} and returns
 * its result unchanged.
 *
 * @param query
 *            search text, passed on as is
 * @param locale
 *            passed on as is
 * @return whatever {@code fetchSearchResult} returns for the query
 * @throws Exception
 *             if {@code fetchSearchResult} fails
 */
@Override
public List<SearchResult> search(String query, final Locale locale) throws Exception {
// bypass automatic caching since search is based on locally cached data anyway
return fetchSearchResult(query, locale);
}
@Override
public List<SearchResult> fetchSearchResult(String query, Locale locale) throws Exception {
LocalSearch<SerienjunkiesSearchResult> index = new LocalSearch<SerienjunkiesSearchResult>(getSeriesTitles()) {
LocalSearch<SearchResult> index = new LocalSearch<SearchResult>(getSeriesTitles()) {
@Override
protected Set<String> getFields(SerienjunkiesSearchResult series) {
return set(series.getMainTitle(), series.getGermanTitle());
protected Set<String> getFields(SearchResult series) {
return set(series.getNames());
}
};
return new ArrayList<SearchResult>(index.search(query));
}
protected synchronized List<SerienjunkiesSearchResult> getSeriesTitles() throws IOException {
ResultCache cache = getCache();
@ -106,14 +97,22 @@ public class SerienjunkiesClient extends AbstractEpisodeListProvider {
String germanTitle = (String) obj.get("short_german");
Date startDate = Date.parse((String) obj.get("firstepisode"), "yyyy-MM-dd");
seriesList.add(new SerienjunkiesSearchResult(sid, link, mainTitle, germanTitle != null && !germanTitle.isEmpty() ? germanTitle : null, startDate));
Set<String> titleSet = new LinkedHashSet<String>(2);
for (String title : new String[] { germanTitle, mainTitle }) {
if (title != null && title.length() > 0) {
titleSet.add(title);
}
}
if (titleSet.size() > 0) {
List<String> titleList = new ArrayList<String>(titleSet);
seriesList.add(new SerienjunkiesSearchResult(sid, link, titleList.get(0), titleList.subList(1, titleList.size()).toArray(new String[0]), startDate));
}
}
// populate cache
return cache.putSearchResult(null, Locale.ROOT, seriesList);
}
@Override
public List<Episode> fetchEpisodeList(SearchResult searchResult, SortOrder sortOrder, Locale locale) throws IOException {
SerienjunkiesSearchResult series = (SerienjunkiesSearchResult) searchResult;
@ -121,7 +120,7 @@ public class SerienjunkiesClient extends AbstractEpisodeListProvider {
// fetch episode data
List<Episode> episodes = new ArrayList<Episode>(25);
String seriesName = locale.equals(Locale.GERMAN) && series.getGermanTitle() != null ? series.getGermanTitle() : series.getMainTitle();
String seriesName = series.getName();
JSONObject data = (JSONObject) request("/allepisodes.php?d=" + apikey + "&q=" + series.getSeriesId());
JSONArray list = (JSONArray) data.get("allepisodes");
@ -152,7 +151,6 @@ public class SerienjunkiesClient extends AbstractEpisodeListProvider {
return episodes;
}
protected Object request(String resource) throws IOException {
URL url = new URL("https", host, resource);
HttpsURLConnection connection = (HttpsURLConnection) url.openConnection();
@ -169,7 +167,6 @@ public class SerienjunkiesClient extends AbstractEpisodeListProvider {
}
}
@Override
public URI getEpisodeListLink(SearchResult searchResult) {
return URI.create(String.format("http://www.serienjunkies.de/%s/alle-serien-staffeln.html", ((SerienjunkiesSearchResult) searchResult).getLink()));

View File

@ -1,72 +1,43 @@
package net.sourceforge.filebot.web;
public class SerienjunkiesSearchResult extends SearchResult {
protected int sid;
protected String link;
protected String mainTitle;
protected String germanTitle;
protected Date startDate;
protected SerienjunkiesSearchResult() {
// used by serializer
}
public SerienjunkiesSearchResult(int sid, String link, String mainTitle, String germanTitle, Date startDate) {
public SerienjunkiesSearchResult(int sid, String link, String germanTitle, String[] otherTitles, Date startDate) {
super(germanTitle, otherTitles);
this.sid = sid;
this.link = link;
this.mainTitle = mainTitle;
this.germanTitle = germanTitle;
this.startDate = startDate;
}
public int getId() {
return sid;
}
@Override
public String getName() {
return germanTitle != null ? germanTitle : mainTitle; // prefer German title
}
public int getSeriesId() {
return sid;
}
public String getLink() {
return link;
}
public String getMainTitle() {
return mainTitle;
}
public String getGermanTitle() {
return germanTitle;
}
public Date getStartDate() {
return startDate;
}
@Override
public int hashCode() {
return sid;
}
@Override
public boolean equals(Object object) {
if (object instanceof SerienjunkiesSearchResult) {

View File

@ -21,6 +21,7 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
@ -160,6 +161,18 @@ public final class FileUtilities {
}
}
/**
 * Read line-oriented, separator-delimited text from the given stream and split each line into
 * its columns.
 *
 * <p>
 * The stream is consumed until the end and is closed before this method returns, even if
 * reading fails with a runtime exception. Callers must not use {@code source} afterwards.
 *
 * <p>
 * Columns are split via {@link Pattern#split(CharSequence)}, so trailing empty columns of a
 * line are dropped and rows may therefore differ in length.
 *
 * @param source
 *            stream to read from; closed by this method
 * @param charsetName
 *            name of the charset used to decode the stream (e.g. {@code "UTF-8"})
 * @param separatorPattern
 *            regular expression that separates the columns of a line (e.g. {@code "\t"})
 * @return one {@code String[]} per line, in the order the lines were read
 */
public static List<String[]> readCSV(InputStream source, String charsetName, String separatorPattern) {
    Scanner scanner = new Scanner(source, charsetName);
    try {
        Pattern separator = Pattern.compile(separatorPattern);
        List<String[]> rows = new ArrayList<String[]>(65536);

        while (scanner.hasNextLine()) {
            rows.add(separator.split(scanner.nextLine()));
        }
        return rows;
    } finally {
        // closing the scanner also closes the underlying stream, which would otherwise leak
        scanner.close();
    }
}
public static Reader createTextReader(File file) throws IOException {
CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding