* fine-tune anime matching

This commit is contained in:
Reinhard Pointner 2013-12-27 22:49:56 +00:00
parent 2232576c1d
commit 3a1eada102
8 changed files with 239 additions and 243 deletions

View File

@ -228,7 +228,7 @@ if (thetvdb_txt.size() < 30000) { throw new Exception('TheTVDB index sanity fail
// BUILD anidb index
def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles()
def anidb = new net.sourceforge.filebot.web.AnidbClient('filebot', 4).getAnimeTitles()
def anidb_index = anidb.findResults{
def row = []
@ -243,4 +243,4 @@ pack(anidb_out, anidb_txt)
println "AniDB Index: " + anidb_txt.size()
// sanity check
if (anidb_txt.size() < 5000) { throw new Exception('AniDB index sanity failed') }
if (anidb_txt.size() < 8000) { throw new Exception('AniDB index sanity failed') }

View File

@ -106,7 +106,7 @@ public class CmdlineOperations implements CmdlineInterface {
int sxe = 0; // SxE
int cws = 0; // common word sequence
SeriesNameMatcher nameMatcher = new SeriesNameMatcher(locale);
SeriesNameMatcher nameMatcher = new SeriesNameMatcher(locale, true);
Collection<String> cwsList = emptySet();
if (max >= 5) {
cwsList = nameMatcher.matchAll(mediaFiles.toArray(new File[0]));

View File

@ -360,7 +360,7 @@ public class MediaDetection {
Collection<String> matches = new LinkedHashSet<String>();
// check CWS matches
SeriesNameMatcher snm = new SeriesNameMatcher(locale);
SeriesNameMatcher snm = new SeriesNameMatcher(locale, true);
matches.addAll(snm.matchAll(files.toArray(new File[files.size()])));
// check for known pattern matches

View File

@ -300,64 +300,72 @@ public enum EpisodeMetrics implements SimilarityMetric {
SeriesName(new NameSimilarityMetric() {
private ReleaseInfo releaseInfo = new ReleaseInfo();
private SeriesNameMatcher seriesNameMatcher = new SeriesNameMatcher();
private SeriesNameMatcher seriesNameMatcher = new SeriesNameMatcher(Locale.ROOT, false);
@Override
public float getSimilarity(Object o1, Object o2) {
float lowerBound = super.getSimilarity(normalize(o1, true), normalize(o2, true));
float upperBound = super.getSimilarity(normalize(o1, false), normalize(o2, false));
String[] f1 = getNormalizedEffectiveIdentifiers(o1);
String[] f2 = getNormalizedEffectiveIdentifiers(o2);
return (float) (floor(max(lowerBound, upperBound) * 4) / 4);
};
// match all fields and average similarity
float max = 0;
for (String s1 : f1) {
for (String s2 : f2) {
max = max(super.getSimilarity(s1, s2), max);
}
}
// normalize absolute similarity to similarity rank (4 ranks in total),
// so we are less likely to fall for false positives in this pass, and move on to the next one
return (float) (floor(max * 4) / 4);
}
@Override
protected String normalize(Object object) {
return object.toString();
};
}
/**
 * Resolves the effective identifiers of the given object and normalizes each one.
 *
 * @param object
 *            the value passed on to {@code getEffectiveIdentifiers}
 * @return one normalized name per effective identifier, in the same order
 */
protected String[] getNormalizedEffectiveIdentifiers(Object object) {
    List<?> effective = getEffectiveIdentifiers(object);
    String[] normalized = new String[effective.size()];

    // walk the list once and fill the array slot by slot
    int slot = 0;
    for (Object identifier : effective) {
        normalized[slot++] = normalizeObject(identifier);
    }
    return normalized;
}
protected List<?> getEffectiveIdentifiers(Object object) {
List<String> names = null;
protected String normalize(Object object, boolean strict) {
if (object instanceof Episode) {
if (strict) {
object = ((Episode) object).getSeriesName(); // focus on series name
} else {
object = removeTrailingBrackets(((Episode) object).getSeriesName()); // focus on series name (without US/UK 1967/2005 differentiation)
}
names = ((Episode) object).getSeries().getEffectiveNames();
} else if (object instanceof File) {
object = ((File) object).getName(); // try to narrow down on series name
try {
object = resolveSeriesDirectMapping((String) object);
} catch (IOException e) {
Logger.getLogger(EpisodeMetrics.class.getName()).log(Level.WARNING, e.getMessage());
names = new ArrayList<String>(3);
for (File f : listPathTail((File) object, 3, true)) {
String fn = getName(f);
String sn = seriesNameMatcher.matchByEpisodeIdentifier(fn);
if (sn != null) {
names.add(sn);
} else {
names.add(fn);
}
String snm = seriesNameMatcher.matchByEpisodeIdentifier((String) object);
if (snm != null) {
object = snm;
}
}
// equally strip away any potential clutter
if (names != null) {
try {
object = releaseInfo.cleanRelease(singleton(object.toString()), strict).iterator().next();
return releaseInfo.cleanRelease(names, false);
} catch (NoSuchElementException e) {
// keep default value in case all tokens are stripped away
} catch (IOException e) {
Logger.getLogger(EpisodeMetrics.class.getName()).log(Level.WARNING, e.getMessage());
}
}
// simplify file name, if possible
return normalizeObject(object);
}
protected String resolveSeriesDirectMapping(String input) throws IOException {
for (Pattern it : releaseInfo.getSeriesDirectMappings().keySet()) {
Matcher m = it.matcher(input);
if (m.find()) {
return m.replaceAll(releaseInfo.getSeriesDirectMappings().get(it));
}
}
return input;
return emptyList();
}
}),

View File

@ -1,7 +1,5 @@
package net.sourceforge.filebot.similarity;
import static java.util.Collections.*;
import static java.util.regex.Pattern.*;
import static net.sourceforge.filebot.similarity.CommonSequenceMatcher.*;
@ -29,23 +27,24 @@ import java.util.regex.Pattern;
import net.sourceforge.filebot.similarity.SeasonEpisodeMatcher.SxE;
import net.sourceforge.tuned.FileUtilities;
public class SeriesNameMatcher {
protected SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(SeasonEpisodeMatcher.DEFAULT_SANITY, true);
protected DateMatcher dateMatcher = new DateMatcher();
protected SeasonEpisodeMatcher seasonEpisodeMatcher;
protected DateMatcher dateMatcher;
protected NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric();
protected NameSimilarityMetric nameSimilarityMetric;
protected CommonSequenceMatcher commonSequenceMatcher;
public SeriesNameMatcher() {
this(Locale.ROOT);
this(Locale.ROOT, true);
}
public SeriesNameMatcher(Locale locale, boolean strict) {
seasonEpisodeMatcher = new SeasonEpisodeMatcher(SeasonEpisodeMatcher.DEFAULT_SANITY, strict);
dateMatcher = new DateMatcher();
nameSimilarityMetric = new NameSimilarityMetric();
public SeriesNameMatcher(Locale locale) {
commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3, true) {
@Override
@ -55,7 +54,6 @@ public class SeriesNameMatcher {
};
}
public Collection<String> matchAll(File[] files) {
SeriesNameCollection seriesNames = new SeriesNameCollection();
@ -76,7 +74,6 @@ public class SeriesNameMatcher {
return seriesNames;
}
public Collection<String> matchAll(String[] names) {
SeriesNameCollection seriesNames = new SeriesNameCollection();
@ -110,13 +107,12 @@ public class SeriesNameMatcher {
return seriesNames;
}
/**
* Try to match and verify all series names using known season episode patterns.
*
* @param names episode names
* @return series names that have been matched one or multiple times depending on the
* threshold
* @param names
* episode names
* @return series names that have been matched one or multiple times depending on the threshold
*/
private Collection<String> flatMatchAll(String[] names, Pattern prefixPattern, int threshold, boolean strict) {
@SuppressWarnings("unchecked")
@ -155,11 +151,11 @@ public class SeriesNameMatcher {
return thresholdCollection;
}
/**
* Try to match all common word sequences in the given list.
*
* @param names list of episode names
* @param names
* list of episode names
* @return all common word sequences that have been found
*/
private Collection<String> deepMatchAll(String[] names, int threshold) {
@ -185,14 +181,12 @@ public class SeriesNameMatcher {
return results;
}
/**
* Try to match a series name from the given episode name using known season episode
* patterns.
* Try to match a series name from the given episode name using known season episode patterns.
*
* @param name episode name
* @return a substring of the given name that ends before the first occurrence of a season
* episode pattern, or null if there is no such pattern
* @param name
* episode name
* @return a substring of the given name that ends before the first occurrence of a season episode pattern, or null if there is no such pattern
*/
public String matchByEpisodeIdentifier(String name) {
int seasonEpisodePosition = seasonEpisodeMatcher.find(name, 0);
@ -210,13 +204,14 @@ public class SeriesNameMatcher {
return null;
}
/**
* Try to match a series name from the first common word sequence.
*
* @param names various episode names (at least two)
* @param names
* various episode names (at least two)
* @return a word sequence all episode names have in common, or null
* @throws IllegalArgumentException if less than 2 episode names are given
* @throws IllegalArgumentException
* if less than 2 episode names are given
*/
public String matchByFirstCommonWordSequence(String... names) {
if (names.length < 2) {
@ -226,7 +221,6 @@ public class SeriesNameMatcher {
return commonSequenceMatcher.matchFirstCommonSequence(names);
}
protected String normalize(String name) {
// remove group names and checksums, any [...] or (...)
name = normalizeBrackets(name);
@ -237,7 +231,6 @@ public class SeriesNameMatcher {
return name;
}
protected <T> T[] firstCommonSequence(T[] seq1, T[] seq2, int maxStartIndex, Comparator<T> equalsComparator) {
for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
@ -263,7 +256,6 @@ public class SeriesNameMatcher {
return null;
}
private Map<File, String[]> mapNamesByFolder(File... files) {
Map<File, List<File>> filesByFolder = new LinkedHashMap<File, List<File>>();
@ -290,7 +282,6 @@ public class SeriesNameMatcher {
return namesByFolder;
}
protected String[] names(Collection<File> files) {
String[] names = new String[files.size()];
@ -304,12 +295,10 @@ public class SeriesNameMatcher {
return names;
}
protected static class SeriesNameCollection extends AbstractCollection<String> {
private final Map<String, String> data = new LinkedHashMap<String, String>();
@Override
public boolean add(String value) {
value = value.trim();
@ -330,12 +319,10 @@ public class SeriesNameMatcher {
return false;
}
/**
 * Derives the case-insensitive lookup key for a value.
 *
 * The key is lower-cased with {@link Locale#ROOT} so that it does not depend on the
 * default locale of the JVM (e.g. under a Turkish locale "I" would otherwise become a dotless "ı").
 *
 * @param value
 *            the value to derive the key from, must not be null
 * @return the string form of the value in lower case
 */
protected String key(Object value) {
    return value.toString().toLowerCase(Locale.ROOT);
}
protected float firstCharacterCaseBalance(String s) {
int upper = 0;
int lower = 0;
@ -355,19 +342,16 @@ public class SeriesNameMatcher {
return (lower + (upper * 1.01f)) / Math.abs(lower - upper);
}
/**
 * Checks whether a value is present, comparing by the key derived via {@code key(Object)}.
 *
 * @param value
 *            the value to look up
 * @return true if the backing map holds an entry for the derived key
 */
@Override
public boolean contains(Object value) {
    String lookupKey = key(value);
    return data.containsKey(lookupKey);
}
/**
 * Iterates over the values held in the backing map.
 *
 * @return an iterator over the values of the backing map
 */
@Override
public Iterator<String> iterator() {
    Collection<String> storedNames = data.values();
    return storedNames.iterator();
}
@Override
public int size() {
return data.size();
@ -375,7 +359,6 @@ public class SeriesNameMatcher {
}
protected static class ThresholdCollection<E> extends AbstractCollection<E> {
private final Collection<E> heaven;
@ -383,14 +366,12 @@ public class SeriesNameMatcher {
private final int threshold;
/**
 * Creates an empty collection.
 *
 * @param threshold
 *            the threshold value stored for this collection
 * @param equalityComparator
 *            the comparator that orders the keys of the {@code limbo} map
 */
public ThresholdCollection(int threshold, Comparator<E> equalityComparator) {
    this.threshold = threshold;

    // limbo is keyed via the given comparator; heaven is a plain list
    this.limbo = new TreeMap<E, Collection<E>>(equalityComparator);
    this.heaven = new ArrayList<E>();
}
@Override
public boolean add(E value) {
Collection<E> buffer = limbo.get(value);
@ -422,18 +403,15 @@ public class SeriesNameMatcher {
return false;
};
/**
 * Adds an element straight to the {@code heaven} collection.
 *
 * @param element
 *            the element to add
 * @return the result of adding the element to {@code heaven}
 */
public boolean addDirect(E element) {
    boolean changed = heaven.add(element);
    return changed;
}
/**
 * Iterates over the elements of the {@code heaven} collection.
 *
 * @return an iterator over {@code heaven}
 */
@Override
public Iterator<E> iterator() {
    Iterator<E> elements = heaven.iterator();
    return elements;
}
@Override
public int size() {
return heaven.size();

View File

@ -8,6 +8,8 @@ import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@ -27,6 +29,7 @@ import javax.swing.Icon;
import net.sourceforge.filebot.Cache;
import net.sourceforge.filebot.ResourceManager;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
@ -165,13 +168,15 @@ public class AnidbClient extends AbstractEpisodeListProvider {
// type: 1=primary title (one per anime), 2=synonyms (multiple per anime), 3=shorttitles (multiple per anime), 4=official title (one per language)
Pattern pattern = Pattern.compile("^(?!#)(\\d+)[|](\\d)[|]([\\w-]+)[|](.+)$");
Map<Integer, String> primaryTitleMap = new HashMap<Integer, String>();
Map<Integer, Map<String, String>> officialTitleMap = new HashMap<Integer, Map<String, String>>();
Map<Integer, Map<String, String>> synonymsTitleMap = new HashMap<Integer, Map<String, String>>();
List<String> languageOrder = new ArrayList<String>();
languageOrder.add("x-jat");
languageOrder.add("en");
languageOrder.add("ja");
// fetch data
Scanner scanner = new Scanner(new GZIPInputStream(url.openStream()), "UTF-8");
Map<Integer, List<Object[]>> entriesByAnime = new HashMap<Integer, List<Object[]>>(65536);
Scanner scanner = new Scanner(new GZIPInputStream(url.openStream()), "UTF-8");
try {
while (scanner.hasNextLine()) {
Matcher matcher = pattern.matcher(scanner.nextLine());
@ -182,17 +187,17 @@ public class AnidbClient extends AbstractEpisodeListProvider {
String language = matcher.group(3);
String title = matcher.group(4);
if (type.equals("1")) {
primaryTitleMap.put(aid, title);
} else if (type.equals("2") || type.equals("4")) {
Map<Integer, Map<String, String>> titleMap = (type.equals("4") ? officialTitleMap : synonymsTitleMap);
Map<String, String> languageTitleMap = titleMap.get(aid);
if (languageTitleMap == null) {
languageTitleMap = new HashMap<String, String>();
titleMap.put(aid, languageTitleMap);
if (aid > 0 && title.length() > 0 && languageOrder.contains(language)) {
List<Object[]> names = entriesByAnime.get(aid);
if (names == null) {
names = new ArrayList<Object[]>();
entriesByAnime.put(aid, names);
}
languageTitleMap.put(language, title);
// resolve HTML entities
title = Jsoup.parse(title).text();
names.add(new Object[] { Integer.parseInt(type), languageOrder.indexOf(language), title });
}
}
}
@ -201,23 +206,36 @@ public class AnidbClient extends AbstractEpisodeListProvider {
}
// build up a list of all possible AniDB search results
anime = new ArrayList<AnidbSearchResult>(primaryTitleMap.size());
anime = new ArrayList<AnidbSearchResult>(entriesByAnime.size());
for (Entry<Integer, String> entry : primaryTitleMap.entrySet()) {
Map<String, String> localizedTitles = new HashMap<String, String>();
if (synonymsTitleMap.containsKey(entry.getKey())) {
localizedTitles.putAll(synonymsTitleMap.get(entry.getKey())); // use synonym as fallback
for (Entry<Integer, List<Object[]>> entry : entriesByAnime.entrySet()) {
int aid = entry.getKey();
List<Object[]> triples = entry.getValue();
Collections.sort(triples, new Comparator<Object[]>() {
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public int compare(Object[] a, Object[] b) {
for (int i = 0; i < a.length; i++) {
if (!a[i].equals(b[i]))
return ((Comparable) a[i]).compareTo(b[i]);
}
if (officialTitleMap.containsKey(entry.getKey())) {
localizedTitles.putAll(officialTitleMap.get(entry.getKey())); // primarily use official title if available
return 0;
}
});
List<String> names = new ArrayList<String>(triples.size());
for (Object[] it : triples) {
names.add((String) it[2]);
}
String englishTitle = localizedTitles.get("en"); // ONLY SUPPORT ENGLISH LOCALIZATION
anime.add(new AnidbSearchResult(entry.getKey(), entry.getValue(), englishTitle == null || englishTitle.isEmpty() ? new String[] {} : new String[] { englishTitle }));
String primaryTitle = names.get(0);
String[] aliasNames = names.subList(1, names.size()).toArray(new String[0]);
anime.add(new AnidbSearchResult(aid, primaryTitle, aliasNames));
}
// populate cache
return cache.putSearchResult(null, Locale.ROOT, anime);
}
}

View File

@ -8,8 +8,8 @@ public class AnidbSearchResult extends SearchResult {
// used by serializer
}
public AnidbSearchResult(int aid, String primaryTitle, String[] localizedTitles) {
super(primaryTitle, localizedTitles);
public AnidbSearchResult(int aid, String primaryTitle, String[] aliasNames) {
super(primaryTitle, aliasNames);
this.aid = aid;
}

View File

@ -1,7 +1,5 @@
package net.sourceforge.filebot.web;
import static org.junit.Assert.*;
import java.util.List;
@ -13,7 +11,6 @@ import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
public class AnidbClientTest {
/**
@ -31,7 +28,6 @@ public class AnidbClientTest {
*/
private static AnidbSearchResult princessTutuSearchResult;
@BeforeClass
public static void setUpBeforeClass() throws Exception {
monsterSearchResult = new AnidbSearchResult(1539, "Monster", null);
@ -41,6 +37,11 @@ public class AnidbClientTest {
private AnidbClient anidb = new AnidbClient("filebot", 4);
/**
 * Fetches the anime title list and checks that it holds more than 8000 entries.
 */
@Test
public void getAnimeTitles() throws Exception {
    int titleCount = anidb.getAnimeTitles().size();
    assertTrue(titleCount > 8000);
}
@Test
public void search() throws Exception {
@ -51,7 +52,6 @@ public class AnidbClientTest {
assertEquals(69, result.getAnimeId());
}
@Test
public void searchNoMatch() throws Exception {
List<SearchResult> results = anidb.search("i will not find anything for this query string");
@ -59,7 +59,6 @@ public class AnidbClientTest {
assertTrue(results.isEmpty());
}
@Test
public void searchTitleAlias() throws Exception {
// Seikai no Senki (main title), Banner of the Stars (official English title)
@ -70,7 +69,6 @@ public class AnidbClientTest {
assertEquals("Naruto", anidb.search("naruto").get(0).getName());
}
@Test
public void getEpisodeListAll() throws Exception {
List<Episode> list = anidb.getEpisodeList(monsterSearchResult);
@ -88,7 +86,6 @@ public class AnidbClientTest {
assertEquals("2004-04-07", first.getAirdate().toString());
}
@Test
public void getEpisodeListAllShortLink() throws Exception {
List<Episode> list = anidb.getEpisodeList(twelvekingdomsSearchResult);
@ -106,13 +103,11 @@ public class AnidbClientTest {
assertEquals("2002-04-09", first.getAirdate().toString());
}
/**
 * Checks that the episode at index 6 of the list fetched for {@code princessTutuSearchResult}
 * has the expected title, including the non-ASCII character in "schönen".
 */
@Test
public void getEpisodeListEncoding() throws Exception {
    List<Episode> episodes = anidb.getEpisodeList(princessTutuSearchResult);
    Episode episode = episodes.get(6);
    assertEquals("Raven Princess - An der schönen blauen Donau", episode.getTitle());
}
@Test
public void getEpisodeListI18N() throws Exception {
List<Episode> list = anidb.getEpisodeList(monsterSearchResult, SortOrder.Airdate, Locale.JAPANESE);
@ -127,19 +122,16 @@ public class AnidbClientTest {
assertEquals("2005-09-28", last.getAirdate().toString());
}
/**
 * Checks the title of the episode at index 44 of the list fetched for {@code twelvekingdomsSearchResult}.
 *
 * NOTE(review): the method name suggests a recap marker is trimmed from the title; the trimming itself is not visible here.
 */
@Test
public void getEpisodeListTrimRecap() throws Exception {
    List<Episode> episodes = anidb.getEpisodeList(twelvekingdomsSearchResult);
    Episode episode = episodes.get(44);
    assertEquals("Sea God of the East, Azure Sea of the West - Transition Chapter", episode.getTitle());
}
/**
 * Checks that the episode list link for {@code monsterSearchResult} converts to the expected URL string.
 */
@Test
public void getEpisodeListLink() throws Exception {
    String link = anidb.getEpisodeListLink(monsterSearchResult).toURL().toString();
    assertEquals("http://anidb.net/a1539", link);
}
@BeforeClass
@AfterClass
public static void clearCache() {