* Transliterate any text to ASCII before running any kind of string similarity

This commit is contained in:
Reinhard Pointner 2012-12-02 13:41:06 +00:00
parent d3347d19d9
commit 9d1f33ae76
4 changed files with 49 additions and 2 deletions

View File

@ -21,6 +21,8 @@ import net.sourceforge.filebot.web.Episode;
import net.sourceforge.filebot.web.EpisodeFormat;
import net.sourceforge.filebot.web.Movie;
import com.ibm.icu.text.Transliterator;
public enum EpisodeMetrics implements SimilarityMetric {
@ -351,6 +353,7 @@ public enum EpisodeMetrics implements SimilarityMetric {
}
private static final Map<Object, String> transformCache = synchronizedMap(new HashMap<Object, String>(64, 4));
private static final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
protected static String normalizeObject(Object object) {
@ -375,6 +378,10 @@ public enum EpisodeMetrics implements SimilarityMetric {
// remove checksums, any [...] or (...)
name = removeEmbeddedChecksum(name);
synchronized (transliterator) {
name = transliterator.transform(name);
}
// remove/normalize special characters
name = normalizePunctuation(name);

View File

@ -7,15 +7,24 @@ import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
import com.ibm.icu.text.Transliterator;
public class NameSimilarityMetric implements SimilarityMetric {
private final AbstractStringMetric metric;
private final Transliterator transliterator;
/**
 * Creates a name similarity metric with the default configuration: a
 * {@code QGramsDistance} string metric over a {@code TokeniserQGram3}
 * tokenizer, plus a transliterator built from the rule chain
 * {@code "Any-Latin;Latin-ASCII;[:Diacritic:]remove"}.
 */
public NameSimilarityMetric() {
// QGramsDistance with a QGram tokenizer seems to work best for similarity of names
// NOTE(review): this listing shows both a direct assignment to metric and a
// this(...) delegation; presumably the assignment is the pre-change line that
// the delegation replaces -- confirm against the actual source file.
metric = new QGramsDistance(new TokeniserQGram3());
this(new QGramsDistance(new TokeniserQGram3()), Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"));
}
/**
 * Creates a name similarity metric from explicitly supplied collaborators.
 *
 * @param metric
 *            the string metric stored in this instance's {@code metric} field
 * @param transliterator
 *            the transliterator stored in this instance's {@code transliterator}
 *            field; code elsewhere in this class only applies it when it is
 *            non-null, so {@code null} appears to disable transliteration
 */
public NameSimilarityMetric(AbstractStringMetric metric, Transliterator transliterator) {
this.metric = metric;
this.transliterator = transliterator;
}
@ -29,6 +38,11 @@ public class NameSimilarityMetric implements SimilarityMetric {
// use string representation
String name = object.toString();
// apply transliterator
if (transliterator != null) {
name = transliterator.transform(name);
}
// normalize separators
name = normalizePunctuation(name);

View File

@ -23,6 +23,8 @@ import java.util.concurrent.Future;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import com.ibm.icu.text.Transliterator;
public class LocalSearch<T> {
@ -30,6 +32,8 @@ public class LocalSearch<T> {
private float resultMinimumSimilarity = 0.5f;
private int resultSetSize = 20;
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
private final List<T> objects;
private final List<Set<String>> fields;
@ -138,7 +142,7 @@ public class LocalSearch<T> {
/**
 * Normalizes a value by passing it through {@code normalizePunctuation}
 * and lower-casing the result.
 *
 * @param value
 *            the text to normalize
 * @return the normalized, lower-cased text
 */
protected String normalize(String value) {
// normalize separator, normalize case and trim
// (any trimming would have to happen inside normalizePunctuation; it is not visible here)
// NOTE(review): two return statements are listed back to back; presumably the
// first is the pre-change line and the second, which additionally runs the value
// through transliterator.transform(...) first, is its replacement -- confirm
// against the actual source file.
return normalizePunctuation(value).toLowerCase();
return normalizePunctuation(transliterator.transform(value)).toLowerCase();
}
}

View File

@ -48,6 +48,28 @@ public class IMDbClientTest {
}
/**
 * Searches for "Heat" and checks that the first result is the movie named
 * "Heat" from 1995 with IMDb id 113277.
 */
@Test
public void searchMovie4() throws Exception {
// second argument is passed as null; its meaning is not visible here
List<Movie> results = imdb.searchMovie("Heat", null);
// only the top-ranked result is verified
Movie movie = results.get(0);
assertEquals("Heat", movie.getName());
assertEquals(1995, movie.getYear());
// trailing 0 is presumably the allowed delta for the numeric comparison -- confirm
assertEquals(113277, movie.getImdbId(), 0);
}
/**
 * Searches for the title "Det sjunde inseglet" and checks that the first
 * result is reported under the name "The Seventh Seal", from 1957, with
 * IMDb id 50976.
 */
@Test
public void searchMovie5() throws Exception {
// second argument is passed as null; its meaning is not visible here
List<Movie> results = imdb.searchMovie("Det sjunde inseglet", null);
// only the top-ranked result is verified
Movie movie = results.get(0);
// the expected name differs from the query string
assertEquals("The Seventh Seal", movie.getName());
assertEquals(1957, movie.getYear());
// trailing 0 is presumably the allowed delta for the numeric comparison -- confirm
assertEquals(50976, movie.getImdbId(), 0);
}
@Test
public void searchMovieRedirect() throws Exception {
List<Movie> results = imdb.searchMovie("(500) Days of Summer (2009)", null);