* Transliterate any text to ASCII before running any kind of string similarity
This commit is contained in:
parent
d3347d19d9
commit
9d1f33ae76
|
@ -21,6 +21,8 @@ import net.sourceforge.filebot.web.Episode;
|
|||
import net.sourceforge.filebot.web.EpisodeFormat;
|
||||
import net.sourceforge.filebot.web.Movie;
|
||||
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
|
||||
public enum EpisodeMetrics implements SimilarityMetric {
|
||||
|
||||
|
@ -351,6 +353,7 @@ public enum EpisodeMetrics implements SimilarityMetric {
|
|||
}
|
||||
|
||||
private static final Map<Object, String> transformCache = synchronizedMap(new HashMap<Object, String>(64, 4));
|
||||
private static final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
||||
|
||||
|
||||
protected static String normalizeObject(Object object) {
|
||||
|
@ -375,6 +378,10 @@ public enum EpisodeMetrics implements SimilarityMetric {
|
|||
// remove checksums, any [...] or (...)
|
||||
name = removeEmbeddedChecksum(name);
|
||||
|
||||
synchronized (transliterator) {
|
||||
name = transliterator.transform(name);
|
||||
}
|
||||
|
||||
// remove/normalize special characters
|
||||
name = normalizePunctuation(name);
|
||||
|
||||
|
|
|
@ -7,15 +7,24 @@ import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
|||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
||||
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
|
||||
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
|
||||
public class NameSimilarityMetric implements SimilarityMetric {
|
||||
|
||||
private final AbstractStringMetric metric;
|
||||
private final Transliterator transliterator;
|
||||
|
||||
|
||||
/**
 * Creates a metric with default settings by delegating to the two-argument constructor with a
 * {@code QGramsDistance} over a {@code TokeniserQGram3} and a transliterator built from the rule
 * chain {@code "Any-Latin;Latin-ASCII;[:Diacritic:]remove"}.
 */
public NameSimilarityMetric() {
|
||||
// QGramsDistance with a QGram tokenizer seems to work best for similarity of names
|
||||
// NOTE(review): this listing is a rendered diff without +/- markers; the direct assignment below
// is presumably the pre-change line that the this(...) call replaces -- confirm against the repository
metric = new QGramsDistance(new TokeniserQGram3());
|
||||
this(new QGramsDistance(new TokeniserQGram3()), Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Creates a metric from an explicit string metric and transliterator.
 *
 * @param metric the string metric stored in this instance
 * @param transliterator applied to a name before punctuation is normalized; may be {@code null},
 *        in which case the transliteration step is skipped
 */
public NameSimilarityMetric(AbstractStringMetric metric, Transliterator transliterator) {
|
||||
this.metric = metric;
|
||||
this.transliterator = transliterator;
|
||||
}
|
||||
|
||||
|
||||
|
@ -29,6 +38,11 @@ public class NameSimilarityMetric implements SimilarityMetric {
|
|||
// use string representation
|
||||
String name = object.toString();
|
||||
|
||||
// apply transliterator
|
||||
if (transliterator != null) {
|
||||
name = transliterator.transform(name);
|
||||
}
|
||||
|
||||
// normalize separators
|
||||
name = normalizePunctuation(name);
|
||||
|
||||
|
|
|
@ -23,6 +23,8 @@ import java.util.concurrent.Future;
|
|||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
||||
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
|
||||
public class LocalSearch<T> {
|
||||
|
||||
|
@ -30,6 +32,8 @@ public class LocalSearch<T> {
|
|||
private float resultMinimumSimilarity = 0.5f;
|
||||
private int resultSetSize = 20;
|
||||
|
||||
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
||||
|
||||
private final List<T> objects;
|
||||
private final List<Set<String>> fields;
|
||||
|
||||
|
@ -138,7 +142,7 @@ public class LocalSearch<T> {
|
|||
|
||||
protected String normalize(String value) {
|
||||
// normalize separator, normalize case and trim
|
||||
return normalizePunctuation(value).toLowerCase();
|
||||
return normalizePunctuation(transliterator.transform(value)).toLowerCase();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -48,6 +48,28 @@ public class IMDbClientTest {
|
|||
}
|
||||
|
||||
|
||||
/**
 * Searches for "Heat" and checks that the first result is named "Heat", is from 1995 and has
 * the IMDb id 113277.
 */
@Test
|
||||
public void searchMovie4() throws Exception {
|
||||
List<Movie> results = imdb.searchMovie("Heat", null);
|
||||
// only the top-ranked result is checked
Movie movie = results.get(0);
|
||||
|
||||
assertEquals("Heat", movie.getName());
|
||||
assertEquals(1995, movie.getYear());
|
||||
// presumably the (expected, actual, delta) overload, so the trailing 0 demands an exact match -- confirm
assertEquals(113277, movie.getImdbId(), 0);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Searches for the title "Det sjunde inseglet" and checks that the first result is named
 * "The Seventh Seal", is from 1957 and has the IMDb id 50976.
 */
@Test
|
||||
public void searchMovie5() throws Exception {
|
||||
List<Movie> results = imdb.searchMovie("Det sjunde inseglet", null);
|
||||
// only the top-ranked result is checked
Movie movie = results.get(0);
|
||||
|
||||
// the query and the expected name differ, so the search has to resolve one title to the other
assertEquals("The Seventh Seal", movie.getName());
|
||||
assertEquals(1957, movie.getYear());
|
||||
// presumably the (expected, actual, delta) overload, so the trailing 0 demands an exact match -- confirm
assertEquals(50976, movie.getImdbId(), 0);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void searchMovieRedirect() throws Exception {
|
||||
List<Movie> results = imdb.searchMovie("(500) Days of Summer (2009)", null);
|
||||
|
|
Loading…
Reference in New Issue