From 9d1f33ae7678d8b7f29a3d9c0e253c9681771853 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Sun, 2 Dec 2012 13:41:06 +0000 Subject: [PATCH] * Transliterate any text to ASCII before running any kind of string similarity --- .../filebot/similarity/EpisodeMetrics.java | 7 ++++++ .../similarity/NameSimilarityMetric.java | 16 +++++++++++++- .../sourceforge/filebot/web/LocalSearch.java | 6 ++++- .../filebot/web/IMDbClientTest.java | 22 +++++++++++++++++++ 4 files changed, 49 insertions(+), 2 deletions(-) diff --git a/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java b/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java index 1e9f03a5..c244c659 100644 --- a/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java +++ b/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java @@ -21,6 +21,8 @@ import net.sourceforge.filebot.web.Episode; import net.sourceforge.filebot.web.EpisodeFormat; import net.sourceforge.filebot.web.Movie; +import com.ibm.icu.text.Transliterator; + public enum EpisodeMetrics implements SimilarityMetric { @@ -351,6 +353,7 @@ public enum EpisodeMetrics implements SimilarityMetric { } private static final Map transformCache = synchronizedMap(new HashMap(64, 4)); + private static final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"); protected static String normalizeObject(Object object) { @@ -375,6 +378,10 @@ public enum EpisodeMetrics implements SimilarityMetric { // remove checksums, any [...] or (...) name = removeEmbeddedChecksum(name); + synchronized (transliterator) { + name = transliterator.transform(name); + } + // remove/normalize special characters name = normalizePunctuation(name); diff --git a/source/net/sourceforge/filebot/similarity/NameSimilarityMetric.java b/source/net/sourceforge/filebot/similarity/NameSimilarityMetric.java index 5346e71d..b85a8312 100644 --- a/source/net/sourceforge/filebot/similarity/NameSimilarityMetric.java +++ b/source/net/sourceforge/filebot/similarity/NameSimilarityMetric.java @@ -7,15 +7,24 @@ import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3; +import com.ibm.icu.text.Transliterator; + public class NameSimilarityMetric implements SimilarityMetric { private final AbstractStringMetric metric; + private final Transliterator transliterator; public NameSimilarityMetric() { // QGramsDistance with a QGram tokenizer seems to work best for similarity of names - metric = new QGramsDistance(new TokeniserQGram3()); + this(new QGramsDistance(new TokeniserQGram3()), Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove")); + } + + + public NameSimilarityMetric(AbstractStringMetric metric, Transliterator transliterator) { + this.metric = metric; + this.transliterator = transliterator; } @@ -29,6 +38,11 @@ public class NameSimilarityMetric implements SimilarityMetric { // use string representation String name = object.toString(); + // apply transliterator + if (transliterator != null) { + name = transliterator.transform(name); + } + // normalize separators name = normalizePunctuation(name); diff --git a/source/net/sourceforge/filebot/web/LocalSearch.java b/source/net/sourceforge/filebot/web/LocalSearch.java index f9b7c492..3239b09f 100644 --- a/source/net/sourceforge/filebot/web/LocalSearch.java +++ b/source/net/sourceforge/filebot/web/LocalSearch.java @@ -23,6 +23,8 @@ import java.util.concurrent.Future; import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; +import com.ibm.icu.text.Transliterator; + public class LocalSearch { @@ -30,6 +32,8 @@ public class LocalSearch { private float resultMinimumSimilarity = 0.5f; private int resultSetSize = 20; + private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"); + private final List objects; private final List> fields; @@ -138,7 +142,7 @@ public class LocalSearch { protected String normalize(String value) { // normalize separator, normalize case and trim - return normalizePunctuation(value).toLowerCase(); + return normalizePunctuation(transliterator.transform(value)).toLowerCase(); } } diff --git a/test/net/sourceforge/filebot/web/IMDbClientTest.java b/test/net/sourceforge/filebot/web/IMDbClientTest.java index a326da3f..739d6d55 100644 --- a/test/net/sourceforge/filebot/web/IMDbClientTest.java +++ b/test/net/sourceforge/filebot/web/IMDbClientTest.java @@ -48,6 +48,28 @@ public class IMDbClientTest { } + @Test + public void searchMovie4() throws Exception { + List results = imdb.searchMovie("Heat", null); + Movie movie = results.get(0); + + assertEquals("Heat", movie.getName()); + assertEquals(1995, movie.getYear()); + assertEquals(113277, movie.getImdbId(), 0); + } + + + @Test + public void searchMovie5() throws Exception { + List results = imdb.searchMovie("Det sjunde inseglet", null); + Movie movie = results.get(0); + + assertEquals("The Seventh Seal", movie.getName()); + assertEquals(1957, movie.getYear()); + assertEquals(50976, movie.getImdbId(), 0); + } + + @Test public void searchMovieRedirect() throws Exception { List results = imdb.searchMovie("(500) Days of Summer (2009)", null);