From 6707a94518fae0cf7564a4ac855420413e69870f Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Mon, 2 Jan 2012 03:34:13 +0000 Subject: [PATCH] * reuse name normalization code --- .../filebot/hash/VerificationUtilities.java | 18 ++++------- .../filebot/similarity/EpisodeMetrics.java | 7 ++--- .../similarity/NameSimilarityMetric.java | 11 ++++--- .../filebot/similarity/Normalization.java | 31 +++++++++++++++++++ .../filebot/similarity/SubstringMetric.java | 7 +++-- .../sourceforge/filebot/web/LocalSearch.java | 22 ++++++------- 6 files changed, 61 insertions(+), 35 deletions(-) create mode 100644 source/net/sourceforge/filebot/similarity/Normalization.java diff --git a/source/net/sourceforge/filebot/hash/VerificationUtilities.java b/source/net/sourceforge/filebot/hash/VerificationUtilities.java index 2f601716..5c4bf0e4 100644 --- a/source/net/sourceforge/filebot/hash/VerificationUtilities.java +++ b/source/net/sourceforge/filebot/hash/VerificationUtilities.java @@ -22,7 +22,7 @@ public final class VerificationUtilities { */ public static final Pattern EMBEDDED_CHECKSUM = Pattern.compile("(?<=\\[|\\()(\\p{XDigit}{8})(?=\\]|\\))"); - + public static String getEmbeddedChecksum(CharSequence string) { Matcher matcher = EMBEDDED_CHECKSUM.matcher(string); String embeddedChecksum = null; @@ -35,18 +35,12 @@ public final class VerificationUtilities { return embeddedChecksum; } - - public static String removeEmbeddedChecksum(String string) { - // match embedded checksum and surrounding brackets - return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", ""); - } - public static String getHashFromVerificationFile(File file, HashType type, int maxDepth) throws IOException { return getHashFromVerificationFile(file.getParentFile(), file, type, 0, maxDepth); } - + private static String getHashFromVerificationFile(File folder, File target, HashType type, int depth, int maxDepth) throws IOException { // stop if we reached max depth or the file system root if (folder == null || depth > maxDepth) @@ -75,7 +69,7 @@ public final class VerificationUtilities { return getHashFromVerificationFile(folder.getParentFile(), target, type, depth + 1, maxDepth); } - + public static HashType getHashType(File verificationFile) { for (HashType hashType : HashType.values()) { if (hashType.getFilter().accept(verificationFile)) @@ -85,7 +79,7 @@ public final class VerificationUtilities { return null; } - + public static HashType getHashTypeByExtension(String extension) { for (HashType hashType : HashType.values()) { if (hashType.getFilter().acceptExtension(extension)) @@ -95,7 +89,7 @@ public final class VerificationUtilities { return null; } - + public static String computeHash(File file, HashType type) throws IOException, InterruptedException { Hash hash = type.newHash(); @@ -120,7 +114,7 @@ public final class VerificationUtilities { return hash.digest(); } - + /** * Dummy constructor to prevent instantiation. */ diff --git a/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java b/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java index 424ea51a..d619770f 100644 --- a/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java +++ b/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java @@ -5,7 +5,7 @@ package net.sourceforge.filebot.similarity; import static java.lang.Math.*; import static java.util.Arrays.*; import static java.util.Collections.*; -import static net.sourceforge.filebot.hash.VerificationUtilities.*; +import static net.sourceforge.filebot.similarity.Normalization.*; import static net.sourceforge.tuned.FileUtilities.*; import java.io.File; @@ -287,10 +287,9 @@ public enum EpisodeMetrics implements SimilarityMetric { name = removeEmbeddedChecksum(name); // remove/normalize special characters - name = name.replaceAll("['`´]+", ""); - name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " "); + name = normalizePunctuation(name); - return name.trim().toLowerCase(); + return name.toLowerCase(); } diff --git a/source/net/sourceforge/filebot/similarity/NameSimilarityMetric.java b/source/net/sourceforge/filebot/similarity/NameSimilarityMetric.java index 3ed7fb16..5346e71d 100644 --- a/source/net/sourceforge/filebot/similarity/NameSimilarityMetric.java +++ b/source/net/sourceforge/filebot/similarity/NameSimilarityMetric.java @@ -2,6 +2,7 @@ package net.sourceforge.filebot.similarity; +import static net.sourceforge.filebot.similarity.Normalization.*; import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3; @@ -11,28 +12,28 @@ public class NameSimilarityMetric implements SimilarityMetric { private final AbstractStringMetric metric; - + public NameSimilarityMetric() { // QGramsDistance with a QGram tokenizer seems to work best for similarity of names metric = new QGramsDistance(new TokeniserQGram3()); } - + @Override public float getSimilarity(Object o1, Object o2) { return metric.getSimilarity(normalize(o1), normalize(o2)); } - + protected String normalize(Object object) { // use string representation String name = object.toString(); // normalize separators - name = name.replaceAll("['`´]+", "").replaceAll("[\\p{Punct}\\p{Space}]+", " "); + name = normalizePunctuation(name); // normalize case and trim - return name.trim().toLowerCase(); + return name.toLowerCase(); } } diff --git a/source/net/sourceforge/filebot/similarity/Normalization.java b/source/net/sourceforge/filebot/similarity/Normalization.java new file mode 100644 index 00000000..8eb0233b --- /dev/null +++ b/source/net/sourceforge/filebot/similarity/Normalization.java @@ -0,0 +1,31 @@ + +package net.sourceforge.filebot.similarity; + + +public class Normalization { + + public static String normalizePunctuation(String name) { + // remove/normalize special characters + name = name.replaceAll("['`´]+", ""); + name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " "); + + return name.trim(); + } + + + public static String normalizeBrackets(String name) { + // remove group names and checksums, any [...] or (...) + name = name.replaceAll("\\([^\\(]*\\)", " "); + name = name.replaceAll("\\[[^\\[]*\\]", " "); + name = name.replaceAll("\\{[^\\{]*\\}", " "); + + return name; + } + + + public static String removeEmbeddedChecksum(String string) { + // match embedded checksum and surrounding brackets + return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", ""); + } + +} diff --git a/source/net/sourceforge/filebot/similarity/SubstringMetric.java b/source/net/sourceforge/filebot/similarity/SubstringMetric.java index 9da24f8b..34cf26de 100644 --- a/source/net/sourceforge/filebot/similarity/SubstringMetric.java +++ b/source/net/sourceforge/filebot/similarity/SubstringMetric.java @@ -2,6 +2,9 @@ package net.sourceforge.filebot.similarity; +import static net.sourceforge.filebot.similarity.Normalization.*; + + public class SubstringMetric implements SimilarityMetric { @Override @@ -17,13 +20,13 @@ public class SubstringMetric implements SimilarityMetric { return s1.contains(s2) || s2.contains(s1) ? 1 : 0; } - + protected String normalize(Object object) { // use string representation String name = object.toString(); // normalize separators - name = name.replaceAll("['`´]+", "").replaceAll("[\\p{Punct}\\p{Space}]+", " "); + name = normalizePunctuation(name); // normalize case and trim return name.trim().toLowerCase(); diff --git a/source/net/sourceforge/filebot/web/LocalSearch.java b/source/net/sourceforge/filebot/web/LocalSearch.java index 58c088e0..f89fecb9 100644 --- a/source/net/sourceforge/filebot/web/LocalSearch.java +++ b/source/net/sourceforge/filebot/web/LocalSearch.java @@ -3,16 +3,17 @@ package net.sourceforge.filebot.web; import static java.util.Collections.*; +import static net.sourceforge.filebot.similarity.Normalization.*; import java.util.AbstractList; +import java.util.AbstractMap.SimpleEntry; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.HashSet; import java.util.List; -import java.util.Set; -import java.util.AbstractMap.SimpleEntry; import java.util.Map.Entry; +import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -32,7 +33,7 @@ class LocalSearch { private final List objects; private final List> fields; - + public LocalSearch(Collection data) { objects = new ArrayList(data); fields = new ArrayList>(objects.size()); @@ -42,7 +43,7 @@ class LocalSearch { } } - + public List search(String query) throws ExecutionException, InterruptedException { final String q = normalize(query); List>> tasks = new ArrayList>>(objects.size()); @@ -96,7 +97,7 @@ class LocalSearch { return resultSet.get(index).getKey(); } - + @Override public int size() { return Math.min(resultSetSize, resultSet.size()); @@ -104,12 +105,12 @@ class LocalSearch { }; } - + protected Set getFields(T object) { return set(object.toString()); } - + protected Set set(String... values) { Set set = new HashSet(values.length); for (String value : values) { @@ -120,13 +121,10 @@ class LocalSearch { return set; } - + protected String normalize(String value) { // normalize separator, normalize case and trim - value = value.replaceAll("['`´]+", ""); - value = value.replaceAll("[\\p{Punct}\\p{Space}]+", " "); - - return value.trim().toLowerCase(); + return normalizePunctuation(value).toLowerCase(); } }