* reuse name normalization code

This commit is contained in:
Reinhard Pointner 2012-01-02 03:34:13 +00:00
parent b8c96b8fbe
commit 6707a94518
6 changed files with 61 additions and 35 deletions

View File

@ -36,12 +36,6 @@ public final class VerificationUtilities {
}
public static String removeEmbeddedChecksum(String string) {
// match embedded checksum and surrounding brackets
return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", "");
}
/**
 * Convenience overload that derives the starting folder from the file itself.
 * <p>
 * Delegates to the five-argument overload, passing the file's parent
 * directory, the file, the hash type, {@code 0} and {@code maxDepth}.
 * NOTE(review): the {@code 0} is presumably the initial search depth --
 * confirm against the five-argument overload.
 *
 * @param file the file to look up
 * @param type the kind of hash requested
 * @param maxDepth passed through unchanged to the delegate
 * @return whatever the five-argument overload returns
 * @throws IOException propagated from the delegate
 */
public static String getHashFromVerificationFile(File file, HashType type, int maxDepth) throws IOException {
	File folder = file.getParentFile();
	return getHashFromVerificationFile(folder, file, type, 0, maxDepth);
}

View File

@ -5,7 +5,7 @@ package net.sourceforge.filebot.similarity;
import static java.lang.Math.*;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.sourceforge.filebot.hash.VerificationUtilities.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.FileUtilities.*;
import java.io.File;
@ -287,10 +287,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
name = removeEmbeddedChecksum(name);
// remove/normalize special characters
name = name.replaceAll("['`´]+", "");
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
name = normalizePunctuation(name);
return name.trim().toLowerCase();
return name.toLowerCase();
}

View File

@ -2,6 +2,7 @@
package net.sourceforge.filebot.similarity;
import static net.sourceforge.filebot.similarity.Normalization.*;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
@ -29,10 +30,10 @@ public class NameSimilarityMetric implements SimilarityMetric {
String name = object.toString();
// normalize separators
name = name.replaceAll("['`´]+", "").replaceAll("[\\p{Punct}\\p{Space}]+", " ");
name = normalizePunctuation(name);
// normalize case and trim
return name.trim().toLowerCase();
return name.toLowerCase();
}
}

View File

@ -0,0 +1,31 @@
package net.sourceforge.filebot.similarity;
/**
 * Static helpers that normalize names before they are compared.
 * <p>
 * All patterns are compiled once and reused; {@link java.util.regex.Pattern}
 * instances are immutable and safe to share.
 */
public class Normalization {

	// Apostrophe-like characters (', ` and the acute accent U+00B4) that are
	// dropped without leaving a gap. The accent is written as a unicode escape
	// so the pattern does not depend on the source-file encoding.
	private static final java.util.regex.Pattern APOSTROPHES = java.util.regex.Pattern.compile("['`\u00B4]+");

	// Any run of punctuation and/or whitespace; collapsed into a single space.
	private static final java.util.regex.Pattern SEPARATORS = java.util.regex.Pattern.compile("[\\p{Punct}\\p{Space}]+");

	// Bracketed sections, one pattern per bracket kind.
	private static final java.util.regex.Pattern ROUND_BRACKETS = java.util.regex.Pattern.compile("\\([^\\(]*\\)");
	private static final java.util.regex.Pattern SQUARE_BRACKETS = java.util.regex.Pattern.compile("\\[[^\\[]*\\]");
	private static final java.util.regex.Pattern CURLY_BRACKETS = java.util.regex.Pattern.compile("\\{[^\\{]*\\}");

	// Exactly eight hex digits enclosed in a round or square bracket on each side.
	private static final java.util.regex.Pattern EMBEDDED_CHECKSUM = java.util.regex.Pattern.compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");


	/**
	 * Removes apostrophe-like characters, replaces every run of punctuation
	 * and whitespace with a single space, and trims the result.
	 *
	 * @param name the text to normalize; must not be {@code null}
	 * @return the normalized text, possibly empty
	 */
	public static String normalizePunctuation(String name) {
		// remove/normalize special characters
		String withoutApostrophes = APOSTROPHES.matcher(name).replaceAll("");
		String singleSpaced = SEPARATORS.matcher(withoutApostrophes).replaceAll(" ");
		return singleSpaced.trim();
	}


	/**
	 * Replaces each {@code (...)}, {@code [...]} and {@code {...}} section
	 * (e.g. group names and checksums) with a single space. The result is
	 * not trimmed and spaces are not collapsed.
	 * <p>
	 * NOTE: each pattern excludes the <em>opening</em> bracket from the
	 * enclosed text and matches greedily, so a match extends to the last
	 * closing bracket that precedes the next opening bracket of the same
	 * kind; for example {@code "(a) b) c"} loses {@code "(a) b)"}.
	 *
	 * @param name the text to process; must not be {@code null}
	 * @return the text with bracketed sections replaced by spaces
	 */
	public static String normalizeBrackets(String name) {
		// remove group names and checksums, any [...] or (...) or {...}
		String result = ROUND_BRACKETS.matcher(name).replaceAll(" ");
		result = SQUARE_BRACKETS.matcher(result).replaceAll(" ");
		result = CURLY_BRACKETS.matcher(result).replaceAll(" ");
		return result;
	}


	/**
	 * Removes every embedded checksum, i.e. eight hexadecimal digits
	 * surrounded by round or square brackets, together with the brackets.
	 * The bracket kinds on the two sides are not required to match.
	 *
	 * @param string the text to process; must not be {@code null}
	 * @return the text without embedded checksums
	 */
	public static String removeEmbeddedChecksum(String string) {
		// match embedded checksum and surrounding brackets
		return EMBEDDED_CHECKSUM.matcher(string).replaceAll("");
	}

}

View File

@ -2,6 +2,9 @@
package net.sourceforge.filebot.similarity;
import static net.sourceforge.filebot.similarity.Normalization.*;
public class SubstringMetric implements SimilarityMetric {
@Override
@ -23,7 +26,7 @@ public class SubstringMetric implements SimilarityMetric {
String name = object.toString();
// normalize separators
name = name.replaceAll("['`´]+", "").replaceAll("[\\p{Punct}\\p{Space}]+", " ");
name = normalizePunctuation(name);
// normalize case and trim
return name.trim().toLowerCase();

View File

@ -3,16 +3,17 @@ package net.sourceforge.filebot.web;
import static java.util.Collections.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import java.util.AbstractList;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.AbstractMap.SimpleEntry;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
@ -123,10 +124,7 @@ class LocalSearch<T> {
protected String normalize(String value) {
// normalize separator, normalize case and trim
value = value.replaceAll("['`´]+", "");
value = value.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
return value.trim().toLowerCase();
return normalizePunctuation(value).toLowerCase();
}
}