* reuse name normalization code
This commit is contained in:
parent
b8c96b8fbe
commit
6707a94518
@ -22,7 +22,7 @@ public final class VerificationUtilities {
|
||||
*/
|
||||
public static final Pattern EMBEDDED_CHECKSUM = Pattern.compile("(?<=\\[|\\()(\\p{XDigit}{8})(?=\\]|\\))");
|
||||
|
||||
|
||||
|
||||
public static String getEmbeddedChecksum(CharSequence string) {
|
||||
Matcher matcher = EMBEDDED_CHECKSUM.matcher(string);
|
||||
String embeddedChecksum = null;
|
||||
@ -35,18 +35,12 @@ public final class VerificationUtilities {
|
||||
return embeddedChecksum;
|
||||
}
|
||||
|
||||
|
||||
public static String removeEmbeddedChecksum(String string) {
|
||||
// match embedded checksum and surrounding brackets
|
||||
return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", "");
|
||||
}
|
||||
|
||||
|
||||
public static String getHashFromVerificationFile(File file, HashType type, int maxDepth) throws IOException {
|
||||
return getHashFromVerificationFile(file.getParentFile(), file, type, 0, maxDepth);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static String getHashFromVerificationFile(File folder, File target, HashType type, int depth, int maxDepth) throws IOException {
|
||||
// stop if we reached max depth or the file system root
|
||||
if (folder == null || depth > maxDepth)
|
||||
@ -75,7 +69,7 @@ public final class VerificationUtilities {
|
||||
return getHashFromVerificationFile(folder.getParentFile(), target, type, depth + 1, maxDepth);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static HashType getHashType(File verificationFile) {
|
||||
for (HashType hashType : HashType.values()) {
|
||||
if (hashType.getFilter().accept(verificationFile))
|
||||
@ -85,7 +79,7 @@ public final class VerificationUtilities {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static HashType getHashTypeByExtension(String extension) {
|
||||
for (HashType hashType : HashType.values()) {
|
||||
if (hashType.getFilter().acceptExtension(extension))
|
||||
@ -95,7 +89,7 @@ public final class VerificationUtilities {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static String computeHash(File file, HashType type) throws IOException, InterruptedException {
|
||||
Hash hash = type.newHash();
|
||||
|
||||
@ -120,7 +114,7 @@ public final class VerificationUtilities {
|
||||
return hash.digest();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Dummy constructor to prevent instantiation.
|
||||
*/
|
||||
|
@ -5,7 +5,7 @@ package net.sourceforge.filebot.similarity;
|
||||
import static java.lang.Math.*;
|
||||
import static java.util.Arrays.*;
|
||||
import static java.util.Collections.*;
|
||||
import static net.sourceforge.filebot.hash.VerificationUtilities.*;
|
||||
import static net.sourceforge.filebot.similarity.Normalization.*;
|
||||
import static net.sourceforge.tuned.FileUtilities.*;
|
||||
|
||||
import java.io.File;
|
||||
@ -287,10 +287,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
|
||||
name = removeEmbeddedChecksum(name);
|
||||
|
||||
// remove/normalize special characters
|
||||
name = name.replaceAll("['`´]+", "");
|
||||
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
|
||||
name = normalizePunctuation(name);
|
||||
|
||||
return name.trim().toLowerCase();
|
||||
return name.toLowerCase();
|
||||
}
|
||||
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
import static net.sourceforge.filebot.similarity.Normalization.*;
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
||||
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
|
||||
@ -11,28 +12,28 @@ public class NameSimilarityMetric implements SimilarityMetric {
|
||||
|
||||
private final AbstractStringMetric metric;
|
||||
|
||||
|
||||
|
||||
/**
 * Sets up the underlying string metric: a {@code QGramsDistance} fed by a
 * {@code TokeniserQGram3}.
 */
public NameSimilarityMetric() {
	// QGramsDistance with a QGram tokenizer seems to work best for similarity of names
	TokeniserQGram3 tokenizer = new TokeniserQGram3();
	this.metric = new QGramsDistance(tokenizer);
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public float getSimilarity(Object o1, Object o2) {
|
||||
return metric.getSimilarity(normalize(o1), normalize(o2));
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected String normalize(Object object) {
|
||||
// use string representation
|
||||
String name = object.toString();
|
||||
|
||||
// normalize separators
|
||||
name = name.replaceAll("['`´]+", "").replaceAll("[\\p{Punct}\\p{Space}]+", " ");
|
||||
name = normalizePunctuation(name);
|
||||
|
||||
// normalize case and trim
|
||||
return name.trim().toLowerCase();
|
||||
return name.toLowerCase();
|
||||
}
|
||||
|
||||
}
|
||||
|
31
source/net/sourceforge/filebot/similarity/Normalization.java
Normal file
31
source/net/sourceforge/filebot/similarity/Normalization.java
Normal file
@ -0,0 +1,31 @@
|
||||
|
||||
package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
/**
 * Static helpers that normalize names (punctuation, bracketed groups,
 * embedded checksums) before they are compared.
 */
public class Normalization {

	// The patterns are compiled once instead of on every call (String.replaceAll recompiles each time).
	// Fully qualified type names are used because this file declares no imports.

	/** Apostrophe-like characters that are dropped entirely: ' ` and the acute accent (U+00B4). */
	private static final java.util.regex.Pattern APOSTROPHES = java.util.regex.Pattern.compile("['`\u00B4]+");

	/** Any run of punctuation and/or whitespace. */
	private static final java.util.regex.Pattern PUNCTUATION_OR_SPACE = java.util.regex.Pattern.compile("[\\p{Punct}\\p{Space}]+");

	/** A (...) group that contains no further opening round bracket. */
	private static final java.util.regex.Pattern ROUND_BRACKETS = java.util.regex.Pattern.compile("\\([^\\(]*\\)");

	/** A [...] group that contains no further opening square bracket. */
	private static final java.util.regex.Pattern SQUARE_BRACKETS = java.util.regex.Pattern.compile("\\[[^\\[]*\\]");

	/** A {...} group that contains no further opening curly bracket. */
	private static final java.util.regex.Pattern CURLY_BRACKETS = java.util.regex.Pattern.compile("\\{[^\\{]*\\}");

	/** Exactly eight hex digits wrapped in round or square brackets. */
	private static final java.util.regex.Pattern EMBEDDED_CHECKSUM = java.util.regex.Pattern.compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");


	/**
	 * Removes apostrophe-like characters, collapses every run of punctuation
	 * and whitespace into a single space and trims the result.
	 *
	 * @param name text to normalize, must not be null
	 * @return the normalized, trimmed text
	 */
	public static String normalizePunctuation(String name) {
		// remove/normalize special characters
		name = APOSTROPHES.matcher(name).replaceAll("");
		name = PUNCTUATION_OR_SPACE.matcher(name).replaceAll(" ");

		return name.trim();
	}


	/**
	 * Replaces each (...), [...] and {...} group with a single space.
	 * The result is not trimmed.
	 *
	 * @param name text to process, must not be null
	 * @return the text with bracketed groups replaced by spaces
	 */
	public static String normalizeBrackets(String name) {
		// remove group names and checksums, any [...] or (...)
		name = ROUND_BRACKETS.matcher(name).replaceAll(" ");
		name = SQUARE_BRACKETS.matcher(name).replaceAll(" ");
		name = CURLY_BRACKETS.matcher(name).replaceAll(" ");

		return name;
	}


	/**
	 * Removes every embedded checksum: eight hex digits together with the
	 * round or square brackets surrounding them.
	 *
	 * @param string text to process, must not be null
	 * @return the text without embedded checksums
	 */
	public static String removeEmbeddedChecksum(String string) {
		// match embedded checksum and surrounding brackets
		return EMBEDDED_CHECKSUM.matcher(string).replaceAll("");
	}


	/**
	 * Dummy constructor to prevent instantiation.
	 */
	private Normalization() {
		throw new UnsupportedOperationException();
	}

}
|
@ -2,6 +2,9 @@
|
||||
package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
import static net.sourceforge.filebot.similarity.Normalization.*;
|
||||
|
||||
|
||||
public class SubstringMetric implements SimilarityMetric {
|
||||
|
||||
@Override
|
||||
@ -17,13 +20,13 @@ public class SubstringMetric implements SimilarityMetric {
|
||||
return s1.contains(s2) || s2.contains(s1) ? 1 : 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected String normalize(Object object) {
|
||||
// use string representation
|
||||
String name = object.toString();
|
||||
|
||||
// normalize separators
|
||||
name = name.replaceAll("['`´]+", "").replaceAll("[\\p{Punct}\\p{Space}]+", " ");
|
||||
name = normalizePunctuation(name);
|
||||
|
||||
// normalize case and trim
|
||||
return name.trim().toLowerCase();
|
||||
|
@ -3,16 +3,17 @@ package net.sourceforge.filebot.web;
|
||||
|
||||
|
||||
import static java.util.Collections.*;
|
||||
import static net.sourceforge.filebot.similarity.Normalization.*;
|
||||
|
||||
import java.util.AbstractList;
|
||||
import java.util.AbstractMap.SimpleEntry;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.AbstractMap.SimpleEntry;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
@ -32,7 +33,7 @@ class LocalSearch<T> {
|
||||
private final List<T> objects;
|
||||
private final List<Set<String>> fields;
|
||||
|
||||
|
||||
|
||||
public LocalSearch(Collection<? extends T> data) {
|
||||
objects = new ArrayList<T>(data);
|
||||
fields = new ArrayList<Set<String>>(objects.size());
|
||||
@ -42,7 +43,7 @@ class LocalSearch<T> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public List<T> search(String query) throws ExecutionException, InterruptedException {
|
||||
final String q = normalize(query);
|
||||
List<Callable<Entry<T, Float>>> tasks = new ArrayList<Callable<Entry<T, Float>>>(objects.size());
|
||||
@ -96,7 +97,7 @@ class LocalSearch<T> {
|
||||
return resultSet.get(index).getKey();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return Math.min(resultSetSize, resultSet.size());
|
||||
@ -104,12 +105,12 @@ class LocalSearch<T> {
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected Set<String> getFields(T object) {
|
||||
return set(object.toString());
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected Set<String> set(String... values) {
|
||||
Set<String> set = new HashSet<String>(values.length);
|
||||
for (String value : values) {
|
||||
@ -120,13 +121,10 @@ class LocalSearch<T> {
|
||||
return set;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected String normalize(String value) {
|
||||
// normalize separator, normalize case and trim
|
||||
value = value.replaceAll("['`´]+", "");
|
||||
value = value.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
|
||||
|
||||
return value.trim().toLowerCase();
|
||||
return normalizePunctuation(value).toLowerCase();
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user