+ String.asciiQuotes() to normalize weird quotation marks (e.g. "\u00b4\u2018\u2019\u02bb".asciiQuotes() == "''''")

This commit is contained in:
Reinhard Pointner 2015-06-04 17:17:30 +00:00
parent 5c1dac0533
commit 8a77762e34
2 changed files with 17 additions and 0 deletions

View File

@ -12,6 +12,7 @@ import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import net.filebot.similarity.Normalization;
import net.filebot.util.FileUtilities;
import com.ibm.icu.text.Transliterator;
@ -254,6 +255,10 @@ public class ExpressionFormatMethods {
return Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove").transform(self).replaceAll("[^\\p{ASCII}]+", fallback).trim();
}
/**
 * Normalize the quotation marks in this String.
 *
 * Thin wrapper that hands the value to Normalization.normalizeQuotationMarks and returns its result unchanged.
 *
 * @param self
 *            the String to normalize
 * @return the result of Normalization.normalizeQuotationMarks for the given String
 */
public static String asciiQuotes(String self) {
	String normalized = Normalization.normalizeQuotationMarks(self);
	return normalized;
}
/**
* Replace multiple replacement pairs
*

View File

@ -17,6 +17,18 @@ public class Normalization {
// matches exactly 8 hexadecimal digits enclosed by an opening ( or [ and a closing ] or ), e.g. [1A2B3C4D]
// NOTE: the opening and closing bracket types are not required to match each other
private static final Pattern checksum = compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");
// Each array starts with the ASCII quotation mark that is used as replacement,
// followed by the look-alike characters that get replaced by it.

// ASCII double quote, then U+201C and U+201D (left / right double quotation mark)
private static final char[] doubleQuotes = new char[] { '\"', '\u201c', '\u201d' };

// ASCII apostrophe, then U+0060 (grave accent), U+00B4 (acute accent), U+2018 and U+2019 (left / right single quotation mark), U+02BB (modifier letter turned comma)
private static final char[] singleQuotes = new char[] { '\'', '\u0060', '\u00b4', '\u2018', '\u2019', '\u02bb' };

/**
 * Replace non-ASCII and accent-style quotation marks with their plain ASCII equivalent.
 *
 * Every character listed in doubleQuotes is replaced with the ASCII double quote and every character listed in singleQuotes is replaced with the ASCII apostrophe. All other characters are left untouched.
 *
 * @param name
 *            the text to normalize (must not be null)
 * @return the text with all known quotation mark variants replaced by ASCII quotes
 */
public static String normalizeQuotationMarks(String name) {
	for (char[] quotes : new char[][] { doubleQuotes, singleQuotes }) {
		// the first element is the replacement itself, so start replacing from the second element
		char ascii = quotes[0];
		for (int i = 1; i < quotes.length; i++) {
			name = name.replace(quotes[i], ascii);
		}
	}
	return name;
}
public static String normalizePunctuation(String name) {
// remove/normalize special characters
name = apostrophe.matcher(name).replaceAll("");