* make sure the substring metric only matches whole word sequences, rather than arbitrary substrings, which could match half of a word; that would never make sense and could only cause issues
This commit is contained in:
parent
f61f30e862
commit
1ca8de3ab7
@ -26,7 +26,22 @@ public class SubstringMetric implements SimilarityMetric {
|
||||
if (s2 == null || s2.isEmpty())
|
||||
return 0;
|
||||
|
||||
return (o1c2 && s1.contains(s2)) || (o2c1 && s2.contains(s1)) ? 1 : 0;
|
||||
return (o1c2 && matches(s1, s2) || (o2c1 && matches(s2, s1))) ? 1 : 0;
|
||||
}
|
||||
|
||||
protected boolean matches(String s1, String s2) {
|
||||
int index = s1.lastIndexOf(s2);
|
||||
if (index < 0)
|
||||
return false;
|
||||
|
||||
// check before and after and make sure we're only matching between word boundries
|
||||
if (index - 1 >= 0 && !Character.isLetterOrDigit(s1.charAt(index - 1)))
|
||||
return false;
|
||||
|
||||
if (index + s2.length() < s1.length() && !Character.isLetterOrDigit(index + s2.length()))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected String normalize(Object object) {
|
||||
|
Loading…
Reference in New Issue
Block a user