* support S0EP00 pattern

* added numeric sequence match differentiation step to improve support for generic season/episode patterns
This commit is contained in:
Reinhard Pointner 2013-02-01 08:12:15 +00:00
parent 12b277dacc
commit e631641a0c
3 changed files with 26 additions and 18 deletions

View File

@ -209,9 +209,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
@Override
public float getSimilarity(Object o1, Object o2) {
// normalize absolute similarity to similarity rank (5 ranks in total),
// normalize absolute similarity to similarity rank (4 ranks in total),
// so we are less likely to fall for false positives in this pass, and move on to the next one
return (float) (floor(super.getSimilarity(o1, o2) * 5) / 5);
return (float) (floor(super.getSimilarity(o1, o2) * 4) / 4);
}
@ -222,6 +222,15 @@ public enum EpisodeMetrics implements SimilarityMetric {
}
}),
NumericSequence(new SequenceMatchSimilarity() {
@Override
protected String normalize(Object object) {
// simplify file name, if possible
return normalizeObject(object).replaceAll("\\D+", " ").trim();
}
}),
// Match by generic numeric similarity
Numeric(new NumericSimilarityMetric() {
@ -402,9 +411,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
// 7 pass: prefer episodes that were aired closer to the last modified date of the file
// 8 pass: resolve remaining collisions via absolute string similarity
if (includeFileMetrics) {
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(SubstringSequence, Name), Numeric, Name, TimeStamp, new NameSimilarityMetric() };
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(SubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, new NameSimilarityMetric() };
} else {
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(SubstringSequence, Name), Numeric, Name, TimeStamp, new NameSimilarityMetric() };
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(SubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, new NameSimilarityMetric() };
}
}

View File

@ -3,7 +3,7 @@ package net.sourceforge.filebot.similarity;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Scanner;
import java.util.Set;
@ -18,31 +18,31 @@ public class NumericSimilarityMetric implements SimilarityMetric {
private final AbstractStringMetric metric;
public NumericSimilarityMetric() {
// I don't exactly know why, but I get a good matching behavior
// when using QGramsDistance or BlockDistance
metric = new QGramsDistance(new NumberTokeniser());
}
@Override
public float getSimilarity(Object o1, Object o2) {
return metric.getSimilarity(normalize(o1), normalize(o2));
}
protected String normalize(Object object) {
// no need to do anything special here, because we don't care about anything but number patterns anyway
return object.toString();
}
private static class NumberTokeniser implements InterfaceTokeniser {
private final String delimiter = "\\D+";
@Override
public ArrayList<String> tokenizeToArrayList(String input) {
ArrayList<String> tokens = new ArrayList<String>();
@ -58,34 +58,33 @@ public class NumericSimilarityMetric implements SimilarityMetric {
return tokens;
}
@Override
public Set<String> tokenizeToSet(String input) {
return new HashSet<String>(tokenizeToArrayList(input));
return new LinkedHashSet<String>(tokenizeToArrayList(input));
}
@Override
public String getShortDescriptionString() {
return getClass().getSimpleName();
}
@Override
public String getDelimiters() {
return delimiter;
}
private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler();
@Override
public InterfaceTermHandler getStopWordHandler() {
return stopWordHandler;
}
@Override
public void setStopWordHandler(InterfaceTermHandler stopWordHandler) {
this.stopWordHandler = stopWordHandler;

View File

@ -29,7 +29,7 @@ public class SeasonEpisodeMatcher {
patterns[0] = new SeasonEpisodePattern(null, "(?<!\\p{Alnum})(?i:season|series)[^\\p{Alnum}]{0,3}(\\d{1,4})[^\\p{Alnum}]{0,3}(?i:episode)[^\\p{Alnum}]{0,3}(\\d{1,4})[^\\p{Alnum}]{0,3}(?!\\p{Digit})");
// match patterns like S01E01, s01e02, ... [s01]_[e02], s01.e02, s01e02a, s2010e01 ... s01e01-02-03-04, [s01]_[e01-02-03-04] ...
patterns[1] = new SeasonEpisodePattern(null, "(?<!\\p{Digit})[Ss](\\d{1,2}|\\d{4})[^\\p{Alnum}]{0,3}[Ee](((?<=[^._ ])[Ee]?\\d{1,3}(\\D|$))+)") {
patterns[1] = new SeasonEpisodePattern(null, "(?<!\\p{Digit})[Ss](\\d{1,2}|\\d{4})[^\\p{Alnum}]{0,3}[Ee][Pp]?(((?<=[^._ ])[Ee]?[Pp]?\\d{1,3}(\\D|$))+)") {
@Override
protected Collection<SxE> process(MatchResult match) {