Optimize DateMatcher

This commit is contained in:
Reinhard Pointner 2016-02-10 11:47:17 +00:00
parent 3b79ef9e39
commit ffa8b021e0
10 changed files with 193 additions and 86 deletions

View File

@ -48,7 +48,6 @@ import net.filebot.archive.Archive;
import net.filebot.format.MediaBindingBean; import net.filebot.format.MediaBindingBean;
import net.filebot.similarity.CommonSequenceMatcher; import net.filebot.similarity.CommonSequenceMatcher;
import net.filebot.similarity.DateMatcher; import net.filebot.similarity.DateMatcher;
import net.filebot.similarity.DateMetric;
import net.filebot.similarity.EpisodeMetrics; import net.filebot.similarity.EpisodeMetrics;
import net.filebot.similarity.MetricAvg; import net.filebot.similarity.MetricAvg;
import net.filebot.similarity.NameSimilarityMetric; import net.filebot.similarity.NameSimilarityMetric;
@ -118,11 +117,16 @@ public class MediaDetection {
private static final SeasonEpisodeMatcher seasonEpisodeMatcherStrict = new SmartSeasonEpisodeMatcher(true); private static final SeasonEpisodeMatcher seasonEpisodeMatcherStrict = new SmartSeasonEpisodeMatcher(true);
private static final SeasonEpisodeMatcher seasonEpisodeMatcherNonStrict = new SmartSeasonEpisodeMatcher(false); private static final SeasonEpisodeMatcher seasonEpisodeMatcherNonStrict = new SmartSeasonEpisodeMatcher(false);
private static final DateMatcher dateMatcher = new DateMatcher(Locale.ROOT, DateMatcher.DEFAULT_SANITY);
public static SeasonEpisodeMatcher getSeasonEpisodeMatcher(boolean strict) { public static SeasonEpisodeMatcher getSeasonEpisodeMatcher(boolean strict) {
return strict ? seasonEpisodeMatcherStrict : seasonEpisodeMatcherNonStrict; return strict ? seasonEpisodeMatcherStrict : seasonEpisodeMatcherNonStrict;
} }
public static DateMatcher getDateMatcher() {
return dateMatcher;
}
public static boolean isEpisode(String name, boolean strict) { public static boolean isEpisode(String name, boolean strict) {
return parseEpisodeNumber(name, strict) != null || parseDate(name) != null; return parseEpisodeNumber(name, strict) != null || parseDate(name) != null;
} }
@ -136,7 +140,10 @@ public class MediaDetection {
} }
public static SimpleDate parseDate(Object object) { public static SimpleDate parseDate(Object object) {
return new DateMetric().parse(object); if (object instanceof File) {
return getDateMatcher().match((File) object);
}
return getDateMatcher().match(object.toString());
} }
public static Map<Set<File>, Set<String>> mapSeriesNamesByFiles(Collection<File> files, Locale locale, boolean useSeriesIndex, boolean useAnimeIndex) throws Exception { public static Map<Set<File>, Set<String>> mapSeriesNamesByFiles(Collection<File> files, Locale locale, boolean useSeriesIndex, boolean useAnimeIndex) throws Exception {
@ -264,7 +271,7 @@ public class MediaDetection {
// then Date pattern // then Date pattern
if (match == null) { if (match == null) {
match = new DateMatcher().match(name); match = getDateMatcher().match(name);
} }
// check SxE non-strict // check SxE non-strict
@ -744,7 +751,7 @@ public class MediaDetection {
} }
public static List<Integer> parseMovieYear(String name) { public static List<Integer> parseMovieYear(String name) {
return matchIntegers(name).stream().filter(year -> 1950 < year && year < 2050).collect(toList()); return matchIntegers(name).stream().filter(DateMatcher.DEFAULT_SANITY::acceptYear).collect(toList());
} }
public static String reduceMovieName(String name, boolean strict) throws IOException { public static String reduceMovieName(String name, boolean strict) throws IOException {

View File

@ -1,47 +1,84 @@
package net.filebot.similarity; package net.filebot.similarity;
import static java.util.Arrays.*;
import static java.util.stream.Collectors.*;
import static net.filebot.util.FileUtilities.*; import static net.filebot.util.FileUtilities.*;
import java.io.File; import java.io.File;
import java.time.LocalDate;
import java.time.Month;
import java.time.chrono.ChronoLocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.function.Predicate;
import java.util.regex.MatchResult; import java.util.regex.MatchResult;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.IntStream;
import net.filebot.web.SimpleDate; import net.filebot.web.SimpleDate;
public class DateMatcher { public class DateMatcher {
public static final DateFilter DEFAULT_SANITY = new DateFilter(LocalDate.of(1930, Month.JANUARY, 1), LocalDate.of(2050, Month.JANUARY, 1));
private final DatePattern[] patterns; private final DatePattern[] patterns;
public DateMatcher() { public DateMatcher(Locale locale, DateFilter sanity) {
patterns = new DatePattern[7]; // generate default date format patterns
List<String> format = new ArrayList<String>(7);
// match yyyy-mm-dd patterns like 2010-10-24, 2009/6/1, etc // match yyyy-mm-dd patterns like 2010-10-24, 2009/6/1, etc
patterns[0] = new NumericDatePattern("(?<!\\p{Alnum})(\\d{4})[^\\p{Alnum}](\\d{1,2})[^\\p{Alnum}](\\d{1,2})(?!\\p{Alnum})", new int[] { 1, 2, 3 }); format.add("y M d");
// match dd-mm-yyyy patterns like 1.1.2010, 01/06/2010, etc // match dd-mm-yyyy patterns like 1.1.2010, 01/06/2010, etc
patterns[1] = new NumericDatePattern("(?<!\\p{Alnum})(\\d{1,2})[^\\p{Alnum}](\\d{1,2})[^\\p{Alnum}](\\d{4})(?!\\p{Alnum})", new int[] { 3, 2, 1 }); format.add("d M y");
// match yyyy.MMMMM.dd patterns like 2015.October.05 // match yyyy.MMMMM.dd patterns like 2015.October.05
patterns[2] = new DateFormatPattern("(?<!\\p{Alnum})(\\d{4})[^\\p{Alnum}](?i:January|February|March|April|May|June|July|August|September|October|November|December)[^\\p{Alnum}](\\d{1,2})(?!\\p{Alnum})", "yyyy MMMMM dd"); format.add("y MMMM d");
// match yyyy.MMM.dd patterns like 2015.Oct.06 // match yyyy.MMM.dd patterns like 2015.Oct.6
patterns[3] = new DateFormatPattern("(?<!\\p{Alnum})(\\d{4})[^\\p{Alnum}](?i:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[^\\p{Alnum}](\\d{1,2})(?!\\p{Alnum})", "yyyy MMM dd"); format.add("y MMM d");
// match dd.MMMMM.yyyy patterns like 25 July 2014 // match dd.MMMMM.yyyy patterns like 25 July 2014
patterns[4] = new DateFormatPattern("(?<!\\p{Alnum})(\\d{1,2})[^\\p{Alnum}](?i:January|February|March|April|May|June|July|August|September|October|November|December)[^\\p{Alnum}](\\d{4})(?!\\p{Alnum})", "dd MMMMM yyyy"); format.add("d MMMM y");
// match dd.MMM.yyyy patterns like 8 Sep 2015 // match dd.MMM.yyyy patterns like 8 Sep 2015
patterns[5] = new DateFormatPattern("(?<!\\p{Alnum})(\\d{1,2})[^\\p{Alnum}](?i:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[^\\p{Alnum}](\\d{4})(?!\\p{Alnum})", "dd MMM yyyy"); format.add("d MMM y");
// match yyyymmdd patterns like 20140408 // match yyyymmdd patterns like 20140408
patterns[6] = new DateFormatPattern("(?<!\\p{Alnum})(\\d{8})(?!\\p{Alnum})", "yyyyMMdd"); format.add("yyyyMMdd");
this.patterns = compile(format, locale, sanity);
} }
public DateMatcher(DatePattern... patterns) { protected DatePattern[] compile(List<String> dateFormat, Locale locale, DateFilter sanity) {
this.patterns = patterns; return dateFormat.stream().map(format -> {
String pattern = stream(format.split(DateFormatPattern.DELIMITER)).map(this::getPatternGroup).collect(joining("[^\\p{Alnum}]", "(?<!\\p{Alnum})", "(?!\\p{Alnum})"));
return new DateFormatPattern(pattern, format, locale, sanity);
}).toArray(DateFormatPattern[]::new);
}
protected String getPatternGroup(String token) {
switch (token) {
case "y":
return "(\\d{4})";
case "M":
return "(\\d{1,2})";
case "d":
return "(\\d{1,2})";
case "yyyyMMdd":
return "(\\d{8})";
case "MMMM":
return "(January|February|March|April|May|June|July|August|September|October|November|December)";
case "MMM":
return "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
default:
throw new IllegalArgumentException(token);
}
} }
public SimpleDate match(CharSequence seq) { public SimpleDate match(CharSequence seq) {
@ -52,7 +89,6 @@ public class DateMatcher {
return match; return match;
} }
} }
return null; return null;
} }
@ -64,7 +100,6 @@ public class DateMatcher {
return pos; return pos;
} }
} }
return -1; return -1;
} }
@ -89,7 +124,7 @@ public class DateMatcher {
return tail; return tail;
} }
private static interface DatePattern { public static interface DatePattern {
public SimpleDate match(CharSequence seq); public SimpleDate match(CharSequence seq);
@ -97,60 +132,34 @@ public class DateMatcher {
} }
private static class NumericDatePattern implements DatePattern { public static class DateFormatPattern implements DatePattern {
public static final String DELIMITER = " ";
protected final Pattern pattern; protected final Pattern pattern;
protected final int[] order; protected final DateTimeFormatter format;
protected final DateFilter sanity;
public NumericDatePattern(String pattern, int[] order) { public DateFormatPattern(String pattern, String format, Locale locale, DateFilter sanity) {
this.pattern = Pattern.compile(pattern); this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
this.order = order; this.format = DateTimeFormatter.ofPattern(format, locale);
this.sanity = sanity;
} }
protected SimpleDate process(MatchResult match) { protected SimpleDate process(MatchResult match) {
return new SimpleDate(Integer.parseInt(match.group(order[0])), Integer.parseInt(match.group(order[1])), Integer.parseInt(match.group(order[2]))); try {
String dateString = IntStream.rangeClosed(1, match.groupCount()).mapToObj(match::group).collect(joining(DELIMITER));
LocalDate date = LocalDate.parse(dateString, format);
if (sanity == null || sanity.test(date)) {
return new SimpleDate(date.getYear(), date.getMonthValue(), date.getDayOfMonth());
} }
} catch (DateTimeParseException e) {
@Override // date is invalid
public SimpleDate match(CharSequence seq) {
Matcher matcher = pattern.matcher(seq);
if (matcher.find()) {
return process(matcher);
} }
return null; return null;
} }
@Override
public int find(CharSequence seq, int fromIndex) {
Matcher matcher = pattern.matcher(seq).region(fromIndex, seq.length());
if (matcher.find()) {
return matcher.start();
}
return -1;
}
}
private static class DateFormatPattern implements DatePattern {
protected final Pattern space = Pattern.compile("[^\\p{Alnum}]+");
protected final Pattern pattern;
protected final String dateFormat;
public DateFormatPattern(String pattern, String dateFormat) {
this.pattern = Pattern.compile(pattern);
this.dateFormat = dateFormat;
}
protected SimpleDate process(MatchResult match) {
return SimpleDate.parse(space.matcher(match.group()).replaceAll(" "), dateFormat);
}
@Override @Override
public SimpleDate match(CharSequence seq) { public SimpleDate match(CharSequence seq) {
Matcher matcher = pattern.matcher(seq); Matcher matcher = pattern.matcher(seq);
@ -158,7 +167,6 @@ public class DateMatcher {
if (matcher.find()) { if (matcher.find()) {
return process(matcher); return process(matcher);
} }
return null; return null;
} }
@ -171,10 +179,34 @@ public class DateMatcher {
return matcher.start(); return matcher.start();
} }
} }
return -1; return -1;
} }
} }
/**
 * Sanity filter that accepts only dates strictly between a lower and an upper bound
 * (both bounds are exclusive).
 */
public static class DateFilter implements Predicate<ChronoLocalDate> {

	public final ChronoLocalDate lowerBound;
	public final ChronoLocalDate upperBound;

	/**
	 * @param lowerBound
	 *            exclusive lower bound
	 * @param upperBound
	 *            exclusive upper bound
	 */
	public DateFilter(ChronoLocalDate lowerBound, ChronoLocalDate upperBound) {
		this.lowerBound = lowerBound;
		this.upperBound = upperBound;
	}

	/**
	 * @return true if the given date is after the lower bound and before the upper bound
	 */
	@Override
	public boolean test(ChronoLocalDate date) {
		return date.isAfter(lowerBound) && date.isBefore(upperBound);
	}

	/**
	 * Check the given year / month / day against the bounds.
	 *
	 * @return true if the values form a valid calendar date that passes {@link #test(ChronoLocalDate)},
	 *         false if the date is out of bounds or not a valid date at all (e.g. day 32)
	 */
	public boolean acceptDate(int year, int month, int day) {
		try {
			return test(LocalDate.of(year, month, day));
		} catch (java.time.DateTimeException e) {
			// LocalDate.of rejects values that do not form a valid date; such input can never be accepted
			return false;
		}
	}

	/**
	 * Check January 1st of the given year against the bounds.
	 *
	 * @return true if the first day of the given year is within bounds, false otherwise
	 *         (including years that LocalDate cannot represent)
	 */
	public boolean acceptYear(int year) {
		return acceptDate(year, 1, 1);
	}

}
} }

View File

@ -8,10 +8,6 @@ public class DateMetric implements SimilarityMetric {
private final DateMatcher matcher; private final DateMatcher matcher;
public DateMetric() {
this.matcher = new DateMatcher();
}
public DateMetric(DateMatcher matcher) { public DateMetric(DateMatcher matcher) {
this.matcher = matcher; this.matcher = matcher;
} }
@ -33,7 +29,6 @@ public class DateMetric implements SimilarityMetric {
if (object instanceof File) { if (object instanceof File) {
return matcher.match((File) object); return matcher.match((File) object);
} }
return matcher.match(object.toString()); return matcher.match(object.toString());
} }

View File

@ -83,7 +83,7 @@ public enum EpisodeMetrics implements SimilarityMetric {
}), }),
// Match episode airdate // Match episode airdate
AirDate(new DateMetric() { AirDate(new DateMetric(getDateMatcher()) {
private final Map<Object, SimpleDate> transformCache = synchronizedMap(new HashMap<Object, SimpleDate>(64, 4)); private final Map<Object, SimpleDate> transformCache = synchronizedMap(new HashMap<Object, SimpleDate>(64, 4));

View File

@ -43,7 +43,7 @@ public class SeriesNameMatcher {
public SeriesNameMatcher(Locale locale, boolean strict) { public SeriesNameMatcher(Locale locale, boolean strict) {
seasonEpisodeMatcher = new SmartSeasonEpisodeMatcher(SeasonEpisodeMatcher.DEFAULT_SANITY, strict); seasonEpisodeMatcher = new SmartSeasonEpisodeMatcher(SeasonEpisodeMatcher.DEFAULT_SANITY, strict);
dateMatcher = new DateMatcher(); dateMatcher = new DateMatcher(locale, DateMatcher.DEFAULT_SANITY);
nameSimilarityMetric = new NameSimilarityMetric(); nameSimilarityMetric = new NameSimilarityMetric();
commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3, true) { commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(locale), 3, true) {

View File

@ -50,7 +50,7 @@ public class LanguageComboBox extends JComboBox {
// guess favorite languages // guess favorite languages
if (getModel().favorites().isEmpty()) { if (getModel().favorites().isEmpty()) {
for (Locale locale : new Locale[] { Locale.getDefault(), Locale.ENGLISH }) { for (Locale locale : new Locale[] { Locale.ENGLISH, Locale.getDefault() }) {
getModel().favorites().add(getLanguage(locale.getLanguage())); getModel().favorites().add(getLanguage(locale.getLanguage()));
} }
} }

View File

@ -11,6 +11,10 @@ import java.io.IOException;
import java.net.URL; import java.net.URL;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.temporal.ChronoField;
import java.time.temporal.TemporalAccessor;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.EnumMap; import java.util.EnumMap;
@ -22,6 +26,8 @@ import java.util.Map.Entry;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.function.Function; import java.util.function.Function;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -211,18 +217,16 @@ public class OMDbClient implements MovieIdentificationService {
fields.put(MovieProperty.poster_path, data.get("poster")); fields.put(MovieProperty.poster_path, data.get("poster"));
// convert release date to yyyy-MM-dd // convert release date to yyyy-MM-dd
SimpleDate released = SimpleDate.parse(data.get("released"), "dd MMM yyyy"); SimpleDate release = parsePartialDate(data.get("released"), "d MMM yyyy");
if (released != null) { if (release == null) {
fields.put(MovieProperty.release_date, released.format("yyyy-MM-dd")); release = parsePartialDate(data.get("released"), "yyyy");
} else {
SimpleDate year = SimpleDate.parse(data.get("year"), "yyyy");
if (year != null) {
fields.put(MovieProperty.release_date, year.format("yyyy-MM-dd"));
} }
if (release != null) {
fields.put(MovieProperty.release_date, release.toString());
} }
// convert lists
Pattern delim = Pattern.compile(","); Pattern delim = Pattern.compile(",");
List<String> genres = split(delim, data.get("genre"), String::toString); List<String> genres = split(delim, data.get("genre"), String::toString);
List<String> languages = split(delim, data.get("language"), String::toString); List<String> languages = split(delim, data.get("language"), String::toString);
@ -234,10 +238,29 @@ public class OMDbClient implements MovieIdentificationService {
return new MovieInfo(fields, emptyList(), genres, emptyMap(), languages, emptyList(), emptyList(), actors, emptyList()); return new MovieInfo(fields, emptyList(), genres, emptyMap(), languages, emptyList(), emptyList(), actors, emptyList());
} }
/**
 * Parse a date value that may consist of a full date or only a year.
 *
 * @param value
 *            date string, may be null or empty
 * @param format
 *            DateTimeFormatter pattern (interpreted with Locale.ENGLISH)
 * @return the parsed date, with month and day set to 1 if the pattern does not yield both of them,
 *         or null if the value is null, empty, cannot be parsed or does not yield a year
 */
private SimpleDate parsePartialDate(String value, String format) {
	// nothing to parse
	if (value == null || value.isEmpty()) {
		return null;
	}

	try {
		TemporalAccessor fields = DateTimeFormatter.ofPattern(format, Locale.ENGLISH).parse(value);

		// a date without year is of no use
		if (!fields.isSupported(ChronoField.YEAR)) {
			return null;
		}

		boolean complete = fields.isSupported(ChronoField.MONTH_OF_YEAR) && fields.isSupported(ChronoField.DAY_OF_MONTH);
		if (!complete) {
			// year only: default to January 1st
			return new SimpleDate(fields.get(ChronoField.YEAR), 1, 1);
		}

		return new SimpleDate(fields.get(ChronoField.YEAR), fields.get(ChronoField.MONTH_OF_YEAR), fields.get(ChronoField.DAY_OF_MONTH));
	} catch (DateTimeParseException e) {
		Logger.getLogger(OMDbClient.class.getName()).log(Level.WARNING, String.format("Bad date: %s: %s", value, e.getMessage()));
	}

	return null;
}
private <T> List<T> split(Pattern regex, String value, Function<String, T> toObject) { private <T> List<T> split(Pattern regex, String value, Function<String, T> toObject) {
if (value == null || value.isEmpty()) if (value == null || value.isEmpty())
return emptyList(); return emptyList();
return regex.splitAsStream(value).map(String::trim).filter(s -> !s.equals("N/A")).map(toObject).collect(toList()); return regex.splitAsStream(value).map(String::trim).filter(s -> !s.equals("N/A")).map(toObject).collect(toList());
} }
} }

View File

@ -0,0 +1,14 @@
package net.filebot.media;
import static org.junit.Assert.*;
import org.junit.Test;
public class MediaDetectionTest {

	/**
	 * Only integers that look like a plausible movie year are expected to be returned.
	 */
	@Test
	public void parseMovieYear() {
		String avatar = MediaDetection.parseMovieYear("Avatar 2009 2100").toString();
		String sissi = MediaDetection.parseMovieYear("1898 Sissi 1955").toString();

		// 2100 is expected to be dropped
		assertEquals("[2009]", avatar);

		// 1898 is expected to be dropped
		assertEquals("[1955]", sissi);
	}

}

View File

@ -0,0 +1,38 @@
package net.filebot.similarity;
import static org.junit.Assert.*;
import java.util.Locale;
import org.junit.Test;
public class DateMatcherTest {

	DateMatcher m = new DateMatcher(Locale.ROOT, DateMatcher.DEFAULT_SANITY);

	/**
	 * Match the given input and return the string form of the resulting date.
	 */
	private String matchToString(String input) {
		return m.match(input).toString();
	}

	@Test
	public void parse() {
		// numeric dates, year first
		assertEquals("2010-10-24", matchToString("2010-10-24"));
		assertEquals("2009-06-01", matchToString("2009/6/1"));

		// numeric dates, day first
		assertEquals("2010-01-01", matchToString("1.1.2010"));
		assertEquals("2010-06-01", matchToString("01/06/2010"));

		// month names, year first
		assertEquals("2015-10-05", matchToString("2015.October.05"));
		assertEquals("2015-10-06", matchToString("2015.Oct.6"));

		// month names, day first
		assertEquals("2014-07-25", matchToString("25 July 2014"));
		assertEquals("2015-09-08", matchToString("8 Sep 2015"));

		// compact 8-digit date
		assertEquals("2014-04-08", matchToString("20140408"));
	}

	@Test
	public void parseIllegal() {
		// day 32 is expected to yield no match
		assertNull(m.match("2000-01-32"));

		// a 9-digit number is expected to yield no match
		assertNull(m.match("123456789"));
	}

	@Test
	public void sanity() {
		assertNull(m.match("1911-01-01")); // too low
		assertNull(m.match("2099-01-01")); // too high
	}

}

View File

@ -1,16 +1,14 @@
package net.filebot.similarity; package net.filebot.similarity;
import static org.junit.Assert.*; import static org.junit.Assert.*;
import org.junit.Test; import java.util.Locale;
import org.junit.Test;
public class DateMetricTest { public class DateMetricTest {
private static DateMetric metric = new DateMetric(); DateMetric metric = new DateMetric(new DateMatcher(Locale.ROOT, DateMatcher.DEFAULT_SANITY));
@Test @Test
public void getSimilarity() { public void getSimilarity() {