From f7fdc5b5db8e984dd1a773067ab880cb6e80e248 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Mon, 2 Feb 2009 20:50:04 +0000 Subject: [PATCH] * improved SeriesNameMatcher * slightly modified season episode regex * added ehcache to fatjar build --- build.xml | 4 + .../sourceforge/filebot/FileBotUtilities.java | 10 +- .../similarity/SeasonEpisodeMatcher.java | 2 +- .../filebot/similarity/SeriesNameMatcher.java | 141 ++++++++++++++---- .../sourceforge/filebot/ui/FileBotWindow.java | 7 +- .../list/FileListTransferablePolicy.java | 10 +- .../panel/rename/AutoEpisodeListMatcher.java | 13 +- .../similarity/SeriesNameMatcherTest.java | 11 +- 8 files changed, 147 insertions(+), 51 deletions(-) diff --git a/build.xml b/build.xml index 19f338cb..b3944563 100644 --- a/build.xml +++ b/build.xml @@ -92,6 +92,10 @@ + + + + diff --git a/source/net/sourceforge/filebot/FileBotUtilities.java b/source/net/sourceforge/filebot/FileBotUtilities.java index c5e0399e..18d92dfa 100644 --- a/source/net/sourceforge/filebot/FileBotUtilities.java +++ b/source/net/sourceforge/filebot/FileBotUtilities.java @@ -2,12 +2,14 @@ package net.sourceforge.filebot; +import java.io.File; import java.io.FileFilter; import java.util.AbstractList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import net.sourceforge.tuned.FileUtilities; import net.sourceforge.tuned.FileUtilities.ExtensionFileFilter; @@ -63,10 +65,6 @@ public final class FileBotUtilities { public static String join(Object[] values, String separator) { - if (values == null) { - return null; - } - StringBuilder sb = new StringBuilder(); for (int i = 0; i < values.length; i++) { @@ -81,12 +79,12 @@ public final class FileBotUtilities { } - public static List asStringList(final List list) { + public static List asFileNameList(final List list) { return new AbstractList() { @Override public String get(int index) { - return list.get(index).toString(); + return FileUtilities.getName(list.get(index)); } diff --git a/source/net/sourceforge/filebot/similarity/SeasonEpisodeMatcher.java b/source/net/sourceforge/filebot/similarity/SeasonEpisodeMatcher.java index 59c2c6dc..50499dbf 100644 --- a/source/net/sourceforge/filebot/similarity/SeasonEpisodeMatcher.java +++ b/source/net/sourceforge/filebot/similarity/SeasonEpisodeMatcher.java @@ -23,7 +23,7 @@ public class SeasonEpisodeMatcher { patterns[1] = new SeasonEpisodePattern("(? matchAll(List names) { + public String match(File file) { + return match(file.getName(), file.getParent()); + } + + + public Collection matchAll(File... files) { SeriesNameCollection seriesNames = new SeriesNameCollection(); - // use pattern matching with frequency threshold + // group files by parent folder + for (Entry entry : mapNamesByFolder(files).entrySet()) { + String parent = entry.getKey().getName(); + String[] names = entry.getValue(); + + for (String nameMatch : matchAll(names)) { + String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent); + + // prefer common match, but use name match if there is no matching word sequence + seriesNames.add(commonMatch != null ? commonMatch : nameMatch); + } + } + + return seriesNames; + } + + + public Collection matchAll(String... names) { + SeriesNameCollection seriesNames = new SeriesNameCollection(); + + // 1. use pattern matching with frequency threshold seriesNames.addAll(flatMatchAll(names)); - // deep match common word sequences + // 2. match common word sequences seriesNames.addAll(deepMatchAll(names)); return seriesNames; @@ -49,11 +78,11 @@ public class SeriesNameMatcher { /** * Try to match and verify all series names using known season episode patterns. * - * @param names list of episode names - * @return series names that have been matched one or multiple times depending on the size - * of the given list + * @param names episode names + * @return series names that have been matched one or multiple times depending on the + * threshold */ - protected Collection flatMatchAll(Iterable names) { + private Collection flatMatchAll(String[] names) { ThresholdCollection seriesNames = new ThresholdCollection(threshold, String.CASE_INSENSITIVE_ORDER); for (String name : names) { @@ -74,9 +103,9 @@ public class SeriesNameMatcher { * @param names list of episode names * @return all common word sequences that have been found */ - protected Collection deepMatchAll(List names) { - // don't use common word sequence matching for less than 5 names - if (names.size() < threshold) { + private Collection deepMatchAll(String[] names) { + // can't use common word sequence matching for less than 2 names + if (names.length < 2 || names.length < threshold) { return Collections.emptySet(); } @@ -90,23 +119,44 @@ public class SeriesNameMatcher { // recursive divide and conquer List results = new ArrayList(); - if (names.size() >= 2) { - // split list in two and try to match common word sequence on those - results.addAll(deepMatchAll(names.subList(0, names.size() / 2))); - results.addAll(deepMatchAll(names.subList(names.size() / 2, names.size()))); - } + // split list in two and try to match common word sequence on those + results.addAll(deepMatchAll(Arrays.copyOfRange(names, 0, names.length / 2))); + results.addAll(deepMatchAll(Arrays.copyOfRange(names, names.length / 2, names.length))); return results; } + /** + * Match series name using season episode pattern and then try to find a common word + * sequence between the first match and the given parent. + * + * @param name episode name + * @param parent a string that contains the series name + * @return a likely series name + */ + public String match(String name, String parent) { + String nameMatch = matchBySeasonEpisodePattern(name); + + if (nameMatch != null) { + String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent); + + if (commonMatch != null) { + return commonMatch; + } + } + + return nameMatch; + } + + /** * Try to match a series name from the given episode name using known season episode * patterns. * * @param name episode name * @return a substring of the given name that ends before the first occurrence of a season - * episode pattern, or null + * episode pattern, or null if there is no such pattern */ public String matchBySeasonEpisodePattern(String name) { int seasonEpisodePosition = seasonEpisodeMatcher.find(name); @@ -126,10 +176,9 @@ public class SeriesNameMatcher { * @param names various episode names (5 or more for accurate results) * @return a word sequence all episode names have in common, or null */ - public String matchByFirstCommonWordSequence(Collection names) { - if (names.size() <= 1) { - // can't match common sequence from less than two names - return null; + public String matchByFirstCommonWordSequence(String... names) { + if (names.length < 2) { + throw new IllegalArgumentException("Can't match common sequence from less than two names"); } String[] common = null; @@ -151,14 +200,19 @@ public class SeriesNameMatcher { } } - // join will return null, if common is null + if (common == null) + return null; + return join(common, " "); } protected String normalize(String name) { - // remove group names (remove any [...]) - name = name.replaceAll("\\[[^\\]]+\\]", ""); + // normalize brackets, convert (...) to [...] + name = name.replace('(', '[').replace(')', ']'); + + // remove group names, any [...] + name = name.replaceAll("\\[[^\\[]+\\]", ""); // remove special characters name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " "); @@ -195,6 +249,33 @@ public class SeriesNameMatcher { return null; } + + private Map mapNamesByFolder(File... files) { + Map> filesByFolder = new LinkedHashMap>(); + + for (File file : files) { + File folder = file.getParentFile(); + + List list = filesByFolder.get(folder); + + if (list == null) { + list = new ArrayList(); + filesByFolder.put(folder, list); + } + + list.add(file); + } + + // convert folder->files map to folder->names map + Map namesByFolder = new LinkedHashMap(); + + for (Entry> entry : filesByFolder.entrySet()) { + namesByFolder.put(entry.getKey(), FileBotUtilities.asFileNameList(entry.getValue()).toArray(new String[0])); + } + + return namesByFolder; + } + protected static class SeriesNameCollection extends AbstractCollection { @@ -272,30 +353,30 @@ public class SeriesNameMatcher { @Override - public boolean add(E e) { - Collection buffer = limbo.get(e); + public boolean add(E value) { + Collection buffer = limbo.get(value); if (buffer == null) { // initialize buffer buffer = new ArrayList(threshold); - limbo.put(e, buffer); + limbo.put(value, buffer); } if (buffer == heaven) { // threshold reached - heaven.add(e); + heaven.add(value); return true; } // add element to buffer - buffer.add(e); + buffer.add(value); // check if threshold has been reached if (buffer.size() >= threshold) { heaven.addAll(buffer); // replace buffer with heaven - limbo.put(e, heaven); + limbo.put(value, heaven); return true; } diff --git a/source/net/sourceforge/filebot/ui/FileBotWindow.java b/source/net/sourceforge/filebot/ui/FileBotWindow.java index b44ca156..21ac4fc7 100644 --- a/source/net/sourceforge/filebot/ui/FileBotWindow.java +++ b/source/net/sourceforge/filebot/ui/FileBotWindow.java @@ -2,7 +2,6 @@ package net.sourceforge.filebot.ui; -import static net.sourceforge.filebot.FileBotUtilities.asStringList; import static net.sourceforge.filebot.Settings.getApplicationName; import java.awt.BorderLayout; @@ -64,10 +63,10 @@ public class FileBotWindow extends JFrame implements ListSelectionListener { setSize(760, 615); - // restore the panel selection from last time, + //TODO restore the panel selection from last time, // switch to EpisodeListPanel by default (e.g. first start) - int selectedPanel = asStringList(panelSelectionList.getPanelModel()).indexOf(Settings.userRoot().get("selectedPanel")); - panelSelectionList.setSelectedIndex(selectedPanel); + // int selectedPanel = asStringList(panelSelectionList.getPanelModel()).indexOf(Settings.userRoot().get("selectedPanel")); + // panelSelectionList.setSelectedIndex(selectedPanel); // connect message handlers to message bus MessageBus.getDefault().addMessageHandler("panel", panelSelectMessageHandler); diff --git a/source/net/sourceforge/filebot/ui/panel/list/FileListTransferablePolicy.java b/source/net/sourceforge/filebot/ui/panel/list/FileListTransferablePolicy.java index c881e85f..e6932e41 100644 --- a/source/net/sourceforge/filebot/ui/panel/list/FileListTransferablePolicy.java +++ b/source/net/sourceforge/filebot/ui/panel/list/FileListTransferablePolicy.java @@ -9,10 +9,12 @@ import static net.sourceforge.tuned.FileUtilities.containsOnly; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; +import net.sourceforge.filebot.FileBotUtilities; import net.sourceforge.filebot.torrent.Torrent; import net.sourceforge.filebot.ui.FileBotList; import net.sourceforge.filebot.ui.transfer.FileTransferablePolicy; @@ -51,9 +53,7 @@ class FileListTransferablePolicy extends FileTransferablePolicy { } else if (containsOnly(files, TORRENT_FILES)) { loadTorrents(files); } else { - for (File file : files) { - list.getModel().add(FileUtilities.getName(file)); - } + list.getModel().addAll(FileBotUtilities.asFileNameList(files)); } } @@ -65,9 +65,7 @@ class FileListTransferablePolicy extends FileTransferablePolicy { } for (File folder : folders) { - for (File file : folder.listFiles()) { - list.getModel().add(FileUtilities.getName(file)); - } + list.getModel().addAll(FileBotUtilities.asFileNameList(Arrays.asList(folder.listFiles()))); } } diff --git a/source/net/sourceforge/filebot/ui/panel/rename/AutoEpisodeListMatcher.java b/source/net/sourceforge/filebot/ui/panel/rename/AutoEpisodeListMatcher.java index c70fac99..64e72b2b 100644 --- a/source/net/sourceforge/filebot/ui/panel/rename/AutoEpisodeListMatcher.java +++ b/source/net/sourceforge/filebot/ui/panel/rename/AutoEpisodeListMatcher.java @@ -3,10 +3,10 @@ package net.sourceforge.filebot.ui.panel.rename; import static net.sourceforge.filebot.FileBotUtilities.SUBTITLE_FILES; -import static net.sourceforge.filebot.FileBotUtilities.asStringList; import static net.sourceforge.filebot.web.Episode.formatEpisodeNumbers; import static net.sourceforge.tuned.FileUtilities.FILES; +import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -52,9 +52,16 @@ class AutoEpisodeListMatcher extends SwingWorker> protected Collection matchSeriesNames(List episodes) { - int threshold = Math.min(episodes.size(), 5); + File[] files = new File[episodes.size()]; - return new SeriesNameMatcher(threshold).matchAll(asStringList(episodes)); + for (int i = 0; i < files.length; i++) { + files[i] = episodes.get(i).getFile(); + } + + // allow matching of a small number of episodes, by setting threshold = length if length < 5 + int threshold = Math.min(files.length, 5); + + return new SeriesNameMatcher(threshold).matchAll(files); } diff --git a/test/net/sourceforge/filebot/similarity/SeriesNameMatcherTest.java b/test/net/sourceforge/filebot/similarity/SeriesNameMatcherTest.java index a5be70c7..087e4f88 100644 --- a/test/net/sourceforge/filebot/similarity/SeriesNameMatcherTest.java +++ b/test/net/sourceforge/filebot/similarity/SeriesNameMatcherTest.java @@ -15,6 +15,12 @@ public class SeriesNameMatcherTest { private static SeriesNameMatcher matcher = new SeriesNameMatcher(5); + @Test + public void match() { + assertEquals("Test Series", matcher.match("My Test Series - 1x01", "Test Series - Season 1")); + } + + @Test public void matchBeforeSeasonEpisodePattern() { assertEquals("The Test", matcher.matchBySeasonEpisodePattern("The Test - 1x01")); @@ -30,7 +36,10 @@ public class SeriesNameMatcherTest { assertEquals("The Test", matcher.normalize("_The_Test_-_ ...")); // brackets - assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [@Monkey]")); + assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [#Monkey]")); + + // invalid brackets + assertEquals("strawhat Luffy", matcher.normalize("(strawhat [Luffy (#Monkey)")); }