* improved SeriesNameMatcher

* slightly modified season episode regex
* added ehcache to fatjar build
This commit is contained in:
Reinhard Pointner 2009-02-02 20:50:04 +00:00
parent 9fd13dceae
commit f7fdc5b5db
8 changed files with 147 additions and 51 deletions

View File

@ -92,6 +92,10 @@
<include name="**/*.class" /> <include name="**/*.class" />
<include name="**/*.properties" /> <include name="**/*.properties" />
</zipfileset> </zipfileset>
<zipfileset src="${dir.lib}/ehcache.jar">
<include name="net/sf/ehcache/**" />
</zipfileset>
</jar> </jar>
</target> </target>

View File

@ -2,12 +2,14 @@
package net.sourceforge.filebot; package net.sourceforge.filebot;
import java.io.File;
import java.io.FileFilter; import java.io.FileFilter;
import java.util.AbstractList; import java.util.AbstractList;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.sourceforge.tuned.FileUtilities;
import net.sourceforge.tuned.FileUtilities.ExtensionFileFilter; import net.sourceforge.tuned.FileUtilities.ExtensionFileFilter;
@ -63,10 +65,6 @@ public final class FileBotUtilities {
public static String join(Object[] values, String separator) { public static String join(Object[] values, String separator) {
if (values == null) {
return null;
}
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (int i = 0; i < values.length; i++) { for (int i = 0; i < values.length; i++) {
@ -81,12 +79,12 @@ public final class FileBotUtilities {
} }
public static List<String> asStringList(final List<?> list) { public static List<String> asFileNameList(final List<File> list) {
return new AbstractList<String>() { return new AbstractList<String>() {
@Override @Override
public String get(int index) { public String get(int index) {
return list.get(index).toString(); return FileUtilities.getName(list.get(index));
} }

View File

@ -23,7 +23,7 @@ public class SeasonEpisodeMatcher {
patterns[1] = new SeasonEpisodePattern("(?<!\\p{Alnum})(\\d{1,2})x(\\d{1,3})(?!\\p{Digit})"); patterns[1] = new SeasonEpisodePattern("(?<!\\p{Alnum})(\\d{1,2})x(\\d{1,3})(?!\\p{Digit})");
// match patterns like 01, 102, 1003 (enclosed in separators) // match patterns like 01, 102, 1003 (enclosed in separators)
patterns[2] = new SeasonEpisodePattern("(?<=^|[\\._ ])([0-2]?\\d?)(\\d{2})(?=[\\._ ]|$)"); patterns[2] = new SeasonEpisodePattern("(?<=^|[\\._ ])([0-1]?\\d?)(\\d{2})(?=[\\._ ]|$)");
} }

View File

@ -4,6 +4,7 @@ package net.sourceforge.filebot.similarity;
import static net.sourceforge.filebot.FileBotUtilities.join; import static net.sourceforge.filebot.FileBotUtilities.join;
import java.io.File;
import java.util.AbstractCollection; import java.util.AbstractCollection;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -16,6 +17,9 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Scanner; import java.util.Scanner;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.Map.Entry;
import net.sourceforge.filebot.FileBotUtilities;
public class SeriesNameMatcher { public class SeriesNameMatcher {
@ -26,20 +30,45 @@ public class SeriesNameMatcher {
public SeriesNameMatcher(int threshold) { public SeriesNameMatcher(int threshold) {
if (threshold <= 0) if (threshold < 0)
throw new IllegalArgumentException("threshold must be greater than 0"); throw new IllegalArgumentException("threshold must be greater than 0");
this.threshold = threshold; this.threshold = threshold;
} }
public Collection<String> matchAll(List<String> names) { public String match(File file) {
return match(file.getName(), file.getParent());
}
public Collection<String> matchAll(File... files) {
SeriesNameCollection seriesNames = new SeriesNameCollection(); SeriesNameCollection seriesNames = new SeriesNameCollection();
// use pattern matching with frequency threshold // group files by parent folder
for (Entry<File, String[]> entry : mapNamesByFolder(files).entrySet()) {
String parent = entry.getKey().getName();
String[] names = entry.getValue();
for (String nameMatch : matchAll(names)) {
String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent);
// prefer common match, but use name match if there is no matching word sequence
seriesNames.add(commonMatch != null ? commonMatch : nameMatch);
}
}
return seriesNames;
}
public Collection<String> matchAll(String... names) {
SeriesNameCollection seriesNames = new SeriesNameCollection();
// 1. use pattern matching with frequency threshold
seriesNames.addAll(flatMatchAll(names)); seriesNames.addAll(flatMatchAll(names));
// deep match common word sequences // 2. match common word sequences
seriesNames.addAll(deepMatchAll(names)); seriesNames.addAll(deepMatchAll(names));
return seriesNames; return seriesNames;
@ -49,11 +78,11 @@ public class SeriesNameMatcher {
/** /**
* Try to match and verify all series names using known season episode patterns. * Try to match and verify all series names using known season episode patterns.
* *
* @param names list of episode names * @param names episode names
* @return series names that have been matched one or multiple times depending on the size * @return series names that have been matched one or multiple times depending on the
* of the given list * threshold
*/ */
protected Collection<String> flatMatchAll(Iterable<String> names) { private Collection<String> flatMatchAll(String[] names) {
ThresholdCollection<String> seriesNames = new ThresholdCollection<String>(threshold, String.CASE_INSENSITIVE_ORDER); ThresholdCollection<String> seriesNames = new ThresholdCollection<String>(threshold, String.CASE_INSENSITIVE_ORDER);
for (String name : names) { for (String name : names) {
@ -74,9 +103,9 @@ public class SeriesNameMatcher {
* @param names list of episode names * @param names list of episode names
* @return all common word sequences that have been found * @return all common word sequences that have been found
*/ */
protected Collection<String> deepMatchAll(List<String> names) { private Collection<String> deepMatchAll(String[] names) {
// don't use common word sequence matching for less than 5 names // can't use common word sequence matching for less than 2 names
if (names.size() < threshold) { if (names.length < 2 || names.length < threshold) {
return Collections.emptySet(); return Collections.emptySet();
} }
@ -90,23 +119,44 @@ public class SeriesNameMatcher {
// recursive divide and conquer // recursive divide and conquer
List<String> results = new ArrayList<String>(); List<String> results = new ArrayList<String>();
if (names.size() >= 2) {
// split list in two and try to match common word sequence on those // split list in two and try to match common word sequence on those
results.addAll(deepMatchAll(names.subList(0, names.size() / 2))); results.addAll(deepMatchAll(Arrays.copyOfRange(names, 0, names.length / 2)));
results.addAll(deepMatchAll(names.subList(names.size() / 2, names.size()))); results.addAll(deepMatchAll(Arrays.copyOfRange(names, names.length / 2, names.length)));
}
return results; return results;
} }
/**
* Match series name using season episode pattern and then try to find a common word
* sequence between the first match and the given parent.
*
* @param name episode name
* @param parent a string that contains the series name
* @return a likely series name
*/
public String match(String name, String parent) {
	// candidate series name: whatever the season episode pattern lookup yields for the name
	final String candidate = matchBySeasonEpisodePattern(name);

	if (candidate == null) {
		// no season episode pattern in the name, so there is nothing to refine
		return null;
	}

	// prefer the word sequence shared with the parent, fall back to the plain candidate
	final String shared = matchByFirstCommonWordSequence(candidate, parent);
	return shared != null ? shared : candidate;
}
/** /**
* Try to match a series name from the given episode name using known season episode * Try to match a series name from the given episode name using known season episode
* patterns. * patterns.
* *
* @param name episode name * @param name episode name
* @return a substring of the given name that ends before the first occurrence of a season * @return a substring of the given name that ends before the first occurrence of a season
* episode pattern, or null * episode pattern, or null if there is no such pattern
*/ */
public String matchBySeasonEpisodePattern(String name) { public String matchBySeasonEpisodePattern(String name) {
int seasonEpisodePosition = seasonEpisodeMatcher.find(name); int seasonEpisodePosition = seasonEpisodeMatcher.find(name);
@ -126,10 +176,9 @@ public class SeriesNameMatcher {
* @param names various episode names (5 or more for accurate results) * @param names various episode names (5 or more for accurate results)
* @return a word sequence all episode names have in common, or null * @return a word sequence all episode names have in common, or null
*/ */
public String matchByFirstCommonWordSequence(Collection<String> names) { public String matchByFirstCommonWordSequence(String... names) {
if (names.size() <= 1) { if (names.length < 2) {
// can't match common sequence from less than two names throw new IllegalArgumentException("Can't match common sequence from less than two names");
return null;
} }
String[] common = null; String[] common = null;
@ -151,14 +200,19 @@ public class SeriesNameMatcher {
} }
} }
// join will return null, if common is null if (common == null)
return null;
return join(common, " "); return join(common, " ");
} }
protected String normalize(String name) { protected String normalize(String name) {
// remove group names (remove any [...]) // normalize brackets, convert (...) to [...]
name = name.replaceAll("\\[[^\\]]+\\]", ""); name = name.replace('(', '[').replace(')', ']');
// remove group names, any [...]
name = name.replaceAll("\\[[^\\[]+\\]", "");
// remove special characters // remove special characters
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " "); name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
@ -196,6 +250,33 @@ public class SeriesNameMatcher {
} }
private Map<File, String[]> mapNamesByFolder(File... files) {
	// first pass: bucket the files by their parent folder (insertion order is kept)
	Map<File, List<File>> buckets = new LinkedHashMap<File, List<File>>();

	for (int i = 0; i < files.length; i++) {
		File parent = files[i].getParentFile();

		if (!buckets.containsKey(parent)) {
			// first file seen for this folder
			buckets.put(parent, new ArrayList<File>());
		}

		buckets.get(parent).add(files[i]);
	}

	// second pass: turn each bucket of files into an array of file names
	Map<File, String[]> result = new LinkedHashMap<File, String[]>();

	for (Entry<File, List<File>> bucket : buckets.entrySet()) {
		List<String> names = FileBotUtilities.asFileNameList(bucket.getValue());
		result.put(bucket.getKey(), names.toArray(new String[0]));
	}

	return result;
}
protected static class SeriesNameCollection extends AbstractCollection<String> { protected static class SeriesNameCollection extends AbstractCollection<String> {
private final Map<String, String> data = new LinkedHashMap<String, String>(); private final Map<String, String> data = new LinkedHashMap<String, String>();
@ -272,30 +353,30 @@ public class SeriesNameMatcher {
@Override @Override
public boolean add(E e) { public boolean add(E value) {
Collection<E> buffer = limbo.get(e); Collection<E> buffer = limbo.get(value);
if (buffer == null) { if (buffer == null) {
// initialize buffer // initialize buffer
buffer = new ArrayList<E>(threshold); buffer = new ArrayList<E>(threshold);
limbo.put(e, buffer); limbo.put(value, buffer);
} }
if (buffer == heaven) { if (buffer == heaven) {
// threshold reached // threshold reached
heaven.add(e); heaven.add(value);
return true; return true;
} }
// add element to buffer // add element to buffer
buffer.add(e); buffer.add(value);
// check if threshold has been reached // check if threshold has been reached
if (buffer.size() >= threshold) { if (buffer.size() >= threshold) {
heaven.addAll(buffer); heaven.addAll(buffer);
// replace buffer with heaven // replace buffer with heaven
limbo.put(e, heaven); limbo.put(value, heaven);
return true; return true;
} }

View File

@ -2,7 +2,6 @@
package net.sourceforge.filebot.ui; package net.sourceforge.filebot.ui;
import static net.sourceforge.filebot.FileBotUtilities.asStringList;
import static net.sourceforge.filebot.Settings.getApplicationName; import static net.sourceforge.filebot.Settings.getApplicationName;
import java.awt.BorderLayout; import java.awt.BorderLayout;
@ -64,10 +63,10 @@ public class FileBotWindow extends JFrame implements ListSelectionListener {
setSize(760, 615); setSize(760, 615);
// restore the panel selection from last time, //TODO restore the panel selection from last time,
// switch to EpisodeListPanel by default (e.g. first start) // switch to EpisodeListPanel by default (e.g. first start)
int selectedPanel = asStringList(panelSelectionList.getPanelModel()).indexOf(Settings.userRoot().get("selectedPanel")); // int selectedPanel = asStringList(panelSelectionList.getPanelModel()).indexOf(Settings.userRoot().get("selectedPanel"));
panelSelectionList.setSelectedIndex(selectedPanel); // panelSelectionList.setSelectedIndex(selectedPanel);
// connect message handlers to message bus // connect message handlers to message bus
MessageBus.getDefault().addMessageHandler("panel", panelSelectMessageHandler); MessageBus.getDefault().addMessageHandler("panel", panelSelectMessageHandler);

View File

@ -9,10 +9,12 @@ import static net.sourceforge.tuned.FileUtilities.containsOnly;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import net.sourceforge.filebot.FileBotUtilities;
import net.sourceforge.filebot.torrent.Torrent; import net.sourceforge.filebot.torrent.Torrent;
import net.sourceforge.filebot.ui.FileBotList; import net.sourceforge.filebot.ui.FileBotList;
import net.sourceforge.filebot.ui.transfer.FileTransferablePolicy; import net.sourceforge.filebot.ui.transfer.FileTransferablePolicy;
@ -51,9 +53,7 @@ class FileListTransferablePolicy extends FileTransferablePolicy {
} else if (containsOnly(files, TORRENT_FILES)) { } else if (containsOnly(files, TORRENT_FILES)) {
loadTorrents(files); loadTorrents(files);
} else { } else {
for (File file : files) { list.getModel().addAll(FileBotUtilities.asFileNameList(files));
list.getModel().add(FileUtilities.getName(file));
}
} }
} }
@ -65,9 +65,7 @@ class FileListTransferablePolicy extends FileTransferablePolicy {
} }
for (File folder : folders) { for (File folder : folders) {
for (File file : folder.listFiles()) { list.getModel().addAll(FileBotUtilities.asFileNameList(Arrays.asList(folder.listFiles())));
list.getModel().add(FileUtilities.getName(file));
}
} }
} }

View File

@ -3,10 +3,10 @@ package net.sourceforge.filebot.ui.panel.rename;
import static net.sourceforge.filebot.FileBotUtilities.SUBTITLE_FILES; import static net.sourceforge.filebot.FileBotUtilities.SUBTITLE_FILES;
import static net.sourceforge.filebot.FileBotUtilities.asStringList;
import static net.sourceforge.filebot.web.Episode.formatEpisodeNumbers; import static net.sourceforge.filebot.web.Episode.formatEpisodeNumbers;
import static net.sourceforge.tuned.FileUtilities.FILES; import static net.sourceforge.tuned.FileUtilities.FILES;
import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
@ -52,9 +52,16 @@ class AutoEpisodeListMatcher extends SwingWorker<List<Match<FileEntry, Episode>>
protected Collection<String> matchSeriesNames(List<FileEntry> episodes) { protected Collection<String> matchSeriesNames(List<FileEntry> episodes) {
int threshold = Math.min(episodes.size(), 5); File[] files = new File[episodes.size()];
return new SeriesNameMatcher(threshold).matchAll(asStringList(episodes)); for (int i = 0; i < files.length; i++) {
files[i] = episodes.get(i).getFile();
}
// allow matching of a small number of episodes, by setting threshold = length if length < 5
int threshold = Math.min(files.length, 5);
return new SeriesNameMatcher(threshold).matchAll(files);
} }

View File

@ -15,6 +15,12 @@ public class SeriesNameMatcherTest {
private static SeriesNameMatcher matcher = new SeriesNameMatcher(5); private static SeriesNameMatcher matcher = new SeriesNameMatcher(5);
// Exercises match(name, parent): the episode name carries a 1x01 season episode pattern,
// and the expected result is the word sequence the name shares with the parent string.
@Test
public void match() {
assertEquals("Test Series", matcher.match("My Test Series - 1x01", "Test Series - Season 1"));
}
@Test @Test
public void matchBeforeSeasonEpisodePattern() { public void matchBeforeSeasonEpisodePattern() {
assertEquals("The Test", matcher.matchBySeasonEpisodePattern("The Test - 1x01")); assertEquals("The Test", matcher.matchBySeasonEpisodePattern("The Test - 1x01"));
@ -30,7 +36,10 @@ public class SeriesNameMatcherTest {
assertEquals("The Test", matcher.normalize("_The_Test_-_ ...")); assertEquals("The Test", matcher.normalize("_The_Test_-_ ..."));
// brackets // brackets
assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [@Monkey]")); assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [#Monkey]"));
// invalid brackets
assertEquals("strawhat Luffy", matcher.normalize("(strawhat [Luffy (#Monkey)"));
} }