* improved SeriesNameMatcher

* slightly modified season episode regex
* added ehcache to fatjar build
This commit is contained in:
Reinhard Pointner 2009-02-02 20:50:04 +00:00
parent 9fd13dceae
commit f7fdc5b5db
8 changed files with 147 additions and 51 deletions

View File

@ -92,6 +92,10 @@
<include name="**/*.class" />
<include name="**/*.properties" />
</zipfileset>
<zipfileset src="${dir.lib}/ehcache.jar">
<include name="net/sf/ehcache/**" />
</zipfileset>
</jar>
</target>

View File

@ -2,12 +2,14 @@
package net.sourceforge.filebot;
import java.io.File;
import java.io.FileFilter;
import java.util.AbstractList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sourceforge.tuned.FileUtilities;
import net.sourceforge.tuned.FileUtilities.ExtensionFileFilter;
@ -63,10 +65,6 @@ public final class FileBotUtilities {
public static String join(Object[] values, String separator) {
if (values == null) {
return null;
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < values.length; i++) {
@ -81,12 +79,12 @@ public final class FileBotUtilities {
}
public static List<String> asStringList(final List<?> list) {
public static List<String> asFileNameList(final List<File> list) {
return new AbstractList<String>() {
@Override
public String get(int index) {
return list.get(index).toString();
return FileUtilities.getName(list.get(index));
}

View File

@ -23,7 +23,7 @@ public class SeasonEpisodeMatcher {
patterns[1] = new SeasonEpisodePattern("(?<!\\p{Alnum})(\\d{1,2})x(\\d{1,3})(?!\\p{Digit})");
// match patterns like 01, 102, 1003 (enclosed in separators)
patterns[2] = new SeasonEpisodePattern("(?<=^|[\\._ ])([0-2]?\\d?)(\\d{2})(?=[\\._ ]|$)");
patterns[2] = new SeasonEpisodePattern("(?<=^|[\\._ ])([0-1]?\\d?)(\\d{2})(?=[\\._ ]|$)");
}

View File

@ -4,6 +4,7 @@ package net.sourceforge.filebot.similarity;
import static net.sourceforge.filebot.FileBotUtilities.join;
import java.io.File;
import java.util.AbstractCollection;
import java.util.ArrayList;
import java.util.Arrays;
@ -16,6 +17,9 @@ import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.Map.Entry;
import net.sourceforge.filebot.FileBotUtilities;
public class SeriesNameMatcher {
@ -26,20 +30,45 @@ public class SeriesNameMatcher {
/**
 * Create a new matcher using the given threshold. Within this class the threshold is
 * used as the frequency threshold for season episode pattern matches and as the minimum
 * number of names required for common word sequence matching.
 *
 * @param threshold a value of 0 or greater
 * @throws IllegalArgumentException if the threshold is negative
 */
public SeriesNameMatcher(int threshold) {
    // 0 is accepted on purpose, callers may derive the threshold from an empty file list
    if (threshold < 0)
        throw new IllegalArgumentException("threshold must not be negative");

    this.threshold = threshold;
}
public Collection<String> matchAll(List<String> names) {
/**
 * Match the series name for a single file, using the file name as episode name and the
 * path of the parent folder as the string that is expected to contain the series name.
 *
 * @param file episode file
 * @return a likely series name, or null if the file name doesn't contain a season
 *         episode pattern
 */
public String match(File file) {
    String parent = file.getParent();

    if (parent == null) {
        // File.getParent() returns null if the path doesn't name a parent folder,
        // so there is nothing the pattern match could be compared against
        return matchBySeasonEpisodePattern(file.getName());
    }

    return match(file.getName(), parent);
}
/**
 * Match series names for the given files. The files are grouped by parent folder, the
 * names of each group are matched, and each match is then compared to the name of the
 * folder in order to find a common word sequence.
 *
 * @param files episode files
 * @return series names that have been matched
 */
public Collection<String> matchAll(File... files) {
    SeriesNameCollection seriesNames = new SeriesNameCollection();

    // group files by parent folder
    for (Entry<File, String[]> entry : mapNamesByFolder(files).entrySet()) {
        File folder = entry.getKey();
        Collection<String> nameMatches = matchAll(entry.getValue());

        if (folder == null) {
            // files without parent folder are grouped under the null key,
            // there is no folder name to compare against, so keep the name matches
            seriesNames.addAll(nameMatches);
            continue;
        }

        String parent = folder.getName();

        for (String nameMatch : nameMatches) {
            String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent);

            // prefer common match, but use name match if there is no matching word sequence
            seriesNames.add(commonMatch != null ? commonMatch : nameMatch);
        }
    }

    return seriesNames;
}
public Collection<String> matchAll(String... names) {
SeriesNameCollection seriesNames = new SeriesNameCollection();
// 1. use pattern matching with frequency threshold
seriesNames.addAll(flatMatchAll(names));
// deep match common word sequences
// 2. match common word sequences
seriesNames.addAll(deepMatchAll(names));
return seriesNames;
@ -49,11 +78,11 @@ public class SeriesNameMatcher {
/**
* Try to match and verify all series names using known season episode patterns.
*
* @param names list of episode names
* @return series names that have been matched one or multiple times depending on the size
* of the given list
* @param names episode names
* @return series names that have been matched one or multiple times depending on the
* threshold
*/
protected Collection<String> flatMatchAll(Iterable<String> names) {
private Collection<String> flatMatchAll(String[] names) {
ThresholdCollection<String> seriesNames = new ThresholdCollection<String>(threshold, String.CASE_INSENSITIVE_ORDER);
for (String name : names) {
@ -74,9 +103,9 @@ public class SeriesNameMatcher {
* @param names list of episode names
* @return all common word sequences that have been found
*/
protected Collection<String> deepMatchAll(List<String> names) {
// don't use common word sequence matching for less than 5 names
if (names.size() < threshold) {
private Collection<String> deepMatchAll(String[] names) {
// can't use common word sequence matching for fewer than 2 names, or for fewer names than the threshold
if (names.length < 2 || names.length < threshold) {
return Collections.emptySet();
}
@ -90,23 +119,44 @@ public class SeriesNameMatcher {
// recursive divide and conquer
List<String> results = new ArrayList<String>();
if (names.size() >= 2) {
// split list in two and try to match common word sequence on those
results.addAll(deepMatchAll(names.subList(0, names.size() / 2)));
results.addAll(deepMatchAll(names.subList(names.size() / 2, names.size())));
}
// split list in two and try to match common word sequence on those
results.addAll(deepMatchAll(Arrays.copyOfRange(names, 0, names.length / 2)));
results.addAll(deepMatchAll(Arrays.copyOfRange(names, names.length / 2, names.length)));
return results;
}
/**
 * Find a likely series name for an episode name. The name is first matched using the
 * season episode patterns, then the result is narrowed down to the first word sequence
 * it has in common with the given parent string.
 *
 * @param name episode name
 * @param parent a string that contains the series name
 * @return the common word sequence if there is one, otherwise the season episode
 *         pattern match, which may be null
 */
public String match(String name, String parent) {
    final String patternMatch = matchBySeasonEpisodePattern(name);

    if (patternMatch == null) {
        // no season episode pattern, nothing to compare to the parent
        return null;
    }

    final String sharedSequence = matchByFirstCommonWordSequence(patternMatch, parent);

    // fall back to the plain pattern match if nothing is shared with the parent
    return sharedSequence != null ? sharedSequence : patternMatch;
}
/**
* Try to match a series name from the given episode name using known season episode
* patterns.
*
* @param name episode name
* @return a substring of the given name that ends before the first occurrence of a season
* episode pattern, or null
* episode pattern, or null if there is no such pattern
*/
public String matchBySeasonEpisodePattern(String name) {
int seasonEpisodePosition = seasonEpisodeMatcher.find(name);
@ -126,10 +176,9 @@ public class SeriesNameMatcher {
* @param names various episode names (5 or more for accurate results)
* @return a word sequence all episode names have in common, or null
*/
public String matchByFirstCommonWordSequence(Collection<String> names) {
if (names.size() <= 1) {
// can't match common sequence from less than two names
return null;
public String matchByFirstCommonWordSequence(String... names) {
if (names.length < 2) {
throw new IllegalArgumentException("Can't match common sequence from less than two names");
}
String[] common = null;
@ -151,14 +200,19 @@ public class SeriesNameMatcher {
}
}
// join will return null, if common is null
if (common == null)
return null;
return join(common, " ");
}
protected String normalize(String name) {
// remove group names (remove any [...])
name = name.replaceAll("\\[[^\\]]+\\]", "");
// normalize brackets, convert (...) to [...]
name = name.replace('(', '[').replace(')', ']');
// remove group names, any [...]
name = name.replaceAll("\\[[^\\[]+\\]", "");
// remove special characters
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
@ -195,6 +249,33 @@ public class SeriesNameMatcher {
return null;
}
/**
 * Group the given files by parent folder and map each folder to the names of its files,
 * as produced by FileBotUtilities.asFileNameList. Folders and files keep the order in
 * which they were first encountered.
 *
 * @param files files to group
 * @return map from parent folder to the names of the files in that folder
 */
private Map<File, String[]> mapNamesByFolder(File... files) {
    // first pass: collect the files of each folder
    Map<File, List<File>> groups = new LinkedHashMap<File, List<File>>();

    for (int i = 0; i < files.length; i++) {
        File parentFolder = files[i].getParentFile();

        if (!groups.containsKey(parentFolder)) {
            groups.put(parentFolder, new ArrayList<File>());
        }

        groups.get(parentFolder).add(files[i]);
    }

    // second pass: replace each group of files by an array of file names
    Map<File, String[]> result = new LinkedHashMap<File, String[]>();

    for (File parentFolder : groups.keySet()) {
        List<String> fileNames = FileBotUtilities.asFileNameList(groups.get(parentFolder));
        result.put(parentFolder, fileNames.toArray(new String[0]));
    }

    return result;
}
protected static class SeriesNameCollection extends AbstractCollection<String> {
@ -272,30 +353,30 @@ public class SeriesNameMatcher {
@Override
public boolean add(E e) {
Collection<E> buffer = limbo.get(e);
public boolean add(E value) {
Collection<E> buffer = limbo.get(value);
if (buffer == null) {
// initialize buffer
buffer = new ArrayList<E>(threshold);
limbo.put(e, buffer);
limbo.put(value, buffer);
}
if (buffer == heaven) {
// threshold reached
heaven.add(e);
heaven.add(value);
return true;
}
// add element to buffer
buffer.add(e);
buffer.add(value);
// check if threshold has been reached
if (buffer.size() >= threshold) {
heaven.addAll(buffer);
// replace buffer with heaven
limbo.put(e, heaven);
limbo.put(value, heaven);
return true;
}

View File

@ -2,7 +2,6 @@
package net.sourceforge.filebot.ui;
import static net.sourceforge.filebot.FileBotUtilities.asStringList;
import static net.sourceforge.filebot.Settings.getApplicationName;
import java.awt.BorderLayout;
@ -64,10 +63,10 @@ public class FileBotWindow extends JFrame implements ListSelectionListener {
setSize(760, 615);
// restore the panel selection from last time,
//TODO restore the panel selection from last time,
// switch to EpisodeListPanel by default (e.g. first start)
int selectedPanel = asStringList(panelSelectionList.getPanelModel()).indexOf(Settings.userRoot().get("selectedPanel"));
panelSelectionList.setSelectedIndex(selectedPanel);
// int selectedPanel = asStringList(panelSelectionList.getPanelModel()).indexOf(Settings.userRoot().get("selectedPanel"));
// panelSelectionList.setSelectedIndex(selectedPanel);
// connect message handlers to message bus
MessageBus.getDefault().addMessageHandler("panel", panelSelectMessageHandler);

View File

@ -9,10 +9,12 @@ import static net.sourceforge.tuned.FileUtilities.containsOnly;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import net.sourceforge.filebot.FileBotUtilities;
import net.sourceforge.filebot.torrent.Torrent;
import net.sourceforge.filebot.ui.FileBotList;
import net.sourceforge.filebot.ui.transfer.FileTransferablePolicy;
@ -51,9 +53,7 @@ class FileListTransferablePolicy extends FileTransferablePolicy {
} else if (containsOnly(files, TORRENT_FILES)) {
loadTorrents(files);
} else {
for (File file : files) {
list.getModel().add(FileUtilities.getName(file));
}
list.getModel().addAll(FileBotUtilities.asFileNameList(files));
}
}
@ -65,9 +65,7 @@ class FileListTransferablePolicy extends FileTransferablePolicy {
}
for (File folder : folders) {
for (File file : folder.listFiles()) {
list.getModel().add(FileUtilities.getName(file));
}
list.getModel().addAll(FileBotUtilities.asFileNameList(Arrays.asList(folder.listFiles())));
}
}

View File

@ -3,10 +3,10 @@ package net.sourceforge.filebot.ui.panel.rename;
import static net.sourceforge.filebot.FileBotUtilities.SUBTITLE_FILES;
import static net.sourceforge.filebot.FileBotUtilities.asStringList;
import static net.sourceforge.filebot.web.Episode.formatEpisodeNumbers;
import static net.sourceforge.tuned.FileUtilities.FILES;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@ -52,9 +52,16 @@ class AutoEpisodeListMatcher extends SwingWorker<List<Match<FileEntry, Episode>>
protected Collection<String> matchSeriesNames(List<FileEntry> episodes) {
int threshold = Math.min(episodes.size(), 5);
File[] files = new File[episodes.size()];
return new SeriesNameMatcher(threshold).matchAll(asStringList(episodes));
for (int i = 0; i < files.length; i++) {
files[i] = episodes.get(i).getFile();
}
// allow matching of a small number of episodes, by setting threshold = length if length < 5
int threshold = Math.min(files.length, 5);
return new SeriesNameMatcher(threshold).matchAll(files);
}

View File

@ -15,6 +15,12 @@ public class SeriesNameMatcherTest {
private static SeriesNameMatcher matcher = new SeriesNameMatcher(5);
@Test
public void match() {
    String episodeName = "My Test Series - 1x01";
    String parentName = "Test Series - Season 1";

    // expect the word sequence shared by the pattern match and the parent string
    String seriesName = matcher.match(episodeName, parentName);

    assertEquals("Test Series", seriesName);
}
@Test
public void matchBeforeSeasonEpisodePattern() {
assertEquals("The Test", matcher.matchBySeasonEpisodePattern("The Test - 1x01"));
@ -30,7 +36,10 @@ public class SeriesNameMatcherTest {
assertEquals("The Test", matcher.normalize("_The_Test_-_ ..."));
// brackets
assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [@Monkey]"));
assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [#Monkey]"));
// invalid brackets
assertEquals("strawhat Luffy", matcher.normalize("(strawhat [Luffy (#Monkey)"));
}