* fine-tune new match set auto-detection and use in cmdline as well

* added final string similarity pass to matching cascade
This commit is contained in:
Reinhard Pointner 2011-12-25 15:47:19 +00:00
parent 3af542f195
commit d7d37104c4
4 changed files with 70 additions and 32 deletions

View File

@ -129,22 +129,37 @@ public class CmdlineOperations implements CmdlineInterface {
CLILogger.config(format("Rename episodes using [%s]", db.getName()));
List<File> mediaFiles = filter(files, VIDEO_FILES, SUBTITLE_FILES);
// auto-detect series name if not given
Collection<String> seriesNames = (query == null) ? detectQuery(mediaFiles, strict) : singleton(query);
// fetch episode data
Set<Episode> episodes = fetchEpisodeSet(db, seriesNames, locale, strict);
if (episodes.isEmpty()) {
throw new Exception("Failed to fetch episode data");
}
// similarity metrics for matching
SimilarityMetric[] sequence = strict ? StrictEpisodeMetrics.defaultSequence(false) : EpisodeMetrics.defaultSequence(false);
List<Match<File, Episode>> matches = new ArrayList<Match<File, Episode>>();
matches.addAll(matchEpisodes(filter(mediaFiles, VIDEO_FILES), episodes, sequence));
matches.addAll(matchEpisodes(filter(mediaFiles, SUBTITLE_FILES), episodes, sequence));
// auto-determine optimal batch sets
// Each entry maps a set of files to the series names detected for them;
// a null or empty name set means no series name could be associated with those files.
for (Entry<Set<File>, Set<String>> sameSeriesGroup : mapSeriesNamesByFiles(mediaFiles).entrySet()) {
    List<List<File>> batchSets = new ArrayList<List<File>>();

    if (sameSeriesGroup.getValue() != null && !sameSeriesGroup.getValue().isEmpty()) {
        // handle series name batch set all at once
        batchSets.add(new ArrayList<File>(sameSeriesGroup.getKey()));
    } else {
        // these files don't seem to belong to any series -> handle folder per folder
        batchSets.addAll(mapByFolder(sameSeriesGroup.getKey()).values());
    }

    for (List<File> batch : batchSets) {
        // auto-detect series name if not given
        Collection<String> seriesNames = (query == null) ? detectQuery(batch, strict) : singleton(query);

        // fetch episode data
        Set<Episode> episodes = fetchEpisodeSet(db, seriesNames, locale, strict);

        if (episodes.isEmpty()) {
            // a failed batch is reported but must not abort the remaining batches
            CLILogger.warning("Failed to fetch episode data: " + mapByFolder(batch).keySet());
            continue;
        }

        // Match only the files of THIS batch against the episodes fetched for it.
        // Matching the complete mediaFiles list here would re-match every file once per
        // batch, against episode sets that were fetched for other files.
        matches.addAll(matchEpisodes(filter(batch, VIDEO_FILES), episodes, sequence));
        matches.addAll(matchEpisodes(filter(batch, SUBTITLE_FILES), episodes, sequence));
    }
}
if (matches.isEmpty()) {
throw new Exception("Unable to match files to episode data");

View File

@ -35,7 +35,7 @@ import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
public class MediaDetection {
public static Map<Set<File>, Set<String>> mapFoldersBySeriesNames(Collection<File> files) throws Exception {
public static Map<Set<File>, Set<String>> mapSeriesNamesByFiles(Collection<File> files) throws Exception {
SortedMap<File, List<File>> filesByFolder = mapByFolder(filter(files, VIDEO_FILES, SUBTITLE_FILES));
// map series names by folder
@ -64,7 +64,7 @@ public class MediaDetection {
}
// join both sets
Map<Set<File>, Set<String>> matchSets = new HashMap<Set<File>, Set<String>>();
Map<Set<File>, Set<String>> batchSets = new HashMap<Set<File>, Set<String>>();
while (seriesNamesByFolder.size() > 0) {
Set<String> combinedNameSet = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
@ -90,13 +90,22 @@ public class MediaDetection {
for (File folder : combinedFolderSet) {
combinedFileSet.addAll(filesByFolder.get(folder));
}
matchSets.put(combinedFileSet, combinedNameSet);
batchSets.put(combinedFileSet, combinedNameSet);
// set folders as accounted for
seriesNamesByFolder.keySet().removeAll(combinedFolderSet);
}
return matchSets;
// handle files that have not been matched to a batch set yet
// (they are collected into one extra entry whose value is null instead of a name set,
// so code iterating the returned map has to null-check the value before using it)
// NOTE(review): this copy starts from the unfiltered 'files' argument, while the folder map
// at the top of this method is built from filter(files, VIDEO_FILES, SUBTITLE_FILES);
// non-media files passed in would end up in the remaining set -- confirm callers pre-filter
Set<File> remainingFiles = new HashSet<File>(files);
for (Set<File> batch : batchSets.keySet()) {
remainingFiles.removeAll(batch);
}
// only add the catch-all entry when something is actually left over
if (remainingFiles.size() > 0) {
batchSets.put(remainingFiles, null);
}
return batchSets;
}

View File

@ -295,15 +295,16 @@ public enum EpisodeMetrics implements SimilarityMetric {
public static SimilarityMetric[] defaultSequence(boolean includeFileMetrics) {
// 1. pass: match by file length (fast, but only works when matching torrents or files)
// 2. pass: match by season / episode numbers
// 3. pass: match by checking series / episode title against the file path
// 4. pass: match by generic name similarity (slow, but most matches will have been determined in second pass)
// 5. pass: match by generic numeric similarity
// 1 pass: divide by file length (only works for matching torrent entries or files)
// 2-3 pass: divide by title or season / episode numbers
// 4 pass: divide by folder / file name and show name / episode title
// 5 pass: divide by name (rounded into n levels)
// 6 pass: divide by generic numeric similarity
// 7 pass: resolve remaining collisions via absolute string similarity
if (includeFileMetrics) {
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, Name, Numeric };
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, Name, Numeric, new NameSimilarityMetric() };
} else {
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, Name, Numeric };
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, Name, Numeric, new NameSimilarityMetric() };
}
}

View File

@ -20,6 +20,7 @@ import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
@ -173,14 +174,26 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
List<Callable<List<Match<File, ?>>>> taskPerFolder = new ArrayList<Callable<List<Match<File, ?>>>>();
// detect series names and create episode list fetch tasks
for (final Set<File> folder : mapFoldersBySeriesNames(mediaFiles).keySet()) {
taskPerFolder.add(new Callable<List<Match<File, ?>>>() {
@Override
public List<Match<File, ?>> call() throws Exception {
return matchEpisodeSet(new ArrayList<File>(folder), locale, autodetection, parent);
}
});
// Build one fetch-and-match task per batch set.
for (Entry<Set<File>, Set<String>> sameSeriesGroup : mapSeriesNamesByFiles(mediaFiles).entrySet()) {
    Set<File> groupFiles = sameSeriesGroup.getKey();
    Set<String> groupNames = sameSeriesGroup.getValue();

    List<List<File>> batchSets = new ArrayList<List<File>>();
    if (groupNames == null || groupNames.isEmpty()) {
        // no series name is associated with these files -> split them up folder by folder
        batchSets.addAll(mapByFolder(groupFiles).values());
    } else {
        // files that share series names are processed together as a single batch
        batchSets.add(new ArrayList<File>(groupFiles));
    }

    for (final List<File> batchSet : batchSets) {
        // batchSet is final so that the anonymous task can capture it
        Callable<List<Match<File, ?>>> batchTask = new Callable<List<Match<File, ?>>>() {

            @Override
            public List<Match<File, ?>> call() throws Exception {
                return matchEpisodeSet(batchSet, locale, autodetection, parent);
            }
        };
        taskPerFolder.add(batchTask);
    }
}
// match folder per folder in parallel