From 3ac78751b66e6c5ad22841fdc8b86f5c1315fea7 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Tue, 14 Feb 2017 01:34:24 +0800 Subject: [PATCH] Experiment with SAMI subtitles --- source/net/filebot/subtitle/SamiReader.java | 97 +++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 source/net/filebot/subtitle/SamiReader.java diff --git a/source/net/filebot/subtitle/SamiReader.java b/source/net/filebot/subtitle/SamiReader.java new file mode 100644 index 00000000..9bfa21a7 --- /dev/null +++ b/source/net/filebot/subtitle/SamiReader.java @@ -0,0 +1,97 @@ +package net.filebot.subtitle; + +import static java.util.stream.Collectors.*; +import static net.filebot.Logging.*; +import static net.filebot.similarity.Normalization.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +public class SamiReader { + + public List decode(CharSequence file) { + List subtitles = new ArrayList(); + + Matcher matcher = Pattern.compile("", Pattern.CASE_INSENSITIVE).matcher(file); + + long previousSyncStart = -1; + long previousSyncEnd = -1; + int previousSequenceEnd = -1; + + while (matcher.find()) { + Element sync = Jsoup.parseBodyFragment(matcher.group()).select("sync").first(); + + long nextSyncStart = getLongAttribute(sync, "start"); + long nextSyncEnd = getLongAttribute(sync, "end"); + + if (previousSequenceEnd > 0) { + // use Start time of the next subtitle element as End time of the previous one by default + if (previousSyncEnd < 0) { + previousSyncEnd = nextSyncStart; + } + + SubtitleElement subtitle = getSubtitle(previousSyncStart, previousSyncEnd, file.subSequence(previousSequenceEnd, matcher.start())); + if (subtitle != null) { + subtitles.add(subtitle); + } + } + + if (nextSyncStart >= 0) { + previousSyncStart = nextSyncStart; + previousSyncEnd = nextSyncEnd; + previousSequenceEnd = matcher.end(); + } + } + + // last element if any + if (previousSequenceEnd > 0) { + // if end time is not known, then just set subtitle duration to 2 seconds + if (previousSyncEnd < 0) { + previousSyncEnd = previousSyncStart + 2000; + } + + SubtitleElement subtitle = getSubtitle(previousSyncStart, previousSyncEnd, file.subSequence(previousSequenceEnd, file.length())); + if (subtitle != null) { + subtitles.add(subtitle); + } + } + + return subtitles; + } + + private SubtitleElement getSubtitle(long start, long end, CharSequence fragment) { + if (start >= 0 && end >= 0) { + Document document = Jsoup.parseBodyFragment(fragment.toString()); + String text = document.select("p").stream().map(p -> p.text()).map(s -> replaceSpace(s, " ")).filter(s -> s.length() > 0).collect(joining("\n")).trim(); + + if (text.length() > 0) { + return new SubtitleElement(start, end, text); + } + } + + return null; + } + + private long getLongAttribute(Element node, String key) { + if (node != null) { + String value = node.attr(key); + + if (value.length() > 0) { + try { + return Long.parseLong(value); + } catch (Exception e) { + debug.warning(cause(e)); + } + } + } + + return -1; + } + +}