use DOM parser instead of XmlPullParser

This commit is contained in:
kapodamy 2018-09-26 16:05:04 -03:00 committed by GitHub
parent 891e23374e
commit 5d41651cf0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 95 additions and 248 deletions

View File

@ -1,68 +1,82 @@
package org.schabi.newpipe.extractor.utils;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import android.util.Log;
import java.io.BufferedInputStream;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.schabi.newpipe.extractor.utils.io.SharpStream;
import org.xmlpull.v1.XmlPullParserFactory;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
/**
*
* @author kapodamy
*/
public class SubtitleConverter {
private static final int BUFFER_SIZE = 64 * 1024;
private static final String TAG = "SubtitleConverter";
private static final String NEW_LINE = "\r\n";
/**
 * Reads an XML subtitle document from {@code in} through read_xml_based and writes every
 * received frame to {@code out}, encoded as UTF-8, in this order: frame index (counting from 0),
 * CRLF, start time, " --> ", end time, CRLF, frame text, CRLF, CRLF.
 *
 * NOTE(review): this span looks like a rendered diff in which removed and added lines are
 * interleaved (two signatures, two copies of the callback body, two trailing returns).
 * Confirm which lines belong to the current revision before relying on the details below.
 *
 * @param in source of the subtitle document
 * @param out destination of the converted frames
 * @param ignoreEmptyFrames when true, frames whose isEmptyText() returns true are not written
 * @param detectYoutubeDuplicateLines forwarded unchanged to read_xml_based
 * @return 0 when no exception was caught; otherwise a code chosen by the exception type:
 *         1 for IOException, 2 for ParseException, 3 for SAXException,
 *         4 for ParserConfigurationException, 7 for XPathExpressionException.
 *         Any other exception reaches the trailing return statement(s); presumably 8 in the
 *         newer revision and 4 in the older one -- TODO confirm.
 */
public int dumpTTML(InputStream in, final SharpStream out, final boolean ignoreEmptyFrames, final boolean detectYoutubeDuplicateLines) {
public int dumpTTML(SharpStream in, final SharpStream out, final boolean ignoreEmptyFrames, final boolean detectYoutubeDuplicateLines) {
try {
// single-element array so the anonymous callback can mutate the counter
final int[] frame_index = {0};// ugly workaround
final Charset charset = Charset.forName("utf-8");
read_xml_based(in, new FrameWriter() {
@Override
public void yield(SubtitleFrame frame) throws IOException {
if (ignoreEmptyFrames && frame.isEmptyText()) {
return;
}
out.write(String.valueOf(frame_index[0]++).getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(getTime(frame.start, true).getBytes(charset));
out.write(" --> ".getBytes(charset));
out.write(getTime(frame.end, true).getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(frame.text.getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
}
}, detectYoutubeDuplicateLines, "tt", "xmlns", "http://www.w3.org/ns/ttml", new String[]{"tt", "body", "div", "p"}, "begin", "end", true);
@Override
public void yield(SubtitleFrame frame) throws IOException {
if (ignoreEmptyFrames && frame.isEmptyText()) {
return;
}
out.write(String.valueOf(frame_index[0]++).getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(getTime(frame.start, true).getBytes(charset));
out.write(" --> ".getBytes(charset));
out.write(getTime(frame.end, true).getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(frame.text.getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
}
},
detectYoutubeDuplicateLines, "tt", "xmlns", "http://www.w3.org/ns/ttml",
new String[]{"timedtext", "head", "wp"}, new String[]{"body", "div", "p"},
"begin", "end", true);
} catch (Exception err) {
Log.e(TAG, "subtitle parse failed", err);
// translate the failure into a numeric status for the caller
if (err instanceof IOException) {
return 1;
} else if (err instanceof ParseException) {
return 2;
} else if (err instanceof XmlPullParserException) {
} else if (err instanceof SAXException) {
return 3;
} else if (err instanceof ParserConfigurationException) {
return 4;
} else if (err instanceof XPathExpressionException) {
return 7;
}
return 4;
return 8;
}
return 0;
}
private void read_xml_based(InputStream reader, FrameWriter callback, boolean detectYoutubeDuplicateLines,
String root, String formatAttr, String formatVersion, String[] framePath,
String timeAttr, String durationAttr, boolean hasTimestamp
) throws XmlPullParserException, IOException, ParseException {
private void read_xml_based(SharpStream source, FrameWriter callback, boolean detectYoutubeDuplicateLines,
String root, String formatAttr, String formatVersion, String[] cuePath, String[] framePath,
String timeAttr, String durationAttr, boolean hasTimestamp
) throws IOException, ParseException, SAXException, ParserConfigurationException, XPathExpressionException {
/*
* XML based subtitles parser with BASIC support
* multiple CUE is not supported
@ -72,21 +86,31 @@ public class SubtitleConverter {
* Language parsing is not supported
*/
XmlDocument xml = new XmlDocument(reader, BUFFER_SIZE);
byte[] buffer = new byte[source.available()];
source.read(buffer);
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
Document xml = builder.parse(new ByteArrayInputStream(buffer));
String attr;
// get the format version or namespace
XmlNode node = xml.selectSingleNode(root);
Element node = xml.getDocumentElement();
if (node == null) {
throw new ParseException("Can't get the format version. ¿wrong namespace?", -1);
} else if (!node.getNodeName().equals(root)) {
throw new ParseException("Invalid root", -1);
}
if (formatAttr.equals("xmlns")) {
if (!node.getNameSpace().equals(formatVersion)) {
if (!node.getNamespaceURI().equals(formatVersion)) {
throw new UnsupportedOperationException("Expected xml namespace: " + formatVersion);
}
} else {
attr = node.getAttribute(formatAttr);
attr = node.getAttributeNS(formatVersion, formatAttr);
if (attr == null) {
throw new ParseException("Can't get the format attribute", -1);
}
@ -95,29 +119,29 @@ public class SubtitleConverter {
}
}
XmlNodeList node_list;
NodeList node_list;
int line_break = 0;// Maximum characters per line if present (valid for TranScript v3)
if (!hasTimestamp) {
node_list = xml.selectNodes("timedtext", "head", "wp");
node_list = selectNodes(xml, cuePath, formatVersion);
if (node_list != null) {
// if the subtitle has multiple CUEs, use the highest value
while ((node = node_list.getNextNode()) != null) {
for (int i = 0; i < node_list.getLength(); i++) {
try {
int tmp = Integer.parseInt(node.getAttribute("ah"));
int tmp = Integer.parseInt(((Element) node_list.item(i)).getAttributeNS(formatVersion, "ah"));
if (tmp > line_break) {
line_break = tmp;
}
} catch (NumberFormatException err) {
} catch (Exception err) {
}
}
}
}
// parse every frame
node_list = xml.selectNodes(framePath);
node_list = selectNodes(xml, framePath, formatVersion);
if (node_list == null) {
return;// no frames detected
@ -126,14 +150,15 @@ public class SubtitleConverter {
int fs_ff = -1;// first timestamp of first frame
boolean limit_lines = false;
while ((node = node_list.getNextNode()) != null) {
for (int i = 0; i < node_list.getLength(); i++) {
Element elem = (Element) node_list.item(i);
SubtitleFrame obj = new SubtitleFrame();
obj.text = node.getInnerText();
obj.text = elem.getTextContent();
attr = node.getAttribute(timeAttr);// ¡this cant be null!
attr = elem.getAttribute(timeAttr);// ¡this cant be null!
obj.start = hasTimestamp ? parseTimestamp(attr) : Integer.parseInt(attr);
attr = node.getAttribute(durationAttr);
attr = elem.getAttribute(durationAttr);
if (obj.text == null || attr == null) {
continue;// normally is a blank line (on auto-generated subtitles) ignore
}
@ -197,6 +222,30 @@ public class SubtitleConverter {
}
}
/**
 * Selects the elements named by the last entry of {@code path}, searching below the element
 * reached by following the preceding entries from the document element.
 *
 * Each entry except the last is looked up among the direct children of the current element;
 * the first child element whose node name and namespace both match becomes the new current
 * element. When no child matches, the search simply continues from the current element, so a
 * path may start with the name of the document element itself. The last entry is resolved with
 * {@link Element#getElementsByTagNameNS(String, String)}, which also returns deeper descendants.
 *
 * @param xml          parsed document (must have been built namespace-aware)
 * @param path         element names leading to the wanted nodes
 * @param namespaceUri namespace every path element has to belong to; {@code null} means
 *                     "no namespace"
 * @return the matching nodes, or {@code null} when the path is empty, the document has no
 *         root element, or an element on the way has no child nodes at all
 * @throws XPathExpressionException never thrown by this implementation; declared for callers
 */
private static NodeList selectNodes(Document xml, String[] path, String namespaceUri) throws XPathExpressionException {
    if (path == null || path.length < 1) {
        return null;// nothing to select
    }

    Element ref = xml.getDocumentElement();
    if (ref == null) {
        return null;
    }

    for (int i = 0; i < path.length - 1; i++) {
        NodeList nodes = ref.getChildNodes();
        if (nodes.getLength() < 1) {
            return null;
        }

        for (int j = 0; j < nodes.getLength(); j++) {
            Node child = nodes.item(j);
            if (child.getNodeType() != Node.ELEMENT_NODE || !child.getNodeName().equals(path[i])) {
                continue;
            }

            // getNamespaceURI() returns null for elements without a namespace, compare null-safe
            String childNamespace = child.getNamespaceURI();
            boolean sameNamespace = namespaceUri == null ? childNamespace == null : namespaceUri.equals(childNamespace);
            if (sameNamespace) {
                ref = (Element) child;
                break;
            }
        }
    }

    return ref.getElementsByTagNameNS(namespaceUri, path[path.length - 1]);
}
private static int parseTimestamp(String multiImpl) throws NumberFormatException, ParseException {
if (multiImpl.length() < 1) {
return 0;
@ -296,43 +345,6 @@ public class SubtitleConverter {
return String.format(Locale.ENGLISH, "%0".concat(String.valueOf(pad)).concat("d"), nro);
}
/**
 * Moves the pull parser forward until it stands on a start tag with the requested name at the
 * requested depth.
 *
 * Start tags that are passed over are counted and end tags decrease that count again (never
 * below zero); a candidate is only accepted while the count is zero.
 *
 * @param parser XmlPullParser instance to advance
 * @param name   name of the wanted node
 * @param depth  tree depth at which the node has to appear
 * @return true if the node was reached; false if a start tag above {@code depth} was met or
 *         the end of the document was reached first
 * @throws XmlPullParserException if the next XML event cannot be read
 * @throws IOException            on I/O error
 */
private static boolean getNextNode(XmlPullParser parser, String name, int depth) throws XmlPullParserException, IOException {
    int openTags = 0;
    int event = 0;

    while (event != XmlPullParser.END_DOCUMENT) {
        event = parser.next();

        if (event == XmlPullParser.START_TAG) {
            int currentDepth = parser.getDepth();
            if (currentDepth < depth) {
                return false;// left the subtree that is being searched
            }

            boolean reached = currentDepth == depth && openTags == 0 && parser.getName().equals(name);
            if (reached) {
                return true;
            }
            openTags++;
        } else if (event == XmlPullParser.END_TAG) {
            if (openTags > 0) {
                openTags--;
            }
        }
    }

    return false;
}
/******************
* helper classes *
@ -371,169 +383,4 @@ public class SubtitleConverter {
}
}
/**
 * Minimal document wrapper on top of XmlPullParser.
 *
 * The source stream is buffered and marked at its start; every selection rewinds the stream
 * and parses it again with a fresh parser.
 */
private class XmlDocument {

    private BufferedInputStream src;
    private XmlPullParserFactory fac;

    /**
     * @param stream     source of the XML document
     * @param bufferSize initial size of the internal buffer
     * @throws XmlPullParserException if the parser factory cannot be created
     */
    XmlDocument(InputStream stream, int bufferSize) throws XmlPullParserException {
        // due how xml parsing works is necessary a wrapper
        src = new BufferedInputStream(stream, bufferSize);
        // A read limit of 0 lets the mark become invalid as soon as the buffer is refilled,
        // which makes the reset() in selectSingleNode() fail on larger documents. Ask for the
        // largest read limit so the whole document stays rewindable.
        src.mark(Integer.MAX_VALUE);

        XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
        factory.setNamespaceAware(true);
        fac = factory;
    }

    /**
     * Re-parses the document from the start and descends along {@code path}, one name per
     * depth level.
     *
     * @param path node names, the first one being the root
     * @return the node the parser stopped on, or null if the path is empty or was not found
     * @throws XmlPullParserException if the document cannot be parsed
     * @throws IOException            on I/O error, including a failed rewind of the stream
     */
    XmlNode selectSingleNode(String... path) throws XmlPullParserException, IOException {
        if (path.length < 1) {
            return null;
        }

        src.reset();// this is very important: rewind to the mark set in the constructor
        XmlPullParser parser = fac.newPullParser();
        parser.setInput(src, null);

        for (int i = 0; i < path.length; i++) {
            if (!getNextNode(parser, path[i], i + 1)) {
                return null;
            }
        }

        return new XmlNode(parser);
    }

    /**
     * Like {@link #selectSingleNode(String...)} but returns a list that iterates over the
     * found node and the following nodes with the same name at the same depth.
     *
     * @param path node names, the first one being the root
     * @return the node list, or null if the path was not found
     * @throws XmlPullParserException if the document cannot be parsed
     * @throws IOException            on I/O error
     */
    XmlNodeList selectNodes(String... path) throws XmlPullParserException, IOException {
        XmlNode node = selectSingleNode(path);
        if (node == null) {
            return null;
        }

        return new XmlNodeList(node.parser);
    }
}
/**
 * View of the element the shared pull parser currently stands on.
 *
 * Reading text advances the parser, so the attributes of the element are copied into a map
 * before any text is read.
 */
private class XmlNode {

    XmlPullParser parser;
    private Map<String, String> attrs;

    XmlNode(XmlPullParser parser) {
        this.parser = parser;
    }

    /** Copies the attributes of the current element once; later calls do nothing. */
    private void init_attrs() {
        if (attrs == null) {
            // backup attributes first
            int count = parser.getAttributeCount();
            attrs = new HashMap<String, String>(count);
            for (int idx = 0; idx < count; idx++) {
                attrs.put(parser.getAttributeName(idx), parser.getAttributeValue(idx));
            }
        }
    }

    /**
     * Returns the first text event found at the depth of this node.
     *
     * @return the text, or null when an end tag at this depth or above is met first
     * @throws IOException            on I/O error
     * @throws XmlPullParserException if the document ends, or a start tag above this node's
     *                                depth appears, before a result was found
     */
    String getText() throws IOException, XmlPullParserException {
        init_attrs();

        final int deep = parser.getDepth();
        int event = 0;

        while (event != XmlPullParser.END_DOCUMENT) {
            event = parser.next();

            if (event == XmlPullParser.TEXT) {
                if (parser.getDepth() == deep) {
                    return parser.getText();
                }
            } else if (event == XmlPullParser.END_TAG) {
                if (parser.getDepth() <= deep) {
                    return null;
                }
            } else if (event == XmlPullParser.START_TAG) {
                if (parser.getDepth() < deep) {
                    break;// the parser left this node without closing it
                }
            }
        }

        throw new XmlPullParserException("cant read the text node, XmlPullParser crashed");
    }

    /**
     * Concatenates every text event read until the end tag of this node.
     *
     * @return the collected text (possibly empty)
     * @throws IOException            on I/O error
     * @throws XmlPullParserException if the document ends, or a start tag above this node's
     *                                depth appears, before the end tag was found
     */
    String getInnerText() throws IOException, XmlPullParserException {
        init_attrs();

        final int deep = parser.getDepth();
        StringBuilder collected = new StringBuilder(128);
        int event = 0;

        while (event != XmlPullParser.END_DOCUMENT) {
            event = parser.next();

            if (event == XmlPullParser.TEXT) {
                String piece = parser.getText();
                if (piece != null) {
                    collected.append(piece);
                }
            } else if (event == XmlPullParser.END_TAG) {
                if (parser.getDepth() <= deep) {
                    return collected.toString();
                }
            } else if (event == XmlPullParser.START_TAG && parser.getDepth() < deep) {
                break;// the parser left this node without closing it
            }
        }

        throw new XmlPullParserException("cant read the text node, XmlPullParser crashed");
    }

    /**
     * Looks up an attribute, using the saved copy when there is one and the parser otherwise.
     *
     * @param name attribute name
     * @return the attribute value, or null if absent
     */
    String getAttribute(String name) {
        if (attrs == null) {
            return parser.getAttributeValue(null, name);
        }
        return attrs.get(name);
    }

    /** @return the namespace reported by the parser for its current position */
    String getNameSpace() {
        return parser.getNamespace();
    }
}
/**
 * Forward-only cursor over the nodes that share the name and depth of the element the parser
 * stood on when the list was created.
 */
private class XmlNodeList {

    private XmlPullParser parser;
    boolean first = true;
    String node_name;
    int node_depth;

    XmlNodeList(XmlPullParser parser) {
        this.parser = parser;
        this.node_name = parser.getName();
        this.node_depth = parser.getDepth();
    }

    /**
     * Returns the next node of the list.
     *
     * The first call hands out the element the parser already stands on; later calls advance
     * the parser to the next element with the remembered name and depth.
     *
     * @return the next node, or null when there are no more nodes
     * @throws XmlPullParserException if the next XML event cannot be read
     * @throws IOException            on I/O error
     */
    XmlNode getNextNode() throws XmlPullParserException, IOException {
        if (first) {
            first = false;
            return new XmlNode(parser);
        }

        if (SubtitleConverter.getNextNode(parser, node_name, node_depth)) {
            return new XmlNode(parser);
        }

        parser = null;// exhausted
        return null;
    }
}
}