AbstractMarkupString.java
/*
* SPDX-FileCopyrightText: none
* SPDX-License-Identifier: CC0-1.0
*/
package gov.nist.secauto.metaschema.core.datatype.markup;
import com.ctc.wstx.api.WstxOutputProperties;
import com.ctc.wstx.stax.WstxOutputFactory;
import com.vladsch.flexmark.formatter.Formatter;
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
import com.vladsch.flexmark.parser.Parser;
import com.vladsch.flexmark.util.ast.Document;
import com.vladsch.flexmark.util.ast.Node;
import com.vladsch.flexmark.util.ast.TextCollectingVisitor;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.AstCollectingVisitor;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.FlexmarkFactory;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.IMarkupVisitor;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.IMarkupWriter;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.InsertAnchorExtension;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.InsertAnchorExtension.InsertAnchorNode;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.MarkupVisitor;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.MarkupXmlEventWriter;
import gov.nist.secauto.metaschema.core.datatype.markup.flexmark.MarkupXmlStreamWriter;
import gov.nist.secauto.metaschema.core.util.ObjectUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codehaus.stax2.XMLOutputFactory2;
import org.codehaus.stax2.XMLStreamWriter2;
import org.codehaus.stax2.evt.XMLEventFactory2;
import org.jsoup.Jsoup;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import edu.umd.cs.findbugs.annotations.NonNull;
import edu.umd.cs.findbugs.annotations.Nullable;
@SuppressWarnings("PMD.CouplingBetweenObjects")
public abstract class AbstractMarkupString<TYPE extends AbstractMarkupString<TYPE>>
implements IMarkupString<TYPE> {
private static final Logger LOGGER = LogManager.getLogger(FlexmarkFactory.class);
private static final Pattern QUOTE_TAG_REPLACEMENT_PATTERN
= Pattern.compile("</?q>");
//
// @NonNull
// private static final String DEFAULT_HTML_NS = "http://www.w3.org/1999/xhtml";
// @NonNull
// private static final String DEFAULT_HTML_PREFIX = "";
@NonNull
private final Document document;
/**
* Construct a new markup string based on the provided flexmark AST graph.
*
* @param document
* the AST graph representing Markdown text
*/
protected AbstractMarkupString(@NonNull Document document) {
this.document = document;
}
@Override
public Document getDocument() {
return document;
}
@Override
public boolean isEmpty() {
return getDocument().getFirstChild() == null;
}
/**
* Parse HTML-based text into markdown as a flexmark AST graph.
* <p>
* This method uses a two-step approach that first translates the HTML into
* markdown, and then parses the markdown into an AST graph.
*
* @param html
* the HTML text to parse
* @param htmlParser
* the HTML parser used to produce markdown
* @param markdownParser
* the markdown parser
* @return the markdown AST graph
*/
@NonNull
protected static Document parseHtml(@NonNull String html, @NonNull FlexmarkHtmlConverter htmlParser,
@NonNull Parser markdownParser) {
org.jsoup.nodes.Document document = Jsoup.parse(html);
// Fix for usnistgov/liboscal-java#5
// Caused by not stripping out extra newlines inside HTML tags
NodeTraversor.traverse(new NodeVisitor() {
@Override
public void head(org.jsoup.nodes.Node node, int depth) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
org.jsoup.nodes.Node parent = textNode.parent();
if (!isTag(parent, "code") || !isTag(parent.parent(), "pre")) {
node.replaceWith(new TextNode(textNode.text()));
}
}
}
private boolean isTag(@Nullable org.jsoup.nodes.Node node, @NonNull String tagName) {
return node != null && tagName.equals(node.normalName());
}
}, document);
String markdown = htmlParser.convert(document);
assert markdown != null;
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("html->markdown: {}", markdown);
}
return parseMarkdown(markdown, markdownParser);
}
/**
* Parse markdown-based text into a flexmark AST graph.
*
* @param markdown
* the markdown text to parse
* @param parser
* the markdown parser
* @return the markdown AST graph
*/
@SuppressWarnings("null")
@NonNull
protected static Document parseMarkdown(@NonNull String markdown, @NonNull Parser parser) {
return parser.parse(markdown);
}
@Override
public String toXHtml(@NonNull String namespace) throws XMLStreamException, IOException {
String retval;
Document document = getDocument();
if (document.hasChildren()) {
XMLOutputFactory2 factory = (XMLOutputFactory2) XMLOutputFactory.newInstance();
assert factory instanceof WstxOutputFactory;
factory.setProperty(WstxOutputProperties.P_OUTPUT_VALIDATE_STRUCTURE, false);
try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
XMLStreamWriter2 xmlStreamWriter = (XMLStreamWriter2) factory.createXMLStreamWriter(os);
writeXHtml(namespace, ObjectUtils.notNull(xmlStreamWriter));
xmlStreamWriter.flush();
xmlStreamWriter.close();
os.flush();
retval = ObjectUtils.notNull(os.toString(StandardCharsets.UTF_8));
}
} else {
retval = "";
}
return retval;
}
@Override
public String toHtml() {
// String html;
// try {
// html = toXHtml("");
// } catch(RuntimeException ex) {
// throw ex;
// } catch (Throwable ex) {
// throw new RuntimeException(ex);
// }
// return QUOTE_TAG_REPLACEMENT_PATTERN.matcher(html)
// .replaceAll(""");
String html = getFlexmarkFactory().getHtmlRenderer().render(getDocument());
return ObjectUtils.notNull(QUOTE_TAG_REPLACEMENT_PATTERN.matcher(html)
.replaceAll("""));
}
@Override
public String toMarkdown() {
return toMarkdown(getFlexmarkFactory().getFormatter());
}
@Override
public String toMarkdown(Formatter formatter) {
return ObjectUtils.notNull(formatter.render(getDocument()));
}
@Override
public String toText() {
return ObjectUtils.notNull(new TextCollectingVisitor().collectAndGetText(document));
}
@Override
public void writeXHtml(String namespace, XMLStreamWriter2 streamWriter) throws XMLStreamException {
Document document = getDocument();
if (document.hasChildren()) {
IMarkupWriter<XMLStreamWriter, XMLStreamException> writer = new MarkupXmlStreamWriter(
namespace,
getFlexmarkFactory().getListOptions(),
streamWriter);
IMarkupVisitor<XMLStreamWriter, XMLStreamException> visitor = new MarkupVisitor<>(isBlock());
visitor.visitDocument(document, writer);
} else {
streamWriter.writeCharacters("");
}
}
@Override
public void writeXHtml(String namespace, XMLEventFactory2 eventFactory, XMLEventWriter eventWriter)
throws XMLStreamException {
Document document = getDocument();
if (document.hasChildren()) {
IMarkupWriter<XMLEventWriter, XMLStreamException> writer = new MarkupXmlEventWriter(
namespace,
getFlexmarkFactory().getListOptions(),
eventWriter,
eventFactory);
IMarkupVisitor<XMLEventWriter, XMLStreamException> visitor = new MarkupVisitor<>(isBlock());
visitor.visitDocument(getDocument(), writer);
} else {
eventWriter.add(eventFactory.createSpace(""));
}
}
@SuppressWarnings("null")
@Override
public Stream<Node> getNodesAsStream() {
return Stream.concat(Stream.of(getDocument()),
StreamSupport.stream(getDocument().getDescendants().spliterator(), false));
}
@Override
@NonNull
public List<InsertAnchorNode> getInserts() {
return getInserts(insert -> true);
}
/**
* Retrieve all insert statements that are contained within this markup text
* that match the provided filter.
*
* @param filter
* a filter used to identify matching insert statements
* @return the matching insert statements
*/
@Override
@NonNull
public List<InsertAnchorNode> getInserts(@NonNull Predicate<InsertAnchorNode> filter) {
InsertAnchorExtension.InsertVisitor visitor = new InsertAnchorExtension.InsertVisitor(filter);
visitor.visitChildren(getDocument());
return visitor.getInserts();
}
@Override
public String toString() {
return AstCollectingVisitor.asString(getDocument());
}
}