XmlMarkupParser.java

/*
 * SPDX-FileCopyrightText: none
 * SPDX-License-Identifier: CC0-1.0
 */

package gov.nist.secauto.metaschema.core.datatype.markup.flexmark;

import com.vladsch.flexmark.util.sequence.Escaping;

import gov.nist.secauto.metaschema.core.datatype.markup.MarkupLine;
import gov.nist.secauto.metaschema.core.datatype.markup.MarkupMultiline;
import gov.nist.secauto.metaschema.core.model.util.XmlEventUtil;
import gov.nist.secauto.metaschema.core.util.CollectionUtil;
import gov.nist.secauto.metaschema.core.util.ObjectUtils;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codehaus.stax2.XMLEventReader2;

import java.util.Set;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import edu.umd.cs.findbugs.annotations.NonNull;

/**
 * Converts XHTML-style markup read from a StAX event stream into an HTML
 * string, which is then passed to {@code MarkupLine.fromHtml} or
 * {@code MarkupMultiline.fromHtml}.
 * <p>
 * The parser holds no instance state; a single shared instance is available
 * through {@link #instance()}.
 */
public final class XmlMarkupParser {
  private static final Logger LOGGER = LogManager.getLogger(XmlMarkupParser.class);

  /**
   * The local names of the elements that are treated as block-level content
   * when parsing a markup multiline.
   */
  @NonNull
  public static final Set<String> BLOCK_ELEMENTS = ObjectUtils.notNull(
      Set.of(
          "h1",
          "h2",
          "h3",
          "h4",
          "h5",
          "h6",
          "ul",
          "ol",
          "pre",
          "hr",
          "blockquote",
          "p",
          "table",
          "img"));

  @NonNull
  private static final XmlMarkupParser SINGLETON = new XmlMarkupParser();

  /**
   * Get the singleton markup parser instance.
   *
   * @return the instance
   */
  @NonNull
  public static XmlMarkupParser instance() {
    // SINGLETON is a static final field assigned during class initialization,
    // so no locking is needed to read it
    return SINGLETON;
  }

  private XmlMarkupParser() {
    // disable construction
  }

  /**
   * Parse a single line of markup from XHTML.
   * <p>
   * Events are read until the next END_ELEMENT (which is not consumed) or
   * until the reader has no more events.
   *
   * @param reader
   *          the XML event stream reader
   * @return the markup string, or {@code null} if the parsed content is empty
   *         after trimming
   * @throws XMLStreamException
   *           if an error occurred while parsing
   */
  public MarkupLine parseMarkupline(XMLEventReader2 reader) throws XMLStreamException { // NOPMD - acceptable
    StringBuilder buffer = new StringBuilder();
    // no enclosing start element is known here, so none is passed for the
    // end-of-content assertion
    parseContents(reader, null, buffer);
    String html = buffer.toString().trim();
    return html.isEmpty() ? null : MarkupLine.fromHtml(html);
  }

  /**
   * Parse a markup multiline from XHTML.
   * <p>
   * Consecutive elements whose local name is in {@link #BLOCK_ELEMENTS} are
   * read; parsing stops at the first event that is not such an element.
   *
   * @param reader
   *          the XML event stream reader
   * @return the markup string, or {@code null} if the parsed content is empty
   *         after trimming
   * @throws XMLStreamException
   *           if an error occurred while parsing
   */
  public MarkupMultiline parseMarkupMultiline(XMLEventReader2 reader) throws XMLStreamException {
    StringBuilder buffer = new StringBuilder();
    parseToString(reader, buffer);
    String html = buffer.toString().trim();

    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("XML->HTML: {}", html);
    }
    return html.isEmpty() ? null : MarkupMultiline.fromHtml(html);
  }

  /**
   * Append the HTML for a sequence of block elements to the buffer.
   * <p>
   * Parsing stops, without consuming the offending event, at the first event
   * that is not the start of an element named in {@link #BLOCK_ELEMENTS}.
   *
   * @param reader
   *          the XML event stream reader
   * @param buffer
   *          the markup string buffer
   * @throws XMLStreamException
   *           if an error occurred while parsing
   */
  private void parseToString(XMLEventReader2 reader, StringBuilder buffer) // NOPMD - acceptable
      throws XMLStreamException {
    while (reader.hasNextEvent() && !reader.peek().isEndElement()) {
      // skip whitespace before the next block element
      XMLEvent nextEvent = XmlEventUtil.skipWhitespace(reader);

      if (!nextEvent.isStartElement()) {
        // Either the enclosing END_ELEMENT or an event this method does not
        // handle (text, comment, ...). Nothing below would consume it, so
        // looping again could never make progress; stop here instead.
        break;
      }

      StartElement start = nextEvent.asStartElement();
      if (!BLOCK_ELEMENTS.contains(start.getName().getLocalPart())) {
        // stop parsing on first unrecognized element
        break;
      }

      // Note: the start element has only been peeked; the called method
      // consumes it along with its matching END_ELEMENT
      parseStartElement(reader, start, buffer);

      // skip whitespace before the next block element
      XmlEventUtil.skipWhitespace(reader);
    }
  }

  /**
   * Append the HTML for a single element, including its attributes and
   * content, to the buffer.
   * <p>
   * On entry the element's start event is expected to be the next event in the
   * reader. On exit the element's END_ELEMENT has been consumed.
   *
   * @param reader
   *          the XML event stream reader
   * @param start
   *          the peeked start element event to parse
   * @param buffer
   *          the markup string buffer
   * @throws XMLStreamException
   *           if an error occurred while parsing
   */
  private void parseStartElement(XMLEventReader2 reader, StartElement start, StringBuilder buffer)
      throws XMLStreamException {
    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("parseStartElement(enter): {}", XmlEventUtil.toString(start));
    }

    // consume the start event
    reader.nextEvent();

    QName name = start.getName();
    String localName = name.getLocalPart();
    buffer.append('<')
        .append(localName);
    for (Attribute attribute : CollectionUtil.toIterable(
        ObjectUtils.notNull(start.getAttributes()))) {
      buffer
          .append(' ')
          .append(attribute.getName().getLocalPart())
          .append("=\"")
          // StAX reports the attribute value with entity references already
          // expanded, so markup characters (including the double quote that
          // delimits the value) must be escaped again before being written
          .append(Escaping.escapeHtml(attribute.getValue(), false))
          .append('"');
    }

    XMLEvent next = reader.peek();
    if (next != null && next.isEndElement()) {
      // no content: write the element in self-closing form
      buffer.append("/>");
      // consume end element event
      reader.nextEvent();
    } else {
      buffer.append('>');

      // parse until the start's END_ELEMENT is reached
      parseContents(reader, start, buffer);

      buffer
          .append("</")
          .append(localName)
          .append('>');

      // the next event should be the start's END_ELEMENT
      XmlEventUtil.assertNext(reader, XMLStreamConstants.END_ELEMENT, name);

      // consume the start's END_ELEMENT
      reader.nextEvent();
    }

    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("parseStartElement(exit): {}", reader.peek() != null ? XmlEventUtil.toString(reader.peek()) : "");
    }
  }

  /**
   * Append the HTML for mixed content (child elements and text) to the buffer.
   * <p>
   * Events are read until the next unmatched END_ELEMENT, which is left in the
   * reader for the caller, or until the reader has no more events. Events that
   * are neither elements nor character data are consumed and ignored.
   *
   * @param reader
   *          the XML event stream reader
   * @param start
   *          the element whose content is being parsed, or {@code null} if the
   *          enclosing element is not known to this parser
   * @param buffer
   *          the markup string buffer
   * @throws XMLStreamException
   *           if an error occurred while parsing
   */
  private void parseContents(XMLEventReader2 reader, StartElement start, StringBuilder buffer)
      throws XMLStreamException {
    while (reader.hasNextEvent()) {
      XMLEvent event = reader.peek();
      if (event.isEndElement()) {
        // leave the END_ELEMENT for the caller to verify and consume
        break;
      }

      if (event.isStartElement()) {
        // the child element is consumed, through its END_ELEMENT, by the callee
        parseStartElement(reader, event.asStartElement(), buffer);
      } else {
        if (event.isCharacters()) {
          Characters characters = event.asCharacters();
          buffer.append(Escaping.escapeHtml(characters.getData(), true));
        }
        // Always consume the event, even when it carries nothing to emit
        // (comment, processing instruction, ...); otherwise the same event
        // would be peeked forever and this loop would never terminate.
        reader.nextEvent();
      }
    }

    assert start == null
        || XmlEventUtil.isEventEndElement(reader.peek(), ObjectUtils.notNull(start.getName())) : XmlEventUtil
            .generateExpectedMessage(reader.peek(), XMLStreamConstants.END_ELEMENT, start.getName());
  }

}