org.dice_research.topicmodeling.io.xml.BricsBasedXmlParser.java Source code

Introduction

Here is the source code for org.dice_research.topicmodeling.io.xml.BricsBasedXmlParser.java
Source

/**
 * This file is part of topicmodeling.io.
 *
 * topicmodeling.io is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * topicmodeling.io is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with topicmodeling.io.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.dice_research.topicmodeling.io.xml;

import org.apache.commons.lang3.StringEscapeUtils;
import org.dice_research.topicmodeling.automaton.AutomatonCallback;
import org.dice_research.topicmodeling.automaton.BricsAutomatonManager;
import org.dice_research.topicmodeling.automaton.MultiPatternAutomaton;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BricsBasedXmlParser implements AutomatonCallback {

    private static final Logger LOGGER = LoggerFactory.getLogger(BricsBasedXmlParser.class);

    private XMLParserObserver observer;
    private MultiPatternAutomaton automaton;
    private String text;
    private StringBuilder buffer = new StringBuilder();

    private int lastPos;

    public BricsBasedXmlParser(XMLParserObserver observer) {
        this.observer = observer;
        this.automaton = new BricsAutomatonManager(this,
                new String[] { "\\<[^\\<\\>]*\\>", "\\&[#A-Za-z][A-Za-z0-9]{1,6};" });
    }

    @Override
    public void foundPattern(int patternId, int startPos, int length) {
        buffer.append(text.substring(lastPos, startPos));
        switch (patternId) {
        case 0: {
            parseTag(startPos, length);
            break;
        }
        case 1: {
            parseEscapedCharachter(startPos, length);
            break;
        }
        default: {
            LOGGER.error("Got an unknown patternId from the automaton (patternId=" + patternId + ").");
            break;
        }
        }
        lastPos = startPos + length;
    }

    private void parseTag(int startPos, int length) {
        if (buffer.length() > 0) {
            observer.handleData(buffer.toString());
            buffer.delete(0, buffer.length());
        }
        // if this is a closing tag
        if (text.charAt(startPos + 1) == '/') {
            observer.handleClosingTag(text.substring(startPos + 2, (startPos + length) - 1));
        } else if (text.charAt(startPos + length - 2) == '/') {
            // if this is an empty xml tag
            observer.handleEmptyTag(text.substring(startPos + 1, (startPos + length) - 2));
        } else {
            observer.handleOpeningTag(text.substring(startPos + 1, (startPos + length) - 1));
        }
    }

    private void parseEscapedCharachter(int startPos, int length) {
        //        buffer.append(text.substring(lastPos, startPos));
        lastPos = startPos + length;
        buffer.append(StringEscapeUtils.unescapeXml(text.substring(startPos, lastPos)));
    }

    public void parse(String text) {
        lastPos = 0;
        this.text = text;
        automaton.parseText(text);
        this.text = null;
    }
}