edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester.java Source code

Introduction

Here is the source code for edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester.java
Source

/*
 * Copyright 2012-2015 Johns Hopkins University HLTCOE. All rights reserved.
 * See LICENSE in the project root directory.
 */
package edu.jhu.hlt.concrete.ingesters.bolt;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.TextSpan;
import edu.jhu.hlt.concrete.communications.WritableCommunication;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.ingesters.base.UTF8FileIngester;
import edu.jhu.hlt.concrete.metadata.tools.SafeTooledAnnotationMetadata;
import edu.jhu.hlt.concrete.metadata.tools.TooledMetadataConverter;
import edu.jhu.hlt.concrete.section.SectionFactory;
import edu.jhu.hlt.concrete.util.ConcreteException;
import edu.jhu.hlt.concrete.util.ProjectConstants;
import edu.jhu.hlt.concrete.util.Timing;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory.AnalyticUUIDGenerator;
import edu.jhu.hlt.utilt.ex.LoggedUncaughtExceptionHandler;
import edu.jhu.hlt.utilt.io.ExistingNonDirectoryFile;
import edu.jhu.hlt.utilt.io.NotFileException;

/**
 * Concrete ingester for BOLT forum post data.
 *
 * <p>
 * Reads a UTF-8 BOLT forum XML file and produces a {@link Communication} whose text is the
 * raw file content and whose sections are character-offset spans into that text:
 * one <code>headline</code> section, <code>author</code> / <code>datetime</code> sections for
 * start tags that carry both attributes, and <code>post</code> sections for the non-blank
 * character runs found after the first post tag. The ingester attempts to step over quote,
 * image and link elements instead of turning them into sections.
 */
public class BoltForumPostIngester implements SafeTooledAnnotationMetadata, UTF8FileIngester {

    private static final Logger LOGGER = LoggerFactory.getLogger(BoltForumPostIngester.class);

    /**
     * When <code>true</code>, leading and trailing whitespace is excluded from the text
     * span of the headline section.
     */
    public static boolean STRIP_WHITESPACE_OFF_HEADLINE = true;

    public static final String POST_LOCAL_NAME = "post";
    public static final String IMG_LOCAL_NAME = "img";
    public static final String QUOTE_LOCAL_NAME = "quote";
    public static final String LINK_LOCAL_NAME = "a";

    /** Size, in bytes, of the buffer placed around the file input streams. */
    private static final int READ_BUFFER_SIZE = 1024 * 8 * 8;

    /** Number of characters shown on each side of an event offset in debug output. */
    private static final int DEBUG_CONTEXT_CHARS = 20;

    private final XMLInputFactory inF;

    /**
     * Creates an ingester backed by a new {@link XMLInputFactory} with DTD processing and
     * external entity resolution switched off.
     */
    public BoltForumPostIngester() {
        this.inF = XMLInputFactory.newInstance();
        // XXE hardening: the parsing code below assumes documents without a DOCTYPE,
        // so there is no reason to process DTDs or resolve external entities.
        this.inF.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        this.inF.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    }

    /* (non-Javadoc)
     * @see edu.jhu.hlt.concrete.safe.metadata.SafeAnnotationMetadata#getTimestamp()
     */
    @Override
    public long getTimestamp() {
        return Timing.currentLocalTime();
    }

    /* (non-Javadoc)
     * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolName()
     */
    @Override
    public String getToolName() {
        return BoltForumPostIngester.class.getSimpleName();
    }

    /* (non-Javadoc)
     * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolVersion()
     */
    @Override
    public String getToolVersion() {
        return ProjectConstants.VERSION;
    }

    /* (non-Javadoc)
     * @see edu.jhu.hlt.concrete.ingesters.base.Ingester#getKind()
     */
    @Override
    public String getKind() {
        return "forum-post";
    }

    /**
     * Consumes the first seven events of the reader (document start, document element,
     * whitespace, headline start tag, headline text, headline end tag, whitespace) and
     * builds the headline section from the offset and length of the headline text event.
     *
     * <p>
     * The event sequence is assumed, not verified: a headline start tag or text event in
     * an unexpected position surfaces as a runtime exception from the event cast.
     *
     * @param rdr
     *            reader positioned at the very beginning of the document
     * @param content
     *            the full document text, used to locate whitespace around the headline
     * @return a section of kind <code>headline</code>, numbered <code>[0]</code>
     * @throws XMLStreamException
     *             on a malformed document
     * @throws ConcreteException
     *             declared for callers; see the section-building code
     */
    private static Section handleHeadline(final XMLEventReader rdr, final String content)
            throws XMLStreamException, ConcreteException {
        // The first type is always a document start event. Skip it.
        rdr.nextEvent();

        // The second type is a document ID block. Skip it.
        rdr.nextEvent();

        // The third type is a whitespace block. Skip it.
        rdr.nextEvent();

        // The next type is a headline start tag.
        final StartElement headlineStart = rdr.nextEvent().asStartElement();
        LOGGER.debug("QN: {}", headlineStart.getName().getLocalPart());
        LOGGER.debug("HL part offset: {}", headlineStart.getLocation().getCharacterOffset());

        // Text of the headline. Only its offset and length are used here; the
        // text itself is re-read from content so that spans index the raw file.
        final Characters headlineText = rdr.nextEvent().asCharacters();
        final int charOff = headlineText.getLocation().getCharacterOffset();
        final int charOffPlusLen = charOff + headlineText.getData().length();

        // The next part is the headline end element. Skip.
        rdr.nextEvent();

        // Whitespace. Skip.
        rdr.nextEvent();

        // Reader is now pointing at the first post.
        final TextSpan ts;
        if (STRIP_WHITESPACE_OFF_HEADLINE) {
            final String hlText = content.substring(charOff, charOffPlusLen);
            final SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(hlText);
            ts = new TextSpan(charOff + pads.getKey(), charOffPlusLen - pads.getValue());
        } else {
            ts = new TextSpan(charOff, charOffPlusLen);
        }
        assert ts.getStart() <= ts.getEnding() : "ts=" + ts;

        final Section s = new Section();
        s.setKind("headline");
        s.setTextSpan(ts);
        final List<Integer> intList = new ArrayList<>();
        intList.add(0);
        s.setNumberList(intList);
        return s;
    }

    /**
     * Measures the whitespace surrounding the given string.
     *
     * @param str
     *            the string to inspect
     * @return an entry whose key is the number of leading whitespace characters and whose
     *         value is the number of trailing whitespace characters
     */
    private static SimpleImmutableEntry<Integer, Integer> trimSpacing(final String str) {
        final int leftPadding = getLeftSpacesPaddingCount(str);
        LOGGER.trace("Left padding: {}", leftPadding);
        final int rightPadding = getRightSpacesPaddingCount(str);
        LOGGER.trace("Right padding: {}", rightPadding);
        return new SimpleImmutableEntry<>(leftPadding, rightPadding);
    }

    /**
     * Steps over the remainder of a link element whose start tag was just read. A link is
     * either a start tag directly followed by its end tag, or a start tag, one character
     * event, and an end tag.
     *
     * @param rdr
     *            reader positioned just after a link start tag
     * @return the character offset of the last event consumed
     * @throws XMLStreamException
     *             on a malformed document
     */
    private int handleLink(final XMLEventReader rdr) throws XMLStreamException {
        final XMLEvent linkContent = rdr.nextEvent();
        if (linkContent.isEndElement()) {
            return linkContent.getLocation().getCharacterOffset();
        }
        if (linkContent.isCharacters()) {
            // The event after the link text is taken to be the end of the link.
            return rdr.nextEvent().getLocation().getCharacterOffset();
        }
        throw new RuntimeException("Characters did not follow link.");
    }

    /**
     * Reads the next event, which must be a quote, image or link start tag, and steps over
     * that element using the matching handler.
     *
     * @param rdr
     *            reader whose next event is a non-post start tag
     * @return the character offset reported by the handler
     * @throws XMLStreamException
     *             on a malformed document
     * @throws IllegalArgumentException
     *             if the start tag is not a quote, image or link
     */
    private int handleNonPostStartElement(final XMLEventReader rdr) throws XMLStreamException {
        // Next is a start element. Throw if not.
        final StartElement se = rdr.nextEvent().asStartElement();
        final String part = se.getName().getLocalPart();

        if (part.equals(QUOTE_LOCAL_NAME)) {
            return this.handleQuote(rdr);
        } else if (part.equals(IMG_LOCAL_NAME)) {
            return this.handleImg(rdr);
        } else if (part.equals(LINK_LOCAL_NAME)) {
            return this.handleLink(rdr);
        } else {
            throw new IllegalArgumentException("Unhandled tag: " + part);
        }
    }

    /**
     * Advances the reader so that the next call to <code>nextEvent</code> returns the start
     * tag of a post. Non-post start tags are stepped over with
     * {@link #handleNonPostStartElement(XMLEventReader)}; every other event is dropped.
     * If the stream ends before a post start tag is seen, the method simply returns with
     * the reader exhausted.
     *
     * @param rdr
     *            the reader to advance
     * @throws XMLStreamException
     *             on a malformed document
     */
    private void iterateToPosts(final XMLEventReader rdr) throws XMLStreamException {
        while (rdr.hasNext()) {
            final XMLEvent upcoming = rdr.peek();
            if (upcoming == null) {
                // Nothing left to look at: the document has no posts.
                return;
            }

            if (!upcoming.isStartElement()) {
                // Drop.
                rdr.nextEvent();
                continue;
            }

            final String localName = upcoming.asStartElement().getName().getLocalPart();
            if (localName.equals(POST_LOCAL_NAME)) {
                return;
            }

            // Churn through non-post start tags.
            this.handleNonPostStartElement(rdr);
        }
    }

    /**
     * Steps over the content of a quote element whose start tag was just read. The quote
     * must begin with a character event; events are then consumed up to and including
     * the first end tag encountered.
     *
     * <p>
     * NOTE(review): the scan stops at the first end tag of <em>any</em> element, so a quote
     * containing nested markup (a link, another quote) is only skipped up to that nested
     * end tag. An earlier comment suggested the intent was to stop at the closing quote
     * tag; confirm before changing, because section offsets depend on this behavior.
     *
     * @param rdr
     *            reader positioned just after a quote start tag
     * @return the character offset of the end tag that stopped the scan
     * @throws XMLStreamException
     *             on a malformed document
     */
    private int handleQuote(final XMLEventReader rdr) throws XMLStreamException {
        // For quotes, there will be character contents - skip for now...
        final XMLEvent quoteContent = rdr.nextEvent();
        if (!quoteContent.isCharacters()) {
            throw new RuntimeException("Characters did not follow quote.");
        }

        XMLEvent next = rdr.nextEvent();
        while (!next.isEndElement()) {
            next = rdr.nextEvent();
        }
        return next.getLocation().getCharacterOffset();
    }

    /**
     * Steps over an image element whose start tag was just read; the next event must be
     * its end tag.
     *
     * @param rdr
     *            reader positioned just after an image start tag
     * @return the character offset of the image end tag
     * @throws XMLStreamException
     *             on a malformed document
     */
    private int handleImg(final XMLEventReader rdr) throws XMLStreamException {
        final XMLEvent n = rdr.nextEvent();
        return n.asEndElement().getLocation().getCharacterOffset();
    }

    /**
     * Logs, at debug level, the given event offset and the document text around it.
     * Offsets that are not positive are ignored, and the window is clamped to the start
     * of the document so that offsets near the beginning cannot fail.
     */
    private static void logEventContext(final String content, final int offset) {
        if (offset <= 0 || !LOGGER.isDebugEnabled()) {
            return;
        }

        LOGGER.debug("Offset: {}", offset);
        final int windowEnd = offset + DEBUG_CONTEXT_CHARS;
        if (windowEnd < content.length()) {
            final int windowStart = Math.max(0, offset - DEBUG_CONTEXT_CHARS);
            LOGGER.debug("Surrounding text: {}", content.substring(windowStart, windowEnd));
        }
    }

    /**
     * Ingests one BOLT forum XML file.
     *
     * <p>
     * The communication ID is the file name up to its first dot, and the communication
     * text is the file content decoded as UTF-8. The file is then parsed as XML to add
     * the headline section followed by author, datetime and post sections. Post sections
     * are numbered <code>[post index, index within post]</code>, with the post index
     * starting at 1 and increasing at every closing post tag.
     *
     * @param path
     *            path to the file to ingest
     * @return the populated communication
     * @throws IngestException
     *             if the path does not exist or is not a regular file, if the file cannot
     *             be read, or if parsing or section construction fails
     */
    /* (non-Javadoc)
     * @see edu.jhu.hlt.concrete.ingesters.base.UTF8FileIngester#fromCharacterBasedFile(java.nio.file.Path)
     */
    @Override
    public Communication fromCharacterBasedFile(final Path path) throws IngestException {
        if (!Files.exists(path)) {
            throw new IngestException("No file at: " + path.toString());
        }

        AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory();
        AnalyticUUIDGenerator gen = f.create();
        Communication c = new Communication();
        c.setUuid(gen.next());
        c.setType(this.getKind());
        c.setMetadata(TooledMetadataConverter.convert(this));

        try {
            ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path);
            c.setId(ef.getName().split("\\.")[0]);
        } catch (NoSuchFileException | NotFileException e) {
            // might throw if path is a directory.
            throw new IngestException(path.toString() + " is not a file, or is a directory.");
        }

        // First pass: the raw text, against which all section offsets are expressed.
        final String content;
        try (InputStream is = Files.newInputStream(path);
                BufferedInputStream bin = new BufferedInputStream(is, READ_BUFFER_SIZE);) {
            content = IOUtils.toString(bin, StandardCharsets.UTF_8);
            c.setText(content);
        } catch (IOException e) {
            throw new IngestException(e);
        }

        // Second pass: the XML event stream, read through a character reader so that
        // event offsets are reported against the decoded text.
        try (InputStream is = Files.newInputStream(path);
                BufferedInputStream bin = new BufferedInputStream(is, READ_BUFFER_SIZE);
                BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) {
            XMLEventReader rdr = null;
            try {
                rdr = inF.createXMLEventReader(reader);

                // Below method moves the reader
                // to the first post element.
                Section headline = handleHeadline(rdr, content);
                headline.setUuid(gen.next());
                c.addToSectionList(headline);
                int start = headline.getTextSpan().getStart();
                int ending = headline.getTextSpan().getEnding();
                if (ending < start) {
                    ending = start; // @tongfei: handle empty headlines
                }
                String htxt = c.getText().substring(start, ending);
                LOGGER.debug("headline text: {}", htxt);

                // Section indices.
                int sectNumber = 1;
                int subSect = 0;

                // Move iterator to post start element.
                this.iterateToPosts(rdr);

                SectionFactory sf = new SectionFactory(gen);

                // First post element.
                while (rdr.hasNext()) {
                    final XMLEvent nextEvent = rdr.nextEvent();
                    final int currOff = nextEvent.getLocation().getCharacterOffset();
                    logEventContext(content, currOff);

                    // First: see if document is going to end.
                    // If yes: exit.
                    if (nextEvent.isEndDocument()) {
                        break;
                    }

                    if (nextEvent.isStartElement()) {
                        StartElement se = nextEvent.asStartElement();
                        final String localName = se.getName().getLocalPart();
                        LOGGER.debug("Hit start element: {}", localName);

                        //region
                        // Add sections for authors and datetimes for each bolt post
                        // by Tongfei Chen
                        Attribute attrAuthor = se.getAttributeByName(QName.valueOf("author"));
                        Attribute attrDateTime = se.getAttributeByName(QName.valueOf("datetime"));

                        if (attrAuthor != null && attrDateTime != null) {
                            // NOTE(review): the arithmetic below presumes the raw tag text is
                            // laid out as a post tag with author first and datetime second,
                            // and that attribute values contain no escaped characters -
                            // confirm against the BOLT data.
                            int loc = attrAuthor.getLocation().getCharacterOffset();
                            int authorLength = attrAuthor.getValue().length();

                            int sectAuthorBeginningOffset = loc + "<post author=\"".length();

                            Section sectAuthor = sf.fromTextSpan(new TextSpan(sectAuthorBeginningOffset,
                                    sectAuthorBeginningOffset + authorLength), "author");
                            c.addToSectionList(sectAuthor);

                            int sectDateTimeBeginningOffset = sectAuthorBeginningOffset
                                    + authorLength + " datetime=".length();

                            Section sectDateTime = sf.fromTextSpan(
                                    new TextSpan(sectDateTimeBeginningOffset,
                                            sectDateTimeBeginningOffset + attrDateTime.getValue().length()),
                                    "datetime");
                            c.addToSectionList(sectDateTime);
                        }
                        //endregion

                        // Move past quotes, images, and links.
                        if (localName.equals(QUOTE_LOCAL_NAME)) {
                            this.handleQuote(rdr);
                        } else if (localName.equals(IMG_LOCAL_NAME)) {
                            this.handleImg(rdr);
                        } else if (localName.equals(LINK_LOCAL_NAME)) {
                            this.handleLink(rdr);
                        }
                    } else if (nextEvent.isCharacters()) {
                        Characters chars = nextEvent.asCharacters();
                        if (chars.isWhiteSpace()) {
                            continue;
                        }

                        // content to be captured
                        String fpContent = chars.getData();
                        LOGGER.debug("Character offset: {}", chars.getLocation().getCharacterOffset());
                        LOGGER.debug("Character based data: {}", fpContent);

                        SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent);
                        final int tsb = currOff + pads.getKey();
                        final int tse = currOff + fpContent.length() - pads.getValue();
                        final String subs = content.substring(tsb, tse);
                        // Space separators are not all covered by the whitespace trimming
                        // above, so check again for spans with nothing visible in them.
                        if (subs.replaceAll("\\p{Zs}", "").replaceAll("\\n", "").isEmpty()) {
                            LOGGER.info("Found empty section: skipping.");
                            continue;
                        }

                        LOGGER.debug("Section text: {}", subs);
                        TextSpan ts = new TextSpan(tsb, tse);

                        Section s = sf.fromTextSpan(ts, "post");
                        List<Integer> intList = new ArrayList<>();
                        intList.add(sectNumber);
                        intList.add(subSect);
                        s.setNumberList(intList);
                        c.addToSectionList(s);

                        subSect++;
                    } else if (nextEvent.isEndElement()) {
                        EndElement ee = nextEvent.asEndElement();
                        String localName = ee.getName().getLocalPart();
                        LOGGER.debug("Hit end element: {}", localName);
                        if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) {
                            // A post just closed: the next one starts a new numbering group.
                            sectNumber++;
                            subSect = 0;
                        }
                    }
                }
                return c;
            } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException x) {
                throw new IngestException(x);
            } finally {
                if (rdr != null) {
                    try {
                        rdr.close();
                    } catch (XMLStreamException e) {
                        // not likely.
                        LOGGER.info("Error closing XMLReader.", e);
                    }
                }
            }
        } catch (IOException e) {
            throw new IngestException(e);
        }
    }

    /**
     * Length of longest whitespace prefix.
     */
    private static int getLeftSpacesPaddingCount(final String str) {
        final int len = str.length();
        for (int i = 0; i < len; i++) {
            if (!Character.isWhitespace(str.charAt(i))) {
                return i;
            }
        }
        return len;
    }

    /**
     * Number of whitespace characters that follow the last non-whitespace character
     * (if the given string is empty or all whitespace, this returns 0).
     */
    private static int getRightSpacesPaddingCount(final String str) {
        final int lastIdx = str.length() - 1;
        // Walk backwards over every index, including 0, so that a string whose only
        // non-whitespace character is its first one is measured correctly.
        for (int i = 0; i <= lastIdx; i++) {
            if (!Character.isWhitespace(str.charAt(lastIdx - i))) {
                return i;
            }
        }
        return 0;
    }

    /**
     * Command-line entry point: ingests each given BOLT XML file and writes the resulting
     * communication to <code>&lt;id&gt;.comm</code> in the output directory, overwriting
     * an existing file. The output directory is created if it does not exist. Failures on
     * individual files are logged and do not stop the run.
     *
     * @param args
     *            the output directory followed by one or more BOLT XML file paths
     */
    public static void main(String... args) {
        Thread.setDefaultUncaughtExceptionHandler(new LoggedUncaughtExceptionHandler());
        if (args.length < 2) {
            LOGGER.info("Usage: {} {} {} {}", BoltForumPostIngester.class.getName(), "/path/to/output/folder",
                    "/path/to/bolt/.xml/file", "<additional/xml/file/paths>");
            System.exit(1);
        }

        Path outPath = Paths.get(args[0]);
        if (!Files.exists(outPath)) {
            try {
                // Creates the output directory itself along with any missing parents.
                Files.createDirectories(outPath);
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }

        if (!Files.isDirectory(outPath)) {
            LOGGER.error("Output path must be a directory.");
            System.exit(1);
        }

        BoltForumPostIngester ing = new BoltForumPostIngester();
        for (int i = 1; i < args.length; i++) {
            Path lp = Paths.get(args[i]);
            LOGGER.info("On path: {}", lp.toString());
            try {
                Communication c = ing.fromCharacterBasedFile(lp);
                new WritableCommunication(c).writeToFile(outPath.resolve(c.getId() + ".comm"), true);
            } catch (IngestException | ConcreteException e) {
                LOGGER.error("Caught exception during ingest on file: " + args[i], e);
            }
        }
    }
}