edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngester.java Source code

Java tutorial

Introduction

Here is the source code for edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngester.java

Source

/*
 * Copyright 2012-2015 Johns Hopkins University HLTCOE. All rights reserved.
 * See LICENSE in the project root directory.
 */
package edu.jhu.hlt.concrete.ingesters.webposts;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.io.IOUtils;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.TextSpan;
import edu.jhu.hlt.concrete.communications.WritableCommunication;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.ingesters.base.UTF8FileIngester;
import edu.jhu.hlt.concrete.metadata.tools.SafeTooledAnnotationMetadata;
import edu.jhu.hlt.concrete.metadata.tools.TooledMetadataConverter;
import edu.jhu.hlt.concrete.util.ConcreteException;
import edu.jhu.hlt.concrete.util.ProjectConstants;
import edu.jhu.hlt.concrete.util.Timing;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory.AnalyticUUIDGenerator;
import edu.jhu.hlt.utilt.ex.LoggedUncaughtExceptionHandler;
import edu.jhu.hlt.utilt.io.ExistingNonDirectoryFile;
import edu.jhu.hlt.utilt.io.NotFileException;

/**
 * Class representing a Concrete ingester for web post data.
 */
public class WebPostIngester implements SafeTooledAnnotationMetadata, UTF8FileIngester {

    private static final Logger LOGGER = LoggerFactory.getLogger(WebPostIngester.class);

    public static final String DOC_LOCAL_NAME = "DOC";
    public static final String DOCID_LOCAL_NAME = "DOCID";
    public static final String DOCTYPE_LOCAL_NAME = "DOCTYPE";
    public static final String DATETIME_LOCAL_NAME = "DATETIME";
    public static final String BODY_LOCAL_NAME = "BODY";
    public static final String HEADLINE_LOCAL_NAME = "HEADLINE";
    public static final String TEXT_LOCAL_NAME = "TEXT";
    public static final String POST_LOCAL_NAME = "POST";
    public static final String POSTER_LOCAL_NAME = "POSTER";
    public static final String POSTDATE_LOCAL_NAME = "POSTDATE";

    private final XMLInputFactory inF;
    private final DateTimeFormatter dtf;

    /**
     *
     */
    public WebPostIngester() {
        this.inF = XMLInputFactory.newInstance();
        this.dtf = ISODateTimeFormat.dateTimeParser().withZoneUTC();
        // UTC time!
        // Local time will cause the DST switch exception:
        // 2009-03-08T02:00:02: this instant does not exist under America/NewYork
        // time zone, hence an exception raised
        // To fix this, consider all time as UTC.
        // @tongfei
    }

    /*
     * (non-Javadoc)
     *
     * @see
     * edu.jhu.hlt.concrete.safe.metadata.SafeAnnotationMetadata#getTimestamp()
     */
    @Override
    public long getTimestamp() {
        return Timing.currentLocalTime();
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolName()
     */
    @Override
    public String getToolName() {
        return WebPostIngester.class.getSimpleName();
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolVersion()
     */
    @Override
    public String getToolVersion() {
        return ProjectConstants.VERSION;
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.jhu.hlt.concrete.ingesters.base.Ingester#getKind()
     */
    @Override
    public String getKind() {
        return "web";
    }

    private Section handleBeginning(final XMLEventReader rdr, final String content, final Communication cptr)
            throws XMLStreamException, ConcreteException {
        // The first type is always a document start event. Skip it.
        rdr.nextEvent();

        // The second type is a document block. Skip it.
        rdr.nextEvent();

        // The third type is a whitespace block. Skip it.
        rdr.nextEvent();

        // The next type is a docid start tag.
        rdr.nextEvent();

        // Text of the docid.
        Characters cc = rdr.nextEvent().asCharacters();
        String idTxt = cc.getData().trim();
        cptr.setId(idTxt);

        // The next part is the docid end element. Skip.
        rdr.nextEvent();

        // Whitespace. Skip.
        rdr.nextEvent();

        // Reader is now pointing at the doctype.
        // XMLEvent doctypeStart = rdr.nextEvent();
        rdr.nextEvent();
        // StartElement dtse = doctypeStart.asStartElement();

        // Doc type content.
        Characters docTypeChars = rdr.nextEvent().asCharacters();
        String docTypeContent = docTypeChars.getData().trim();
        cptr.setType(docTypeContent);

        // Doctype end. Skip.
        rdr.nextEvent();
        // Whitespace. skip.
        rdr.nextEvent();
        // Datetime start.
        rdr.nextEvent();

        // Datetime value.
        Characters dtChars = rdr.nextEvent().asCharacters();
        // TODO: parse this

        String dtValue = dtChars.getData().trim();

        DateTime dt = this.dtf.parseDateTime(dtValue).toDateTime(DateTimeZone.UTC);
        LOGGER.debug("Got DateTime: {}", dt.toString());
        long millis = dt.getMillis();
        cptr.setStartTime(millis / 1000);

        // Datetime end.
        rdr.nextEvent();
        // WS
        rdr.nextEvent();
        // Body begin.
        rdr.nextEvent();
        // WS
        rdr.nextEvent();

        // Headline begin.
        XMLEvent hl = rdr.nextEvent();
        StartElement hlse = hl.asStartElement();
        QName hlqn = hlse.getName();
        final String hlPart = hlqn.getLocalPart();
        LOGGER.debug("QN: {}", hlPart);

        // Headline text.
        Characters hlChars = rdr.nextEvent().asCharacters();
        final int charOff = hlChars.getLocation().getCharacterOffset();
        final int clen = hlChars.getData().length();

        // Construct section, text span, etc.
        final int endTextOffset = charOff + clen;
        final String hlText = content.substring(charOff, endTextOffset);

        SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(hlText);
        TextSpan ts = new TextSpan(charOff + pads.getKey(), endTextOffset - pads.getValue());

        Section s = new Section();
        s.setKind("headline");
        s.setTextSpan(ts);
        List<Integer> intList = new ArrayList<>();
        intList.add(0);
        s.setNumberList(intList);
        return s;
    }

    private static SimpleImmutableEntry<Integer, Integer> trimSpacing(final String str) {
        final int leftPadding = getLeftSpacesPaddingCount(str);
        LOGGER.trace("Left padding: {}", leftPadding);
        final int rightPadding = getRightSpacesPaddingCount(str);
        LOGGER.trace("Right padding: {}", rightPadding);
        return new SimpleImmutableEntry<Integer, Integer>(leftPadding, rightPadding);
    }

    /*
     * (non-Javadoc)
     *
     * @see
     * edu.jhu.hlt.concrete.ingesters.base.UTF8FileIngester#fromCharacterBasedFile
     * (java.nio.file.Path)
     */
    @Override
    public Communication fromCharacterBasedFile(final Path path) throws IngestException {
        if (!Files.exists(path))
            throw new IngestException("No file at: " + path.toString());

        AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory();
        AnalyticUUIDGenerator g = f.create();
        Communication c = new Communication();
        c.setUuid(g.next());
        c.setType(this.getKind());
        c.setMetadata(TooledMetadataConverter.convert(this));

        try {
            ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path);
            c.setId(ef.getName().split("\\.")[0]);
        } catch (NoSuchFileException | NotFileException e) {
            // might throw if path is a directory.
            throw new IngestException(path.toString() + " is not a file, or is a directory.");
        }

        String content;
        try (InputStream is = Files.newInputStream(path);
                BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);) {
            content = IOUtils.toString(bin, StandardCharsets.UTF_8);
            c.setText(content);
        } catch (IOException e) {
            throw new IngestException(e);
        }

        try (InputStream is = Files.newInputStream(path);
                BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);
                BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) {
            XMLEventReader rdr = null;
            try {
                rdr = inF.createXMLEventReader(reader);

                // Below method moves the reader
                // to the headline end element.
                Section headline = this.handleBeginning(rdr, content, c);
                headline.setUuid(g.next());
                c.addToSectionList(headline);
                TextSpan sts = headline.getTextSpan();
                LOGGER.debug("headline text: {}", c.getText().substring(sts.getStart(), sts.getEnding()));

                int sectNumber = 1;
                int subSect = 0;

                int currOff = -1;
                // Big amounts of characters.
                while (rdr.hasNext()) {
                    XMLEvent nextEvent = rdr.nextEvent();
                    currOff = nextEvent.getLocation().getCharacterOffset();

                    // First: see if document is going to end.
                    // If yes: exit.
                    if (nextEvent.isEndDocument())
                        break;

                    // region
                    // enables ingestion of quotes inside a usenet webpost.
                    // by Tongfei Chen
                    if (nextEvent.isStartElement()
                            && nextEvent.asStartElement().getName().equals(QName.valueOf("QUOTE"))) {
                        Attribute attrQuote = nextEvent.asStartElement()
                                .getAttributeByName(QName.valueOf("PREVIOUSPOST"));
                        String quote = StringEscapeUtils.escapeXml(attrQuote.getValue());
                        int location = attrQuote.getLocation().getCharacterOffset()
                                + "<QUOTE PREVIOUSPOST=\"".length();
                        Section quoteSection = new Section(g.next(), "quote")
                                .setTextSpan(new TextSpan(location, location + quote.length()));
                        c.addToSectionList(quoteSection);
                    }
                    // endregion

                    // Check if start element.
                    if (nextEvent.isCharacters()) {
                        Characters chars = nextEvent.asCharacters();
                        if (!chars.isWhiteSpace()) {
                            String fpContent = chars.getData();
                            LOGGER.debug("Character offset: {}", currOff);
                            LOGGER.debug("Character based data: {}", fpContent);

                            SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent);
                            final int tsb = currOff + pads.getKey();

                            final int tse = currOff + fpContent.replace("\"", "&quot;").replace("<", "&lt;")
                                    .replace(">", "&gt;").length() - (pads.getValue());
                            // MAINTAIN CORRECT TEXT SPAN
                            // CANNOT USE StringEscapeUtils.escapeXml because it will escape "'", which
                            // is not escaped in the data
                            // @tongfei

                            LOGGER.debug("Section text: {}", content.substring(tsb, tse));
                            TextSpan ts = new TextSpan(tsb, tse);
                            String sk;
                            if (subSect == 0)
                                sk = "poster";
                            else if (subSect == 1)
                                sk = "postdate";
                            else
                                sk = "post";

                            Section s = new Section();
                            s.setKind(sk);
                            s.setTextSpan(ts);
                            s.setUuid(g.next());
                            List<Integer> intList = new ArrayList<>();
                            intList.add(sectNumber);
                            intList.add(subSect);
                            s.setNumberList(intList);
                            c.addToSectionList(s);

                            subSect++;
                        }
                    } else if (nextEvent.isEndElement()) {
                        EndElement ee = nextEvent.asEndElement();
                        currOff = ee.getLocation().getCharacterOffset();
                        QName name = ee.getName();
                        String localName = name.getLocalPart();
                        LOGGER.debug("Hit end element: {}", localName);
                        if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) {
                            LOGGER.debug("Switching to new post.");
                            sectNumber++;
                            subSect = 0;
                        } else if (localName.equalsIgnoreCase(TEXT_LOCAL_NAME)) {
                            // done with document.
                            break;
                        }
                    }
                }

                return c;

            } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException
                    | ClassCastException x) {
                throw new IngestException(x);
            } finally {
                if (rdr != null)
                    try {
                        rdr.close();
                    } catch (XMLStreamException e) {
                        // not likely.
                        LOGGER.info("Error closing XMLReader.", e);
                    }
            }
        } catch (IOException e) {
            throw new IngestException(e);
        }
    }

    private static int getLeftSpacesPaddingCount(final String str) {
        final int len = str.length();
        for (int i = 0; i < len; i++) {
            Character c = str.charAt(i);
            if (!isSpaceOrUnixNewline(c))
                return i;
        }

        return len;
    }

    private static boolean isSpaceOrUnixNewline(final Character c) {
        return c.equals(' ') || c.equals('\n');
    }

    private static int getRightSpacesPaddingCount(final String str) {
        final int lenIdx = str.length() - 1;
        for (int i = 0; i < lenIdx; i++) {
            Character c = str.charAt(lenIdx - i);
            if (!isSpaceOrUnixNewline(c))
                return i;
        }

        return lenIdx + 1;
    }

    public static void main(String... args) {
        Thread.setDefaultUncaughtExceptionHandler(new LoggedUncaughtExceptionHandler());
        if (args.length < 2) {
            LOGGER.info("Usage: {} {} {} {}", WebPostIngester.class.getName(), "/path/to/output/folder",
                    "/path/to/web/.xml/file", "<additional/xml/file/paths>");
            System.exit(1);
        }

        Path outPath = Paths.get(args[0]);
        Optional.ofNullable(outPath.getParent()).ifPresent(p -> {
            if (!Files.exists(p))
                try {
                    Files.createDirectories(p);
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
        });

        if (!Files.isDirectory(outPath)) {
            LOGGER.error("Output path must be a directory.");
            System.exit(1);
        }

        WebPostIngester ing = new WebPostIngester();
        for (int i = 1; i < args.length; i++) {
            Path lp = Paths.get(args[i]);
            LOGGER.info("On path: {}", lp.toString());
            try {
                Communication c = ing.fromCharacterBasedFile(lp);
                new WritableCommunication(c).writeToFile(outPath.resolve(c.getId() + ".comm"), true);
            } catch (IngestException | ConcreteException e) {
                LOGGER.error("Caught exception during ingest on file: " + args[i], e);
            }
        }
    }
}