gov.llnl.ontology.text.corpora.UkWacDocumentReader.java Source code

Introduction

Here is the source code for gov.llnl.ontology.text.corpora.UkWacDocumentReader.java
Source

/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC. Produced at
 * the Lawrence Livermore National Laboratory. Written by Keith Stevens,
 * kstevens@cs.ucla.edu OCEC-10-073 All rights reserved. 
 *
 * This file is part of the C-Cat package and is covered under the terms and
 * conditions therein.
 *
 * The C-Cat package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package gov.llnl.ontology.text.corpora;

import gov.llnl.ontology.text.DocumentReader;
import gov.llnl.ontology.text.SimpleDocument;

import org.apache.commons.lang3.StringEscapeUtils;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.util.HashSet;

/**
 * A {@link DocumentReader} for the parsed UkWac corpus.  Documents are expected
 * to deliminated with "<text>" tags in an xml format.  Each sentence is
 * expected to be in the UkWac CoNLL format.  The url in the {@code id}
 * attribute of {@code text} is the key and text, and it's hash value is the id
 * for each document.
 *
 * </p>
 *
 * This is <b>not</b> thread safe.
 *
 * @author Keith Stevens
 */
public class UkWacDocumentReader implements DocumentReader {

    public static final String CORPUS_NAME = "ukwac";

    /**
     * Returns {@link #CORPUS_NAME}
     */
    public String corpusName() {
        return CORPUS_NAME;
    }

    /**
     * {@inheritDoc}
     */
    public gov.llnl.ontology.text.Document readDocument(String doc) {
        return readDocument(doc, corpusName());
    }

    /**
     * {@inheritDoc}
     */
    public gov.llnl.ontology.text.Document readDocument(String doc, String corpusName) {
        String[] lines = doc.split("\\n");

        // Find the title.
        int titleStart = lines[0].indexOf("id=\"") + 4;
        int titleEnd = lines[0].lastIndexOf("\">");
        String key = lines[0].substring(titleStart, titleEnd);
        long id = key.hashCode();

        StringBuilder builder = new StringBuilder();
        for (int i = 1; i < lines.length - 1; ++i) {
            // Skip empty lines and xml tags.
            if (lines[i].length() == 0 || lines[i].endsWith("s>"))
                continue;

            lines[i] = StringEscapeUtils.unescapeHtml4(lines[i]);
            String[] toks = lines[i].split("\\s+");
            builder.append(toks[0]).append(" ");
        }

        return new SimpleDocument(corpusName, builder.toString(), doc, key, id, key, new HashSet<String>());
    }
}