Java tutorial

The listing below is TrinidadTable, an HBase-backed implementation of the CorpusTable interface from the C-Cat package. It defines the HBase schema for a document corpus (column families for source, text, annotations, word-list labels, and metadata) along with the read and write operations used by C-Cat's MapReduce jobs.
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC. Produced at
 * the Lawrence Livermore National Laboratory. Written by Keith Stevens,
 * kstevens@cs.ucla.edu OCEC-10-073 All rights reserved.
 *
 * This file is part of the C-Cat package and is covered under the terms and
 * conditions therein.
 *
 * The C-Cat package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package gov.llnl.ontology.mapreduce.table;

import gov.llnl.ontology.text.Document;
import gov.llnl.ontology.text.Sentence;
import gov.llnl.ontology.text.hbase.DynamicDocument;
import gov.llnl.ontology.util.StringPair;

import com.google.gson.reflect.TypeToken;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;

import java.io.IOError;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * @author Keith Stevens
 */
public class TrinidadTable implements CorpusTable {

    /**
     * Stores the text type of any document.
     */
    public static final String XML_MIME_TYPE = "text/xml";

    /**
     * A marker to request all corpora types when scanning.
     */
    public static final String ALL_CORPORA = "";

    /**
     * The official table name.
     */
    public static final String TABLE_NAME = "trinidad_table";

    /**
     * The column family for source related columns.
     */
    public static final String SOURCE_CF = "src";

    /**
     * The column qualifier for the corpus source name.
     */
    public static final String SOURCE_NAME = "name";

    /**
     * The full column qualifier for the corpus source name.
     */
    public static final String SOURCE_NAMECOL = SOURCE_CF + ":" + SOURCE_NAME;

    /**
     * The column qualifier for the corpus id.
     */
    public static final String SOURCE_ID = "id";

    /**
     * The full column qualifier for the corpus id.
     */
    public static final String SOURCE_IDCOL = SOURCE_CF + ":" + SOURCE_ID;

    /**
     * The column family for the text columns.
     */
    public static final String TEXT_CF = "text";

    /**
     * The column qualifier for the original document text.
     */
    public static final String TEXT_ORIGINAL = "orig";

    /**
     * The full column qualifier for the original document text.
     */
    public static final String TEXT_ORIGINAL_COL = TEXT_CF + ":" + TEXT_ORIGINAL;

    /**
     * The column qualifier for the text type.
     */
    public static final String TEXT_TYPE = "text";

    /**
     * The full column qualifier for the text type.
     */
    public static final String TEXT_TYPE_COL = TEXT_CF + ":" + TEXT_TYPE;

    /**
     * The column qualifier for the cleaned document text.
     */
    public static final String TEXT_RAW = "raw";

    /**
     * The full column qualifier for the cleaned document text.
     */
    public static final String TEXT_RAW_COL = TEXT_CF + ":" + TEXT_RAW;

    /**
     * The column qualifier for the document title.
     */
    public static final String TEXT_TITLE = "title";

    /**
     * The full column qualifier for the document title.
     */
    public static final String TEXT_TITLE_COL = TEXT_CF + ":" + TEXT_TITLE;

    /**
     * The column family for the document annotations.
     */
    public static final String ANNOTATION_CF = "annotations";

    /**
     * The column qualifier for the sentence level document annotations.
     */
    public static final String ANNOTATION_SENTENCE = "sentence";

    /**
     * The column qualifier for the token level document annotations.
     */
    public static final String ANNOTATION_TOKEN = "token";

    /**
     * The column qualifier prefix for sentence level word sense annotations.
     */
    public static final String SENSE_SENTENCE_PREFIX = "wss";

    /**
     * The column qualifier prefix for token level word sense annotations.
     */
    public static final String SENSE_TOKEN_PREFIX = "wst";

    /**
     * The column family for word list labels associated with each document.
     */
    public static final String LABEL_CF = "wordListLabels";

    /**
     * The column family for the meta data associated with each document.
     */
    public static final String META_CF = "meta";

    /**
     * The column name for categories that a document may fall under, if any.
     */
    public static final String CATEGORY_COLUMN = "categories";

    /**
     * The column name for the document key.
     */
    public static final String DOC_KEY = "key";

    /**
     * The column name for the document id.
     */
    public static final String DOC_ID = "id";

    /**
     * A connection to the {@link HTable}.
     */
    private HTable table;

    /**
     * Creates a new {@link TrinidadTable} that uses the default
     * {@link HBaseConfiguration}.
     */
    public TrinidadTable() {
        table = null;
    }

    /**
     * {@inheritDoc}
     */
    public void createTable() {
        try {
            Configuration conf = HBaseConfiguration.create();
            HConnection connector = HConnectionManager.getConnection(conf);
            createTable(connector);
        } catch (ZooKeeperConnectionException zkce) {
            zkce.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * {@inheritDoc}
     */
    public void createTable(HConnection connector) {
        try {
            Configuration config = HBaseConfiguration.create();
            HBaseAdmin admin = new HBaseAdmin(config);

            // Do nothing if the table already exists.
            if (admin.tableExists(tableName()))
                return;

            HTableDescriptor docDesc = new HTableDescriptor(tableName());
            SchemaUtil.addDefaultColumnFamily(docDesc, SOURCE_CF);
            SchemaUtil.addDefaultColumnFamily(docDesc, TEXT_CF);
            SchemaUtil.addDefaultColumnFamily(docDesc, ANNOTATION_CF);
            SchemaUtil.addDefaultColumnFamily(docDesc, LABEL_CF);
            SchemaUtil.addDefaultColumnFamily(docDesc, META_CF);
            admin.createTable(docDesc);
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }

    /**
     * {@inheritDoc}
     */
    public void setupScan(Scan scan) {
        setupScan(scan, ALL_CORPORA);
    }

    /**
     * {@inheritDoc}
     */
    public void setupScan(Scan scan, String corpusName) {
        scan.addColumn(SOURCE_CF.getBytes(), SOURCE_NAME.getBytes());
        scan.addColumn(ANNOTATION_CF.getBytes(),
                       ANNOTATION_SENTENCE.getBytes());
        scan.addColumn(ANNOTATION_CF.getBytes(), ANNOTATION_TOKEN.getBytes());
        scan.addFamily(TEXT_CF.getBytes());
        scan.addFamily(LABEL_CF.getBytes());
        scan.addFamily(META_CF.getBytes());
        if (!corpusName.equals(ALL_CORPORA))
            scan.setFilter(new SingleColumnValueFilter(
                        SOURCE_CF.getBytes(), SOURCE_NAME.getBytes(),
                        CompareOp.EQUAL, corpusName.getBytes()));
    }

    /**
     * {@inheritDoc}
     */
    public Iterator<Result> iterator(Scan scan) {
        try {
            return table.getScanner(scan).iterator();
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }

    /**
     * {@inheritDoc}
     */
    public String tableName() {
        return TABLE_NAME;
    }

    /**
     * {@inheritDoc}
     */
    public HTable table() {
        try {
            table = new HTable(HBaseConfiguration.create(), tableName());
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
        return table;
    }

    /**
     * {@inheritDoc}
     */
    public String text(Result row) {
        return SchemaUtil.getColumn(row, TEXT_CF, TEXT_RAW);
    }

    /**
     * {@inheritDoc}
     */
    public String textSource(Result row) {
        return SchemaUtil.getColumn(row, TEXT_CF, TEXT_ORIGINAL);
    }

    /**
     * {@inheritDoc}
     */
    public String sourceCorpus(Result row) {
        return SchemaUtil.getColumn(row, SOURCE_CF, SOURCE_NAME);
    }

    /**
     * {@inheritDoc}
     */
    public String title(Result row) {
        return SchemaUtil.getColumn(row, TEXT_CF, TEXT_TITLE);
    }

    /**
     * {@inheritDoc}
     */
    public List<Sentence> sentences(Result row) {
        // First read in and parse the meta data for the sentences.
        String sentenceText = SchemaUtil.getColumn(
                row, ANNOTATION_CF, ANNOTATION_SENTENCE);

        // Next read in and parse the meta data for each token.
        String tokenText = SchemaUtil.getColumn(
                row, ANNOTATION_CF, ANNOTATION_TOKEN);

        return Sentence.readSentences(sentenceText, tokenText);
    }

    /**
     * {@inheritDoc}
     */
    public List<Sentence> wordSenses(Result row, String senseLabel) {
        // First read in and parse the meta data for the sentences.
        String sentenceText = SchemaUtil.getColumn(
                row, ANNOTATION_CF, SENSE_SENTENCE_PREFIX + senseLabel);

        // Next read in and parse the meta data for each token.
        String tokenText = SchemaUtil.getColumn(
                row, ANNOTATION_CF, SENSE_TOKEN_PREFIX + senseLabel);

        return Sentence.readSentences(sentenceText, tokenText);
    }

    /**
     * {@inheritDoc}
     */
    public Document document(Result row) {
        return new DynamicDocument(row,
                                   SOURCE_CF + ":" + SOURCE_NAME,
                                   TEXT_CF + ":" + TEXT_RAW,
                                   TEXT_CF + ":" + TEXT_ORIGINAL,
                                   META_CF + ":" + DOC_KEY,
                                   META_CF + ":" + DOC_ID,
                                   TEXT_CF + ":" + TEXT_TITLE,
                                   META_CF + ":" + CATEGORY_COLUMN);
    }

    /**
     * {@inheritDoc}
     */
    public void put(Document document) {
        if (document == null)
            return;

        Put put = new Put(DigestUtils.shaHex(document.key()).getBytes());

        // Temporarily blank everything else out so that we can add in the
        // keys and ids to every document that's already been added.
        SchemaUtil.add(put, SOURCE_CF, SOURCE_NAME, document.sourceCorpus());
        SchemaUtil.add(put, TEXT_CF, TEXT_ORIGINAL, document.originalText());
        SchemaUtil.add(put, TEXT_CF, TEXT_RAW, document.rawText());
        SchemaUtil.add(put, TEXT_CF, TEXT_TITLE, document.title());
        SchemaUtil.add(put, TEXT_CF, TEXT_TYPE, XML_MIME_TYPE);
        SchemaUtil.add(put, META_CF, DOC_KEY, document.key());
        SchemaUtil.add(put, META_CF, DOC_ID, Long.toString(document.id()));
        SchemaUtil.add(put, META_CF, CATEGORY_COLUMN, document.categories());
        put(put);
    }

    /**
     * {@inheritDoc}
     */
    public void put(ImmutableBytesWritable key, List<Sentence> sentences) {
        if (sentences == null)
            return;

        StringPair annots = Sentence.writeSentences(sentences);
        Put put = new Put(key.get());
        SchemaUtil.add(put, ANNOTATION_CF, ANNOTATION_SENTENCE, annots.x);
        SchemaUtil.add(put, ANNOTATION_CF, ANNOTATION_TOKEN, annots.y);
        put(put);
    }

    /**
     * {@inheritDoc}
     */
    public void putSenses(ImmutableBytesWritable key,
                          List<Sentence> sentences,
                          String senseLabel) {
        if (sentences == null)
            return;

        StringPair annots = Sentence.writeSentences(sentences);
        Put put = new Put(key.get());
        SchemaUtil.add(put, ANNOTATION_CF,
                       SENSE_SENTENCE_PREFIX + senseLabel, annots.x);
        SchemaUtil.add(put, ANNOTATION_CF,
                       SENSE_TOKEN_PREFIX + senseLabel, annots.y);
        put(put);
    }

    /**
     * Stores the {@code put} into the {@link HTable}. This helper method just
     * encapsulates the error handling code.
     */
    private void put(Put put) {
        try {
            table.put(put);
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }

    /**
     * {@inheritDoc}
     */
    public void putCategories(ImmutableBytesWritable key,
                              Set<String> categories) {
        Put put = new Put(key.get());
        SchemaUtil.add(put, META_CF, CATEGORY_COLUMN, categories);
        put(put);
    }

    /**
     * {@inheritDoc}
     */
    public Set<String> getCategories(Result row) {
        return SchemaUtil.getObjectColumn(
                row, META_CF, CATEGORY_COLUMN,
                new TypeToken<Set<String>>() {}.getType());
    }

    /**
     * {@inheritDoc}
     */
    public void putLabel(ImmutableBytesWritable key,
                         String labelName,
                         String labelValue) {
        Put put = new Put(key.get());
        SchemaUtil.add(put, LABEL_CF, labelName, labelValue);
        put(put);
    }

    /**
     * {@inheritDoc}
     */
    public String getLabel(Result row, String labelName) {
        return SchemaUtil.getColumn(row, LABEL_CF, labelName);
    }

    /**
     * {@inheritDoc}
     */
    public boolean shouldProcessRow(Result row) {
        return true;
    }

    /**
     * {@inheritDoc}
     */
    public void markRowAsProcessed(ImmutableBytesWritable key, Result row) {
    }

    /**
     * {@inheritDoc}
     */
    public void close() {
        try {
            table.flushCommits();
            table.close();
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }
}
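Putting the class to work, here is a minimal sketch of the lifecycle a caller would follow: create the schema, open the connection, write a document, then scan and read rows back. It assumes the trinidad_table schema above and some concrete Document implementation; the loadDocument() helper and the "pubmed" corpus name are hypothetical placeholders, not part of the C-Cat API. Note that the constructor leaves the internal HTable null, so table() must be called before put(), iterator(), or close().

import gov.llnl.ontology.mapreduce.table.TrinidadTable;
import gov.llnl.ontology.text.Document;
import gov.llnl.ontology.text.Sentence;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;

import java.util.Iterator;
import java.util.List;

public class TrinidadTableDemo {

    public static void main(String[] args) {
        TrinidadTable table = new TrinidadTable();

        // Create the HBase table; this is a no-op if it already exists.
        table.createTable();

        // Open the connection. The constructor leaves the internal HTable
        // null, so this must happen before any put or scan.
        table.table();

        // Store one document. A real Document comes from a corpus reader
        // elsewhere in C-Cat; loadDocument() is a hypothetical stand-in.
        Document doc = loadDocument();
        table.put(doc);

        // Scan a single corpus by name ("pubmed" is a made-up example).
        // Passing TrinidadTable.ALL_CORPORA (the empty string) instead
        // would skip the corpus-name filter and return every row.
        Scan scan = new Scan();
        table.setupScan(scan, "pubmed");

        Iterator<Result> rows = table.iterator(scan);
        while (rows.hasNext()) {
            Result row = rows.next();
            String rawText = table.text(row);
            List<Sentence> sentences = table.sentences(row);
            // ... process rawText and sentences ...
        }

        // Flush any buffered Puts and release the connection.
        table.close();
    }

    /** Hypothetical placeholder; a real pipeline supplies the Document. */
    private static Document loadDocument() {
        throw new UnsupportedOperationException(
                "Supply a concrete Document implementation here.");
    }
}

One design point worth noting from the source: put(Document) keys each row by the SHA-1 hex digest of document.key() (DigestUtils.shaHex), so writing a document whose key already exists updates that row in place rather than creating a duplicate.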