mvm.rya.accumulo.mr.RyaOutputFormat.java Source code

Java tutorial

Introduction

Here is the source code for mvm.rya.accumulo.mr.RyaOutputFormat.java

Source

package mvm.rya.accumulo.mr;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.Closeable;
import java.io.Flushable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import mvm.rya.accumulo.AccumuloRdfConfiguration;
import mvm.rya.accumulo.AccumuloRyaDAO;
import mvm.rya.accumulo.mr.utils.MRUtils;
import mvm.rya.api.domain.RyaStatement;
import mvm.rya.api.persist.RyaDAOException;
import mvm.rya.api.resolver.RdfToRyaConversions;
import mvm.rya.indexing.FreeTextIndexer;
import mvm.rya.indexing.GeoIndexer;
import mvm.rya.indexing.TemporalIndexer;
import mvm.rya.indexing.accumulo.ConfigUtils;
import mvm.rya.indexing.accumulo.StatementSerializer;
import mvm.rya.indexing.accumulo.freetext.AccumuloFreeTextIndexer;
import mvm.rya.indexing.accumulo.geo.GeoMesaGeoIndexer;
import mvm.rya.indexing.accumulo.temporal.AccumuloTemporalIndexer;

import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.TableExistsException;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.data.Mutation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.log4j.Logger;
import org.geotools.feature.SchemaException;
import org.openrdf.model.Statement;

/**
 * Hadoop Map/Reduce {@link OutputFormat} that uses Rya, the {@link GeoIndexer}, the {@link FreeTextIndexer}, and the
 * {@link TemporalIndexer} as the sink for {@link Statement} data wrapped in {@link StatementWritable} objects. This
 * {@link OutputFormat} ignores the keys and only writes the values.
 *
 * The user must specify connection parameters for Rya, {@link GeoIndexer}, {@link FreeTextIndexer}, and
 * {@link TemporalIndexer} in the job {@link Configuration}. Each secondary indexer is enabled unless its
 * "enable" property (see the {@code ENABLE_*} keys below) is set to {@code false}, in which case a no-op
 * "Null" indexer is used instead.
 */
public class RyaOutputFormat extends OutputFormat<Writable, StatementWritable> {
    private static final Logger logger = Logger.getLogger(RyaOutputFormat.class);

    // Configuration keys; all are prefixed with this class's simple name.
    private static final String PREFIX = RyaOutputFormat.class.getSimpleName();
    private static final String MAX_MUTATION_BUFFER_SIZE = PREFIX + ".maxmemory";
    private static final String ENABLE_FREETEXT = PREFIX + ".freetext.enable";
    private static final String ENABLE_GEO = PREFIX + ".geo.enable";
    private static final String ENABLE_TEMPORAL = PREFIX + ".temporal.enable";

    /**
     * Verifies that every sink can be constructed from the job configuration.
     * <p>
     * The sinks created for this check are released again before returning so that the
     * validation does not leave resources open.
     *
     * @param jobContext the job whose configuration is validated
     * @throws IOException if any indexer or the Rya DAO cannot be created
     */
    @Override
    public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException {
        Configuration conf = jobContext.getConfiguration();

        GeoIndexer geo = null;
        FreeTextIndexer freeText = null;
        TemporalIndexer temporal = null;
        AccumuloRyaDAO rya = null;
        try {
            // make sure that all of the indexers can connect
            geo = getGeoIndexer(conf);
            freeText = getFreeTextIndexer(conf);
            temporal = getTemporalIndexer(conf);
            rya = getRyaIndexer(conf);
        } finally {
            // the instances were only needed for the check; do not leak them
            closeAll(logger, geo, freeText, temporal, rya);
        }
    }

    /**
     * Returns the committer of a {@link NullOutputFormat}; output is written directly by the
     * {@link RyaRecordWriter}, not through files managed by a committer.
     */
    @Override
    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
        // copied from AccumuloOutputFormat
        return new NullOutputFormat<Text, Mutation>().getOutputCommitter(context);
    }

    /**
     * Creates a {@link RyaRecordWriter} configured from the task's {@link Configuration}.
     */
    @Override
    public RecordWriter<Writable, StatementWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new RyaRecordWriter(context);
    }

    /**
     * Returns a {@link GeoMesaGeoIndexer} configured with {@code conf}, or a {@link NullGeoIndexer}
     * when geo indexing is disabled (it is enabled by default).
     */
    private static GeoIndexer getGeoIndexer(Configuration conf) throws IOException {
        if (!conf.getBoolean(ENABLE_GEO, true)) {
            return new NullGeoIndexer();
        }

        GeoMesaGeoIndexer geo = new GeoMesaGeoIndexer();
        geo.setConf(conf);
        return geo;
    }

    /**
     * Returns an {@link AccumuloFreeTextIndexer} configured with {@code conf}, or a
     * {@link NullFreeTextIndexer} when free-text indexing is disabled (it is enabled by default).
     */
    private static FreeTextIndexer getFreeTextIndexer(Configuration conf) throws IOException {
        if (!conf.getBoolean(ENABLE_FREETEXT, true)) {
            return new NullFreeTextIndexer();
        }

        AccumuloFreeTextIndexer freeText = new AccumuloFreeTextIndexer();
        freeText.setConf(conf);
        return freeText;
    }

    /**
     * Returns an {@link AccumuloTemporalIndexer} configured with {@code conf}, or a
     * {@link NullTemporalIndexer} when temporal indexing is disabled (it is enabled by default).
     */
    private static TemporalIndexer getTemporalIndexer(Configuration conf) throws IOException {
        if (!conf.getBoolean(ENABLE_TEMPORAL, true)) {
            return new NullTemporalIndexer();
        }

        AccumuloTemporalIndexer temporal = new AccumuloTemporalIndexer();
        temporal.setConf(conf);
        return temporal;
    }

    /**
     * Creates and initializes an {@link AccumuloRyaDAO} using the connector obtained from
     * {@code conf} and, if present, the table prefix stored under
     * {@link MRUtils#TABLE_PREFIX_PROPERTY}.
     *
     * @throws IOException wrapping any Accumulo or Rya failure raised while setting up the DAO
     */
    private static AccumuloRyaDAO getRyaIndexer(Configuration conf) throws IOException {
        try {
            AccumuloRyaDAO ryaIndexer = new AccumuloRyaDAO();
            Connector conn = ConfigUtils.getConnector(conf);
            ryaIndexer.setConnector(conn);

            AccumuloRdfConfiguration ryaConf = new AccumuloRdfConfiguration();

            String tablePrefix = conf.get(MRUtils.TABLE_PREFIX_PROPERTY, null);
            if (tablePrefix != null) {
                ryaConf.setTablePrefix(tablePrefix);
            }
            ryaConf.setDisplayQueryPlan(false);
            ryaIndexer.setConf(ryaConf);
            ryaIndexer.init();
            return ryaIndexer;
        } catch (AccumuloException e) {
            logger.error("Cannot create RyaIndexer", e);
            throw new IOException(e);
        } catch (AccumuloSecurityException e) {
            logger.error("Cannot create RyaIndexer", e);
            throw new IOException(e);
        } catch (RyaDAOException e) {
            logger.error("Cannot create RyaIndexer", e);
            throw new IOException(e);
        }
    }

    /**
     * Best-effort release of all four sinks. Each argument may be {@code null}; a failure while
     * releasing one sink is logged to {@code log} and does not prevent the others from being
     * released.
     */
    private static void closeAll(Logger log, GeoIndexer geo, FreeTextIndexer freeText, TemporalIndexer temporal,
            AccumuloRyaDAO rya) {
        try {
            if (geo != null) {
                geo.close();
            }
        } catch (IOException e) {
            log.error("Error closing the geoIndexer on RyaOutputFormat Close", e);
        }
        try {
            if (freeText != null) {
                freeText.close();
            }
        } catch (IOException e) {
            log.error("Error closing the freetextIndexer on RyaOutputFormat Close", e);
        }
        try {
            if (temporal != null) {
                temporal.close();
            }
        } catch (IOException e) {
            log.error("Error closing the temporalIndexer on RyaOutputFormat Close", e);
        }
        try {
            if (rya != null) {
                rya.destroy();
            }
        } catch (RyaDAOException e) {
            log.error("Error closing RyaDAO on RyaOutputFormat Close", e);
        }
    }

    /**
     * {@link RecordWriter} that buffers incoming statements and, once the estimated size of the
     * buffer reaches the configured limit, writes the whole buffer to the geo, free-text and
     * temporal indexers and to Rya.
     */
    public static class RyaRecordWriter extends RecordWriter<Writable, StatementWritable>
            implements Closeable, Flushable {
        private static final Logger logger = Logger.getLogger(RyaRecordWriter.class);

        private static final long ONE_MEGABYTE = 1024L * 1024L;
        // rough per-statement size used only to pre-size the buffer list
        private static final long AVE_STATEMENT_SIZE = 100L;
        // upper bound for the pre-sized buffer; the list still grows on demand beyond this
        private static final int MAX_INITIAL_BUFFER_CAPACITY = 1 << 20;

        private FreeTextIndexer freeTextIndexer;
        private GeoIndexer geoIndexer;
        private TemporalIndexer temporalIndexer;
        private AccumuloRyaDAO ryaIndexer;

        private final long bufferSizeLimit;
        private long bufferCurrentSize = 0;

        private final ArrayList<RyaStatement> buffer;

        // fields for storing metrics
        private long startTime = 0;
        private long lastCommitFinishTime = 0;
        private long totalCommitRecords = 0;

        private double totalReadDuration = 0;
        private double totalWriteDuration = 0;

        private long commitCount = 0;

        /**
         * Creates a writer configured from the task's {@link Configuration}.
         */
        public RyaRecordWriter(TaskAttemptContext context) throws IOException {
            this(context.getConfiguration());
        }

        /**
         * Creates a writer. The buffer limit (in the units reported by the statement serializer's
         * string length) is read from the {@code RyaOutputFormat.maxmemory} property and defaults
         * to one megabyte.
         *
         * @throws IOException if any sink cannot be created; sinks that were already created are
         *         released before the exception propagates
         */
        public RyaRecordWriter(Configuration conf) throws IOException {
            // set up the buffer
            bufferSizeLimit = conf.getLong(MAX_MUTATION_BUFFER_SIZE, ONE_MEGABYTE);
            buffer = new ArrayList<RyaStatement>(initialCapacityFor(bufferSizeLimit));

            // set up the indexers
            boolean ready = false;
            try {
                freeTextIndexer = getFreeTextIndexer(conf);
                geoIndexer = getGeoIndexer(conf);
                temporalIndexer = getTemporalIndexer(conf);
                ryaIndexer = getRyaIndexer(conf);
                ready = true;
            } finally {
                if (!ready) {
                    // construction failed part-way: release whatever was already created
                    closeAll(logger, geoIndexer, freeTextIndexer, temporalIndexer, ryaIndexer);
                }
            }

            // update fields used for metrics
            startTime = System.currentTimeMillis();
            lastCommitFinishTime = startTime;
        }

        /**
         * Derives a safe initial list capacity from the configured size limit. Non-positive limits
         * yield 0 and very large limits are capped, so the {@code int} conversion can neither
         * overflow nor request an enormous up-front allocation.
         */
        private static int initialCapacityFor(long sizeLimit) {
            long estimatedStatements = sizeLimit / AVE_STATEMENT_SIZE;
            if (estimatedStatements <= 0) {
                return 0;
            }
            return (int) Math.min(estimatedStatements, MAX_INITIAL_BUFFER_CAPACITY);
        }

        /**
         * Writes all buffered statements to every sink. Does nothing when the buffer is empty.
         */
        @Override
        public void flush() throws IOException {
            flushBuffer();
        }

        /**
         * Equivalent to {@link #close(TaskAttemptContext)} with a {@code null} context.
         */
        @Override
        public void close() throws IOException {
            close(null);
        }

        /**
         * Flushes any buffered statements and releases all sinks. Every sink is released even if
         * the flush or an earlier release fails; release failures are only logged.
         *
         * @param paramTaskAttemptContext unused; may be {@code null}
         * @throws IOException if the final flush failed, so that lost statements are not reported
         *         as a successful close
         */
        @Override
        public void close(TaskAttemptContext paramTaskAttemptContext) throws IOException {
            IOException flushFailure = null;
            try {
                flush();
            } catch (IOException e) {
                logger.error("Error flushing the buffer on RyaOutputFormat Close", e);
                flushFailure = e;
            }

            closeAll(logger, geoIndexer, freeTextIndexer, temporalIndexer, ryaIndexer);

            if (flushFailure != null) {
                // buffered statements were not written; surface that to the caller
                throw flushFailure;
            }
        }

        /**
         * Convenience overload that wraps {@code statement} in a {@link StatementWritable} and
         * writes it with a {@code null} key.
         */
        public void write(Statement statement) throws IOException, InterruptedException {
            write(null, new StatementWritable(statement));
        }

        /**
         * Buffers {@code value} and flushes the buffer once its estimated size reaches the limit.
         *
         * @param key ignored
         * @param value the statement to store
         */
        @Override
        public void write(Writable key, StatementWritable value) throws IOException, InterruptedException {
            buffer.add(RdfToRyaConversions.convertStatement(value));

            // the serialized string length serves as the size estimate for the statement
            bufferCurrentSize += StatementSerializer.writeStatement(value).length();

            if (bufferCurrentSize >= bufferSizeLimit) {
                flushBuffer();
            }
        }

        /**
         * Returns {@code count / seconds}, or 0 when no measurable time has elapsed, so that log
         * output never contains Infinity or NaN.
         */
        private static double rate(long count, double seconds) {
            return seconds > 0 ? count / seconds : 0.0;
        }

        /**
         * Writes the buffered statements to the geo, free-text and temporal indexers and to Rya,
         * logs throughput metrics, and clears the buffer. If a sink throws, the buffer is left
         * untouched and the exception propagates.
         */
        private void flushBuffer() throws IOException {
            if (buffer.isEmpty()) {
                // nothing to write; avoids pointless sink round trips and meaningless metrics
                return;
            }

            final int recordCount = buffer.size();
            totalCommitRecords += recordCount;
            commitCount++;

            long startCommitTime = System.currentTimeMillis();

            logger.info(String.format("(C-%d) Flushing buffer with %,d objects and %,d bytes", commitCount,
                    recordCount, bufferCurrentSize));

            // "reading" is the time spent filling the buffer since the previous commit finished
            double readingDuration = (startCommitTime - lastCommitFinishTime) / 1000.;
            totalReadDuration += readingDuration;
            double currentReadRate = rate(recordCount, readingDuration);
            double totalReadRate = rate(totalCommitRecords, totalReadDuration);

            // Print "reading" metrics
            logger.info(String.format("(C-%d) (Reading) Duration, Current Rate, Total Rate: %.2f %.2f %.2f ",
                    commitCount, readingDuration, currentReadRate, totalReadRate));

            // write to geo
            geoIndexer.storeStatements(buffer);
            geoIndexer.flush();

            // write to free text
            freeTextIndexer.storeStatements(buffer);
            freeTextIndexer.flush();

            // write to temporal
            temporalIndexer.storeStatements(buffer);
            temporalIndexer.flush();

            // write to rya
            try {
                ryaIndexer.add(buffer.iterator());
            } catch (RyaDAOException e) {
                logger.error("Cannot writing statement to Rya", e);
                throw new IOException(e);
            }

            lastCommitFinishTime = System.currentTimeMillis();

            double writingDuration = (lastCommitFinishTime - startCommitTime) / 1000.;
            totalWriteDuration += writingDuration;
            double currentWriteRate = rate(recordCount, writingDuration);
            double totalWriteRate = rate(totalCommitRecords, totalWriteDuration);

            // Print "writing" stats
            logger.info(String.format("(C-%d) (Writing) Duration, Current Rate, Total Rate: %.2f %.2f %.2f ",
                    commitCount, writingDuration, currentWriteRate, totalWriteRate));

            double processDuration = writingDuration + readingDuration;
            double totalProcessDuration = totalWriteDuration + totalReadDuration;
            double currentProcessRate = rate(recordCount, processDuration);
            double totalProcessRate = rate(totalCommitRecords, totalProcessDuration);

            // Print "total" stats
            logger.info(String.format("(C-%d) (Total) Duration, Current Rate, Total Rate: %.2f %.2f %.2f ",
                    commitCount, processDuration, currentProcessRate, totalProcessRate));

            // clear the buffer
            buffer.clear();
            bufferCurrentSize = 0L;
        }
    }
}