com.msd.gin.halyard.tools.HalyardHiveLoad.java Source code

Introduction

Here is the source code for com.msd.gin.halyard.tools.HalyardHiveLoad.java, a MapReduce tool that bulk-loads RDF data parsed from a given Hive table and column into an HBase table.

Source

/*
 * Copyright 2016 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co.,
 * Inc., Kenilworth, NJ, USA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.msd.gin.halyard.tools;

import com.msd.gin.halyard.common.HalyardTableUtils;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler;
import org.eclipse.rdf4j.rio.ntriples.NTriplesUtil;

/**
 * MapReduce tool that bulk-loads RDF data, parsed in any standard RDF serialization from a given Hive table and column, into HBase
 * @author Adam Sotona (MSD)
 */
public class HalyardHiveLoad implements Tool {

    private static final String HIVE_DATA_COLUMN_INDEX_PROPERTY = "halyard.hive.data.column.index";

    /**
     * Property holding the base URI used when parsing the RDF data
     */
    public static final String BASE_URI_PROPERTY = "halyard.base.uri";
    private static final String RDF_MIME_TYPE_PROPERTY = "halyard.rdf.mime.type";
    private static final Logger LOG = Logger.getLogger(HalyardHiveLoad.class.getName());

    private Configuration conf;

    /**
     * HiveMapper reads data from the specified Hive table and column and produces Halyard KeyValue pairs for the HBase reducers
     */
    public static class HiveMapper
            extends Mapper<WritableComparable<Object>, HCatRecord, ImmutableBytesWritable, KeyValue> {

        private IRI defaultRdfContext;
        private boolean overrideRdfContext;
        private int dataColumnIndex;
        private RDFFormat rdfFormat;
        private String baseUri;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
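            // Read the mapper settings from the job configuration: context override flag,
            // default context IRI, data column index, RDF serialization format and base URI.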
            Configuration conf = context.getConfiguration();
            overrideRdfContext = conf.getBoolean(HalyardBulkLoad.OVERRIDE_CONTEXT_PROPERTY, false);
            String defCtx = conf.get(HalyardBulkLoad.DEFAULT_CONTEXT_PROPERTY);
            defaultRdfContext = defCtx == null ? null : SimpleValueFactory.getInstance().createIRI(defCtx);
            dataColumnIndex = conf.getInt(HIVE_DATA_COLUMN_INDEX_PROPERTY, 0);
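            // Resolve the parser format from the configured MIME type; Optional.get() fails
            // fast if the MIME type is missing or not recognized by Rio.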
            rdfFormat = Rio.getParserFormatForMIMEType(conf.get(RDF_MIME_TYPE_PROPERTY)).get();
            baseUri = conf.get(BASE_URI_PROPERTY);
        }

        @Override
        protected void map(WritableComparable<Object> key, HCatRecord value, final Context context)
                throws IOException, InterruptedException {
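            // The configured column of the HCatRecord is expected to hold one RDF document
            // serialized in the configured format.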
            String text = (String) value.get(dataColumnIndex);
            RDFParser parser = Rio.createParser(rdfFormat);
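            // Convert every parsed statement into HBase KeyValues and emit each one keyed by
            // its row bytes, ready for the HFile-producing reducers.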
            parser.setRDFHandler(new AbstractRDFHandler() {
                @Override
                public void handleStatement(Statement st) throws RDFHandlerException {
                    Resource rdfContext;
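                    // Fall back to the configured default context when overriding is enabled
                    // or the statement carries no context of its own.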
                    if (overrideRdfContext || (rdfContext = st.getContext()) == null) {
                        rdfContext = defaultRdfContext;
                    }
                    for (KeyValue keyValue : HalyardTableUtils.toKeyValues(st.getSubject(), st.getPredicate(),
                            st.getObject(), rdfContext))
                        try {
                            context.write(new ImmutableBytesWritable(keyValue.getRowArray(),
                                    keyValue.getRowOffset(), keyValue.getRowLength()), keyValue);
                        } catch (IOException | InterruptedException e) {
                            throw new RDFHandlerException(e);
                        }
                }
            });
            try {
                parser.parse(new StringReader(text), baseUri);
            } catch (RDFParseException | RDFHandlerException e) {
                throw new IOException(e);
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 3) {
            System.err.println("Usage: hiveload -D" + RDF_MIME_TYPE_PROPERTY + "='application/ld+json' [-D"
                    + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + HIVE_DATA_COLUMN_INDEX_PROPERTY + "=3] [-D"
                    + BASE_URI_PROPERTY + "='http://my_base_uri/'] [-D" + HalyardBulkLoad.SPLIT_BITS_PROPERTY
                    + "=8] [-D" + HalyardBulkLoad.DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D"
                    + HalyardBulkLoad.OVERRIDE_CONTEXT_PROPERTY
                    + "=true] <hive_table_name> <output_path> <hbase_table_name>");
            return -1;
        }
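        // Ship the RDF4J Rio parser classes with the job so the mappers can parse RDF on the cluster.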
        TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
                RDFFormat.class, RDFParser.class);
        HBaseConfiguration.addHbaseResources(getConf());
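        // Compress map output with Snappy when the native codec is available.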
        if (SnappyCodec.isNativeCodeLoaded()) {
            getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
            getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
        }
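        // Tune the job for large bulk loads: start reducers only after all maps finish,
        // raise the task timeout, enlarge the sort buffers, cap the input split size
        // and allow more HFiles per region and family during the final bulk load.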
        getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
        getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000L);
        getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
        getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
        getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
        getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
        Job job = Job.getInstance(getConf(), "HalyardHiveLoad -> " + args[1] + " -> " + args[2]);
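        // args[0] names the Hive table, optionally qualified as <database>.<table>;
        // split it into database and table for HCatInputFormat.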
        int i = args[0].indexOf('.');
        HCatInputFormat.setInput(job, i > 0 ? args[0].substring(0, i) : null, args[0].substring(i + 1));
        job.setJarByClass(HalyardHiveLoad.class);
        job.setMapperClass(HiveMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        job.setInputFormatClass(HCatInputFormat.class);
        job.setSpeculativeExecution(false);
        job.setReduceSpeculativeExecution(false);
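        // Collect optional per-context split hints: the numeric suffix of each matching
        // property gives the number of splits, its value lists the affected context IRIs.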
        Map<String, Integer> contextSplitsMap = new HashMap<>();
        for (Map.Entry<String, String> me : getConf().getValByRegex(HalyardBulkLoad.CONTEXT_SPLIT_REGEXP)
                .entrySet()) {
            int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
            StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
            while (stk.hasMoreTokens()) {
                contextSplitsMap.put(stk.nextToken(), splits);
            }
        }
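        // Set up the target HBase table with the configured split bits, configure the job for
        // incremental HFile output, and bulk-load the generated HFiles once the job succeeds.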
        try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
                getConf().getInt(HalyardBulkLoad.SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
            HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                    hTable.getRegionLocator());
            FileInputFormat.setInputDirRecursive(job, true);
            FileInputFormat.setInputPaths(job, args[0]);
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            TableMapReduceUtil.addDependencyJars(job);
            TableMapReduceUtil.initCredentials(job);
            if (job.waitForCompletion(true)) {
                new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
                LOG.info("Bulk Load completed.");
                return 0;
            }
        }
        return -1;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(final Configuration c) {
        this.conf = c;
    }

    /**
     * Main entry point of the HalyardHiveLoad tool
     * @param args String command line arguments
     * @throws Exception in case of any problem
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new HalyardHiveLoad(), args));
    }
}
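
Usage example

The following snippet is a minimal, hypothetical sketch of how the tool could be driven programmatically through ToolRunner, mirroring what main() above does. The table names, output path and configuration values are illustrative placeholders derived from the usage message printed by run(); they are not part of the original source. In practice the tool is normally launched from the command line, passing the same properties as -D options.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import com.msd.gin.halyard.tools.HalyardHiveLoad;

public class HalyardHiveLoadExample {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Property names match the constants declared in HalyardHiveLoad; the values are placeholders.
        conf.set("halyard.rdf.mime.type", "application/ld+json");  // RDF_MIME_TYPE_PROPERTY
        conf.setInt("halyard.hive.data.column.index", 3);          // HIVE_DATA_COLUMN_INDEX_PROPERTY
        conf.set("halyard.base.uri", "http://my_base_uri/");       // BASE_URI_PROPERTY
        int exitCode = ToolRunner.run(conf, new HalyardHiveLoad(), new String[]{
                "my_database.my_rdf_table", // <hive_table_name>
                "/tmp/halyard_hfiles",      // <output_path> for the generated HFiles
                "my_hbase_table"            // <hbase_table_name>
        });
        System.exit(exitCode);
    }
}

The optional HalyardBulkLoad properties shown in the usage message (split bits, default context and context override) can be set on the same Configuration before the run.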