reconcile.hbase.mapreduce.ChainableAnnotationJob.java Source code

Java tutorial

Introduction

Here is the source code for reconcile.hbase.mapreduce.ChainableAnnotationJob.java

Source

/*
 * Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National
 * Laboratory. Written by Teresa Cottom, cottom1@llnl.gov CODE-400187 All rights reserved. This file is part of
 * RECONCILE
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
 * License (as published by the Free Software Foundation) version 2, dated June 1991. This program is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA For full text see license.txt
 */
package reconcile.hbase.mapreduce;

import static reconcile.hbase.mapreduce.annotation.AnnotationUtils.getAnnotationStr;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.NavigableMap;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;

import reconcile.data.AnnotationSet;
import reconcile.hbase.table.DocSchema;

public abstract class ChainableAnnotationJob extends Configured implements Tool {

    /**
     * Mapper which operates on a Result row in HBase and adds annotations to HBase, and Result
     * for further downstream processing.
     *
     * @author cottom1
     *
     */
    public static abstract class AnnotateMapper extends DocMapper<Put> {
        /*
         * Simply overridden to make map public
         */
        @Override
        public void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            super.map(key, value, context);
        }

        protected void addToResult(Result value, String colFamily, String colQual, byte[] data) {
            // Add entry to result for downstream processing
            NavigableMap<byte[], NavigableMap<byte[], byte[]>> values = value.getNoVersionMap();
            NavigableMap<byte[], byte[]> familyMap = values.get(colFamily.getBytes());
            if (familyMap == null) {
                familyMap = new TreeMap<byte[], byte[]>();
                values.put(colFamily.getBytes(), familyMap);
            }
            familyMap.put(colQual.getBytes(), data);
        }

        protected boolean addField(Result value, Put put, String col, String qual, String data, Counter counter) {
            if (DocSchema.add(put, col, qual, data, counter)) {
                addToResult(value, col, qual, data.getBytes());
                return true;
            }
            return false;
        }

        protected void addField(Result value, Put put, String colFamily, String colQual, byte[] data) {
            // Add entry to HBase
            DocSchema.add(put, colFamily, colQual, data);
            addToResult(value, colFamily, colQual, data);
        }

        protected void addAnnotation(Result value, Put put, AnnotationSet set, String name) {
            String data = getAnnotationStr(set);
            addField(value, put, DocSchema.annotationsCF, name, data.getBytes());
        }
    }

    /**
     * Initialize the M/R job and HBase scan based on command-line arguments. Common items to set in this method:
     * <ul>
     * <li>scan (start, stop, filter)
     * <li>parameters that the child map jobs might need
     * <li>scanner caching (default is pushed to 1 to compute-bound tasks; I/O bound tasks could go to 10000)
     * </ul>
     * 
     * @param args
     * @param job
     * @param scan
     */
    public abstract void init(JobConfig jobConfig, Job job, Scan scan);

    /**
     * Any post M/R job work
     */
    public void finish() {
        LOG.info("not overridden. No post M/R tasks to complete.");
    }

    /**
     * Get the class to run as the Mapper
     * @return
     */
    public abstract Class<? extends AnnotateMapper> getMapperClass();

    public static final Log LOG = LogFactory.getLog(ChainableAnnotationJob.class);

    private Configuration conf;

    @Override
    public int run(String[] args) throws Exception {
        conf = HBaseConfiguration.create();
        // important to switch spec exec off.
        // We don't want to have something duplicated for perfomance reasons.
        conf.set("mapred.map.tasks.speculative.execution", "false");

        // since our parse takes so long, we don't want to cache rows -- the scanner might time out
        conf.set("hbase.client.scanner.caching", "1");

        JobConfig jobConfig = new JobConfig(args);

        Scan scan = new Scan();

        int status = 0;
        try {

            LOG.info("Before map/reduce startup");

            Job job = new Job(conf, getClass().getSimpleName());
            job.setJarByClass(this.getClass());

            init(jobConfig, job, scan);

            jobConfig.initTableMapperNoReducer(LOG, job, scan, getMapperClass());

            LOG.info("Started ");
            job.waitForCompletion(true);
            if (!job.isSuccessful())
                status = 1;
            LOG.info("After map/reduce completion");

            finish();
        } catch (Exception e) {
            e.printStackTrace();
            status = 1;
        }

        LOG.info("Return run status(0=success,1=failure)(" + status + ")");
        return status;
    }

}