com.github.seqware.queryengine.plugins.hbasemr.MRHBasePluginRunner.java Source code

Introduction

Here is the source code for com.github.seqware.queryengine.plugins.hbasemr.MRHBasePluginRunner.java
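
This class wires a MapReducePlugin into an HBase TableMapper/TableReducer job. The constructor builds a fresh output FeatureSet, Base64-serializes the plug-in, its parameters, and both feature sets into the job configuration, restricts the Scan to the input set's column, and submits the job without blocking. get() then waits for completion and, depending on the plug-in's ResultMechanism, returns a counter value, the latest Atom stored under the output set's SGID, or the updated output FeatureSet; FILE-based results are fetched with handleFileResult().

A minimal driver sketch (not part of this file) is shown below. It assumes FeatureSetCountPlugin has a public no-argument constructor and reports its result through the ROWS counter, so get() yields a Long; adjust the plug-in and type parameter for other plug-ins.

import com.github.seqware.queryengine.model.FeatureSet;
import com.github.seqware.queryengine.plugins.hbasemr.MRHBasePluginRunner;
import com.github.seqware.queryengine.plugins.plugins.FeatureSetCountPlugin;

public class FeatureCountDriver {

    /**
     * Hypothetical helper: run FeatureSetCountPlugin over an existing feature
     * set and block until the map/reduce job reports its row count.
     */
    public static Long countFeatures(FeatureSet inputSet) {
        // the runner's constructor configures and submits the job without blocking
        MRHBasePluginRunner<Long> runner =
                new MRHBasePluginRunner<Long>(new FeatureSetCountPlugin(), inputSet);
        // get() calls waitForCompletion and reads the ROWS counter
        return runner.get();
    }
}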

Source

/*
 * Copyright (C) 2012 SeqWare
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.github.seqware.queryengine.plugins.hbasemr;

import com.github.seqware.queryengine.Constants;
import com.github.seqware.queryengine.factory.CreateUpdateManager;
import com.github.seqware.queryengine.factory.SWQEFactory;
import com.github.seqware.queryengine.impl.HBaseStorage;
import com.github.seqware.queryengine.impl.SimplePersistentBackEnd;
import com.github.seqware.queryengine.model.Atom;
import com.github.seqware.queryengine.model.Feature;
import com.github.seqware.queryengine.model.FeatureSet;
import com.github.seqware.queryengine.model.impl.FeatureList;
import com.github.seqware.queryengine.model.impl.lazy.LazyFeatureSet;
import com.github.seqware.queryengine.plugins.MapReducePlugin;
import com.github.seqware.queryengine.plugins.MapperInterface;
import com.github.seqware.queryengine.plugins.PluginInterface;
import com.github.seqware.queryengine.plugins.PluginRunnerInterface;
import com.github.seqware.queryengine.plugins.ReducerInterface;
import com.github.seqware.queryengine.plugins.plugins.FeatureSetCountPlugin;
import com.github.seqware.queryengine.plugins.plugins.FeaturesByFilterPlugin;
import com.github.seqware.queryengine.plugins.plugins.VCFDumperPlugin;
import com.github.seqware.queryengine.util.SGID;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.SecureRandom;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.SerializationUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.QualifierFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.log4j.Logger;

/**
 * Map/reduce-based plug-in runner for an HBase back-end.
 *
 * @author dyuen
 * @version $Id: $Id
 */
public final class MRHBasePluginRunner<ReturnType> implements PluginRunnerInterface<ReturnType> {

    /**
     * Constant
     * <code>INT_PARAMETERS="int_parameters"</code>
     */
    public static final String INT_PARAMETERS = "int_parameters";
    /**
     * Constant
     * <code>EXT_PARAMETERS="ext_parameters"</code>
     */
    public static final String EXT_PARAMETERS = "ext_parameters";
    protected Job job;
    private MapReducePlugin mapReducePlugin;
    private FeatureSet outputSet;

    public MRHBasePluginRunner(MapReducePlugin mapReducePlugin, FeatureSet inputSet, Object... parameters) {
        this.mapReducePlugin = mapReducePlugin;
        try {
            CreateUpdateManager manager = SWQEFactory.getModelManager();
            //outputSet should attach to the original reference
            this.outputSet = manager.buildFeatureSet().setReferenceID(inputSet.getReferenceID()).build();
            manager.close();

            // do setup for Map/Reduce from the HBase API
            String tableName = generateTableName(inputSet);
            String destTableName = generateTableName(outputSet);

            Configuration conf = new Configuration();
            HBaseStorage.configureHBaseConfig(conf);
            HBaseConfiguration.addHbaseResources(conf);

            // we need to pass the parameters for a feature set; maybe we can take advantage of our serializers
            byte[] sSet = SWQEFactory.getSerialization().serialize(inputSet);
            byte[] dSet = SWQEFactory.getSerialization().serialize(outputSet);

            String[] str_params = serializeParametersToString(parameters, mapReducePlugin, sSet, dSet);

            File file = new File(new URI(Constants.Term.DEVELOPMENT_DEPENDENCY.getTermValue(String.class)));
            if (file.exists()) {
                conf.setStrings("tmpjars", Constants.Term.DEVELOPMENT_DEPENDENCY.getTermValue(String.class));
            }
            conf.setStrings(EXT_PARAMETERS, str_params);
            conf.set("mapreduce.map.java.opts", "-Xmx4096m  -verbose:gc");
            conf.set("mapreduce.reduce.java.opts", "-Xmx4096m  -verbose:gc");
            conf.set("mapreduce.map.ulimit", "4194304");
            conf.set("mapreduce.reduce.ulimit", "4194304");
            conf.set("mapreduce.map.memory.mb", "4096");
            conf.set("mapreduce.reduce.memory.mb", "4096");
            conf.set("mapreduce.map.memory.physical.mb", "4096");
            conf.set("mapreduce.reduce.memory.physical.mb", "4096");
            // the above settings all seem to be ignored by hboot
            // TODO: only this one works, but as far as I know we're using mapreduce, not mapred.
            // Strange
            conf.set("mapred.child.java.opts", "-Xmx2048m -verbose:gc");

            this.job = new Job(conf, mapReducePlugin.getClass().getSimpleName());

            Scan scan = new Scan();
            scan.setMaxVersions(); // we need all version data
            scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
            scan.setCacheBlocks(false); // don't set to true for MR jobs
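            // restrict the scan to the input FeatureSet's column: its SGID UUID is used as the
            // column qualifier, and the QualifierFilter below matches only that column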
            byte[] qualifierBytes = Bytes.toBytes(inputSet.getSGID().getUuid().toString());
            scan.addColumn(HBaseStorage.getTEST_FAMILY_INBYTES(), qualifierBytes);
            scan.setFilter(new QualifierFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(qualifierBytes)));

            // handle the part that changes from job to job
            // pluginInterface.performVariableInit(tableName, destTableName, scan);
            TableMapReduceUtil.initTableMapperJob(tableName, // input HBase table name
                    scan, // Scan instance to control CF and attribute selection
                    PluginRunnerMapper.class, // mapper
                    mapReducePlugin.getMapOutputKeyClass(), // mapper output key 
                    mapReducePlugin.getMapOutputValueClass(), // mapper output value
                    job);
            job.setOutputFormatClass(mapReducePlugin.getOutputClass()); // because we aren't emitting anything from mapper
            job.setReducerClass(MRHBasePluginRunner.PluginRunnerReducer.class); // reducer class
            job.setNumReduceTasks(mapReducePlugin.getNumReduceTasks());

            if (mapReducePlugin.getResultMechanism() == PluginInterface.ResultMechanism.FILE) {
                FileContext fileContext = FileContext.getFileContext(this.job.getConfiguration());
                Path path = new Path(
                        "/tmp/" + new BigInteger(20, new SecureRandom()).toString(32) + mapReducePlugin.toString());
                path = fileContext.makeQualified(path);
                TextOutputFormat.setOutputPath(job, path); // adjust directories as required
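                // handleFileResult(path) later copies part-r-00000 from this directory
                // to a local temporary file and deletes the HDFS output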
            }

            TableMapReduceUtil.addDependencyJars(job);
            job.setJarByClass(MRHBasePluginRunner.class);
            // submit the job, but do not block
            job.submit();
        } catch (URISyntaxException ex) {
            Logger.getLogger(MRHBasePluginRunner.class.getName()).fatal(null, ex);
        } catch (InterruptedException ex) {
            Logger.getLogger(MRHBasePluginRunner.class.getName()).fatal(null, ex);
        } catch (ClassNotFoundException ex) {
            Logger.getLogger(MRHBasePluginRunner.class.getName()).fatal(null, ex);
        } catch (IOException ex) {
            Logger.getLogger(MRHBasePluginRunner.class.getName()).fatal(null, ex);
        }
    }

    private static String generateTableName(FeatureSet sourceSet) {
        LazyFeatureSet lfSet = (LazyFeatureSet) sourceSet;
        String prefix = lfSet.getTablename();
        String tableName = HBaseStorage.TEST_TABLE_PREFIX + HBaseStorage.SEPARATOR + prefix;
        return tableName;
    }

    @Override
    public ReturnType get() {
        try {
            job.waitForCompletion(true);
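            // translate the finished job's output according to the plug-in's declared result mechanism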
            if (mapReducePlugin.getResultMechanism() == PluginInterface.ResultMechanism.COUNTER) {
                return (ReturnType) Long
                        .valueOf(job.getCounters().findCounter(MapperInterface.Counters.ROWS).getValue());
            } else if (mapReducePlugin.getResultMechanism() == PluginInterface.ResultMechanism.SGID) {
                SGID resultSGID = outputSet.getSGID();
                Class<? extends Atom> resultClass = (Class<? extends Atom>) mapReducePlugin.getResultClass();
                return (ReturnType) SWQEFactory.getQueryInterface().getLatestAtomBySGID(resultSGID, resultClass);
            } else if (mapReducePlugin.getResultMechanism() == PluginInterface.ResultMechanism.BATCHEDFEATURESET) {
                FeatureSet build = updateAndGet(outputSet);
                return (ReturnType) build;
            } else {
                throw new UnsupportedOperationException();

            }
        } catch (IOException ex) {
            Logger.getLogger(MRHBasePluginRunner.class.getName()).error(null, ex);
        } catch (InterruptedException ex) {
            Logger.getLogger(MRHBasePluginRunner.class.getName()).error(null, ex);
        } catch (ClassNotFoundException ex) {
            Logger.getLogger(MRHBasePluginRunner.class.getName()).error(null, ex);
        }
        return null;
    }

    public boolean isComplete() {
        try {
            return job.isComplete();

        } catch (IOException ex) {
            Logger.getLogger(MRHBasePluginRunner.class.getName()).error(null, ex);
        }
        return false;
    }

    @Override
    public PluginInterface getPlugin() {
        return mapReducePlugin;
    }

    public static FeatureSet updateAndGet(FeatureSet outputSet) {
        // after processing, outputSet will actually have been versioned several times, we need the latest one
        FeatureSet latestAtomBySGID = SWQEFactory.getQueryInterface().getLatestAtomBySGID(outputSet.getSGID(),
                FeatureSet.class);
        // TODO: remove the need for this hacky hack; there is some versioning issue here
        CreateUpdateManager modelManager = SWQEFactory.getModelManager();
        SGID sgid = latestAtomBySGID.getSGID();

        sgid.setBackendTimestamp(new Date());
        FeatureSet build = latestAtomBySGID.toBuilder().build();

        build.impersonate(sgid, latestAtomBySGID.getSGID());
        build.setPrecedingVersion(build);

        modelManager.persist(build);

        modelManager.close();
        return build;
    }

    public static String[] serializeParametersToString(Object[] parameters, PluginInterface mapReducePlugin,
            byte[] sSet, byte[] dSet) {
        String[] str_params = new String[6];
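        // slot layout, mirrored by PluginRunnerMapper.baseMapperSetup:
        // [0] external parameters, [1] internal parameters, [2] serialized source FeatureSet,
        // [3] serialized destination FeatureSet, [4] global settings map, [5] the plug-in itself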
        byte[] ext_serials = mapReducePlugin.handleSerialization(parameters);
        byte[] int_serials = mapReducePlugin.handleSerialization(mapReducePlugin.getInternalParameters());
        str_params[0] = Base64.encodeBase64String(ext_serials);
        str_params[1] = Base64.encodeBase64String(int_serials);
        str_params[2] = Base64.encodeBase64String(sSet);
        str_params[3] = Base64.encodeBase64String(dSet);
        str_params[4] = Base64.encodeBase64String(mapReducePlugin.handleSerialization(Constants.getSETTINGS_MAP()));
        str_params[5] = Base64.encodeBase64String(mapReducePlugin.handleSerialization(mapReducePlugin));
        return str_params;
    }

    public static class PluginRunnerReducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
            extends TableReducer<KEYIN, VALUEIN, KEYOUT> implements ReducerInterface<KEYOUT, VALUEOUT> {

        private PluginRunnerReducer.Context context;
        private MapReducePlugin mapReducePlugin;

        @Override
        protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context)
                throws IOException, InterruptedException {
            this.context = context;
            mapReducePlugin.reduce(key, values, this);
        }

        @Override
        public void write(KEYOUT keyout, VALUEOUT valueout) {
            try {
                context.write(keyout, valueout);
            } catch (IOException ex) {
                Logger.getLogger(MRHBasePluginRunner.class.getName()).error(null, ex);
            } catch (InterruptedException ex) {
                Logger.getLogger(MRHBasePluginRunner.class.getName()).error(null, ex);
            }
        }
    }

    public static class PluginRunnerMapper<KEYOUT, VALUEOUT> extends TableMapper<KEYOUT, VALUEOUT>
            implements MapperInterface<KEYOUT, VALUEOUT> {

        private MapReducePlugin mapReducePlugin;
        private PluginRunnerMapper.Context context;

        @Override
        public void incrementCounter() {
            context.getCounter(Counters.ROWS).increment(1L);
        }

        @Override
        public void write(KEYOUT keyout, VALUEOUT valueout) {
            try {
                context.write(keyout, valueout);
            } catch (IOException ex) {
                Logger.getLogger(MRHBasePluginRunner.class.getName()).error(null, ex);
            } catch (InterruptedException ex) {
                Logger.getLogger(MRHBasePluginRunner.class.getName()).error(null, ex);
            }
        }

        @Override
        protected void setup(Mapper.Context context) {
            this.baseMapperSetup(context);
            mapReducePlugin.mapInit(this);
        }

        @Override
        protected void cleanup(org.apache.hadoop.mapreduce.Mapper.Context context)
                throws IOException, InterruptedException {
            mapReducePlugin.mapCleanup();
        }

        @Override
        protected void map(ImmutableBytesWritable row, Result values, Mapper.Context context)
                throws IOException, InterruptedException {
            this.context = context;

            List<FeatureList> list = HBaseStorage.grabFeatureListsGivenRow(values, sourceSet.getSGID(),
                    SWQEFactory.getSerialization());
            Logger.getLogger(FeatureSetCountPlugin.class.getName())
                    .trace("Counting " + sourceSet.getSGID() + " on row with " + list.size() + " lists");
            Collection<Feature> consolidateRow = SimplePersistentBackEnd.consolidateRow(list);
            Logger.getLogger(FeatureSetCountPlugin.class.getName())
                    .trace("Consolidated to  " + consolidateRow.size() + " features");
            mapReducePlugin.map(consolidateRow, this);
        }

        /**
         * parameters that will be usable by the user (the writer of the
         * queries)
         */
        protected Object[] ext_parameters;
        /**
         * parameters that will be handled by the plug-in developer but will not
         * be available to the user of the plug-in
         */
        protected Object[] int_parameters;
        /**
         * the feature set that we will be reading
         */
        protected FeatureSet sourceSet;
        /**
         * the feature set that we will be writing to, may be null
         */
        protected FeatureSet destSet;

        @Override
        public Object[] getExt_parameters() {
            return ext_parameters;
        }

        @Override
        public Object[] getInt_parameters() {
            return int_parameters;
        }

        @Override
        public FeatureSet getSourceSet() {
            return sourceSet;
        }

        @Override
        public FeatureSet getDestSet() {
            return destSet;
        }

        private void baseMapperSetup(Context context) {
            Logger.getLogger(FeatureSetCountPlugin.class.getName()).info("Setting up mapper");
            Configuration conf = context.getConfiguration();
            String[] strings = conf.getStrings(MRHBasePluginRunner.EXT_PARAMETERS);
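            // indices match the layout written by serializeParametersToString:
            // [0] external parameters, [1] internal parameters, [2] source set, [3] dest set,
            // [4] settings map, [5] serialized plug-in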
            Logger.getLogger(PluginRunnerMapper.class.getName())
                    .info("QEMapper configured with: host: "
                            + Constants.Term.HBASE_PROPERTIES.getTermValue(Map.class).toString() + " namespace: "
                            + Constants.Term.NAMESPACE.getTermValue(String.class));
            final String mapParameter = strings[4];
            if (mapParameter != null && !mapParameter.isEmpty()) {
                Map<String, String> settingsMap = (Map<String, String>) ((Object[]) SerializationUtils
                        .deserialize(Base64.decodeBase64(mapParameter)))[0];
                if (settingsMap != null) {
                    Logger.getLogger(FeatureSetCountPlugin.class.getName())
                            .info("Settings map retrieved with " + settingsMap.size() + " entries");
                    Constants.setSETTINGS_MAP(settingsMap);
                }
            }
            Logger.getLogger(PluginRunnerMapper.class.getName())
                    .info("QEMapper configured with: host: "
                            + Constants.Term.HBASE_PROPERTIES.getTermValue(Map.class).toString() + " namespace: "
                            + Constants.Term.NAMESPACE.getTermValue(String.class));
            final String externalParameters = strings[0];
            if (externalParameters != null && !externalParameters.isEmpty()) {
                this.ext_parameters = (Object[]) SerializationUtils
                        .deserialize(Base64.decodeBase64(externalParameters));
            }
            final String internalParameters = strings[1];
            if (internalParameters != null && !internalParameters.isEmpty()) {
                this.int_parameters = (Object[]) SerializationUtils
                        .deserialize(Base64.decodeBase64(internalParameters));
            }
            final String sourceSetParameter = strings[2];
            if (sourceSetParameter != null && !sourceSetParameter.isEmpty()) {
                this.sourceSet = SWQEFactory.getSerialization().deserialize(Base64.decodeBase64(sourceSetParameter),
                        FeatureSet.class);
            }
            final String destSetParameter = strings[3];
            if (destSetParameter != null && !destSetParameter.isEmpty()) {
                this.destSet = SWQEFactory.getSerialization().deserialize(Base64.decodeBase64(destSetParameter),
                        FeatureSet.class);
            }
            final String pluginParameter = strings[5];
            if (pluginParameter != null && !pluginParameter.isEmpty()) {
                Object deserialize = SerializationUtils.deserialize(Base64.decodeBase64(pluginParameter));
                // yuck! I need a cleaner way to do this when done refactoring
                mapReducePlugin = (MapReducePlugin) ((Object[]) deserialize)[0];
            }
        }
    }

    public File handleFileResult(Path path) {
        FileSystem fs = null;
        try {
            Path outputPartPath = new Path(path, "part-r-00000");
            // copy file from HDFS to local temporary file
            Logger.getLogger(FeaturesByFilterPlugin.class.getName())
                    .info("Source file is " + outputPartPath.toString());
            Configuration conf = new Configuration();

            HBaseStorage.configureHBaseConfig(conf);

            HBaseConfiguration.addHbaseResources(conf);
            fs = FileSystem.get(conf);
            File createTempFile = File.createTempFile("vcf", "out");

            createTempFile.delete();
            Path outPath = new Path(createTempFile.toURI());
            FileSystem localSystem = FileSystem.get(new Configuration());

            Logger.getLogger(FeaturesByFilterPlugin.class.getName())
                    .info("Destination file is " + outPath.toString());
            if (!fs.exists(outputPartPath)) {
                Logger.getLogger(FeaturesByFilterPlugin.class.getName()).fatal("Input file not found");
            }

            if (!fs.isFile(outputPartPath)) {
                Logger.getLogger(FeaturesByFilterPlugin.class.getName()).fatal("Input should be a file");
            }

            if (localSystem.exists(outPath)) {
                Logger.getLogger(FeaturesByFilterPlugin.class.getName()).fatal("Output already exists");
            }
            // doesn't quite work yet; no time to finish before the poster, check results manually on HDFS

            FileUtil.copy(fs, outputPartPath, localSystem, outPath, true, true, conf);
            return new File(outPath.toUri());
        } catch (IOException ex) {
            Logger.getLogger(VCFDumperPlugin.class.getName()).fatal(null, ex);
        } finally {
            if (fs != null) {
                try {
                    fs.delete(path, true);
                } catch (IOException ex) {
                    Logger.getLogger(VCFDumperPlugin.class.getName())
                            .warn("IOException when clearing after text output", ex);
                }
            }
        }

        return null;
    }
}