List of usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
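Job.setMapOutputValueClass declares the class of the values emitted by the mapper. It only needs to be called when the map output value type differs from the job's final output value type (set via setOutputValueClass), and it throws IllegalStateException if the job has already been submitted. Before the per-project examples, here is a minimal, self-contained driver sketch; the class, mapper, and path names are illustrative placeholders, not taken from the examples below:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordLengthDriver {

    // Emits <word, length-of-word> for every token in the input.
    public static class WordLengthMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    context.write(new Text(token), new IntWritable(token.length()));
                }
            }
        }
    }

    // Emits <word, "len=N"> -- note the value type differs from the map output.
    public static class WordLengthReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int len = 0;
            for (IntWritable v : values) {
                len = v.get();
            }
            context.write(key, new Text("len=" + len));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word-length");
        job.setJarByClass(WordLengthDriver.class);
        job.setMapperClass(WordLengthMapper.class);
        job.setReducerClass(WordLengthReducer.class);

        // The mapper emits IntWritable values but the reducer writes Text values,
        // so the intermediate value class must be declared explicitly.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The real-world drivers below make the same call with their own intermediate value types (LongWritable, KpiWritable, BytesWritable, KeyValue, and so on).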
From source file:com.bigdog.hadoop.mapreduce.group.GroupApp.java
public void group() throws Exception {
    final Configuration configuration = new Configuration();
    final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), configuration);
    if (fileSystem.exists(new Path(OUT_PATH))) {
        fileSystem.delete(new Path(OUT_PATH), true);
    }

    final Job job = new Job(configuration, GroupApp.class.getSimpleName());

    // 1.1 input path and format
    FileInputFormat.setInputPaths(job, INPUT_PATH);
    job.setInputFormatClass(TextInputFormat.class);

    // 1.2 mapper, emitting <k2,v2>
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(NewK2.class);
    job.setMapOutputValueClass(LongWritable.class);

    // 1.3 partitioning
    job.setPartitionerClass(HashPartitioner.class);
    job.setNumReduceTasks(1);

    // 1.4 custom grouping comparator
    job.setGroupingComparatorClass(MyGroupingComparator.class);

    // 1.5 combiner: none

    // 2.2 reducer, emitting <k3,v3>
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);

    // 2.3 output path and format
    FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
    job.setOutputFormatClass(TextOutputFormat.class);

    // submit the job and wait for completion
    job.waitForCompletion(true);
}
From source file:com.bigdog.hadoop.mapreduce.partition.KpiApp.java
public void kpi() throws Exception {
    final Job job = new Job(new Configuration(), KpiApp.class.getSimpleName());
    job.setJarByClass(KpiApp.class);

    // 1.1 input path and format
    FileInputFormat.setInputPaths(job, INPUT_PATH);
    job.setInputFormatClass(TextInputFormat.class);

    // 1.2 mapper, emitting <k2,v2>
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(KpiWritable.class);

    // 1.3 custom partitioner, with two reduce tasks
    job.setPartitionerClass(KpiPartitioner.class);
    job.setNumReduceTasks(2);

    // 1.4 grouping: default
    // 1.5 combiner: none

    // 2.2 reducer, emitting <k3,v3>
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(KpiWritable.class);

    // 2.3 output path and format
    FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
    job.setOutputFormatClass(TextOutputFormat.class);

    // submit the job and wait for completion
    job.waitForCompletion(true);
}
From source file:com.bigfishgames.biginsights.upsight.mapreduce.MapReduceAvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("wordcount");

    // We call setOutputKeySchema first so we can override the configuration
    // parameters it sets
    // AvroJob.setOutputKeySchema(job,
    //         Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.NULL)));
    AvroJob.setOutputKeySchema(job, Event.getClassSchema());
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MyAvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
    return 0;
}
From source file:com.bizosys.hsearch.kv.indexer.KVIndexer.java
License:Apache License
/**
 * Given the indexing parameters, starts an indexing job.
 * The different indexing types are:
 * SF2HB = Simple file (csv, tsv) to HBase directly.
 * SF2HF = Simple file (csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * SF2MF = Simple file (csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * MF2HB = Map file (key and value as csv, tsv) to HBase.
 * MF2HF = Map file (key and value as csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * MF2MF = Map file (key and value as csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * HB2HB = HBase to HBase.
 * HB2HF = HBase to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * HB2MF = HBase to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * @param args
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void execute(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    if (args.length < 7) {
        String err = "Usage : " + KVIndexer.class
                + " <<Job Type(SF2HB|SF2HF|SF2MF...)>> <<Input Source>> <<Output Sink>> <<XML File Configuration>> <<Skip Header(true|false)>> <<Run KeyGeneration Job>> <<Number Of reducer>> <<Speculative Execution>> <<scanner-cache-size>> <<filter>>";
        IdSearchLog.l.fatal(err);
        System.exit(1);
    }

    String msg = this.getClass().getName() + " > Initializing indexer job.";
    IdSearchLog.l.info(msg);

    int seq = 0;
    int len = args.length;

    String jobType = (len > seq) ? args[seq++] : "";
    String inputSource = (len > seq) ? args[seq++] : "";
    String outputSink = (len > seq) ? args[seq++] : "/tmp/hsearch-index";
    String xmlFilePath = (len > seq) ? args[seq++] : "";
    String skipHeader = (len > seq) ? args[seq++] : "false";
    boolean runKeyGenJob = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : false;
    int numberOfReducer = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 1;
    boolean speculativeExecution = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : true;
    int scannerCacheSize = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 300;
    String filter = (len > seq) ? args[seq++] : "";

    if (isEmpty(jobType)) {
        String err = this.getClass().getName()
                + " > Please enter Job type as one of these :\n SF2HB|SF2HF|SF2MF|MF2HB|MF2HF|MF2MF|HB2HB|HB2HF|HB2MF|IMF2HF";
        System.err.println(err);
        throw new IOException(err);
    }

    if (isEmpty(inputSource)) {
        String err = this.getClass().getName() + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();
    FieldMapping fm = createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputSink = outputSink.charAt(outputSink.length() - 1) == '/' ? outputSink : outputSink + "/";
    outputSink = outputSink + fm.tableName;

    createHBaseTable(fm);

    KVIndexer.FAM_NAME = fm.familyName.getBytes();
    KVIndexer.FIELD_SEPARATOR = fm.fieldSeparator;

    conf.set(XML_FILE_PATH, xmlFilePath);
    conf.set(OUTPUT_FOLDER, outputSink);
    conf.set(SKIP_HEADER, skipHeader);
    conf.set(RAW_FILE_SEPATATOR, String.valueOf(fm.fieldSeparator));

    Job job = Job.getInstance(conf, "com.bizosys.hsearch.kv.indexing.KVIndexer type : " + jobType + "\n"
            + inputSource + "\n" + outputSink);
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(numberOfReducer);

    Integer jobTypeI = JobTypeMapping.get(jobType);
    if (jobTypeI == null)
        throw new IOException("Invalid Jobtype " + jobType);

    /*
     * If an internal key index is given, generate the keys first and then index;
     * otherwise just run the indexer, creating keys from HBase.
     */
    boolean keyGenjobStatus = false;
    if (-1 != fm.internalKey && runKeyGenJob) {
        Configuration keyGenConf = HBaseConfiguration.create();
        keyGenConf.set(INPUT_SOURCE, inputSource);
        keyGenConf.set(XML_FILE_PATH, xmlFilePath);
        keyGenConf.set(OUTPUT_FOLDER, outputSink);
        keyGenConf.set(SKIP_HEADER, skipHeader);

        Job keyGenJob = Job.getInstance(keyGenConf, "Creating Keys KVKeyGenerator for " + inputSource);

        switch (jobTypeI) {
        case SF2HB:
        case SF2HF:
        case SF2MF: {
            FileInputFormat.addInputPath(keyGenJob, new Path(inputSource));

            keyGenJob.setMapperClass(KVKeyGeneratorMapperFile.class);
            keyGenJob.setInputFormatClass(TextInputFormat.class);
            keyGenJob.setMapOutputKeyClass(Text.class);
            keyGenJob.setMapOutputValueClass(Text.class);

            keyGenJob.setReducerClass(KVKeyGeneratorReducerFile.class);
            keyGenJob.setNumReduceTasks(numberOfReducer);
            keyGenJob.setOutputKeyClass(NullWritable.class);
            keyGenJob.setOutputValueClass(Text.class);

            inputSource = outputSink + "_" + INPUTWITH_KEY;
            Path intermediatePath = new Path(inputSource);
            System.out.println("Final input path " + inputSource);
            FileOutputFormat.setOutputPath(keyGenJob, intermediatePath);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        case HB2HB:
        case HB2HF:
        case HB2MF: {
            Scan scan = new Scan();
            scan.setCaching(scannerCacheSize);
            scan.setCacheBlocks(false);

            byte[] family = fm.familyName.getBytes();
            for (String name : fm.nameWithField.keySet()) {
                Field fld = fm.nameWithField.get(name);
                if (!fld.isMergedKey)
                    continue;
                scan.addColumn(family, fld.sourceName.trim().getBytes());
            }

            TableMapReduceUtil.initTableMapperJob(inputSource, // input table
                    scan, // Scan instance to control CF and attribute selection
                    KVKeyGeneratorMapperHBase.class, // mapper class
                    Text.class, // mapper output key
                    ImmutableBytesWritable.class, // mapper output value
                    keyGenJob);

            TableMapReduceUtil.initTableReducerJob(inputSource, // output table
                    KVKeyGeneratorReducerHBase.class, // reducer class
                    keyGenJob);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        default:
            break;
        }
    }

    /*
     * Run the job based on job type, e.g. SF2HB, SF2MF, SF2HF.
     */
    System.out.println("Sending path " + inputSource);
    runJob(jobTypeI, job, fm, inputSource, outputSink, scannerCacheSize, filter);
}
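For reference, this is how the ten positional arguments parsed by execute() might line up on the command line. This is a sketch only; the jar name and file paths are hypothetical, and only the argument order comes from the usage string above:

    hadoop jar hsearch-index.jar com.bizosys.hsearch.kv.indexer.KVIndexer \
        SF2HB /data/input.tsv /tmp/hsearch-index /conf/schema.xml false true 4 true 300 ""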
From source file:com.bizosys.hsearch.kv.indexer.KVIndexer.java
License:Apache License
private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output,
        int scannerCacheSize, String filter) throws IOException, InterruptedException, ClassNotFoundException {

    int jobStatus = -1;

    switch (jobTypeI) {
    case SF2HB: {
        IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case SF2MF: {
        IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case SF2HF: {
        /*
         * First creates a map file and then converts it to an HFile.
         * Creates an intermediate dir for the map file output.
         */
        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intermediate output folder "
                + intermediateFolder + " final output dir " + output);

        // reset the output folder to the intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);

        int jobT = JobTypeMapping.get("SF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {
            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        // delete the intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        // delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case HB2HB: {
        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());

        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                Text.class, // mapper output key
                BytesWritable.class, // mapper output value
                job);

        TableMapReduceUtil.initTableReducerJob(fm.tableName, // output table
                KVReducerHBase.class, // reducer class
                job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case HB2HF: {
        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intermediate output folder "
                + intermediateFolder + " final output dir " + output);

        // reset the output folder to the intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);

        int jobT = JobTypeMapping.get("HB2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {
            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        // delete the intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        // delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case HB2MF: {
        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());

        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                Text.class, // mapper output key
                BytesWritable.class, // mapper output value
                job);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case IMF2HF: {
        Path finalOutputDir = new Path(output);
        job.setJarByClass(KVIndexer.class);
        job.setMapperClass(KVMapperHFile.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, finalOutputDir);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        HTable hTable = new HTable(job.getConfiguration(), fm.tableName);
        HFileOutputFormat.configureIncrementalLoad(job, hTable);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    default:
        throw new IOException("Invalid Jobtype " + jobTypeI);
    }
}
From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java
License:Apache License
/**
 * Given the indexing parameters, starts an indexing job.
 * The different indexing types are:
 * SF2HB = Simple file (csv, tsv) to HBase directly.
 * SF2HF = Simple file (csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * SF2MF = Simple file (csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * MF2HB = Map file (key and value as csv, tsv) to HBase.
 * MF2HF = Map file (key and value as csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * MF2MF = Map file (key and value as csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * HB2HB = HBase to HBase.
 * HB2HF = HBase to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * HB2MF = HBase to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * @param args
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void execute(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    if (args.length < 7) {
        String err = "Usage : " + KVIndexer.class
                + " <<Job Type(SF2HB|SF2HF|SF2MF...)>> <<Input Source>> <<Output Sink>> <<XML File Configuration>> <<Skip Header(true|false)>> <<Run KeyGeneration Job>> <<Number Of reducer>> <<Speculative Execution>> <<scanner-cache-size>> <<filter>>";
        IdSearchLog.l.fatal(err);
        System.exit(1);
    }

    String msg = this.getClass().getName() + " > Initializing indexer job.";
    IdSearchLog.l.info(msg);

    int seq = 0;
    int len = args.length;

    String jobType = (len > seq) ? args[seq++] : "";
    String inputSource = (len > seq) ? args[seq++] : "";
    String outputSink = (len > seq) ? args[seq++] : "/tmp/hsearch-index";
    String xmlFilePath = (len > seq) ? args[seq++] : "";
    String skipHeader = (len > seq) ? args[seq++] : "false";
    boolean runKeyGenJob = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : false;
    int numberOfReducer = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 1;
    boolean speculativeExecution = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : true;
    int scannerCacheSize = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 300;
    String filter = (len > seq) ? args[seq++] : "";

    if (isEmpty(jobType)) {
        String err = this.getClass().getName()
                + " > Please enter Job type as one of these :\n SF2HB|SF2HF|SF2MF|MF2HB|MF2HF|MF2MF|HB2HB|HB2HF|HB2MF|IMF2HF";
        System.err.println(err);
        throw new IOException(err);
    }

    if (isEmpty(inputSource)) {
        String err = this.getClass().getName() + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();
    FieldMapping fm = createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputSink = outputSink.charAt(outputSink.length() - 1) == '/' ? outputSink : outputSink + "/";
    outputSink = outputSink + fm.tableName;

    createHBaseTable(fm);

    KVIndexer.FAM_NAME = fm.familyName.getBytes();
    KVIndexer.FIELD_SEPARATOR = fm.fieldSeparator;

    conf.set(XML_FILE_PATH, xmlFilePath);
    conf.set(OUTPUT_FOLDER, outputSink);
    conf.set(SKIP_HEADER, skipHeader);
    conf.setBoolean("mapreduce.map.speculative", speculativeExecution);

    Job job = Job.getInstance(conf, "com.bizosys.hsearch.kv.indexing.KVIndexer type : " + jobType + "\n"
            + inputSource + "\n" + outputSink);
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(numberOfReducer);

    Integer jobTypeI = JobTypeMapping.get(jobType);
    if (jobTypeI == null)
        throw new IOException("Invalid Jobtype " + jobType);

    /*
     * If an internal key index is given, generate the keys first and then index;
     * otherwise just run the indexer, creating keys from HBase.
     */
    boolean keyGenjobStatus = false;
    if (-1 != fm.internalKey && runKeyGenJob) {
        Configuration keyGenConf = HBaseConfiguration.create();
        keyGenConf.set(INPUT_SOURCE, inputSource);
        keyGenConf.set(XML_FILE_PATH, xmlFilePath);
        keyGenConf.set(OUTPUT_FOLDER, outputSink);
        keyGenConf.set(SKIP_HEADER, skipHeader);

        Job keyGenJob = Job.getInstance(keyGenConf, "Creating Keys KVKeyGenerator for " + inputSource);

        switch (jobTypeI) {
        case SF2HB:
        case SF2HF:
        case SF2MF: {
            FileInputFormat.addInputPath(keyGenJob, new Path(inputSource));

            keyGenJob.setMapperClass(KVKeyGeneratorMapperFile.class);
            keyGenJob.setInputFormatClass(TextInputFormat.class);
            keyGenJob.setMapOutputKeyClass(Text.class);
            keyGenJob.setMapOutputValueClass(Text.class);

            keyGenJob.setReducerClass(KVKeyGeneratorReducerFile.class);
            keyGenJob.setNumReduceTasks(numberOfReducer);
            keyGenJob.setOutputKeyClass(NullWritable.class);
            keyGenJob.setOutputValueClass(Text.class);

            inputSource = outputSink + "_" + INPUTWITH_KEY;
            Path intermediatePath = new Path(inputSource);
            System.out.println("Final input path " + inputSource);
            FileOutputFormat.setOutputPath(keyGenJob, intermediatePath);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        case HB2HB:
        case HB2HF:
        case HB2MF: {
            Scan scan = new Scan();
            scan.setCaching(scannerCacheSize);
            scan.setCacheBlocks(false);

            // Added filter
            if (null != filter) {
                if (filter.trim().length() > 0) {
                    int index = filter.indexOf('=');
                    scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                            filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                            filter.substring(index + 1).getBytes()));
                }
            }

            byte[] family = fm.familyName.getBytes();
            for (String name : fm.nameWithField.keySet()) {
                Field fld = fm.nameWithField.get(name);
                if (!fld.isMergedKey)
                    continue;
                scan.addColumn(family, fld.sourceName.trim().getBytes());
            }

            TableMapReduceUtil.initTableMapperJob(inputSource, // input table
                    scan, // Scan instance to control CF and attribute selection
                    KVKeyGeneratorMapperHBase.class, // mapper class
                    Text.class, // mapper output key
                    ImmutableBytesWritable.class, // mapper output value
                    keyGenJob);

            TableMapReduceUtil.initTableReducerJob(inputSource, // output table
                    KVKeyGeneratorReducerHBase.class, // reducer class
                    keyGenJob);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        case MF2HB:
        case MF2HF:
        case MF2MF: {
            break;
        }
        default:
            break;
        }
    }

    /*
     * Run the job based on job type, e.g. SF2HB, SF2MF, SF2HF.
     */
    System.out.println("Sending path " + inputSource);
    runJob(jobTypeI, job, fm, inputSource, outputSink, scannerCacheSize, filter);
}
From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java
License:Apache License
private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output,
        int scannerCacheSize, String filter) throws IOException, InterruptedException, ClassNotFoundException {

    int jobStatus = -1;

    switch (jobTypeI) {
    case SF2HB: {
        IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case SF2HF: {
        // First creates a map file and then converts it to an HFile.
        // Creates an intermediate dir for the map file output.
        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intermediate output folder "
                + intermediateFolder + " final output dir " + output);

        // reset the output folder to the intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);

        int jobT = JobTypeMapping.get("SF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {
            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        // delete the intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        // delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case SF2MF: {
        IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        job.setSortComparatorClass(TextPair.FirstComparator.class);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case MF2HB: {
        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case MF2HF: {
        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for MF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intermediate output folder "
                + intermediateFolder + " final output dir " + output);

        // reset the output folder to the intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);

        int jobT = JobTypeMapping.get("MF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {
            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        // delete the intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        // delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case MF2MF: {
        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case HB2HB: {
        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());

        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class, // mapper output key
                Text.class, // mapper output value
                job);

        TableMapReduceUtil.initTableReducerJob(fm.tableName, // output table
                KVReducerHBase.class, // reducer class
                job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case HB2HF: {
        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intermediate output folder "
                + intermediateFolder + " final output dir " + output);

        // reset the output folder to the intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);

        int jobT = JobTypeMapping.get("HB2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {
            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        // delete the intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        // delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case HB2MF: {
        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());

        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class, // mapper output key
                Text.class, // mapper output value
                job);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case IMF2HF: {
        Path finalOutputDir = new Path(output);
        job.setJarByClass(KVIndexer.class);
        job.setMapperClass(KVMapperHFile.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, finalOutputDir);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        HTable hTable = new HTable(job.getConfiguration(), fm.tableName);
        HFileOutputFormat.configureIncrementalLoad(job, hTable);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    default:
        throw new IOException("Invalid Jobtype " + jobTypeI);
    }
}
From source file:com.bizosys.hsearch.kv.indexing.KVReplicatorHFile.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    int seq = 0;
    String inputFile = (args.length > seq) ? args[seq++] : "";
    String hfileOutputFile = (args.length > seq) ? args[seq++] : "";
    String tableName = (args.length > seq) ? args[seq++] : "";
    String familyName = (args.length > seq) ? args[seq++] : "1";
    String replaceFrom = (args.length > seq) ? args[seq++] : "";
    String replaceTo = (args.length > seq) ? args[seq++] : "";
    String startIndex = (args.length > seq) ? args[seq++] : "";
    String endIndex = (args.length > seq) ? args[seq++] : "";

    if (null == inputFile || inputFile.trim().isEmpty()) {
        String err = KVReplicatorHFile.class + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();
    conf.set(TABLE_NAME, tableName);
    conf.set(FAMILY_NAME, familyName);
    conf.set(REPLACE_FROM, replaceFrom);
    conf.set(REPLACE_TO, replaceTo);
    conf.set(START_INDEX, startIndex);
    conf.set(END_INDEX, endIndex);

    try {
        List<HColumnDescriptor> colFamilies = new ArrayList<HColumnDescriptor>();
        HColumnDescriptor cols = new HColumnDescriptor(familyName.getBytes());
        colFamilies.add(cols);
        HDML.create(tableName, colFamilies);
    } catch (HBaseException e) {
        e.printStackTrace();
    }

    Job job = Job.getInstance(conf, "KVReplicatorHBase - creating HFile");
    job.setJarByClass(KVReplicatorHFile.class);
    job.setMapperClass(KVHFileWriterMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputFile.trim()));
    FileOutputFormat.setOutputPath(job, new Path(hfileOutputFile.trim()));

    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);

    HTable hTable = new HTable(conf, tableName);
    HFileOutputFormat.configureIncrementalLoad(job, hTable);

    boolean result = job.waitForCompletion(true);
    return (result ? 0 : 1);
}
From source file:com.bizosys.hsearch.kv.indexing.KVReplicatorMapFile.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    int seq = 0;
    String inputFile = (args.length > seq) ? args[seq++] : "";
    String outputFile = (args.length > seq) ? args[seq++] : "/tmp/hsearch-index";
    String outputFileName = (args.length > seq) ? args[seq++] : "file1";
    String xmlFilePath = (args.length > seq) ? args[seq++] : "";
    String replaceFrom = (args.length > seq) ? args[seq++] : "";
    String replaceTo = (args.length > seq) ? args[seq++] : "";
    String startIndex = (args.length > seq) ? args[seq++] : "";
    String endIndex = (args.length > seq) ? args[seq++] : "";
    String numberOfReducerStr = (args.length > seq) ? args[seq] : "1";
    int numberOfReducer = Integer.parseInt(numberOfReducerStr);

    if (null == inputFile || inputFile.trim().isEmpty()) {
        String err = KVReplicatorHFile.class + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();
    FieldMapping fm = KVIndexer.createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputFile = outputFile.charAt(outputFile.length() - 1) == '/' ? outputFile : outputFile + "/";
    outputFile = outputFile + fm.tableName;

    conf.set(OUTPUT_FILE_PATH, outputFile);
    conf.set(OUTPUT_FILE_NAME, outputFileName);
    conf.set(REPLACE_FROM, replaceFrom);
    conf.set(REPLACE_TO, replaceTo);
    conf.set(START_INDEX, startIndex);
    conf.set(END_INDEX, endIndex);

    Job job = Job.getInstance(conf, "KVReplicatorMapFile - Replicating Map File");
    job.setJarByClass(KVReplicatorMapFile.class);

    job.setMapperClass(KVReplicatorMapper.class);
    job.setReducerClass(KVReplicatorReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(numberOfReducer);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputFile.trim()));

    FileSystem fs = FileSystem.get(conf);
    Path dummyPath = new Path("/tmp", "dummy");
    if (fs.exists(dummyPath)) {
        fs.delete(dummyPath, true);
    }
    FileOutputFormat.setOutputPath(job, dummyPath);

    boolean result = job.waitForCompletion(true);
    return (result ? 0 : 1);
}
From source file:com.blackberry.logdriver.util.Cat.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf(); // Configuration processed by ToolRunner

    // If run by Oozie, then load the Oozie conf too
    if (System.getProperty("oozie.action.conf.xml") != null) {
        conf.addResource(new URL("file://" + System.getProperty("oozie.action.conf.xml")));
    }

    FileSystem fs = FileSystem.get(conf);

    // The command line options
    List<Path> paths = new ArrayList<Path>();
    Path outputDir = null;

    // Load input files from the command line
    if (args.length < 2) {
        System.out.println("usage: [genericOptions] input [input ...] output");
        System.exit(1);
    }

    // Get the files we need from the command line.
    for (int i = 0; i < args.length - 1; i++) {
        for (FileStatus f : fs.globStatus(new Path(args[i]))) {
            paths.add(f.getPath());
        }
    }
    outputDir = new Path(args[args.length - 1]);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    Configuration jobConf = job.getConfiguration();

    job.setJarByClass(Cat.class);
    jobConf.setIfUnset("mapred.job.name", "Cat Files");

    // To propagate credentials within Oozie
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        jobConf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    // Good output separators include things that are unsupported by XML. So we
    // just send the byte value of the character through. The restriction here
    // is that it can't be more than 1 byte when UTF-8 encoded, since it will be
    // read by Pig which only deals with single byte separators.
    {
        String outputSeparator = jobConf.get("logdriver.output.field.separator", DEFAULT_OUTPUT_SEPARATOR);
        byte[] bytes = outputSeparator.getBytes(UTF_8);
        if (bytes.length != 1) {
            LOG.error("The output separator must be a single byte in UTF-8.");
            return 1;
        }
        jobConf.set("logdriver.output.field.separator", Byte.toString(bytes[0]));
    }

    job.setInputFormatClass(BoomInputFormat.class);
    job.setMapperClass(CatMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    for (Path path : paths) {
        BoomInputFormat.addInputPath(job, path);
    }

    // Run the job.
    if (conf.getBoolean("job.wait", DEFAULT_WAIT_JOB)) {
        return job.waitForCompletion(true) ? 0 : 1;
    } else {
        job.submit();
        return 0;
    }
}