org.trend.hgraph.mapreduce.pagerank.GetNoColumnsRows.java Source code


Introduction

Here is the source code for org.trend.hgraph.mapreduce.pagerank.GetNoColumnsRows.java.
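
GetNoColumnsRows is a map-only MapReduce Tool from the hgraph project. It scans an HBase table and writes out, as plain text, the row keys of rows that do not have the given family:qualifier column(s): with -a (AND, the default) a row is collected only when it contains none of the given columns, with -o (OR) when it is missing at least one of them. The mapper tracks ROWS and COLLECTED_ROWS counters, and the tool reads the latter back for its tests.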

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.trend.hgraph.mapreduce.pagerank;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A map-only MapReduce {@link Tool} that scans an HBase table and collects the
 * row keys of rows that do not have the given column(s). With <code>-a</code>
 * (AND, the default) a row is collected only if it contains none of the given
 * columns; with <code>-o</code> (OR) it is collected if at least one of the
 * given columns is missing.
 * @author scott_miao
 */
public class GetNoColumnsRows extends Configured implements Tool {

    private static final Logger LOGGER = LoggerFactory.getLogger(GetNoColumnsRows.class);

    /**
     * number of rows collected by the last run; exposed only for tests
     */
    private long collectedRow;

    protected GetNoColumnsRows(Configuration conf) {
        super(conf);
    }

    private static class Mapper extends TableMapper<Text, NullWritable> {

        // configuration keys set by run(): AND_OR carries the -a/-o flag,
        // NO_COLUMNS the list of family:qualifier strings
        public static final String AND_OR = "hgraph.mapreduce.nocolumns.and";
        public static final String NO_COLUMNS = "hgraph.mapreduce.nocolumns";

        private boolean and = true;
        private Pair<byte[][], byte[][]> pair = null;

        enum Counters {
            ROWS, COLLECTED_ROWS
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            context.getCounter(Counters.ROWS).increment(1L);

            // check, for each configured family:qualifier, whether this row contains it
            List<Boolean> founds = new ArrayList<Boolean>(pair.getFirst().length);
            for (int a = 0; a < pair.getFirst().length; a++) {
                founds.add(value.containsColumn(pair.getFirst()[a], pair.getSecond()[a]));
            }

            // AND (-a, default): emit only when the row contains none of the given
            // columns; OR (-o): emit when at least one given column is missing
            boolean write = false;
            if (and) {
                write = true;
                for (Boolean found : founds) {
                    if (Boolean.TRUE.equals(found)) {
                        write = false;
                        break;
                    }
                }
            } else {
                for (Boolean found : founds) {
                    if (Boolean.FALSE.equals(found)) {
                        write = true;
                        break;
                    }
                }
            }
            if (write) {
                // respect the writable's offset/length rather than assuming
                // key.get() holds exactly the row key
                Text rowKey = new Text();
                rowKey.set(key.get(), key.getOffset(), key.getLength());
                context.write(rowKey, NullWritable.get());
                context.getCounter(Counters.COLLECTED_ROWS).increment(1L);
            }
        }

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            and = conf.getBoolean(AND_OR, true);
            pair = parseColumns(conf.getStrings(NO_COLUMNS));
        }

        private static Pair<byte[][], byte[][]> parseColumns(String[] columns) {
            Validate.notEmpty(columns, "columns shall not be empty or null");
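            // e.g. the hypothetical input {"cf:a", "cf:b"} parses to
            // first = {"cf", "cf"} (families) and second = {"a", "b"} (qualifiers)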
            List<byte[]> first = new ArrayList<byte[]>();
            List<byte[]> second = new ArrayList<byte[]>();
            String[] cfcq = null;
            for (String column : columns) {
                cfcq = StringUtils.split(column, ":");
                Validate.notEmpty(cfcq, "split failed for column:" + column);
                Validate.isTrue(cfcq.length == 2,
                        "parse failed, expected <family>:<qualifier> but got:" + column);
                Validate.notEmpty(cfcq[0], "family is empty or null for column:" + column);
                Validate.notEmpty(cfcq[1], "qualifier is empty or null for column:" + column);

                first.add(Bytes.toBytes(cfcq[0]));
                second.add(Bytes.toBytes(cfcq[1]));
            }

            Validate.isTrue(first.size() == second.size(), "the parsed size is not equal, family.size:"
                    + first.size() + ", qualifier.size:" + second.size());

            return new Pair<byte[][], byte[][]>(first.toArray(new byte[][] {}), second.toArray(new byte[][] {}));
        }

    }
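
    // Illustration of the -a/-o semantics for two configured columns, say the
    // hypothetical "cf:a" and "cf:b":
    //   row contains neither column -> emitted under both -a and -o
    //   row contains only "cf:a"    -> emitted under -o only
    //   row contains both columns   -> never emitted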

    /* (non-Javadoc)
     * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
     */
    @Override
    public int run(String[] args) throws Exception {
        if (null == args || args.length == 0) {
            System.err.println("no any option given !!");
            printUsage();
            return -1;
        }

        System.out.println("options:" + Arrays.toString(args));
        boolean and = true;
        String cmd = null;
        int mustStartIdx = -1;
        for (int a = 0; a < args.length; a++) {
            cmd = args[a];
            if (cmd.startsWith("-")) {
                if (mustStartIdx > -1) {
                    System.err.println("option order is incorrect !!");
                    printUsage();
                    return -1;
                }

                if ("-a".equals(cmd)) {
                    and = true;
                } else if ("-o".equals(cmd)) {
                    and = false;
                } else {
                    System.err.println("option is not defined !!");
                    printUsage();
                    return -1;
                }
            } else {
                if (mustStartIdx == -1) {
                    mustStartIdx = a;
                }
            }
        }

        if (mustStartIdx == -1 || args.length - mustStartIdx < 3) {
            System.err.println("missing required arguments !!");
            printUsage();
            return -1;
        }

        String tableName = args[mustStartIdx];
        String outputPath = args[mustStartIdx + 1];
        List<String> columns = new ArrayList<String>();
        for (int a = mustStartIdx + 2; a < args.length; a++) {
            columns.add(args[a]);
        }

        LOGGER.info("tableName=" + tableName);
        LOGGER.info("outputPath=" + outputPath);
        LOGGER.info("columns=" + columns);

        // hand the AND/OR flag and the column list to the mapper via the job configuration
        Configuration conf = this.getConf();
        conf.setBoolean(Mapper.AND_OR, and);
        conf.setStrings(Mapper.NO_COLUMNS, columns.toArray(new String[] {}));

        Job job = createSubmittableJob(conf, tableName, outputPath);
        boolean success = job.waitForCompletion(true);
        if (!success) {
            System.err.println("run job:" + job.getJobName() + " failed");
            return -1;
        }

        // for test: capture how many row keys the mapper emitted
        collectedRow =
                job.getCounters().findCounter(Mapper.Counters.COLLECTED_ROWS).getValue();

        return 0;
    }

    public static Job createSubmittableJob(Configuration conf, String tableName, String outputPath)
            throws IOException {
        Validate.notEmpty(tableName, "tableName shall always not be empty");
        Validate.notEmpty(outputPath, "outputPath shall always not be empty");

        long timestamp = System.currentTimeMillis();
        Job job = null;
        String jobName = null;
        try {
            jobName = "GetNoCoumnsRows_" + timestamp;
            LOGGER.info("start to run job:" + jobName);
            job = new Job(conf, jobName);
            job.setJarByClass(GetNoColumnsRows.class);

            LOGGER.info("tableName=" + tableName);
            LOGGER.info("outputPath=" + outputPath);

            // full-table scan: every row in the table is fed to the mapper
            Scan scan = new Scan();
            TableMapReduceUtil.initTableMapperJob(tableName, scan, Mapper.class, Text.class, NullWritable.class,
                    job, true, TableInputFormat.class);

            // only mapper
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setNumReduceTasks(0);

            FileOutputFormat.setOutputPath(job, new Path(outputPath));
        } catch (IOException e) {
            LOGGER.error("run " + jobName + " failed", e);
            throw e;
        }
        return job;
    }

    private static void printUsage() {
        System.err.println(GetNoColumnsRows.class.getSimpleName()
                + " Usage: [-a | -o] <table> <output-path> <family:qualifier> [<family:qualifier> [...]]");
        System.err.println("Find the row keys of rows that do not have the given column(s).");
        System.err.println("  -a: AND (default), collect rows containing none of the given columns");
        System.err.println("  -o: OR, collect rows missing at least one of the given columns");
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Tool tool = new GetNoColumnsRows(conf);
        int status = ToolRunner.run(tool, args);
        System.exit(status);
    }

    protected long getCollectedRow() {
        return collectedRow;
    }

}
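
For reference, here is a minimal sketch of how the tool might be driven programmatically, for example from a test. The table name, output path, and column names are hypothetical; because the constructor is protected, the sketch assumes it lives in the same package, and it assumes an HBase cluster reachable through the default client configuration.

package org.trend.hgraph.mapreduce.pagerank;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.util.ToolRunner;

public class GetNoColumnsRowsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // hypothetical table, output path, and columns; -o collects the row
        // keys of rows missing at least one of the listed columns
        int status = ToolRunner.run(new GetNoColumnsRows(conf), new String[] {
                "-o", "vertex", "/tmp/no-columns-rows",
                "property:pageRank", "property:tmpPageRank" });
        System.exit(status);
    }
}

From the shell, the equivalent would be something like hadoop jar <hgraph-jar> org.trend.hgraph.mapreduce.pagerank.GetNoColumnsRows -o vertex /tmp/no-columns-rows property:pageRank property:tmpPageRank, since main() already wires HBaseConfiguration into ToolRunner.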