org.trend.hgraph.util.test.GenerateTestData.java Source code

Java tutorial

Introduction

Here is the source code for org.trend.hgraph.util.test.GenerateTestData.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.trend.hgraph.util.test;

import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Random;
import java.util.UUID;

import javax.activation.UnsupportedDataTypeException;

import org.apache.commons.lang.Validate;
import org.apache.commons.lang.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.trend.hgraph.HBaseGraphConstants;
import org.trend.hgraph.Properties;
import org.trend.hgraph.Properties.Pair;

/**
 * Generate test data for testing <code>trend-graph-hbase</code>.
 * @author scott_miao
 *
 */
public class GenerateTestData extends Configured implements Tool {

    private static Logger LOG = LoggerFactory.getLogger(GenerateTestData.class);

    private static final int DEFAULT_VERTEX_COUNT = 1000;

    private static final int DEFAULT_EDGE_COUNT_PER_VERTEX = 10;

    private static final int[] DEFAULT_EDGE_COUNT_FOR_VERTICES = { 5, 10, 20, 100, 1000 };
    private static final float[] DEFAULT_DIST_PERCS = { 0.2F, 0.3F, 0.35F, 0.1F, 0.05F };

    private static final Map<String, Object> DEFAULT_PROPERTY_KVS;
    static {
        DEFAULT_PROPERTY_KVS = new HashMap<String, Object>();

        DEFAULT_PROPERTY_KVS.put("a", "java");
        DEFAULT_PROPERTY_KVS.put("b", "hbase");
        DEFAULT_PROPERTY_KVS.put("c", "graph");
        DEFAULT_PROPERTY_KVS.put("d", 256);
        DEFAULT_PROPERTY_KVS.put("e", 1000000L);
        DEFAULT_PROPERTY_KVS.put("f", 20.235F);
        DEFAULT_PROPERTY_KVS.put("g", 3.1425678D);
        DEFAULT_PROPERTY_KVS.put("h", (short) 1);
        DEFAULT_PROPERTY_KVS.put("i", new BigDecimal("100000"));
        DEFAULT_PROPERTY_KVS.put("j", true);
    }

    private static final byte[] COLFAM_PROPERTY_NAME = Bytes
            .toBytes(HBaseGraphConstants.HBASE_GRAPH_TABLE_COLFAM_PROPERTY_NAME);

    private static final String[] LABELS = { "know", "create", "fight", "like", "not like", "link" };

    private static final Random random;
    static {
        random = new Random(System.currentTimeMillis());
    }

    private String vertexTable = null;
    private String edgeTable = null;

    private boolean isDistributionMode = false;

    private int vertexCount = DEFAULT_VERTEX_COUNT;
    private int edgeCountPerVertex = DEFAULT_EDGE_COUNT_PER_VERTEX;

    private int[] edgeCountForVertices = DEFAULT_EDGE_COUNT_FOR_VERTICES;

    /**
     * distribution percentages
     */
    private float[] distPercs = DEFAULT_DIST_PERCS;

    private long[] vertexCountForDistPercs = null;

    /**
     * Is first Vertex of each category ?
     */
    private boolean[] isFirstVertices = null;

    /**
     * for test purpose
     */
    private List<String> firstVertices = new ArrayList<String>();

    @Override
    public int run(String[] args) throws Exception {

        int countIndex = -1;
        // Process command line args
        for (int i = 0; i < args.length; i++) {
            String cmd = args[i];

            if (cmd.startsWith("-")) {
                if (countIndex >= 0) {
                    // command line args must be in the form: [opts] [count1
                    // [count2 ...]]
                    System.err.println("Invalid command line options");
                    printUsageAndExit();
                }

                if (cmd.equals("--help") || cmd.equals("-h")) {
                    // user asked for help, print the help and quit.
                    printUsageAndExit();
                } else if (cmd.equals("-ev")) {
                    i++;

                    if (i == args.length) {
                        System.err.println("-ev needs a ',' delimitered numeric value argument.");
                        printUsageAndExit();
                    }

                    String[] edgeCountStrs = args[i].split(",");
                    int[] edgeCounts = new int[edgeCountStrs.length];
                    for (int a = 0; a < edgeCountStrs.length; a++) {
                        try {
                            edgeCounts[a] = Integer.parseInt(edgeCountStrs[a]);
                        } catch (NumberFormatException e) {
                            System.err.println("-ev needs a ',' delimitered numeric value argument.");
                            printUsageAndExit();
                        }
                    }

                    for (int a = 1; a < edgeCounts.length; a++) {
                        if (edgeCounts[a - 1] > edgeCounts[a]) {
                            System.err.print("the before edgeCount shall always small than after");
                            System.err.println(", values:" + Arrays.toString(edgeCounts));
                            printUsageAndExit();
                        }
                    }

                    this.edgeCountForVertices = edgeCounts;
                } else if (cmd.equals("-p")) {
                    i++;

                    if (i == args.length) {
                        System.err.println("-p needs a ',' delimitered decimal value argument.");
                        printUsageAndExit();
                    }

                    String[] distPercsStrs = args[i].split(",");
                    float[] percs = new float[distPercsStrs.length];
                    for (int a = 0; a < distPercsStrs.length; a++) {
                        try {
                            percs[a] = Float.parseFloat(distPercsStrs[a]);
                        } catch (NumberFormatException e) {
                            System.err.println("-p needs a ',' delimitered decimal value argument.");
                            printUsageAndExit();
                        }
                    }

                    float sum = 0F;
                    for (float perc : percs) {
                        sum += perc;
                    }
                    if (sum != 1.0F) {
                        System.err.println("The total sum must be 1.0, values:" + Arrays.toString(percs));
                        printUsageAndExit();
                    }
                    this.distPercs = percs;
                } else if (cmd.equals("-d")) {
                    this.isDistributionMode = true;
                } else if (cmd.equals("-v")) {
                    i++;

                    if (i == args.length) {
                        System.err.println("-v needs a numeric value argument.");
                        printUsageAndExit();
                    }

                    try {
                        this.vertexCount = Integer.parseInt(args[i]);
                    } catch (NumberFormatException e) {
                        System.err.println("-v needs a numeric value argument.");
                        printUsageAndExit();
                    }
                } else if (cmd.equals("-e")) {
                    i++;

                    if (i == args.length) {
                        System.err.println("-e needs a numeric value argument.");
                        printUsageAndExit();
                    }

                    try {
                        this.edgeCountPerVertex = Integer.parseInt(args[i]);
                    } catch (NumberFormatException e) {
                        System.err.println("-e needs a numeric value argument.");
                        printUsageAndExit();
                    }
                }
            } else if (countIndex < 0) {
                // keep track of first count specified by the user
                countIndex = i;
            }
        }

        if (args.length != (countIndex + 2)) {
            System.err.println("the last two option must be <vertex-tablename> and <edge-tablename>");
            printUsageAndExit();
        }

        this.vertexTable = args[countIndex];
        this.edgeTable = args[countIndex + 1];

        if (this.isDistributionMode) {
            if (this.edgeCountForVertices.length != this.distPercs.length) {
                System.err.println("The element size between edge-count-for-vertices"
                        + " and distribution-percentages shall be the same");
                System.err.println("edge-count-for-vertices values:" + Arrays.toString(this.edgeCountForVertices)
                        + ", distribution-percentages values:" + Arrays.toString(this.distPercs));
                printUsageAndExit();
            }

            this.initialVertexCountForDistPercs();
            this.isFirstVertices = new boolean[this.edgeCountForVertices.length];

            LOG.info("-d, -ev:" + Arrays.toString(this.edgeCountForVertices) + ", -p:"
                    + Arrays.toString(this.distPercs) + ", vertex-table-name:" + this.vertexTable
                    + ", edge-table-name:" + this.edgeTable);
        } else {
            LOG.info("-v:" + this.vertexCount + ", -e:" + this.edgeCountPerVertex + ", vertex-table-name:"
                    + this.vertexTable + ", edge-table-name:" + this.edgeTable);
        }

        this.doGenerateTestData();

        return 0;
    }

    private void initialVertexCountForDistPercs() {
        this.vertexCountForDistPercs = new long[this.distPercs.length];
        for (int a = 0; a < this.distPercs.length; a++) {
            if (a + 1 == this.distPercs.length) {
                this.vertexCountForDistPercs[a] = this.vertexCount;
            } else if (a == 0) {
                this.vertexCountForDistPercs[a] = new BigDecimal(this.vertexCount)
                        .multiply(new BigDecimal((double) this.distPercs[a])).intValue();
            } else {
                this.vertexCountForDistPercs[a] = new BigDecimal(this.vertexCount)
                        .multiply(new BigDecimal((double) this.distPercs[a])).intValue()
                        + this.vertexCountForDistPercs[a - 1];
            }
        }
    }

    private void doGenerateTestData() throws IOException {
        HTable vertexTable = null;
        HTable edgeTable = null;
        Put put = null;
        long vIdx = 0;
        byte[] parentVertexKey = null;
        StopWatch timer = new StopWatch();
        timer.start();
        try {
            vertexTable = new HTable(this.getConf(), this.vertexTable);
            vertexTable.setAutoFlush(false);
            edgeTable = new HTable(this.getConf(), this.edgeTable);
            edgeTable.setAutoFlush(false);

            Queue<byte[]> parentVertexKeysQueue = new ArrayDeque<byte[]>();
            int tmpEdgeCountPerVertex = 0;
            int edgeAcctCount = 0;
            Properties.Pair<Integer, Integer> pair = null;
            for (int rowCount = 0; rowCount < this.vertexCount; rowCount++) {
                put = generateVertexPut();
                vertexTable.put(put);
                parentVertexKeysQueue.offer(put.getRow());

                if (rowCount > 0) {
                    vIdx = rowCount % tmpEdgeCountPerVertex;
                    if (vIdx == 0) {
                        parentVertexKey = parentVertexKeysQueue.poll();

                        edgeAcctCount++;
                        if (this.isDistributionMode && !this.isFirstVertices[pair.key]
                                && edgeAcctCount == tmpEdgeCountPerVertex) {
                            this.addFirstVertex(Bytes.toString(parentVertexKey));
                            this.isFirstVertices[pair.key] = true;
                        }
                        pair = this.determineEdgeCountPerVertex(rowCount);
                        tmpEdgeCountPerVertex = pair.value;
                        edgeAcctCount = 0;
                    } else if (vIdx > 0) {
                        edgeAcctCount++;
                        parentVertexKey = parentVertexKeysQueue.peek();
                    } else {
                        throw new RuntimeException("vIdex:" + vIdx + " shall always not small than 0");
                    }

                    put = generateEdgePut(rowCount, parentVertexKey, put.getRow());
                    edgeTable.put(put);
                } else {
                    pair = this.determineEdgeCountPerVertex(rowCount);
                    tmpEdgeCountPerVertex = pair.value;
                    if (!this.isDistributionMode)
                        this.addFirstVertex(Bytes.toString(put.getRow()));
                }
            }
            vertexTable.flushCommits();
            edgeTable.flushCommits();
        } catch (IOException e) {
            LOG.error("doGenerateTestData failed", e);
            throw e;
        } finally {
            if (null != vertexTable)
                vertexTable.close();
            if (null != edgeTable)
                edgeTable.close();
            timer.stop();
            LOG.info("Time elapsed:" + timer.toString() + ", " + timer.getTime() + " for pushing "
                    + this.vertexCount + " vertices test data to HBase");
            LOG.info("first vertices id:" + this.firstVertices);
        }
    }

    /**
     * @param rowCount
     * @return <code>pair.key</code> is idx used in {@link #edgeCountForVertices},<br>
     * <code>pair.value</code> is edgeCountPerVertex
     */
    private Properties.Pair<Integer, Integer> determineEdgeCountPerVertex(int rowCount) {
        int tmpEdgeCountPerVertex = 0;
        int idx = -1;
        int start = 0;
        int end = 0;

        if (this.isDistributionMode) {
            for (long count : this.vertexCountForDistPercs) {
                idx++;
                if (count > rowCount) {
                    if (idx == 0) {
                        start = 1;
                    } else {
                        start = this.edgeCountForVertices[idx - 1] + 1;
                    }
                    end = this.edgeCountForVertices[idx] + 1;
                    tmpEdgeCountPerVertex = random.nextInt(end - start) + start;
                    break;
                }
            }
        } else {
            tmpEdgeCountPerVertex = this.edgeCountPerVertex;
        }
        return new Pair<Integer, Integer>(idx, tmpEdgeCountPerVertex);
    }

    private static Put generateVertexPut() {
        Put put = null;
        String rowKey = UUID.randomUUID().toString();
        put = new Put(Bytes.toBytes(rowKey));
        generateProperties(put);
        return put;
    }

    private static Put generateEdgePut(int rowCount, byte[] parentVertexKey, byte[] vertexKey) {
        String label = null;
        Put put = null;
        String rowKey = null;

        //get label randomly
        label = LABELS[rowCount % LABELS.length];

        rowKey = Bytes.toString(parentVertexKey) + HBaseGraphConstants.HBASE_GRAPH_TABLE_EDGE_DELIMITER_1 + label
                + HBaseGraphConstants.HBASE_GRAPH_TABLE_EDGE_DELIMITER_2 + Bytes.toString(vertexKey);
        put = new Put(Bytes.toBytes(rowKey));
        generateProperties(put);
        return put;
    }

    private static void generateProperties(Put put) {
        Pair<byte[], byte[]> pair = null;
        for (Map.Entry<String, Object> entry : DEFAULT_PROPERTY_KVS.entrySet()) {
            try {
                pair = Properties.keyValueToBytes(entry.getKey(), entry.getValue());
                put.add(COLFAM_PROPERTY_NAME, pair.key, pair.value);
            } catch (UnsupportedDataTypeException e) {
                LOG.error("generate property pair failed", e);
                throw new RuntimeException(e);
            }
        }
    }

    private static void printUsageAndExit() {
        System.err.print(GenerateTestData.class.getSimpleName() + " Usage:");
        System.err.print(" [-D genericOption [...]] ");
        System.err.print(" [-v <vertex-count>] [-e <edge-count-per-vertex>]");
        System.err.print(" [-d [-ev <edge-count-for-vertices>] [-p <distribution-percentages>]]");
        System.err.println(" <vertex-tablename> <edge-tablename>");
        System.err.println("-h --help print usage");
        System.err.println("-v vertex-count, default value: " + DEFAULT_VERTEX_COUNT);
        System.err.println("-e edge-count-per-vertex, default value: " + DEFAULT_EDGE_COUNT_PER_VERTEX);
        System.err.println("-d enable normal-distribution mode");
        System.err.println("-ev edge-count-for-vertices, specify edge"
                + " count for each category of given vertex, delimitered by ','");
        System.err.println("    it is opposite with edge-count-per-vertex");
        System.err.println("    default vaule:" + Arrays.toString(DEFAULT_EDGE_COUNT_FOR_VERTICES));
        System.err.println("-p distribution-percentages, specify percentages"
                + " of vertices will have edge count, delimitered by ','");
        System.err.println("    default value:" + Arrays.toString(DEFAULT_DIST_PERCS));
        System.exit(1);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        int returnCode = ToolRunner.run(conf, new GenerateTestData(), args);
        System.exit(returnCode);
    }

    /**
     * @return the firstVertices
     */
    List<String> getFirstVertices() {
        return firstVertices;
    }

    void addFirstVertex(String id) {
        Validate.notEmpty(id, "id for firstVertices shall always not be null or empty");
        this.firstVertices.add(id);
    }

}