datafu.hourglass.demo.GenerateIds.java Source code

Java tutorial

Introduction

Here is the source code for datafu.hourglass.demo.GenerateIds.java

Source

/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.demo;

import java.io.IOException;
import java.io.OutputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Random;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import datafu.hourglass.test.Schemas;

/**
 * Generate random test data in yyyy/MM/dd paths for
 * a given date range.
 *
 * <p>For every day in the requested range a single Avro file named
 * {@code part-00000.avro} is written under {@code <output-path>/yyyy/MM/dd}.
 * Each file holds a fixed number of {@code Event} records whose {@code id}
 * field is picked at random from the inclusive id range.</p>
 *
 * <p>Instances keep the parsed id range and a {@link SimpleDateFormat} as
 * instance state, so a single instance should not be used from several
 * threads at once.</p>
 *
 * @author "Matthew Hayes"
 *
 */
public class GenerateIds extends Configured implements NamedTool {
    /** Number of records written into each daily file. */
    private static final int RECORDS_PER_DAY = 1000;

    /** Name of the single Avro file created inside each daily directory. */
    private static final String OUTPUT_FILE_NAME = "part-00000.avro";

    /** Used both to parse the date range and to build the daily directory names. */
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd");

    private static final Schema EVENT_SCHEMA;

    private Random random = new Random();

    /** Inclusive lower bound of the generated ids; set by {@link #run(Configuration, String, String, String)}. */
    private int startId;

    /** Inclusive upper bound of the generated ids; set by {@link #run(Configuration, String, String, String)}. */
    private int endId;

    static {
        EVENT_SCHEMA = Schemas.createRecordSchema(GenerateIds.class, "Event",
                new Field("id", Schema.create(Type.LONG), null, null));
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code <output-path> <date-range> [<id-range>]}; the id range
     *             defaults to {@code 1-100} when omitted
     * @return 0 on success, 1 on a usage error, invalid argument or caught failure
     * @throws Exception declared by the signature; the checked exceptions raised by
     *                   the delegate are caught and reported here
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 2 || args.length > 3) {
            System.err.printf("%s   %s\n", getName(), getDescription());
            System.err.println("Usage: <output-path> <date-range> [<id-range>]");
            return 1;
        }

        String outputPath = args[0];
        String dateRange = args[1];
        String idRange = (args.length == 3) ? args[2] : "1-100";

        try {
            return run(super.getConf(), outputPath, dateRange, idRange);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        return 1;
    }

    /**
     * Validates the ranges and writes one data file per day.
     *
     * @param conf             configuration used to obtain the {@link FileSystem}
     * @param outputPathString root directory under which the yyyy/MM/dd directories are created
     * @param dateRange        either a single {@code yyyy/MM/dd} date or {@code start-end},
     *                         where start must be strictly before end; both ends are generated
     * @param idRange          either a single integer or {@code start-end} with start &lt;= end
     * @return 0 on success, 1 if either range is malformed (a message is printed to stderr)
     * @throws IOException            if the file system cannot be obtained or written to
     * @throws InterruptedException   declared by the signature; nothing in this method raises it directly
     * @throws ClassNotFoundException declared by the signature; nothing in this method raises it directly
     */
    public int run(Configuration conf, String outputPathString, String dateRange, String idRange)
            throws IOException, InterruptedException, ClassNotFoundException {
        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(outputPathString);

        String[] dateRangeParts = dateRange.split("-");
        String[] idRangeParts = idRange.split("-");

        if (dateRangeParts.length < 1 || dateRangeParts.length > 2) {
            System.err.println("Invalid date range: " + dateRange);
            return 1;
        }

        Date startDate = parseDate(dateRangeParts[0]);
        if (startDate == null) {
            return 1;
        }

        Date endDate = null;
        if (dateRangeParts.length == 2) {
            endDate = parseDate(dateRangeParts[1]);
            if (endDate == null) {
                return 1;
            }
            if (startDate.compareTo(endDate) >= 0) {
                System.err.println("Start date must be before end date");
                return 1;
            }
        }

        if (idRangeParts.length < 1 || idRangeParts.length > 2) {
            System.err.println("Invalid id range: " + idRange);
            return 1;
        }

        Integer firstId = parseId(idRangeParts[0]);
        if (firstId == null) {
            return 1;
        }

        Integer lastId = firstId;
        if (idRangeParts.length == 2) {
            lastId = parseId(idRangeParts[1]);
            if (lastId == null) {
                return 1;
            }
            if (lastId.intValue() < firstId.intValue()) {
                System.err.println("Start id must be before end id");
                return 1;
            }
        }

        // Only publish the range to the fields once it is known to be valid,
        // so a rejected argument never leaves the instance half-updated.
        startId = firstId.intValue();
        endId = lastId.intValue();

        if (endDate == null) {
            createDataForDate(fs, outputPath, startDate);
        } else {
            Calendar cal = Calendar.getInstance();
            // Both the start and the end date are included.
            for (Date date = startDate; date.compareTo(endDate) <= 0;) {
                createDataForDate(fs, outputPath, date);
                cal.setTime(date);
                cal.add(Calendar.DAY_OF_MONTH, 1);
                date = cal.getTime();
            }
        }

        return 0;
    }

    /**
     * Parses one end of the date range, reporting a failure on stderr.
     *
     * @param text date in {@code yyyy/MM/dd} form
     * @return the parsed date, or {@code null} if {@code text} could not be parsed
     */
    private Date parseDate(String text) {
        try {
            return dateFormat.parse(text);
        } catch (ParseException e) {
            System.err.println("Invalid date range: " + text);
            return null;
        }
    }

    /**
     * Parses one end of the id range, reporting a failure on stderr.
     *
     * @param text decimal integer
     * @return the parsed id, or {@code null} if {@code text} is not a valid integer
     */
    private Integer parseId(String text) {
        try {
            return Integer.valueOf(Integer.parseInt(text));
        } catch (NumberFormatException e) {
            System.err.println("Invalid id range: " + text);
            return null;
        }
    }

    /**
     * Picks the next id from the inclusive range {@code [startId, endId]}.
     *
     * @return a random id within the configured range
     */
    private long nextId() {
        if (startId == endId) {
            return startId;
        }

        // Widen to long: endId - startId + 1 does not fit in an int for the
        // range 0-2147483647, and Random.nextInt rejects non-positive bounds.
        long span = (long) endId - (long) startId + 1L;
        if (span <= Integer.MAX_VALUE) {
            return startId + (long) random.nextInt((int) span);
        }
        return startId + (long) (random.nextDouble() * span);
    }

    /**
     * Writes the Avro file for a single day, replacing any existing directory for that day.
     *
     * @param fs         file system to write to
     * @param outputPath root output directory; created if missing
     * @param date       day whose {@code yyyy/MM/dd} directory is (re)created
     * @throws IOException if any file system or Avro write operation fails
     */
    private void createDataForDate(FileSystem fs, Path outputPath, Date date) throws IOException {
        // make sure output path exists
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);
        }

        Path datePath = new Path(outputPath, dateFormat.format(date));

        System.out.println("Writing to " + datePath.toString() + " with range " + startId + " to " + endId);

        // delete directory if it already exists
        if (fs.exists(datePath)) {
            fs.delete(datePath, true);
        }

        OutputStream outputStream = fs.create(new Path(datePath, OUTPUT_FILE_NAME));
        try {
            DataFileWriter<GenericRecord> dataWriter =
                    new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());
            try {
                dataWriter.create(EVENT_SCHEMA, outputStream);

                // A single record instance is reused; only its id changes between appends.
                GenericRecord record = new GenericData.Record(EVENT_SCHEMA);
                for (int i = 0; i < RECORDS_PER_DAY; i++) {
                    record.put("id", nextId());
                    dataWriter.append(record);
                }
            } finally {
                dataWriter.close();
            }
        } finally {
            // Release the stream even when writing fails part-way through.
            outputStream.close();
        }
    }

    /**
     * @return the fixed tool name, {@code "generate"}
     */
    @Override
    public String getName() {
        return "generate";
    }

    /**
     * @return a short human-readable description of the tool
     */
    @Override
    public String getDescription() {
        return "create random event data";
    }
}