hibench.DataGenerator.java Source code

Introduction

Here is the source code for hibench.DataGenerator.java, the abstract base class behind HiBench's web data generator: it defines the data-generation lifecycle and the shared MapReduce jobs that build Zipfian id distributions and HTML pages.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package hibench;

import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS;

import hibench.MapReduceSet.ConcatTextCombiner;
import hibench.MapReduceSet.CreateHtmlPagesMapper;
import hibench.MapReduceSet.CreateZipfDistrMapper;
import hibench.MapReduceSet.HtmlMultipleTextOutputFormat;
import hibench.MapReduceSet.JoinContentWithZipfReducer;
import hibench.MapReduceSet.ReverseContentMapper;
import hibench.MapReduceSet.SumUpZipfMapper;
import hibench.MapReduceSet.SumUpZipfReducer;
import hibench.MapReduceSet.TagRecordsMapper;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;
import org.apache.hadoop.mapred.lib.NLineInputFormat;

public abstract class DataGenerator {
    public abstract void initGenerator() throws IOException;

    public abstract void genHtmlPages() throws IOException;

    public abstract void genResultData() throws Exception;

    public abstract void closeGenerator() throws IOException;

    // Shared logger for all generator subclasses, initialized once at class load.
    protected static final Log LOG = LogFactory.getLog(WebDataGen.class.getName());

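    // Template method: runs the four lifecycle steps in a fixed order.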
    public void generateData() throws Exception {

        initGenerator();
        genHtmlPages();
        genResultData();
        closeGenerator();
    }

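    // Writes a "dummy" seed file to HDFS containing one line per agent
    // (1..agents); downstream jobs read it with NLineInputFormat, so each
    // agent line can drive its own map task.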
    public void createDummy(Path dummy, int agents) throws IOException {

        LOG.info("Creating dummy file " + dummy + "with " + agents + " agents...");

        DataPaths.checkHdfsFile(dummy, false);

        FileSystem fs = dummy.getFileSystem(new Configuration());
        FSDataOutputStream out = fs.create(dummy);

        // Build one line per agent id ("1\n2\n...") without the quadratic
        // cost of repeated String concatenation.
        StringBuilder dummyContent = new StringBuilder();
        for (int i = 1; i <= agents; i++) {
            dummyContent.append(i).append('\n');
        }

        out.write(dummyContent.toString().getBytes("UTF-8"));
        out.close();
    }

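    // MapReduce job 1: sums up the Zipfian distribution. A single reducer is
    // used so the required space is accumulated sequentially.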
    public void sumUpZipf(Path fin, Path fout, ZipfRandom zipf) throws IOException {
        LOG.info("Summing up Zipfian Id Distirubtion...");

        JobConf job = new JobConf(WebDataGen.class);
        String jobname = fin.getName() + " -> " + fout.getName();
        job.setJobName(jobname);

        zipf.setJobConf(job);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(SumUpZipfMapper.class);
        job.setReducerClass(SumUpZipfReducer.class);

        job.setNumReduceTasks(1); // Important to sequentially accumulate the required space

        job.setInputFormat(NLineInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);

        DataPaths.checkHdfsFile(fout, false);

        FileInputFormat.setInputPaths(job, fin);
        FileOutputFormat.setOutputPath(job, fout);

        LOG.info("Running Job: " + jobname);
        LOG.info("Dummy file: " + fin);
        LOG.info("Zipfian sum up file as Ouput: " + fout);
        JobClient.runJob(job);
        LOG.info("Finished Running Job: " + jobname);
    }

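    // MapReduce job 2: a map-only job that expands the summed distribution
    // into the actual Zipfian id distribution, then records the number of
    // virtual elements from the MAP_OUTPUT_RECORDS counter.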
    public void createZipf(Path fin, Path fout, ZipfRandom zipf) throws IOException {
        LOG.info("Creating Zipfian Id Distirubtion...");

        JobConf job = new JobConf(WebDataGen.class);
        String jobname = fin.getName() + " -> " + fout.getName();

        job.setJobName(jobname);

        zipf.setJobConf(job);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(CreateZipfDistrMapper.class);

        job.setNumReduceTasks(0);

        job.setInputFormat(NLineInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, fin);
        FileOutputFormat.setOutputPath(job, fout);

        DataPaths.checkHdfsFile(fout, false);

        LOG.info("Running Job: " + jobname);
        LOG.info("Zipfian Sum File: " + fin);
        LOG.info("Zipfian Id distribution as Ouput: " + fout);
        RunningJob jobCreateZipf = JobClient.runJob(job);
        LOG.info("Finished Running Job: " + jobname);

        long vElems = jobCreateZipf.getCounters().getCounter(MAP_OUTPUT_RECORDS);
        LOG.info("Created " + vElems + " virtual zipfian elements");
        zipf.setVirtElems(vElems);
    }

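    // Convenience wrapper: dummy file -> Zipfian sum -> Zipfian distribution.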
    public void genZipfDist(Path dummy, Path zipfSum, Path zipfDist, ZipfRandom zipf) throws IOException {
        sumUpZipf(dummy, zipfSum, zipf);
        createZipf(zipfSum, zipfDist, zipf);
    }

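    // Joins the content file (keyed by virtual ids) against the Zipfian id
    // distribution via MultipleInputs, replacing virtual ids with real ids
    // in the reduce phase.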
    public void replaceIds(Path fcontent, Path fids, Path fjoin, ZipfRandom zipf) throws IOException {

        LOG.info("Replace Virtual Zipfian Ids with real Ids...");

        JobConf job = new JobConf(WebDataGen.class);
        String jobname = fcontent.getName() + " JOIN " + fids.getName() + " -> " + fjoin.getName();

        job.setJobName(jobname);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        MultipleInputs.addInputPath(job, fids, TextInputFormat.class, TagRecordsMapper.class);
        MultipleInputs.addInputPath(job, fcontent, TextInputFormat.class, ReverseContentMapper.class);
        job.setOutputFormat(TextOutputFormat.class);

        // use a combiner to cut down the number of records each reducer receives
        job.setCombinerClass(ConcatTextCombiner.class);
        job.setReducerClass(JoinContentWithZipfReducer.class);

        if (zipf.reds > 0) {
            job.setNumReduceTasks(zipf.reds);
        } else {
            job.setNumReduceTasks(DataOptions.getMaxNumReduce());
        }

        FileOutputFormat.setOutputPath(job, fjoin);

        LOG.info("Running Job: " + jobname);
        LOG.info("Zipfian Id distribution: " + fids);
        LOG.info("Content file with virtual Ids: " + fcontent);
        LOG.info("Joint result file: " + fjoin);
        JobClient.runJob(job);
        LOG.info("Finished Running Job: " + jobname);
    }

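    // Map-only job that generates the HTML pages from the dummy seed file,
    // writing links, words, and urls via HtmlMultipleTextOutputFormat into a
    // tmp folder whose contents are then promoted to the parent path.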
    public void createHtmlPages(Path dummy, HtmlConf html) throws IOException {

        LOG.info("Creating Html Pages...");

        Path fout = new Path(dummy.getParent(), "tmp");

        JobConf job = new JobConf(WebDataGen.class);
        String jobname = "Create html pages to " + fout.getName();

        job.setJobName(jobname);

        html.setJobConf(job);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(CreateHtmlPagesMapper.class);

        job.setNumReduceTasks(0);

        FileInputFormat.setInputPaths(job, dummy);

        // first create result files under tmp folder
        FileOutputFormat.setOutputPath(job, fout);

        // begin from dummy file
        job.setInputFormat(NLineInputFormat.class);

        // use MultipleTextOutputFormat to produce three out files defined
        // in PathConf, i.e., LINK, PAGE_ZLINK_TABLE, PAGE_ZWORD_TABLE
        job.setOutputFormat(HtmlMultipleTextOutputFormat.class);

        LOG.info("Running Job: " + jobname);
        LOG.info("Dummy file: " + dummy);
        LOG.info("Multiple result Html files as <links, words, urls>");
        JobClient.runJob(job);
        LOG.info("Finished Running Job: " + jobname);

        // Move the result files out of tmp into the parent path,
        // then remove the now-empty tmp folder
        DataPaths.moveFilesToParent(fout);
    }
}
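
For orientation, here is a minimal sketch of a concrete subclass. It is not part of HiBench: the class name, log messages, and main() driver are placeholders, shown only to illustrate how the four lifecycle hooks plug into generateData().

package hibench;

import java.io.IOException;

// Minimal illustrative subclass; not part of HiBench. Method bodies are
// placeholders showing where real work (e.g. createDummy, genZipfDist,
// createHtmlPages) would be wired in.
public class ExampleDataGenerator extends DataGenerator {

    @Override
    public void initGenerator() throws IOException {
        LOG.info("init: resolve HDFS paths and job options");
    }

    @Override
    public void genHtmlPages() throws IOException {
        LOG.info("pages: e.g., createDummy(...) then createHtmlPages(...)");
    }

    @Override
    public void genResultData() throws Exception {
        LOG.info("data: e.g., genZipfDist(...) then replaceIds(...)");
    }

    @Override
    public void closeGenerator() throws IOException {
        LOG.info("close: remove temporary outputs");
    }

    public static void main(String[] args) throws Exception {
        new ExampleDataGenerator().generateData();
    }
}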