Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

List of usage examples for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

On this page you can find example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths) 

Document

Set the array of Paths as the list of inputs for the map-reduce job.
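
For orientation, here is a minimal sketch of calling the varargs overload. The class name, input paths, and input format below are placeholders for illustration, not taken from the examples that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputPathsSketch {
    public static JobConf configureInputs() {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setInputFormat(TextInputFormat.class);
        // setInputPaths replaces any previously configured inputs with exactly
        // the Paths passed here; use FileInputFormat.addInputPath to append instead.
        FileInputFormat.setInputPaths(conf, new Path("/data/input1"), new Path("/data/input2"));
        return conf;
    }
}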

Usage

From source file: com.inmobi.messaging.consumer.databus.mapred.TestDatabusInputFormat.java

License: Apache License

/**
 * Reads the collector file (i.e. a non-compressed file) and asserts on the
 * read messages.
 */
@Test
public void testDatabusInputFormat() throws Exception {
    FileInputFormat.setInputPaths(defaultConf, collectorDir);
    splitFile(5);
    assertMessages(100);
}

From source file: com.inmobi.messaging.consumer.databus.mapreduce.TestDatabusInputFormatMapReduce.java

License: Apache License

/**
 * Reads the collector file (i.e. a non-compressed file) and asserts on the
 * read messages.
 */
@Test
public void testDatabusInputFormatMapReduce() throws Exception {
    FileInputFormat.setInputPaths(defaultConf, collectorDir);
    context = getTaskAttemptContext(defaultConf, taskId);
    List<Path> collectorFilePaths = new ArrayList<Path>();
    listAllPaths(collectorDir, collectorFilePaths);

    if (collectorFilePaths.size() > 0) {
        splitFile(5, collectorFilePaths.get(0));
    }
    assertMessages(100);
}

From source file: com.inmobi.messaging.consumer.databus.mapreduce.TestDatabusInputFormatMapReduce.java

License: Apache License

/**
 * Reads the local stream file (i.e. a compressed file) and asserts on the
 * read messages.
 */
@Test
protected void testGZFile() throws Exception {
    Path localstreamDir = new Path(cluster.getLocalFinalDestDirRoot(), testStream);
    List<Path> minuteDirs = new ArrayList<Path>();
    listAllPaths(localstreamDir, minuteDirs);
    if (minuteDirs.size() > 0) {
        FileInputFormat.setInputPaths(defaultConf, minuteDirs.get(0).getParent());
        context = getTaskAttemptContext(defaultConf, taskId);
        readMessages = new ArrayList<Message>();
        splitFile(1, minuteDirs.get(0));
        LOG.info("number msgs read from gz files  " + readMessages.size());
        assertMessages(0);
    }
}

From source file: com.intel.hadoop.graphbuilder.idnormalize.mapreduce.HashIdMR.java

License: Open Source License

/**
 * @param inputpath
 *          the path to a unique vertex list. Each line is parsed into (vid,
 *          data) using {@code vidparser} and {@code vdataparser}.
 * @param outputpath
 *          the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(HashIdMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(HashIdMapper.class);
    conf.setReducerClass(HashIdReducer.class);

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(MultiDirOutputFormat.class);

    conf.setInt("mapred.line.input.format.linespermap", linespermap);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("VdataParser", vdataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("====== Job: Create integer Id maps for vertices ==========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("Lines per map = 6000000");
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("VdataParser = " + vdataparser.getClass().getName());
    LOG.info("==========================================================");
    JobClient.runJob(conf);
    LOG.info("=======================Done =====================\n");
}

From source file: com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortDictMR.java

License: Open Source License

/**
 * @param inputpath
 *          the path to a rawId to newId dictionary.
 * @param outputpath
 *          the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(SortDictMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(SortDictMapper.class);
    conf.setReducerClass(SortDictReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setBoolean("hashRawVid", hashRawVid);
    conf.setInt("numChunks", numChunks);
    conf.set("VidParser", vidparser.getClass().getName());

    String outprefix = "vidhashmap";
    for (int i = 0; i < numChunks; i++) {
        MultipleOutputs.addNamedOutput(conf, outprefix + i, TextOutputFormat.class, Text.class, Text.class);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("========== Job: Partition the map of rawid -> id ===========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("======================================================");
    if (hashRawVid)
        LOG.info("Partition on rawId.");
    else
        LOG.info("Partition on newId");
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    JobClient.runJob(conf);
    LOG.info("======================= Done ==========================\n");
}

From source file: com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortEdgeMR.java

License: Open Source License

public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(SortEdgeMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(SortEdgeMapper.class);
    conf.setReducerClass(SortEdgeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("==== Job: Partition the input edges by hash(sourceid) =========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");

    JobClient.runJob(conf);
    LOG.info("=================== Done ====================================\n");
}

From source file: com.intel.hadoop.graphbuilder.idnormalize.mapreduce.TransEdgeMR.java

License: Open Source License

/**
 * @param inputpath
 *          path of the partitioned edge list
 * @param outputpath
 *          path of the output directory
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(TransEdgeMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(TransEdgeMapper.class);
    conf.setReducerClass(TransEdgeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    conf.set("dictionaryPath", dictionaryPath);

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("============= Job: Normalize Ids in Edges ====================");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("Dictionary = " + dictionaryPath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");

    JobClient.runJob(conf);

    LOG.info("========================= Done ===============================");
}

From source file: com.intel.hadoop.graphbuilder.partition.mapreduce.vrecord.VrecordIngressMR.java

License: Open Source License

public void run(int numProcs, String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(VrecordIngressMR.class);
    conf.setJobName("Vrecord Mapreduce");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(VrecordIngressMapper.class);
    conf.setReducerClass(VrecordIngressReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(MultiDirOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    if (gzip) {
        TextOutputFormat.setCompressOutput(conf, true);
        TextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    }

    LOG.info("====== Job: Distributed Vertex Records to partitions =========");
    LOG.info("input: " + inputpath);
    LOG.info("output: " + outputpath);
    LOG.info("numProc = " + numProcs);
    LOG.info("gzip = " + Boolean.toString(gzip));
    LOG.info("==============================================================");

    JobClient.runJob(conf);
    LOG.info("==========================Done===============================");
}

From source file: com.intel.hadoop.graphbuilder.preprocess.mapreduce.EdgeTransformMR.java

License: Open Source License

public void run(String inputpath, String outputpath) throws IOException {

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(mapkeytype.getClass());
    conf.setMapOutputValueClass(mapvaltype.getClass());

    conf.setMapperClass(EdgeTransformMapper.class);
    conf.setCombinerClass(EdgeTransformCombiner.class);
    conf.setReducerClass(EdgeTransformReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.set("ReduceFunc", reducefunc.getClass().getName());
    conf.set("ApplyFunc", applyfunc.getClass().getName());
    conf.setBoolean("reduceEndPoint", reduceEndPoint);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("============== Job: Data Transformation on Edges ==========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("reducefunc = " + reducefunc.getClass().getName());
    LOG.info("applyfunc = " + applyfunc.getClass().getName());
    if (reduceEndPoint == SOURCE)
        LOG.info("Reduce on source");
    else
        LOG.info("Reduce on target");

    if (!checkTypes()) {
        LOG.fatal("Type check failed."
                + " Please check the parser and reduce/apply functions are consistent with key/val types.");
        return;
    }
    LOG.info("===========================================================");
    JobClient.runJob(conf);
    LOG.info("======================== Done ============================\n");
}

From source file: com.jackbe.mapreduce.LocalJobManager.java

License: Open Source License

public RunningJob startJob(String inputDir, String outputDir, String mapperScript, String reducerScript,
        String combinerScript) throws Exception {

    init();
    conf.setJobName("EMMLMapReduce");
    //conf.setSessionId(Long.toString(System.currentTimeMillis()));

    conf.set("MAPPER_SCRIPT", mapperScript);
    conf.set("REDUCER_SCRIPT", reducerScript);
    if (combinerScript != null) {
        conf.set("COMBINER_SCRIPT", combinerScript);
        conf.setCombinerClass(EMMLCombiner.class);
    }

    //      FileInputFormat.setInputPaths(conf, new Path(inputDir));
    FileInputFormat.setInputPaths(conf, new Path("hdfs://" + NAMENODE + "/" + inputDir));
    //      FileOutputFormat.setOutputPath(conf, new Path(outputDir));
    Path outputPath = new Path("hdfs://" + NAMENODE + "/" + outputDir);
    outputPath.getFileSystem(conf).delete(outputPath, true);
    FileOutputFormat.setOutputPath(conf, outputPath);
    RESTRegistrationJobCallback callback = new RESTRegistrationJobCallback(outputPath, outputDir, conf);

    RunningJob job = null;
    try {
        job = jobClient.submitJob(conf);
        this.registerJobCompleteCallback(job, callback);

        statusMap.put(job.getJobID(), job);
    } catch (IOException e) {
        e.printStackTrace();
        throw e;
    }
    jobClient.getSystemDir();
    return job;
}