List of usage examples for org.apache.hadoop.mapred FileInputFormat setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
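Before the project examples, here is a minimal, hypothetical sketch of the call in context (old "mapred" API). The class name, job name, and paths below are placeholders and are not taken from any example on this page; note that setInputPaths replaces the job's input paths, while addInputPath appends to them.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public static void runExample() throws java.io.IOException {
    // ExampleJob, the job name, and all paths are placeholders for illustration only.
    JobConf conf = new JobConf(ExampleJob.class);
    conf.setJobName("example");

    // Set the job's input paths: one Path, several Paths (varargs), or a
    // comma-separated String overload may be used.
    FileInputFormat.setInputPaths(conf, new Path("/data/in/a"), new Path("/data/in/b"));

    // Append one more input path without replacing the ones already set.
    FileInputFormat.addInputPath(conf, new Path("/data/in/c"));

    FileOutputFormat.setOutputPath(conf, new Path("/data/out"));
    JobClient.runJob(conf);
}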
From source file:com.inmobi.messaging.consumer.databus.mapred.TestDatabusInputFormat.java
License:Apache License
/**
 * Reads the collector file (i.e. a non-compressed file) and asserts on the
 * messages that were read.
 */
@Test
public void testDatabusInputFormat() throws Exception {
    FileInputFormat.setInputPaths(defaultConf, collectorDir);
    splitFile(5);
    assertMessages(100);
}
From source file:com.inmobi.messaging.consumer.databus.mapreduce.TestDatabusInputFormatMapReduce.java
License:Apache License
/**
 * Reads the collector file (i.e. a non-compressed file) and asserts on the
 * messages that were read.
 */
@Test
public void testDatabusInputFormatMapReduce() throws Exception {
    FileInputFormat.setInputPaths(defaultConf, collectorDir);
    context = getTaskAttemptContext(defaultConf, taskId);
    List<Path> collectorFilePaths = new ArrayList<Path>();
    listAllPaths(collectorDir, collectorFilePaths);
    if (collectorFilePaths.size() > 0) {
        splitFile(5, collectorFilePaths.get(0));
    }
    assertMessages(100);
}
From source file:com.inmobi.messaging.consumer.databus.mapreduce.TestDatabusInputFormatMapReduce.java
License:Apache License
/**
 * Reads the local stream file (i.e. a compressed file) and asserts on the
 * messages that were read.
 */
@Test
protected void testGZFile() throws Exception {
    Path localstreamDir = new Path(cluster.getLocalFinalDestDirRoot(), testStream);
    List<Path> minuteDirs = new ArrayList<Path>();
    listAllPaths(localstreamDir, minuteDirs);
    if (minuteDirs.size() > 0) {
        FileInputFormat.setInputPaths(defaultConf, minuteDirs.get(0).getParent());
        context = getTaskAttemptContext(defaultConf, taskId);
        readMessages = new ArrayList<Message>();
        splitFile(1, minuteDirs.get(0));
        LOG.info("number msgs read from gz files " + readMessages.size());
        assertMessages(0);
    }
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.HashIdMR.java
License:Open Source License
/**
 * @param inputpath
 *          the path to a unique vertex list. Each line is parsed into
 *          (vid, data) using {@code vidparser} and {@code vdataparser}.
 * @param outputpath
 *          the path of the output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(HashIdMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(HashIdMapper.class);
    conf.setReducerClass(HashIdReducer.class);

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(MultiDirOutputFormat.class);

    conf.setInt("mapred.line.input.format.linespermap", linespermap);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("VdataParser", vdataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("====== Job: Create integer Id maps for vertices ==========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("Lines per map = " + linespermap);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("VdataParser = " + vdataparser.getClass().getName());
    LOG.info("==========================================================");

    JobClient.runJob(conf);
    LOG.info("======================= Done =====================\n");
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortDictMR.java
License:Open Source License
/**
 * @param inputpath
 *          the path to a rawId to newId dictionary.
 * @param outputpath
 *          the path of the output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(SortDictMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(SortDictMapper.class);
    conf.setReducerClass(SortDictReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setBoolean("hashRawVid", hashRawVid);
    conf.setInt("numChunks", numChunks);
    conf.set("VidParser", vidparser.getClass().getName());

    String outprefix = "vidhashmap";
    for (int i = 0; i < numChunks; i++) {
        MultipleOutputs.addNamedOutput(conf, outprefix + i, TextOutputFormat.class, Text.class, Text.class);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("========== Job: Partition the map of rawid -> id ===========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("======================================================");
    if (hashRawVid)
        LOG.info("Partition on rawId.");
    else
        LOG.info("Partition on newId.");
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("VidParser = " + vidparser.getClass().getName());

    JobClient.runJob(conf);
    LOG.info("======================= Done ==========================\n");
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortEdgeMR.java
License:Open Source License
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(SortEdgeMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(SortEdgeMapper.class);
    conf.setReducerClass(SortEdgeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("==== Job: Partition the input edges by hash(sourceid) =========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");

    JobClient.runJob(conf);
    LOG.info("=================== Done ====================================\n");
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.TransEdgeMR.java
License:Open Source License
/**
 * @param inputpath
 *          path of the partitioned edge list
 * @param outputpath
 *          path of the output directory
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(TransEdgeMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(TransEdgeMapper.class);
    conf.setReducerClass(TransEdgeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());
    conf.set("dictionaryPath", dictionaryPath);

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("============= Job: Normalize Ids in Edges ====================");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("Dictionary = " + dictionaryPath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");

    JobClient.runJob(conf);
    LOG.info("========================= Done ===============================");
}
From source file:com.intel.hadoop.graphbuilder.partition.mapreduce.vrecord.VrecordIngressMR.java
License:Open Source License
public void run(int numProcs, String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(VrecordIngressMR.class);
    conf.setJobName("Vrecord Mapreduce");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(VrecordIngressMapper.class);
    conf.setReducerClass(VrecordIngressReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(MultiDirOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    if (gzip) {
        TextOutputFormat.setCompressOutput(conf, true);
        TextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    }

    LOG.info("====== Job: Distributed Vertex Records to partitions =========");
    LOG.info("input: " + inputpath);
    LOG.info("output: " + outputpath);
    LOG.info("numProc = " + numProcs);
    LOG.info("gzip = " + Boolean.toString(gzip));
    LOG.info("==============================================================");

    JobClient.runJob(conf);
    LOG.info("==========================Done===============================");
}
From source file:com.intel.hadoop.graphbuilder.preprocess.mapreduce.EdgeTransformMR.java
License:Open Source License
public void run(String inputpath, String outputpath) throws IOException {
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(mapkeytype.getClass());
    conf.setMapOutputValueClass(mapvaltype.getClass());

    conf.setMapperClass(EdgeTransformMapper.class);
    conf.setCombinerClass(EdgeTransformCombiner.class);
    conf.setReducerClass(EdgeTransformReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.set("ReduceFunc", reducefunc.getClass().getName());
    conf.set("ApplyFunc", applyfunc.getClass().getName());
    conf.setBoolean("reduceEndPoint", reduceEndPoint);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("============== Job: Data Transformation on Edges ==========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("reducefunc = " + reducefunc.getClass().getName());
    LOG.info("applyfunc = " + applyfunc.getClass().getName());
    if (reduceEndPoint == SOURCE)
        LOG.info("Reduce on source");
    else
        LOG.info("Reduce on target");

    if (!checkTypes()) {
        LOG.fatal("Type check failed."
                + " Please check the parser and reduce/apply functions are consistent with key/val types.");
        return;
    }

    LOG.info("===========================================================");
    JobClient.runJob(conf);
    LOG.info("======================== Done ============================\n");
}
From source file:com.jackbe.mapreduce.LocalJobManager.java
License:Open Source License
public RunningJob startJob(String inputDir, String outputDir, String mapperScript, String reducerScript,
        String combinerScript) throws Exception {
    init();
    conf.setJobName("EMMLMapReduce");
    // conf.setSessionId(Long.toString(System.currentTimeMillis()));
    conf.set("MAPPER_SCRIPT", mapperScript);
    conf.set("REDUCER_SCRIPT", reducerScript);
    if (combinerScript != null) {
        conf.set("COMBINER_SCRIPT", combinerScript);
        conf.setCombinerClass(EMMLCombiner.class);
    }

    // FileInputFormat.setInputPaths(conf, new Path(inputDir));
    FileInputFormat.setInputPaths(conf, new Path("hdfs://" + NAMENODE + "/" + inputDir));

    // FileOutputFormat.setOutputPath(conf, new Path(outputDir));
    Path outputPath = new Path("hdfs://" + NAMENODE + "/" + outputDir);
    outputPath.getFileSystem(conf).delete(outputPath, true);
    FileOutputFormat.setOutputPath(conf, outputPath);

    RESTRegistrationJobCallback callback = new RESTRegistrationJobCallback(outputPath, outputDir, conf);

    RunningJob job = null;
    try {
        job = jobClient.submitJob(conf);
        this.registerJobCompleteCallback(job, callback);
        statusMap.put(job.getJobID(), job);
    } catch (IOException e) {
        e.printStackTrace();
        throw e;
    }
    jobClient.getSystemDir();
    return job;
}