Usage examples for org.apache.hadoop.mapred.FileInputFormat.addInputPath
public static void addInputPath(JobConf conf, Path path)
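Before the collected examples, a minimal sketch of the typical call pattern. This is not taken from any of the source files below; the class name AddInputPathExample and the argument layout (two input paths, one output path) are placeholders for illustration. addInputPath registers one input file or directory on the JobConf; calling it again adds further paths, whereas FileInputFormat.setInputPaths replaces the whole list.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathExample.class);
        conf.setJobName("addInputPath-example");
        conf.setInputFormat(TextInputFormat.class);

        // Each call appends one more input directory/file to the job;
        // use FileInputFormat.setInputPaths(...) to replace the list instead.
        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileInputFormat.addInputPath(conf, new Path(args[1]));
        FileOutputFormat.setOutputPath(conf, new Path(args[2]));

        // With no mapper/reducer set, Hadoop's identity classes are used and the
        // job simply copies (offset, line) records to the output directory.
        JobClient.runJob(conf);
    }
}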
From source file:com.hadoop.secondarysort.SecondarySort_MapRed.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }

    JobConf jobConf = new JobConf(conf);
    jobConf.setMapperClass(MapClass.class);
    jobConf.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    jobConf.setPartitionerClass(FirstPartitioner.class);
    jobConf.setOutputValueGroupingComparator(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    jobConf.setMapOutputKeyClass(IntPair.class);
    jobConf.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(jobConf, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs[1]));

    JobClient.runJob(jobConf); // submit the job; without this the configured job never runs
}
From source file:com.hazelcast.jet.hadoop.impl.ReadHdfsPTest.java
License:Open Source License
@Before
public void setup() throws IOException {
    instance = createJetMember();
    jobConf = new JobConf();
    jobConf.setInputFormat(inputFormatClass);
    writeToFile();
    for (Path path : paths) {
        FileInputFormat.addInputPath(jobConf, path);
    }
}
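Note that addInputPath is cumulative, which is why this setup method can register several paths in a loop: each call appends to the job's input path list (the mapred.input.dir setting in the old API) rather than replacing it.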
From source file:com.hazelcast.jet.hadoop.impl.WriteHdfsPTest.java
License:Open Source License
@Test
public void testWriteFile() throws Exception {
    int messageCount = 320;
    String mapName = randomMapName();
    JetInstance instance = createJetMember();
    createJetMember();

    Map<IntWritable, IntWritable> map = IntStream.range(0, messageCount).boxed()
            .collect(toMap(IntWritable::new, IntWritable::new));
    instance.getMap(mapName).putAll(map);

    Path path = getPath();

    JobConf conf = new JobConf();
    conf.setOutputFormat(outputFormatClass);
    conf.setOutputCommitter(FileOutputCommitter.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);
    if (outputFormatClass.equals(LazyOutputFormat.class)) {
        LazyOutputFormat.setOutputFormatClass(conf, TextOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(conf, path);

    Pipeline p = Pipeline.create();
    p.drawFrom(Sources.map(mapName))
     .drainTo(HdfsSinks.hdfs(conf))
     // we use a higher value to increase the race chance for LazyOutputFormat
     .setLocalParallelism(8);

    Future<Void> future = instance.newJob(p).getFuture();
    assertCompletesEventually(future);

    JobConf readJobConf = new JobConf();
    readJobConf.setInputFormat(inputFormatClass);
    FileInputFormat.addInputPath(readJobConf, path);

    p = Pipeline.create();
    p.drawFrom(HdfsSources.hdfs(readJobConf))
     .drainTo(Sinks.list("results"));

    future = instance.newJob(p).getFuture();
    assertCompletesEventually(future);

    IList<Object> results = instance.getList("results");
    assertEquals(messageCount, results.size());
}
From source file:com.hazelcast.jet.impl.connector.hadoop.WriteHdfsPTest.java
License:Open Source License
@Test
public void testWriteFile() throws Exception {
    int messageCount = 20;
    String mapName = randomMapName();
    JetInstance instance = createJetMember();
    createJetMember();

    Map<IntWritable, IntWritable> map = IntStream.range(0, messageCount).boxed()
            .collect(toMap(IntWritable::new, IntWritable::new));
    instance.getMap(mapName).putAll(map);

    DAG dag = new DAG();
    Vertex producer = dag.newVertex("producer", readMap(mapName)).localParallelism(1);

    Path path = getPath();
    JobConf conf = new JobConf();
    conf.setOutputFormat(outputFormatClass);
    conf.setOutputCommitter(FileOutputCommitter.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(conf, path);

    Vertex consumer = dag.newVertex("consumer", writeHdfs(conf)).localParallelism(4);
    dag.edge(between(producer, consumer));

    Future<Void> future = instance.newJob(dag).execute();
    assertCompletesEventually(future);

    dag = new DAG();
    JobConf readJobConf = new JobConf();
    readJobConf.setInputFormat(inputFormatClass);
    FileInputFormat.addInputPath(readJobConf, path);

    producer = dag.newVertex("producer", readHdfs(readJobConf)).localParallelism(8);
    consumer = dag.newVertex("consumer", writeList("results")).localParallelism(1);
    dag.edge(between(producer, consumer));

    future = instance.newJob(dag).execute();
    assertCompletesEventually(future);

    IList<Object> results = instance.getList("results");
    assertEquals(messageCount, results.size());
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Open Source License
/**
 * @param fname        input matrix file on HDFS
 * @param fnameStaging local staging directory
 * @param fnameNew     output file name of the partitioned matrix
 * @param rlen         number of rows
 * @param clen         number of columns
 * @param brlen        number of rows per block
 * @param bclen        number of columns per block
 * @throws DMLRuntimeException
 */
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<Cell>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); // reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();

                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) { // periodic flush
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + row + "," + col + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
        }
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameNew output file name of the merged result
 * @param outMo    output matrix object
 * @param inMO     input matrix objects to merge
 * @throws DMLRuntimeException
 */
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        // delete target file if it already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; // we're done
        }

        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) // read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader =
                            informat.getRecordReader(split, tmpJob, Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging local staging directory
 * @param mo           input matrix object to read
 * @param ID           id of the input (used when appending to the staging area)
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    //  errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    //  It works fine with int row, col but we require long for larger matrices.
    //  Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell)
    //  we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); // reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) { // periodic flush
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}
From source file:com.ibm.bi.dml.runtime.io.ReaderBinaryBlockParFiles.java
License:Open Source License
/**
 * @param path  file path on HDFS
 * @param job   job configuration
 * @param fs    file system
 * @param dest  destination matrix block
 * @param rlen  number of rows
 * @param clen  number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @throws IOException
 * @throws DMLRuntimeException
 */
private static void readBinaryBlockMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest,
        long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    FileInputFormat.addInputPath(job, path);

    ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
    try {
        // create read tasks for all part files
        ArrayList<ReadMatrixPerPartfileTask> tasks = new ArrayList<ReadMatrixPerPartfileTask>();
        for (Path lpath : getSequenceFilePaths(fs, path)) {
            ReadMatrixPerPartfileTask t =
                    new ReadMatrixPerPartfileTask(lpath, job, fs, dest, rlen, clen, brlen, bclen);
            tasks.add(t);
        }

        // wait until all tasks have been executed
        pool.invokeAll(tasks);
        pool.shutdown();

        // early error notify in case not all tasks were successful
        for (ReadMatrixPerPartfileTask rt : tasks) {
            if (!rt.getReturnCode()) {
                throw new IOException("Read task for text input failed: " + rt.getErrMsg());
            }
        }
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.io.ReaderBinaryBlockParSplits.java
License:Open Source License
/**
 * @param path  file path on HDFS
 * @param job   job configuration
 * @param fs    file system
 * @param dest  destination matrix block
 * @param rlen  number of rows
 * @param clen  number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @throws IOException
 * @throws DMLRuntimeException
 */
private static void readBinaryBlockMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest,
        long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    FileInputFormat.addInputPath(job, path);

    BinaryBlockInputFormat informat = new BinaryBlockInputFormat();
    InputSplit[] seqsplits = informat.getSplits(job, _numThreads);

    ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
    try {
        // create read tasks for all splits
        ArrayList<ReadMatrixPerSplitTask> tasks = new ArrayList<ReadMatrixPerSplitTask>();
        for (InputSplit split : seqsplits) {
            ReadMatrixPerSplitTask t =
                    new ReadMatrixPerSplitTask(split, informat, job, dest, rlen, clen, brlen, bclen);
            tasks.add(t);
        }

        // wait until all tasks have been executed
        pool.invokeAll(tasks);
        pool.shutdown();

        // early error notify in case not all tasks were successful
        for (ReadMatrixPerSplitTask rt : tasks) {
            if (!rt.getReturnCode()) {
                throw new IOException("Read task for text input failed: " + rt.getErrMsg());
            }
        }
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCell.java
License:Open Source License
/**
 * @param path  file path on HDFS
 * @param job   job configuration
 * @param dest  destination matrix block
 * @param rlen  number of rows
 * @param clen  number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @throws IOException
 */
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) { // SPARSE<-value
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); // reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                } else { // DENSE<-value
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); // reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
                    }
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read matrix in text cell format.", ex);
        }
    }
}