List of usage examples for org.apache.hadoop.mapred.FileInputFormat.addInputPath
public static void addInputPath(JobConf conf, Path path)
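addInputPath appends the given path to the job's list of input directories (the comma-separated mapred.input.dir property), so it can be called repeatedly to read from several locations in one job, as the examples below do for Nutch segments and backup directories. The following is a minimal sketch of a stand-alone driver using the old org.apache.hadoop.mapred API; the input and output paths are hypothetical placeholders, and IdentityMapper is used only to keep the example self-contained.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(AddInputPathExample.class);
        job.setJobName("addInputPath example");

        // Each call appends one more directory to mapred.input.dir;
        // the job reads the union of all added paths.
        FileInputFormat.addInputPath(job, new Path("/data/in1")); // hypothetical path
        FileInputFormat.addInputPath(job, new Path("/data/in2")); // hypothetical path

        job.setInputFormat(TextInputFormat.class);
        job.setMapperClass(IdentityMapper.class); // pass-through map, no reduce phase
        job.setNumReduceTasks(0);

        // With TextInputFormat the map input (and here the output) types are
        // LongWritable offsets and Text lines.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/data/out")); // hypothetical path

        JobClient.runJob(job);
    }
}

Because addInputPath accumulates paths rather than replacing them, use FileInputFormat.setInputPaths when you want to overwrite the current list instead of extending it.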
From source file: org.archive.nutchwax.IndexerMapReduce.java
License: Apache License

public static void initMRJob(Collection<Path> segments, JobConf job) {
    for (final Path segment : segments) {
        LOG.info("IndexerMapReduces: adding segment: " + segment);
        FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IndexerMapReduce.class);
    job.setReducerClass(IndexerMapReduce.class);

    job.setOutputFormat(IndexerOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
    job.setOutputValueClass(NutchWritable.class);
}
From source file: org.archive.nutchwax.PageRankDb.java
License: Apache License

public void invert(Path pageRankDb, Path[] segments, boolean normalize, boolean filter, boolean force)
        throws IOException {
    Path lock = new Path(pageRankDb, LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);
    Path currentPageRankDb = new Path(pageRankDb, CURRENT_NAME);

    if (LOG.isInfoEnabled()) {
        LOG.info("PageRankDb: starting");
        LOG.info("PageRankDb: pageRankDb: " + pageRankDb);
        LOG.info("PageRankDb: URL normalize: " + normalize);
        LOG.info("PageRankDb: URL filter: " + filter);
    }

    JobConf job = PageRankDb.createJob(getConf(), pageRankDb, normalize, filter);
    for (int i = 0; i < segments.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("PageRankDb: adding segment: " + segments[i]);
        }
        FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME));
    }

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        throw e;
    }

    if (fs.exists(currentPageRankDb)) {
        if (LOG.isInfoEnabled()) {
            LOG.info("PageRankDb: merging with existing pageRankDb: " + pageRankDb);
        }
        // try to merge
        Path newPageRankDb = FileOutputFormat.getOutputPath(job);
        job = PageRankDbMerger.createMergeJob(getConf(), pageRankDb, normalize, filter);
        FileInputFormat.addInputPath(job, currentPageRankDb);
        FileInputFormat.addInputPath(job, newPageRankDb);
        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(newPageRankDb, true);
            throw e;
        }
        fs.delete(newPageRankDb, true);
    }

    PageRankDb.install(job, pageRankDb);

    if (LOG.isInfoEnabled()) {
        LOG.info("PageRankDb: done");
    }
}
From source file: org.archive.nutchwax.PageRankDbMerger.java
License: Apache License

public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
    JobConf job = createMergeJob(getConf(), output, normalize, filter);
    for (int i = 0; i < dbs.length; i++) {
        FileInputFormat.addInputPath(job, new Path(dbs[i], PageRankDb.CURRENT_NAME));
    }
    JobClient.runJob(job);

    FileSystem fs = FileSystem.get(getConf());
    fs.mkdirs(output);
    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, PageRankDb.CURRENT_NAME));
}
From source file: org.cloudata.core.PerformanceTest.java
License: Apache License

private void runNIsMoreThanOne(final String cmd) throws IOException {
    checkTable();

    // Run a mapreduce job. Run as many maps as asked-for clients.
    // Before we start up the job, write out an input file with instructions
    // per client regarding which row they are to start on.
    Path inputDir = writeInputFile(this.conf);
    this.conf.set(EvaluationMapTask.CMD_KEY, cmd);
    JobConf job = new JobConf(this.conf, this.getClass());
    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormat(TextInputFormat.class);
    job.setJobName("Cloudata Performance Evaluation");
    job.setMapperClass(EvaluationMapTask.class);
    job.setMaxMapAttempts(1);
    job.setMaxReduceAttempts(1);
    job.setNumMapTasks(this.N * 10); // Ten maps per client.
    job.setNumReduceTasks(1);
    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(inputDir, "outputs"));
    JobClient.runJob(job);
}
From source file: org.cloudata.core.tablet.backup.RestoreBinaryJob.java
License: Apache License

/**
 * @param tableName
 * @param columnNames
 * @param numOfVersion
 * @param inputPath
 */
public void runRestore(String tableName, String[] columnNames, int numOfVersion, String inputPath)
        throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf partitionJob = new JobConf(BackupJob.class);
    FileSystem fs = FileSystem.get(partitionJob);
    if (!fs.exists(new Path(inputPath))) {
        throw new IOException("input path does not exist: " + inputPath);
    }
    if (CTable.existsTable(nconf, tableName)) {
        throw new IOException("table already exists: " + tableName);
    }

    TableSchema tableSchema = new TableSchema(tableName, "", columnNames);
    tableSchema.setNumOfVersion(numOfVersion);
    CTable.createTable(nconf, tableSchema);

    String columns = "";
    for (String eachColumn : columnNames) {
        columns += eachColumn.trim() + ",";
    }
    columns = columns.substring(0, columns.length() - 1);

    String jobName = tableName + " restore";
    String tempDir = jobName + "_" + System.currentTimeMillis();

    partitionJob.setJobName(tableName + " restore");
    partitionJob.setMapperClass(RestoreBinaryPartitionMap.class);
    FileInputFormat.addInputPath(partitionJob, new Path(inputPath));
    partitionJob.setInputFormat(RestoreSequenceFileAsBinaryInputFormat.class);
    partitionJob.set(DefaultTabletInputFormat.OUTPUT_TABLE, tableName);
    FileOutputFormat.setOutputPath(partitionJob, new Path(tempDir));
    // map only
    partitionJob.setNumReduceTasks(0);

    JobClient.runJob(partitionJob);

    // delete temp output dir
    fs.delete(new Path(tempDir), true);

    ////////////////////////////////////////////////////////////////

    JobConf jobConf = new JobConf(BackupJob.class);
    jobConf.setJobName(tableName + " restore");
    jobConf.setMapperClass(RestoreBinaryMap.class);
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(RestoreSequenceFileAsBinaryInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.OUTPUT_TABLE, tableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns);
    FileOutputFormat.setOutputPath(jobConf, new Path(tempDir));
    jobConf.setMaxMapAttempts(0);
    // map only
    jobConf.setNumReduceTasks(0);

    JobClient.runJob(jobConf);

    // delete temp output dir
    fs.delete(new Path(tempDir), true);
}
From source file: org.cloudata.core.tablet.backup.RestoreJob.java
License: Apache License

/**
 * @param tableName
 * @param columnNames
 * @param numOfVersion
 * @param inputPath
 */
public void runRestore(String tableName, String[] columnNames, int numOfVersion, String inputPath)
        throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf partitionJob = new JobConf(RestoreJob.class);
    FileSystem fs = FileSystem.get(partitionJob);
    if (!fs.exists(new Path(inputPath))) {
        throw new IOException("input path does not exist: " + inputPath);
    }
    if (CTable.existsTable(nconf, tableName)) {
        throw new IOException("table already exists: " + tableName);
    }

    TableSchema tableSchema = new TableSchema(tableName, "", columnNames);
    tableSchema.setNumOfVersion(numOfVersion);
    CTable.createTable(nconf, tableSchema);

    String columns = "";
    for (String eachColumn : columnNames) {
        columns += eachColumn.trim() + ",";
    }
    columns = columns.substring(0, columns.length() - 1);

    String jobName = tableName + " restore";
    String tempDir = jobName + "_" + System.currentTimeMillis();

    partitionJob.setJobName(jobName + "_partition");
    partitionJob.setMapperClass(RestorePartitionMap.class);
    FileInputFormat.addInputPath(partitionJob, new Path(inputPath));
    partitionJob.setInputFormat(RestoreTextInputFormat.class);
    partitionJob.set(DefaultTabletInputFormat.OUTPUT_TABLE, tableName);
    FileOutputFormat.setOutputPath(partitionJob, new Path(tempDir));
    // map only
    partitionJob.setNumReduceTasks(0);

    JobClient.runJob(partitionJob);

    fs.delete(new Path(tempDir), true);

    ////////////////////////////////////////////////////////////

    JobConf jobConf = new JobConf(BackupJob.class);
    jobConf.setJobName(jobName);
    jobConf.setMapperClass(RestoreMap.class);
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(RestoreTextInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.OUTPUT_TABLE, tableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns);
    FileOutputFormat.setOutputPath(jobConf, new Path(tempDir));
    jobConf.setMaxMapAttempts(0);
    // map only
    jobConf.setNumReduceTasks(0);

    JobClient.runJob(jobConf);

    // delete temp output dir
    fs.delete(new Path(tempDir), true);
}
From source file: org.cloudata.examples.first.HdfsToCloudataMapReduce.java
License: Apache License

public void run(String[] args) throws IOException {
    if (args.length < 2) {
        System.out.println("Usage: java HdfsToCloudataMapReduce <input path> <table name>");
        System.exit(0);
    }

    Path inputPath = new Path(args[0]);
    String tableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableSchema = new TableSchema(tableName);
        tableSchema.addColumn("col1");
        CTable.createTable(nconf, tableSchema);
    }

    JobConf jobConf = new JobConf(HdfsToCloudataMapReduce.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // <MAP>
    FileInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(HdfsToCloudataMappper.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    // </MAP>

    // <REDUCE>
    // Map only
    FileOutputFormat.setOutputPath(jobConf, new Path("HdfsToCloudataMapReduce_" + System.currentTimeMillis()));
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(FileOutputFormat.getOutputPath(jobConf), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file: org.cloudata.examples.upload.partitionjob.PartitionJob.java
License: Apache License

public boolean runJob(String inputPath, String tableName, int numOfTablets) throws IOException {
    JobConf jobConf = new JobConf(PartitionJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    FileSystem fs = FileSystem.get(jobConf);
    // remove any stale log-count file before running
    FileUtil.delete(fs, new Path(getLogCountFilepath(tableName)), true);

    jobConf.setJobName("PartitionJob_" + tableName + "(" + new Date() + ")");
    jobConf.set("cloudata.numOfTablets", String.valueOf(numOfTablets));
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    String clientOpt = jobConf.get("mapred.child.java.opts");
    if (clientOpt == null) {
        clientOpt = "";
    }
    jobConf.set("mapred.child.java.opts", clientOpt + " -Duser.name=" + System.getProperty("user.name"));

    // <Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(PartitionMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    // </Map>

    // <Reduce>
    Path tempOutputPath = new Path("temp/partitionJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setReducerClass(PartitionReducer.class);
    // single reduce task
    jobConf.setNumReduceTasks(1);
    // </Reduce>

    try {
        RunningJob job = JobClient.runJob(jobConf);
        return job.isSuccessful();
    } finally {
        FileUtil.delete(fs, new Path(getLogCountFilepath(tableName)), true);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file: org.cloudata.examples.upload.partitionjob.UploadJob.java
License: Apache License

public void runJob(String inputPath, String tableName) throws IOException {
    JobConf jobConf = new JobConf(UploadJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);
    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    // KeyRangePartitioner reads the target table name from
    // AbstractTabletInputFormat.OUTPUT_TABLE
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    CloudataConf conf = new CloudataConf();
    CTable ctable = CTable.openTable(conf, tableName);
    TabletInfo[] tabletInfos = ctable.listTabletInfos();

    // <Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    // </Map>

    // <Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setReducerClass(UploadReducer.class);
    jobConf.setReduceSpeculativeExecution(false);
    jobConf.setMaxReduceAttempts(0);
    // one reduce task per tablet
    jobConf.setNumReduceTasks(tabletInfos.length);
    // </Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file: org.cloudata.examples.upload.SimpleUploaderMapReduce.java
License: Apache License

public void run(String[] args) throws IOException {
    if (args.length < 3) {
        System.out.println("Usage: java SimpleUploaderMapReduce <input path> <table name> <# reduce>");
        System.exit(0);
    }

    Path inputPath = new Path(args[0]);
    String tableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableSchema = new TableSchema(tableName);
        tableSchema.addColumn("Col1");

        Row.Key[] rowKeys = new Row.Key[20];
        for (int i = 0; i < 10; i++) {
            rowKeys[i] = new Row.Key("-0" + i);
        }
        for (int i = 1; i < 10; i++) {
            rowKeys[9 + i] = new Row.Key("0" + i);
        }
        rowKeys[19] = Row.Key.MAX_KEY;

        CTable.createTable(nconf, tableSchema, rowKeys);
    }

    JobConf jobConf = new JobConf(HdfsToCloudataMapReduce.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // <MAP>
    FileInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(SimpleUploaderMapper.class);
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    // </MAP>

    // <REDUCE>
    FileOutputFormat.setOutputPath(jobConf, new Path("SimpleUploaderMapReduce_" + System.currentTimeMillis()));
    jobConf.setReducerClass(SimpleUploaderReducer.class);
    jobConf.setNumReduceTasks(Integer.parseInt(args[2]));
    jobConf.setMaxReduceAttempts(0);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(FileOutputFormat.getOutputPath(jobConf), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}