List of usage examples for org.apache.hadoop.mapred FileInputFormat addInputPath
public static void addInputPath(JobConf conf, Path path)
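Before the project examples below, here is a minimal, self-contained sketch of the typical calling pattern, assuming invented class and argument names (none of this is taken from the projects listed): addInputPath appends a path to the job's input list, so it can be called repeatedly to register several inputs, while FileOutputFormat.setOutputPath sets the single output directory.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class AddInputPathSketch {
  public static void main(String[] args) throws Exception {
    // args[0..n-2] are input files/directories, args[n-1] is the output directory.
    JobConf conf = new JobConf(AddInputPathSketch.class);
    conf.setJobName("addInputPath-sketch");

    // addInputPath appends to the job's input list, so it may be called
    // once per input; previously added paths are preserved.
    for (int i = 0; i < args.length - 1; i++) {
      FileInputFormat.addInputPath(conf, new Path(args[i]));
    }

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // No mapper/reducer is set, so the identity classes are used and the job
    // simply copies its input records. The output directory must not already exist.
    FileOutputFormat.setOutputPath(conf, new Path(args[args.length - 1]));

    JobClient.runJob(conf);
  }
}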
From source file:com.mycompany.MyHadoopSamples1.TransposeJob.java
License:Apache License
public static Configuration buildTransposeJobConf(Configuration initialConf, Path matrixInputPath,
    Path matrixOutputPath, int numInputRows) throws IOException {
  JobConf conf = new JobConf(initialConf, TransposeJob.class);
  conf.setJobName("TransposeJob: " + matrixInputPath + " transpose -> " + matrixOutputPath);
  FileSystem fs = FileSystem.get(conf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);
  conf.setInt(NUM_ROWS_KEY, numInputRows);

  FileInputFormat.addInputPath(conf, matrixInputPath);
  conf.setInputFormat(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(conf, matrixOutputPath);
  System.out.println("OUTPUT --> " + matrixOutputPath.toString());

  conf.setMapperClass(TransposeMapper.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(VectorWritable.class);
  conf.setCombinerClass(MergeVectorsCombiner.class);
  conf.setReducerClass(MergeVectorsReducer.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(VectorWritable.class);
  return conf;
}
From source file:com.mycompany.wordcount.WCMain.java
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(WCMain.class);
  conf.setJobName("WordCount");

  // key and value classes
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  // mapper and reducer
  conf.setMapperClass(WCMapper.class);
  conf.setReducerClass(WCReducer.class);

  // input and output formats
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.addInputPath(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
  return 0;
}
From source file:com.spotify.hdfs2cass.BulkLoader.java
License:Apache License
public int run(String[] args) throws Exception {
  CommandLine cmdLine = parseOptions(args);

  String[] inputPaths = cmdLine.getOptionValues('i');
  String seedNodeHost = cmdLine.getOptionValue('h');
  String seedNodePort = cmdLine.getOptionValue('p', "9160");
  String keyspace = cmdLine.getOptionValue('k');
  String colfamily = cmdLine.getOptionValue('c');
  int mappers = Integer.parseInt(cmdLine.getOptionValue('m', "0"));
  Integer copiers = Integer.parseInt(cmdLine.getOptionValue('P', "0"));
  String poolName = cmdLine.getOptionValue("pool");

  ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort);
  clusterInfo.init(keyspace);

  final String partitionerClass = clusterInfo.getPartitionerClass();
  final int reducers = adjustReducers(Integer.parseInt(cmdLine.getOptionValue('r', "0")),
      clusterInfo.getNumClusterNodes());

  Configuration conf = new Configuration();
  ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily);
  ConfigHelper.setOutputInitialAddress(conf, seedNodeHost);
  ConfigHelper.setOutputRpcPort(conf, seedNodePort);
  ConfigHelper.setOutputPartitioner(conf, partitionerClass);

  if (cmdLine.hasOption('s')) {
    conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32"));
  }
  if (cmdLine.hasOption('M')) {
    conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M'));
  }
  if (cmdLine.hasOption('C')) {
    ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C'));
  }
  if (cmdLine.hasOption('b')) {
    conf.setBoolean("com.spotify.hdfs2cass.base64", true);
  }

  JobConf job = new JobConf(conf);

  if (mappers > 0)
    job.setNumMapTasks(mappers);
  if (reducers > 0)
    job.setNumReduceTasks(reducers);
  if (copiers > 0)
    job.set("mapred.reduce.parallel.copies", copiers.toString());

  if (poolName != null)
    job.set("mapred.fairscheduler.pool", poolName);

  // set the nodes as a param for the other hadoop nodes
  clusterInfo.setConf(job);

  String jobName = "bulkloader-hdfs-to-cassandra";
  if (cmdLine.hasOption('n'))
    jobName += "-" + cmdLine.getOptionValue('n');
  job.setJobName(jobName);
  job.setJarByClass(BulkLoader.class);

  job.setInputFormat(AvroAsTextInputFormat.class);

  for (String inputPath : inputPaths) {
    FileInputFormat.addInputPath(job, new Path(inputPath));
  }

  // map just outputs text, reduce sends to cassandra
  job.setMapperClass(MapToText.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setPartitionerClass(CassandraPartitioner.class);
  job.setReducerClass(ReduceTextToCassandra.class);
  job.setOutputKeyClass(ByteBuffer.class);
  job.setOutputValueClass(List.class);

  if (cmdLine.hasOption('s'))
    job.setOutputFormat(BulkOutputFormat.class);
  else
    job.setOutputFormat(ColumnFamilyOutputFormat.class);

  JobClient.runJob(job);
  return 0;
}
From source file:com.talis.mapreduce.wordcount.oldapi.WordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
    ToolRunner.printGenericCommandUsage(System.err);
    return -1;
  }

  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("Word Count");

  FileInputFormat.addInputPath(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  conf.setMapperClass(WordCountMapper.class);
  conf.setCombinerClass(WordCountReducer.class);
  conf.setReducerClass(WordCountReducer.class);
  // conf.setPartitionerClass(HashPartitioner.class);

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  JobClient.runJob(conf);
  return 0;
}
From source file:com.TCG.Nutch_DNS.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
    boolean norm, boolean force, int maxNumSegments) throws IOException {

  Path tempDir = new Path(
      getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

  Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
  FileSystem fs = FileSystem.get(getConf());
  LockUtil.createLockFile(fs, lock, force);

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("Generator: starting at " + sdf.format(start));
  LOG.info("Generator: Selecting best-scoring urls due for fetch.");
  LOG.info("Generator: filtering: " + filter);
  LOG.info("Generator: normalizing: " + norm);
  if (topN != Long.MAX_VALUE) {
    LOG.info("Generator: topN: " + topN);
  }

  // map to inverted subset due for fetch, sort by score
  JobConf job = new NutchJob(getConf());
  job.setJobName("generate: select from " + dbDir);

  if (numLists == -1) { // for politeness make
    numLists = job.getNumMapTasks(); // a partition per fetch task
  }
  if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
    // override
    LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
    numLists = 1;
  }
  job.setLong(GENERATOR_CUR_TIME, curTime);
  // record real generation time
  long generateTime = System.currentTimeMillis();
  job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
  job.setLong(GENERATOR_TOP_N, topN);
  job.setBoolean(GENERATOR_FILTER, filter);
  job.setBoolean(GENERATOR_NORMALISE, norm);
  job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

  FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(Selector.class);
  job.setPartitionerClass(Selector.class);
  job.setReducerClass(Selector.class);

  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(FloatWritable.class);
  job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
  job.setOutputValueClass(SelectorEntry.class);
  job.setOutputFormat(GeneratorOutputFormat.class);

  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);
    throw e;
  }

  // read the subdirectories generated in the temp
  // output and turn them into segments
  List<Path> generatedSegments = new ArrayList<Path>();

  FileStatus[] status = fs.listStatus(tempDir);
  try {
    for (FileStatus stat : status) {
      Path subfetchlist = stat.getPath();
      if (!subfetchlist.getName().startsWith("fetchlist-"))
        continue;
      // start a new partition job for this segment
      Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
      generatedSegments.add(newSeg);
    }
  } catch (Exception e) {
    LOG.warn("Generator: exception while partitioning segments, exiting ...");
    fs.delete(tempDir, true);
    return null;
  }

  if (generatedSegments.size() == 0) {
    LOG.warn("Generator: 0 records selected for fetching, exiting ...");
    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);
    return null;
  }

  if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
    // update the db from tempDir
    Path tempDir2 = new Path(
        getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    job = new NutchJob(getConf());
    job.setJobName("generate: updatedb " + dbDir);
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    for (Path segmpaths : generatedSegments) {
      Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
      FileInputFormat.addInputPath(job, subGenDir);
    }
    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbUpdater.class);
    job.setReducerClass(CrawlDbUpdater.class);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    FileOutputFormat.setOutputPath(job, tempDir2);
    try {
      JobClient.runJob(job);
      CrawlDb.install(job, dbDir);
    } catch (IOException e) {
      LockUtil.removeLockFile(fs, lock);
      fs.delete(tempDir, true);
      fs.delete(tempDir2, true);
      throw e;
    }
    fs.delete(tempDir2, true);
  }

  LockUtil.removeLockFile(fs, lock);
  fs.delete(tempDir, true);

  long end = System.currentTimeMillis();
  LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

  Path[] patharray = new Path[generatedSegments.size()];
  return generatedSegments.toArray(patharray);
}
From source file:com.TCG.Nutch_DNS.Generator.java
License:Apache License
private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists) throws IOException {
  // invert again, partition by host/domain/IP, sort by url hash
  if (LOG.isInfoEnabled()) {
    LOG.info("Generator: Partitioning selected urls for politeness.");
  }
  Path segment = new Path(segmentsDir, generateSegmentName());
  Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

  LOG.info("Generator: segment: " + segment);

  NutchJob job = new NutchJob(getConf());
  job.setJobName("generate: partition " + segment);

  job.setInt("partition.url.seed", new Random().nextInt());

  FileInputFormat.addInputPath(job, inputDir);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(SelectorInverseMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(PartitionReducer.class);
  job.setNumReduceTasks(numLists);

  FileOutputFormat.setOutputPath(job, output);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(HashComparator.class);

  JobClient.runJob(job);
  return segment;
}
From source file:com.TCG.Nutch_DNS.HostDb.java
License:Apache License
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
    boolean force) throws IOException {
  FileSystem fs = FileSystem.get(getConf());
  Path lock = new Path(crawlDb, LOCK_NAME);
  LockUtil.createLockFile(fs, lock, force);
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();

  JobConf job = HostDb.createJob(getConf(), crawlDb);
  job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
  job.setBoolean(HostDbFilter.URL_FILTERING, filter);
  job.setBoolean(HostDbFilter.URL_NORMALIZING, normalize);

  boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);

  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb update: starting at " + sdf.format(start));
    LOG.info("CrawlDb update: db: " + crawlDb);
    LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
    LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
    LOG.info("CrawlDb update: URL normalizing: " + normalize);
    LOG.info("CrawlDb update: URL filtering: " + filter);
    LOG.info("CrawlDb update: 404 purging: " + url404Purging);
  }

  for (int i = 0; i < segments.length; i++) {
    Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
    Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
    if (fs.exists(fetch) && fs.exists(parse)) {
      FileInputFormat.addInputPath(job, fetch);
      FileInputFormat.addInputPath(job, parse);
    } else {
      LOG.info(" - skipping invalid segment " + segments[i]);
    }
  }

  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb update: Merging segment data into db.");
  }
  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    LockUtil.removeLockFile(fs, lock);
    Path outPath = FileOutputFormat.getOutputPath(job);
    if (fs.exists(outPath))
      fs.delete(outPath, true);
    throw e;
  }

  HostDb.install(job, crawlDb);
  long end = System.currentTimeMillis();
  LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:com.TCG.Nutch_DNS.HostDb.java
License:Apache License
public static JobConf createJob(Configuration config, Path crawlDb) throws IOException {
  Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("crawldb " + crawlDb);

  Path current = new Path(crawlDb, CURRENT_NAME);
  if (FileSystem.get(job).exists(current)) {
    FileInputFormat.addInputPath(job, current);
  }
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(HostDbFilter.class);
  job.setReducerClass(HostDbReducer.class);

  FileOutputFormat.setOutputPath(job, newCrawlDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);

  // https://issues.apache.org/jira/browse/NUTCH-1110
  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  return job;
}
From source file:com.TCG.Nutch_DNS.Injector.java
License:Apache License
public void inject(Path hostDb, Path crawlDb) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: starting at " + sdf.format(start));
    LOG.info("Injector: hostDb: " + hostDb);
    LOG.info("Injector: crawlDb: " + crawlDb);
  }

  Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // map text input file to a <url,CrawlDatum> file
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Converting injected host to host db entries.");
  }

  FileSystem fs = FileSystem.get(getConf());

  // determine if the host db already exists
  boolean dbExists = fs.exists(hostDb);

  JobConf sortJob = new NutchJob(getConf());
  sortJob.setJobName("inject " + hostDb);
  FileInputFormat.addInputPath(sortJob, crawlDb);
  sortJob.setMapperClass(InjectMapper.class);

  FileOutputFormat.setOutputPath(sortJob, tempDir);
  if (dbExists) {
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setReducerClass(ExitHostReducer.class);
  } else {
    sortJob.setOutputFormat(MapFileOutputFormat.class);
    sortJob.setReducerClass(NotExitHostReducer.class);
    sortJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
  }
  sortJob.setOutputKeyClass(Text.class);
  sortJob.setOutputValueClass(CrawlDatum.class);
  sortJob.setLong("injector.current.time", System.currentTimeMillis());

  RunningJob mapJob = null;
  try {
    mapJob = JobClient.runJob(sortJob);
  } catch (IOException e) {
    fs.delete(tempDir, true);
    throw e;
  }

  if (dbExists) {
    // merge with existing host db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected hostDb into old hostDb.");
    }
    JobConf mergeJob = HostDb.createJob(getConf(), hostDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    // override the HostDbReducer set in HostDb.createJob
    mergeJob.setReducerClass(InjectReducer.class);
    try {
      RunningJob merge = JobClient.runJob(mergeJob);
    } catch (IOException e) {
      fs.delete(tempDir, true);
      throw e;
    }
    HostDb.install(mergeJob, hostDb);
  } else {
    HostDb.install(sortJob, hostDb);
  }

  // clean up
  fs.delete(tempDir, true);
  long end = System.currentTimeMillis();
  LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:com.vsii.ttxvn.crawling.DeleteFailedDataJob.java
License:Apache License
public int run(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.println("Usage: DeleteFailedDataJob <crawldb>");
    return 1;
  }

  String crawldb = args[0];

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("DeleteFailedDataJob: starting at " + sdf.format(start));

  Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/dedup-temp-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(getConf());
  job.setJobName("DeleteFailedData on " + crawldb);

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);

  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(CrawlDatum.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);

  job.setMapperClass(DBFilter.class);
  job.setReducerClass(DedupReducer.class);

  try {
    RunningJob rj = JobClient.runJob(job);
    Group g = rj.getCounters().getGroup("DeleteFailedDataJobStatus");
    if (g != null) {
      long dups = g.getCounter("Documents marked as duplicate");
      LOG.info("DeleteFailedData: " + (int) dups + " documents marked as duplicates");
    }
  } catch (final Exception e) {
    LOG.error("DeleteFailedDataJob: " + StringUtils.stringifyException(e));
    return -1;
  }

  // merge with existing crawl db
  if (LOG.isInfoEnabled()) {
    LOG.info("DeleteFailedData: Updating status of duplicate urls into crawl db.");
  }

  Path dbPath = new Path(crawldb);
  JobConf mergeJob = CrawlDb.createJob(getConf(), dbPath);
  FileInputFormat.addInputPath(mergeJob, tempDir);
  mergeJob.setReducerClass(StatusUpdateReducer.class);

  try {
    JobClient.runJob(mergeJob);
  } catch (final Exception e) {
    LOG.error("DeleteFailedDataMergeJob: " + StringUtils.stringifyException(e));
    return -1;
  }

  CrawlDb.install(mergeJob, dbPath);

  // clean up
  FileSystem fs = FileSystem.get(getConf());
  fs.delete(tempDir, true);

  long end = System.currentTimeMillis();
  LOG.info("DeleteFailedData finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));

  return 0;
}