List of usage examples for org.apache.hadoop.conf Configuration setClass

public void setClass(String name, Class<?> theClass, Class<?> xface)

Sets the value of the name property to the name of theClass, which must implement the given interface xface.
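Before the examples drawn from real projects, here is a minimal, self-contained sketch of the setClass / getClass round trip. The property name my.handler.class and the Handler/LoggingHandler types are made up for illustration; only the Configuration and ReflectionUtils calls are real Hadoop API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

public class SetClassExample {
    // Hypothetical plugin interface and implementation, used only for this sketch.
    public interface Handler {
        void handle(String record);
    }

    public static class LoggingHandler implements Handler {
        public LoggingHandler() {
        }

        @Override
        public void handle(String record) {
            System.out.println("handled: " + record);
        }
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store the implementation class under a (made-up) property name; setClass checks
        // that LoggingHandler is assignable to Handler before recording the class name.
        conf.setClass("my.handler.class", LoggingHandler.class, Handler.class);

        // Later, read the property back and instantiate it reflectively, as the framework
        // examples below do for shuffle handlers, bulk importers, comparators, and so on.
        Class<? extends Handler> handlerClass = conf.getClass("my.handler.class", LoggingHandler.class,
            Handler.class);
        Handler handler = ReflectionUtils.newInstance(handlerClass, conf);
        handler.handle("example record");
    }
}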
From source file:org.apache.tez.test.MiniTezCluster.java
License:Apache License
@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
            new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }
    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }
    File appJarLocalFile = new File(MiniTezCluster.APPJAR);
    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezCluster.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }
    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezCluster.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));
    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));
    // VMEM monitoring disabled, PMEM monitoring enabled.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
    try {
        Path stagingPath = FileContext.getFileContext(conf)
            .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use a absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                    new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);
        // mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");
    // configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
        new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
        String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
        ShuffleHandler.class, Service.class);
    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
        ContainerExecutor.class);
    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}
From source file:org.apache.tez.tests.MiniTezClusterWithTimeline.java
License:Apache License
@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
            new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }
    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }
    File appJarLocalFile = new File(MiniTezClusterWithTimeline.APPJAR);
    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezClusterWithTimeline.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }
    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezClusterWithTimeline.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));
    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));
    // VMEM monitoring disabled, PMEM monitoring enabled.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
    try {
        Path stagingPath = FileContext.getFileContext(conf)
            .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use a absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                    new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);
        // mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");
    // configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
        new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
        String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
        ShuffleHandler.class, Service.class);
    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
        ContainerExecutor.class);
    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}
From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper.java
License:Apache License
public static void executeMapReduceJob(final MapReduce mapReduce, final Memory.Admin memory,
        final Configuration configuration) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration newConfiguration = new Configuration(configuration);
    final boolean vertexProgramExists = newConfiguration.get(VertexProgram.VERTEX_PROGRAM, null) != null;
    if (vertexProgramExists) {
        newConfiguration.set(Constants.GREMLIN_HADOOP_GRAPH_READER,
            InputOutputHelper.getInputFormat(
                (Class) newConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class))
                .getCanonicalName());
        newConfiguration.unset(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
    final BaseConfiguration apacheConfiguration = new BaseConfiguration();
    apacheConfiguration.setDelimiterParsingDisabled(true);
    mapReduce.storeState(apacheConfiguration);
    ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, newConfiguration);
    final Optional<Comparator<?>> mapSort = mapReduce.getMapKeySort();
    final Optional<Comparator<?>> reduceSort = mapReduce.getReduceKeySort();
    newConfiguration.setClass(Constants.GREMLIN_HADOOP_MAP_REDUCE_CLASS, mapReduce.getClass(), MapReduce.class);
    final Job job = Job.getInstance(newConfiguration, mapReduce.toString());
    HadoopGraph.LOGGER.info(Constants.GREMLIN_HADOOP_JOB_PREFIX + mapReduce.toString());
    job.setJarByClass(HadoopGraph.class);
    if (mapSort.isPresent())
        job.setSortComparatorClass(ObjectWritableComparator.ObjectWritableMapComparator.class);
    job.setMapperClass(HadoopMap.class);
    if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
        if (mapReduce.doStage(MapReduce.Stage.COMBINE))
            job.setCombinerClass(HadoopCombine.class);
        job.setReducerClass(HadoopReduce.class);
    } else {
        if (mapSort.isPresent()) {
            job.setReducerClass(Reducer.class);
            job.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        } else {
            job.setNumReduceTasks(0);
        }
    }
    job.setMapOutputKeyClass(ObjectWritable.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(ObjectWritable.class);
    job.setOutputValueClass(ObjectWritable.class);
    job.setInputFormatClass(GraphFilterInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // if there is no vertex program, then grab the graph from the input location
    final Path graphPath;
    if (vertexProgramExists) {
        graphPath = new Path(
            Constants.getGraphLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
    } else {
        graphPath = new Path(newConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
    }
    Path memoryPath = new Path(
        Constants.getMemoryLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION),
            (reduceSort.isPresent() ? mapReduce.getMemoryKey() + "-temp" : mapReduce.getMemoryKey())));
    if (FileSystem.get(newConfiguration).exists(memoryPath)) {
        FileSystem.get(newConfiguration).delete(memoryPath, true);
    }
    FileInputFormat.setInputPaths(job, graphPath);
    FileOutputFormat.setOutputPath(job, memoryPath);
    job.waitForCompletion(true);
    // if there is a reduce sort, we need to run another identity MapReduce job
    if (reduceSort.isPresent()) {
        final Job reduceSortJob = Job.getInstance(newConfiguration, "ReduceKeySort");
        reduceSortJob.setSortComparatorClass(ObjectWritableComparator.ObjectWritableReduceComparator.class);
        reduceSortJob.setMapperClass(Mapper.class);
        reduceSortJob.setReducerClass(Reducer.class);
        reduceSortJob.setMapOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setMapOutputValueClass(ObjectWritable.class);
        reduceSortJob.setOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setOutputValueClass(ObjectWritable.class);
        reduceSortJob.setInputFormatClass(SequenceFileInputFormat.class);
        reduceSortJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        reduceSortJob.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        FileInputFormat.setInputPaths(reduceSortJob, memoryPath);
        final Path sortedMemoryPath = new Path(Constants.getMemoryLocation(
            newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), mapReduce.getMemoryKey()));
        FileOutputFormat.setOutputPath(reduceSortJob, sortedMemoryPath);
        reduceSortJob.waitForCompletion(true);
        FileSystem.get(newConfiguration).delete(memoryPath, true); // delete the temporary memory path
        memoryPath = sortedMemoryPath;
    }
    mapReduce.addResultToMemory(memory, new ObjectWritableIterator(newConfiguration, memoryPath));
}
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License:Open Source License
@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {
    if (_skipPartition)
        return;
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();
    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }
    FlexBuffer scanArray[] = LinkKey.allocateScanArray();
    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_conf);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
        RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
        WritableComparable.class);
    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(_fs,
        incomingPaths, localMergeConfig);
    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    int processedKeysCount = 0;
    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
        summaryRecord = null;
        linkSummaryRecord = null;
        types.clear();
        linkSources = null;
        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;
        int statusCount = 0;
        int linkCount = 0;
        // scan key components
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);
        // pick up source fp from key ...
        URLFPV2 fpSource = new URLFPV2();
        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.URL_HASH_COMPONENT_ID));
        for (RawRecordValue rawValue : nextItem.e1) {
            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);
            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);
            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                statusCount++;
                try {
                    JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            } else {
                linkCount++;
                JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                // ok this is a link ...
                updateLinkStatsFromLinkJSONObject(object, fpSource, reporter);
            }
            reporter.progress();
        }
        if (statusCount > 1) {
            reporter.incrCounter(Counters.TWO_REDUNDANT_STATUS_IN_REDUCER, 1);
        }
        if (statusCount == 0 && linkCount != 0) {
            reporter.incrCounter(Counters.DISCOVERED_NEW_LINK, 1);
        } else {
            if (statusCount >= 1 && linkCount >= 1) {
                reporter.incrCounter(Counters.GOT_CRAWL_STATUS_WITH_LINK, 1);
            } else if (statusCount >= 1 && linkCount == 0) {
                reporter.incrCounter(Counters.GOT_CRAWL_STATUS_NO_LINK, 1);
            }
        }
        if (summaryRecord != null || linkSummaryRecord != null) {
            JsonObject compositeObject = new JsonObject();
            if (summaryRecord != null) {
                compositeObject.add("crawl_status", summaryRecord);
            }
            if (linkSummaryRecord != null) {
                if (types != null && types.size() != 0) {
                    stringCollectionToJsonArray(linkSummaryRecord, "typeAndRels", types);
                    if (linkSources != null) {
                        stringCollectionToJsonArray(linkSummaryRecord, "sources", linkSources.values());
                    }
                }
                compositeObject.add("link_status", linkSummaryRecord);
            }
            if (outputKeyString != null && outputKeyURLObj != null && outputKeyURLObj.isValid()) {
                if (outputKeyFromInternalLink) {
                    reporter.incrCounter(Counters.OUTPUT_KEY_FROM_INTERNAL_LINK, 1);
                } else {
                    reporter.incrCounter(Counters.OUTPUT_KEY_FROM_EXTERNAL_LINK, 1);
                }
                output.collect(new TextBytes(outputKeyString), new TextBytes(compositeObject.toString()));
            } else {
                reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
            }
        }
    }
}
From source file:org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java
License:Open Source License
@Override
public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> output, Reporter reporter)
        throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();
    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }
    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, TextBytes.Comparator.class,
        TextBytes.Comparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
        WritableComparable.class);
    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
        FileSystem.get(_jobConf), incomingPaths, localMergeConfig);
    // save a reference to the collector
    _collector = output;
    iterateItems(multiFileInputReader, reporter);
}
From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java
License:Open Source License
@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();
    FlexBuffer scanArray[] = LinkKey.allocateScanArray();
    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }
    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
        RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
        WritableComparable.class);
    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
        FileSystem.get(_jobConf), incomingPaths, localMergeConfig);
    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    TextBytes valueOut = new TextBytes();
    TextBytes keyOut = new TextBytes();
    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
    // pick up source fp from key ...
    URLFPV2 fpSource = new URLFPV2();
    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;
        latestLinkDataTime = -1L;
        outlinks.clear();
        discoveredLinks.clear();
        // scan key components
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);
        // setup fingerprint ...
        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.URL_HASH_COMPONENT_ID));
        for (RawRecordValue rawValue : nextItem.e1) {
            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);
            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);
            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                try {
                    JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            }
            reporter.progress();
        }
        // ok now see if we have anything to emit ...
        if (discoveredLinks.size() != 0) {
            reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1);
            for (String outlink : outlinks) {
                // emit a to tuple
                toJsonObject.addProperty("to", outlink);
                valueBytes.set(toJsonObject.toString());
                output.collect(sourceDomain, valueBytes);
                // now emit a from tuple ...
                fromJsonObject.addProperty("from", sourceDomain.toString());
                keyBytes.set(outlink);
                valueBytes.set(fromJsonObject.toString());
                output.collect(keyBytes, valueBytes);
            }
            bloomKey.setDomainHash(fpSource.getDomainHash());
            for (long destDomainFP : discoveredLinks) {
                // set the bloom filter key ...
                bloomKey.setUrlHash(destDomainFP);
                // add it to the bloom filter
                emittedTuplesFilter.add(bloomKey);
            }
        } else {
            reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1);
        }
    }
}
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
public static void main(String[] args) {
    Path testPath = new Path(args[0]);
    LOG.info("Initializing Hadoop Config");
    Configuration conf = new Configuration();
    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("mapred-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("commoncrawl-default.xml");
    conf.addResource("commoncrawl-site.xml");
    conf.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, URLFPV2RawComparator.class,
        RawComparator.class);
    conf.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, URLFPV2.class, WritableComparable.class);
    CrawlEnvironment.setHadoopConfig(conf);
    CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/");
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Vector<Path> paths = new Vector<Path>();
        paths.add(new Path(testPath, "part-00000"));
        // paths.add(new Path(testPath,"part-00000"));
        paths.add(new Path(testPath, "part-00001"));
        TreeSet<URLFPV2> directReadSet = new TreeSet<URLFPV2>();
        TreeSet<URLFPV2> multiFileReadSet = new TreeSet<URLFPV2>();
        MultiFileInputReader<URLFPV2> inputReader = new MultiFileInputReader<URLFPV2>(fs, paths, conf);
        KeyAndValueData<URLFPV2> keyValueData = null;
        int multiFileKeyCount = 0;
        while ((keyValueData = inputReader.readNextItem()) != null) {
            LOG.info("Got Key Domain:" + keyValueData._keyObject.getDomainHash() + " URLHash:"
                + keyValueData._keyObject.getUrlHash() + " Item Count:" + keyValueData._values.size()
                + " Path[0]:" + keyValueData._values.get(0).source);
            if (keyValueData._values.size() > 1) {
                LOG.error("Got more than one item");
                for (int i = 0; i < keyValueData._values.size(); ++i) {
                    CRC32 crc = new CRC32();
                    crc.update(keyValueData._keyData.getData(), 0, keyValueData._keyData.getLength());
                    LOG.error("Item at[" + i + "] Path:" + keyValueData._values.get(i).source + " CRC:"
                        + crc.getValue());
                }
            }
            if (multiFileKeyCount++ < 1000)
                multiFileReadSet.add((URLFPV2) keyValueData._keyObject.clone());
        }
        inputReader.close();
        addFirstNFPItemsToSet(fs, new Path(testPath, "part-00000"), conf, directReadSet, 1000);
        addFirstNFPItemsToSet(fs, new Path(testPath, "part-00001"), conf, directReadSet, 1000);
        Iterator<URLFPV2> directReadIterator = directReadSet.iterator();
        Iterator<URLFPV2> multiFileReadIterator = multiFileReadSet.iterator();
        for (int i = 0; i < 1000; ++i) {
            URLFPV2 directReadFP = directReadIterator.next();
            URLFPV2 multiFileReadFP = multiFileReadIterator.next();
            if (directReadFP.compareTo(multiFileReadFP) != 0) {
                LOG.info("Mismatch at Index:" + i);
            }
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    } catch (CloneNotSupportedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
From source file:org.elasticsearch.hadoop.integration.mr.AbstractMRNewApiSaveTest.java
License:Apache License
@Test
public void testBasicMultiSave() throws Exception {
    Configuration conf = createConf();
    conf.set(ConfigurationOptions.ES_RESOURCE, "mrnewapi/multi-save");
    MultiOutputFormat.addOutputFormat(conf, EsOutputFormat.class);
    MultiOutputFormat.addOutputFormat(conf, PrintStreamOutputFormat.class);
    //MultiOutputFormat.addOutputFormat(conf, TextOutputFormat.class);
    PrintStreamOutputFormat.stream(conf, Stream.OUT);
    //conf.set("mapred.output.dir", "foo/bar");
    conf.setClass("mapreduce.outputformat.class", MultiOutputFormat.class, OutputFormat.class);
    runJob(conf);
}
From source file:org.hdp.WordCountDataCompression.WordCountDataCompressionJob.java
@Override
public int run(String[] args) throws Exception {
    // data compression code -->
    Configuration conf = getConf();
    conf.setClass("mapreduce.output.TextOutputFormat.compress", SnappyCodec.class, CompressionCodec.class);
    // <--
    Job job = Job.getInstance(conf, "Word Count Job with compression");
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileSystem fs = FileSystem.get(getConf());
    // does not use the HDFS setting that is set for the eclipse env
    Path pathOut = new Path("/test/wordcount/op");
    if (fs.exists(pathOut)) {
        fs.delete(out, true);
    }
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setMapperClass(WordCountMapperDataCompression.class);
    job.setReducerClass(WordCountReducerDataCompression.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:org.kiji.mapreduce.bulkimport.KijiBulkImportJobBuilder.java
License:Apache License
/** {@inheritDoc} */
@Override
protected void configureJob(Job job) throws IOException {
    final Configuration conf = job.getConfiguration();
    // Store the name of the importer to use in the job configuration so the mapper can
    // create instances of it.
    // Construct the bulk importer instance.
    if (null == mBulkImporterClass) {
        throw new JobConfigurationException("Must specify a bulk importer.");
    }
    conf.setClass(KijiConfKeys.KIJI_BULK_IMPORTER_CLASS, mBulkImporterClass, KijiBulkImporter.class);
    mJobOutput.configure(job);
    // Configure the mapper and reducer. This part depends on whether we're going to write
    // to HFiles or directly to the table.
    configureJobForHFileOutput(job);
    job.setJobName("Kiji bulk import: " + mBulkImporterClass.getSimpleName());
    mBulkImporter = ReflectionUtils.newInstance(mBulkImporterClass, conf);
    // Configure the MapReduce job (requires mBulkImporter to be set properly):
    super.configureJob(job);
}