List of usage examples for org.apache.hadoop.conf Configuration setClass

public void setClass(String name, Class<?> theClass, Class<?> xface)

Sets the value of the name property to the name of theClass, which must implement the given interface xface.
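Before the examples drawn from real projects, here is a minimal, self-contained sketch of the setClass / getClass round trip. The property name my.handler.class and the Handler/LoggingHandler types are made up for illustration; only the Configuration and ReflectionUtils calls are real Hadoop API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

public class SetClassExample {
    // Hypothetical plugin interface and implementation, used only for this sketch.
    public interface Handler {
        void handle(String record);
    }

    public static class LoggingHandler implements Handler {
        public LoggingHandler() {
        }

        @Override
        public void handle(String record) {
            System.out.println("handled: " + record);
        }
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store the implementation class under a (made-up) property name; setClass checks
        // that LoggingHandler is assignable to Handler before recording the class name.
        conf.setClass("my.handler.class", LoggingHandler.class, Handler.class);

        // Later, read the property back and instantiate it reflectively, as the framework
        // examples below do for shuffle handlers, bulk importers, comparators, and so on.
        Class<? extends Handler> handlerClass = conf.getClass("my.handler.class", LoggingHandler.class,
            Handler.class);
        Handler handler = ReflectionUtils.newInstance(handlerClass, conf);
        handler.handle("example record");
    }
}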
From source file:org.apache.tez.test.MiniTezCluster.java
License:Apache License
@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
            new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }
    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }
    File appJarLocalFile = new File(MiniTezCluster.APPJAR);
    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezCluster.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }
    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezCluster.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));
    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));
    // VMEM monitoring disabled, PMEM monitoring enabled.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
    try {
        Path stagingPath = FileContext.getFileContext(conf)
            .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use a absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                    new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);
        // mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");
    // configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
        new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
        String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
        ShuffleHandler.class, Service.class);
    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
        ContainerExecutor.class);
    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}
From source file:org.apache.tez.tests.MiniTezClusterWithTimeline.java
License:Apache License
@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
            new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }
    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }
    File appJarLocalFile = new File(MiniTezClusterWithTimeline.APPJAR);
    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezClusterWithTimeline.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }
    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezClusterWithTimeline.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));
    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));
    // VMEM monitoring disabled, PMEM monitoring enabled.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
    try {
        Path stagingPath = FileContext.getFileContext(conf)
            .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use a absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                    new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);
        // mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");
    // configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
        new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
        String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
        ShuffleHandler.class, Service.class);
    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
        ContainerExecutor.class);
    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}
From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper.java
License:Apache License
public static void executeMapReduceJob(final MapReduce mapReduce, final Memory.Admin memory,
        final Configuration configuration) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration newConfiguration = new Configuration(configuration);
    final boolean vertexProgramExists = newConfiguration.get(VertexProgram.VERTEX_PROGRAM, null) != null;
    if (vertexProgramExists) {
        newConfiguration.set(Constants.GREMLIN_HADOOP_GRAPH_READER,
            InputOutputHelper.getInputFormat(
                (Class) newConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class))
                .getCanonicalName());
        newConfiguration.unset(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
    final BaseConfiguration apacheConfiguration = new BaseConfiguration();
    apacheConfiguration.setDelimiterParsingDisabled(true);
    mapReduce.storeState(apacheConfiguration);
    ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, newConfiguration);
    final Optional<Comparator<?>> mapSort = mapReduce.getMapKeySort();
    final Optional<Comparator<?>> reduceSort = mapReduce.getReduceKeySort();
    newConfiguration.setClass(Constants.GREMLIN_HADOOP_MAP_REDUCE_CLASS, mapReduce.getClass(), MapReduce.class);
    final Job job = Job.getInstance(newConfiguration, mapReduce.toString());
    HadoopGraph.LOGGER.info(Constants.GREMLIN_HADOOP_JOB_PREFIX + mapReduce.toString());
    job.setJarByClass(HadoopGraph.class);
    if (mapSort.isPresent())
        job.setSortComparatorClass(ObjectWritableComparator.ObjectWritableMapComparator.class);
    job.setMapperClass(HadoopMap.class);
    if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
        if (mapReduce.doStage(MapReduce.Stage.COMBINE))
            job.setCombinerClass(HadoopCombine.class);
        job.setReducerClass(HadoopReduce.class);
    } else {
        if (mapSort.isPresent()) {
            job.setReducerClass(Reducer.class);
            job.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        } else {
            job.setNumReduceTasks(0);
        }
    }
    job.setMapOutputKeyClass(ObjectWritable.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(ObjectWritable.class);
    job.setOutputValueClass(ObjectWritable.class);
    job.setInputFormatClass(GraphFilterInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // if there is no vertex program, then grab the graph from the input location
    final Path graphPath;
    if (vertexProgramExists) {
        graphPath = new Path(
            Constants.getGraphLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
    } else {
        graphPath = new Path(newConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
    }
    Path memoryPath = new Path(
        Constants.getMemoryLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION),
            (reduceSort.isPresent() ? mapReduce.getMemoryKey() + "-temp" : mapReduce.getMemoryKey())));
    if (FileSystem.get(newConfiguration).exists(memoryPath)) {
        FileSystem.get(newConfiguration).delete(memoryPath, true);
    }
    FileInputFormat.setInputPaths(job, graphPath);
    FileOutputFormat.setOutputPath(job, memoryPath);
    job.waitForCompletion(true);
    // if there is a reduce sort, we need to run another identity MapReduce job
    if (reduceSort.isPresent()) {
        final Job reduceSortJob = Job.getInstance(newConfiguration, "ReduceKeySort");
        reduceSortJob.setSortComparatorClass(ObjectWritableComparator.ObjectWritableReduceComparator.class);
        reduceSortJob.setMapperClass(Mapper.class);
        reduceSortJob.setReducerClass(Reducer.class);
        reduceSortJob.setMapOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setMapOutputValueClass(ObjectWritable.class);
        reduceSortJob.setOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setOutputValueClass(ObjectWritable.class);
        reduceSortJob.setInputFormatClass(SequenceFileInputFormat.class);
        reduceSortJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        reduceSortJob.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        FileInputFormat.setInputPaths(reduceSortJob, memoryPath);
        final Path sortedMemoryPath = new Path(Constants.getMemoryLocation(
            newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), mapReduce.getMemoryKey()));
        FileOutputFormat.setOutputPath(reduceSortJob, sortedMemoryPath);
        reduceSortJob.waitForCompletion(true);
        FileSystem.get(newConfiguration).delete(memoryPath, true); // delete the temporary memory path
        memoryPath = sortedMemoryPath;
    }
    mapReduce.addResultToMemory(memory, new ObjectWritableIterator(newConfiguration, memoryPath));
}
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License:Open Source License
@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {
    if (_skipPartition)
        return;
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();
    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }
    FlexBuffer scanArray[] = LinkKey.allocateScanArray();
    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_conf);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
        RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
        WritableComparable.class);
    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(_fs,
        incomingPaths, localMergeConfig);
    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    int processedKeysCount = 0;
    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
        summaryRecord = null;
        linkSummaryRecord = null;
        types.clear();
        linkSources = null;
        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;
        int statusCount = 0;
        int linkCount = 0;
        // scan key components
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);
        // pick up source fp from key ...
        URLFPV2 fpSource = new URLFPV2();
        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.URL_HASH_COMPONENT_ID));
        for (RawRecordValue rawValue : nextItem.e1) {
            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);
            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);
            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                statusCount++;
                try {
                    JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            } else {
                linkCount++;
                JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                // ok this is a link ...
                updateLinkStatsFromLinkJSONObject(object, fpSource, reporter);
            }
            reporter.progress();
        }
        if (statusCount > 1) {
            reporter.incrCounter(Counters.TWO_REDUNDANT_STATUS_IN_REDUCER, 1);
        }
        if (statusCount == 0 && linkCount != 0) {
            reporter.incrCounter(Counters.DISCOVERED_NEW_LINK, 1);
        } else {
            if (statusCount >= 1 && linkCount >= 1) {
                reporter.incrCounter(Counters.GOT_CRAWL_STATUS_WITH_LINK, 1);
            } else if (statusCount >= 1 && linkCount == 0) {
                reporter.incrCounter(Counters.GOT_CRAWL_STATUS_NO_LINK, 1);
            }
        }
        if (summaryRecord != null || linkSummaryRecord != null) {
            JsonObject compositeObject = new JsonObject();
            if (summaryRecord != null) {
                compositeObject.add("crawl_status", summaryRecord);
            }
            if (linkSummaryRecord != null) {
                if (types != null && types.size() != 0) {
                    stringCollectionToJsonArray(linkSummaryRecord, "typeAndRels", types);
                    if (linkSources != null) {
                        stringCollectionToJsonArray(linkSummaryRecord, "sources", linkSources.values());
                    }
                }
                compositeObject.add("link_status", linkSummaryRecord);
            }
            if (outputKeyString != null && outputKeyURLObj != null && outputKeyURLObj.isValid()) {
                if (outputKeyFromInternalLink) {
                    reporter.incrCounter(Counters.OUTPUT_KEY_FROM_INTERNAL_LINK, 1);
                } else {
                    reporter.incrCounter(Counters.OUTPUT_KEY_FROM_EXTERNAL_LINK, 1);
                }
                output.collect(new TextBytes(outputKeyString), new TextBytes(compositeObject.toString()));
            } else {
                reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
            }
        }
    }
}
From source file:org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java
License:Open Source License
@Override
public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> output, Reporter reporter)
        throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();
    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }
    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, TextBytes.Comparator.class,
        TextBytes.Comparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
        WritableComparable.class);
    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
        FileSystem.get(_jobConf), incomingPaths, localMergeConfig);
    // save a reference to the collector
    _collector = output;
    iterateItems(multiFileInputReader, reporter);
}
From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java
License:Open Source License
@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();
    FlexBuffer scanArray[] = LinkKey.allocateScanArray();
    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }
    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
        RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
        WritableComparable.class);
    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
        FileSystem.get(_jobConf), incomingPaths, localMergeConfig);
    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    TextBytes valueOut = new TextBytes();
    TextBytes keyOut = new TextBytes();
    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
    // pick up source fp from key ...
    URLFPV2 fpSource = new URLFPV2();
    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;
        latestLinkDataTime = -1L;
        outlinks.clear();
        discoveredLinks.clear();
        // scan key components
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);
        // setup fingerprint ...
        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
            LinkKey.ComponentId.URL_HASH_COMPONENT_ID));
        for (RawRecordValue rawValue : nextItem.e1) {
            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);
            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);
            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                try {
                    JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            }
            reporter.progress();
        }
        // ok now see if we have anything to emit ...
        if (discoveredLinks.size() != 0) {
            reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1);
            for (String outlink : outlinks) {
                // emit a to tuple
                toJsonObject.addProperty("to", outlink);
                valueBytes.set(toJsonObject.toString());
                output.collect(sourceDomain, valueBytes);
                // now emit a from tuple ...
                fromJsonObject.addProperty("from", sourceDomain.toString());
                keyBytes.set(outlink);
                valueBytes.set(fromJsonObject.toString());
                output.collect(keyBytes, valueBytes);
            }
            bloomKey.setDomainHash(fpSource.getDomainHash());
            for (long destDomainFP : discoveredLinks) {
                // set the bloom filter key ...
                bloomKey.setUrlHash(destDomainFP);
                // add it to the bloom filter
                emittedTuplesFilter.add(bloomKey);
            }
        } else {
            reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1);
        }
    }
}
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
public static void main(String[] args) {
    Path testPath = new Path(args[0]);
    LOG.info("Initializing Hadoop Config");
    Configuration conf = new Configuration();
    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("mapred-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("commoncrawl-default.xml");
    conf.addResource("commoncrawl-site.xml");
    conf.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, URLFPV2RawComparator.class,
        RawComparator.class);
    conf.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, URLFPV2.class, WritableComparable.class);
    CrawlEnvironment.setHadoopConfig(conf);
    CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/");
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Vector<Path> paths = new Vector<Path>();
        paths.add(new Path(testPath, "part-00000"));
        // paths.add(new Path(testPath,"part-00000"));
        paths.add(new Path(testPath, "part-00001"));
        TreeSet<URLFPV2> directReadSet = new TreeSet<URLFPV2>();
        TreeSet<URLFPV2> multiFileReadSet = new TreeSet<URLFPV2>();
        MultiFileInputReader<URLFPV2> inputReader = new MultiFileInputReader<URLFPV2>(fs, paths, conf);
        KeyAndValueData<URLFPV2> keyValueData = null;
        int multiFileKeyCount = 0;
        while ((keyValueData = inputReader.readNextItem()) != null) {
            LOG.info("Got Key Domain:" + keyValueData._keyObject.getDomainHash() + " URLHash:"
                + keyValueData._keyObject.getUrlHash() + " Item Count:" + keyValueData._values.size()
                + " Path[0]:" + keyValueData._values.get(0).source);
            if (keyValueData._values.size() > 1) {
                LOG.error("Got more than one item");
                for (int i = 0; i < keyValueData._values.size(); ++i) {
                    CRC32 crc = new CRC32();
                    crc.update(keyValueData._keyData.getData(), 0, keyValueData._keyData.getLength());
                    LOG.error("Item at[" + i + "] Path:" + keyValueData._values.get(i).source + " CRC:"
                        + crc.getValue());
                }
            }
            if (multiFileKeyCount++ < 1000)
                multiFileReadSet.add((URLFPV2) keyValueData._keyObject.clone());
        }
        inputReader.close();
        addFirstNFPItemsToSet(fs, new Path(testPath, "part-00000"), conf, directReadSet, 1000);
        addFirstNFPItemsToSet(fs, new Path(testPath, "part-00001"), conf, directReadSet, 1000);
        Iterator<URLFPV2> directReadIterator = directReadSet.iterator();
        Iterator<URLFPV2> multiFileReadIterator = multiFileReadSet.iterator();
        for (int i = 0; i < 1000; ++i) {
            URLFPV2 directReadFP = directReadIterator.next();
            URLFPV2 multiFileReadFP = multiFileReadIterator.next();
            if (directReadFP.compareTo(multiFileReadFP) != 0) {
                LOG.info("Mismatch at Index:" + i);
            }
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    } catch (CloneNotSupportedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
From source file:org.elasticsearch.hadoop.integration.mr.AbstractMRNewApiSaveTest.java
License:Apache License
@Test
public void testBasicMultiSave() throws Exception {
    Configuration conf = createConf();
    conf.set(ConfigurationOptions.ES_RESOURCE, "mrnewapi/multi-save");
    MultiOutputFormat.addOutputFormat(conf, EsOutputFormat.class);
    MultiOutputFormat.addOutputFormat(conf, PrintStreamOutputFormat.class);
    //MultiOutputFormat.addOutputFormat(conf, TextOutputFormat.class);
    PrintStreamOutputFormat.stream(conf, Stream.OUT);
    //conf.set("mapred.output.dir", "foo/bar");
    conf.setClass("mapreduce.outputformat.class", MultiOutputFormat.class, OutputFormat.class);
    runJob(conf);
}
From source file:org.hdp.WordCountDataCompression.WordCountDataCompressionJob.java
@Override
public int run(String[] args) throws Exception {
    // data compression code -->
    Configuration conf = getConf();
    conf.setClass("mapreduce.output.TextOutputFormat.compress", SnappyCodec.class, CompressionCodec.class);
    // <--
    Job job = Job.getInstance(conf, "Word Count Job with compression");
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileSystem fs = FileSystem.get(getConf());
    // does not use the HDFS setting that is set for the eclipse env
    Path pathOut = new Path("/test/wordcount/op");
    if (fs.exists(pathOut)) {
        fs.delete(out, true);
    }
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setMapperClass(WordCountMapperDataCompression.class);
    job.setReducerClass(WordCountReducerDataCompression.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:org.kiji.mapreduce.bulkimport.KijiBulkImportJobBuilder.java
License:Apache License
/** {@inheritDoc} */
@Override
protected void configureJob(Job job) throws IOException {
    final Configuration conf = job.getConfiguration();
    // Store the name of the importer to use in the job configuration so the mapper can
    // create instances of it.
    // Construct the bulk importer instance.
    if (null == mBulkImporterClass) {
        throw new JobConfigurationException("Must specify a bulk importer.");
    }
    conf.setClass(KijiConfKeys.KIJI_BULK_IMPORTER_CLASS, mBulkImporterClass, KijiBulkImporter.class);
    mJobOutput.configure(job);
    // Configure the mapper and reducer. This part depends on whether we're going to write
    // to HFiles or directly to the table.
    configureJobForHFileOutput(job);
    job.setJobName("Kiji bulk import: " + mBulkImporterClass.getSimpleName());
    mBulkImporter = ReflectionUtils.newInstance(mBulkImporterClass, conf);
    // Configure the MapReduce job (requires mBulkImporter to be set properly):
    super.configureJob(job);
}