Example usage for org.apache.hadoop.conf Configuration setClass

List of usage examples for org.apache.hadoop.conf Configuration setClass

Introduction

On this page you can find example usages of org.apache.hadoop.conf.Configuration.setClass, collected from open-source projects.

Prototype

public void setClass(String name, Class<?> theClass, Class<?> xface) 

Document

Set the value of the name property to the name of theClass, which must implement the given interface xface.
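
A minimal, self-contained sketch of the round trip (the key example.codec.impl and the class SetClassExample are made up for illustration; the codec classes are standard Hadoop ones): setClass stores the class name under the key and fails fast if the class does not implement the given interface, and the matching getClass(name, defaultValue, xface) overload reads it back.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class SetClassExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Register GzipCodec under a made-up key; setClass rejects classes
        // that do not implement the given interface (CompressionCodec).
        conf.setClass("example.codec.impl", GzipCodec.class, CompressionCodec.class);

        // Read it back with the matching getClass(name, defaultValue, xface)
        // overload; DefaultCodec would be returned if the key were unset.
        Class<? extends CompressionCodec> codecClass =
                conf.getClass("example.codec.impl", DefaultCodec.class, CompressionCodec.class);

        // The configured class is typically instantiated via ReflectionUtils.
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        System.out.println("Configured codec: " + codec.getClass().getName());
    }
}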

Usage

From source file:org.apache.tez.test.MiniTezCluster.java

License:Apache License

@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }

    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }

    File appJarLocalFile = new File(MiniTezCluster.APPJAR);

    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezCluster.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }

    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezCluster.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));

    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));

    // Disable both physical and virtual memory monitoring in the NodeManagers.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);

    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");

    try {
        Path stagingPath = FileContext.getFileContext(conf)
                .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use an absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                        new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);

        //mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");

    //configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);

    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);

    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
            ContainerExecutor.class);

    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}

From source file:org.apache.tez.tests.MiniTezClusterWithTimeline.java

License:Apache License

@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }

    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }

    File appJarLocalFile = new File(MiniTezClusterWithTimeline.APPJAR);

    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezClusterWithTimeline.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }

    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezClusterWithTimeline.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));

    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));

    // Disable both physical and virtual memory monitoring in the NodeManagers.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);

    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");

    try {
        Path stagingPath = FileContext.getFileContext(conf)
                .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use an absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                        new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);

        //mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");

    //configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);

    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);

    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
            ContainerExecutor.class);

    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}

From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper.java

License:Apache License

public static void executeMapReduceJob(final MapReduce mapReduce, final Memory.Admin memory,
        final Configuration configuration) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration newConfiguration = new Configuration(configuration);
    final boolean vertexProgramExists = newConfiguration.get(VertexProgram.VERTEX_PROGRAM, null) != null;
    if (vertexProgramExists) {
        newConfiguration.set(Constants.GREMLIN_HADOOP_GRAPH_READER, InputOutputHelper.getInputFormat(
                (Class) newConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class))
                .getCanonicalName());
        newConfiguration.unset(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
    final BaseConfiguration apacheConfiguration = new BaseConfiguration();
    apacheConfiguration.setDelimiterParsingDisabled(true);
    mapReduce.storeState(apacheConfiguration);
    ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, newConfiguration);

    final Optional<Comparator<?>> mapSort = mapReduce.getMapKeySort();
    final Optional<Comparator<?>> reduceSort = mapReduce.getReduceKeySort();
    newConfiguration.setClass(Constants.GREMLIN_HADOOP_MAP_REDUCE_CLASS, mapReduce.getClass(), MapReduce.class);
    final Job job = Job.getInstance(newConfiguration, mapReduce.toString());
    HadoopGraph.LOGGER.info(Constants.GREMLIN_HADOOP_JOB_PREFIX + mapReduce.toString());
    job.setJarByClass(HadoopGraph.class);
    if (mapSort.isPresent())
        job.setSortComparatorClass(ObjectWritableComparator.ObjectWritableMapComparator.class);
    job.setMapperClass(HadoopMap.class);
    if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
        if (mapReduce.doStage(MapReduce.Stage.COMBINE))
            job.setCombinerClass(HadoopCombine.class);
        job.setReducerClass(HadoopReduce.class);
    } else {
        if (mapSort.isPresent()) {
            job.setReducerClass(Reducer.class);
            job.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        } else {
            job.setNumReduceTasks(0);
        }
    }
    job.setMapOutputKeyClass(ObjectWritable.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(ObjectWritable.class);
    job.setOutputValueClass(ObjectWritable.class);
    job.setInputFormatClass(GraphFilterInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // if there is no vertex program, then grab the graph from the input location
    final Path graphPath;
    if (vertexProgramExists) {
        graphPath = new Path(
                Constants.getGraphLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
    } else {
        graphPath = new Path(newConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
    }

    Path memoryPath = new Path(
            Constants.getMemoryLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION),
                    (reduceSort.isPresent() ? mapReduce.getMemoryKey() + "-temp" : mapReduce.getMemoryKey())));
    if (FileSystem.get(newConfiguration).exists(memoryPath)) {
        FileSystem.get(newConfiguration).delete(memoryPath, true);
    }
    FileInputFormat.setInputPaths(job, graphPath);
    FileOutputFormat.setOutputPath(job, memoryPath);
    job.waitForCompletion(true);

    // if there is a reduce sort, we need to run another identity MapReduce job
    if (reduceSort.isPresent()) {
        final Job reduceSortJob = Job.getInstance(newConfiguration, "ReduceKeySort");
        reduceSortJob.setSortComparatorClass(ObjectWritableComparator.ObjectWritableReduceComparator.class);
        reduceSortJob.setMapperClass(Mapper.class);
        reduceSortJob.setReducerClass(Reducer.class);
        reduceSortJob.setMapOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setMapOutputValueClass(ObjectWritable.class);
        reduceSortJob.setOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setOutputValueClass(ObjectWritable.class);
        reduceSortJob.setInputFormatClass(SequenceFileInputFormat.class);
        reduceSortJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        reduceSortJob.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        FileInputFormat.setInputPaths(reduceSortJob, memoryPath);
        final Path sortedMemoryPath = new Path(Constants.getMemoryLocation(
                newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), mapReduce.getMemoryKey()));
        FileOutputFormat.setOutputPath(reduceSortJob, sortedMemoryPath);
        reduceSortJob.waitForCompletion(true);
        FileSystem.get(newConfiguration).delete(memoryPath, true); // delete the temporary memory path
        memoryPath = sortedMemoryPath;
    }
    mapReduce.addResultToMemory(memory, new ObjectWritableIterator(newConfiguration, memoryPath));
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License:Open Source License

@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {

    if (_skipPartition)
        return;
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();

    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }

    FlexBuffer scanArray[] = LinkKey.allocateScanArray();

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_conf);

    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
            RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
            WritableComparable.class);

    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(_fs,
            incomingPaths, localMergeConfig);

    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();

    int processedKeysCount = 0;

    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {

        summaryRecord = null;
        linkSummaryRecord = null;
        types.clear();
        linkSources = null;
        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;

        int statusCount = 0;
        int linkCount = 0;

        // scan key components 
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

        // pick up source fp from key ... 
        URLFPV2 fpSource = new URLFPV2();

        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

        for (RawRecordValue rawValue : nextItem.e1) {

            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                statusCount++;

                try {
                    JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            } else {
                linkCount++;
                JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                // ok this is a link ... 
                updateLinkStatsFromLinkJSONObject(object, fpSource, reporter);
            }

            reporter.progress();
        }

        if (statusCount > 1) {
            reporter.incrCounter(Counters.TWO_REDUNDANT_STATUS_IN_REDUCER, 1);
        }

        if (statusCount == 0 && linkCount != 0) {
            reporter.incrCounter(Counters.DISCOVERED_NEW_LINK, 1);
        } else {
            if (statusCount >= 1 && linkCount >= 1) {
                reporter.incrCounter(Counters.GOT_CRAWL_STATUS_WITH_LINK, 1);
            } else if (statusCount >= 1 && linkCount == 0) {
                reporter.incrCounter(Counters.GOT_CRAWL_STATUS_NO_LINK, 1);
            }
        }

        if (summaryRecord != null || linkSummaryRecord != null) {
            JsonObject compositeObject = new JsonObject();
            if (summaryRecord != null) {
                compositeObject.add("crawl_status", summaryRecord);
            }
            if (linkSummaryRecord != null) {
                if (types != null && types.size() != 0) {
                    stringCollectionToJsonArray(linkSummaryRecord, "typeAndRels", types);
                    if (linkSources != null) {
                        stringCollectionToJsonArray(linkSummaryRecord, "sources", linkSources.values());
                    }
                }
                compositeObject.add("link_status", linkSummaryRecord);
            }

            if (outputKeyString != null && outputKeyURLObj != null && outputKeyURLObj.isValid()) {
                if (outputKeyFromInternalLink) {
                    reporter.incrCounter(Counters.OUTPUT_KEY_FROM_INTERNAL_LINK, 1);
                } else {
                    reporter.incrCounter(Counters.OUTPUT_KEY_FROM_EXTERNAL_LINK, 1);
                }
                output.collect(new TextBytes(outputKeyString), new TextBytes(compositeObject.toString()));
            } else {
                reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
            }
        }
    }
}

From source file:org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java

License:Open Source License

@Override
public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> output, Reporter reporter)
        throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();

    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);

    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, TextBytes.Comparator.class,
            TextBytes.Comparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
            WritableComparable.class);

    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
            FileSystem.get(_jobConf), incomingPaths, localMergeConfig);

    // save a reference to the collector
    _collector = output;

    iterateItems(multiFileInputReader, reporter);
}

From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java

License:Open Source License

@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();

    FlexBuffer scanArray[] = LinkKey.allocateScanArray();

    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);

    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
            RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
            WritableComparable.class);

    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
            FileSystem.get(_jobConf), incomingPaths, localMergeConfig);

    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    TextBytes valueOut = new TextBytes();
    TextBytes keyOut = new TextBytes();

    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

    // pick up source fp from key ...
    URLFPV2 fpSource = new URLFPV2();

    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {

        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;
        latestLinkDataTime = -1L;
        outlinks.clear();
        discoveredLinks.clear();

        // scan key components
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

        // setup fingerprint ...
        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

        for (RawRecordValue rawValue : nextItem.e1) {

            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                try {
                    JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            }
            reporter.progress();
        }
        // ok now see if we have anything to emit ...
        if (discoveredLinks.size() != 0) {
            reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1);
            for (String outlink : outlinks) {
                // emit a to tuple
                toJsonObject.addProperty("to", outlink);
                valueBytes.set(toJsonObject.toString());
                output.collect(sourceDomain, valueBytes);
                // now emit a from tuple ...
                fromJsonObject.addProperty("from", sourceDomain.toString());
                keyBytes.set(outlink);
                valueBytes.set(fromJsonObject.toString());
                output.collect(keyBytes, valueBytes);
            }

            bloomKey.setDomainHash(fpSource.getDomainHash());

            for (long destDomainFP : discoveredLinks) {
                // set the bloom filter key ...
                bloomKey.setUrlHash(destDomainFP);
                // add it to the bloom filter
                emittedTuplesFilter.add(bloomKey);
            }
        } else {
            reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1);
        }
    }
}

From source file:org.commoncrawl.util.MultiFileMergeUtils.java

License:Open Source License

public static void main(String[] args) {

    Path testPath = new Path(args[0]);

    LOG.info("Initializing Hadoop Config");

    Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("mapred-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("commoncrawl-default.xml");
    conf.addResource("commoncrawl-site.xml");

    conf.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, URLFPV2RawComparator.class,
            RawComparator.class);
    conf.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, URLFPV2.class, WritableComparable.class);

    CrawlEnvironment.setHadoopConfig(conf);
    CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/");

    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

        Vector<Path> paths = new Vector<Path>();

        paths.add(new Path(testPath, "part-00000"));
        // paths.add(new Path(testPath,"part-00000"));
        paths.add(new Path(testPath, "part-00001"));

        TreeSet<URLFPV2> directReadSet = new TreeSet<URLFPV2>();
        TreeSet<URLFPV2> multiFileReadSet = new TreeSet<URLFPV2>();

        MultiFileInputReader<URLFPV2> inputReader = new MultiFileInputReader<URLFPV2>(fs, paths, conf);

        KeyAndValueData<URLFPV2> keyValueData = null;
        int multiFileKeyCount = 0;
        while ((keyValueData = inputReader.readNextItem()) != null) {
            LOG.info("Got Key Domain:" + keyValueData._keyObject.getDomainHash() + " URLHash:"
                    + keyValueData._keyObject.getUrlHash() + " Item Count:" + keyValueData._values.size()
                    + " Path[0]:" + keyValueData._values.get(0).source);

            if (keyValueData._values.size() > 1) {
                LOG.error("Got more than one item");
                for (int i = 0; i < keyValueData._values.size(); ++i) {
                    CRC32 crc = new CRC32();
                    crc.update(keyValueData._keyData.getData(), 0, keyValueData._keyData.getLength());
                    LOG.error("Item at[" + i + "] Path:" + keyValueData._values.get(i).source + " CRC:"
                            + crc.getValue());
                }
            }
            if (multiFileKeyCount++ < 1000)
                multiFileReadSet.add((URLFPV2) keyValueData._keyObject.clone());
        }
        inputReader.close();

        addFirstNFPItemsToSet(fs, new Path(testPath, "part-00000"), conf, directReadSet, 1000);
        addFirstNFPItemsToSet(fs, new Path(testPath, "part-00001"), conf, directReadSet, 1000);

        Iterator<URLFPV2> directReadIterator = directReadSet.iterator();
        Iterator<URLFPV2> multiFileReadIterator = multiFileReadSet.iterator();

        for (int i = 0; i < 1000; ++i) {
            URLFPV2 directReadFP = directReadIterator.next();
            URLFPV2 multiFileReadFP = multiFileReadIterator.next();

            if (directReadFP.compareTo(multiFileReadFP) != 0) {
                LOG.info("Mismatch at Index:" + i);
            }
        }

    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    } catch (CloneNotSupportedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

}

From source file:org.elasticsearch.hadoop.integration.mr.AbstractMRNewApiSaveTest.java

License:Apache License

@Test
public void testBasicMultiSave() throws Exception {
    Configuration conf = createConf();
    conf.set(ConfigurationOptions.ES_RESOURCE, "mrnewapi/multi-save");

    MultiOutputFormat.addOutputFormat(conf, EsOutputFormat.class);
    MultiOutputFormat.addOutputFormat(conf, PrintStreamOutputFormat.class);
    //MultiOutputFormat.addOutputFormat(conf, TextOutputFormat.class);

    PrintStreamOutputFormat.stream(conf, Stream.OUT);
    //conf.set("mapred.output.dir", "foo/bar");

    conf.setClass("mapreduce.outputformat.class", MultiOutputFormat.class, OutputFormat.class);
    runJob(conf);
}

From source file:org.hdp.WordCountDataCompression.WordCountDataCompressionJob.java

@Override
public int run(String[] args) throws Exception {
    // TODO Auto-generated method stub

    // data compression code  -->
    Configuration conf = getConf();
    conf.setClass("mapreduce.output.TextOutputFormat.compress", SnappyCodec.class, CompressionCodec.class);

    // <--
    Job job = Job.getInstance(conf, "Word Count Job with compression");
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileSystem fs = FileSystem.get(getConf());
    // does not use the HDFS setting that is configured for the Eclipse env
    Path pathOut = new Path("/test/wordcount/op");
    if (fs.exists(pathOut)) {
        fs.delete(out, true);
    }
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setMapperClass(WordCountMapperDataCompression.class);
    job.setReducerClass(WordCountReducerDataCompression.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
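
A usage note on the compression setting above: Hadoop's new-API FileOutputFormat also provides helpers that write the standard output-compression properties (mapreduce.output.fileoutputformat.compress and mapreduce.output.fileoutputformat.compress.codec). A minimal sketch under that assumption; the class and method names below are illustrative only, not part of the example above:

import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OutputCompressionSketch {
    // Request Snappy-compressed job output via the FileOutputFormat helpers,
    // which set the standard compression properties on the job configuration.
    public static void enableSnappyOutput(Job job) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    }
}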

From source file:org.kiji.mapreduce.bulkimport.KijiBulkImportJobBuilder.java

License:Apache License

/** {@inheritDoc} */
@Override
protected void configureJob(Job job) throws IOException {
    final Configuration conf = job.getConfiguration();

    // Store the name of the importer to use in the job configuration so the mapper can
    // create instances of it.
    // Construct the bulk importer instance.
    if (null == mBulkImporterClass) {
        throw new JobConfigurationException("Must specify a bulk importer.");
    }
    conf.setClass(KijiConfKeys.KIJI_BULK_IMPORTER_CLASS, mBulkImporterClass, KijiBulkImporter.class);

    mJobOutput.configure(job);

    // Configure the mapper and reducer. This part depends on whether we're going to write
    // to HFiles or directly to the table.
    configureJobForHFileOutput(job);

    job.setJobName("Kiji bulk import: " + mBulkImporterClass.getSimpleName());

    mBulkImporter = ReflectionUtils.newInstance(mBulkImporterClass, conf);

    // Configure the MapReduce job (requires mBulkImporter to be set properly):
    super.configureJob(job);
}