Example usage for org.apache.hadoop.mapreduce Job getInstance

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.getInstance.

Prototype

@Deprecated
public static Job getInstance(Cluster ignored) throws IOException 

Document

Creates a new Job with no particular Cluster.
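
Note that the prototype shown above is the deprecated Cluster overload; every example on this page actually calls one of the Configuration-based overloads. As a minimal sketch of those overloads (the class name and job name below are illustrative only, not taken from the examples):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetInstanceOverloads {
    public static void sketch() throws IOException {
        Configuration conf = new Configuration();

        // New Job with a fresh, empty configuration.
        Job a = Job.getInstance();

        // New Job backed by a copy of an existing configuration; this is the
        // pattern used by every example below (Job.getInstance(getConf())).
        Job b = Job.getInstance(conf);

        // Same as above, but also sets the job name in the same call.
        Job c = Job.getInstance(conf, "example-job");
    }
}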

Usage

From source file:com.yahoo.glimmer.indexing.generator.TripleIndexGenerator.java

License:Open Source License

public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(TripleIndexGenerator.class.getName(),
            "Generates a keyword index from RDF data.",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', "withoutContexts",
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(METHOD_ARG, JSAP.STRING_PARSER, "horizontal", JSAP.REQUIRED, 'm',
                            METHOD_ARG, "horizontal or vertical."),
                    new FlaggedOption(PREDICATES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                            'p', PREDICATES_ARG, "Subset of the properties to be indexed."),
                    new FlaggedOption(RESOURCE_PREFIX_ARG, JSAP.STRING_PARSER, "@", JSAP.NOT_REQUIRED, 'r',
                            RESOURCE_PREFIX_ARG,
                            "Prefix to add to object resource hash values when indexing. Stops queries for numbers matching resource hash values. Default is '@'"),

                    new UnflaggedOption("input", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(NUMBER_OF_DOCS_ARG, JSAP.LONG_PARSER, JSAP.REQUIRED,
                            "Number of documents to index"),
                    new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the output."),
                    new UnflaggedOption(RESOURCES_HASH_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location of the resources hash file."), });

    JSAPResult jsapResult = jsap.parse(args);

    // check whether the command line was valid, and if it wasn't,
    // display usage information and exit.
    if (!jsapResult.success()) {
        System.err.println();
        System.err.println("Usage: java " + TripleIndexGenerator.class.getName());
        System.err.println("                " + jsap.getUsage());
        System.err.println();
        System.exit(1);
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(TripleIndexGenerator.class);
    job.setJobName("TripleIndexGenerator" + System.currentTimeMillis());

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString("input")));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(DocumentMapper.class);
    job.setMapOutputKeyClass(TermKey.class);
    job.setMapOutputValueClass(TermValue.class);

    job.setPartitionerClass(TermKey.FirstPartitioner.class);
    job.setGroupingComparatorClass(TermKey.FirstGroupingComparator.class);

    job.setReducerClass(TermReduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IndexRecordWriterValue.class);
    job.setOutputFormatClass(IndexRecordWriter.OutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(jsapResult.getString("output")));

    Configuration conf = job.getConfiguration();

    conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class);
    conf.set("mapreduce.user.classpath.first", "true");

    long numDocs = jsapResult.getLong(NUMBER_OF_DOCS_ARG);
    conf.setLong(NUMBER_OF_DOCUMENTS, numDocs);
    // Set in an attempt to get around the 2GB RAM task limit on our cluster,
    // and in the hope of fixing "Direct buffer memory" errors.
    conf.setInt(INDEX_WRITER_CACHE_SIZE, 1024 * 1024);

    conf.set(OUTPUT_DIR, jsapResult.getString("output"));

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_HORIZONTAL)) {
        HorizontalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG));
    } else if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_VERTICAL)) {
        if (!jsapResult.contains(PREDICATES_ARG)) {
            throw new IllegalArgumentException("When '" + METHOD_ARG + "' is '" + METHOD_ARG_VALUE_VERTICAL
                    + "' you have to give a predicates file too.");
        }
        VerticalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG), jsapResult.getString(PREDICATES_ARG));
    } else {
        throw new IllegalArgumentException(METHOD_ARG + " should be '" + METHOD_ARG_VALUE_HORIZONTAL + "' or '"
                + METHOD_ARG_VALUE_VERTICAL + "'");
    }

    conf.setInt("mapreduce.input.linerecordreader.line.maxlength", 1024 * 1024);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}
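
One detail in the job above: the sort order for TermKey is installed through the old property name mapred.output.key.comparator.class. With the new API the same thing can be expressed directly on the Job; a sketch (relying on TermKey.Comparator being a WritableComparator, as registered above, and therefore a RawComparator):

// Sketch: non-deprecated equivalent of
// conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class)
job.setSortComparatorClass(TermKey.Comparator.class);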

From source file:com.yahoo.glimmer.indexing.preprocessor.PrepTool.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {

    SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG,
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O',
                            ONTOLOGY_ARG),
                    new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
                            JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG),
                    new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the out data."), });

    JSAPResult jsapResult = jsap.parse(args);
    if (!jsapResult.success()) {
        System.err.print(jsap.getUsage());
        System.exit(1);
    }

    Configuration config = getConf();

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts);

    // The ontology if any...
    String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG);
    if (ontologyFilename != null) {
        // Load the ontology
        InputStream ontologyInputStream = new FileInputStream(ontologyFilename);
        OWLOntology ontology = OntologyLoader.load(ontologyInputStream);
        System.out.println(
                "Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms.");

        ArrayList<String> ontologyClasses = new ArrayList<String>();
        for (OWLClass owlClass : ontology.getClassesInSignature()) {
            ontologyClasses.add(owlClass.getIRI().toString());
        }
        System.out.println("Adding " + ontologyClasses.size() + " classes from ontology.");
        config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0]));
    } else {
        System.out.println("No ontology filename set in conf.  No ontology has been loaded.");
    }

    Job job = Job.getInstance(config);
    job.setJarByClass(PrepTool.class);

    job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis());
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TuplesToResourcesMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT);
    job.setNumReduceTasks(reducerCount);
    if (reducerCount == 1) {
        // We assign 'global' ids in the reducer. For this to work, there
        // can be only one. But using just one reducer, we run out of local disk space during the
        // pre-reduce merge with big data sets like WCC.

        job.setReducerClass(ResourcesReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Object.class);
        job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class);
    } else {
        /*
         * TODO: Take the functionality of the reducer and move it to run on
         * the gateway. We then use n identity reducers, the output of which
         * will be read and merged as streams on the gateway.
         */
    }

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG)));

    Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG));
    FileOutputFormat.setOutputPath(job, outputDir);

    if (!job.waitForCompletion(true)) {
        System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG));
        return 1;
    }

    // IF THERE WAS ONLY ONE REDUCER WE NOW HAVE
    // One file per reducer containing lists of URLs (resources) for
    // subjects, predicates, objects and contexts.
    // One file per reducer that contains all resources. subjects +
    // predicates + objects + contexts.
    // One file per reducer that contains the subjects + all <predicate>
    // <object>|"Literal" <context> on that subject.

    // IF THERE WAS MORE THAN ONE REDUCER WE NOW HAVE N FILES THAT NEED TO BE MERGED ON THE GATEWAY. TODO.

    return 0;
}
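
The TODO branch above leaves the multi-reducer case unconfigured, which means the new-API default reducer (the identity Reducer) is used. A hypothetical sketch of how that branch might be filled in, not part of the original source:

// Hypothetical completion of the multi-reducer branch: run n identity reducers
// and merge their sorted outputs on the gateway afterwards.
job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class); // identity reducer
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);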

From source file:com.yahoo.semsearch.fastlinking.io.RepackWikipedia.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code")
            .create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    Job job = Job.getInstance(getConf());
    job.setJarByClass(RepackWikipedia.class);
    job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    if ("none".equals(compressionType)) {
        FileOutputFormat.setCompressOutput(job, false);
    } else {
        FileOutputFormat.setCompressOutput(job, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language));
    //job.setOutputValueClass(EnglishWikipediaPage.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    return job.waitForCompletion(true) ? 0 : -1;

}

From source file:com.yahoo.semsearch.fastlinking.io.WikipediaDocnoMappingBuilder.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file")
            .create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr|it").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);

    String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikipediaDocnoMappingBuilder.class);
    job.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));

    job.getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(tmpPath), true);

    if (job.waitForCompletion(true)) {

        //         long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue();
        long cnt = job.getCounters()
                .findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue();
        WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-r-00000",
                (int) cnt, outputFile);
        FileSystem.get(getConf()).delete(new Path(tmpPath), true);
        return 0;

    } else {
        return -1;
    }
}
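
The counter lookup above addresses the reduce output record counter by its old string group name ("org.apache.hadoop.mapred.Task$Counter"). On Hadoop 2.x the same counter is exposed through the TaskCounter enum; a sketch of the equivalent lookup:

// Enum-based lookup of the same reduce output record counter (Hadoop 2.x),
// assuming an import of org.apache.hadoop.mapreduce.TaskCounter:
long cnt = job.getCounters().findCounter(TaskCounter.REDUCE_OUTPUT_RECORDS).getValue();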

From source file:de.bankmark.bigbench.queries.q18.MRlinearRegression.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    int NUMBER_REDUCERS = 1;
    Job job = Job.getInstance(getConf());

    job.setJarByClass(MRlinearRegression.class);
    if (args.length != 2) {
        usage(job);
        return 2;
    }
    System.out.println("input:");
    job.setJobName(MRlinearRegression.class.getSimpleName() + "::" + args[0] + "->" + args[1]);

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    System.out.println("Input: " + input + "  out -> " + output);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(MRlinearRegression.LRmapper.class);
    job.setReducerClass(MRlinearRegression.LRreducer.class);
    job.setNumReduceTasks(NUMBER_REDUCERS);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleArrayWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.bankmark.bigbench.queries.q28.ToSequenceFile.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(getConf());

    job.setJarByClass(ToSequenceFile.class);
    if (args.length != 2) {
        usage(job);
        return 2;
    }
    System.out.println("input:");
    job.setJobName(ToSequenceFile.class.getSimpleName() + "::" + args[0] + "->" + args[1]);

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    System.out.println("Input: " + input + "  out -> " + output);
    FileInputFormat.addInputPath(job, input);
    SequenceFileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(Reducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.l3s.content.timex.extracting.WikiTimex.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = "en"; // Assume 'en' by default.
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - language: " + language);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikiTimex.class);
    job.setJobName(String.format("CountWikipediaPages[%s: %s, %s: %s]", INPUT_OPTION, inputPath,
            LANGUAGE_OPTION, language));

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    job.setMapperClass(TMapper.class);

    job.waitForCompletion(true);

    return 0;
}
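
Unlike most jobs on this page, this run() ignores the boolean returned by waitForCompletion and always returns 0. A driver that should report job failure through ToolRunner would typically propagate it instead, for example:

// Sketch: propagate job success or failure as the tool's exit code.
return job.waitForCompletion(true) ? 0 : 1;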

From source file:de.tudarmstadt.lt.wiki.statistics.ResourceInlinkCount.java

public static boolean runJob1(String inDir, String outDir) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapred.child.java.opts", "-Xmx1200M");
    conf.set("mapred.job.map.memory.mb", "1280");
    conf.set("mapreduce.job.queuename", "smalljob");
    Job job = Job.getInstance(conf);
    job.setJarByClass(ResourceInlinkCount.class);
    FileInputFormat.addInputPath(job, new Path(inDir));
    FileOutputFormat.setOutputPath(job, new Path(outDir));
    job.setMapperClass(Map.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    return job.waitForCompletion(true);
}
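
The memory settings in runJob1 use the old mapred.* property names. On Hadoop 2.x / YARN the corresponding keys are mapreduce.map.java.opts and mapreduce.map.memory.mb; a sketch of the same settings under the newer names (assuming a YARN cluster):

// Newer property names for the same per-map-task memory settings.
conf.set("mapreduce.map.java.opts", "-Xmx1200M");
conf.setInt("mapreduce.map.memory.mb", 1280);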

From source file:de.tudarmstadt.lt.wiki.statistics.ResourceInlinkCount.java

public static boolean runJob2(String inDir, String outDir) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapreduce.job.queuename", "smalljob");
    Job job = Job.getInstance(conf);
    job.setJarByClass(ResourceInlinkCount.class);
    FileInputFormat.addInputPath(job, new Path(inDir));
    FileOutputFormat.setOutputPath(job, new Path(outDir));
    job.setMapperClass(Map2.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    return job.waitForCompletion(true);
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase1FullJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    // set from the command line

    job.setJarByClass(Phase1FullJob.class);
    job.setJobName(Phase1FullJob.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(SimpleWarcWriterReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
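
All of the run(String[]) methods quoted on this page come from classes implementing org.apache.hadoop.util.Tool, which is where getConf() originates, and they are normally launched through ToolRunner. A minimal driver sketch (this main method is not part of the quoted sources; it only makes the getConf() / Job.getInstance(getConf()) pattern explicit):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class Phase1FullJobDriver {
    public static void main(String[] args) throws Exception {
        // ToolRunner parses the generic Hadoop options (-D, -files, ...) and hands
        // the resulting Configuration to the Tool, whose run() reads it via getConf()
        // and passes it to Job.getInstance(getConf()).
        int exitCode = ToolRunner.run(new Configuration(), new Phase1FullJob(), args);
        System.exit(exitCode);
    }
}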