Example usage for org.apache.hadoop.conf Configuration setLong

List of usage examples for org.apache.hadoop.conf Configuration setLong

Introduction

On this page you can find example usage for org.apache.hadoop.conf Configuration setLong.

Prototype

public void setLong(String name, long value) 

Document

Set the value of the name property to a long.
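
For reference, here is a minimal, self-contained sketch of the call (not taken from the listings below; the property name "demo.max.records" and the class name SetLongDemo are illustrative):

import org.apache.hadoop.conf.Configuration;

public class SetLongDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store a long value under the given property name.
        conf.setLong("demo.max.records", 5000000L);

        // Read it back; the second argument is the default returned when the property is unset.
        long maxRecords = conf.getLong("demo.max.records", 0L);
        System.out.println("demo.max.records = " + maxRecords);
    }
}

The matching getLong(name, defaultValue) accessor returns the stored value, falling back to the given default when the property has not been set.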

Usage

From source file:nl.cwi.kba2013.apps.chunk_stream_DocExtractor.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];

            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
                Log.info("TrainTest");
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "chunk -stream");
    job.setJarByClass(chunk_stream_DocExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the train/test file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);

    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);
    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;
    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    return status;

}

From source file:nl.cwi.kba2013.apps.FeatureExtractor.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String systemdescription_short = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-ds".equals(args[i])) {
                systemdescription_short = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];

            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];

            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";
    if (systemdescription_short == null)
        systemdescription_short = "a two -step classification approach";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short,
            corpus_id);

    Map<String, String> Attr = new LinkedHashMap<String, String>();
    // Attr.put("trec-kba", "");
    /*
     * Attr.put("LengthTitle", ""); Attr.put("LengthBody", "");
     * Attr.put("LengthAnchor", ""); Attr.put("Source", "");
     * Attr.put("English", ""); Attr.put("MentionsTitle", "");
     * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", "");
     * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread",
     * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", "");
     * Attr.put("SpreadNorm", ""); // Attr.put("Related", "");
     * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", "");
     * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld",
     * ""); Attr.put("partial", ""); Attr.put("s_form", "");
     * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos",
     * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class",
     * "");
     */
    Attr.put("gcld", "0");
    Attr.put("jac", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("ppr", "0");
    Attr.put("s_form", "0");
    Attr.put("contxR", "0");
    Attr.put("contxL", "0");
    Attr.put("FirstPos", "0");
    Attr.put("LastPos", "0");
    Attr.put("LengthBody", "0");
    Attr.put("FirstPosNorm", "0");
    Attr.put("MentionsBody", "0");
    Attr.put("RelatedBody", "0");
    Attr.put("Spread", "0");
    Attr.put("LastPosNorm", "0");
    Attr.put("SpreadNorm", "0");
    Attr.put("LengthAnchor", "0");
    Attr.put("Source", "0");
    Attr.put("LengthTitle", "0");
    Attr.put("partial", "0");
    Attr.put("MentionsAnchor", "0");
    Attr.put("Relatedtitle", "0");
    Attr.put("English", "0");
    Attr.put("RelatedAnchor", "0");
    Attr.put("MentionsTitle", "0");
    Attr.put("Class", "0");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(FeatureExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Run_info.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }

    }
    System.out.println("\n@DATA");
    //      Text line = new Text();
    //      LineReader reader = new LineReader(fs.open(new Path(out
    //            + "/part-r-00000")));
    //      for (int i = 0; i < num_filter_results; i++) {
    //         reader.readLine(line);
    //         System.out.println(line.toString().split("\t\t")[1]);
    //      }

    System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba2013.apps.FeatureExtractor_DocExtractor.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String systemdescription_short = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-ds".equals(args[i])) {
                systemdescription_short = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];

            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];

            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";
    if (systemdescription_short == null)
        systemdescription_short = "a two -step classification approach";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short,
            corpus_id);

    Map<String, String> Attr = new LinkedHashMap<String, String>();
    // Attr.put("trec-kba", "");
    /*
     * Attr.put("LengthTitle", ""); Attr.put("LengthBody", "");
     * Attr.put("LengthAnchor", ""); Attr.put("Source", "");
     * Attr.put("English", ""); Attr.put("MentionsTitle", "");
     * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", "");
     * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread",
     * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", "");
     * Attr.put("SpreadNorm", ""); // Attr.put("Related", "");
     * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", "");
     * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld",
     * ""); Attr.put("partial", ""); Attr.put("s_form", "");
     * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos",
     * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class",
     * "");
     */
    Attr.put("gcld", "0");
    Attr.put("jac", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("ppr", "0");
    Attr.put("s_form", "0");
    Attr.put("contxR", "0");
    Attr.put("contxL", "0");
    Attr.put("FirstPos", "0");
    Attr.put("LastPos", "0");
    Attr.put("LengthBody", "0");
    Attr.put("FirstPosNorm", "0");
    Attr.put("MentionsBody", "0");
    Attr.put("RelatedBody", "0");
    Attr.put("Spread", "0");
    Attr.put("LastPosNorm", "0");
    Attr.put("SpreadNorm", "0");
    Attr.put("LengthAnchor", "0");
    Attr.put("Source", "0");
    Attr.put("LengthTitle", "0");
    Attr.put("partial", "0");
    Attr.put("MentionsAnchor", "0");
    Attr.put("Relatedtitle", "0");
    Attr.put("English", "0");
    Attr.put("RelatedAnchor", "0");
    Attr.put("MentionsTitle", "0");
    Attr.put("Class", "0");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString());
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(FeatureExtractor_DocExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());
    /*
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
            
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
            
    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);
            
    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
            
    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;
    */

    //job.setInputFormatClass(ThriftFileInputFormat.class);
    //job.setInputFormatClass(TextOutputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(StreamItemWritable.class);
    //job.setMapOutputValueClass(Text.class);
    //job.setOutputKeyClass(StreamItemWritable.class);

    // job.setCombinerClass(MyReducer.class);
    //job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Run_info.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }

    }
    System.out.println("\n@DATA");
    // Text line = new Text();
    // LineReader reader = new LineReader(fs.open(new Path(out
    // + "/part-r-00000")));
    // for (int i = 0; i < num_filter_results; i++) {
    // reader.readLine(line);
    // System.out.println(line.toString().split("\t\t")[1]);
    // }

    System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba2013.apps.KBaDocExtractorFromCleansed.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String traintestFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-t".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    Configuration conf = getConf();
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Annotation Extraction");
    job.setJarByClass(KBaDocExtractorFromCleansed.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);

    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StreamItemWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();

    return status;

}

From source file:nl.cwi.kba2013.apps.KbaExtractMissingFromRaw.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;

    String labelsFile = null;

    String traintestFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-t".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());

    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Missing Extractor");
    job.setJarByClass(KbaExtractMissingFromRaw.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());
    //let's see it crushing again

    //job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    //job.setReducerClass(MyReducer.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    int status = job.waitForCompletion(true) ? 0 : 1;

    return status;

}

From source file:nl.cwi.kba2013.apps.KbaNameVariantMatch.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;

    String labelsFile = null;

    String traintestFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-t".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());

    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KbaNameVariantMatch.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    //job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;

    return status;

}

From source file:nl.cwi.kba2013.apps.KbaNameVariantMatchFeatureExtractor.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String systemdescription_short = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-ds".equals(args[i])) {
                systemdescription_short = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];

            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];

            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";
    if (systemdescription_short == null)
        systemdescription_short = "a two -step classification approach";

    //      LOG.info("Tool: " + this.getClass().getName());
    //      LOG.info(" - input path: " + in);
    //      LOG.info(" - output path: " + out);
    //      LOG.info(" - runtag: " + runtag);
    //      LOG.info(" - teamname: " + teamname);
    //      LOG.info(" - corpus_id: " + corpus_id);
    //      LOG.info(" - run description: " + systemdescription);

    Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short,
            corpus_id);

    Map<String, String> Attr = new LinkedHashMap<String, String>();
    // Attr.put("trec-kba", "");
    /*
     * Attr.put("LengthTitle", ""); Attr.put("LengthBody", "");
     * Attr.put("LengthAnchor", ""); Attr.put("Source", "");
     * Attr.put("English", ""); Attr.put("MentionsTitle", "");
     * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", "");
     * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread",
     * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", "");
     * Attr.put("SpreadNorm", ""); // Attr.put("Related", "");
     * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", "");
     * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld",
     * ""); Attr.put("partial", ""); Attr.put("s_form", "");
     * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos",
     * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class",
     * "");
     */
    Attr.put("gcld", "0");
    Attr.put("jac", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("ppr", "0");
    Attr.put("s_form", "0");
    Attr.put("contxR", "0");
    Attr.put("contxL", "0");
    Attr.put("FirstPos", "0");
    Attr.put("LastPos", "0");
    Attr.put("LengthBody", "0");
    Attr.put("FirstPosNorm", "0");
    Attr.put("MentionsBody", "0");
    Attr.put("RelatedBody", "0");
    Attr.put("Spread", "0");
    Attr.put("LastPosNorm", "0");
    Attr.put("SpreadNorm", "0");
    Attr.put("LengthAnchor", "0");
    Attr.put("Source", "0");
    Attr.put("LengthTitle", "0");
    Attr.put("partial", "0");
    Attr.put("MentionsAnchor", "0");
    Attr.put("Relatedtitle", "0");
    Attr.put("English", "0");
    Attr.put("RelatedAnchor", "0");
    Attr.put("MentionsTitle", "0");
    Attr.put("Class", "0");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString());
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KbaNameVariantMatchFeatureExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());
    /*
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
            
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
            
    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);
            
    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
            
    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;
    */

    //job.setInputFormatClass(ThriftFileInputFormat.class);
    //job.setInputFormatClass(TextOutputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(StreamItemWritable.class);
    //job.setMapOutputValueClass(Text.class);
    //job.setOutputKeyClass(StreamItemWritable.class);

    // job.setCombinerClass(MyReducer.class);
    //job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Run_info.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }

    }
    System.out.println("\n@DATA");
    // Text line = new Text();
    // LineReader reader = new LineReader(fs.open(new Path(out
    // + "/part-r-00000")));
    // for (int i = 0; i < num_filter_results; i++) {
    // reader.readLine(line);
    // System.out.println(line.toString().split("\t\t")[1]);
    // }

    System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba2013.apps.KBANameVariantMatchTHERank.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    String labelsFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];

            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || labelsFile == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KBANameVariantMatchTHERank.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the labels file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    //job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(50);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;

    return status;

}

From source file:nl.cwi.kba2013.apps.KBANER.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    String labelsFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];

            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || labelsFile == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());

    // raise the task timeout (40 * 600000 ms, about 6.7 hours) and the child JVM heap limits
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KBANER.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the labels file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    //job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;

    return status;

}

From source file:nl.cwi.kba2013.apps.ReadGzip.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];

            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    // raise the task timeout (40 * 600000 ms, about 6.7 hours)
    conf.setLong("mapred.task.timeout", 40 * 600000);

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(ReadGzip.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    return status;

}