Example usage for java.util HashMap containsKey

Introduction

In this page you can find the example usage for java.util HashMap containsKey.

Prototype

public boolean containsKey(Object key)

Source Link

Document

Returns true if this map contains a mapping for the specified key.

Usage

From source file:com.projity.pm.graphic.gantt.Main.java

public static void main(String[] args) {
    System.setProperty("com.apple.mrj.application.apple.menu.about.name",
            Environment.getStandAlone() ? "OpenProj" : "Project-ON-Demand");
    System.setProperty("apple.laf.useScreenMenuBar", "true");
    Locale.setDefault(ConfigurationFile.getLocale());
    HashMap opts = ApplicationStartupFactory.extractOpts(args);
    String osName = System.getProperty("os.name").toLowerCase();
    if (osName.startsWith("linux")) {
        String javaExec = ConfigurationFile.getRunProperty("JAVA_EXE");
        //check jvm
        String javaVersion = System.getProperty("java.version");
        if (Environment.compareJavaVersion(javaVersion, "1.5") < 0) {
            String message = Messages.getStringWithParam("Text.badJavaVersion", javaVersion);
            if (javaExec != null && javaExec.length() > 0)
                message += "\n" + Messages.getStringWithParam("Text.javaExecutable",
                        new Object[] { javaExec, "JAVA_EXE", "$HOME/.openproj/run.conf" });
            if (!opts.containsKey("silentlyFail"))
                JOptionPane.showMessageDialog(null, message, Messages.getContextString("Title.ProjityError"),
                        JOptionPane.ERROR_MESSAGE);
            System.exit(64);// w  w  w.  j  a v  a  2 s.  c o  m
        }
        String javaVendor = System.getProperty("java.vendor");
        if (javaVendor == null || !(javaVendor.startsWith("Sun") || javaVendor.startsWith("IBM"))) {
            String message = Messages.getStringWithParam("Text.badJavaVendor", javaVendor);
            if (javaExec != null && javaExec.length() > 0)
                message += "\n" + Messages.getStringWithParam("Text.javaExecutable",
                        new Object[] { javaExec, "JAVA_EXE", "$HOME/.openproj/run.conf" });
            if (!opts.containsKey("silentlyFail"))
                JOptionPane.showMessageDialog(null, message, Messages.getContextString("Title.ProjityError"),
                        JOptionPane.ERROR_MESSAGE);
            System.exit(64);
        }
    }

    boolean newLook = false;
    //      HashMap opts = ApplicationStartupFactory.extractOpts(args); // allow setting menu look on command line - primarily for testing or webstart args
    log.info(opts);
    //      newLook = opts.get("menu") == null;

    Environment.setNewLook(newLook);
    //      if (!Environment.isNewLaf()) {
    //         try {
    //            UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
    //         } catch (Exception e) {
    //         }
    //      }
    ApplicationStartupFactory startupFactory = new ApplicationStartupFactory(opts); //put before to initialize standalone flag
    mainFrame = new MainFrame(Messages.getContextString("Text.ApplicationTitle"), null, null);
    boolean doWelcome = true; // to do see if project param exists in args
    startupFactory.instanceFromNewSession(mainFrame, doWelcome);
}

From source file:enrichment.Disambiguate.java

/**prerequisites:
 * cd silk_2.5.3/*_links/*from   ww w .jav a  2 s  .co  m*/
 * cat *.nt|sort  -t' ' -k3   > $filename
 * 
 * @param args $filename
 * @throws IOException
 * @throws URISyntaxException
 */
public static void main(String[] args) {
    File file = new File(args[0]);
    if (file.isDirectory()) {
        args = file.list(new OnlyExtFilenameFilter("nt"));
    }

    BufferedReader in;
    for (int q = 0; q < args.length; q++) {
        String filename = null;
        if (file.isDirectory()) {
            filename = file.getPath() + File.separator + args[q];
        } else {
            filename = args[q];
        }
        try {
            FileWriter output = new FileWriter(filename + "_disambiguated.nt");
            String prefix = "@prefix rdrel: <http://rdvocab.info/RDARelationshipsWEMI/> .\n"
                    + "@prefix dbpedia:    <http://de.dbpedia.org/resource/> .\n"
                    + "@prefix frbr:   <http://purl.org/vocab/frbr/core#> .\n"
                    + "@prefix lobid: <http://lobid.org/resource/> .\n"
                    + "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n"
                    + "@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n"
                    + "@prefix mo: <http://purl.org/ontology/mo/> .\n"
                    + "@prefix wikipedia: <https://de.wikipedia.org/wiki/> .";
            output.append(prefix + "\n\n");
            in = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));

            HashMap<String, HashMap<String, ArrayList<String>>> hm = new HashMap<String, HashMap<String, ArrayList<String>>>();
            String s;
            HashMap<String, ArrayList<String>> hmLobid = new HashMap<String, ArrayList<String>>();
            Stack<String> old_object = new Stack<String>();

            while ((s = in.readLine()) != null) {
                String[] triples = s.split(" ");
                String object = triples[2].substring(1, triples[2].length() - 1);
                if (old_object.size() > 0 && !old_object.firstElement().equals(object)) {
                    hmLobid = new HashMap<String, ArrayList<String>>();
                    old_object = new Stack<String>();
                }
                old_object.push(object);
                String subject = triples[0].substring(1, triples[0].length() - 1);
                System.out.print("\nSubject=" + object);
                System.out.print("\ntriples[2]=" + triples[2]);
                hmLobid.put(subject, getAllCreators(new URI(subject)));
                hm.put(object, hmLobid);

            }
            // get all dbpedia resources
            for (String key_one : hm.keySet()) {
                System.out.print("\n==============\n==== " + key_one + "\n===============");
                int resources_cnt = hm.get(key_one).keySet().size();
                ArrayList<String>[] creators = new ArrayList[resources_cnt];
                HashMap<String, Integer> creators_backed = new HashMap<String, Integer>();
                int x = 0;
                // get all lobid_resources subsumed under the dbpedia resource
                for (String subject_uri : hm.get(key_one).keySet()) {
                    creators[x] = new ArrayList<String>();
                    System.out.print("\n     subject_uri=" + subject_uri);
                    Iterator<String> ite = hm.get(key_one).get(subject_uri).iterator();
                    int y = 0;
                    // get all creators of the lobid resource
                    while (ite.hasNext()) {
                        String creator = ite.next();
                        System.out.print("\n          " + creator);
                        if (creators_backed.containsKey(creator)) {
                            y = creators_backed.get(creator);
                        } else {
                            y = creators_backed.size();
                            creators_backed.put(creator, y);
                        }
                        while (creators[x].size() <= y) {
                            creators[x].add("-");
                        }
                        creators[x].set(y, creator);
                        y++;
                    }
                    x++;
                }
                if (creators_backed.size() == 1) {
                    System.out
                            .println("\n" + "Every resource pointing to " + key_one + " has the same creator!");
                    for (String key_two : hm.get(key_one).keySet()) {
                        output.append("<" + key_two + "> rdrel:workManifested <" + key_one + "> .\n");
                        output.append("<" + key_two + ">  mo:wikipedia <"
                                + key_one.replaceAll("dbpedia\\.org/resource", "wikipedia\\.org/wiki")
                                + "> .\n");
                    }
                } /*else  {
                    for (int a = 0; a < creators.length; a++) {
                       System.out.print(creators[a].toString()+",");
                    }
                  }*/
            }

            output.flush();
            if (output != null) {
                output.close();
            }
        } catch (Exception e) {
            System.out.print("Exception while working on " + filename + ": \n");
            e.printStackTrace(System.out);
        }
    }
}

From source file:com.thed.zapi.cloud.sample.CycleExecutionReportByVersion.java

public static void main(String[] args) throws JSONException, URISyntaxException, ParseException, IOException {

    String API_GET_EXECUTIONS = "{SERVER}/public/rest/api/1.0/executions/search/cycle/";
    String API_GET_CYCLES = "{SERVER}/public/rest/api/1.0/cycles/search?";
    // Delimiter used in CSV file
    final String NEW_LINE_SEPARATOR = "\n";
    final String fileName = "F:\\cycleExecutionReport.csv";

    /** Declare JIRA,Zephyr URL,access and secret Keys */
    // JIRA Cloud URL of the instance
    String jiraBaseURL = "https://demo.atlassian.net";
    // Replace zephyr baseurl <ZAPI_Cloud_URL> shared with the user for ZAPI Cloud Installation
    String zephyrBaseUrl = "<ZAPI_Cloud_URL>";
    // zephyr accessKey , we can get from Addons >> zapi section
    String accessKey = "YjE2MjdjMGEtNzExNy0zYjY1LWFkMzQtNjcwMDM3OTljFkbWluIGFkbWlu";
    // zephyr secretKey , we can get from Addons >> zapi section
    String secretKey = "qufnbimi96Ob2hq3ISF08yZ8Qw4c1eHGeGlk";

    /** Declare parameter values here */
    String userName = "admin";
    String versionId = "-1";
    String projectId = "10100";
    String projectName = "Support";
    String versionName = "Unscheduled";

    ZFJCloudRestClient client = ZFJCloudRestClient.restBuilder(zephyrBaseUrl, accessKey, secretKey, userName)
            .build();//  w ww.  j av  a  2 s .co m
    /**
     * Get List of Cycles by Project and Version
     */

    final String getCyclesUri = API_GET_CYCLES.replace("{SERVER}", zephyrBaseUrl) + "projectId=" + projectId
            + "&versionId=" + versionId;

    Map<String, String> cycles = getCyclesByProjectVersion(getCyclesUri, client, accessKey);
    // System.out.println("cycles :"+ cycles.toString());

    /**
     * Iterating over the Cycles and writing the report to CSV
     * 
     */

    FileWriter fileWriter = null;
    System.out.println("Writing CSV file.....");
    try {
        fileWriter = new FileWriter(fileName);

        // Write the CSV file header

        fileWriter.append("Cycle Execution Report By Version and Project");
        fileWriter.append(NEW_LINE_SEPARATOR);
        fileWriter.append("PROJECT:" + "," + projectName);
        fileWriter.append(NEW_LINE_SEPARATOR);
        fileWriter.append("VERSION:" + "," + versionName);
        fileWriter.append(NEW_LINE_SEPARATOR);

        JSONArray executions;
        int totalUnexecutedCount = 0;
        int totalExecutionCount = 0;

        for (String key : cycles.keySet()) {
            int executionCount = 0;
            int unexecutedCount = 0;
            final String getExecutionsUri = API_GET_EXECUTIONS.replace("{SERVER}", zephyrBaseUrl) + key
                    + "?projectId=" + projectId + "&versionId=" + versionId;
            fileWriter.append("Cycle:" + "," + cycles.get(key));
            fileWriter.append(NEW_LINE_SEPARATOR);
            executions = getExecutionsByCycleId(getExecutionsUri, client, accessKey);
            // System.out.println("executions :" + executions.toString());

            HashMap<String, Integer> counter = new HashMap<String, Integer>();

            String[] statusName = new String[executions.length()];
            for (int i = 0; i < executions.length(); i++) {
                JSONObject executionObj = executions.getJSONObject(i).getJSONObject("execution");
                // System.out.println("executionObj
                // "+executionObj.toString());
                JSONObject statusObj = executionObj.getJSONObject("status");
                // System.out.println("statusObj :"+statusObj.toString());
                statusName[i] = statusObj.getString("name");
            }

            if (statusName.length != 0) {
                // System.out.println(statusName.toString());
                for (String a : statusName) {
                    if (counter.containsKey(a)) {
                        int oldValue = counter.get(a);
                        counter.put(a, oldValue + 1);
                    } else {
                        counter.put(a, 1);
                    }
                }
                for (String status : counter.keySet()) {
                    fileWriter.append(" " + "," + " " + "," + status + "," + counter.get(status));
                    fileWriter.append(NEW_LINE_SEPARATOR);
                    if (status.equalsIgnoreCase("UNEXECUTED")) {
                        unexecutedCount += counter.get(status);
                    } else {
                        executionCount += counter.get(status);
                    }

                }
            }
            totalExecutionCount += executionCount;
            totalUnexecutedCount += unexecutedCount;

            fileWriter.append(NEW_LINE_SEPARATOR);
        }

        fileWriter.append(NEW_LINE_SEPARATOR);
        fileWriter.append("TOTAL CYCLES:" + "," + cycles.size());
        fileWriter.append(NEW_LINE_SEPARATOR);
        fileWriter.append("TOTAL EXECUTIONS:" + "," + totalExecutionCount);
        fileWriter.append(NEW_LINE_SEPARATOR);
        fileWriter.append("TOTAL ASSIGNED:" + "," + (totalUnexecutedCount + totalExecutionCount));

        System.out.println("CSV file was created successfully !!!");
    } catch (Exception e) {
        System.out.println("Error in CsvFileWriter !!!");
        e.printStackTrace();
    } finally {
        try {
            fileWriter.flush();
            fileWriter.close();
        } catch (IOException e) {
            System.out.println("Error while flushing/closing fileWriter !!!");
            e.printStackTrace();
        }
    }

}

From source file:edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques.java

/**
 * @param args//w  ww  .  j a v  a2s . c o  m
 * @throws ParseException 
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the relationship " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option g1Option = new Option("g1", "first-group", true, "set first group of datasets");
    g1Option.setRequired(true);
    g1Option.setArgName("FIRST GROUP");
    g1Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g1Option);

    Option g2Option = new Option("g2", "second-group", true, "set second group of datasets");
    g2Option.setRequired(false);
    g2Option.setArgName("SECOND GROUP");
    g2Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g2Option);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar "
                        + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar "
                        + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> firstGroup = new ArrayList<String>();
    ArrayList<String> secondGroup = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");

    String[] firstGroupCmd = cmd.getOptionValues("g1");
    String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : new String[0];
    addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);
    addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    if (firstGroup.isEmpty()) {
        System.out.println("First group of datasets (G1) is empty. " + "Doing G1 = G2.");
        firstGroup.addAll(secondGroup);
    }

    if (secondGroup.isEmpty()) {
        System.out.println("Second group of datasets (G2) is empty. " + "Doing G2 = G1.");
        secondGroup.addAll(firstGroup);
    }

    // getting dataset ids

    String datasetNames = "";
    String datasetIds = "";
    HashMap<String, String> datasetId = new HashMap<String, String>();
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        datasetId.put(it.next(), null);
    }

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    String firstGroupStr = "";
    String secondGroupStr = "";
    for (String dataset : firstGroup) {
        firstGroupStr += datasetId.get(dataset) + ",";
    }
    for (String dataset : secondGroup) {
        secondGroupStr += datasetId.get(dataset) + ",";
    }
    firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1);
    secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1);

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.correlationTechniquesDir, s3conf, s3);

    String dataAttributesInputDirs = "";
    String noRelationship = "";

    HashSet<String> dirs = new HashSet<String>();

    String dataset1;
    String dataset2;
    String datasetId1;
    String datasetId2;
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            datasetId1 = datasetId.get(dataset1);
            datasetId2 = datasetId.get(dataset2);

            if (dataset1.equals(dataset2))
                continue;
            String correlationOutputFileName = s3bucket + FrameworkUtils.correlationTechniquesDir + "/"
                    + dataset1 + "-" + dataset2 + "/";

            if (removeExistingFiles) {
                FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3);
            }
            if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) {
                dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset1);
                dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset2);
            } else {
                noRelationship += datasetId1 + "-" + datasetId2 + ",";
            }
        }
    }

    if (dirs.isEmpty()) {
        System.out.println("All the relationships were already computed.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    for (String dir : dirs) {
        dataAttributesInputDirs += dir + ",";
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "correlation";
    String correlationOutputDir = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/";

    FrameworkUtils.removeFile(correlationOutputDir, s3conf, s3);

    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i)));
    }
    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size",
                Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length));
    }
    conf.set("dataset-keys", datasetIds);
    conf.set("dataset-names", datasetNames);
    conf.set("first-group", firstGroupStr);
    conf.set("second-group", secondGroupStr);
    conf.set("main-dataset-id", datasetId.get(shortDataset.get(0)));
    if (noRelationship.length() > 0) {
        conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1));
    }

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");
    conf.set("mapreduce.task.timeout", "2400000");

    if (s3) {
        machineConf.setMachineConfiguration(conf);
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        conf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    Job job = new Job(conf);
    job.setJobName(jobName);

    job.setMapOutputKeyClass(PairAttributeWritable.class);
    job.setMapOutputValueClass(SpatioTemporalValueWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CorrelationTechniquesMapper.class);
    job.setReducerClass(CorrelationTechniquesReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job,
            dataAttributesInputDirs.substring(0, dataAttributesInputDirs.length() - 1));
    FileOutputFormat.setOutputPath(job, new Path(correlationOutputDir));

    job.setJarByClass(CorrelationTechniques.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            if (dataset1.equals(dataset2))
                continue;

            String from = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/" + dataset1 + "-"
                    + dataset2 + "/";
            String to = s3bucket + FrameworkUtils.correlationTechniquesDir + "/" + dataset1 + "-" + dataset2
                    + "/";
            FrameworkUtils.renameFile(from, to, s3conf, s3);
        }
    }
}

From source file:edu.nyu.vida.data_polygamy.relationship_computation.Relationship.java

/**
 * @param args//from   www  . java  2 s  .c  o m
 * @throws ParseException 
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the relationship " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option scoreOption = new Option("sc", "score", true, "set threhsold for relationship score");
    scoreOption.setRequired(false);
    scoreOption.setArgName("SCORE THRESHOLD");
    options.addOption(scoreOption);

    Option strengthOption = new Option("st", "strength", true, "set threhsold for relationship strength");
    strengthOption.setRequired(false);
    strengthOption.setArgName("STRENGTH THRESHOLD");
    options.addOption(strengthOption);

    Option completeRandomizationOption = new Option("c", "complete-randomization", false,
            "use complete randomization when performing significance tests");
    completeRandomizationOption.setRequired(false);
    options.addOption(completeRandomizationOption);

    Option idOption = new Option("id", "ids", false, "output id instead of names for datasets and attributes");
    idOption.setRequired(false);
    options.addOption(idOption);

    Option g1Option = new Option("g1", "first-group", true, "set first group of datasets");
    g1Option.setRequired(true);
    g1Option.setArgName("FIRST GROUP");
    g1Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g1Option);

    Option g2Option = new Option("g2", "second-group", true, "set second group of datasets");
    g2Option.setRequired(false);
    g2Option.setArgName("SECOND GROUP");
    g2Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g2Option);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    Option removeOption = new Option("r", "remove-not-significant", false,
            "remove relationships that are not" + "significant from the final output");
    removeOption.setRequired(false);
    options.addOption(removeOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> firstGroup = new ArrayList<String>();
    ArrayList<String> secondGroup = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();

    boolean removeNotSignificant = cmd.hasOption("r");
    boolean removeExistingFiles = cmd.hasOption("f");
    boolean completeRandomization = cmd.hasOption("c");
    boolean hasScoreThreshold = cmd.hasOption("sc");
    boolean hasStrengthThreshold = cmd.hasOption("st");
    boolean outputIds = cmd.hasOption("id");
    String scoreThreshold = hasScoreThreshold ? cmd.getOptionValue("sc") : "";
    String strengthThreshold = hasStrengthThreshold ? cmd.getOptionValue("st") : "";

    // all datasets
    ArrayList<String> all_datasets = new ArrayList<String>();
    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        all_datasets.add(line.split("\t")[0]);
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();
    String[] all_datasets_array = new String[all_datasets.size()];
    all_datasets.toArray(all_datasets_array);

    String[] firstGroupCmd = cmd.getOptionValues("g1");
    String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : all_datasets_array;
    addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);
    addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    if (firstGroup.isEmpty()) {
        System.out.println("No indices from datasets in G1.");
        System.exit(0);
    }

    if (secondGroup.isEmpty()) {
        System.out.println("No indices from datasets in G2.");
        System.exit(0);
    }

    // getting dataset ids

    String datasetNames = "";
    String datasetIds = "";
    HashMap<String, String> datasetId = new HashMap<String, String>();
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        datasetId.put(it.next(), null);
    }

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        all_datasets.add(dt[0]);
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    String firstGroupStr = "";
    String secondGroupStr = "";
    for (String dataset : firstGroup) {
        firstGroupStr += datasetId.get(dataset) + ",";
    }
    for (String dataset : secondGroup) {
        secondGroupStr += datasetId.get(dataset) + ",";
    }
    firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1);
    secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1);

    String relationshipsDir = "";
    if (outputIds) {
        relationshipsDir = FrameworkUtils.relationshipsIdsDir;
    } else {
        relationshipsDir = FrameworkUtils.relationshipsDir;
    }

    FrameworkUtils.createDir(s3bucket + relationshipsDir, s3conf, s3);

    String random = completeRandomization ? "complete" : "restricted";

    String indexInputDirs = "";
    String noRelationship = "";

    HashSet<String> dirs = new HashSet<String>();

    String dataset1;
    String dataset2;
    String datasetId1;
    String datasetId2;
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            datasetId1 = datasetId.get(dataset1);
            datasetId2 = datasetId.get(dataset2);

            if (dataset1.equals(dataset2))
                continue;
            String correlationOutputFileName = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2
                    + "/";

            if (removeExistingFiles) {
                FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3);
            }
            if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) {
                dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset1);
                dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset2);
            } else {
                noRelationship += datasetId1 + "-" + datasetId2 + ",";
            }
        }
    }

    if (dirs.isEmpty()) {
        System.out.println("All the relationships were already computed.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    for (String dir : dirs) {
        indexInputDirs += dir + ",";
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "relationship" + "-" + random;
    String relationshipOutputDir = s3bucket + relationshipsDir + "/tmp/";

    FrameworkUtils.removeFile(relationshipOutputDir, s3conf, s3);

    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i)));
    }
    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size",
                Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length));
    }
    conf.set("dataset-keys", datasetIds);
    conf.set("dataset-names", datasetNames);
    conf.set("first-group", firstGroupStr);
    conf.set("second-group", secondGroupStr);
    conf.set("complete-random", String.valueOf(completeRandomization));
    conf.set("output-ids", String.valueOf(outputIds));
    conf.set("complete-random-str", random);
    conf.set("main-dataset-id", datasetId.get(shortDataset.get(0)));
    conf.set("remove-not-significant", String.valueOf(removeNotSignificant));
    if (noRelationship.length() > 0) {
        conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1));
    }
    if (hasScoreThreshold) {
        conf.set("score-threshold", scoreThreshold);
    }
    if (hasStrengthThreshold) {
        conf.set("strength-threshold", strengthThreshold);
    }

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");
    conf.set("mapreduce.task.timeout", "2400000");

    if (s3) {
        machineConf.setMachineConfiguration(conf);
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        conf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    Job job = new Job(conf);
    job.setJobName(jobName);

    job.setMapOutputKeyClass(PairAttributeWritable.class);
    job.setMapOutputValueClass(TopologyTimeSeriesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CorrelationMapper.class);
    job.setReducerClass(CorrelationReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, indexInputDirs.substring(0, indexInputDirs.length() - 1));
    FileOutputFormat.setOutputPath(job, new Path(relationshipOutputDir));

    job.setJarByClass(Relationship.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            if (dataset1.equals(dataset2))
                continue;

            String from = s3bucket + relationshipsDir + "/tmp/" + dataset1 + "-" + dataset2 + "/";
            String to = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/";
            FrameworkUtils.renameFile(from, to, s3conf, s3);
        }
    }
}

From source file:edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation.java

/**
 * @param args/* w w  w  .j  ava 2  s. co m*/
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the aggregate functions " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option gOption = new Option("g", "group", true, "set group of datasets for which the aggregate functions"
            + " will be computed, followed by their temporal and spatial attribute indices");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";
    String preProcessingDatasets = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetAggregation = new ArrayList<String>();
    HashMap<String, String> datasetTempAtt = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialAtt = new HashMap<String, String>();
    HashMap<String, String> preProcessingDataset = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");
    String[] datasetArgs = cmd.getOptionValues("g");

    for (int i = 0; i < datasetArgs.length; i += 3) {
        String dataset = datasetArgs[i];

        // getting pre-processing
        String tempPreProcessing = FrameworkUtils.searchPreProcessing(dataset, s3conf, s3);
        if (tempPreProcessing == null) {
            System.out.println("No pre-processing available for " + dataset);
            continue;
        }
        preProcessingDataset.put(dataset, tempPreProcessing);

        shortDataset.add(dataset);
        datasetTempAtt.put(dataset, ((datasetArgs[i + 1] == "null") ? null : datasetArgs[i + 1]));
        datasetSpatialAtt.put(dataset, ((datasetArgs[i + 2] == "null") ? null : datasetArgs[i + 2]));

        datasetId.put(dataset, null);
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.aggregatesDir, s3conf, s3);

    // getting smallest resolution

    HashMap<String, String> tempResMap = new HashMap<String, String>();
    HashMap<String, String> spatialResMap = new HashMap<String, String>();

    HashMap<String, String> datasetTemporalStrMap = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialStrMap = new HashMap<String, String>();

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String[] datasetArray = preProcessingDataset.get(dataset).split("-");

        String datasetTemporalStr = datasetArray[datasetArray.length - 2];
        int datasetTemporal = utils.temporalResolution(datasetTemporalStr);

        String datasetSpatialStr = datasetArray[datasetArray.length - 1];
        int datasetSpatial = utils.spatialResolution(datasetSpatialStr);

        // finding all possible resolutions

        String[] temporalResolutions = FrameworkUtils.getAggTempResolutions(datasetTemporal);
        String[] spatialResolutions = FrameworkUtils.getAggSpatialResolutions(datasetSpatial);

        String temporalResolution = "";
        String spatialResolution = "";

        String tempRes = "";
        String spatialRes = "";

        boolean dataAdded = false;

        for (int i = 0; i < temporalResolutions.length; i++) {
            for (int j = 0; j < spatialResolutions.length; j++) {

                temporalResolution = temporalResolutions[i];
                spatialResolution = spatialResolutions[j];

                String aggregatesOutputFileName = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";

                if (removeExistingFiles) {
                    FrameworkUtils.removeFile(aggregatesOutputFileName, s3conf, s3);
                }

                if (!FrameworkUtils.fileExists(aggregatesOutputFileName, s3conf, s3)) {

                    dataAdded = true;

                    tempRes += temporalResolution + "-";
                    spatialRes += spatialResolution + "-";
                }
            }
        }

        if (dataAdded) {
            input.add(s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
            shortDatasetAggregation.add(dataset);

            tempResMap.put(dataset, tempRes.substring(0, tempRes.length() - 1));
            spatialResMap.put(dataset, spatialRes.substring(0, spatialRes.length() - 1));

            datasetTemporalStrMap.put(dataset, datasetTemporalStr);
            datasetSpatialStrMap.put(dataset, datasetSpatialStr);
        }
    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have aggregates.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    it = input.iterator();
    while (it.hasNext()) {
        preProcessingDatasets += it.next() + ",";
    }

    Job aggJob = null;
    String aggregatesOutputDir = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/";
    String jobName = "aggregates";

    FrameworkUtils.removeFile(aggregatesOutputDir, s3conf, s3);

    Configuration aggConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    aggConf.set("dataset-name", datasetNames);
    aggConf.set("dataset-id", datasetIds);

    for (int i = 0; i < shortDatasetAggregation.size(); i++) {
        String dataset = shortDatasetAggregation.get(i);
        String id = datasetId.get(dataset);
        aggConf.set("dataset-" + id + "-temporal-resolutions", tempResMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-resolutions", spatialResMap.get(dataset));
        aggConf.set("dataset-" + id + "-temporal-att", datasetTempAtt.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-att", datasetSpatialAtt.get(dataset));
        aggConf.set("dataset-" + id + "-temporal", datasetTemporalStrMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial", datasetSpatialStrMap.get(dataset));

        if (s3)
            aggConf.set("dataset-" + id,
                    s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
        else
            aggConf.set("dataset-" + id, FileSystem.get(new Configuration()).getHomeDirectory() + "/"
                    + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
    }

    aggConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    aggConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    aggConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    aggConf.set("mapreduce.task.io.sort.mb", "200");
    aggConf.set("mapreduce.task.io.sort.factor", "100");
    machineConf.setMachineConfiguration(aggConf);

    if (s3) {
        machineConf.setMachineConfiguration(aggConf);
        aggConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        aggConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
    }

    if (snappyCompression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    aggJob = new Job(aggConf);
    aggJob.setJobName(jobName);

    aggJob.setMapOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setMapOutputValueClass(AggregationArrayWritable.class);
    aggJob.setOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setOutputValueClass(FloatArrayWritable.class);
    //aggJob.setOutputKeyClass(Text.class);
    //aggJob.setOutputValueClass(Text.class);

    aggJob.setMapperClass(AggregationMapper.class);
    aggJob.setCombinerClass(AggregationCombiner.class);
    aggJob.setReducerClass(AggregationReducer.class);
    aggJob.setNumReduceTasks(machineConf.getNumberReduces());

    aggJob.setInputFormatClass(SequenceFileInputFormat.class);
    //aggJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(aggJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(aggJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(aggJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(aggJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(aggJob, true);
    FileInputFormat.setInputPaths(aggJob,
            preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1));
    FileOutputFormat.setOutputPath(aggJob, new Path(aggregatesOutputDir));

    aggJob.setJarByClass(Aggregation.class);

    long start = System.currentTimeMillis();
    aggJob.submit();
    aggJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetAggregation) {
        String from = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}

From source file:edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java

/**
 * @param args/* w  w w.  j a v a2  s  .  co  m*/
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the index and events " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option thresholdOption = new Option("t", "use-custom-thresholds", false,
            "use custom thresholds for regular and rare events, defined in HDFS_HOME/"
                    + FrameworkUtils.thresholdDir + " file");
    thresholdOption.setRequired(false);
    options.addOption(thresholdOption);

    Option gOption = new Option("g", "group", true,
            "set group of datasets for which the indices and events" + " will be computed");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp("hadoop jar data-polygamy.jar "
                    + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetIndex = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();
    HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>();
    HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>();

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());
    BufferedReader br;

    boolean removeExistingFiles = cmd.hasOption("f");
    boolean isThresholdUserDefined = cmd.hasOption("t");

    for (String dataset : cmd.getOptionValues("g")) {

        // getting aggregates
        String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3);
        if (aggregate.length == 0) {
            System.out.println("No aggregates found for " + dataset + ".");
            continue;
        }

        // getting aggregates header
        String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3);
        if (aggregatesHeaderFileName == null) {
            System.out.println("No aggregate header for " + dataset);
            continue;
        }

        String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName;

        shortDataset.add(dataset);
        datasetId.put(dataset, null);

        if (s3) {
            path = new Path(aggregatesHeader);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader);
        }

        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        datasetAgg.put(dataset, br.readLine().split("\t")[1]);
        br.close();
        if (s3)
            fs.close();
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    // getting user defined thresholds

    if (isThresholdUserDefined) {
        if (s3) {
            path = new Path(s3bucket + FrameworkUtils.thresholdDir);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir);
        }
        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        line = br.readLine();
        while (line != null) {
            // getting dataset name
            String dataset = line.trim();
            HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>();
            HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>();
            line = br.readLine();
            while ((line != null) && (line.split("\t").length > 1)) {
                // getting attribute ids and thresholds
                String[] keyVals = line.trim().split("\t");
                int att = Integer.parseInt(keyVals[0].trim());
                regThresholds.put(att, Double.parseDouble(keyVals[1].trim()));
                rareThresholds.put(att, Double.parseDouble(keyVals[2].trim()));
                line = br.readLine();
            }
            datasetRegThreshold.put(dataset, regThresholds);
            datasetRareThreshold.put(dataset, rareThresholds);
        }
        br.close();
    }
    if (s3)
        fs.close();

    // datasets that will use existing merge tree
    ArrayList<String> useMergeTree = new ArrayList<String>();

    // creating index for each spatio-temporal resolution

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3);

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/";

        if (removeExistingFiles) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3);
            FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3);
        } else if (datasetRegThreshold.containsKey(dataset)) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) {
                useMergeTree.add(dataset);
            }
        }

        if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) {
            input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset);
            shortDatasetIndex.add(dataset);
        }

    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have indices.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    String aggregateDatasets = "";
    it = input.iterator();
    while (it.hasNext()) {
        aggregateDatasets += it.next() + ",";
    }

    Job icJob = null;
    Configuration icConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "index";
    String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/";

    FrameworkUtils.removeFile(indexOutputDir, s3conf, s3);

    icConf.set("dataset-name", datasetNames);
    icConf.set("dataset-id", datasetIds);

    if (!useMergeTree.isEmpty()) {
        String useMergeTreeStr = "";
        for (String dt : useMergeTree) {
            useMergeTreeStr += dt + ",";
        }
        icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1));
    }

    for (int i = 0; i < shortDataset.size(); i++) {
        String dataset = shortDataset.get(i);
        String id = datasetId.get(dataset);
        icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset));
        if (datasetRegThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset);
            String thresholds = "";
            for (int att : regThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ",";
            }
            icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1));
        }

        if (datasetRareThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset);
            String thresholds = "";
            for (int att : rareThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ",";
            }
            icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1));
        }
    }

    icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    icConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    icConf.set("mapreduce.task.io.sort.mb", "200");
    icConf.set("mapreduce.task.io.sort.factor", "100");
    //icConf.set("mapreduce.task.timeout", "1800000");
    machineConf.setMachineConfiguration(icConf);

    if (s3) {
        machineConf.setMachineConfiguration(icConf);
        icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        icConf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    icJob = new Job(icConf);
    icJob.setJobName(jobName);

    icJob.setMapOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class);
    icJob.setOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setOutputValueClass(TopologyTimeSeriesWritable.class);
    //icJob.setOutputKeyClass(Text.class);
    //icJob.setOutputValueClass(Text.class);

    icJob.setMapperClass(IndexCreationMapper.class);
    icJob.setReducerClass(IndexCreationReducer.class);
    icJob.setNumReduceTasks(machineConf.getNumberReduces());

    icJob.setInputFormatClass(SequenceFileInputFormat.class);
    //icJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(icJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(icJob, true);
    FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1));
    FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir));

    icJob.setJarByClass(IndexCreation.class);

    long start = System.currentTimeMillis();
    icJob.submit();
    icJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetIndex) {
        String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}

From source file:io.anserini.index.UpdateIndex.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(//  w  ww.  jav a 2s  .  c  o  m
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(UpdateIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);

    LOG.info("index: " + indexPath);

    File file = new File("PittsburghUserTimeline");
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    final StatusStream stream = new JsonStatusCorpusReader(file);

    Status status;
    String s;
    HashMap<Long, String> hm = new HashMap<Long, String>();
    try {
        while ((s = stream.nextRaw()) != null) {
            try {
                status = DataObjectFactory.createStatus(s);

                if (status.getText() == null) {
                    continue;
                }

                hm.put(status.getUser().getId(),
                        hm.get(status.getUser().getId()) + status.getText().replaceAll("[\\r\\n]+", " "));

            } catch (Exception e) {

            }
        }

    } catch (Exception e) {
        e.printStackTrace();
    } finally {

        stream.close();
    }

    ArrayList<String> userIDList = new ArrayList<String>();
    try (BufferedReader br = new BufferedReader(new FileReader(new File("userID")))) {
        String line;
        while ((line = br.readLine()) != null) {
            userIDList.add(line.replaceAll("[\\r\\n]+", ""));

            // process the line.
        }
    }

    try {
        reader = DirectoryReader
                .open(FSDirectory.open(new File(cmdline.getOptionValue(INDEX_OPTION)).toPath()));
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION)));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    final IndexWriter writer = new IndexWriter(dir, config);

    IndexSearcher searcher = new IndexSearcher(reader);
    System.out.println("The total number of docs indexed "
            + searcher.collectionStatistics(TweetStreamReader.StatusField.TEXT.name).docCount());

    for (int city = 0; city < cityName.length; city++) {

        // Pittsburgh's coordinate -79.976389, 40.439722

        Query q_long = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LONGITUDE.name,
                new Double(longitude[city] - 0.05), new Double(longitude[city] + 0.05), true, true);
        Query q_lat = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LATITUDE.name,
                new Double(latitude[city] - 0.05), new Double(latitude[city] + 0.05), true, true);

        BooleanQuery bqCityName = new BooleanQuery();

        Term t = new Term("place", cityName[city]);
        TermQuery query = new TermQuery(t);
        bqCityName.add(query, BooleanClause.Occur.SHOULD);
        System.out.println(query.toString());

        for (int i = 0; i < cityNameAlias[city].length; i++) {
            t = new Term("place", cityNameAlias[city][i]);
            query = new TermQuery(t);
            bqCityName.add(query, BooleanClause.Occur.SHOULD);
            System.out.println(query.toString());
        }

        BooleanQuery bq = new BooleanQuery();

        BooleanQuery finalQuery = new BooleanQuery();

        // either a coordinate match
        bq.add(q_long, BooleanClause.Occur.MUST);
        bq.add(q_lat, BooleanClause.Occur.MUST);

        finalQuery.add(bq, BooleanClause.Occur.SHOULD);
        // or a place city name match
        finalQuery.add(bqCityName, BooleanClause.Occur.SHOULD);

        TotalHitCountCollector totalHitCollector = new TotalHitCountCollector();

        // Query hasFieldQuery = new ConstantScoreQuery(new
        // FieldValueFilter("timeline"));
        //
        // searcher.search(hasFieldQuery, totalHitCollector);
        //
        // if (totalHitCollector.getTotalHits() > 0) {
        // TopScoreDocCollector collector =
        // TopScoreDocCollector.create(Math.max(0,
        // totalHitCollector.getTotalHits()));
        // searcher.search(finalQuery, collector);
        // ScoreDoc[] hits = collector.topDocs().scoreDocs;
        //
        //
        // HashMap<String, Integer> hasHit = new HashMap<String, Integer>();
        // int dupcount = 0;
        // for (int i = 0; i < hits.length; ++i) {
        // int docId = hits[i].doc;
        // Document d;
        //
        // d = searcher.doc(docId);
        //
        // System.out.println(d.getFields());
        // }
        // }

        // totalHitCollector = new TotalHitCountCollector();
        searcher.search(finalQuery, totalHitCollector);

        if (totalHitCollector.getTotalHits() > 0) {
            TopScoreDocCollector collector = TopScoreDocCollector
                    .create(Math.max(0, totalHitCollector.getTotalHits()));
            searcher.search(finalQuery, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits.");

            HashMap<String, Integer> hasHit = new HashMap<String, Integer>();
            int dupcount = 0;
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d;

                d = searcher.doc(docId);

                if (userIDList.contains(d.get(IndexTweets.StatusField.USER_ID.name))
                        && hm.containsKey(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name)))) {
                    //            System.out.println("Has timeline field?" + (d.get("timeline") != null));
                    //            System.out.println(reader.getDocCount("timeline"));
                    //            d.add(new Field("timeline", hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))),
                    //                textOptions));
                    System.out.println("Found a user hit");
                    BytesRefBuilder brb = new BytesRefBuilder();
                    NumericUtils.longToPrefixCodedBytes(Long.parseLong(d.get(IndexTweets.StatusField.ID.name)),
                            0, brb);
                    Term term = new Term(IndexTweets.StatusField.ID.name, brb.get());
                    //            System.out.println(reader.getDocCount("timeline"));

                    Document d_new = new Document();
                    //            for (IndexableField field : d.getFields()) {
                    //              d_new.add(field);
                    //            }
                    // System.out.println(d_new.getFields());
                    d_new.add(new StringField("userBackground", d.get(IndexTweets.StatusField.USER_ID.name),
                            Store.YES));
                    d_new.add(new Field("timeline",
                            hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))), textOptions));
                    // System.out.println(d_new.get());
                    writer.addDocument(d_new);
                    writer.commit();

                    //            t = new Term("label", "why");
                    //            TermQuery tqnew = new TermQuery(t);
                    //
                    //            totalHitCollector = new TotalHitCountCollector();
                    //
                    //            searcher.search(tqnew, totalHitCollector);
                    //
                    //            if (totalHitCollector.getTotalHits() > 0) {
                    //              collector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits()));
                    //              searcher.search(tqnew, collector);
                    //              hits = collector.topDocs().scoreDocs;
                    //
                    //              System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits.");
                    //
                    //              for (int k = 0; k < hits.length; k++) {
                    //                docId = hits[k].doc;
                    //                d = searcher.doc(docId);
                    //                System.out.println(d.get(IndexTweets.StatusField.ID.name));
                    //                System.out.println(d.get(IndexTweets.StatusField.PLACE.name));
                    //              }
                    //            }

                    // writer.deleteDocuments(term);
                    // writer.commit();
                    // writer.addDocument(d);
                    // writer.commit();

                    //            System.out.println(reader.getDocCount("timeline"));
                    // writer.updateDocument(term, d);
                    // writer.commit();

                }

            }
        }
    }
    reader.close();
    writer.close();

}

From source file:Main.java

public static HashMap tallyPrint(String phrase) {
    int count = 0;
    HashMap<Character, Integer> fav = new HashMap<Character, Integer>();
    for (int i = 0; i < phrase.length(); i++) {
        if (fav.containsKey(phrase.charAt(i)))
            fav.put(phrase.charAt(i), (fav.get(phrase.charAt(i))) + 1);
        else// w w  w. ja va 2  s .  co m
            fav.put(phrase.charAt(i), 1);
    }
    return fav;
}

From source file:Main.java

private static void addToUsed(HashMap<Integer, Integer> used, Integer key, Integer value) {
    if (used.containsKey(value)) {
        value = used.get(value);/* ww  w.  j  a v  a  2  s.c o m*/
        addToUsed(used, key, value);
    }
    used.put(key, value);
}