Example usage for java.util.Iterator.hasNext()

Introduction

On this page you can find example usages of java.util.Iterator.hasNext().

Prototype

boolean hasNext();

Source Link

Document

Returns true if the iteration has more elements.

Usage

From source file:com.github.xbn.examples.regexutil.non_xbn.BetweenLineMarkersButSkipFirstXmpl.java

/**
 * Reads the text file named by the first argument and prints every block of
 * lines found between a {@code .START_SEQUENCE} line and an
 * {@code .END_SEQUENCE} line, leaving out the first line that follows each
 * start marker. Each printed block is preceded by a {@code ----------} line.
 *
 * @param as_1RqdTxtFilePath one required element: the path to the text file
 * @throws RuntimeException if the file cannot be opened, or if any other
 *         runtime failure (for example a missing argument) occurs while
 *         opening it
 * @throws IllegalStateException if a line outside a block is not a start mark
 */
public static final void main(String[] as_1RqdTxtFilePath) {
    final Iterator<String> lines;
    try {
        // Fails with a runtime exception when no path was supplied.
        lines = FileUtils.lineIterator(new File(as_1RqdTxtFilePath[0]));
    } catch (IOException iox) {
        throw new RuntimeException("Attempting to open \"" + as_1RqdTxtFilePath[0] + "\"", iox);
    } catch (RuntimeException rx) {
        throw new RuntimeException("One required parameter: The path to the text file.", rx);
    }

    final String lineSeparator = System.getProperty("line.separator", "\n");

    ArrayList<String> blocks = new ArrayList<String>();
    StringBuilder currentBlock = new StringBuilder();
    boolean insideBlock = false;
    boolean firstLineDropped = false;

    while (lines.hasNext()) {
        final String text = lines.next().trim();

        if (!insideBlock) {
            // Outside a block the only acceptable line is a start marker.
            if (!text.startsWith(".START_SEQUENCE")) {
                throw new IllegalStateException("Start mark not found.");
            }
            insideBlock = true;
        } else if (!firstLineDropped) {
            // The line directly after the start marker is never collected.
            firstLineDropped = true;
        } else if (text.equals(".END_SEQUENCE")) {
            // Block finished: store it and reset the state for the next one.
            blocks.add(currentBlock.toString());
            currentBlock.setLength(0);
            insideBlock = false;
            firstLineDropped = false;
        } else {
            currentBlock.append(text).append(lineSeparator);
        }
    }

    for (int i = 0; i < blocks.size(); i++) {
        System.out.println("----------");
        System.out.print(blocks.get(i));
    }
}

From source file:edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques.java

/**
 * Command-line entry point: parses the options, resolves the requested
 * datasets and their ids from the datasets index, configures and runs the
 * "correlation" job, and finally moves each dataset pair's output out of the
 * temporary output directory.
 *
 * <p>A message (and, where relevant, the usage text) is printed and the JVM
 * exits with status 0 when the arguments cannot be parsed, when {@code -h} is
 * given, when {@code -s3} is given without {@code -aws_id}, {@code -aws_key}
 * and {@code -b}, when there is no dataset to process, when a dataset has no
 * id in the datasets index, or when every relationship already exists.
 *
 * @param args command-line arguments; {@code -g1}, {@code -m} and {@code -n}
 *        are required
 * @throws IOException if a file-system or job operation fails
 * @throws InterruptedException declared for the job submission/wait calls
 * @throws ClassNotFoundException declared for the job submission/wait calls
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    // Command line shown by every usage/help message below.
    final String usage = "hadoop jar data-polygamy.jar "
            + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques";

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the relationship " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option g1Option = new Option("g1", "first-group", true, "set first group of datasets");
    g1Option.setRequired(true);
    g1Option.setArgName("FIRST GROUP");
    g1Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g1Option);

    Option g2Option = new Option("g2", "second-group", true, "set second group of datasets");
    g2Option.setRequired(false);
    g2Option.setArgName("SECOND GROUP");
    g2Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g2Option);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    // The long name must differ from the access-key-id option's "aws-id";
    // the original reused "aws-id" here, making the two long names collide.
    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp(usage, options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp(usage, options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(usage, options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    // Both compression switches are hard-wired off; the branches further
    // down only take effect if these are changed in the source.
    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> firstGroup = new ArrayList<String>();
    ArrayList<String> secondGroup = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");

    String[] firstGroupCmd = cmd.getOptionValues("g1");
    String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : new String[0];
    addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);
    addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    if (firstGroup.isEmpty()) {
        System.out.println("First group of datasets (G1) is empty. " + "Doing G1 = G2.");
        firstGroup.addAll(secondGroup);
    }

    if (secondGroup.isEmpty()) {
        System.out.println("Second group of datasets (G2) is empty. " + "Doing G2 = G1.");
        secondGroup.addAll(firstGroup);
    }

    // getting dataset ids

    String datasetNames = "";
    String datasetIds = "";
    HashMap<String, String> datasetId = new HashMap<String, String>();
    // Seed the map with every requested dataset; a null value means
    // "id not found in the index yet".
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        datasetId.put(it.next(), null);
    }

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    // Each index line is read as "<dataset name>\t<dataset id>".
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    try {
        String line = br.readLine();
        while (line != null) {
            String[] dt = line.split("\t");
            if (datasetId.containsKey(dt[0])) {
                datasetId.put(dt[0], dt[1]);
                datasetNames += dt[0] + ",";
                datasetIds += dt[1] + ",";
            }
            line = br.readLine();
        }
    } finally {
        // Release the reader even when reading or parsing fails.
        br.close();
    }
    if (s3) {
        fs.close();
    }

    // Check for missing ids BEFORE trimming the trailing commas: if nothing
    // matched, the strings are empty and substring(0, -1) would throw
    // instead of letting the message below be printed.
    it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);

    String firstGroupStr = "";
    String secondGroupStr = "";
    for (String dataset : firstGroup) {
        firstGroupStr += datasetId.get(dataset) + ",";
    }
    for (String dataset : secondGroup) {
        secondGroupStr += datasetId.get(dataset) + ",";
    }
    firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1);
    secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1);

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.correlationTechniquesDir, s3conf, s3);

    String dataAttributesInputDirs = "";
    String noRelationship = "";

    HashSet<String> dirs = new HashSet<String>();

    String dataset1;
    String dataset2;
    String datasetId1;
    String datasetId2;
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            // Order the pair so that dataset1 is the one with the smaller id.
            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            datasetId1 = datasetId.get(dataset1);
            datasetId2 = datasetId.get(dataset2);

            if (dataset1.equals(dataset2))
                continue;
            String correlationOutputFileName = s3bucket + FrameworkUtils.correlationTechniquesDir + "/"
                    + dataset1 + "-" + dataset2 + "/";

            if (removeExistingFiles) {
                FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3);
            }
            if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) {
                dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset1);
                dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset2);
            } else {
                // Output already present: record the id pair under "no-relationship".
                noRelationship += datasetId1 + "-" + datasetId2 + ",";
            }
        }
    }

    if (dirs.isEmpty()) {
        System.out.println("All the relationships were already computed.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    for (String dir : dirs) {
        dataAttributesInputDirs += dir + ",";
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "correlation";
    String correlationOutputDir = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/";

    FrameworkUtils.removeFile(correlationOutputDir, s3conf, s3);

    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i)));
    }
    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size",
                Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length));
    }
    conf.set("dataset-keys", datasetIds);
    conf.set("dataset-names", datasetNames);
    conf.set("first-group", firstGroupStr);
    conf.set("second-group", secondGroupStr);
    conf.set("main-dataset-id", datasetId.get(shortDataset.get(0)));
    if (noRelationship.length() > 0) {
        conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1));
    }

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");
    conf.set("mapreduce.task.timeout", "2400000");

    if (s3) {
        machineConf.setMachineConfiguration(conf);
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        conf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    Job job = new Job(conf);
    job.setJobName(jobName);

    job.setMapOutputKeyClass(PairAttributeWritable.class);
    job.setMapOutputValueClass(SpatioTemporalValueWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CorrelationTechniquesMapper.class);
    job.setReducerClass(CorrelationTechniquesReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job,
            dataAttributesInputDirs.substring(0, dataAttributesInputDirs.length() - 1));
    FileOutputFormat.setOutputPath(job, new Path(correlationOutputDir));

    job.setJarByClass(CorrelationTechniques.class);

    long start = System.currentTimeMillis();
    job.submit();
    // NOTE(review): the result of waitForCompletion is not inspected, so the
    // move below runs regardless of the job outcome — confirm this is intended.
    job.waitForCompletion(true);
    // Prints the job name and the elapsed wall-clock time in milliseconds.
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            // Same ordering rule as when the output names were built above.
            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            if (dataset1.equals(dataset2))
                continue;

            String from = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/" + dataset1 + "-"
                    + dataset2 + "/";
            String to = s3bucket + FrameworkUtils.correlationTechniquesDir + "/" + dataset1 + "-" + dataset2
                    + "/";
            FrameworkUtils.renameFile(from, to, s3conf, s3);
        }
    }
}

From source file:com.alkacon.opencms.registration.CmsRegistrationFormHandler.java

/**
 * Ad-hoc test driver: prints the activation code of a sample user and the
 * user name recovered from that code, then fills a macro resolver from the
 * user's String getters, additional info entries and login.<p>
 *
 * @param args not used
 */
public static void main(String[] args) {

    CmsUser sampleUser = new CmsUser(null, "/mylongouname/m.moossen@alkacon.com", "", "", "", "", 0, 0, 0,
            null);
    String activationCode = getActivationCode(sampleUser);
    System.out.println(activationCode);
    System.out.println(getUserName(activationCode));

    CmsMacroResolver resolver = CmsMacroResolver.newInstance();
    resolver.setKeepEmptyMacros(true);

    // one macro per no-argument String getter declared on CmsUser,
    // leaving out getPassword
    for (Method getter : CmsUser.class.getDeclaredMethods()) {
        String getterName = getter.getName();
        boolean usable = (getter.getReturnType() == String.class)
                && (getter.getParameterTypes().length == 0)
                && getterName.startsWith("get")
                && (getterName.length() >= 4)
                && !getterName.equals("getPassword");
        if (!usable) {
            continue;
        }
        // macro name = getter name without "get", first letter lower-cased
        String label = getterName.substring(3, 4).toLowerCase() + getterName.substring(4);
        try {
            Object result = getter.invoke(sampleUser, new Object[] {});
            resolver.addMacro(label, (result == null) ? "" : result.toString());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // additional info entries become macros when both key and value are Strings
    Iterator<?> infoEntries = sampleUser.getAdditionalInfo().entrySet().iterator();
    while (infoEntries.hasNext()) {
        Map.Entry<?, ?> info = (Map.Entry<?, ?>) infoEntries.next();
        Object infoValue = info.getValue();
        Object infoKey = info.getKey();
        if (!(infoValue instanceof String) || !(infoKey instanceof String)) {
            continue;
        }
        resolver.addMacro(infoKey.toString(), infoValue.toString());
    }

    // the login macro comes last
    resolver.addMacro(FIELD_LOGIN, sampleUser.getSimpleName());

}

From source file:edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java

/**
 * @param args/* w  ww. ja  v a 2s . c o m*/
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the index and events " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option thresholdOption = new Option("t", "use-custom-thresholds", false,
            "use custom thresholds for regular and rare events, defined in HDFS_HOME/"
                    + FrameworkUtils.thresholdDir + " file");
    thresholdOption.setRequired(false);
    options.addOption(thresholdOption);

    Option gOption = new Option("g", "group", true,
            "set group of datasets for which the indices and events" + " will be computed");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp("hadoop jar data-polygamy.jar "
                    + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetIndex = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();
    HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>();
    HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>();

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());
    BufferedReader br;

    boolean removeExistingFiles = cmd.hasOption("f");
    boolean isThresholdUserDefined = cmd.hasOption("t");

    for (String dataset : cmd.getOptionValues("g")) {

        // getting aggregates
        String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3);
        if (aggregate.length == 0) {
            System.out.println("No aggregates found for " + dataset + ".");
            continue;
        }

        // getting aggregates header
        String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3);
        if (aggregatesHeaderFileName == null) {
            System.out.println("No aggregate header for " + dataset);
            continue;
        }

        String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName;

        shortDataset.add(dataset);
        datasetId.put(dataset, null);

        if (s3) {
            path = new Path(aggregatesHeader);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader);
        }

        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        datasetAgg.put(dataset, br.readLine().split("\t")[1]);
        br.close();
        if (s3)
            fs.close();
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    // getting user defined thresholds

    if (isThresholdUserDefined) {
        if (s3) {
            path = new Path(s3bucket + FrameworkUtils.thresholdDir);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir);
        }
        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        line = br.readLine();
        while (line != null) {
            // getting dataset name
            String dataset = line.trim();
            HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>();
            HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>();
            line = br.readLine();
            while ((line != null) && (line.split("\t").length > 1)) {
                // getting attribute ids and thresholds
                String[] keyVals = line.trim().split("\t");
                int att = Integer.parseInt(keyVals[0].trim());
                regThresholds.put(att, Double.parseDouble(keyVals[1].trim()));
                rareThresholds.put(att, Double.parseDouble(keyVals[2].trim()));
                line = br.readLine();
            }
            datasetRegThreshold.put(dataset, regThresholds);
            datasetRareThreshold.put(dataset, rareThresholds);
        }
        br.close();
    }
    if (s3)
        fs.close();

    // datasets that will use existing merge tree
    ArrayList<String> useMergeTree = new ArrayList<String>();

    // creating index for each spatio-temporal resolution

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3);

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/";

        if (removeExistingFiles) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3);
            FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3);
        } else if (datasetRegThreshold.containsKey(dataset)) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) {
                useMergeTree.add(dataset);
            }
        }

        if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) {
            input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset);
            shortDatasetIndex.add(dataset);
        }

    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have indices.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    String aggregateDatasets = "";
    it = input.iterator();
    while (it.hasNext()) {
        aggregateDatasets += it.next() + ",";
    }

    Job icJob = null;
    Configuration icConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "index";
    String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/";

    FrameworkUtils.removeFile(indexOutputDir, s3conf, s3);

    icConf.set("dataset-name", datasetNames);
    icConf.set("dataset-id", datasetIds);

    if (!useMergeTree.isEmpty()) {
        String useMergeTreeStr = "";
        for (String dt : useMergeTree) {
            useMergeTreeStr += dt + ",";
        }
        icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1));
    }

    for (int i = 0; i < shortDataset.size(); i++) {
        String dataset = shortDataset.get(i);
        String id = datasetId.get(dataset);
        icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset));
        if (datasetRegThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset);
            String thresholds = "";
            for (int att : regThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ",";
            }
            icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1));
        }

        if (datasetRareThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset);
            String thresholds = "";
            for (int att : rareThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ",";
            }
            icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1));
        }
    }

    icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    icConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    icConf.set("mapreduce.task.io.sort.mb", "200");
    icConf.set("mapreduce.task.io.sort.factor", "100");
    //icConf.set("mapreduce.task.timeout", "1800000");
    machineConf.setMachineConfiguration(icConf);

    if (s3) {
        machineConf.setMachineConfiguration(icConf);
        icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        icConf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    icJob = new Job(icConf);
    icJob.setJobName(jobName);

    icJob.setMapOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class);
    icJob.setOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setOutputValueClass(TopologyTimeSeriesWritable.class);
    //icJob.setOutputKeyClass(Text.class);
    //icJob.setOutputValueClass(Text.class);

    icJob.setMapperClass(IndexCreationMapper.class);
    icJob.setReducerClass(IndexCreationReducer.class);
    icJob.setNumReduceTasks(machineConf.getNumberReduces());

    icJob.setInputFormatClass(SequenceFileInputFormat.class);
    //icJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(icJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(icJob, true);
    FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1));
    FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir));

    icJob.setJarByClass(IndexCreation.class);

    long start = System.currentTimeMillis();
    icJob.submit();
    icJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetIndex) {
        String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}

From source file:org.ala.dao.CassandraPelopsHelper.java

/**
 * Manual smoke test: initialises a helper, walks the sub-columns stored for
 * one GUID and then prints the first ten GUIDs returned by a scanner.
 *
 * @param args unused
 * @throws Exception if any helper call fails
 */
public static void main(String[] args) throws Exception {
    CassandraPelopsHelper helper = new CassandraPelopsHelper();
    helper.init();

    Map<String, Object> subColumns = helper.getSubColumnsByGuid("tc", "103067807");
    for (String columnKey : subColumns.keySet()) {
        ColumnType columnType = ColumnType.getColumnType(columnKey);
        Object value = subColumns.get(columnType.getColumnName());
        // The casts are kept on purpose: they verify at runtime that each value
        // is either a List or a Comparable.
        if (value instanceof List) {
            List asList = (List) value;
        } else {
            Comparable asComparable = (Comparable) value;
        }
    }

    /*
          TaxonConcept t = null;
           List<Comparable> l = new ArrayList<Comparable>();

          for(int i=0; i< 10; i++){
               t =  new TaxonConcept();
               t.setId(i);
               t.setGuid("urn:lsid:"+i);
               t.setNameString("Aus bus");
               t.setAuthor("Smith");
               t.setAuthorYear("2008");
               t.setInfoSourceName("AFD");
               t.setInfoSourceURL("http://afd.org.au");
               helper.putSingle("taxonConcept", "tc", "taxonConcept", t.getGuid(), t);

               l.add(t);
               if(i % 1000==0){
      System.out.println("id: "+i);
               }
          }
          helper.putList("taxonConcept", "tc", "taxonConcept", "128", l, true);

            CommonName c1 = new CommonName();
            c1.setNameString("Dave");

            CommonName c2 = new CommonName();
            c2.setNameString("Frank");

            helper.putSingle("taxonConcept", "tc", "taxonConcept", "123", t);
            helper.put("taxonConcept", "tc", "commonName", "123", c1);
            helper.put("taxonConcept", "tc", "commonName", "123", c2);
            helper.putSingle("taxonConcept", "tc", "taxonConcept", "124", t);

            TaxonConcept tc = (TaxonConcept) helper.get("taxonConcept", "tc", "taxonConcept", "123", TaxonConcept.class);
            System.out.println("Retrieved: "+tc.getNameString());

            List<CommonName> cns = (List) helper.getList("taxonConcept", "tc", "commonName", "123", CommonName.class);
            System.out.println("Retrieved: "+cns);
    */

    // Scan the column family and print the first ten GUIDs.
    Scanner scanner = helper.getScanner("taxonConcept", "tc", "taxonConcept");
    int printed = 0;
    while (printed < 10) {
        System.out.println(new String(scanner.getNextGuid()));
        printed++;
    }
    System.exit(0);
}

From source file:com.amalto.workbench.utils.XSDParser.java

/**
 * Print a simple type definition for the document.
 *
 * @param xsdSimpleTypeDefinition a simple type definition in the schema for schema.
 */
/*
 * public void printSimpleTypeDefinition( XSDSimpleTypeDefinition xsdSimpleTypeDefinition) { if
 * (xsdSimpleTypeDefinition == null) { } else if (xsdSimpleTypeDefinition.getEffectiveEnumerationFacet() != null) {
 * List value = xsdSimpleTypeDefinition.getEffectiveEnumerationFacet() .getValue(); if (value.size() > 1) {
 * System.out.print("("); } for (Iterator enumerators = value.iterator(); enumerators.hasNext();) { String
 * enumerator = enumerators.next().toString(); System.out.print("<em>"); System.out.print(enumerator);
 * System.out.print("</em>"); if (enumerators.hasNext()) { System.out.print("&nbsp;|&nbsp;"); } } if (value.size() >
 * 1) { System.out.print(")"); } } else if (xsdSimpleTypeDefinition.getElement() != null &&
 * xsdSimpleTypeDefinition.getElement().hasAttribute( XSDConstants.ID_ATTRIBUTE)) { System.out.print("<a href='#" +
 * xsdSimpleTypeDefinition.getName() + "-simple-type'>"); System.out.print(xsdSimpleTypeDefinition.getName());
 * System.out.print("</a>"); } else if (XSDVariety.UNION_LITERAL == xsdSimpleTypeDefinition .getVariety()) {
 * System.out.print("("); for (Iterator members = xsdSimpleTypeDefinition .getMemberTypeDefinitions().iterator();
 * members.hasNext();) { XSDSimpleTypeDefinition memberTypeDefinition = (XSDSimpleTypeDefinition) members .next();
 * printSimpleTypeDefinition(memberTypeDefinition); if (members.hasNext()) { System.out.print("&nbsp;|&nbsp;"); } }
 * System.out.print(")"); } else if (XSDVariety.UNION_LITERAL == xsdSimpleTypeDefinition .getVariety()) {
 * System.out.print("List&nbsp;of&nbsp;"); printSimpleTypeDefinition(xsdSimpleTypeDefinition
 * .getItemTypeDefinition()); } else if (xsdSimpleTypeDefinition.getName() != null) { if
 * ("public".equals(xsdSimpleTypeDefinition.getName())) { System.out.print("<a target='Part2' href='" +
 * XSDConstants.PART2 + "#anyURI'>anyURI</a>&nbsp;&nbsp;"); System.out.print("<a target='Errata' href='" + errata +
 * "#pfipublic'><em>public</em></a>"); } else { System.out.print("<b><em>");
 * System.out.print(xsdSimpleTypeDefinition.getName()); System.out.print("</em></b>"); } } else if
 * (xsdSimpleTypeDefinition.getEffectivePatternFacet() != null) { //
 * System.out.print(xsdSimpleTypeDefinition.getEffectivePatternFacet().getLexicalValue());
 * 
 * System.out.print("<em>"); System.out.print("<a target='Part1' href='" + XSDConstants.PART1 +
 * "#coss-identity-constraint'>"); System.out.print("a restricted xpath expression"); System.out.print("</a>");
 * System.out.print("</em>"); } else { System.out.print("***"); } }
 */

/**
 * Loads the XSD at a hard-coded path and writes a single-file copy of it to
 * {@code /tmp/xcb35sr.xsd}: a fixed {@code <xsd:schema>} header, every element
 * declaration (with its {@code xmlns:xsd} attribute stripped), every type
 * definition, and the closing tag.
 * <p>
 * Any exception is logged rather than propagated. The output writer is always
 * closed, even when loading or serialisation fails part-way through.
 *
 * @param args unused
 */
public static void main(String args[]) {
    try {
        /*
         * String xsd = ""; FileInputStream fis = new FileInputStream(
         * "/home/bgrieder/workspace/XCBL35/XCBL35.xsd"); BufferedReader br = new BufferedReader(new
         * InputStreamReader(fis, "utf-8")); String line; while ((line = br.readLine()) != null) xsd += line + "\n";
         * 
         * XSDParser parser = new XSDParser(); parser.loadAndPrint(xsd);
         */
        FileWriter fw = new FileWriter("/tmp/xcb35sr.xsd"); //$NON-NLS-1$
        try {
            Resource.Factory.Registry.INSTANCE.getExtensionToFactoryMap().put("xsd", new XSDResourceFactoryImpl()); //$NON-NLS-1$
            String xsdFile = "/home/bgrieder/workspace/XCBL35/XCBL35.xsd"; //$NON-NLS-1$
            ResourceSet resourceSet = new ResourceSetImpl();
            XSDResourceImpl xsdResource = (XSDResourceImpl) resourceSet.getResource(URI.createFileURI(xsdFile),
                    true);

            /*
             * XSDResourceImpl res = new XSDResourceImpl(URI.createFileURI(xsdFile));
             */
            XSDSchema xsdSchema = xsdResource.getSchema();

            String header = "<xsd:schema " + "elementFormDefault=\"qualified\" " //$NON-NLS-1$ //$NON-NLS-2$
                    + "targetNamespace=\"rrn:org.xcbl:schemas/xcbl/v3_5/xcbl35.xsd\" " //$NON-NLS-1$
                    + "xmlns=\"rrn:org.xcbl:schemas/xcbl/v3_5/xcbl35.xsd\" " //$NON-NLS-1$
                    + "xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\">"; //$NON-NLS-1$

            fw.write(header);

            // Element declarations: the per-node namespace declaration is removed
            // because the header written above already declares it.
            Iterator it = xsdSchema.getElementDeclarations().iterator();
            while (it.hasNext()) {
                XSDElementDeclaration elementDeclaration = (XSDElementDeclaration) it.next();
                // if ("Order".equals(elementDeclaration.getName())) {
                fw.write(Util.nodeToString(elementDeclaration.getElement())
                        .replaceAll("xmlns:xsd=\"http:\\/\\/www\\.w3\\.org\\/2001\\/XMLSchema\"", "")); //$NON-NLS-1$ //$NON-NLS-2$
                // }
            }

            // Type definitions are written unchanged.
            it = xsdSchema.getTypeDefinitions().iterator();
            while (it.hasNext()) {
                XSDTypeDefinition typedef = (XSDTypeDefinition) it.next();
                fw.write(Util.nodeToString(typedef.getElement()));
            }

            String footer = "</xsd:schema>"; //$NON-NLS-1$
            fw.write(footer);
        } finally {
            // Release the file handle on both the success and the failure path.
            fw.close();
        }
    } catch (Exception e) {
        log.error(e.getMessage(), e);
    }
}

From source file:edu.nyu.vida.data_polygamy.relationship_computation.Relationship.java

/**
 * Command-line entry point for the relationship computation job.
 * <p>
 * Parses the command-line options, reads the datasets index file to resolve
 * dataset ids, works out which dataset pairs have no output directory yet,
 * configures and runs the MapReduce job over the index directories of those
 * pairs, and finally moves each pair's output from the temporary directory to
 * its final location.
 *
 * @param args command-line arguments; see the options declared in the body
 * @throws IOException on file-system or job I/O failure
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a class required by the job cannot be found
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    // ---- command-line option declarations ----

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the relationship " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option scoreOption = new Option("sc", "score", true, "set threshold for relationship score");
    scoreOption.setRequired(false);
    scoreOption.setArgName("SCORE THRESHOLD");
    options.addOption(scoreOption);

    Option strengthOption = new Option("st", "strength", true, "set threshold for relationship strength");
    strengthOption.setRequired(false);
    strengthOption.setArgName("STRENGTH THRESHOLD");
    options.addOption(strengthOption);

    Option completeRandomizationOption = new Option("c", "complete-randomization", false,
            "use complete randomization when performing significance tests");
    completeRandomizationOption.setRequired(false);
    options.addOption(completeRandomizationOption);

    Option idOption = new Option("id", "ids", false, "output id instead of names for datasets and attributes");
    idOption.setRequired(false);
    options.addOption(idOption);

    Option g1Option = new Option("g1", "first-group", true, "set first group of datasets");
    g1Option.setRequired(true);
    g1Option.setArgName("FIRST GROUP");
    g1Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g1Option);

    Option g2Option = new Option("g2", "second-group", true, "set second group of datasets");
    g2Option.setRequired(false);
    g2Option.setArgName("SECOND GROUP");
    g2Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g2Option);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    // The long name must differ from the access-key-id option above; both
    // previously used "aws-id", so the two long options collided.
    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    Option removeOption = new Option("r", "remove-not-significant", false,
            "remove relationships that are not " + "significant from the final output");
    removeOption.setRequired(false);
    options.addOption(removeOption);

    // ---- command-line parsing ----

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    // Both compression flags are fixed to false here, so the two compression
    // blocks further down are currently never executed.
    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> firstGroup = new ArrayList<String>();
    ArrayList<String> secondGroup = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();

    boolean removeNotSignificant = cmd.hasOption("r");
    boolean removeExistingFiles = cmd.hasOption("f");
    boolean completeRandomization = cmd.hasOption("c");
    boolean hasScoreThreshold = cmd.hasOption("sc");
    boolean hasStrengthThreshold = cmd.hasOption("st");
    boolean outputIds = cmd.hasOption("id");
    String scoreThreshold = hasScoreThreshold ? cmd.getOptionValue("sc") : "";
    String strengthThreshold = hasStrengthThreshold ? cmd.getOptionValue("st") : "";

    // ---- all datasets: first column of every line of the datasets index ----

    ArrayList<String> all_datasets = new ArrayList<String>();
    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        all_datasets.add(line.split("\t")[0]);
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();
    String[] all_datasets_array = new String[all_datasets.size()];
    all_datasets.toArray(all_datasets_array);

    // When no second group is given, the first group is compared with every dataset.
    String[] firstGroupCmd = cmd.getOptionValues("g1");
    String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : all_datasets_array;
    addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);
    addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    if (firstGroup.isEmpty()) {
        System.out.println("No indices from datasets in G1.");
        System.exit(0);
    }

    if (secondGroup.isEmpty()) {
        System.out.println("No indices from datasets in G2.");
        System.exit(0);
    }

    // ---- getting dataset ids: second column of the datasets index ----

    String datasetNames = "";
    String datasetIds = "";
    HashMap<String, String> datasetId = new HashMap<String, String>();
    for (String dataset : shortDataset) {
        datasetId.put(dataset, null);
    }

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        all_datasets.add(dt[0]);
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    // Verify that every dataset received an id BEFORE trimming the trailing
    // commas: when nothing matched, the accumulated strings are empty and
    // substring(0, -1) would throw instead of reporting the missing id.
    for (String dataset : shortDataset) {
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }
    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);

    String firstGroupStr = "";
    String secondGroupStr = "";
    for (String dataset : firstGroup) {
        firstGroupStr += datasetId.get(dataset) + ",";
    }
    for (String dataset : secondGroup) {
        secondGroupStr += datasetId.get(dataset) + ",";
    }
    firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1);
    secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1);

    String relationshipsDir = "";
    if (outputIds) {
        relationshipsDir = FrameworkUtils.relationshipsIdsDir;
    } else {
        relationshipsDir = FrameworkUtils.relationshipsDir;
    }

    FrameworkUtils.createDir(s3bucket + relationshipsDir, s3conf, s3);

    String random = completeRandomization ? "complete" : "restricted";

    String indexInputDirs = "";
    String noRelationship = "";

    HashSet<String> dirs = new HashSet<String>();

    // ---- decide which dataset pairs still have to be computed ----

    String dataset1;
    String dataset2;
    String datasetId1;
    String datasetId2;
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            // Order each pair so that the dataset with the smaller id comes first.
            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            datasetId1 = datasetId.get(dataset1);
            datasetId2 = datasetId.get(dataset2);

            if (dataset1.equals(dataset2))
                continue;
            String correlationOutputFileName = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2
                    + "/";

            if (removeExistingFiles) {
                FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3);
            }
            if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) {
                dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset1);
                dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset2);
            } else {
                // Output already exists: record the pair under "no-relationship".
                noRelationship += datasetId1 + "-" + datasetId2 + ",";
            }
        }
    }

    if (dirs.isEmpty()) {
        System.out.println("All the relationships were already computed.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    for (String dir : dirs) {
        indexInputDirs += dir + ",";
    }

    // ---- job configuration ----

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "relationship" + "-" + random;
    String relationshipOutputDir = s3bucket + relationshipsDir + "/tmp/";

    FrameworkUtils.removeFile(relationshipOutputDir, s3conf, s3);

    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i)));
    }
    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size",
                Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length));
    }
    conf.set("dataset-keys", datasetIds);
    conf.set("dataset-names", datasetNames);
    conf.set("first-group", firstGroupStr);
    conf.set("second-group", secondGroupStr);
    conf.set("complete-random", String.valueOf(completeRandomization));
    conf.set("output-ids", String.valueOf(outputIds));
    conf.set("complete-random-str", random);
    conf.set("main-dataset-id", datasetId.get(shortDataset.get(0)));
    conf.set("remove-not-significant", String.valueOf(removeNotSignificant));
    if (noRelationship.length() > 0) {
        conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1));
    }
    if (hasScoreThreshold) {
        conf.set("score-threshold", scoreThreshold);
    }
    if (hasStrengthThreshold) {
        conf.set("strength-threshold", strengthThreshold);
    }

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");
    conf.set("mapreduce.task.timeout", "2400000");

    if (s3) {
        machineConf.setMachineConfiguration(conf);
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        conf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    // ---- job set-up and execution ----

    Job job = new Job(conf);
    job.setJobName(jobName);

    job.setMapOutputKeyClass(PairAttributeWritable.class);
    job.setMapOutputValueClass(TopologyTimeSeriesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CorrelationMapper.class);
    job.setReducerClass(CorrelationReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    FileInputFormat.setInputDirRecursive(job, true);
    // dirs is non-empty here, so the string always ends in a comma to strip.
    FileInputFormat.setInputPaths(job, indexInputDirs.substring(0, indexInputDirs.length() - 1));
    FileOutputFormat.setOutputPath(job, new Path(relationshipOutputDir));

    job.setJarByClass(Relationship.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // ---- moving files to right place ----

    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            if (dataset1.equals(dataset2))
                continue;

            String from = s3bucket + relationshipsDir + "/tmp/" + dataset1 + "-" + dataset2 + "/";
            String to = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/";
            FrameworkUtils.renameFile(from, to, s3conf, s3);
        }
    }
}

From source file:grnet.filter.XMLFiltering.java

/**
 * Runs the XML filter over every file of the repository described by
 * {@code args[0]}.
 * <p>
 * For each file the filter result is logged, published as a message to the
 * queue {@code QUEUE_NAME}, optionally recorded in a {@link FilteringReport},
 * and the file is copied to the "filtered in" or "filtered out" directory.
 *
 * @param args {@code args[0]} is passed to both {@code Enviroment} and {@code XMLSource}
 * @throws IOException if connecting or publishing to the queue fails
 */
public static void main(String[] args) throws IOException {
    Enviroment enviroment = new Enviroment(args[0]);

    if (enviroment.envCreation) {
        Core core = new Core();

        XMLSource source = new XMLSource(args[0]);

        File sourceFile = source.getSource();

        if (sourceFile.exists()) {

            Collection<File> xmls = source.getXMLs();

            System.out.println("Filtering repository:" + enviroment.dataProviderFilteredIn.getName());

            System.out.println("Number of files to filter:" + xmls.size());

            // Constant-first comparison: a missing property means "no report"
            // instead of a NullPointerException.
            FilteringReport report = null;
            if ("true".equalsIgnoreCase(
                    enviroment.getArguments().getProps().getProperty(Constants.createReport))) {
                report = new FilteringReport(enviroment.getArguments().getDestFolderLocation(),
                        enviroment.getDataProviderFilteredIn().getName());
            }

            ConnectionFactory factory = new ConnectionFactory();
            factory.setHost(enviroment.getArguments().getQueueHost());
            factory.setUsername(enviroment.getArguments().getQueueUserName());
            factory.setPassword(enviroment.getArguments().getQueuePassword());

            for (File xmlFile : xmls) {

                StringBuilder logString = new StringBuilder();
                logString.append(enviroment.dataProviderFilteredIn.getName());

                // Strip everything from ".xml" onwards; keep the full name when
                // the marker is absent rather than failing on substring(0, -1).
                String name = xmlFile.getName();
                int extensionStart = name.indexOf(".xml");
                if (extensionStart >= 0) {
                    name = name.substring(0, extensionStart);
                }
                logString.append(" " + name);

                boolean xmlIsFilteredIn = core.filterXML(xmlFile, enviroment.getArguments().getQueries());

                logString.append(" " + (xmlIsFilteredIn ? "FilteredIn" : "FilteredOut"));
                String message = logString.toString();
                slf4jLogger.info(message);

                // Publish the log line. The connection is closed in finally so it
                // is not leaked when declaring the queue or publishing fails.
                Connection connection = factory.newConnection();
                try {
                    Channel channel = connection.createChannel();
                    channel.queueDeclare(QUEUE_NAME, false, false, false, null);
                    channel.basicPublish("", QUEUE_NAME, null, message.getBytes());
                    channel.close();
                } finally {
                    connection.close();
                }

                try {
                    if (xmlIsFilteredIn) {
                        if (report != null) {
                            report.appendXMLFileNameNStatus(xmlFile.getPath(), Constants.filteredInData);
                            report.raiseFilteredInFilesNum();
                        }
                        FileUtils.copyFileToDirectory(xmlFile, enviroment.getDataProviderFilteredIn());
                    } else {
                        if (report != null) {
                            report.appendXMLFileNameNStatus(xmlFile.getPath(), Constants.filteredOutData);
                            report.raiseFilteredOutFilesNum();
                        }
                        FileUtils.copyFileToDirectory(xmlFile, enviroment.getDataProviderFilteredOuT());
                    }
                } catch (IOException e) {
                    // Best effort: a failure on one file is reported and the run
                    // continues with the next file.
                    e.printStackTrace();
                    System.out.println("Filtering failed.");
                }
            }
            if (report != null) {

                report.appendXPathExpression(enviroment.getArguments().getQueries());

                report.appendGeneralInfo();
            }
            System.out.println("Filtering is done.");
        }

    }
}

From source file:grnet.validation.XMLValidation.java

/**
 * Validates every XML file of the repository described by {@code args[0]}
 * against the schema URL taken from the environment arguments.
 * <p>
 * For each file the validation result is logged, published as a message to
 * the queue {@code QUEUE_NAME}, optionally recorded in a
 * {@link ValidationReport}, and the file is copied to the "valid" or
 * "invalid" directory.
 *
 * @param args {@code args[0]} is passed to both {@code Enviroment} and {@code XMLSource}
 * @throws IOException if connecting or publishing to the queue fails
 */
public static void main(String[] args) throws IOException {
    Enviroment enviroment = new Enviroment(args[0]);

    if (enviroment.envCreation) {
        String schemaUrl = enviroment.getArguments().getSchemaURL();
        Core core = new Core(schemaUrl);

        XMLSource source = new XMLSource(args[0]);

        File sourceFile = source.getSource();

        if (sourceFile.exists()) {

            Collection<File> xmls = source.getXMLs();

            System.out.println("Validating repository:" + sourceFile.getName());

            System.out.println("Number of files to validate:" + xmls.size());

            System.out.println("Validating against schema:" + schemaUrl + "...");

            // Constant-first comparison: a null flag means "no report" instead of
            // a NullPointerException.
            ValidationReport report = null;
            if ("true".equalsIgnoreCase(enviroment.getArguments().createReport())) {

                report = new ValidationReport(enviroment.getArguments().getDestFolderLocation(),
                        enviroment.getDataProviderValid().getName());

            }

            ConnectionFactory factory = new ConnectionFactory();
            factory.setHost(enviroment.getArguments().getQueueHost());
            factory.setUsername(enviroment.getArguments().getQueueUserName());
            factory.setPassword(enviroment.getArguments().getQueuePassword());

            for (File xmlFile : xmls) {

                StringBuilder logString = new StringBuilder();
                logString.append(sourceFile.getName());
                logString.append(" " + schemaUrl);

                // Strip everything from ".xml" onwards; keep the full name when
                // the marker is absent rather than failing on substring(0, -1).
                String name = xmlFile.getName();
                int extensionStart = name.indexOf(".xml");
                if (extensionStart >= 0) {
                    name = name.substring(0, extensionStart);
                }

                logString.append(" " + name);

                boolean xmlIsValid = core.validateXMLSchema(xmlFile);

                logString.append(" " + (xmlIsValid ? "Valid" : "Invalid"));
                String message = logString.toString();
                slf4jLogger.info(message);

                // Publish the log line. The connection is closed in finally so it
                // is not leaked when declaring the queue or publishing fails.
                Connection connection = factory.newConnection();
                try {
                    Channel channel = connection.createChannel();
                    channel.queueDeclare(QUEUE_NAME, false, false, false, null);
                    channel.basicPublish("", QUEUE_NAME, null, message.getBytes());
                    channel.close();
                } finally {
                    connection.close();
                }

                try {
                    if (xmlIsValid) {
                        if (report != null) {

                            report.raiseValidFilesNum();
                        }

                        FileUtils.copyFileToDirectory(xmlFile, enviroment.getDataProviderValid());
                    } else {
                        if (report != null) {

                            // The per-file failure reason is only recorded for extended reports.
                            if ("true".equalsIgnoreCase(enviroment.getArguments().extendedReport()))
                                report.appendXMLFileNameNStatus(xmlFile.getPath(), Constants.invalidData,
                                        core.getReason());

                            report.raiseInvalidFilesNum();
                        }
                        FileUtils.copyFileToDirectory(xmlFile, enviroment.getDataProviderInValid());
                    }
                } catch (IOException e) {
                    // Best effort: a failure on one file is reported and the run
                    // continues with the next file.
                    e.printStackTrace();
                }
            }

            if (report != null) {
                report.writeErrorBank(core.getErrorBank());
                report.appendGeneralInfo();
            }
            System.out.println("Validation is done.");

        }

    }
}

From source file:net.iiit.siel.analysis.lang.LanguageIdentifier.java

/**
 * The main method.
 *
 * @param args the arguments
 */
public static void main(String args[]) {

    String usage = "Usage: LanguageIdentifier " + "[-identifyrows filename maxlines] "
            + "[-identifyfile charset filename] " + "[-identifyfileset charset files] "
            + "[-identifytext text] " + "[-identifyurl url]";
    int command = 0;

    final int IDFILE = 1;
    final int IDTEXT = 2;
    final int IDURL = 3;
    final int IDFILESET = 4;
    final int IDROWS = 5;

    Vector fileset = new Vector();
    String filename = "";
    String charset = "";
    String url = "";
    String text = "";
    int max = 0;

    // TODO niket writing test args here..
    /*      args = new String[2];
          args[0] = "-identifyurl";
          args[1] = "file:/home1/niket/TamilSamplePage.html";
          //args[2] = "/home1/niket/nutch-clia/input.txt";
    */
    // TODO niket end here

    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) { // parse command line
        if (args[i].equals("-identifyfile")) {
            command = IDFILE;
            charset = args[++i];
            filename = args[++i];
        }

        if (args[i].equals("-identifyurl")) {
            command = IDURL;
            filename = args[++i];
        }

        if (args[i].equals("-identifyrows")) {
            command = IDROWS;
            filename = args[++i];
            max = Integer.parseInt(args[++i]);
        }

        if (args[i].equals("-identifytext")) {
            command = IDTEXT;
            for (i++; i < args.length - 1; i++)
                text += args[i] + " ";
        }

        if (args[i].equals("-identifyfileset")) {
            command = IDFILESET;
            charset = args[++i];
            for (i++; i < args.length; i++) {
                File[] files = null;
                File f = new File(args[i]);
                if (f.isDirectory()) {
                    files = f.listFiles();
                } else {
                    files = new File[] { f };
                }
                for (int j = 0; j < files.length; j++) {
                    fileset.add(files[j].getAbsolutePath());
                }
            }
        }

    }

    Configuration conf = NutchConfiguration.create();
    String lang = null;
    LanguageIdentifier idfr = new LanguageIdentifier(conf);
    File f;
    FileInputStream fis;
    try {
        switch (command) {

        case IDTEXT:
            lang = idfr.identify(text);
            System.out.println("Lang :" + lang);
            break;

        case IDFILE:
            f = new File(filename);
            fis = new FileInputStream(f);
            lang = idfr.identify(fis, charset);
            fis.close();
            break;

        case IDURL:
            lang = LangIdentifierUtility.IdentifyLangFromURLDirectly(filename);

            /*
             * our url identifier is confused or couldn't identify lang from
             * URL
             */
            if (lang == null || lang.equalsIgnoreCase("en")) {
                System.out.println("Ambuguity in identifying language from URL");
            } else {
                System.out.println("Lang was identified(using URL) as: " + lang);
            }
            break;

        case IDROWS:
            f = new File(filename);
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
            String line;
            while (max > 0 && (line = br.readLine()) != null) {
                line = line.trim();
                if (line.length() > 2) {
                    max--;
                    lang = idfr.identify(line);
                    System.out.println("R=" + lang + ":" + line);
                }
            }

            br.close();
            System.exit(0);
            break;

        case IDFILESET:
            /*
             * used for benchs for (int j=128; j<=524288; j*=2) { long start
             * = System.currentTimeMillis(); idfr.analyzeLength = j;
             */
            System.out.println("FILESET");
            Iterator i = fileset.iterator();
            while (i.hasNext()) {
                try {
                    filename = (String) i.next();
                    f = new File(filename);
                    fis = new FileInputStream(f);
                    lang = idfr.identify(fis, charset);
                    fis.close();
                } catch (Exception e) {
                    System.out.println(e);
                }
                System.out.println(filename + " was identified as " + lang);
            }
            /*
             * used for benchs System.out.println(j + "/" +
             * (System.currentTimeMillis()-start)); }
             */
            System.exit(0);
            break;
        }
    } catch (Exception e) {
        System.out.println(e);
        System.out.println("lang could not be identified properly");
        e.printStackTrace();
    }
    System.out.println("text was identified as " + lang);

    /*
     * DONOT delete the next few lines, they should be enabled, when a lang.
     * mapping map needs to be generated. TODO  this is for printing
     * the hashMapRangeLangIDTable only
     * 
     * idfr.langMarkerObject.printHashmapTableWithFormatting();
     * 
     * System.out
     * .println("\n\n\n Printing english text contents in this file:\n");
     * System.out.println(idfr.langMarkerObject.getLangCharacters(
     * LanguageIdentifierConstants.LangShortNames.ENGLISH
     * .langShortName()).toString());
     * 
     * System.out
     * .println("\n\n\n Printing telugu text contents in this file:\n");
     * System.out.println(idfr.langMarkerObject.getLangCharacters(
     * LanguageIdentifierConstants.LangShortNames.TELUGU
     * .langShortName()).toString());
     * 
     * System.out
     * .println("\n\n\n Printing unknown text contents in this file:\n");
     * System.out.println(idfr.langMarkerObject.getLangCharacters(
     * LanguageIdentifierConstants.LangShortNames.UNKNOWN_LANG
     * .langShortName()).toString());
     */
}