Example usage for java.lang Float POSITIVE_INFINITY

List of usage examples for java.lang Float POSITIVE_INFINITY


On this page you can find example usages of java.lang.Float.POSITIVE_INFINITY.



To view the source code for java.lang.Float.POSITIVE_INFINITY, click Source Link.


A constant holding the positive infinity of type float.


From source file:io.github.gjyaiya.stetho.realm.Database.java

/**
 * Flattens up to {@code limit} rows of {@code table} into one list, visiting each row's columns
 * in order and branching on the column type.
 *
 * NOTE(review): this listing looks truncated — most branch bodies (including the
 * {@code addRowIndex} one) and many closing braces are absent, so the comments below describe
 * only what the remaining lines show.
 *
 * @param table       table whose rows are read
 * @param limit       maximum number of rows to visit; checked to be non-negative
 * @param addRowIndex tested once per row; the body of that branch is not shown here
 * @return the list created at the top of the method
 */
private List<Object> flattenRows(Table table, long limit, boolean addRowIndex) {
    // Precondition: limit >= 0 (throwIfNot presumably throws otherwise — confirm in Util).
    Util.throwIfNot(limit >= 0);
    final List<Object> flatList = new ArrayList<>();
    long numColumns = table.getColumnCount();

    final RowFetcher rowFetcher = RowFetcher.getInstance();
    final long tableSize = table.size();
    // Visit at most `limit` rows, and never more than the table holds.
    for (long index = 0; index < limit && index < tableSize; index++) {
        // Walk rows first-to-last when ascendingOrder is set, otherwise last-to-first.
        final long row = ascendingOrder ? index : (tableSize - index - 1);
        final RowWrapper rowData = RowWrapper.wrap(rowFetcher.getRow(table, row));
        if (addRowIndex) {
        }
        for (int column = 0; column < numColumns; column++) {
            // Every nullable type checks for null before reading the value.
            switch (rowData.getColumnType(column)) {
            case INTEGER:
                if (rowData.isNull(column)) {
                } else {
            case BOOLEAN:
                if (rowData.isNull(column)) {
                } else {
            case STRING:
                if (rowData.isNull(column)) {
                } else {
            case BINARY:
                if (rowData.isNull(column)) {
                } else {
            case FLOAT:
                if (rowData.isNull(column)) {
                } else {
                    final float aFloat = rowData.getFloat(column);
                    // NaN and both infinities each get their own branch (bodies not shown).
                    if (Float.isNaN(aFloat)) {
                    } else if (aFloat == Float.POSITIVE_INFINITY) {
                    } else if (aFloat == Float.NEGATIVE_INFINITY) {
                    } else {
            case DOUBLE:
                if (rowData.isNull(column)) {
                } else {
                    final double aDouble = rowData.getDouble(column);
                    // Same special-value split as the FLOAT case.
                    if (Double.isNaN(aDouble)) {
                    } else if (aDouble == Double.POSITIVE_INFINITY) {
                    } else if (aDouble == Double.NEGATIVE_INFINITY) {
                    } else {
            case OLD_DATE:
            case DATE:
                if (rowData.isNull(column)) {
                } else {
            case OBJECT:
                // Links are tested with isNullLink rather than isNull.
                if (rowData.isNullLink(column)) {
                } else {
            case LIST:
                // A LIST column is never null.
                flatList.add("unknown column type: " + rowData.getColumnType(column));

    // The table holds more rows than were visited; one more pass over the columns follows
    // (its body is not shown).
    if (limit < table.size()) {
        for (int column = 0; column < numColumns; column++) {

    return flatList;

From source file:org.shaman.terrain.polygonal.GraphToHeightmap.java

/**
 * Scans {@code points} while tracking the smallest and second-smallest value returned by
 * {@code dist(p, px, py)}.
 *
 * NOTE(review): this listing looks truncated — nothing visible ever writes to {@code first} or
 * {@code second}, and the closing braces are absent. Presumably the missing lines copy the
 * matching points into those two output vectors; confirm against the original source.
 *
 * @param points candidate points
 * @param px     x coordinate of the query position
 * @param py     y coordinate of the query position
 * @param first  output parameter (not written in the visible lines)
 * @param second output parameter (not written in the visible lines)
 */
private static void findClosestTwoPoints(Vector3f[] points, float px, float py, Vector3f first,
        Vector3f second) {
    // Start from +infinity so any finite distance replaces the initial values.
    float dist1 = Float.POSITIVE_INFINITY;
    float dist2 = Float.POSITIVE_INFINITY;
    for (Vector3f p : points) {
        float d = dist(p, px, py);
        if (d < dist1) {
            // New best: the previous best becomes the runner-up.
            dist2 = dist1;
            dist1 = d;
        } else if (d < dist2) {
            // Not the best, but better than the current runner-up.
            dist2 = d;

From source file:net.qvex.dommel.data.DommelDataService.java

/**
 * Logs in, scrapes the usage statistics page, and stores the parsed figures in {@code prefs}.
 *
 * Visible flow: POST the credentials to {@code URL_LOGIN}, GET {@code URL_PACKAGES} to locate
 * the statistics URL, GET that URL, GET {@code URL_LOGOUT}, then pick the traffic, remaining,
 * total and reset-date lines out of the response and write them to the preferences editor.
 *
 * NOTE(review): this listing looks truncated — several closing braces, the body of the
 * {@code try} around the date parsing, and the assignment of {@code reset_date} are absent.
 *
 * @param username account name sent as the {@code username} form field
 * @param password account password sent as the {@code password} form field
 * @return the result of {@code edit.commit()}, or {@code false} when a ParseException is caught
 * @throws ClientProtocolException declared by the signature; presumably raised by the HTTP helpers
 * @throws IOException             declared by the signature; presumably raised by the HTTP helpers
 */
public boolean getData(String username, String password) throws ClientProtocolException, IOException {

    // adapted from phptelemeter
    unlimited = false;

    /* login */
    Map<String, String> urlParameters = new HashMap<String, String>();
    urlParameters.put("op", "login");
    urlParameters.put("new_language", "english");
    urlParameters.put("submit", "login");
    urlParameters.put("username", username);
    urlParameters.put("password", password);

    String res;

    //        try{
    res = this.httpPost(URL_LOGIN, urlParameters);
    //        }catch(Exception e)
    //        {
    //        // TODO: check for errors in res. possibly just catch nullpointer
    //        // exception..
    //            throw new Exception(e);
    //        }

    /* go to the packages page, and get the serv_id and client_id */
    res = this.httpGet(URL_PACKAGES);
    // TODO: check for errors in res

    String[] lines = res.split("\n");
    String log = null;
    int pos = 0;
    /* figure out the stats exact url */
    for (int i = 0; i < lines.length; i++) {
        pos = lines[i].indexOf(URL_STATS_INIT);
        if (pos >= 0) {
            log = lines[i].substring(pos);

    // NOTE(review): log is still null here if no line contained URL_STATS_INIT.
    String url_stats = log.substring(0, log.indexOf("'"));

    /* and get the data */
    String data = this.httpGet(url_stats);

    /* logout */
    res = this.httpGet(URL_LOGOUT);

    // NOTE(review): "/n" is a slash followed by 'n', not the newline "\n" used for the split
    // above — confirm whether this is intentional before changing it, since it affects how
    // much text ends up in data2.
    lines = data.split("/n");
    String data2 = null;
    pos = 0;

    /* find the entry position */
    for (int i = 0; i < lines.length; i++) {
        pos = lines[i].indexOf("total traffic downloaded in broadband");
        if (pos >= 0) {
            data2 = lines[i].substring(pos);

    // NOTE(review): data2 is still null here if the marker text was not found.
    lines = data2.split("<br>");

    /* set some default positions */
    int pos_remaining = -1;
    int pos_traffic = -1;
    int pos_reset_date = -1;
    int pos_total = -1;
    int strpos_total = -1;

    /* position finding & data cleanup */
    for (int i = 0; i < lines.length; i++) {
        lines[i] = stripTags(lines[i]);


        // Remember which line carries which figure, keyed on its label text.
        if (lines[i].contains("total traffic downloaded")) {
            pos_traffic = i;
        } else if (lines[i].contains("next counter reset")) {
            pos_reset_date = i;
        } else if (lines[i].contains("remaining")) {
            pos_remaining = i;
            if (lines[i].contains("unlimited")) {
                unlimited = true;
        } else if (lines[i].contains("maximum datatransfer")) {
            pos_total = i;
            /* data cleanup */
            int test_ind = lines[i].indexOf("maximum datatransfer:");
            if (test_ind >= 0 && test_ind + 1 < lines[i].length()) {
                // 21 is the length of the label "maximum datatransfer:".
                lines[i] = lines[i].substring(test_ind + 21);

        /* data cleanup */
        int test_ind = lines[i].indexOf(":");
        if (test_ind >= 0 && test_ind + 1 < lines[i].length()) {
            // Drop everything up to and including the colon and the character after it.
            lines[i] = lines[i].substring(test_ind + 2);


    /* stats */
    /* total used */
    // The last three characters are cut off before parsing (presumably a unit suffix — confirm).
    // NOTE(review): pos_traffic is still -1 if no line matched, which makes this index invalid.
    volume_used = Float.parseFloat(lines[pos_traffic].substring(0, lines[pos_traffic].length() - 3)) * 1024;

    volume_remaining = 0;

    /* remaining, if exists? */
    if (pos_remaining >= 0) {
        if (!unlimited) {
            volume_remaining = Float
                    .parseFloat(lines[pos_remaining].substring(0, lines[pos_remaining].length() - 3)) * 1024;

            if (pos_total >= 0) {
                // Only the first four characters of the total line are parsed.
                volume_total = Float.parseFloat(lines[pos_total].substring(0, 4)) * 1024;
        } else {
            // Unlimited account
            volume_remaining = Float.POSITIVE_INFINITY;

    /* reset date */
    // NOTE(review): pos_reset_date is still -1 if no "next counter reset" line was found.
    String reset_date_str = lines[pos_reset_date].substring(0, 10);
    DateFormat df = new SimpleDateFormat("dd/MM/yyyy");
    try {
    } catch (ParseException e) {
        // TODO: handle error
        return false;

    days_left = calculateDaysLeft(reset_date);

    Date now = new Date();

    // Write all parsed figures, plus the update timestamp, to the preferences editor.
    Editor edit = prefs.edit();
    edit.putFloat("volume_used", volume_used);
    edit.putFloat("volume_remaining", volume_remaining);
    edit.putFloat("volume_total", volume_total);
    edit.putLong("reset_date", reset_date.getTimeInMillis());
    edit.putInt("days_left", days_left);
    edit.putBoolean("unlimited", unlimited);
    edit.putLong("last_update", now.getTime());
    edit.putBoolean("last_update_success", true);

    return edit.commit();


From source file:net.sf.json.TestJSONArray.java

public void testConstructor_primitive_array_float_Infinity() {
    try {//from   w w  w  .j  a va  2 s.  c  om
        JSONArray.fromObject(new float[] { Float.NEGATIVE_INFINITY });
        fail("Should have thrown a JSONException");
    } catch (JSONException expected) {
        // OK

    try {
        JSONArray.fromObject(new float[] { Float.POSITIVE_INFINITY });
        fail("Should have thrown a JSONException");
    } catch (JSONException expected) {
        // OK

From source file:edworld.pdfreader4humans.PDFReader.java

protected Component createGroup(int groupIndex, Map<Component, Integer> groupMap) {
    float fromX = Float.POSITIVE_INFINITY;
    float fromY = Float.POSITIVE_INFINITY;
    float toX = Float.NEGATIVE_INFINITY;
    float toY = Float.NEGATIVE_INFINITY;
    for (Component component : groupMap.keySet())
        if (groupMap.get(component) == groupIndex) {
            fromX = Math.min(component.getFromX(), fromX);
            fromY = Math.min(component.getFromY(), fromY);
            toX = Math.max(component.getToX(), toX);
            toY = Math.max(component.getToY(), toY);
        }//from w  ww.  j  a  va  2 s. co m
    return new GroupComponent(fromX, fromY, toX, toY);

From source file:wqm.web.server.controller.WQMCalibrationController.java

/**
 * Handles a calibration command for an EC sensor.
 *
 * When {@code command} matches {@code CalibrationCommands.Accept}, the current phase is accepted
 * through {@code stationManager}. If that was the last phase, calibration is ended, a success
 * message is stored in the session, and the request is redirected to "/"; the visible code also
 * redirects to the next phase's URL.
 *
 * NOTE(review): this listing looks truncated — the {@code try} body is empty and closing braces
 * are absent, so the exact nesting of the two redirects cannot be confirmed from here.
 *
 * @param session        HTTP session passed to the station manager and used for the message
 * @param station        station being calibrated
 * @param sensor         sensor being calibrated
 * @param phaseID        index of the current calibration phase
 * @param command        command text compared against {@code CalibrationCommands.Accept}
 * @param ec_sensor_type probe type supplying the packet variable
 */
private void ecCalibrateCommand(HttpSession session, Station station, AtlasSensor sensor, int phaseID,
        String command, ECSensorProbe ec_sensor_type) {
    // NOTE(review): this message is logged at error level although it reports no failure.
    logger.error("EC Calibrate Command.");
    if (CalibrationCommands.Accept.commandEquals(command)) {
        // Both trailing float arguments are passed as +infinity.
        stationManager.acceptCalibrationPhase(session, true, station, sensor, phaseID,
                ec_sensor_type.getPacketVariable(), Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY);
        if ((phaseID + 1) >= sensor.getCalibrationPhases()) {
            //We have finished all the phases of calibration for this sensor.
            stationManager.quitCalibrationPhase(session, station, sensor);
            session.setAttribute(Messages.SUCCESS_MESSAGE, "EC Sensor calibrated.");
            try {
            } catch (InterruptedException e) {
                logger.error("", e);
            }
            throw new RedirectException("/");
        // Redirect to the URL for the next phase (phaseID + 1).
        throw new RedirectException(
                String.format("/wqm/c/%s/%d/%d", station.getCompactAddress(), sensor.getId(), phaseID + 1));


From source file:gedi.util.ArrayUtils.java

public static int argmin(float[] a) {
    float re = Float.POSITIVE_INFINITY;
    int arg = -1;
    for (int i = 0; i < a.length; i++) {
        if (a[i] < re) {
            re = a[i];// w w  w.ja v  a  2s. c o m
            arg = i;
    return arg;

From source file:org.opentripplanner.routing.algorithm.strategies.WeightTable.java

private void floyd() {
    int n = table.length;
    for (int k = 0; k < n; k++) {
        for (int i = 0; i < n; i++) {
            double ik = table[i][k];
            if (ik == Float.POSITIVE_INFINITY)
            for (int j = 0; j < n; j++) {
                double kj = table[k][j];
                if (kj == Float.POSITIVE_INFINITY)
                double ikj = ik + kj;
                double ij = table[i][j];
                if (ikj < ij)
                    table[i][j] = (float) ikj;
            }/*from w ww  .ja va2  s.c  om*/
        if (k % 50 == 0)
            LOG.debug("k=" + k + "/" + n);

From source file:com.elex.dmp.vectorizer.SparseVectorsFromSequenceFiles.java

/**
 * Parses the command-line options and drives the vectorization steps visible below: tokenize the
 * input, create term-frequency vectors, optionally prune high-document-frequency terms, and
 * optionally convert to TF-IDF.
 *
 * NOTE(review): this listing looks truncated — many option builder chains lack their trailing
 * calls, the option group is never finished, and numerous closing braces are absent.
 *
 * @param args command-line arguments handed to the parser
 * @return {@code -1} when the help option is present, otherwise {@code 0}
 * @throws Exception declared by the signature; an OptionException is caught and logged instead
 */
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    // ---- Option definitions ----
    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.")

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
                            + "  Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
                            + "will be filtered out. Default is -1.0.  Overrides maxDFPercent")

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
    try {
        Parser parser = new Parser();
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            return -1;

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        // ---- Option values: each default below is replaced only when its option is present ----
        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                // An unparsable value is only logged; the default of 1 stays in effect.
                log.warn("Could not parse ngram size option");
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            ClassUtils.instantiateAs(analyzerClass, Analyzer.class);

        // Weighting: "tf" disables IDF, "tfidf" or no option enables it, anything else is rejected.
        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
        } else {
            processIdf = true;

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());

        // "INF" selects positive infinity as the norm; any other value is parsed as a float.
        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;

        // ---- Tokenization ----
        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        // Pruning is enabled only when a non-negative maxDFSigma was supplied; the TF output then
        // goes to a "-toprune" folder so the pruned result can take the regular folder name.
        boolean shouldPrune = maxDFSigma >= 0.0;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

        // ---- Term-frequency vectors ----
        // With IDF enabled, -1.0f / false are passed here instead of norm / logNormalize; the real
        // values are handed to processTfIdf further down.
        if (!processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,

        // ---- Pruning ----
        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            // Element [1] of the first pair member is used as the vector count.
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            // The unpruned TF directory is deleted once pruning has run.
            HadoopUtil.delete(new Configuration(conf), tfDir);
        // ---- TF-IDF ----
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
    } catch (OptionException e) {
        log.error("Exception", e);
    return 0;

From source file:org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles.java

/**
 * Parses the command-line options and drives the vectorization steps visible below: tokenize the
 * input, create term-frequency vectors, optionally prune by document frequency, and optionally
 * convert to TF-IDF.
 *
 * NOTE(review): this listing looks truncated — many option builder chains lack their trailing
 * calls, the option group is never finished, and numerous closing braces are absent.
 *
 * @param args command-line arguments handed to the parser
 * @return {@code -1} when the help option is present, otherwise {@code 0}
 * @throws Exception declared by the signature; an OptionException is caught and logged instead
 */
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    // ---- Option definitions ----
    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withDescription("The chunkSize in MegaBytes. Default Value: 100MB").withShortName("chunk")
            .create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withDescription("The kind of weight to use. Currently TF or TFIDF. Default: TFIDF")

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, "
                            + "it will override this value.")

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
                            + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less "
                            + "than 0 no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent")

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
    try {
        Parser parser = new Parser();
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            return -1;

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        // ---- Option values: each default below is replaced only when its option is present ----
        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                // An unparsable value is only logged; the default of 1 stays in effect.
                log.warn("Could not parse ngram size option");
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it

        // Weighting: "tf" disables IDF, "tfidf" or no option enables it, anything else is rejected.
        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
        } else {
            processIdf = true;

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());

        // "INF" selects positive infinity as the norm; any other value is parsed as a float.
        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        // ---- Tokenization ----
        log.info("Tokenizing documents in {}", inputDir);
        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom
        // to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        // Pruning is enabled when maxDFSigma is non-negative or maxDFPercent is positive; with the
        // default maxDFPercent of 99 this holds unless the option sets it to 0 or less.
        boolean shouldPrune = maxDFSigma >= 0.0 || maxDFPercent > 0.00;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
        // ---- Term-frequency vectors ----
        log.info("Creating Term Frequency Vectors");
        // With IDF enabled, -1.0f / false are passed here instead of norm / logNormalize; the real
        // values are handed to processTfIdf further down.
        if (processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            log.info("Calculating IDF");
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,

        // ---- Pruning ----
        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            // Element [1] of the first pair member is used as the vector count.
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            if (maxDFSigma >= 0.0) {
                Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
                Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

                // Calculate the standard deviation
                double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
                maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Scale the percentage by the vector count to get the threshold passed to pruneVectors.
            long maxDFThreshold = (long) (vectorCount * (maxDF / 100.0f));

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            // The unpruned TF directory is deleted once pruning has run.
            HadoopUtil.delete(new Configuration(conf), tfDir);
        // ---- TF-IDF ----
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
    } catch (OptionException e) {
        log.error("Exception", e);
    return 0;