List of usage examples for java.lang Float POSITIVE_INFINITY
public static final float POSITIVE_INFINITY — a constant holding the positive infinity of type float.
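Before the project examples below, a minimal self-contained sketch of the constant's basic behavior (the demo class name is ours; every call is standard java.lang.Float API):

public class PositiveInfinityDemo {
    public static void main(String[] args) {
        float inf = Float.POSITIVE_INFINITY;
        // Dividing a positive float by zero, or overflowing the float range, yields positive infinity.
        System.out.println(1.0f / 0.0f == inf);            // true
        System.out.println(Float.MAX_VALUE * 2.0f == inf); // true
        // Infinity compares greater than every finite float, which is why it is a
        // convenient initial value when scanning for a minimum.
        System.out.println(Float.MAX_VALUE < inf);         // true
        // Detection helpers.
        System.out.println(Float.isInfinite(inf));         // true
        System.out.println(Float.isNaN(inf));              // false
    }
}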
From source file:io.github.gjyaiya.stetho.realm.Database.java
private List<Object> flattenRows(Table table, long limit, boolean addRowIndex) {
    Util.throwIfNot(limit >= 0);
    final List<Object> flatList = new ArrayList<>();
    long numColumns = table.getColumnCount();
    final RowFetcher rowFetcher = RowFetcher.getInstance();
    final long tableSize = table.size();
    for (long index = 0; index < limit && index < tableSize; index++) {
        final long row = ascendingOrder ? index : (tableSize - index - 1);
        final RowWrapper rowData = RowWrapper.wrap(rowFetcher.getRow(table, row));
        if (addRowIndex) {
            flatList.add(rowData.getIndex());
        }
        for (int column = 0; column < numColumns; column++) {
            switch (rowData.getColumnType(column)) {
            case INTEGER:
                if (rowData.isNull(column)) { flatList.add(NULL); } else { flatList.add(rowData.getLong(column)); }
                break;
            case BOOLEAN:
                if (rowData.isNull(column)) { flatList.add(NULL); } else { flatList.add(rowData.getBoolean(column)); }
                break;
            case STRING:
                if (rowData.isNull(column)) { flatList.add(NULL); } else { flatList.add(rowData.getString(column)); }
                break;
            case BINARY:
                if (rowData.isNull(column)) { flatList.add(NULL); } else { flatList.add(rowData.getBinaryByteArray(column)); }
                break;
            case FLOAT:
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    final float aFloat = rowData.getFloat(column);
                    if (Float.isNaN(aFloat)) {
                        flatList.add("NaN");
                    } else if (aFloat == Float.POSITIVE_INFINITY) {
                        flatList.add("Infinity");
                    } else if (aFloat == Float.NEGATIVE_INFINITY) {
                        flatList.add("-Infinity");
                    } else {
                        flatList.add(aFloat);
                    }
                }
                break;
            case DOUBLE:
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    final double aDouble = rowData.getDouble(column);
                    if (Double.isNaN(aDouble)) {
                        flatList.add("NaN");
                    } else if (aDouble == Double.POSITIVE_INFINITY) {
                        flatList.add("Infinity");
                    } else if (aDouble == Double.NEGATIVE_INFINITY) {
                        flatList.add("-Infinity");
                    } else {
                        flatList.add(aDouble);
                    }
                }
                break;
            case OLD_DATE:
            case DATE:
                if (rowData.isNull(column)) { flatList.add(NULL); } else { flatList.add(formatDate(rowData.getDate(column))); }
                break;
            case OBJECT:
                if (rowData.isNullLink(column)) { flatList.add(NULL); } else { flatList.add(rowData.getLink(column)); }
                break;
            case LIST:
                // LIST is never null
                flatList.add(formatList(rowData.getLinkList(column)));
                break;
            default:
                flatList.add("unknown column type: " + rowData.getColumnType(column));
                break;
            }
        }
    }
    if (limit < table.size()) {
        for (int column = 0; column < numColumns; column++) {
            flatList.add("{truncated}");
        }
    }
    return flatList;
}
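The FLOAT and DOUBLE branches above map non-finite values to the strings "NaN", "Infinity" and "-Infinity" because JSON has no literal for them. A stripped-down sketch of the same mapping (the helper name toJsonSafe is ours, not part of the Stetho-Realm code):

// Hypothetical helper illustrating the same non-finite-to-string mapping.
static Object toJsonSafe(float value) {
    if (Float.isNaN(value)) {
        return "NaN";
    } else if (value == Float.POSITIVE_INFINITY) {
        return "Infinity";
    } else if (value == Float.NEGATIVE_INFINITY) {
        return "-Infinity";
    }
    return value; // finite floats are autoboxed and serialized as ordinary numbers
}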
From source file:org.shaman.terrain.polygonal.GraphToHeightmap.java
private static void findClosestTwoPoints(Vector3f[] points, float px, float py, Vector3f first, Vector3f second) {
    float dist1 = Float.POSITIVE_INFINITY;
    float dist2 = Float.POSITIVE_INFINITY;
    for (Vector3f p : points) {
        float d = dist(p, px, py);
        if (d < dist1) {
            dist2 = dist1;
            second.set(first);
            dist1 = d;
            first.set(p);
        } else if (d < dist2) {
            dist2 = d;
            second.set(p);
        }
    }
}
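Seeding both running distances with Float.POSITIVE_INFINITY guarantees the first candidate is always accepted, since every finite float compares less than positive infinity; an unmatched slot simply keeps its infinite distance when fewer than two points are available. The same two-smallest scan in a minimal, self-contained form (names are ours):

// Hypothetical helper: returns the two smallest values, padding with +Infinity
// when the input has fewer than two elements.
static float[] twoSmallest(float[] values) {
    float first = Float.POSITIVE_INFINITY;
    float second = Float.POSITIVE_INFINITY;
    for (float v : values) {
        if (v < first) {
            second = first; // previous best becomes the runner-up
            first = v;
        } else if (v < second) {
            second = v;
        }
    }
    return new float[] { first, second };
}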
From source file:net.qvex.dommel.data.DommelDataService.java
public boolean getData(String username, String password) throws ClientProtocolException, IOException {
    // adapted from phptelemeter
    unlimited = false;

    /* login */
    Map<String, String> urlParameters = new HashMap<String, String>();
    urlParameters.put("op", "login");
    urlParameters.put("new_language", "english");
    urlParameters.put("submit", "login");
    urlParameters.put("username", username);
    urlParameters.put("password", password);

    String res;
    // try{
    res = this.httpPost(URL_LOGIN, urlParameters);
    // }catch(Exception e)
    // {
    //     // TODO: check for errors in res. possibly just catch nullpointer
    //     // exception..
    //     throw new Exception(e);
    // }

    /* go to the packages page, and get the serv_id and client_id */
    res = this.httpGet(URL_PACKAGES);
    // TODO: check for errors in res
    String[] lines = res.split("\n");
    String log = null;
    int pos = 0;

    /* figure out the stats exact url */
    for (int i = 0; i < lines.length; i++) {
        pos = lines[i].indexOf(URL_STATS_INIT);
        if (pos >= 0) {
            log = lines[i].substring(pos);
            break;
        }
    }
    String url_stats = log.substring(0, log.indexOf("'"));

    /* and get the data */
    String data = this.httpGet(url_stats);

    /* logout */
    res = this.httpGet(URL_LOGOUT);

    lines = data.split("/n");
    String data2 = null;
    pos = 0;

    /* find the entry position */
    for (int i = 0; i < lines.length; i++) {
        pos = lines[i].indexOf("total traffic downloaded in broadband");
        if (pos >= 0) {
            data2 = lines[i].substring(pos);
            break;
        }
    }
    lines = data2.split("<br>");

    /* set some default positions */
    int pos_remaining = -1;
    int pos_traffic = -1;
    int pos_reset_date = -1;
    int pos_total = -1;
    @SuppressWarnings("unused")
    int strpos_total = -1;

    /* position finding & data cleanup */
    for (int i = 0; i < lines.length; i++) {
        lines[i] = stripTags(lines[i]);
        //System.out.println(lines[i]);
        if (lines[i].contains("total traffic downloaded")) {
            pos_traffic = i;
        } else if (lines[i].contains("next counter reset")) {
            pos_reset_date = i;
        } else if (lines[i].contains("remaining")) {
            pos_remaining = i;
            if (lines[i].contains("unlimited")) {
                unlimited = true;
            }
        } else if (lines[i].contains("maximum datatransfer")) {
            pos_total = i;
            /* data cleanup */
            int test_ind = lines[i].indexOf("maximum datatransfer:");
            if (test_ind >= 0 && test_ind + 1 < lines[i].length()) {
                lines[i] = lines[i].substring(test_ind + 21);
            }
        }
        /* data cleanup */
        int test_ind = lines[i].indexOf(":");
        if (test_ind >= 0 && test_ind + 1 < lines[i].length()) {
            lines[i] = lines[i].substring(test_ind + 2);
        }
    }

    /* stats */
    /* total used */
    volume_used = Float.parseFloat(lines[pos_traffic].substring(0, lines[pos_traffic].length() - 3)) * 1024;
    volume_remaining = 0;

    /* remaining, if exists? */
    if (pos_remaining >= 0) {
        if (!unlimited) {
            volume_remaining = Float.parseFloat(lines[pos_remaining].substring(0, lines[pos_remaining].length() - 3)) * 1024;
            if (pos_total >= 0) {
                volume_total = Float.parseFloat(lines[pos_total].substring(0, 4)) * 1024;
            }
        } else {
            // Unlimited account
            volume_remaining = Float.POSITIVE_INFINITY;
        }
    }

    /* reset date */
    String reset_date_str = lines[pos_reset_date].substring(0, 10);
    DateFormat df = new SimpleDateFormat("dd/MM/yyyy");
    try {
        reset_date.setTime(df.parse(reset_date_str));
    } catch (ParseException e) {
        e.printStackTrace();
        // TODO: handle error
        return false;
    }
    days_left = calculateDaysLeft(reset_date);

    Date now = new Date();
    Editor edit = prefs.edit();
    edit.putFloat("volume_used", volume_used);
    edit.putFloat("volume_remaining", volume_remaining);
    edit.putFloat("volume_total", volume_total);
    edit.putLong("reset_date", reset_date.getTimeInMillis());
    edit.putInt("days_left", days_left);
    edit.putBoolean("unlimited", unlimited);
    edit.putLong("last_update", now.getTime());
    edit.putBoolean("last_update_success", true);
    return edit.commit();
}
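Here Float.POSITIVE_INFINITY serves as a sentinel for "unlimited remaining volume" and is written to the preferences like any other float. A sketch of how a reader of that preference might interpret the sentinel, assuming prefs is the same SharedPreferences instance used above (the formatting logic is ours):

// Hypothetical consumer of the "volume_remaining" preference written above.
float remaining = prefs.getFloat("volume_remaining", 0f);
String label;
if (Float.isInfinite(remaining)) {
    label = "unlimited";               // the POSITIVE_INFINITY sentinel
} else {
    label = remaining + " MB remaining";
}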
From source file:net.sf.json.TestJSONArray.java
public void testConstructor_primitive_array_float_Infinity() {
    try {
        JSONArray.fromObject(new float[] { Float.NEGATIVE_INFINITY });
        fail("Should have thrown a JSONException");
    } catch (JSONException expected) {
        // OK
    }
    try {
        JSONArray.fromObject(new float[] { Float.POSITIVE_INFINITY });
        fail("Should have thrown a JSONException");
    } catch (JSONException expected) {
        // OK
    }
}
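The test documents that json-lib rejects non-finite floats outright, since the JSON grammar has no token for Infinity or NaN. A caller can screen values first with the standard java.lang checks (the guard method below is ours, not part of json-lib):

// Hypothetical guard used before handing floats to a JSON serializer.
static void requireFinite(float[] values) {
    for (float v : values) {
        if (Float.isNaN(v) || Float.isInfinite(v)) {
            throw new IllegalArgumentException("JSON cannot represent " + v);
        }
    }
}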
From source file:edworld.pdfreader4humans.PDFReader.java
protected Component createGroup(int groupIndex, Map<Component, Integer> groupMap) {
    float fromX = Float.POSITIVE_INFINITY;
    float fromY = Float.POSITIVE_INFINITY;
    float toX = Float.NEGATIVE_INFINITY;
    float toY = Float.NEGATIVE_INFINITY;
    for (Component component : groupMap.keySet())
        if (groupMap.get(component) == groupIndex) {
            fromX = Math.min(component.getFromX(), fromX);
            fromY = Math.min(component.getFromY(), fromY);
            toX = Math.max(component.getToX(), toX);
            toY = Math.max(component.getToY(), toY);
        }
    return new GroupComponent(fromX, fromY, toX, toY);
}
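Starting the bounds at opposite infinities lets the first matching component define the bounding box, because Math.min(x, Float.POSITIVE_INFINITY) is x and Math.max(x, Float.NEGATIVE_INFINITY) is x for every finite x. A tiny standalone illustration (the sample values are arbitrary):

float fromX = Float.POSITIVE_INFINITY;
float toX = Float.NEGATIVE_INFINITY;
for (float x : new float[] { 12.5f, 3.0f, 47.25f }) {
    fromX = Math.min(x, fromX); // first iteration: min(12.5, +Infinity) = 12.5
    toX = Math.max(x, toX);     // first iteration: max(12.5, -Infinity) = 12.5
}
System.out.println(fromX + " .. " + toX); // 3.0 .. 47.25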
From source file:wqm.web.server.controller.WQMCalibrationController.java
private void ecCalibrateCommand(HttpSession session, Station station, AtlasSensor sensor, int phaseID,
        String command, ECSensorProbe ec_sensor_type) {
    logger.error("EC Calibrate Command.");
    if (CalibrationCommands.Accept.commandEquals(command)) {
        stationManager.acceptCalibrationPhase(session, true, station, sensor, phaseID,
                ec_sensor_type.getPacketVariable(), Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY);
        if ((phaseID + 1) >= sensor.getCalibrationPhases()) {
            // We have finished all the phases of calibration for this sensor.
            stationManager.quitCalibrationPhase(session, station, sensor);
            session.setAttribute(Messages.SUCCESS_MESSAGE, "EC Sensor calibrated.");
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error("", e);
            }
            throw new RedirectException("/");
        }
        throw new RedirectException(
                String.format("/wqm/c/%s/%d/%d", station.getCompactAddress(), sensor.getId(), phaseID + 1));
    }
}
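Here the two Float.POSITIVE_INFINITY arguments appear to act as "no calibration value supplied" placeholders. A receiver can tell such a placeholder apart from a real reading with the standard check below (a sketch only, not the actual wqm.web.server implementation):

// Hypothetical receiver-side check for placeholder calibration values.
static boolean isPlaceholder(float calibrationValue) {
    return Float.isInfinite(calibrationValue);
}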
From source file:gedi.util.ArrayUtils.java
public static int argmin(float[] a) {
    float re = Float.POSITIVE_INFINITY;
    int arg = -1;
    for (int i = 0; i < a.length; i++) {
        if (a[i] < re) {
            re = a[i];
            arg = i;
        }
    }
    return arg;
}
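Because the comparison is a strict less-than against an initial value of Float.POSITIVE_INFINITY, argmin returns -1 not only for an empty array but also when every element is positive infinity or NaN (comparisons with NaN are always false). A few illustrative expectations, assuming the method above (not part of the library's own tests):

float[] empty = {};
float[] allInf = { Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY };
float[] allNaN = { Float.NaN, Float.NaN };
float[] mixed = { Float.POSITIVE_INFINITY, 3.5f, Float.NaN };
System.out.println(argmin(empty));  // -1
System.out.println(argmin(allInf)); // -1: nothing is strictly less than +Infinity
System.out.println(argmin(allNaN)); // -1: comparisons with NaN are false
System.out.println(argmin(mixed));  // 1:  3.5f is the only value below +Infinity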
From source file:org.opentripplanner.routing.algorithm.strategies.WeightTable.java
private void floyd() {
    LOG.debug("Floyd");
    int n = table.length;
    for (int k = 0; k < n; k++) {
        for (int i = 0; i < n; i++) {
            double ik = table[i][k];
            if (ik == Float.POSITIVE_INFINITY)
                continue;
            for (int j = 0; j < n; j++) {
                double kj = table[k][j];
                if (kj == Float.POSITIVE_INFINITY)
                    continue;
                double ikj = ik + kj;
                double ij = table[i][j];
                if (ikj < ij)
                    table[i][j] = (float) ikj;
            }
        }
        if (k % 50 == 0)
            LOG.debug("k=" + k + "/" + n);
    }
}
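In the weight table, Float.POSITIVE_INFINITY encodes "no path found yet". Note that the entries are read into double variables: widening a float infinity yields the double infinity, so the == Float.POSITIVE_INFINITY test still matches. A quick standalone demonstration of that widening (unrelated to OpenTripPlanner itself):

float fInf = Float.POSITIVE_INFINITY;
double widened = fInf; // implicit widening conversion
System.out.println(widened == Float.POSITIVE_INFINITY);  // true
System.out.println(widened == Double.POSITIVE_INFINITY); // true
System.out.println(Float.POSITIVE_INFINITY + 1.0 < Double.POSITIVE_INFINITY); // false: infinity plus a finite value stays infinite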
From source file:com.elex.dmp.vectorizer.SparseVectorsFromSequenceFiles.java
@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();
    Option outputDirOpt = DefaultOptionCreator.outputOption().create();
    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();
    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();
    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();
    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();
    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency. Default is 1").withShortName("md").create();
    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription("The max percentage of docs for the DF. Can be used to remove really high frequency terms."
                    + " Expressed as an integer between 0 and 100. Default is 99. If maxDFSigma is also set, it will override this value.")
            .withShortName("x").create();
    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription("What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
                    + " Can be used to remove really high frequency terms."
                    + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
                    + "will be filtered out. Default is -1.0. Overrides maxDFPercent")
            .withShortName("xs").create();
    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();
    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr").create();
    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription("The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. "
                    + "Must be greater or equal to 0. The default is not to normalize")
            .withShortName("n").create();
    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();
    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();
    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();
    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();
    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;
        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
        }

        boolean processIdf;
        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }

        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        // TODO: move this into DictionaryVectorizer, and then fold SparseVectorsFrom with EncodedVectorsFrom
        // to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }
        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

        if (!processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport,
                    maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport,
                    maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
        }

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf, chunkSize);
        }

        long maxDF = maxDFPercent; // if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}
From source file:org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles.java
@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();
    Option outputDirOpt = DefaultOptionCreator.outputOption().create();
    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();
    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();
    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. Default Value: 100MB").withShortName("chunk").create();
    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF. Default: TFIDF")
            .withShortName("wt").create();
    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency. Default is 1").withShortName("md").create();
    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription("The max percentage of docs for the DF. Can be used to remove really high frequency terms."
                    + " Expressed as an integer between 0 and 100. Default is 99. If maxDFSigma is also set, "
                    + "it will override this value.")
            .withShortName("x").create();
    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription("What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
                    + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
                    + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less "
                    + "than 0 no vectors will be filtered out. Default is -1.0. Overrides maxDFPercent")
            .withShortName("xs").create();
    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();
    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr").create();
    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription("The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. "
                    + "Must be greater or equal to 0. The default is not to normalize")
            .withShortName("n").create();
    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();
    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();
    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();
    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();
    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;
        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

        boolean processIdf;
        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }

        log.info("Tokenizing documents in {}", inputDir);
        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        // TODO: move this into DictionaryVectorizer, and then fold SparseVectorsFrom with EncodedVectorsFrom
        // to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }
        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0 || maxDFPercent > 0.00;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

        log.info("Creating Term Frequency Vectors");
        if (processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport,
                    maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport,
                    maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
        }

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            log.info("Calculating IDF");
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf, chunkSize);
        }

        long maxDF = maxDFPercent; // if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            if (maxDFSigma >= 0.0) {
                Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
                Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

                // Calculate the standard deviation
                double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
                maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
            }

            long maxDFThreshold = (long) (vectorCount * (maxDF / 100.0f));

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            log.info("Pruning");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}
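In both Mahout variants the command-line token "INF" selects the infinity norm, represented internally as Float.POSITIVE_INFINITY and passed to the vectorizer as the norm power. Java's own parser would produce the same constant for the literal string "Infinity"; a small standalone check, unrelated to Mahout's option handling:

// How the option value maps to the norm constant (standalone illustration).
String power = "INF";
float norm = "INF".equals(power) ? Float.POSITIVE_INFINITY : Float.parseFloat(power);
System.out.println(norm == Float.POSITIVE_INFINITY);             // true
System.out.println(Float.parseFloat("Infinity") == norm);        // true: parseFloat also accepts "Infinity"
System.out.println(Float.isInfinite(Float.parseFloat("1e40")));  // true: 1e40 overflows the float range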