List of usage examples for java.lang Integer MAX_VALUE
int MAX_VALUE
To view the source code for java.lang Integer MAX_VALUE, click the Source Link.
From source file:edu.cmu.lti.oaqa.knn4qa.apps.BuildInMemFwdIndexApp.java
/**
 * Command-line entry point: builds one in-memory forward index per field
 * listed in {@code FeatureExtractor.mFieldNames} and saves each of them
 * under the given output prefix.
 *
 * <p>Required options: root directory, comma-separated sub-directory list,
 * SOLR file name and output index prefix. Optional: a positive maximum
 * number of records (defaults to {@code Integer.MAX_VALUE}) and a
 * comma-separated list of field names to skip.
 *
 * @param args raw command-line arguments
 */
public static void main(String[] args) {
    Options options = new Options();

    options.addOption(CommonParams.ROOT_DIR_PARAM, null, true, CommonParams.ROOT_DIR_DESC);
    options.addOption(CommonParams.SUB_DIR_TYPE_PARAM, null, true, CommonParams.SUB_DIR_TYPE_DESC);
    options.addOption(CommonParams.MAX_NUM_REC_PARAM, null, true, CommonParams.MAX_NUM_REC_DESC);
    options.addOption(CommonParams.SOLR_FILE_NAME_PARAM, null, true, CommonParams.SOLR_FILE_NAME_DESC);
    options.addOption(CommonParams.OUT_INDEX_PARAM, null, true, CommonParams.OUT_MINDEX_DESC);
    options.addOption(EXCLUDE_FIELDS_PARAM, null, true, EXCLUDE_FIELDS_DESC);

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    try {
        CommandLine cmd = parser.parse(options, args);

        // NOTE(review): Usage(...) presumably prints help and terminates - confirm.
        String rootDir = cmd.getOptionValue(CommonParams.ROOT_DIR_PARAM);
        if (rootDir == null) {
            Usage("Specify: " + CommonParams.ROOT_DIR_DESC, options);
        }

        String outPrefix = cmd.getOptionValue(CommonParams.OUT_INDEX_PARAM);
        if (outPrefix == null) {
            Usage("Specify: " + CommonParams.OUT_MINDEX_DESC, options);
        }

        String subDirTypeList = cmd.getOptionValue(CommonParams.SUB_DIR_TYPE_PARAM);
        if (subDirTypeList == null || subDirTypeList.isEmpty()) {
            Usage("Specify: " + CommonParams.SUB_DIR_TYPE_DESC, options);
        }

        String solrFileName = cmd.getOptionValue(CommonParams.SOLR_FILE_NAME_PARAM);
        if (solrFileName == null) {
            Usage("Specify: " + CommonParams.SOLR_FILE_NAME_DESC, options);
        }

        // No limit unless the user supplies a positive integer.
        int maxNumRec = Integer.MAX_VALUE;
        String maxNumRecArg = cmd.getOptionValue(CommonParams.MAX_NUM_REC_PARAM);
        if (maxNumRecArg != null) {
            try {
                maxNumRec = Integer.parseInt(maxNumRecArg);
                if (maxNumRec <= 0) {
                    Usage("The maximum number of records should be a positive integer", options);
                }
            } catch (NumberFormatException e) {
                Usage("The maximum number of records should be a positive integer", options);
            }
        }

        String exclFieldsArg = cmd.getOptionValue(EXCLUDE_FIELDS_PARAM);
        String[] exclFields = (exclFieldsArg != null) ? exclFieldsArg.split(",") : new String[0];

        String[] subDirs = subDirTypeList.split(",");

        for (int fieldIdx = 0; fieldIdx < FeatureExtractor.mFieldNames.length; ++fieldIdx) {
            String field = FeatureExtractor.mFieldsSOLR[fieldIdx];
            String fieldName = FeatureExtractor.mFieldNames[fieldIdx];

            if (StringUtilsLeo.isInArrayNoCase(fieldName, exclFields)) {
                System.out.println("Skipping field: " + field);
                continue;
            }
            System.out.println("Processing field: " + field);

            // One input file per sub-directory: <root>/<subDir>/<solrFileName>
            String[] fileNames = new String[subDirs.length];
            int pos = 0;
            for (String subDir : subDirs) {
                fileNames[pos++] = rootDir + "/" + subDir + "/" + solrFileName;
            }

            InMemForwardIndex fwdIndex = new InMemForwardIndex(field, fileNames, maxNumRec);
            fwdIndex.save(InMemIndexFeatureExtractor.indexFileName(outPrefix, fieldName));
        }
    } catch (ParseException e) {
        Usage("Cannot parse arguments", options);
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }
}
From source file:edu.cmu.lti.oaqa.knn4qa.apps.GenTranEmbeddings.java
public static void main(String[] args) { Options options = new Options(); options.addOption(CommonParams.MEMINDEX_PARAM, null, true, CommonParams.MEMINDEX_DESC); options.addOption(CommonParams.GIZA_ROOT_DIR_PARAM, null, true, CommonParams.GIZA_ROOT_DIR_DESC); options.addOption(CommonParams.GIZA_ITER_QTY_PARAM, null, true, CommonParams.GIZA_ITER_QTY_DESC); options.addOption(OUT_FILE_PARAM, null, true, OUT_FILE_DESC); options.addOption(MAX_MODEL_ORDER_PARAM, null, true, MAX_MODEL_ORDER_DESC); options.addOption(MIN_PROB_PARAM, null, true, MIN_PROB_DESC); options.addOption(MAX_DIGIT_PARAM, null, true, MAX_DIGIT_DESC); options.addOption(CommonParams.MAX_WORD_QTY_PARAM, null, true, CommonParams.MAX_WORD_QTY_PARAM); CommandLineParser parser = new org.apache.commons.cli.GnuParser(); try {//from w ww. j av a2 s . c om CommandLine cmd = parser.parse(options, args); int maxWordQty = Integer.MAX_VALUE; String tmpi = cmd.getOptionValue(CommonParams.MAX_WORD_QTY_PARAM); if (null != tmpi) { maxWordQty = Integer.parseInt(tmpi); } String memIndexPref = cmd.getOptionValue(CommonParams.MEMINDEX_PARAM); if (null == memIndexPref) { Usage("Specify '" + CommonParams.MEMINDEX_DESC + "'", options); } String gizaRootDir = cmd.getOptionValue(CommonParams.GIZA_ROOT_DIR_PARAM); if (null == gizaRootDir) { Usage("Specify '" + CommonParams.GIZA_ROOT_DIR_PARAM + "'", options); } int gizaIterQty = -1; if (cmd.hasOption(CommonParams.GIZA_ITER_QTY_PARAM)) { gizaIterQty = Integer.parseInt(cmd.getOptionValue(CommonParams.GIZA_ITER_QTY_PARAM)); } if (gizaIterQty <= 0) { Usage("Specify '" + CommonParams.GIZA_ITER_QTY_DESC + "'", options); } int maxModelOrder = -1; if (cmd.hasOption(MAX_MODEL_ORDER_PARAM)) { maxModelOrder = Integer.parseInt(cmd.getOptionValue(MAX_MODEL_ORDER_PARAM)); } String outFilePrefix = cmd.getOptionValue(OUT_FILE_PARAM); if (null == outFilePrefix) { Usage("Specify '" + OUT_FILE_DESC + "'", options); } float minProb = 0; if (cmd.hasOption(MIN_PROB_PARAM)) { minProb = 
Float.parseFloat(cmd.getOptionValue(MIN_PROB_PARAM)); } else { Usage("Specify '" + MIN_PROB_DESC + "'", options); } int maxDigit = 5; if (cmd.hasOption(MAX_DIGIT_PARAM)) { maxDigit = Integer.parseInt(cmd.getOptionValue(MAX_DIGIT_PARAM)); } // We use unlemmatized text here, because lemmatized dictionary is going to be mostly subset of the unlemmatized one. int fieldId = FeatureExtractor.TEXT_UNLEMM_FIELD_ID; String memFwdIndxName = FeatureExtractor.indexFileName(memIndexPref, FeatureExtractor.mFieldNames[fieldId]); FrequentIndexWordFilterAndRecoder filterAndRecoder = new FrequentIndexWordFilterAndRecoder( memFwdIndxName, maxWordQty); InMemForwardIndex index = new InMemForwardIndex(memFwdIndxName); BM25SimilarityLucene simil = new BM25SimilarityLucene(FeatureExtractor.BM25_K1, FeatureExtractor.BM25_B, index); String prefix = gizaRootDir + "/" + FeatureExtractor.mFieldNames[fieldId] + "/"; GizaVocabularyReader answVoc = new GizaVocabularyReader(prefix + "source.vcb", filterAndRecoder); GizaVocabularyReader questVoc = new GizaVocabularyReader(prefix + "target.vcb", filterAndRecoder); GizaTranTableReaderAndRecoder answToQuestTran = new GizaTranTableReaderAndRecoder( false /* don't flip a translation table */, prefix + "/output.t1." 
+ gizaIterQty, filterAndRecoder, answVoc, questVoc, (float) FeatureExtractor.DEFAULT_PROB_SELF_TRAN, minProb); int order = 0; System.out.println("Starting to compute the 0-order model"); HashIntObjMap<SparseVector> currModel = SparseEmbeddingReaderAndRecorder.createTranVecDict(index, filterAndRecoder, minProb, answToQuestTran); System.out.println("0-order model is computed"); SparseEmbeddingReaderAndRecorder.saveDict(index, outFilePrefix + ".0", currModel, maxDigit); System.out.println("0-order model is saved"); while (order < maxModelOrder) { ++order; System.out.println("Starting to compute the " + order + "-order model"); currModel = SparseEmbeddingReaderAndRecorder.nextOrderDict(currModel, index, minProb, answToQuestTran); System.out.println(order + "-order model is computed"); SparseEmbeddingReaderAndRecorder.saveDict(index, outFilePrefix + "." + order, currModel, maxDigit); System.out.println(order + "-order model is saved"); } } catch (ParseException e) { Usage("Cannot parse arguments", options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } System.out.println("Terminated successfully!"); }
From source file:edu.cmu.lti.oaqa.knn4qa.apps.LuceneIndexer.java
public static void main(String[] args) { Options options = new Options(); options.addOption(CommonParams.ROOT_DIR_PARAM, null, true, CommonParams.ROOT_DIR_DESC); options.addOption(CommonParams.SUB_DIR_TYPE_PARAM, null, true, CommonParams.SUB_DIR_TYPE_DESC); options.addOption(CommonParams.MAX_NUM_REC_PARAM, null, true, CommonParams.MAX_NUM_REC_DESC); options.addOption(CommonParams.SOLR_FILE_NAME_PARAM, null, true, CommonParams.SOLR_FILE_NAME_DESC); options.addOption(CommonParams.OUT_INDEX_PARAM, null, true, CommonParams.OUT_MINDEX_DESC); CommandLineParser parser = new org.apache.commons.cli.GnuParser(); try {//from w ww .j a v a2 s. c o m CommandLine cmd = parser.parse(options, args); String rootDir = null; rootDir = cmd.getOptionValue(CommonParams.ROOT_DIR_PARAM); if (null == rootDir) Usage("Specify: " + CommonParams.ROOT_DIR_DESC, options); String outputDirName = cmd.getOptionValue(CommonParams.OUT_INDEX_PARAM); if (null == outputDirName) Usage("Specify: " + CommonParams.OUT_MINDEX_DESC, options); String subDirTypeList = cmd.getOptionValue(CommonParams.SUB_DIR_TYPE_PARAM); if (null == subDirTypeList || subDirTypeList.isEmpty()) Usage("Specify: " + CommonParams.SUB_DIR_TYPE_DESC, options); String solrFileName = cmd.getOptionValue(CommonParams.SOLR_FILE_NAME_PARAM); if (null == solrFileName) Usage("Specify: " + CommonParams.SOLR_FILE_NAME_DESC, options); int maxNumRec = Integer.MAX_VALUE; String tmp = cmd.getOptionValue(CommonParams.MAX_NUM_REC_PARAM); if (tmp != null) { try { maxNumRec = Integer.parseInt(tmp); if (maxNumRec <= 0) { Usage("The maximum number of records should be a positive integer", options); } } catch (NumberFormatException e) { Usage("The maximum number of records should be a positive integer", options); } } File outputDir = new File(outputDirName); if (!outputDir.exists()) { if (!outputDir.mkdirs()) { System.out.println("couldn't create " + outputDir.getAbsolutePath()); System.exit(1); } } if (!outputDir.isDirectory()) { 
System.out.println(outputDir.getAbsolutePath() + " is not a directory!"); System.exit(1); } if (!outputDir.canWrite()) { System.out.println("Can't write to " + outputDir.getAbsolutePath()); System.exit(1); } String subDirs[] = subDirTypeList.split(","); int docNum = 0; // No English analyzer here, all language-related processing is done already, // here we simply white-space tokenize and index tokens verbatim. Analyzer analyzer = new WhitespaceAnalyzer(); FSDirectory indexDir = FSDirectory.open(outputDir); IndexWriterConfig indexConf = new IndexWriterConfig(analyzer.getVersion(), analyzer); System.out.println("Creating a new Lucene index, maximum # of docs to process: " + maxNumRec); indexConf.setOpenMode(OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(indexDir, indexConf); for (int subDirId = 0; subDirId < subDirs.length && docNum < maxNumRec; ++subDirId) { String inputFileName = rootDir + "/" + subDirs[subDirId] + "/" + solrFileName; System.out.println("Input file name: " + inputFileName); BufferedReader inpText = new BufferedReader( new InputStreamReader(CompressUtils.createInputStream(inputFileName))); String docText = XmlHelper.readNextXMLIndexEntry(inpText); for (; docText != null && docNum < maxNumRec; docText = XmlHelper.readNextXMLIndexEntry(inpText)) { ++docNum; Map<String, String> docFields = null; Document luceneDoc = new Document(); try { docFields = XmlHelper.parseXMLIndexEntry(docText); } catch (Exception e) { System.err.println(String.format("Parsing error, offending DOC #%d:\n%s", docNum, docText)); System.exit(1); } String id = docFields.get(UtilConst.TAG_DOCNO); if (id == null) { System.err.println(String.format("No ID tag '%s', offending DOC #%d:\n%s", UtilConst.TAG_DOCNO, docNum, docText)); } luceneDoc.add(new StringField(UtilConst.TAG_DOCNO, id, Field.Store.YES)); for (Map.Entry<String, String> e : docFields.entrySet()) if (!e.getKey().equals(UtilConst.TAG_DOCNO)) { luceneDoc.add(new TextField(e.getKey(), e.getValue(), 
Field.Store.YES)); } indexWriter.addDocument(luceneDoc); if (docNum % 1000 == 0) System.out.println("Indexed " + docNum + " docs"); } System.out.println("Indexed " + docNum + " docs"); } indexWriter.commit(); indexWriter.close(); } catch (ParseException e) { Usage("Cannot parse arguments", options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } }
From source file:edu.cmu.lti.oaqa.knn4qa.apps.FilterTranTable.java
public static void main(String[] args) { Options options = new Options(); options.addOption(INPUT_PARAM, null, true, INPUT_DESC); options.addOption(OUTPUT_PARAM, null, true, OUTPUT_DESC); options.addOption(CommonParams.MEM_FWD_INDEX_PARAM, null, true, CommonParams.MEM_FWD_INDEX_DESC); options.addOption(CommonParams.GIZA_ITER_QTY_PARAM, null, true, CommonParams.GIZA_ITER_QTY_PARAM); options.addOption(CommonParams.GIZA_ROOT_DIR_PARAM, null, true, CommonParams.GIZA_ROOT_DIR_PARAM); options.addOption(CommonParams.MIN_PROB_PARAM, null, true, CommonParams.MIN_PROB_DESC); options.addOption(CommonParams.MAX_WORD_QTY_PARAM, null, true, CommonParams.MAX_WORD_QTY_PARAM); CommandLineParser parser = new org.apache.commons.cli.GnuParser(); try {/* w w w .j a v a2 s .com*/ CommandLine cmd = parser.parse(options, args); String outputFile = null; outputFile = cmd.getOptionValue(OUTPUT_PARAM); if (null == outputFile) { Usage("Specify 'A name of the output file'", options); } String gizaRootDir = cmd.getOptionValue(CommonParams.GIZA_ROOT_DIR_PARAM); if (null == gizaRootDir) { Usage("Specify '" + CommonParams.GIZA_ROOT_DIR_DESC + "'", options); } String gizaIterQty = cmd.getOptionValue(CommonParams.GIZA_ITER_QTY_PARAM); if (null == gizaIterQty) { Usage("Specify '" + CommonParams.GIZA_ITER_QTY_DESC + "'", options); } float minProb = 0; String tmpf = cmd.getOptionValue(CommonParams.MIN_PROB_PARAM); if (tmpf != null) { minProb = Float.parseFloat(tmpf); } int maxWordQty = Integer.MAX_VALUE; String tmpi = cmd.getOptionValue(CommonParams.MAX_WORD_QTY_PARAM); if (null != tmpi) { maxWordQty = Integer.parseInt(tmpi); } String memFwdIndxName = cmd.getOptionValue(CommonParams.MEM_FWD_INDEX_PARAM); if (null == memFwdIndxName) { Usage("Specify '" + CommonParams.MEM_FWD_INDEX_DESC + "'", options); } System.out.println("Filtering index: " + memFwdIndxName + " max # of frequent words: " + maxWordQty + " min. 
probability:" + minProb); VocabularyFilterAndRecoder filter = new FrequentIndexWordFilterAndRecoder(memFwdIndxName, maxWordQty); String srcVocFile = CompressUtils.findFileVariant(gizaRootDir + "/source.vcb"); System.out.println("Source vocabulary file: " + srcVocFile); GizaVocabularyReader srcVoc = new GizaVocabularyReader(srcVocFile, filter); String dstVocFile = CompressUtils.findFileVariant(gizaRootDir + "/target.vcb"); System.out.println("Target vocabulary file: " + dstVocFile); GizaVocabularyReader dstVoc = new GizaVocabularyReader(CompressUtils.findFileVariant(dstVocFile), filter); String inputFile = CompressUtils.findFileVariant(gizaRootDir + "/output.t1." + gizaIterQty); BufferedReader finp = new BufferedReader( new InputStreamReader(CompressUtils.createInputStream(inputFile))); BufferedWriter fout = new BufferedWriter( new OutputStreamWriter(CompressUtils.createOutputStream(outputFile))); try { String line; int prevSrcId = -1; int wordQty = 0; long addedQty = 0; long totalQty = 0; boolean isNotFiltered = false; for (totalQty = 0; (line = finp.readLine()) != null;) { ++totalQty; // Skip empty lines line = line.trim(); if (line.isEmpty()) continue; GizaTranRec rec = new GizaTranRec(line); if (rec.mSrcId != prevSrcId) { ++wordQty; } if (totalQty % REPORT_INTERVAL_QTY == 0) { System.out.println(String.format( "Processed %d lines (%d source word entries) from '%s', added %d lines", totalQty, wordQty, inputFile, addedQty)); } // isNotFiltered should be set after procOneWord if (rec.mSrcId != prevSrcId) { if (rec.mSrcId == 0) isNotFiltered = true; else { String wordSrc = srcVoc.getWord(rec.mSrcId); isNotFiltered = filter == null || (wordSrc != null && filter.checkWord(wordSrc)); } } prevSrcId = rec.mSrcId; if (rec.mProb >= minProb && isNotFiltered) { String wordDst = dstVoc.getWord(rec.mDstId); if (filter == null || (wordDst != null && filter.checkWord(wordDst))) { fout.write(String.format(rec.mSrcId + " " + rec.mDstId + " " + rec.mProb)); fout.newLine(); 
addedQty++; } } } System.out.println( String.format("Processed %d lines (%d source word entries) from '%s', added %d lines", totalQty, wordQty, inputFile, addedQty)); } finally { finp.close(); fout.close(); } } catch (ParseException e) { Usage("Cannot parse arguments", options); } catch (Exception e) { e.printStackTrace(); System.err.println("Terminating due to an exception: " + e); System.exit(1); } }
From source file:edu.cmu.lti.oaqa.knn4qa.apps.ExtractDataAndQueryAsSparseVectors.java
public static void main(String[] args) { String optKeys[] = { CommonParams.MAX_NUM_QUERY_PARAM, MAX_NUM_DATA_PARAM, CommonParams.MEMINDEX_PARAM, IN_QUERIES_PARAM, OUT_QUERIES_PARAM, OUT_DATA_PARAM, TEXT_FIELD_PARAM, TEST_QTY_PARAM, }; String optDescs[] = { CommonParams.MAX_NUM_QUERY_DESC, MAX_NUM_DATA_DESC, CommonParams.MEMINDEX_DESC, IN_QUERIES_DESC, OUT_QUERIES_DESC, OUT_DATA_DESC, TEXT_FIELD_DESC, TEST_QTY_DESC }; boolean hasArg[] = { true, true, true, true, true, true, true, true }; ParamHelper prmHlp = null;// w w w . j av a2 s. co m try { prmHlp = new ParamHelper(args, optKeys, optDescs, hasArg); CommandLine cmd = prmHlp.getCommandLine(); Options opt = prmHlp.getOptions(); int maxNumQuery = Integer.MAX_VALUE; String tmpn = cmd.getOptionValue(CommonParams.MAX_NUM_QUERY_PARAM); if (tmpn != null) { try { maxNumQuery = Integer.parseInt(tmpn); } catch (NumberFormatException e) { UsageSpecify(CommonParams.MAX_NUM_QUERY_PARAM, opt); } } int maxNumData = Integer.MAX_VALUE; tmpn = cmd.getOptionValue(MAX_NUM_DATA_PARAM); if (tmpn != null) { try { maxNumData = Integer.parseInt(tmpn); } catch (NumberFormatException e) { UsageSpecify(MAX_NUM_DATA_PARAM, opt); } } String memIndexPref = cmd.getOptionValue(CommonParams.MEMINDEX_PARAM); if (null == memIndexPref) { UsageSpecify(CommonParams.MEMINDEX_PARAM, opt); } String textField = cmd.getOptionValue(TEXT_FIELD_PARAM); if (null == textField) { UsageSpecify(TEXT_FIELD_PARAM, opt); } textField = textField.toLowerCase(); int fieldId = -1; for (int i = 0; i < FeatureExtractor.mFieldNames.length; ++i) if (FeatureExtractor.mFieldNames[i].compareToIgnoreCase(textField) == 0) { fieldId = i; break; } if (-1 == fieldId) { Usage("Wrong field index, should be one of the following: " + String.join(",", FeatureExtractor.mFieldNames), opt); } InMemForwardIndex indx = new InMemForwardIndex( FeatureExtractor.indexFileName(memIndexPref, FeatureExtractor.mFieldNames[fieldId])); BM25SimilarityLucene bm25simil = new 
BM25SimilarityLucene(FeatureExtractor.BM25_K1, FeatureExtractor.BM25_B, indx); String inQueryFile = cmd.getOptionValue(IN_QUERIES_PARAM); String outQueryFile = cmd.getOptionValue(OUT_QUERIES_PARAM); if ((inQueryFile == null) != (outQueryFile == null)) { Usage("You should either specify both " + IN_QUERIES_PARAM + " and " + OUT_QUERIES_PARAM + " or none of them", opt); } String outDataFile = cmd.getOptionValue(OUT_DATA_PARAM); tmpn = cmd.getOptionValue(TEST_QTY_PARAM); int testQty = 0; if (tmpn != null) { try { testQty = Integer.parseInt(tmpn); } catch (NumberFormatException e) { UsageSpecify(TEST_QTY_PARAM, opt); } } ArrayList<DocEntry> testDocEntries = new ArrayList<DocEntry>(); ArrayList<DocEntry> testQueryEntries = new ArrayList<DocEntry>(); ArrayList<TrulySparseVector> testDocVectors = new ArrayList<TrulySparseVector>(); ArrayList<TrulySparseVector> testQueryVectors = new ArrayList<TrulySparseVector>(); if (outDataFile != null) { BufferedWriter out = new BufferedWriter( new OutputStreamWriter(CompressUtils.createOutputStream(outDataFile))); ArrayList<DocEntryExt> docEntries = indx.getDocEntries(); for (int id = 0; id < Math.min(maxNumData, docEntries.size()); ++id) { DocEntry e = docEntries.get(id).mDocEntry; TrulySparseVector v = bm25simil.getDocSparseVector(e, false); if (id < testQty) { testDocEntries.add(e); testDocVectors.add(v); } outputVector(out, v); } out.close(); } Splitter splitOnSpace = Splitter.on(' ').trimResults().omitEmptyStrings(); if (outQueryFile != null) { BufferedReader inpText = new BufferedReader( new InputStreamReader(CompressUtils.createInputStream(inQueryFile))); BufferedWriter out = new BufferedWriter( new OutputStreamWriter(CompressUtils.createOutputStream(outQueryFile))); String queryText = XmlHelper.readNextXMLIndexEntry(inpText); for (int queryQty = 0; queryText != null && queryQty < maxNumQuery; queryText = XmlHelper .readNextXMLIndexEntry(inpText), queryQty++) { Map<String, String> queryFields = null; // 1. 
Parse a query try { queryFields = XmlHelper.parseXMLIndexEntry(queryText); } catch (Exception e) { System.err.println("Parsing error, offending QUERY:\n" + queryText); throw new Exception("Parsing error."); } String fieldText = queryFields.get(FeatureExtractor.mFieldsSOLR[fieldId]); if (fieldText == null) { fieldText = ""; } ArrayList<String> tmpa = new ArrayList<String>(); for (String s : splitOnSpace.split(fieldText)) tmpa.add(s); DocEntry e = indx.createDocEntry(tmpa.toArray(new String[tmpa.size()])); TrulySparseVector v = bm25simil.getDocSparseVector(e, true); if (queryQty < testQty) { testQueryEntries.add(e); testQueryVectors.add(v); } outputVector(out, v); } out.close(); } int testedQty = 0, diffQty = 0; // Now let's do some testing for (int iq = 0; iq < testQueryEntries.size(); ++iq) { DocEntry queryEntry = testQueryEntries.get(iq); TrulySparseVector queryVector = testQueryVectors.get(iq); for (int id = 0; id < testDocEntries.size(); ++id) { DocEntry docEntry = testDocEntries.get(id); TrulySparseVector docVector = testDocVectors.get(id); float val1 = bm25simil.compute(queryEntry, docEntry); float val2 = TrulySparseVector.scalarProduct(queryVector, docVector); ++testedQty; if (Math.abs(val1 - val2) > 1e5) { System.err.println( String.format("Potential mismatch BM25=%f <-> scalar product=%f", val1, val2)); ++diffQty; } } } if (testedQty > 0) System.out.println(String.format("Tested %d Mismatched %d", testedQty, diffQty)); } catch (ParseException e) { Usage("Cannot parse arguments: " + e, prmHlp != null ? prmHlp.getOptions() : null); e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); System.err.println("Terminating due to an exception: " + e); System.exit(1); } }
From source file:com.athena.peacock.agent.sample.CommandExecutorSample.java
/** * <pre>//from w w w. ja v a 2s . co m * * </pre> * @param args * @throws CommandLineException * @throws IOException */ public static void main(String[] args) throws CommandLineException, IOException { // Windows wmic usage // http://blog.naver.com/PostView.nhn?blogId=diadld2&logNo=30157625015 // http://www.petenetlive.com/KB/Article/0000619.htm OSType osType = OSUtil.getOSName(); File executable = null; Commandline commandLine = null; if (osType.equals(OSType.WINDOWS)) { executable = new File("C:\\Windows\\System32\\wbem\\WMIC.exe"); commandLine = new Commandline(); commandLine.setExecutable(executable.getAbsolutePath()); //commandLine.setExecutable("wmic"); // available only that command is in path /** change working directory if necessary */ commandLine.setWorkingDirectory("/"); /** invoke createArg() and setValue() one by one for each arguments */ commandLine.createArg().setValue("product"); commandLine.createArg().setValue("get"); commandLine.createArg().setValue("name,vendor,version"); /** invoke createArg() and setLine() for entire arguments */ //commandLine.createArg().setLine("product get name,vendor,version"); /** verify command string */ System.out.println("C:\\> " + commandLine.toString() + "\n"); } else { executable = new File("/bin/cat"); commandLine = new Commandline(); commandLine.setExecutable(executable.getAbsolutePath()); /** change working directory if necessary */ commandLine.setWorkingDirectory("/"); /** invoke createArg() and setValue() one by one for each arguments */ commandLine.createArg().setValue("-n"); commandLine.createArg().setValue("/etc/hosts"); /** invoke createArg() and setLine() for entire arguments */ //commandLine.createArg().setLine("-n /etc/hosts"); /** verify command string */ System.out.println("~]$ " + commandLine.toString() + "\n"); } /** also enable StringWriter, PrintWriter, WriterStreamConsumer and etc. 
*/ StringStreamConsumer consumer = new CommandLineUtils.StringStreamConsumer(); int returnCode = CommandLineUtils.executeCommandLine(commandLine, consumer, consumer, Integer.MAX_VALUE); if (returnCode == 0) { // success System.out.println("==============[SUCCEED]=============="); System.out.println("[" + consumer.getOutput().substring(0, consumer.getOutput().length() - 1) + "]"); if (osType.equals(OSType.WINDOWS)) { List<Product> productList = parse(consumer.getOutput()); for (Product product : productList) { System.out.println(product); } int UTF_8 = 0x01; int EUC_KR = 0x02; int KSC5601 = 0x04; int MS949 = 0x08; int ISO8859_1 = 0x10; int mode = 0x00; //mode ^= UTF_8; //mode ^= EUC_KR; //mode ^= KSC5601; //mode ^= MS949; //mode ^= ISO8859_1; if ((mode & UTF_8) == UTF_8) { System.out.println("+:+:+:+: UTF-8 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes(), "UTF-8")); System.out.println("+:+:+:+: EUC-KR => UTF-8 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("EUC-KR"), "UTF-8")); System.out.println("+:+:+:+: KSC5601 => UTF-8 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("KSC5601"), "UTF-8")); System.out.println("+:+:+:+: MS949 => UTF-8 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("MS949"), "UTF-8")); System.out.println("+:+:+:+: ISO8859_1 => UTF-8 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("ISO8859_1"), "UTF-8")); } if ((mode & EUC_KR) == EUC_KR) { System.out.println("+:+:+:+: EUC-KR +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes(), "EUC-KR")); System.out.println("+:+:+:+: UTF-8 => EUC-KR +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("UTF-8"), "EUC-KR")); System.out.println("+:+:+:+: KSC5601 => EUC-KR +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("KSC5601"), "EUC-KR")); System.out.println("+:+:+:+: MS949 => EUC-KR +:+:+:+:"); System.out.println(new 
String(consumer.getOutput().getBytes("MS949"), "EUC-KR")); System.out.println("+:+:+:+: ISO8859_1 => EUC-KR +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("ISO8859_1"), "EUC-KR")); } if ((mode & KSC5601) == KSC5601) { System.out.println("+:+:+:+: KSC5601 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes(), "KSC5601")); System.out.println("+:+:+:+: EUC-KR => KSC5601 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("EUC-KR"), "KSC5601")); System.out.println("+:+:+:+: UTF-8 => KSC5601 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("UTF-8"), "KSC5601")); System.out.println("+:+:+:+: MS949 => KSC5601 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("MS949"), "KSC5601")); System.out.println("+:+:+:+: ISO8859_1 => KSC5601 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("ISO8859_1"), "KSC5601")); } if ((mode & MS949) == MS949) { System.out.println("+:+:+:+: MS949 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes(), "MS949")); System.out.println("+:+:+:+: EUC-KR => MS949 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("EUC-KR"), "MS949")); System.out.println("+:+:+:+: UTF-8 => MS949 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("UTF-8"), "MS949")); System.out.println("+:+:+:+: KSC5601 => MS949 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("KSC5601"), "MS949")); System.out.println("+:+:+:+: ISO8859_1 => MS949 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("ISO8859_1"), "MS949")); } if ((mode & ISO8859_1) == ISO8859_1) { System.out.println("+:+:+:+: ISO8859_1 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes(), "ISO8859_1")); System.out.println("+:+:+:+: EUC-KR => ISO8859_1 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("EUC-KR"), "ISO8859_1")); 
System.out.println("+:+:+:+: UTF-8 => ISO8859_1 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("UTF-8"), "ISO8859_1")); System.out.println("+:+:+:+: KSC5601 => ISO8859_1 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("KSC5601"), "ISO8859_1")); System.out.println("+:+:+:+: MS949 => ISO8859_1 +:+:+:+:"); System.out.println(new String(consumer.getOutput().getBytes("MS949"), "ISO8859_1")); } } } else { // fail System.err.println("==============[FAILED]=============="); System.err.println(consumer.getOutput()); } }
From source file:com.metamx.druid.utils.ExposeS3DataSource.java
/**
 * Command-line entry point: for every interval of the requested granularity
 * inside the given time interval, finds the lexicographically last common
 * prefix under the dated S3 path and passes its {@code descriptor.json}
 * object(s) to {@code updateWithS3Object}, one per partition when the
 * prefix contains partition sub-prefixes.
 *
 * <p>AWS credentials are read from the system properties
 * {@code com.metamx.aws.accessKey} and {@code com.metamx.aws.secretKey}.
 *
 * @param args raw command-line arguments
 */
public static void main(String[] args) throws ServiceException, IOException, NoSuchAlgorithmException {
    CLI cli = new CLI();
    cli.addOption(new RequiredOption(null, "s3Bucket", true, "s3 bucket to pull data from"));
    cli.addOption(new RequiredOption(null, "s3Path", true,
            "base input path in s3 bucket.  Everything until the date strings."));
    cli.addOption(new RequiredOption(null, "timeInterval", true, "ISO8601 interval of dates to index"));
    cli.addOption(new RequiredOption(null, "granularity", true, String.format(
            "granularity of index, supported granularities: [%s]", Arrays.asList(Granularity.values()))));
    cli.addOption(new RequiredOption(null, "zkCluster", true, "Cluster string to connect to ZK with."));
    cli.addOption(new RequiredOption(null, "zkBasePath", true, "The base path to register index changes to."));

    CommandLine parsed = cli.parse(args);
    if (parsed == null) {
        // Nothing to do when the CLI wrapper yields no command line.
        return;
    }

    String s3Bucket = parsed.getOptionValue("s3Bucket");
    String s3Path = parsed.getOptionValue("s3Path");
    String timeIntervalString = parsed.getOptionValue("timeInterval");
    String granularity = parsed.getOptionValue("granularity");
    String zkCluster = parsed.getOptionValue("zkCluster");
    String zkBasePath = parsed.getOptionValue("zkBasePath");

    Interval timeInterval = new Interval(timeIntervalString);
    Granularity gran = Granularity.valueOf(granularity.toUpperCase());

    AWSCredentials credentials = new AWSCredentials(
            System.getProperty("com.metamx.aws.accessKey"),
            System.getProperty("com.metamx.aws.secretKey"));
    final RestS3Service s3Client = new RestS3Service(credentials);

    ZkClient zkClient = new ZkClient(new ZkConnection(zkCluster), Integer.MAX_VALUE, new StringZkSerializer());
    zkClient.waitUntilConnected();

    for (Interval interval : gran.getIterable(timeInterval)) {
        log.info("Processing interval[%s]", interval);

        String s3DatePath = JOINER.join(s3Path, gran.toPath(interval.getStart()));
        if (!s3DatePath.endsWith("/")) {
            s3DatePath = s3DatePath + "/";
        }

        StorageObjectsChunk listing = s3Client.listObjectsChunked(s3Bucket, s3DatePath, "/", 2000, null, true);
        TreeSet<String> commonPrefixes = Sets.newTreeSet();
        commonPrefixes.addAll(Arrays.asList(listing.getCommonPrefixes()));

        if (commonPrefixes.isEmpty()) {
            log.info("Nothing at s3://%s/%s", s3Bucket, s3DatePath);
            continue;
        }

        // The sorted set's last element is taken as the latest version.
        String latestPrefix = commonPrefixes.last();

        log.info("Latest segments at [s3://%s/%s]", s3Bucket, latestPrefix);

        listing = s3Client.listObjectsChunked(s3Bucket, latestPrefix, "/", 2000, null, true);

        // null means "no partition sub-prefixes"; otherwise the highest partition number.
        String[] partitionPrefixes = listing.getCommonPrefixes();
        Integer partitionNumber = null;
        if (partitionPrefixes.length != 0) {
            int highest = -1;
            for (String partitionPrefix : partitionPrefixes) {
                String[] pathParts = partitionPrefix.split("/");
                int candidate = Integer.parseInt(pathParts[pathParts.length - 1]);
                highest = Math.max(highest, candidate);
            }
            partitionNumber = highest;
        }

        log.info("Highest segment partition[%,d]", partitionNumber);

        if (partitionNumber != null) {
            // Walk partitions from the highest down to 0.
            int partition = partitionNumber;
            while (partition >= 0) {
                final S3Object partitionObject = new S3Object(new S3Bucket(s3Bucket),
                        String.format("%s%s/descriptor.json", latestPrefix, partition));

                updateWithS3Object(zkBasePath, s3Client, zkClient, partitionObject);
                --partition;
            }
        } else {
            final S3Object s3Obj = new S3Object(new S3Bucket(s3Bucket),
                    String.format("%sdescriptor.json", latestPrefix));
            updateWithS3Object(zkBasePath, s3Client, zkClient, s3Obj);
        }
    }
}
From source file:com.asakusafw.dmdl.java.Main.java
/** * Program entry.//w ww .ja v a2 s. c o m * @param args program arguments */ public static void main(String... args) { GenerateTask task; try { Configuration conf = configure(args); task = new GenerateTask(conf); } catch (Exception e) { HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(Integer.MAX_VALUE); formatter.printHelp(MessageFormat.format("java -classpath ... {0}", //$NON-NLS-1$ Main.class.getName()), OPTIONS, true); e.printStackTrace(System.out); System.exit(1); return; } try { task.process(); } catch (Exception e) { e.printStackTrace(System.out); System.exit(1); return; } }
From source file:io.bfscan.clueweb12.BuildWarcTrecIdMapping.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));/*from ww w.j av a 2 s . co m*/ options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? 
Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); FileInputStream fis = null; BufferedReader br = null; try { fis = new FileInputStream(new File(path)); byte[] ignoreBytes = new byte[2]; fis.read(ignoreBytes); // "B", "Z" bytes from commandline tools br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8")); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String s; while ((s = br.readLine()) != null) { Runnable worker = new AddDocumentRunnable(writer, s); executor.execute(worker); cnt++; if (cnt % 1000000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); br.close(); fis.close(); } }
From source file:cc.wikitools.lucene.IndexWikipediaDump.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));/*from www . ja v a2 s .co m*/ options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexWikipediaDump.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? 
Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build(); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); try { WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String page; while ((page = stream.readNext()) != null) { String title = cleaner.getTitle(page); // These are heuristic specifically for filtering out non-articles in enwiki-20120104. if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) { continue; } if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) { continue; } Runnable worker = new AddDocumentRunnable(writer, cleaner, page); executor.execute(worker); cnt++; if (cnt % 10000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); } }