List of usage examples for java.io PrintStream PrintStream
public PrintStream(OutputStream out, boolean autoFlush, Charset charset)
From source file:cc.wikitools.lucene.SearchWikipedia.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(/*w ww . j a v a2 s . c o m*/ OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("query text").create(QUERY_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); options.addOption(new Option(TITLE_OPTION, "search title")); options.addOption(new Option(ARTICLE_OPTION, "search article")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(QUERY_OPTION) || !cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(QUERY_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(SearchWikipedia.class.getName(), options); System.exit(-1); } File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); if (!indexLocation.exists()) { System.err.println("Error: " + indexLocation + " does not exist!"); System.exit(-1); } String queryText = cmdline.getOptionValue(QUERY_OPTION); int numResults = cmdline.hasOption(NUM_RESULTS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)) : DEFAULT_NUM_RESULTS; boolean verbose = cmdline.hasOption(VERBOSE_OPTION); boolean searchArticle = !cmdline.hasOption(TITLE_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); WikipediaSearcher searcher = new WikipediaSearcher(indexLocation); TopDocs rs = null; if (searchArticle) { rs = searcher.searchArticle(queryText, numResults); } else { rs = searcher.searchTitle(queryText, numResults); } int i = 1; for (ScoreDoc scoreDoc : rs.scoreDocs) { Document hit = searcher.doc(scoreDoc.doc); out.println(String.format("%d. %s (wiki id = %s, lucene id = %d) %f", i, hit.getField(IndexField.TITLE.name).stringValue(), hit.getField(IndexField.ID.name).stringValue(), scoreDoc.doc, scoreDoc.score)); if (verbose) { out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " ")); } i++; } searcher.close(); out.close(); }
From source file:cc.twittertools.util.VerifySubcollection.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(//from ww w . j a va 2 s . c o m OptionBuilder.withArgName("file").hasArg().withDescription("list of tweetids").create(ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(ID_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(ExtractSubcollection.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); LongOpenHashSet tweetids = new LongOpenHashSet(); File tweetidsFile = new File(cmdline.getOptionValue(ID_OPTION)); if (!tweetidsFile.exists()) { System.err.println("Error: " + tweetidsFile + " does not exist!"); System.exit(-1); } LOG.info("Reading tweetids from " + tweetidsFile); FileInputStream fin = new FileInputStream(tweetidsFile); BufferedReader br = new BufferedReader(new InputStreamReader(fin)); String s; while ((s = br.readLine()) != null) { tweetids.add(Long.parseLong(s)); } br.close(); fin.close(); LOG.info("Read " + tweetids.size() + " tweetids."); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } LongOpenHashSet seen = new LongOpenHashSet(); TreeMap<Long, String> tweets = Maps.newTreeMap(); PrintStream out = new PrintStream(System.out, true, "UTF-8"); StatusStream stream = new JsonStatusCorpusReader(file); Status status; int cnt = 0; while ((status = stream.next()) != null) { if (!tweetids.contains(status.getId())) { LOG.error("tweetid " + status.getId() + " doesn't belong in collection"); continue; } if (seen.contains(status.getId())) { LOG.error("tweetid " + status.getId() + " already seen!"); continue; } tweets.put(status.getId(), status.getJsonObject().toString()); seen.add(status.getId()); cnt++; } LOG.info("total of " + cnt + " tweets in subcollection."); for (Map.Entry<Long, String> entry : tweets.entrySet()) { out.println(entry.getValue()); } stream.close(); out.close(); }
From source file:cc.twittertools.search.api.RunQueriesThrift.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("string").hasArg().withDescription("host").create(HOST_OPTION)); options.addOption(OptionBuilder.withArgName("port").hasArg().withDescription("port").create(PORT_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(//from ww w . j a v a 2 s . c o m OptionBuilder.withArgName("string").hasArg().withDescription("group id").create(GROUP_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("access token").create(TOKEN_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION) || !cmdline.hasOption(QUERIES_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(RunQueriesThrift.class.getName(), options); System.exit(-1); } String queryFile = cmdline.getOptionValue(QUERIES_OPTION); if (!new File(queryFile).exists()) { System.err.println("Error: " + queryFile + " doesn't exist!"); System.exit(-1); } String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; TrecTopicSet topicsFile = TrecTopicSet.fromFile(new File(queryFile)); int numResults = 1000; try { if (cmdline.hasOption(NUM_RESULTS_OPTION)) { numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); } } catch (NumberFormatException e) { System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); System.exit(-1); } String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null; String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null; boolean verbose = cmdline.hasOption(VERBOSE_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION), Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token); for (cc.twittertools.search.TrecTopic query : topicsFile) { List<TResult> results = client.search(query.getQuery(), query.getQueryTweetTime(), numResults); int i = 1; Set<Long> tweetIds = new HashSet<Long>(); for (TResult result : results) { if (!tweetIds.contains(result.id)) { tweetIds.add(result.id); out.println( String.format("%s Q0 %d %d %f %s", query.getId(), result.id, i, result.rsv, runtag)); if (verbose) { out.println("# " + result.toString().replaceAll("[\\n\\r]+", " ")); } i++; } } } out.close(); }
From source file:cc.twittertools.search.api.RunQueriesBaselineThrift.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("string").hasArg().withDescription("host").create(HOST_OPTION)); options.addOption(OptionBuilder.withArgName("port").hasArg().withDescription("port").create(PORT_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(//from w w w .j a v a2 s .c om OptionBuilder.withArgName("string").hasArg().withDescription("group id").create(GROUP_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("access token").create(TOKEN_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION) || !cmdline.hasOption(QUERIES_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(RunQueriesThrift.class.getName(), options); System.exit(-1); } String queryFile = cmdline.getOptionValue(QUERIES_OPTION); if (!new File(queryFile).exists()) { System.err.println("Error: " + queryFile + " doesn't exist!"); System.exit(-1); } String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; TrecTopicSet topicsFile = TrecTopicSet.fromFile(new File(queryFile)); int numResults = 1000; try { if (cmdline.hasOption(NUM_RESULTS_OPTION)) { numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); } } catch (NumberFormatException e) { System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); System.exit(-1); } String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null; String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null; boolean verbose = cmdline.hasOption(VERBOSE_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION), Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token); for (cc.twittertools.search.TrecTopic query : topicsFile) { List<TResult> results = client.search(query.getQuery(), query.getQueryTweetTime(), numResults); SortedSet<TResultComparable> sortedResults = new TreeSet<TResultComparable>(); for (TResult result : results) { // Throw away retweets. if (result.getRetweeted_status_id() == 0) { sortedResults.add(new TResultComparable(result)); } } int i = 1; int dupliCount = 0; double rsvPrev = 0; for (TResultComparable sortedResult : sortedResults) { TResult result = sortedResult.getTResult(); double rsvCurr = result.rsv; if (Math.abs(rsvCurr - rsvPrev) > 0.0000001) { dupliCount = 0; } else { dupliCount++; rsvCurr = rsvCurr - 0.000001 / numResults * dupliCount; } out.println(String.format("%s Q0 %d %d %." + (int) (6 + Math.ceil(Math.log10(numResults))) + "f %s", query.getId(), result.id, i, rsvCurr, runtag)); if (verbose) { out.println("# " + result.toString().replaceAll("[\\n\\r]+", " ")); } i++; rsvPrev = result.rsv; } } out.close(); }
From source file:io.bfscan.clueweb12.BuildWarcTrecIdMapping.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));//from w ww. j av a 2 s .c o m options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); FileInputStream fis = null; BufferedReader br = null; try { fis = new FileInputStream(new File(path)); byte[] ignoreBytes = new byte[2]; fis.read(ignoreBytes); // "B", "Z" bytes from commandline tools br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8")); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String s; while ((s = br.readLine()) != null) { Runnable worker = new AddDocumentRunnable(writer, s); executor.execute(worker); cnt++; if (cnt % 1000000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); br.close(); fis.close(); } }
From source file:cc.wikitools.lucene.IndexWikipediaDump.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));// ww w. ja va 2 s . co m options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexWikipediaDump.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build(); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); try { WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String page; while ((page = stream.readNext()) != null) { String title = cleaner.getTitle(page); // These are heuristic specifically for filtering out non-articles in enwiki-20120104. if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) { continue; } if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) { continue; } Runnable worker = new AddDocumentRunnable(writer, cleaner, page); executor.execute(worker); cnt++; if (cnt % 10000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); } }
From source file:io.anserini.search.SearchTweets.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(RM3_OPTION, "apply relevance feedback with rm3")); options.addOption(//w w w. ja va2s .c o m OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); options.addOption(OptionBuilder.withArgName("similarity").hasArg() .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(SearchTweets.class.getName(), options); System.exit(-1); } File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); if (!indexLocation.exists()) { System.err.println("Error: " + indexLocation + " does not exist!"); System.exit(-1); } String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; String topicsFile = cmdline.getOptionValue(QUERIES_OPTION); int numResults = 1000; try { if (cmdline.hasOption(NUM_RESULTS_OPTION)) { numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); } } catch (NumberFormatException e) { System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); System.exit(-1); } String similarity = "LM"; if (cmdline.hasOption(SIMILARITY_OPTION)) { similarity = cmdline.getOptionValue(SIMILARITY_OPTION); } PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexLocation.getAbsolutePath()))); IndexSearcher searcher = new IndexSearcher(reader); if (similarity.equalsIgnoreCase("BM25")) { searcher.setSimilarity(new BM25Similarity()); } else if (similarity.equalsIgnoreCase("LM")) { searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); } MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(topicsFile)); for (MicroblogTopic topic : topics) { Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(), true, true); Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery()); TopDocs rs = searcher.search(query, filter, numResults); RerankerContext context = new RerankerContext(searcher, query, topic.getQuery(), filter); RerankerCascade cascade = new RerankerCascade(context); if (cmdline.hasOption(RM3_OPTION)) { cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name)); cascade.add(new RemoveRetweetsTemporalTiebreakReranker()); } else { cascade.add(new RemoveRetweetsTemporalTiebreakReranker()); } ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher)); for (int i = 0; i < docs.documents.length; i++) { String qid = topic.getId().replaceFirst("^MB0*", ""); out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i], runtag)); } } reader.close(); out.close(); }
From source file:cc.twittertools.search.local.RunQueries.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(//from w ww . j a va2 s. com OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); options.addOption(OptionBuilder.withArgName("similarity").hasArg() .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(RunQueries.class.getName(), options); System.exit(-1); } File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); if (!indexLocation.exists()) { System.err.println("Error: " + indexLocation + " does not exist!"); System.exit(-1); } String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; String topicsFile = cmdline.getOptionValue(QUERIES_OPTION); int numResults = 1000; try { if (cmdline.hasOption(NUM_RESULTS_OPTION)) { numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); } } catch (NumberFormatException e) { System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); System.exit(-1); } String similarity = "LM"; if (cmdline.hasOption(SIMILARITY_OPTION)) { similarity = cmdline.getOptionValue(SIMILARITY_OPTION); } boolean verbose = cmdline.hasOption(VERBOSE_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation)); IndexSearcher searcher = new IndexSearcher(reader); if (similarity.equalsIgnoreCase("BM25")) { searcher.setSimilarity(new BM25Similarity()); } else if (similarity.equalsIgnoreCase("LM")) { searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); } QueryParser p = new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, IndexStatuses.ANALYZER); TrecTopicSet topics = TrecTopicSet.fromFile(new File(topicsFile)); for (TrecTopic topic : topics) { Query query = p.parse(topic.getQuery()); Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(), true, true); TopDocs rs = searcher.search(query, filter, numResults); int i = 1; for (ScoreDoc scoreDoc : rs.scoreDocs) { Document hit = searcher.doc(scoreDoc.doc); out.println(String.format("%s Q0 %s %d %f %s", topic.getId(), hit.getField(StatusField.ID.name).numericValue(), i, scoreDoc.score, runtag)); if (verbose) { out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " ")); } i++; } } reader.close(); out.close(); }
From source file:cc.twittertools.search.api.SearchStatusesThrift.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(OptionBuilder.withArgName("string").hasArg().withDescription("host").create(HOST_OPTION)); options.addOption(OptionBuilder.withArgName("port").hasArg().withDescription("port").create(PORT_OPTION)); options.addOption(/*from www . j ava 2 s . co m*/ OptionBuilder.withArgName("string").hasArg().withDescription("query id").create(QID_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("query text").create(QUERY_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("maxid").create(MAX_ID_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("group id").create(GROUP_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("access token").create(TOKEN_OPTION)); options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(SearchStatusesThrift.class.getName(), options); System.exit(-1); } String qid = cmdline.hasOption(QID_OPTION) ? cmdline.getOptionValue(QID_OPTION) : DEFAULT_QID; String query = cmdline.hasOption(QUERY_OPTION) ? cmdline.getOptionValue(QUERY_OPTION) : DEFAULT_Q; String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; long maxId = cmdline.hasOption(MAX_ID_OPTION) ? Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)) : DEFAULT_MAX_ID; int numResults = cmdline.hasOption(NUM_RESULTS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)) : DEFAULT_NUM_RESULTS; boolean verbose = cmdline.hasOption(VERBOSE_OPTION); String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null; String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null; TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION), Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token); System.err.println("qid: " + qid); System.err.println("q: " + query); System.err.println("max_id: " + maxId); System.err.println("num_results: " + numResults); PrintStream out = new PrintStream(System.out, true, "UTF-8"); List<TResult> results = client.search(query, maxId, numResults); int i = 1; for (TResult result : results) { out.println(String.format("%s Q0 %d %d %f %s", qid, result.id, i, result.rsv, runtag)); if (verbose) { System.out.println("# " + result.toString().replaceAll("[\\n\\r]+", " ")); } i++; } out.close(); }
From source file:is.merkor.core.cli.Main.java
public static void main(String[] args) throws Exception { List<String> results = new ArrayList<String>(); PrintStream out = new PrintStream(System.out, true, "UTF-8"); CommandLineParser parser = new GnuParser(); try {/*from www.j av a 2s . c o m*/ MerkorCoreCommandLineOptions.createOptions(); results = processCommandLine(parser.parse(MerkorCoreCommandLineOptions.options, args)); out.print("\n"); for (String str : results) { if (!str.equals("no message")) out.println(str); } out.print("\n"); if (results.isEmpty()) { out.println("nothing found for parameters: "); for (int i = 0; i < args.length; i++) out.println("\t" + args[i]); out.println("for help type: -help or see README.markdown"); out.print("\n"); } } catch (ParseException e) { System.err.println("Parsing failed. Reason: " + e.getMessage()); } }