Example usage for java.io PrintStream PrintStream

List of usage examples for java.io PrintStream PrintStream

Introduction

In this page you can find the example usage for java.io PrintStream PrintStream.

Prototype

public PrintStream(OutputStream out, boolean autoFlush, Charset charset) 

Source Link

Document

Creates a new print stream, with the specified OutputStream, automatic line flushing and charset.

Usage

From source file:cc.wikitools.lucene.SearchWikipedia.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(/*w ww . j a  v a2  s .  c  o  m*/
            OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("query text").create(QUERY_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return")
            .create(NUM_RESULTS_OPTION));

    options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));
    options.addOption(new Option(TITLE_OPTION, "search title"));
    options.addOption(new Option(ARTICLE_OPTION, "search article"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(QUERY_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(QUERY_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(SearchWikipedia.class.getName(), options);
        System.exit(-1);
    }

    File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION));
    if (!indexLocation.exists()) {
        System.err.println("Error: " + indexLocation + " does not exist!");
        System.exit(-1);
    }

    String queryText = cmdline.getOptionValue(QUERY_OPTION);
    int numResults = cmdline.hasOption(NUM_RESULTS_OPTION)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION))
            : DEFAULT_NUM_RESULTS;
    boolean verbose = cmdline.hasOption(VERBOSE_OPTION);
    boolean searchArticle = !cmdline.hasOption(TITLE_OPTION);

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    WikipediaSearcher searcher = new WikipediaSearcher(indexLocation);
    TopDocs rs = null;
    if (searchArticle) {
        rs = searcher.searchArticle(queryText, numResults);
    } else {
        rs = searcher.searchTitle(queryText, numResults);
    }

    int i = 1;
    for (ScoreDoc scoreDoc : rs.scoreDocs) {
        Document hit = searcher.doc(scoreDoc.doc);

        out.println(String.format("%d. %s (wiki id = %s, lucene id = %d) %f", i,
                hit.getField(IndexField.TITLE.name).stringValue(),
                hit.getField(IndexField.ID.name).stringValue(), scoreDoc.doc, scoreDoc.score));
        if (verbose) {
            out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " "));
        }
        i++;
    }

    searcher.close();
    out.close();
}

From source file:cc.twittertools.util.VerifySubcollection.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory")
            .create(COLLECTION_OPTION));
    options.addOption(//from  ww w  .  j a va  2 s . c o m
            OptionBuilder.withArgName("file").hasArg().withDescription("list of tweetids").create(ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(ID_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(ExtractSubcollection.class.getName(), options);
        System.exit(-1);
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);

    LongOpenHashSet tweetids = new LongOpenHashSet();
    File tweetidsFile = new File(cmdline.getOptionValue(ID_OPTION));
    if (!tweetidsFile.exists()) {
        System.err.println("Error: " + tweetidsFile + " does not exist!");
        System.exit(-1);
    }
    LOG.info("Reading tweetids from " + tweetidsFile);

    FileInputStream fin = new FileInputStream(tweetidsFile);
    BufferedReader br = new BufferedReader(new InputStreamReader(fin));

    String s;
    while ((s = br.readLine()) != null) {
        tweetids.add(Long.parseLong(s));
    }
    br.close();
    fin.close();
    LOG.info("Read " + tweetids.size() + " tweetids.");

    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    LongOpenHashSet seen = new LongOpenHashSet();
    TreeMap<Long, String> tweets = Maps.newTreeMap();

    PrintStream out = new PrintStream(System.out, true, "UTF-8");
    StatusStream stream = new JsonStatusCorpusReader(file);
    Status status;
    int cnt = 0;
    while ((status = stream.next()) != null) {
        if (!tweetids.contains(status.getId())) {
            LOG.error("tweetid " + status.getId() + " doesn't belong in collection");
            continue;
        }
        if (seen.contains(status.getId())) {
            LOG.error("tweetid " + status.getId() + " already seen!");
            continue;
        }

        tweets.put(status.getId(), status.getJsonObject().toString());
        seen.add(status.getId());
        cnt++;
    }
    LOG.info("total of " + cnt + " tweets in subcollection.");

    for (Map.Entry<Long, String> entry : tweets.entrySet()) {
        out.println(entry.getValue());
    }

    stream.close();
    out.close();
}

From source file:cc.twittertools.search.api.RunQueriesThrift.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("string").hasArg().withDescription("host").create(HOST_OPTION));
    options.addOption(OptionBuilder.withArgName("port").hasArg().withDescription("port").create(PORT_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg()
            .withDescription("file containing topics in TREC format").create(QUERIES_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return")
            .create(NUM_RESULTS_OPTION));
    options.addOption(//from   ww  w  . j a v a  2 s  .  c  o  m
            OptionBuilder.withArgName("string").hasArg().withDescription("group id").create(GROUP_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("access token").create(TOKEN_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION));
    options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION)
            || !cmdline.hasOption(QUERIES_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(RunQueriesThrift.class.getName(), options);
        System.exit(-1);
    }

    String queryFile = cmdline.getOptionValue(QUERIES_OPTION);
    if (!new File(queryFile).exists()) {
        System.err.println("Error: " + queryFile + " doesn't exist!");
        System.exit(-1);
    }

    String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG;

    TrecTopicSet topicsFile = TrecTopicSet.fromFile(new File(queryFile));

    int numResults = 1000;
    try {
        if (cmdline.hasOption(NUM_RESULTS_OPTION)) {
            numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION));
        }
    } catch (NumberFormatException e) {
        System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION));
        System.exit(-1);
    }

    String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null;
    String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null;

    boolean verbose = cmdline.hasOption(VERBOSE_OPTION);

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION),
            Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token);

    for (cc.twittertools.search.TrecTopic query : topicsFile) {
        List<TResult> results = client.search(query.getQuery(), query.getQueryTweetTime(), numResults);
        int i = 1;
        Set<Long> tweetIds = new HashSet<Long>();
        for (TResult result : results) {
            if (!tweetIds.contains(result.id)) {
                tweetIds.add(result.id);
                out.println(
                        String.format("%s Q0 %d %d %f %s", query.getId(), result.id, i, result.rsv, runtag));
                if (verbose) {
                    out.println("# " + result.toString().replaceAll("[\\n\\r]+", " "));
                }
                i++;
            }
        }
    }
    out.close();
}

From source file:cc.twittertools.search.api.RunQueriesBaselineThrift.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("string").hasArg().withDescription("host").create(HOST_OPTION));
    options.addOption(OptionBuilder.withArgName("port").hasArg().withDescription("port").create(PORT_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg()
            .withDescription("file containing topics in TREC format").create(QUERIES_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return")
            .create(NUM_RESULTS_OPTION));
    options.addOption(//from w  w  w  .j a v a2 s .c om
            OptionBuilder.withArgName("string").hasArg().withDescription("group id").create(GROUP_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("access token").create(TOKEN_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION));
    options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION)
            || !cmdline.hasOption(QUERIES_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(RunQueriesThrift.class.getName(), options);
        System.exit(-1);
    }

    String queryFile = cmdline.getOptionValue(QUERIES_OPTION);
    if (!new File(queryFile).exists()) {
        System.err.println("Error: " + queryFile + " doesn't exist!");
        System.exit(-1);
    }

    String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG;

    TrecTopicSet topicsFile = TrecTopicSet.fromFile(new File(queryFile));

    int numResults = 1000;
    try {
        if (cmdline.hasOption(NUM_RESULTS_OPTION)) {
            numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION));
        }
    } catch (NumberFormatException e) {
        System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION));
        System.exit(-1);
    }

    String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null;
    String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null;

    boolean verbose = cmdline.hasOption(VERBOSE_OPTION);

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION),
            Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token);

    for (cc.twittertools.search.TrecTopic query : topicsFile) {
        List<TResult> results = client.search(query.getQuery(), query.getQueryTweetTime(), numResults);

        SortedSet<TResultComparable> sortedResults = new TreeSet<TResultComparable>();
        for (TResult result : results) {
            // Throw away retweets.
            if (result.getRetweeted_status_id() == 0) {
                sortedResults.add(new TResultComparable(result));
            }
        }

        int i = 1;
        int dupliCount = 0;
        double rsvPrev = 0;
        for (TResultComparable sortedResult : sortedResults) {
            TResult result = sortedResult.getTResult();
            double rsvCurr = result.rsv;
            if (Math.abs(rsvCurr - rsvPrev) > 0.0000001) {
                dupliCount = 0;
            } else {
                dupliCount++;
                rsvCurr = rsvCurr - 0.000001 / numResults * dupliCount;
            }
            out.println(String.format("%s Q0 %d %d %." + (int) (6 + Math.ceil(Math.log10(numResults))) + "f %s",
                    query.getId(), result.id, i, rsvCurr, runtag));
            if (verbose) {
                out.println("# " + result.toString().replaceAll("[\\n\\r]+", " "));
            }
            i++;
            rsvPrev = result.rsv;
        }

    }
    out.close();
}

From source file:io.bfscan.clueweb12.BuildWarcTrecIdMapping.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file")
            .create(INPUT_OPTION));//from  w  ww.  j  av  a  2  s  .c o  m
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("maximum number of documents to index").create(MAX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads")
            .create(THREADS_OPTION));

    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX_OPTION);
    int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION))
            : Integer.MAX_VALUE;
    int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION))
            : DEFAULT_NUM_THREADS;

    long startTime = System.currentTimeMillis();

    String path = cmdline.getOptionValue(INPUT_OPTION);
    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    LOG.info("Creating index at " + indexPath);
    LOG.info("Indexing with " + threads + " threads");

    FileInputStream fis = null;
    BufferedReader br = null;

    try {
        fis = new FileInputStream(new File(path));
        byte[] ignoreBytes = new byte[2];
        fis.read(ignoreBytes); // "B", "Z" bytes from commandline tools
        br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8"));

        ExecutorService executor = Executors.newFixedThreadPool(threads);
        int cnt = 0;
        String s;
        while ((s = br.readLine()) != null) {
            Runnable worker = new AddDocumentRunnable(writer, s);
            executor.execute(worker);

            cnt++;
            if (cnt % 1000000 == 0) {
                LOG.info(cnt + " articles added");
            }
            if (cnt >= maxdocs) {
                break;
            }
        }

        executor.shutdown();
        // Wait until all threads are finish
        while (!executor.isTerminated()) {
        }

        LOG.info("Total of " + cnt + " articles indexed.");

        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        out.close();
        br.close();
        fis.close();
    }
}

From source file:cc.wikitools.lucene.IndexWikipediaDump.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file")
            .create(INPUT_OPTION));//  ww  w.  ja va 2  s  . co  m
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("maximum number of documents to index").create(MAX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads")
            .create(THREADS_OPTION));

    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexWikipediaDump.class.getCanonicalName(), options);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX_OPTION);
    int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION))
            : Integer.MAX_VALUE;
    int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION))
            : DEFAULT_NUM_THREADS;

    long startTime = System.currentTimeMillis();

    String path = cmdline.getOptionValue(INPUT_OPTION);
    PrintStream out = new PrintStream(System.out, true, "UTF-8");
    WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build();

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    LOG.info("Creating index at " + indexPath);
    LOG.info("Indexing with " + threads + " threads");

    try {
        WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path);

        ExecutorService executor = Executors.newFixedThreadPool(threads);
        int cnt = 0;
        String page;
        while ((page = stream.readNext()) != null) {
            String title = cleaner.getTitle(page);

            // These are heuristic specifically for filtering out non-articles in enwiki-20120104.
            if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) {
                continue;
            }

            if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) {
                continue;
            }

            Runnable worker = new AddDocumentRunnable(writer, cleaner, page);
            executor.execute(worker);

            cnt++;
            if (cnt % 10000 == 0) {
                LOG.info(cnt + " articles added");
            }
            if (cnt >= maxdocs) {
                break;
            }
        }

        executor.shutdown();
        // Wait until all threads are finish
        while (!executor.isTerminated()) {
        }

        LOG.info("Total of " + cnt + " articles indexed.");

        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        out.close();
    }
}

From source file:io.anserini.search.SearchTweets.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(RM3_OPTION, "apply relevance feedback with rm3"));

    options.addOption(//w w w. ja  va2s  .c o  m
            OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return")
            .create(NUM_RESULTS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg()
            .withDescription("file containing topics in TREC format").create(QUERIES_OPTION));
    options.addOption(OptionBuilder.withArgName("similarity").hasArg()
            .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(SearchTweets.class.getName(), options);
        System.exit(-1);
    }

    File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION));
    if (!indexLocation.exists()) {
        System.err.println("Error: " + indexLocation + " does not exist!");
        System.exit(-1);
    }

    String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG;

    String topicsFile = cmdline.getOptionValue(QUERIES_OPTION);

    int numResults = 1000;
    try {
        if (cmdline.hasOption(NUM_RESULTS_OPTION)) {
            numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION));
        }
    } catch (NumberFormatException e) {
        System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION));
        System.exit(-1);
    }

    String similarity = "LM";
    if (cmdline.hasOption(SIMILARITY_OPTION)) {
        similarity = cmdline.getOptionValue(SIMILARITY_OPTION);
    }

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexLocation.getAbsolutePath())));
    IndexSearcher searcher = new IndexSearcher(reader);

    if (similarity.equalsIgnoreCase("BM25")) {
        searcher.setSimilarity(new BM25Similarity());
    } else if (similarity.equalsIgnoreCase("LM")) {
        searcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
    }

    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(topicsFile));
    for (MicroblogTopic topic : topics) {
        Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(),
                true, true);
        Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER,
                topic.getQuery());

        TopDocs rs = searcher.search(query, filter, numResults);

        RerankerContext context = new RerankerContext(searcher, query, topic.getQuery(), filter);
        RerankerCascade cascade = new RerankerCascade(context);

        if (cmdline.hasOption(RM3_OPTION)) {
            cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name));
            cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
        } else {
            cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
        }

        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher));

        for (int i = 0; i < docs.documents.length; i++) {
            String qid = topic.getId().replaceFirst("^MB0*", "");
            out.println(String.format("%s Q0 %s %d %f %s", qid,
                    docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i],
                    runtag));
        }
    }
    reader.close();
    out.close();
}

From source file:cc.twittertools.search.local.RunQueries.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(//from  w  ww . j  a va2 s. com
            OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return")
            .create(NUM_RESULTS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg()
            .withDescription("file containing topics in TREC format").create(QUERIES_OPTION));
    options.addOption(OptionBuilder.withArgName("similarity").hasArg()
            .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION));
    options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(RunQueries.class.getName(), options);
        System.exit(-1);
    }

    File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION));
    if (!indexLocation.exists()) {
        System.err.println("Error: " + indexLocation + " does not exist!");
        System.exit(-1);
    }

    String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG;

    String topicsFile = cmdline.getOptionValue(QUERIES_OPTION);

    int numResults = 1000;
    try {
        if (cmdline.hasOption(NUM_RESULTS_OPTION)) {
            numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION));
        }
    } catch (NumberFormatException e) {
        System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION));
        System.exit(-1);
    }

    String similarity = "LM";
    if (cmdline.hasOption(SIMILARITY_OPTION)) {
        similarity = cmdline.getOptionValue(SIMILARITY_OPTION);
    }

    boolean verbose = cmdline.hasOption(VERBOSE_OPTION);

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation));
    IndexSearcher searcher = new IndexSearcher(reader);

    if (similarity.equalsIgnoreCase("BM25")) {
        searcher.setSimilarity(new BM25Similarity());
    } else if (similarity.equalsIgnoreCase("LM")) {
        searcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
    }

    QueryParser p = new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, IndexStatuses.ANALYZER);

    TrecTopicSet topics = TrecTopicSet.fromFile(new File(topicsFile));
    for (TrecTopic topic : topics) {
        Query query = p.parse(topic.getQuery());
        Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(),
                true, true);

        TopDocs rs = searcher.search(query, filter, numResults);

        int i = 1;
        for (ScoreDoc scoreDoc : rs.scoreDocs) {
            Document hit = searcher.doc(scoreDoc.doc);
            out.println(String.format("%s Q0 %s %d %f %s", topic.getId(),
                    hit.getField(StatusField.ID.name).numericValue(), i, scoreDoc.score, runtag));
            if (verbose) {
                out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " "));
            }
            i++;
        }
    }
    reader.close();
    out.close();
}

From source file:cc.twittertools.search.api.SearchStatusesThrift.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(OptionBuilder.withArgName("string").hasArg().withDescription("host").create(HOST_OPTION));
    options.addOption(OptionBuilder.withArgName("port").hasArg().withDescription("port").create(PORT_OPTION));
    options.addOption(/*from  www .  j  ava 2  s . co  m*/
            OptionBuilder.withArgName("string").hasArg().withDescription("query id").create(QID_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("query text").create(QUERY_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("maxid").create(MAX_ID_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return")
            .create(NUM_RESULTS_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("group id").create(GROUP_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("access token").create(TOKEN_OPTION));
    options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(SearchStatusesThrift.class.getName(), options);
        System.exit(-1);
    }

    String qid = cmdline.hasOption(QID_OPTION) ? cmdline.getOptionValue(QID_OPTION) : DEFAULT_QID;
    String query = cmdline.hasOption(QUERY_OPTION) ? cmdline.getOptionValue(QUERY_OPTION) : DEFAULT_Q;
    String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG;
    long maxId = cmdline.hasOption(MAX_ID_OPTION) ? Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION))
            : DEFAULT_MAX_ID;
    int numResults = cmdline.hasOption(NUM_RESULTS_OPTION)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION))
            : DEFAULT_NUM_RESULTS;
    boolean verbose = cmdline.hasOption(VERBOSE_OPTION);

    String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null;
    String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null;
    TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION),
            Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token);

    System.err.println("qid: " + qid);
    System.err.println("q: " + query);
    System.err.println("max_id: " + maxId);
    System.err.println("num_results: " + numResults);

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    List<TResult> results = client.search(query, maxId, numResults);
    int i = 1;
    for (TResult result : results) {
        out.println(String.format("%s Q0 %d %d %f %s", qid, result.id, i, result.rsv, runtag));
        if (verbose) {
            System.out.println("# " + result.toString().replaceAll("[\\n\\r]+", " "));
        }
        i++;
    }
    out.close();
}

From source file:is.merkor.core.cli.Main.java

public static void main(String[] args) throws Exception {

    List<String> results = new ArrayList<String>();
    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    CommandLineParser parser = new GnuParser();
    try {/*from   www.j av a 2s  .  c o m*/

        MerkorCoreCommandLineOptions.createOptions();
        results = processCommandLine(parser.parse(MerkorCoreCommandLineOptions.options, args));
        out.print("\n");
        for (String str : results) {
            if (!str.equals("no message"))
                out.println(str);
        }
        out.print("\n");
        if (results.isEmpty()) {
            out.println("nothing found for parameters: ");
            for (int i = 0; i < args.length; i++)
                out.println("\t" + args[i]);
            out.println("for help type: -help or see README.markdown");
            out.print("\n");
        }
    } catch (ParseException e) {
        System.err.println("Parsing failed.  Reason: " + e.getMessage());
    }
}