Example usage for java.nio.file Paths get

Introduction

On this page you can find example usages of java.nio.file.Paths.get.

Prototype

public static Path get(URI uri)

Source Link

Document

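Converts the given URI to a Path object. Note that the examples below call the String-based overload, Paths.get(String first, String... more), rather than the URI overload shown in the prototype.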

Usage

From source file:cu.uci.gws.sdlcrawler.PdfCrawlController.java

/**
 * Configures and runs the PDF crawl from the properties returned by
 * {@code PdfCrawlerConfigManager.loadConfigFile()}.
 *
 * <p>Prints the start timestamp, the effective crawl configuration, the end timestamp and the
 * total elapsed time in milliseconds to standard output. Seed URLs are read as UTF-8, one per
 * line, from the file named by the {@code sdlcrawler.SeedFile} property, resolved under the
 * {@code config/} directory.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the seed file cannot be read, a numeric property cannot be parsed, or
 *         any call made during setup or crawling throws
 */
public static void main(String[] args) throws Exception {
    Properties cm = PdfCrawlerConfigManager.getInstance().loadConfigFile();
    long startTime = System.currentTimeMillis();
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    System.out.println(dateFormat.format(new Date()));
    int numberOfCrawlers = Integer.parseInt(cm.getProperty("sdlcrawler.NumberOfCrawlers"));
    String pdfFolder = cm.getProperty("sdlcrawler.CrawlPdfFolder");

    CrawlConfig config = new CrawlConfig();

    config.setCrawlStorageFolder(cm.getProperty("sdlcrawler.CrawlStorageFolder"));
    config.setProxyHost(cm.getProperty("sdlcrawler.ProxyHost"));
    // The proxy port is optional. getProperty returns null when the key is absent, so both the
    // missing and the blank case must be skipped before parsing.
    String proxyPort = cm.getProperty("sdlcrawler.ProxyPort");
    if (proxyPort != null && !proxyPort.trim().isEmpty()) {
        config.setProxyPort(Integer.parseInt(proxyPort.trim()));
    }
    config.setProxyUsername(cm.getProperty("sdlcrawler.ProxyUser"));
    config.setProxyPassword(cm.getProperty("sdlcrawler.ProxyPass"));
    config.setMaxDownloadSize(Integer.parseInt(cm.getProperty("sdlcrawler.MaxDownloadSize")));
    config.setIncludeBinaryContentInCrawling(
            Boolean.parseBoolean(cm.getProperty("sdlcrawler.IncludeBinaryContent")));
    config.setFollowRedirects(Boolean.parseBoolean(cm.getProperty("sdlcrawler.Redirects")));
    config.setUserAgentString(cm.getProperty("sdlcrawler.UserAgent"));
    config.setMaxDepthOfCrawling(Integer.parseInt(cm.getProperty("sdlcrawler.MaxDepthCrawl")));
    config.setMaxConnectionsPerHost(Integer.parseInt(cm.getProperty("sdlcrawler.MaxConnectionsPerHost")));
    config.setSocketTimeout(Integer.parseInt(cm.getProperty("sdlcrawler.SocketTimeout")));
    config.setMaxOutgoingLinksToFollow(Integer.parseInt(cm.getProperty("sdlcrawler.MaxOutgoingLinks")));
    config.setResumableCrawling(Boolean.parseBoolean(cm.getProperty("sdlcrawler.ResumableCrawling")));
    config.setIncludeHttpsPages(Boolean.parseBoolean(cm.getProperty("sdlcrawler.IncludeHttpsPages")));
    config.setMaxTotalConnections(Integer.parseInt(cm.getProperty("sdlcrawler.MaxTotalConnections")));
    config.setMaxPagesToFetch(Integer.parseInt(cm.getProperty("sdlcrawler.MaxPagesToFetch")));
    config.setPolitenessDelay(Integer.parseInt(cm.getProperty("sdlcrawler.PolitenessDelay")));
    config.setConnectionTimeout(Integer.parseInt(cm.getProperty("sdlcrawler.ConnectionTimeout")));

    System.out.println(config.toString());

    // Headers sent with every request issued by the crawler.
    Collection<BasicHeader> defaultHeaders = new HashSet<>();
    defaultHeaders
            .add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
    defaultHeaders.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"));
    defaultHeaders.add(new BasicHeader("Accept-Language", "en-US,en,es-ES,es;q=0.8"));
    defaultHeaders.add(new BasicHeader("Connection", "keep-alive"));
    config.setDefaultHeaders(defaultHeaders);

    // Every line of the seed file becomes both a crawl seed and an entry handed to PdfCrawler.
    List<String> list = Files.readAllLines(Paths.get("config/" + cm.getProperty("sdlcrawler.SeedFile")),
            StandardCharsets.UTF_8);
    String[] crawlDomains = list.toArray(new String[list.size()]);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    for (String domain : crawlDomains) {
        controller.addSeed(domain);
    }

    PdfCrawler.configure(crawlDomains, pdfFolder);
    controller.start(PdfCrawler.class, numberOfCrawlers);

    // Report the end timestamp with the same formatter used for the start timestamp.
    System.out.println(dateFormat.format(new Date()));
    long endTime = System.currentTimeMillis();
    long totalTime = endTime - startTime;
    System.out.println("Total time:" + totalTime);
}

From source file:edu.jhu.hlt.concrete.ingesters.simple.CompleteFileIngester.java

/**
 * Converts a UTF-8, character-based file into a {@code .concrete} file placed in the given
 * output folder. The output file is named after the input file with {@code .concrete} appended.
 *
 * <p>Prints the usage text and exits with status 1 unless exactly three arguments are given.
 * Every other failure (missing input, unusable output folder, existing output file, ingest or
 * write error) is logged and also ends the process with status 1.
 *
 * @param args input file path, type of Communication to generate, and output folder path
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("This program converts a character-based file to a .concrete file.");
        System.err.println("The text file must contain UTF-8 encoded characters.");
        System.err.println(
                "The .concrete file will share the same name as the input file, including the extension.");
        System.err.println("This program takes 3 arguments.");
        System.err.println("Argument 1: path/to/a/character/based/file");
        System.err.println("Argument 2: type of Communication to generate [e.g., tweet]");
        System.err.println("Argument 3: path/to/output/folder");
        System.err.println("Example usage: " + CompleteFileIngester.class.getName()
                + " /my/text/file story /my/output/folder");
        System.exit(1);
    }

    String inPathStr = args[0];
    String commType = args[1];
    String outPathStr = args[2];
    Path inPath = Paths.get(inPathStr);
    try {
        ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(inPath);

        Path ep = ef.getPath();
        String fn = ef.getName();
        Path outPath = Paths.get(outPathStr);
        Path outFile = outPath.resolve(fn + ".concrete");

        if (!Files.exists(outPath)) {
            // Output directory is missing: try to create it.
            try {
                Files.createDirectories(outPath);
            } catch (IOException e) {
                logger.error("Caught exception when making output directories.", e);
                // Without the output directory the write below cannot succeed.
                System.exit(1);
            }
        } else if (!Files.isDirectory(outPath)) {
            // The output path exists but cannot hold the output file.
            logger.error("Output path exists but is not a directory.");
            System.exit(1);
        } else if (Files.exists(outFile)) {
            // Never overwrite an existing output file.
            logger.warn("Output file {} exists; not overwriting.", outFile.toString());
            System.exit(1);
        }

        try {
            UTF8FileIngester ing = new CompleteFileIngester(commType);
            Communication comm = ing.fromCharacterBasedFile(ep);
            new WritableCommunication(comm).writeToFile(outFile, false);
        } catch (IngestException e) {
            logger.error("Caught exception during ingest.", e);
            System.exit(1);
        } catch (ConcreteException e) {
            logger.error("Caught exception writing output.", e);
            // A failed write is a failed run; report it through the exit status as well.
            System.exit(1);
        }

    } catch (NoSuchFileException e) {
        logger.error("Path {} does not exist.", inPathStr);
        System.exit(1);
    } catch (NotFileException e) {
        logger.error("Path {} is a directory.", inPathStr);
        System.exit(1);
    }
}

From source file:cli.Main.java

/**
 * Command-line entry point that dispatches on the parsed {@code command} argument:
 * {@code register}, {@code verify}, {@code send} or {@code receive}.
 *
 * <p>Existing user state is loaded before the command runs and saved afterwards. The process
 * exits with status 0 on success, 1 for argument or precondition failures, 2 when the state
 * file cannot be loaded, and 3 when register, verify or receive fails with an
 * {@code IOException}.
 *
 * @param args raw command-line arguments, interpreted by {@code parseArgs}
 */
public static void main(String[] args) {
    // Workaround for BKS truststore
    Security.insertProviderAt(new org.spongycastle.jce.provider.BouncyCastleProvider(), 1);

    Namespace ns = parseArgs(args);
    if (ns == null) {
        System.exit(1);
    }

    final String username = ns.getString("username");
    final Manager m = new Manager(username);
    if (m.userExists()) {
        try {
            m.load();
        } catch (Exception e) {
            System.err.println("Error loading state file \"" + m.getFileName() + "\": " + e.getMessage());
            System.exit(2);
        }
    }

    switch (ns.getString("command")) {
    case "register":
        if (!m.userHasKeys()) {
            m.createNewIdentity();
        }
        try {
            m.register(ns.getBoolean("voice"));
        } catch (IOException e) {
            System.err.println("Request verify error: " + e.getMessage());
            System.exit(3);
        }
        break;
    case "verify":
        if (!m.userHasKeys()) {
            System.err.println("User has no keys, first call register.");
            System.exit(1);
        }
        if (m.isRegistered()) {
            System.err.println("User registration is already verified");
            System.exit(1);
        }
        try {
            m.verifyAccount(ns.getString("verificationCode"));
        } catch (IOException e) {
            System.err.println("Verify error: " + e.getMessage());
            System.exit(3);
        }
        break;
    case "send":
        if (!m.isRegistered()) {
            System.err.println("User is not registered.");
            System.exit(1);
        }
        // Fall back to standard input when no message argument was given.
        String messageText = ns.getString("message");
        if (messageText == null) {
            try {
                messageText = IOUtils.toString(System.in);
            } catch (IOException e) {
                System.err.println("Failed to read message from stdin: " + e.getMessage());
                System.exit(1);
            }
        }

        final List<String> attachments = ns.getList("attachment");
        List<TextSecureAttachment> textSecureAttachments = null;
        if (attachments != null) {
            textSecureAttachments = new ArrayList<>(attachments.size());
            for (String attachment : attachments) {
                try {
                    File attachmentFile = new File(attachment);
                    InputStream attachmentStream = new FileInputStream(attachmentFile);
                    final long attachmentSize = attachmentFile.length();
                    String mime = Files.probeContentType(Paths.get(attachment));
                    textSecureAttachments
                            .add(new TextSecureAttachmentStream(attachmentStream, mime, attachmentSize, null));
                } catch (IOException e) {
                    System.err.println("Failed to add attachment \"" + attachment + "\": " + e.getMessage());
                    System.err.println("Aborting sending.");
                    System.exit(1);
                }
            }
        }

        final List<String> recipientNumbers = ns.<String>getList("recipient");
        List<TextSecureAddress> recipients = new ArrayList<>(recipientNumbers.size());
        for (String recipient : recipientNumbers) {
            try {
                recipients.add(m.getPushAddress(recipient));
            } catch (InvalidNumberException e) {
                System.err.println("Failed to add recipient \"" + recipient + "\": " + e.getMessage());
                System.err.println("Aborting sending.");
                System.exit(1);
            }
        }
        sendMessage(m, messageText, textSecureAttachments, recipients);
        break;
    case "receive":
        if (!m.isRegistered()) {
            System.err.println("User is not registered.");
            System.exit(1);
        }
        try {
            m.receiveMessages(5, true, new ReceiveMessageHandler(m));
        } catch (IOException e) {
            System.err.println("Error while receiving message: " + e.getMessage());
            System.exit(3);
        } catch (AssertionError e) {
            System.err.println("Failed to receive message (Assertion): " + e.getMessage());
            // Print the actual frames; println(e.getStackTrace()) would only print the array's
            // identity string.
            e.printStackTrace();
            System.err.println(
                    "If you use an Oracle JRE please check if you have unlimited strength crypto enabled, see README");
            System.exit(1);
        }
        break;
    }
    m.save();
    System.exit(0);
}

From source file:GIST.IzbirkomExtractor.IzbirkomExtractor.java

/**
 * @param args/* ww  w. ja v a 2 s  .c om*/
 */
public static void main(String[] args) {

    // process command-line options
    Options options = new Options();
    options.addOption("n", "noaddr", false, "do not do any address matching (for testing)");
    options.addOption("i", "info", false, "create and populate address information table");
    options.addOption("h", "help", false, "this message");

    // database connection
    options.addOption("s", "server", true, "database server to connect to");
    options.addOption("d", "database", true, "OSM database name");
    options.addOption("u", "user", true, "OSM database user name");
    options.addOption("p", "pass", true, "OSM database password");

    // logging options
    options.addOption("l", "logdir", true, "log file directory (default './logs')");
    options.addOption("e", "loglevel", true, "log level (default 'FINEST')");

    // automatically generate the help statement
    HelpFormatter help_formatter = new HelpFormatter();

    // database URI for connection
    String dburi = null;

    // Information message for help screen
    String info_msg = "IzbirkomExtractor [options] <html_directory>";

    try {
        CommandLineParser parser = new GnuParser();
        CommandLine cmd = parser.parse(options, args);

        if (cmd.hasOption('h') || cmd.getArgs().length != 1) {
            help_formatter.printHelp(info_msg, options);
            System.exit(1);
        }

        /* prohibit n and i together */
        if (cmd.hasOption('n') && cmd.hasOption('i')) {
            System.err.println("Options 'n' and 'i' cannot be used together.");
            System.exit(1);
        }

        /* require database arguments without -n */
        if (cmd.hasOption('n')
                && (cmd.hasOption('s') || cmd.hasOption('d') || cmd.hasOption('u') || cmd.hasOption('p'))) {
            System.err.println("Options 'n' and does not need any databse parameters.");
            System.exit(1);
        }

        /* require all 4 database options to be used together */
        if (!cmd.hasOption('n')
                && !(cmd.hasOption('s') && cmd.hasOption('d') && cmd.hasOption('u') && cmd.hasOption('p'))) {
            System.err.println(
                    "For database access all of the following arguments have to be specified: server, database, user, pass");
            System.exit(1);
        }

        /* useful variables */
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'kk:mm");
        String dateString = formatter.format(new Date());

        /* setup logging */
        File logdir = new File(cmd.hasOption('l') ? cmd.getOptionValue('l') : "logs");
        FileUtils.forceMkdir(logdir);
        File log_file_name = new File(
                logdir + "/" + IzbirkomExtractor.class.getName() + "-" + formatter.format(new Date()) + ".log");
        FileHandler log_file = new FileHandler(log_file_name.getPath());

        /* create "latest" link to currently created log file */
        Path latest_log_link = Paths.get(logdir + "/latest");
        Files.deleteIfExists(latest_log_link);
        Files.createSymbolicLink(latest_log_link, Paths.get(log_file_name.getName()));

        log_file.setFormatter(new SimpleFormatter());
        LogManager.getLogManager().reset(); // prevents logging to console
        logger.addHandler(log_file);
        logger.setLevel(cmd.hasOption('e') ? Level.parse(cmd.getOptionValue('e')) : Level.FINEST);

        // open directory with HTML files and create file list
        File dir = new File(cmd.getArgs()[0]);
        if (!dir.isDirectory()) {
            System.err.println("Unable to find directory '" + cmd.getArgs()[0] + "', exiting");
            System.exit(1);
        }
        PathMatcher pmatcher = FileSystems.getDefault()
                .getPathMatcher("glob:?  * ?*.html");
        ArrayList<File> html_files = new ArrayList<>();
        for (Path file : Files.newDirectoryStream(dir.toPath()))
            if (pmatcher.matches(file.getFileName()))
                html_files.add(file.toFile());
        if (html_files.size() == 0) {
            System.err.println("No matching HTML files found in '" + dir.getAbsolutePath() + "', exiting");
            System.exit(1);
        }

        // create csvResultSink
        FileOutputStream csvout_file = new FileOutputStream("parsed_addresses-" + dateString + ".csv");
        OutputStreamWriter csvout = new OutputStreamWriter(csvout_file, "UTF-8");
        ResultSink csvResultSink = new CSVResultSink(csvout, new CSVStrategy('|', '"', '#'));

        // Connect to DB and osmAddressMatcher
        AddressMatcher osmAddressMatcher;
        DBSink dbSink = null;
        DBInfoSink dbInfoSink = null;
        if (cmd.hasOption('n')) {
            osmAddressMatcher = new DummyAddressMatcher();
        } else {
            dburi = "jdbc:postgresql://" + cmd.getOptionValue('s') + "/" + cmd.getOptionValue('d');
            Connection con = DriverManager.getConnection(dburi, cmd.getOptionValue('u'),
                    cmd.getOptionValue('p'));
            osmAddressMatcher = new OsmAddressMatcher(con);
            dbSink = new DBSink(con);
            if (cmd.hasOption('i'))
                dbInfoSink = new DBInfoSink(con);
        }

        /* create resultsinks */
        SinkMultiplexor sm = SinkMultiplexor.newSinkMultiplexor();
        sm.addResultSink(csvResultSink);
        if (dbSink != null) {
            sm.addResultSink(dbSink);
            if (dbInfoSink != null)
                sm.addResultSink(dbInfoSink);
        }

        // create tableExtractor
        TableExtractor te = new TableExtractor(osmAddressMatcher, sm);

        // TODO: printout summary of options: processing date/time, host, directory of HTML files, jdbc uri, command line with parameters

        // iterate through files
        logger.info("Start processing " + html_files.size() + " files in " + dir);
        for (int i = 0; i < html_files.size(); i++) {
            System.err.println("Parsing #" + i + ": " + html_files.get(i));
            te.processHTMLfile(html_files.get(i));
        }

        System.err.println("Processed " + html_files.size() + " HTML files");
        logger.info("Finished processing " + html_files.size() + " files in " + dir);

    } catch (ParseException e1) {
        System.err.println("Failed to parse CLI: " + e1.getMessage());
        help_formatter.printHelp(info_msg, options);
        System.exit(1);
    } catch (IOException e) {
        System.err.println("I/O Exception: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    } catch (SQLException e) {
        System.err.println("Database '" + dburi + "': " + e.getMessage());
        System.exit(1);
    } catch (ResultSinkException e) {
        System.err.println("Failed to initialize ResultSink: " + e.getMessage());
        System.exit(1);
    } catch (TableExtractorException e) {
        System.err.println("Failed to initialize Table Extractor: " + e.getMessage());
        System.exit(1);
    } catch (CloneNotSupportedException | IllegalAccessException | InstantiationException e) {
        System.err.println("Something really odd happened: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}

From source file:eu.crydee.stanfordcorenlp.Tokenizer.java

/**
 * Wrapper around Stanford CoreNLP to tokenize text.
 *
 * Give it an input dir of text files with --input-dir and it'll ouput
 * tokenized versions, one sentence per line with space separated words to
 * --output-dir (defaults to out/)./*from   w ww.ja va  2s .c om*/
 *
 * @param args CLI args. Example: --input-dir my-input --output-dir
 * my-output.
 */
public static void main(String[] args) {
    ArgumentParser parser = ArgumentParsers.newArgumentParser("stanford-corenlp-tokenizer-wrapper")
            .description("Converts Mediawiki dumps to text.");
    parser.addArgument("-i", "--input-dir").required(true).help("Path of the input text files directory.");
    parser.addArgument("-o", "--output-dir").help("Path of the output text files directory.").setDefault("out");
    Params params = new Params();
    try {
        parser.parseArgs(args, params);
    } catch (ArgumentParserException ex) {
        System.err.println("Could not parse arguments: " + ex.getMessage());
        System.exit(1);
    }
    Tokenizer tokenizer = new Tokenizer();

    try {
        Files.list(Paths.get(params.inDirPath)).filter(Files::isRegularFile).map(Path::toFile).map(f -> {
            try {
                return Pair.of(f.getName(), FileUtils.readFileToString(f, StandardCharsets.UTF_8));
            } catch (IOException ex) {
                System.err.println("Could not read input text file: " + ex.getLocalizedMessage());
                throw new UncheckedIOException(ex);
            }
        }).forEach(p -> {
            String text = tokenizer.tokenizeAndSentenceSplit(p.getRight());
            try {
                FileUtils.writeStringToFile(Paths.get(params.outDirpath, p.getLeft()).toFile(), text,
                        StandardCharsets.UTF_8);
            } catch (IOException ex) {
                System.err.println("Could not write output text file: " + ex.getLocalizedMessage());
            }
        });
    } catch (IOException ex) {
        System.err.println("Could not read from input directory: " + ex.getLocalizedMessage());
    }
}

From source file:tpt.dbweb.cat.evaluation.ComparisonResult.java

/**
 * Manual smoke test: serializes a {@code Fraction} and a {@code ValueEvaluationStatistics} with
 * the shared mapper, prints them, parses the statistics back, and finally writes a sample
 * result to {@code /tmp/test.json} and reads it again.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) throws JsonGenerationException, JsonMappingException, IOException {
    // Sample result holding one entry: document "doc", metric "part".
    ComparisonResult sample = new ComparisonResult();
    ValueEvaluationStatistics perfect = new ValueEvaluationStatistics(Fraction.ONE, Fraction.ONE);
    sample.docidToMetricToResult.computeIfAbsent("doc", docId -> new TreeMap<>()).put("part", perfect);

    // Serialization of a bare fraction.
    String fractionJson = mapper.writeValueAsString(new Fraction(1.0, 2.0));
    System.out.println(fractionJson);

    // Serialization of the statistics, followed by parsing the JSON back.
    ValueEvaluationStatistics half = new ValueEvaluationStatistics(Fraction.ONE, new Fraction(1.0, 2.0));
    String ves = mapper.writeValueAsString(half);
    System.out.println(ves);
    System.out.println(mapper.readValue(ves, ValueEvaluationStatistics.class));

    // File round trip.
    result_write_and_read: {
        sample.write(Paths.get("/tmp/test.json"));
        read(Paths.get("/tmp/test.json"));
    }
}

From source file:io.anserini.search.SearchTweets.java

/**
 * Runs every topic of a topics file against a tweet index and prints one line per retrieved
 * document to standard output, formatted as {@code <qid> Q0 <id> <rank> <score> <runtag>}.
 *
 * <p>The queries and index options are mandatory; without them the help text is printed and
 * the process exits with status -1. The number of results defaults to 1000, the similarity to
 * {@code "LM"} and the run tag to {@code DEFAULT_RUNTAG}.
 *
 * @param args command-line options
 * @throws Exception if opening the index, reading the topics or searching fails
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(RM3_OPTION, "apply relevance feedback with rm3"));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return")
            .create(NUM_RESULTS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg()
            .withDescription("file containing topics in TREC format").create(QUERIES_OPTION));
    options.addOption(OptionBuilder.withArgName("similarity").hasArg()
            .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION));

    CommandLine cli = null;
    CommandLineParser cliParser = new GnuParser();
    try {
        cli = cliParser.parse(options, args);
    } catch (ParseException parseFailure) {
        System.err.println("Error parsing command line: " + parseFailure.getMessage());
        System.exit(-1);
    }

    // Both the topics file and the index location are required.
    boolean hasRequiredOptions = cli.hasOption(QUERIES_OPTION) && cli.hasOption(INDEX_OPTION);
    if (!hasRequiredOptions) {
        new HelpFormatter().printHelp(SearchTweets.class.getName(), options);
        System.exit(-1);
    }

    File indexLocation = new File(cli.getOptionValue(INDEX_OPTION));
    if (!indexLocation.exists()) {
        System.err.println("Error: " + indexLocation + " does not exist!");
        System.exit(-1);
    }

    String runtag = DEFAULT_RUNTAG;
    if (cli.hasOption(RUNTAG_OPTION)) {
        runtag = cli.getOptionValue(RUNTAG_OPTION);
    }

    String topicsFile = cli.getOptionValue(QUERIES_OPTION);

    int numResults = 1000;
    if (cli.hasOption(NUM_RESULTS_OPTION)) {
        try {
            numResults = Integer.parseInt(cli.getOptionValue(NUM_RESULTS_OPTION));
        } catch (NumberFormatException badNumber) {
            System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cli.getOptionValue(NUM_RESULTS_OPTION));
            System.exit(-1);
        }
    }

    String similarity = cli.hasOption(SIMILARITY_OPTION) ? cli.getOptionValue(SIMILARITY_OPTION) : "LM";

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexLocation.getAbsolutePath())));
    IndexSearcher searcher = new IndexSearcher(reader);

    // Any other similarity name leaves the searcher's similarity untouched.
    if (similarity.equalsIgnoreCase("BM25")) {
        searcher.setSimilarity(new BM25Similarity());
    } else if (similarity.equalsIgnoreCase("LM")) {
        searcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
    }

    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(topicsFile));
    for (MicroblogTopic topic : topics) {
        // Restrict hits to documents whose ID field lies in [0, topic.getQueryTweetTime()].
        Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(),
                true, true);
        Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER,
                topic.getQuery());

        TopDocs rs = searcher.search(query, filter, numResults);

        RerankerContext context = new RerankerContext(searcher, query, topic.getQuery(), filter);
        RerankerCascade cascade = new RerankerCascade(context);

        // The RM3 reranker is optional; the retweet-removing reranker always runs last.
        if (cli.hasOption(RM3_OPTION)) {
            cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name));
        }
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());

        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher));

        for (int rank = 1; rank <= docs.documents.length; rank++) {
            int idx = rank - 1;
            String qid = topic.getId().replaceFirst("^MB0*", "");
            String line = String.format("%s Q0 %s %d %f %s", qid,
                    docs.documents[idx].getField(StatusField.ID.name).numericValue(), rank, docs.scores[idx],
                    runtag);
            out.println(line);
        }
    }
    reader.close();
    out.close();
}

From source file:fr.inria.atlanmod.instantiator.SpecimenGenerator.java

/**
 * Command-line entry point: builds a {@code DefaultModelGenerator} for the given metamodel,
 * applies the optional settings and runs the generation.
 *
 * <p>Defaults when an option is absent: samples path {@code "."}, one model, model size 1000,
 * and a seed taken from {@code System.currentTimeMillis()}. If the command line cannot be
 * parsed, the error message and the help text are printed instead.
 *
 * @param args command-line options
 * @throws GenerationException declared for failures raised by the generator calls
 * @throws IOException declared for I/O failures, e.g. while loading an additional metamodel
 */
public static void main(String[] args) throws GenerationException, IOException {

    Options options = new Options();
    configureOptions(options);

    CommandLineParser cliParser = new GnuParser();

    try {
        CommandLine cli = cliParser.parse(options, args);

        DefaultModelGenerator generator = new DefaultModelGenerator(
                URI.createFileURI(cli.getOptionValue(METAMODEL)));

        // Load every additional metamodel and register its packages.
        if (cli.hasOption(ADDITIONAL_METAMODEL)) {
            for (String extraMetamodel : cli.getOptionValues(ADDITIONAL_METAMODEL)) {
                Resource extraResource = new XMIResourceImpl(URI.createFileURI(extraMetamodel));
                extraResource.load(Collections.emptyMap());
                registerPackages(extraResource);
            }
        }

        String samplesDir = cli.hasOption(OUTPUT_DIR) ? cli.getOptionValue(OUTPUT_DIR) : ".";
        generator.setSamplesPath(Paths.get(samplesDir));

        int modelCount = cli.hasOption(N_MODELS)
                ? ((Number) cli.getParsedOptionValue(N_MODELS)).intValue()
                : 1;
        generator.setSetSize(new int[] { modelCount });

        long modelSize = cli.hasOption(SIZE)
                ? ((Number) cli.getParsedOptionValue(SIZE)).longValue()
                : 1000;
        generator.setModelsSize(new long[] { modelSize });

        long seed = cli.hasOption(SEED)
                ? ((Number) cli.getParsedOptionValue(SEED)).longValue()
                : System.currentTimeMillis();
        generator.setSeed(seed);

        generator.runGeneration();
    } catch (ParseException parseFailure) {
        System.err.println(parseFailure.getLocalizedMessage());
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.setOptionComparator(new OptionComarator<Option>());
        try {
            helpFormatter.setWidth(Math.max(TerminalFactory.get().getWidth(), 80));
        } catch (Throwable ignored) {
            // Nothing to do: the formatter keeps its current width.
        }
        helpFormatter.printHelp("java -jar <this-file.jar>", options, true);
    }
}

From source file:io.anserini.index.IndexGov2.java

/**
 * Command-line entry point that indexes a document collection into an index directory using
 * several indexer threads, then commits, optionally merges all segments, and logs timings.
 *
 * <p>The input, index and threads options are mandatory; without them the help text is printed
 * and the process exits with status -1. The update flag switches the writer from
 * {@code CREATE} to {@code APPEND} mode.
 *
 * @param args command-line options
 * @throws Exception if parsing a numeric option, opening the index or indexing fails; also a
 *         {@code RuntimeException} when the indexed document count does not match the
 *         requested limit or an indexer thread reported a failure
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input data path").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output index path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexer threads")
            .create(THREADS_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("max number of documents to index (-1 to index everything)")
            .create(DOCLIMIT_OPTION));

    options.addOption(POSITIONS_OPTION, false, "index positions");
    options.addOption(OPTIMIZE_OPTION, false, "merge all index segments");
    // The update flag is read below, so it has to be registered here; otherwise the parser
    // rejects it and doUpdate can never become true.
    options.addOption(UPDATE_OPTION, false, "append to an existing index instead of creating a new one");

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(THREADS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(100);
        formatter.printHelp(IndexGov2.class.getCanonicalName(), options);
        System.exit(-1);
    }

    final String dirPath = cmdline.getOptionValue(INDEX_OPTION);
    final String dataDir = cmdline.getOptionValue(INPUT_OPTION);
    final int docCountLimit = cmdline.hasOption(DOCLIMIT_OPTION)
            ? Integer.parseInt(cmdline.getOptionValue(DOCLIMIT_OPTION))
            : -1;
    final int numThreads = Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION));

    final boolean doUpdate = cmdline.hasOption(UPDATE_OPTION);
    final boolean positions = cmdline.hasOption(POSITIONS_OPTION);
    final boolean optimize = cmdline.hasOption(OPTIMIZE_OPTION);

    final Analyzer a = new EnglishAnalyzer();
    final TrecContentSource trecSource = createGov2Source(dataDir);
    final Directory dir = FSDirectory.open(Paths.get(dirPath));

    LOG.info("Index path: " + dirPath);
    LOG.info("Doc limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    LOG.info("Threads: " + numThreads);
    LOG.info("Positions: " + positions);
    LOG.info("Optimize (merge segments): " + optimize);

    final IndexWriterConfig config = new IndexWriterConfig(a);

    if (doUpdate) {
        config.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    } else {
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }

    final IndexWriter writer = new IndexWriter(dir, config);
    Gov2IndexThreads threads = new Gov2IndexThreads(writer, positions, trecSource, numThreads, docCountLimit);
    LOG.info("Indexer: start");

    final long t0 = System.currentTimeMillis();

    threads.start();

    // Poll until the indexer threads report completion.
    while (!threads.done()) {
        Thread.sleep(100);
    }
    threads.stop();

    final long t1 = System.currentTimeMillis();
    LOG.info("Indexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + writer.maxDoc() + " docs");
    // The count check only applies to a freshly created index with an explicit limit.
    if (!doUpdate && docCountLimit != -1 && writer.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + writer.maxDoc() + " but expected " + docCountLimit);
    }
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }

    final long t2 = System.currentTimeMillis();

    final Map<String, String> commitData = new HashMap<String, String>();
    commitData.put("userData", "multi");
    writer.setCommitData(commitData);
    writer.commit();
    final long t3 = System.currentTimeMillis();
    LOG.info("Indexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)");

    if (optimize) {
        LOG.info("Indexer: merging all segments");
        writer.forceMerge(1);
        final long t4 = System.currentTimeMillis();
        LOG.info("Indexer: segments merged (took " + (t4 - t3) / 1000.0 + " sec)");
    }

    LOG.info("Indexer: at close: " + writer.segString());
    final long tCloseStart = System.currentTimeMillis();
    writer.close();
    LOG.info("Indexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec");
    dir.close();
    final long tFinal = System.currentTimeMillis();
    LOG.info("Indexer: finished (" + (tFinal - t0) / 1000.0 + " sec)");
    LOG.info("Indexer: net bytes indexed " + threads.getBytesIndexed());
    // Throughput: bytes converted to GB, divided by the elapsed time in hours.
    LOG.info("Indexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.))
            + " GB/hour plain text");
}

From source file:apps.LuceneIndexer.java

/**
 * Command-line entry point: reads documents from an input source and writes them
 * into a Lucene index configured with a BM25 similarity.
 *
 * <p>Recognized options:
 * <ul>
 *   <li>{@code -i} input file (required)</li>
 *   <li>{@code -o} output (index) directory (required; created if it does not exist)</li>
 *   <li>{@code -r} optional output TREC-format QREL file</li>
 *   <li>{@code -source_type} document source type (required)</li>
 *   <li>{@code -bm25_k1}, {@code -bm25_b} BM25 parameters (defaults come from {@code UtilConst})</li>
 *   <li>{@code -bm25fixed} use {@code BM25SimilarityFix} instead of Lucene's {@code BM25Similarity}</li>
 * </ul>
 *
 * <p>The index is opened with {@code OpenMode.CREATE}. On an unexpected exception
 * the process exits with status 1.
 *
 * @param args command-line arguments
 */
public static void main(String[] args) {
    Options options = new Options();

    options.addOption("i", null, true, "input file");
    options.addOption("o", null, true, "output directory");
    options.addOption("r", null, true, "optional output TREC-format QREL file");

    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "document source type: " + commaJoin.join(SourceFactory.getDocSourceList()));

    // If you increase this value, you may need to modify the following line in *.sh file
    // export MAVEN_OPTS="-Xms8192m -server"
    double ramBufferSizeMB = 1024 * 8; // 8 GB

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    // Declared outside the try so that the finally block can release them.
    IndexWriter indexWriter = null;
    FSDirectory indexDir = null;
    BufferedWriter qrelWriter = null;

    int docNum = 0;

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFileName = null, outputDirName = null, qrelFileName = null;

        // NOTE(review): the code below presumes Usage(...) terminates the process; confirm.
        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        if (cmd.hasOption("o")) {
            outputDirName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'index directory'", options);
        }

        if (cmd.hasOption("r")) {
            qrelFileName = cmd.getOptionValue("r");
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null) {
            Usage("Specify document source type", options);
        }

        if (qrelFileName != null) {
            qrelWriter = new BufferedWriter(new FileWriter(qrelFileName));
        }

        // Make sure the output location exists, is a directory, and is writable.
        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        indexDir = FSDirectory.open(Paths.get(outputDirName));
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer);

        /*
            OpenMode.CREATE creates a new index or overwrites an existing one.
            https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE
        */
        indexConf.setOpenMode(OpenMode.CREATE);
        indexConf.setRAMBufferSizeMB(ramBufferSizeMB);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b));
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));
        }

        indexWriter = new IndexWriter(indexDir, indexConf);

        DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName);
        DocumentEntry inpDoc = null;
        TextCleaner textCleaner = new TextCleaner(null);

        while ((inpDoc = inpDocSource.next()) != null) {
            ++docNum;

            Document luceneDoc = new Document();
            ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText);
            String cleanText = spaceJoin.join(cleanedToks);

            // The ID is stored as a single un-tokenized term; the cleaned text is analyzed.
            luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES));
            luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES));
            indexWriter.addDocument(luceneDoc);

            // Emit a QREL line only for entries that carry a relevance flag.
            if (inpDoc.mIsRel != null && qrelWriter != null) {
                saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0);
            }
            if (docNum % 1000 == 0) {
                System.out.println(String.format("Indexed %d documents", docNum));
            }
        }

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments" + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        // System.exit halts the JVM here, so the finally block below does not run on this path.
        System.exit(1);
    } finally {
        System.out.println(String.format("Indexed %d documents", docNum));

        // Each resource is closed in its own try so that a failure to close one
        // does not prevent the others from being closed.
        try {
            if (null != indexWriter) {
                indexWriter.close();
            }
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
        try {
            if (null != qrelWriter) {
                qrelWriter.close();
            }
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
        // Closing the IndexWriter does not close the underlying directory.
        try {
            if (null != indexDir) {
                indexDir.close();
            }
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
    }
}