List of usage examples for java.nio.file Paths get
public static Path get(URI uri)
From source file:cu.uci.gws.sdlcrawler.PdfCrawlController.java
public static void main(String[] args) throws Exception { Properties cm = PdfCrawlerConfigManager.getInstance().loadConfigFile(); long startTime = System.currentTimeMillis(); DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); Date date = new Date(); System.out.println(dateFormat.format(date)); int numberOfCrawlers = Integer.parseInt(cm.getProperty("sdlcrawler.NumberOfCrawlers")); String pdfFolder = cm.getProperty("sdlcrawler.CrawlPdfFolder"); CrawlConfig config = new CrawlConfig(); config.setCrawlStorageFolder(cm.getProperty("sdlcrawler.CrawlStorageFolder")); config.setProxyHost(cm.getProperty("sdlcrawler.ProxyHost")); if (!"".equals(cm.getProperty("sdlcrawler.ProxyPort"))) { config.setProxyPort(Integer.parseInt(cm.getProperty("sdlcrawler.ProxyPort"))); }//from www. j av a2 s. co m config.setProxyUsername(cm.getProperty("sdlcrawler.ProxyUser")); config.setProxyPassword(cm.getProperty("sdlcrawler.ProxyPass")); config.setMaxDownloadSize(Integer.parseInt(cm.getProperty("sdlcrawler.MaxDownloadSize"))); config.setIncludeBinaryContentInCrawling( Boolean.parseBoolean(cm.getProperty("sdlcrawler.IncludeBinaryContent"))); config.setFollowRedirects(Boolean.parseBoolean(cm.getProperty("sdlcrawler.Redirects"))); config.setUserAgentString(cm.getProperty("sdlcrawler.UserAgent")); config.setMaxDepthOfCrawling(Integer.parseInt(cm.getProperty("sdlcrawler.MaxDepthCrawl"))); config.setMaxConnectionsPerHost(Integer.parseInt(cm.getProperty("sdlcrawler.MaxConnectionsPerHost"))); config.setSocketTimeout(Integer.parseInt(cm.getProperty("sdlcrawler.SocketTimeout"))); config.setMaxOutgoingLinksToFollow(Integer.parseInt(cm.getProperty("sdlcrawler.MaxOutgoingLinks"))); config.setResumableCrawling(Boolean.parseBoolean(cm.getProperty("sdlcrawler.ResumableCrawling"))); config.setIncludeHttpsPages(Boolean.parseBoolean(cm.getProperty("sdlcrawler.IncludeHttpsPages"))); config.setMaxTotalConnections(Integer.parseInt(cm.getProperty("sdlcrawler.MaxTotalConnections"))); config.setMaxPagesToFetch(Integer.parseInt(cm.getProperty("sdlcrawler.MaxPagesToFetch"))); config.setPolitenessDelay(Integer.parseInt(cm.getProperty("sdlcrawler.PolitenessDelay"))); config.setConnectionTimeout(Integer.parseInt(cm.getProperty("sdlcrawler.ConnectionTimeout"))); System.out.println(config.toString()); Collection<BasicHeader> defaultHeaders = new HashSet<>(); defaultHeaders .add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")); defaultHeaders.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3")); defaultHeaders.add(new BasicHeader("Accept-Language", "en-US,en,es-ES,es;q=0.8")); defaultHeaders.add(new BasicHeader("Connection", "keep-alive")); config.setDefaultHeaders(defaultHeaders); List<String> list = Files.readAllLines(Paths.get("config/" + cm.getProperty("sdlcrawler.SeedFile")), StandardCharsets.UTF_8); String[] crawlDomains = list.toArray(new String[list.size()]); PageFetcher pageFetcher = new PageFetcher(config); RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); for (String domain : crawlDomains) { controller.addSeed(domain); } PdfCrawler.configure(crawlDomains, pdfFolder); controller.start(PdfCrawler.class, numberOfCrawlers); DateFormat dateFormat1 = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); Date date1 = new Date(); System.out.println(dateFormat1.format(date1)); long endTime = System.currentTimeMillis(); long totalTime = endTime - startTime; System.out.println("Total time:" + totalTime); }
From source file:edu.jhu.hlt.concrete.ingesters.simple.CompleteFileIngester.java
/** * See usage string.//from ww w .ja va 2 s . c o m * * @param args */ public static void main(String[] args) { if (args.length != 3) { System.err.println("This program converts a character-based file to a .concrete file."); System.err.println("The text file must contain UTF-8 encoded characters."); System.err.println( "The .concrete file will share the same name as the input file, including the extension."); System.err.println("This program takes 3 arguments."); System.err.println("Argument 1: path/to/a/character/based/file"); System.err.println("Argument 2: type of Communication to generate [e.g., tweet]"); System.err.println("Argument 3: path/to/output/folder"); System.err.println("Example usage: " + CompleteFileIngester.class.getName() + " /my/text/file story /my/output/folder"); System.exit(1); } String inPathStr = args[0]; Path inPath = Paths.get(inPathStr); try { ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(inPath); Optional<String> commType = Optional.ofNullable(args[1]); Optional<String> outPathStr = Optional.ofNullable(args[2]); Path ep = ef.getPath(); String fn = ef.getName(); Path outPath = Paths.get(outPathStr.get()); Path outFile = outPath.resolve(fn + ".concrete"); // Output directory exists, or it doesn't. // Try to create if it does not. if (!Files.exists(outPath)) { try { Files.createDirectories(outPath); } catch (IOException e) { logger.error("Caught exception when making output directories.", e); } // if it does, check to make sure it's a directory. } else { if (!Files.isDirectory(outPath)) { logger.error("Output path exists but is not a directory."); System.exit(1); } else { // check to make sure the output file won't be overwritten. if (Files.exists(outFile)) { logger.warn("Output file {} exists; not overwriting.", outFile.toString()); System.exit(1); } } } try { UTF8FileIngester ing = new CompleteFileIngester(commType.get()); Communication comm = ing.fromCharacterBasedFile(ep); new WritableCommunication(comm).writeToFile(outFile, false); } catch (IngestException e) { logger.error("Caught exception during ingest.", e); System.exit(1); } catch (ConcreteException e) { logger.error("Caught exception writing output.", e); } } catch (NoSuchFileException e) { logger.error("Path {} does not exist.", inPathStr); System.exit(1); } catch (NotFileException e) { logger.error("Path {} is a directory.", inPathStr); System.exit(1); } }
From source file:cli.Main.java
public static void main(String[] args) { // Workaround for BKS truststore Security.insertProviderAt(new org.spongycastle.jce.provider.BouncyCastleProvider(), 1); Namespace ns = parseArgs(args); if (ns == null) { System.exit(1);//from w ww . j av a 2 s . c o m } final String username = ns.getString("username"); final Manager m = new Manager(username); if (m.userExists()) { try { m.load(); } catch (Exception e) { System.err.println("Error loading state file \"" + m.getFileName() + "\": " + e.getMessage()); System.exit(2); } } switch (ns.getString("command")) { case "register": if (!m.userHasKeys()) { m.createNewIdentity(); } try { m.register(ns.getBoolean("voice")); } catch (IOException e) { System.err.println("Request verify error: " + e.getMessage()); System.exit(3); } break; case "verify": if (!m.userHasKeys()) { System.err.println("User has no keys, first call register."); System.exit(1); } if (m.isRegistered()) { System.err.println("User registration is already verified"); System.exit(1); } try { m.verifyAccount(ns.getString("verificationCode")); } catch (IOException e) { System.err.println("Verify error: " + e.getMessage()); System.exit(3); } break; case "send": if (!m.isRegistered()) { System.err.println("User is not registered."); System.exit(1); } String messageText = ns.getString("message"); if (messageText == null) { try { messageText = IOUtils.toString(System.in); } catch (IOException e) { System.err.println("Failed to read message from stdin: " + e.getMessage()); System.exit(1); } } final List<String> attachments = ns.getList("attachment"); List<TextSecureAttachment> textSecureAttachments = null; if (attachments != null) { textSecureAttachments = new ArrayList<>(attachments.size()); for (String attachment : attachments) { try { File attachmentFile = new File(attachment); InputStream attachmentStream = new FileInputStream(attachmentFile); final long attachmentSize = attachmentFile.length(); String mime = Files.probeContentType(Paths.get(attachment)); textSecureAttachments .add(new TextSecureAttachmentStream(attachmentStream, mime, attachmentSize, null)); } catch (IOException e) { System.err.println("Failed to add attachment \"" + attachment + "\": " + e.getMessage()); System.err.println("Aborting sending."); System.exit(1); } } } List<TextSecureAddress> recipients = new ArrayList<>(ns.<String>getList("recipient").size()); for (String recipient : ns.<String>getList("recipient")) { try { recipients.add(m.getPushAddress(recipient)); } catch (InvalidNumberException e) { System.err.println("Failed to add recipient \"" + recipient + "\": " + e.getMessage()); System.err.println("Aborting sending."); System.exit(1); } } sendMessage(m, messageText, textSecureAttachments, recipients); break; case "receive": if (!m.isRegistered()) { System.err.println("User is not registered."); System.exit(1); } try { m.receiveMessages(5, true, new ReceiveMessageHandler(m)); } catch (IOException e) { System.err.println("Error while receiving message: " + e.getMessage()); System.exit(3); } catch (AssertionError e) { System.err.println("Failed to receive message (Assertion): " + e.getMessage()); System.err.println(e.getStackTrace()); System.err.println( "If you use an Oracle JRE please check if you have unlimited strength crypto enabled, see README"); System.exit(1); } break; } m.save(); System.exit(0); }
From source file:GIST.IzbirkomExtractor.IzbirkomExtractor.java
/** * @param args/* ww w. ja v a 2 s .c om*/ */ public static void main(String[] args) { // process command-line options Options options = new Options(); options.addOption("n", "noaddr", false, "do not do any address matching (for testing)"); options.addOption("i", "info", false, "create and populate address information table"); options.addOption("h", "help", false, "this message"); // database connection options.addOption("s", "server", true, "database server to connect to"); options.addOption("d", "database", true, "OSM database name"); options.addOption("u", "user", true, "OSM database user name"); options.addOption("p", "pass", true, "OSM database password"); // logging options options.addOption("l", "logdir", true, "log file directory (default './logs')"); options.addOption("e", "loglevel", true, "log level (default 'FINEST')"); // automatically generate the help statement HelpFormatter help_formatter = new HelpFormatter(); // database URI for connection String dburi = null; // Information message for help screen String info_msg = "IzbirkomExtractor [options] <html_directory>"; try { CommandLineParser parser = new GnuParser(); CommandLine cmd = parser.parse(options, args); if (cmd.hasOption('h') || cmd.getArgs().length != 1) { help_formatter.printHelp(info_msg, options); System.exit(1); } /* prohibit n and i together */ if (cmd.hasOption('n') && cmd.hasOption('i')) { System.err.println("Options 'n' and 'i' cannot be used together."); System.exit(1); } /* require database arguments without -n */ if (cmd.hasOption('n') && (cmd.hasOption('s') || cmd.hasOption('d') || cmd.hasOption('u') || cmd.hasOption('p'))) { System.err.println("Options 'n' and does not need any databse parameters."); System.exit(1); } /* require all 4 database options to be used together */ if (!cmd.hasOption('n') && !(cmd.hasOption('s') && cmd.hasOption('d') && cmd.hasOption('u') && cmd.hasOption('p'))) { System.err.println( "For database access all of the following arguments have to be specified: server, database, user, pass"); System.exit(1); } /* useful variables */ SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'kk:mm"); String dateString = formatter.format(new Date()); /* setup logging */ File logdir = new File(cmd.hasOption('l') ? cmd.getOptionValue('l') : "logs"); FileUtils.forceMkdir(logdir); File log_file_name = new File( logdir + "/" + IzbirkomExtractor.class.getName() + "-" + formatter.format(new Date()) + ".log"); FileHandler log_file = new FileHandler(log_file_name.getPath()); /* create "latest" link to currently created log file */ Path latest_log_link = Paths.get(logdir + "/latest"); Files.deleteIfExists(latest_log_link); Files.createSymbolicLink(latest_log_link, Paths.get(log_file_name.getName())); log_file.setFormatter(new SimpleFormatter()); LogManager.getLogManager().reset(); // prevents logging to console logger.addHandler(log_file); logger.setLevel(cmd.hasOption('e') ? Level.parse(cmd.getOptionValue('e')) : Level.FINEST); // open directory with HTML files and create file list File dir = new File(cmd.getArgs()[0]); if (!dir.isDirectory()) { System.err.println("Unable to find directory '" + cmd.getArgs()[0] + "', exiting"); System.exit(1); } PathMatcher pmatcher = FileSystems.getDefault() .getPathMatcher("glob:? * ?*.html"); ArrayList<File> html_files = new ArrayList<>(); for (Path file : Files.newDirectoryStream(dir.toPath())) if (pmatcher.matches(file.getFileName())) html_files.add(file.toFile()); if (html_files.size() == 0) { System.err.println("No matching HTML files found in '" + dir.getAbsolutePath() + "', exiting"); System.exit(1); } // create csvResultSink FileOutputStream csvout_file = new FileOutputStream("parsed_addresses-" + dateString + ".csv"); OutputStreamWriter csvout = new OutputStreamWriter(csvout_file, "UTF-8"); ResultSink csvResultSink = new CSVResultSink(csvout, new CSVStrategy('|', '"', '#')); // Connect to DB and osmAddressMatcher AddressMatcher osmAddressMatcher; DBSink dbSink = null; DBInfoSink dbInfoSink = null; if (cmd.hasOption('n')) { osmAddressMatcher = new DummyAddressMatcher(); } else { dburi = "jdbc:postgresql://" + cmd.getOptionValue('s') + "/" + cmd.getOptionValue('d'); Connection con = DriverManager.getConnection(dburi, cmd.getOptionValue('u'), cmd.getOptionValue('p')); osmAddressMatcher = new OsmAddressMatcher(con); dbSink = new DBSink(con); if (cmd.hasOption('i')) dbInfoSink = new DBInfoSink(con); } /* create resultsinks */ SinkMultiplexor sm = SinkMultiplexor.newSinkMultiplexor(); sm.addResultSink(csvResultSink); if (dbSink != null) { sm.addResultSink(dbSink); if (dbInfoSink != null) sm.addResultSink(dbInfoSink); } // create tableExtractor TableExtractor te = new TableExtractor(osmAddressMatcher, sm); // TODO: printout summary of options: processing date/time, host, directory of HTML files, jdbc uri, command line with parameters // iterate through files logger.info("Start processing " + html_files.size() + " files in " + dir); for (int i = 0; i < html_files.size(); i++) { System.err.println("Parsing #" + i + ": " + html_files.get(i)); te.processHTMLfile(html_files.get(i)); } System.err.println("Processed " + html_files.size() + " HTML files"); logger.info("Finished processing " + html_files.size() + " files in " + dir); } catch (ParseException e1) { System.err.println("Failed to parse CLI: " + e1.getMessage()); help_formatter.printHelp(info_msg, options); System.exit(1); } catch (IOException e) { System.err.println("I/O Exception: " + e.getMessage()); e.printStackTrace(); System.exit(1); } catch (SQLException e) { System.err.println("Database '" + dburi + "': " + e.getMessage()); System.exit(1); } catch (ResultSinkException e) { System.err.println("Failed to initialize ResultSink: " + e.getMessage()); System.exit(1); } catch (TableExtractorException e) { System.err.println("Failed to initialize Table Extractor: " + e.getMessage()); System.exit(1); } catch (CloneNotSupportedException | IllegalAccessException | InstantiationException e) { System.err.println("Something really odd happened: " + e.getMessage()); e.printStackTrace(); System.exit(1); } }
From source file:eu.crydee.stanfordcorenlp.Tokenizer.java
/** * Wrapper around Stanford CoreNLP to tokenize text. * * Give it an input dir of text files with --input-dir and it'll ouput * tokenized versions, one sentence per line with space separated words to * --output-dir (defaults to out/)./*from w ww.ja va 2s .c om*/ * * @param args CLI args. Example: --input-dir my-input --output-dir * my-output. */ public static void main(String[] args) { ArgumentParser parser = ArgumentParsers.newArgumentParser("stanford-corenlp-tokenizer-wrapper") .description("Converts Mediawiki dumps to text."); parser.addArgument("-i", "--input-dir").required(true).help("Path of the input text files directory."); parser.addArgument("-o", "--output-dir").help("Path of the output text files directory.").setDefault("out"); Params params = new Params(); try { parser.parseArgs(args, params); } catch (ArgumentParserException ex) { System.err.println("Could not parse arguments: " + ex.getMessage()); System.exit(1); } Tokenizer tokenizer = new Tokenizer(); try { Files.list(Paths.get(params.inDirPath)).filter(Files::isRegularFile).map(Path::toFile).map(f -> { try { return Pair.of(f.getName(), FileUtils.readFileToString(f, StandardCharsets.UTF_8)); } catch (IOException ex) { System.err.println("Could not read input text file: " + ex.getLocalizedMessage()); throw new UncheckedIOException(ex); } }).forEach(p -> { String text = tokenizer.tokenizeAndSentenceSplit(p.getRight()); try { FileUtils.writeStringToFile(Paths.get(params.outDirpath, p.getLeft()).toFile(), text, StandardCharsets.UTF_8); } catch (IOException ex) { System.err.println("Could not write output text file: " + ex.getLocalizedMessage()); } }); } catch (IOException ex) { System.err.println("Could not read from input directory: " + ex.getLocalizedMessage()); } }
From source file:tpt.dbweb.cat.evaluation.ComparisonResult.java
public static void main(String[] args) throws JsonGenerationException, JsonMappingException, IOException { ComparisonResult result = new ComparisonResult(); result.docidToMetricToResult.computeIfAbsent("doc", k -> new TreeMap<>()).put("part", new ValueEvaluationStatistics(Fraction.ONE, Fraction.ONE)); System.out.println(mapper.writeValueAsString(new Fraction(1.0, 2.0))); String ves = mapper.writeValueAsString(new ValueEvaluationStatistics(Fraction.ONE, new Fraction(1.0, 2.0))); System.out.println(ves);/*from w w w. j a v a 2 s . c o m*/ System.out.println(mapper.readValue(ves, ValueEvaluationStatistics.class)); result.write(Paths.get("/tmp/test.json")); read(Paths.get("/tmp/test.json")); }
From source file:io.anserini.search.SearchTweets.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(RM3_OPTION, "apply relevance feedback with rm3")); options.addOption(/*from w ww . ja v a 2 s . c o m*/ OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); options.addOption(OptionBuilder.withArgName("similarity").hasArg() .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(SearchTweets.class.getName(), options); System.exit(-1); } File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); if (!indexLocation.exists()) { System.err.println("Error: " + indexLocation + " does not exist!"); System.exit(-1); } String runtag = cmdline.hasOption(RUNTAG_OPTION) ? cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; String topicsFile = cmdline.getOptionValue(QUERIES_OPTION); int numResults = 1000; try { if (cmdline.hasOption(NUM_RESULTS_OPTION)) { numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); } } catch (NumberFormatException e) { System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); System.exit(-1); } String similarity = "LM"; if (cmdline.hasOption(SIMILARITY_OPTION)) { similarity = cmdline.getOptionValue(SIMILARITY_OPTION); } PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexLocation.getAbsolutePath()))); IndexSearcher searcher = new IndexSearcher(reader); if (similarity.equalsIgnoreCase("BM25")) { searcher.setSimilarity(new BM25Similarity()); } else if (similarity.equalsIgnoreCase("LM")) { searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); } MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(topicsFile)); for (MicroblogTopic topic : topics) { Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(), true, true); Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery()); TopDocs rs = searcher.search(query, filter, numResults); RerankerContext context = new RerankerContext(searcher, query, topic.getQuery(), filter); RerankerCascade cascade = new RerankerCascade(context); if (cmdline.hasOption(RM3_OPTION)) { cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name)); cascade.add(new RemoveRetweetsTemporalTiebreakReranker()); } else { cascade.add(new RemoveRetweetsTemporalTiebreakReranker()); } ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher)); for (int i = 0; i < docs.documents.length; i++) { String qid = topic.getId().replaceFirst("^MB0*", ""); out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i], runtag)); } } reader.close(); out.close(); }
From source file:fr.inria.atlanmod.instantiator.SpecimenGenerator.java
public static void main(String[] args) throws GenerationException, IOException { Options options = new Options(); configureOptions(options);/*from ww w.j av a 2 s . c o m*/ CommandLineParser parser = new GnuParser(); try { CommandLine commandLine = parser.parse(options, args); String metamodel = commandLine.getOptionValue(METAMODEL); DefaultModelGenerator modelGen = new DefaultModelGenerator(URI.createFileURI(metamodel)); if (commandLine.hasOption(ADDITIONAL_METAMODEL)) { for (String additionalMetamodel : commandLine.getOptionValues(ADDITIONAL_METAMODEL)) { URI additionalMetamodelUri = URI.createFileURI(additionalMetamodel); Resource resource = new XMIResourceImpl(additionalMetamodelUri); resource.load(Collections.emptyMap()); registerPackages(resource); } } if (commandLine.hasOption(OUTPUT_DIR)) { String outDir = commandLine.getOptionValue(OUTPUT_DIR); modelGen.setSamplesPath(Paths.get(outDir)); } else { modelGen.setSamplesPath(Paths.get(".")); } if (commandLine.hasOption(N_MODELS)) { int models = ((Number) commandLine.getParsedOptionValue(N_MODELS)).intValue(); modelGen.setSetSize(new int[] { models }); } else { modelGen.setSetSize(new int[] { 1 }); } if (commandLine.hasOption(SIZE)) { long size = ((Number) commandLine.getParsedOptionValue(SIZE)).longValue(); modelGen.setModelsSize(new long[] { size }); } else { modelGen.setModelsSize(new long[] { 1000 }); } if (commandLine.hasOption(SEED)) { long seed = ((Number) commandLine.getParsedOptionValue(SEED)).longValue(); modelGen.setSeed(seed); } else { modelGen.setSeed(System.currentTimeMillis()); } modelGen.runGeneration(); } catch (ParseException e) { System.err.println(e.getLocalizedMessage()); HelpFormatter formatter = new HelpFormatter(); formatter.setOptionComparator(new OptionComarator<Option>()); try { formatter.setWidth(Math.max(TerminalFactory.get().getWidth(), 80)); } catch (Throwable t) { // Nothing to do... } ; formatter.printHelp("java -jar <this-file.jar>", options, true); } }
From source file:io.anserini.index.IndexGov2.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(/* w ww .j av a 2 s . c om*/ OptionBuilder.withArgName("path").hasArg().withDescription("input data path").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output index path") .create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexer threads") .create(THREADS_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("max number of documents to index (-1 to index everything)") .create(DOCLIMIT_OPTION)); options.addOption(POSITIONS_OPTION, false, "index positions"); options.addOption(OPTIMIZE_OPTION, false, "merge all index segments"); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(THREADS_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(100); formatter.printHelp(IndexGov2.class.getCanonicalName(), options); System.exit(-1); } final String dirPath = cmdline.getOptionValue(INDEX_OPTION); final String dataDir = cmdline.getOptionValue(INPUT_OPTION); final int docCountLimit = cmdline.hasOption(DOCLIMIT_OPTION) ? Integer.parseInt(cmdline.getOptionValue(DOCLIMIT_OPTION)) : -1; final int numThreads = Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)); final boolean doUpdate = cmdline.hasOption(UPDATE_OPTION); final boolean positions = cmdline.hasOption(POSITIONS_OPTION); final boolean optimize = cmdline.hasOption(OPTIMIZE_OPTION); final Analyzer a = new EnglishAnalyzer(); final TrecContentSource trecSource = createGov2Source(dataDir); final Directory dir = FSDirectory.open(Paths.get(dirPath)); LOG.info("Index path: " + dirPath); LOG.info("Doc limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit)); LOG.info("Threads: " + numThreads); LOG.info("Positions: " + positions); LOG.info("Optimize (merge segments): " + optimize); final IndexWriterConfig config = new IndexWriterConfig(a); if (doUpdate) { config.setOpenMode(IndexWriterConfig.OpenMode.APPEND); } else { config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } final IndexWriter writer = new IndexWriter(dir, config); Gov2IndexThreads threads = new Gov2IndexThreads(writer, positions, trecSource, numThreads, docCountLimit); LOG.info("Indexer: start"); final long t0 = System.currentTimeMillis(); threads.start(); while (!threads.done()) { Thread.sleep(100); } threads.stop(); final long t1 = System.currentTimeMillis(); LOG.info("Indexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + writer.maxDoc() + " docs"); if (!doUpdate && docCountLimit != -1 && writer.maxDoc() != docCountLimit) { throw new RuntimeException("w.maxDoc()=" + writer.maxDoc() + " but expected " + docCountLimit); } if (threads.failed.get()) { throw new RuntimeException("exceptions during indexing"); } final long t2; t2 = System.currentTimeMillis(); final Map<String, String> commitData = new HashMap<String, String>(); commitData.put("userData", "multi"); writer.setCommitData(commitData); writer.commit(); final long t3 = System.currentTimeMillis(); LOG.info("Indexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)"); if (optimize) { LOG.info("Indexer: merging all segments"); writer.forceMerge(1); final long t4 = System.currentTimeMillis(); LOG.info("Indexer: segments merged (took " + (t4 - t3) / 1000.0 + " sec)"); } LOG.info("Indexer: at close: " + writer.segString()); final long tCloseStart = System.currentTimeMillis(); writer.close(); LOG.info("Indexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec"); dir.close(); final long tFinal = System.currentTimeMillis(); LOG.info("Indexer: finished (" + (tFinal - t0) / 1000.0 + " sec)"); LOG.info("Indexer: net bytes indexed " + threads.getBytesIndexed()); LOG.info("Indexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.)) + " GB/hour plain text"); }
From source file:apps.LuceneIndexer.java
public static void main(String[] args) { Options options = new Options(); options.addOption("i", null, true, "input file"); options.addOption("o", null, true, "output directory"); options.addOption("r", null, true, "optional output TREC-format QREL file"); options.addOption("bm25_b", null, true, "BM25 parameter: b"); options.addOption("bm25_k1", null, true, "BM25 parameter: k1"); options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity"); Joiner commaJoin = Joiner.on(','); Joiner spaceJoin = Joiner.on(' '); options.addOption("source_type", null, true, "document source type: " + commaJoin.join(SourceFactory.getDocSourceList())); // If you increase this value, you may need to modify the following line in *.sh file // export MAVEN_OPTS="-Xms8192m -server" double ramBufferSizeMB = 1024 * 8; // 8 GB CommandLineParser parser = new org.apache.commons.cli.GnuParser(); IndexWriter indexWriter = null;//from w w w .j a va 2 s . c o m BufferedWriter qrelWriter = null; int docNum = 0; try { CommandLine cmd = parser.parse(options, args); String inputFileName = null, outputDirName = null, qrelFileName = null; if (cmd.hasOption("i")) { inputFileName = cmd.getOptionValue("i"); } else { Usage("Specify 'input file'", options); } if (cmd.hasOption("o")) { outputDirName = cmd.getOptionValue("o"); } else { Usage("Specify 'index directory'", options); } if (cmd.hasOption("r")) { qrelFileName = cmd.getOptionValue("r"); } String sourceName = cmd.getOptionValue("source_type"); if (sourceName == null) Usage("Specify document source type", options); if (qrelFileName != null) qrelWriter = new BufferedWriter(new FileWriter(qrelFileName)); File outputDir = new File(outputDirName); if (!outputDir.exists()) { if (!outputDir.mkdirs()) { System.out.println("couldn't create " + outputDir.getAbsolutePath()); System.exit(1); } } if (!outputDir.isDirectory()) { System.out.println(outputDir.getAbsolutePath() + " is not a directory!"); System.exit(1); } if (!outputDir.canWrite()) { System.out.println("Can't write to " + outputDir.getAbsolutePath()); System.exit(1); } boolean useFixedBM25 = cmd.hasOption("bm25fixed"); float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT; if (cmd.hasOption("bm25_k1")) { try { bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_k1'", options); } } if (cmd.hasOption("bm25_b")) { try { bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_b'", options); } } EnglishAnalyzer analyzer = new EnglishAnalyzer(); FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName)); IndexWriterConfig indexConf = new IndexWriterConfig(analyzer); /* OpenMode.CREATE creates a new index or overwrites an existing one. https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE */ indexConf.setOpenMode(OpenMode.CREATE); indexConf.setRAMBufferSizeMB(ramBufferSizeMB); System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b)); if (useFixedBM25) { System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b)); } else { System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b)); } indexWriter = new IndexWriter(indexDir, indexConf); DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName); DocumentEntry inpDoc = null; TextCleaner textCleaner = new TextCleaner(null); while ((inpDoc = inpDocSource.next()) != null) { ++docNum; Document luceneDoc = new Document(); ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText); String cleanText = spaceJoin.join(cleanedToks); // System.out.println(inpDoc.mDocId); // System.out.println(cleanText); // System.out.println("=============================="); luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES)); luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES)); indexWriter.addDocument(luceneDoc); if (inpDoc.mIsRel != null && qrelWriter != null) { saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0); } if (docNum % 1000 == 0) System.out.println(String.format("Indexed %d documents", docNum)); } } catch (ParseException e) { e.printStackTrace(); Usage("Cannot parse arguments" + e, options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } finally { System.out.println(String.format("Indexed %d documents", docNum)); try { if (null != indexWriter) indexWriter.close(); if (null != qrelWriter) qrelWriter.close(); } catch (IOException e) { System.err.println("IO exception: " + e); e.printStackTrace(); } } }