List of usage examples for java.io.BufferedReader.close()
public void close() throws IOException
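BufferedReader implements Closeable, so close() releases the underlying reader and any system resources it holds; after close(), further read(), readLine(), or ready() calls throw an IOException, while closing an already-closed reader has no effect. A minimal sketch of the idiomatic pattern on Java 7+ (the file name is hypothetical): try-with-resources invokes close() automatically, even when the body throws.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class CloseExample {
    public static void main(String[] args) throws IOException {
        // close() runs automatically at the end of this block,
        // whether the body completes normally or throws.
        try (BufferedReader reader = new BufferedReader(new FileReader("input.txt"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

The examples below mostly predate this idiom and call close() by hand; notes after each one point out where that pattern can leak the reader.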
From source file:iac.cnr.it.TestSearcher.java
public static void main(String[] args) throws IOException, ParseException {
    /** Command line parser and options */
    CommandLineParser parser = new PosixParser();
    Options options = new Options();
    options.addOption(OPT_INDEX, true, "Index path");
    options.addOption(OPT_QUERY, true, "The query");

    CommandLine cmd = null;
    try {
        cmd = parser.parse(options, args);
    } catch (org.apache.commons.cli.ParseException e) {
        logger.fatal("Error while parsing command line arguments");
        System.exit(1);
    }

    /** Check for mandatory options */
    if (!cmd.hasOption(OPT_INDEX) || !cmd.hasOption(OPT_QUERY)) {
        usage();
        System.exit(0);
    }

    /** Read options */
    File casePath = new File(cmd.getOptionValue(OPT_INDEX));
    String query = cmd.getOptionValue(OPT_QUERY);

    /** Check correctness of the path containing an ISODAC case */
    if (!casePath.exists() || !casePath.isDirectory()) {
        logger.fatal("The case directory \"" + casePath.getAbsolutePath() + "\" is not valid");
        System.exit(1);
    }

    /** Check existence of the info.dat file */
    File infoFile = new File(casePath, INFO_FILENAME);
    if (!infoFile.exists()) {
        logger.fatal("Can't find " + INFO_FILENAME + " within the case directory (" + casePath + ")");
        System.exit(1);
    }

    /** Load the mapping image_uuid --> image_filename */
    imagesMap = new HashMap<Integer, String>();
    BufferedReader reader = new BufferedReader(new FileReader(infoFile));
    while (reader.ready()) {
        String line = reader.readLine();
        logger.info("Read the line: " + line);
        String currentID = line.split("\t")[0];
        String currentImgFile = line.split("\t")[1];
        imagesMap.put(Integer.parseInt(currentID), currentImgFile);
        logger.info("ID: " + currentID + " - IMG: " + currentImgFile + " added to the map");
    }
    reader.close();

    /** Load all the directories containing an index */
    ArrayList<String> indexesDirs = new ArrayList<String>();
    for (File f : casePath.listFiles()) {
        logger.info("Analyzing: " + f);
        if (f.isDirectory())
            indexesDirs.add(f.getAbsolutePath());
    }
    logger.info(indexesDirs.size() + " directories found!");

    /** Set up the searcher */
    Searcher searcher = null;
    try {
        String[] array = indexesDirs.toArray(new String[indexesDirs.size()]);
        searcher = new Searcher(array);
        TopDocs results = searcher.search(query, Integer.MAX_VALUE);
        ScoreDoc[] hits = results.scoreDocs;
        int numTotalHits = results.totalHits;
        System.out.println(numTotalHits + " total matching documents");

        for (int i = 0; i < numTotalHits; i++) {
            Document doc = searcher.doc(hits[i].doc);
            String path = doc.get(FIELD_PATH);
            String filename = doc.get(FIELD_FILENAME);
            String image_uuid = doc.get(FIELD_IMAGE_ID);
            if (path != null) {
                //System.out.println((i + 1) + ". " + path + File.separator + filename + " - score: " + hits[i].score);
                //System.out.println((i + 1) + ". " + path + File.separator + filename + " - image_file: " + image_uuid);
                System.out.println((i + 1) + ". " + path + File.separator + filename + " - image_file: "
                        + imagesMap.get(Integer.parseInt(image_uuid)));
            } else {
                System.out.println((i + 1) + ". " + "No path for this document");
            }
        }
    } catch (Exception e) {
        System.err.println("An error occurred: " + e.getMessage());
        e.printStackTrace();
    } finally {
        if (searcher != null)
            searcher.close();
    }
}
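Two notes on the info-file loop above: ready() only reports whether the next read can complete without blocking, so it is not a reliable end-of-input test, and reader.close() is skipped entirely if readLine() or parsing throws. A minimal sketch of the same mapping load (file name hypothetical, error handling elided) using the conventional readLine() != null test with try-with-resources:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class InfoFileLoader {
    public static Map<Integer, String> loadImagesMap(String infoFile) throws IOException {
        Map<Integer, String> imagesMap = new HashMap<Integer, String>();
        try (BufferedReader reader = new BufferedReader(new FileReader(infoFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\t");
                imagesMap.put(Integer.parseInt(fields[0]), fields[1]);
            }
        } // reader is closed here even if parsing throws
        return imagesMap;
    }
}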
From source file:ca.mcgill.networkdynamics.geoinference.evaluation.CrossValidationScorer.java
public static void main(String[] args) throws Exception {
    if (args.length != 4) {
        System.out.println("java CVS predictions-dir/ " + "cv-gold-dir/ results.txt error-sample.tsv");
        return;
    }

    File predDir = new File(args[0]);
    File cvDir = new File(args[1]);

    TDoubleList errors = new TDoubleArrayList(10_000_000);
    TLongSet locatedUsers = new TLongHashSet(10_000_000);
    TLongSet allUsers = new TLongHashSet(10_000_000);
    TLongObjectMap<TDoubleList> userToErrors = new TLongObjectHashMap<TDoubleList>();
    TLongDoubleMap tweetIdToError = new TLongDoubleHashMap(10_000_000);
    TLongObjectMap<double[]> idToPredLoc = new TLongObjectHashMap<double[]>();

    int tweetsSeen = 0;
    int tweetsLocated = 0;

    BufferedReader cvBr = new BufferedReader(new FileReader(new File(cvDir, "folds.info.tsv")));
    for (String foldLine = null; (foldLine = cvBr.readLine()) != null;) {
        String[] cols = foldLine.split("\t");
        String foldName = cols[0];
        System.out.printf("Scoring results for fold %s%n", foldName);

        File foldPredictionsFile = new File(predDir, foldName + ".results.tsv.gz");
        File goldLocFile = new File(cvDir, foldName + ".gold-locations.tsv");

        if (foldPredictionsFile.exists()) {
            BufferedReader br = Files.openGz(foldPredictionsFile);
            for (String line = null; (line = br.readLine()) != null;) {
                String[] arr = line.split("\t");
                long id = Long.parseLong(arr[0]);
                idToPredLoc.put(id, new double[] { Double.parseDouble(arr[1]), Double.parseDouble(arr[2]) });
            }
            br.close();
        }
        System.out.printf("loaded predictions for %d tweets; " + "scoring predictions%n", idToPredLoc.size());

        BufferedReader br = new BufferedReader(new FileReader(goldLocFile));
        for (String line = null; (line = br.readLine()) != null;) {
            String[] arr = line.split("\t");
            long id = Long.parseLong(arr[0]);
            long userId = Long.parseLong(arr[1]);
            allUsers.add(userId);
            tweetsSeen++;
            double[] predLoc = idToPredLoc.get(id);
            if (predLoc == null)
                continue;
            tweetsLocated++;
            locatedUsers.add(userId);
            double[] goldLoc = new double[] { Double.parseDouble(arr[2]), Double.parseDouble(arr[3]) };
            double dist = Geometry.getDistance(predLoc, goldLoc);
            errors.add(dist);
            tweetIdToError.put(id, dist);
            TDoubleList userErrors = userToErrors.get(userId);
            if (userErrors == null) {
                userErrors = new TDoubleArrayList();
                userToErrors.put(userId, userErrors);
            }
            userErrors.add(dist);
        }
        br.close();
    }

    errors.sort();
    System.out.println("Num errors to score: " + errors.size());

    double auc = 0;
    double userCoverage = 0;
    double tweetCoverage = tweetsLocated / (double) tweetsSeen;
    double medianMaxUserError = Double.NaN;
    double medianMedianUserError = Double.NaN;

    if (errors.size() > 0) {
        auc = computeAuc(errors);
        userCoverage = locatedUsers.size() / ((double) allUsers.size());

        TDoubleList maxUserErrors = new TDoubleArrayList(locatedUsers.size());
        TDoubleList medianUserErrors = new TDoubleArrayList(locatedUsers.size());
        for (TDoubleList userErrors : userToErrors.valueCollection()) {
            userErrors.sort();
            maxUserErrors.add(userErrors.get(userErrors.size() - 1));
            medianUserErrors.add(userErrors.get(userErrors.size() / 2));
        }
        maxUserErrors.sort();
        medianMaxUserError = maxUserErrors.get(maxUserErrors.size() / 2);
        medianUserErrors.sort();
        medianMedianUserError = medianUserErrors.get(medianUserErrors.size() / 2);

        // Compute CDF
        int[] errorsPerKm = new int[MAX_KM];
        for (int i = 0; i < errors.size(); ++i) {
            int error = (int) (Math.round(errors.get(i)));
            errorsPerKm[error]++;
        }

        // The accumulated sum of errors per km
        int[] errorsBelowEachKm = new int[errorsPerKm.length];
        for (int i = 0; i < errorsBelowEachKm.length; ++i) {
            errorsBelowEachKm[i] = errorsPerKm[i];
            if (i > 0)
                errorsBelowEachKm[i] += errorsBelowEachKm[i - 1];
        }

        final double[] cdf = new double[errorsBelowEachKm.length];
        double dSize = errors.size(); // to avoid casting all the time
        for (int i = 0; i < cdf.length; ++i)
            cdf[i] = errorsBelowEachKm[i] / dSize;
    }

    PrintWriter pw = new PrintWriter(new File(args[2]));
    pw.println("AUC\t" + auc);
    pw.println("user coverage\t" + userCoverage);
    pw.println("tweet coverage\t" + tweetCoverage);
    pw.println("median-max error\t" + medianMaxUserError);
    pw.close();

    // Choose a random sampling of 10K tweets to pass on to the authors here.
    PrintWriter errorsPw = new PrintWriter(args[3]);
    TLongList idsWithErrors = new TLongArrayList(tweetIdToError.keySet());
    idsWithErrors.shuffle(new Random());
    // Choose the first 10K
    for (int i = 0, chosen = 0; i < idsWithErrors.size() && chosen < 10_000; ++i) {
        long id = idsWithErrors.get(i);
        double[] prediction = idToPredLoc.get(id);
        double error = tweetIdToError.get(id);
        errorsPw.println(id + "\t" + error + "\t" + prediction[0] + "\t" + prediction[1]);
        ++chosen;
    }
    errorsPw.close();
}
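In the example above, each per-fold reader is closed only on the success path, and closing the outermost BufferedReader is what releases the whole stream chain beneath it. A small sketch (file argument hypothetical) showing that a single close() on the outer reader also closes the wrapped InputStreamReader, GZIPInputStream, and FileInputStream:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;

public class GzTsvReader {
    public static void main(String[] args) throws IOException {
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(new FileInputStream(args[0])), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] cols = line.split("\t");
                System.out.println(cols[0]); // e.g. the id column of each TSV row
            }
        } // closing br closes the reader, the gzip stream, and the file
    }
}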
From source file:com.aestel.chemistry.openEye.fp.DistMatrix.java
public static void main(String... args) throws IOException {
    long start = System.currentTimeMillis();

    // create command line Options object
    Options options = new Options();
    Option opt = new Option("i", true, "input file [.tsv from FingerPrinter]");
    opt.setRequired(true);
    options.addOption(opt);

    opt = new Option("o", true, "output file [.tsv]");
    opt.setRequired(true);
    options.addOption(opt);

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse(options, args);
    } catch (Exception e) {
        System.err.println(e.getMessage());
        exitWithHelp(options);
    }
    args = cmd.getArgs();

    if (args.length != 0)
        exitWithHelp(options);

    String file = cmd.getOptionValue("i");
    BufferedReader in = new BufferedReader(new FileReader(file));

    file = cmd.getOptionValue("o");
    PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file)));

    ArrayList<Fingerprint> fps = new ArrayList<Fingerprint>();
    ArrayList<String> ids = new ArrayList<String>();

    String line;
    while ((line = in.readLine()) != null) {
        String[] parts = line.split("\t");
        if (parts.length == 3) {
            ids.add(parts[0]);
            fps.add(new ByteFingerprint(parts[2]));
        }
    }
    in.close();

    out.print("ID");
    for (int i = 0; i < ids.size(); i++) {
        out.print('\t');
        out.print(ids.get(i));
    }
    out.println();

    for (int i = 0; i < ids.size(); i++) {
        out.print(ids.get(i));
        Fingerprint fp1 = fps.get(i);
        for (int j = 0; j <= i; j++) {
            out.printf("\t%.4g", fp1.tanimoto(fps.get(j)));
        }
        out.println();
    }
    out.close();

    System.err.printf("Done %d fingerprints in %.2gsec\n", fps.size(),
            (System.currentTimeMillis() - start) / 1000D);
}
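The reader and writer pair in this example can also be managed together. A sketch with hypothetical file names: both resources share one try-with-resources header and are closed in reverse declaration order, and closing the PrintWriter flushes the BufferedWriter beneath it before releasing the file.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

public class CopyTsv {
    public static void main(String[] args) throws IOException {
        try (BufferedReader in = new BufferedReader(new FileReader("fp.tsv"));
             PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter("dist.tsv")))) {
            String line;
            while ((line = in.readLine()) != null) {
                out.println(line);
            }
        } // out is closed (and flushed) first, then in
    }
}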
From source file:com.cyberway.issue.util.SurtPrefixSet.java
/**
 * Allow class to be used as a command-line tool for converting
 * URL lists (or naked host or host/path fragments implied
 * to be HTTP URLs) to implied SURT prefix form.
 *
 * Read from stdin or first file argument. Writes to stdout.
 *
 * @param args cmd-line arguments: may include input file
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    InputStream in = args.length > 0 ? new BufferedInputStream(new FileInputStream(args[0])) : System.in;
    PrintStream out = args.length > 1
            ? new PrintStream(new BufferedOutputStream(new FileOutputStream(args[1])))
            : System.out;
    BufferedReader br = new BufferedReader(new InputStreamReader(in));
    String line;
    while ((line = br.readLine()) != null) {
        if (line.indexOf("#") > 0)
            line = line.substring(0, line.indexOf("#"));
        line = line.trim();
        if (line.length() == 0)
            continue;
        out.println(prefixFromPlain(line));
    }
    br.close();
    out.close();
}
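Note that when no file argument is given, br.close() above closes System.in itself, which cannot be reopened for the remainder of the process. A sketch of one common guard, closing the reader only when it wraps a real file:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class MaybeStdin {
    public static void main(String[] args) throws IOException {
        boolean fromFile = args.length > 0;
        BufferedReader br = new BufferedReader(new InputStreamReader(
                fromFile ? new FileInputStream(args[0]) : System.in));
        try {
            String line;
            while ((line = br.readLine()) != null)
                System.out.println(line);
        } finally {
            if (fromFile)
                br.close(); // leave System.in open for later use
        }
    }
}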
From source file:URLGet.java
public static void main(String[] args) {
    BufferedReader in = null;
    if (args.length == 1) {
        try {
            URL url = new URL(args[0]);
            in = new BufferedReader(new InputStreamReader(url.openStream()));
            String line = null;
            while ((line = in.readLine()) != null)
                System.out.println(line);
        } catch (MalformedURLException ex) {
            System.err.println(ex);
        } catch (FileNotFoundException ex) {
            System.err.println("Failed to open stream to URL: " + ex);
        } catch (IOException ex) {
            System.err.println("Error reading URL content: " + ex);
        }
        if (in != null)
            try {
                in.close();
            } catch (IOException ex) {
            }
    } else
        System.err.println("Usage: URLGet URL");
}
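The null check and the empty catch around in.close() are the classic pre-Java-7 idiom. A sketch of the same program (class name hypothetical) on Java 7+, where try-with-resources performs the close and keeps the original exception, recording any close() failure as a suppressed exception instead of masking it:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

public class URLGetTwr {
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: URLGet URL");
            return;
        }
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(args[0]).openStream()))) {
            String line;
            while ((line = in.readLine()) != null)
                System.out.println(line);
        } catch (MalformedURLException ex) {
            System.err.println(ex);
        } catch (IOException ex) {
            System.err.println("Error reading URL content: " + ex);
        }
    }
}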
From source file:cc.twittertools.index.IndexStatuses.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(/*from w w w .j ava 2 s .c o m*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexStatuses.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexed(true); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. 
if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
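Since br above wraps fin, br.close() already closes the file; the extra fin.close() is harmless but redundant. A sketch of the deletes-loading step as a hypothetical helper, assuming the same CBZip2InputStream and LongOpenHashSet types as the surrounding example, with try-with-resources on the file stream (the two magic bytes are still consumed before the stream is handed to CBZip2InputStream):

private static LongOpenHashSet readDeletes(File deletesFile) throws IOException {
    LongOpenHashSet deletes = new LongOpenHashSet();
    try (FileInputStream fin = new FileInputStream(deletesFile)) {
        byte[] ignoreBytes = new byte[2];
        fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools
        BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin)));
        String s;
        while ((s = br.readLine()) != null) {
            deletes.add(Long.parseLong(s.contains("\t") ? s.split("\t")[0] : s));
        }
        // closing fin (automatic here) releases the whole stream chain
    }
    return deletes;
}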
From source file:io.anserini.index.IndexTweets.java
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(/*ww w . j ava 2 s .co m*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexTweets.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(Paths.get(indexPath)); final IndexWriterConfig config = new IndexWriterConfig(ANALYZER); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. 
if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongPoint(StatusField.ID.name, status.getId())); doc.add(new StoredField(StatusField.ID.name, status.getId())); doc.add(new LongPoint(StatusField.EPOCH.name, status.getEpoch())); doc.add(new StoredField(StatusField.EPOCH.name, status.getEpoch())); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, status.getFollowersCount())); doc.add(new StoredField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount())); doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount())); doc.add(new StoredField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount())); doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, status.getStatusesCount())); doc.add(new StoredField(StatusField.STATUSES_COUNT.name, status.getStatusesCount())); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId)); doc.add(new StoredField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId)); doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId())); doc.add(new StoredField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId())); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId)); doc.add(new StoredField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId)); doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId())); doc.add(new StoredField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId())); doc.add(new IntPoint(StatusField.RETWEET_COUNT.name, status.getRetweetCount())); doc.add(new StoredField(StatusField.RETWEET_COUNT.name, status.getRetweetCount())); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
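The finally block here closes writer, dir, and stream in sequence, but if writer.close() throws, the remaining two calls never run. A sketch of the same cleanup with try-with-resources, assuming JsonStatusCorpusReader implements Closeable (its close() call suggests it does); resources close in reverse declaration order, so writer, then dir, then stream, and later failures are recorded as suppressed exceptions:

// Sketch of the resource management only; the indexing loop is unchanged.
try (StatusStream stream = new JsonStatusCorpusReader(file);
        Directory dir = FSDirectory.open(Paths.get(indexPath));
        IndexWriter writer = new IndexWriter(dir, config)) {
    Status status;
    while ((status = stream.next()) != null) {
        // ... build and add the Document exactly as above ...
    }
} // writer, dir, and stream are all closed, even if indexing throws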
From source file:fr.inria.edelweiss.kgdqp.core.FedQueryingCLI.java
@SuppressWarnings("unchecked") public static void main(String args[]) throws ParseException, EngineException { List<String> endpoints = new ArrayList<String>(); String queryPath = null;//from ww w .j a va 2 s .co m int slice = -1; Options options = new Options(); Option helpOpt = new Option("h", "help", false, "print this message"); Option queryOpt = new Option("q", "query", true, "specify the sparql query file"); Option endpointOpt = new Option("e", "endpoints", true, "the list of federated sparql endpoint URLs"); Option groupingOpt = new Option("g", "grouping", true, "triple pattern optimisation"); Option slicingOpt = new Option("s", "slicing", true, "size of the slicing parameter"); Option versionOpt = new Option("v", "version", false, "print the version information and exit"); options.addOption(queryOpt); options.addOption(endpointOpt); options.addOption(helpOpt); options.addOption(versionOpt); options.addOption(groupingOpt); options.addOption(slicingOpt); String header = "Corese/KGRAM DQP command line interface"; String footer = "\nPlease report any issue to alban.gaignard@cnrs.fr"; CommandLineParser parser = new BasicParser(); CommandLine cmd = parser.parse(options, args); if (cmd.hasOption("h")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("kgdqp", header, options, footer, true); System.exit(0); } if (!cmd.hasOption("e")) { logger.info("You must specify at least the URL of one sparql endpoint !"); System.exit(0); } else { endpoints = new ArrayList<String>(Arrays.asList(cmd.getOptionValues("e"))); } if (!cmd.hasOption("q")) { logger.info("You must specify a path for a sparql query !"); System.exit(0); } else { queryPath = cmd.getOptionValue("q"); } if (cmd.hasOption("s")) { try { slice = Integer.parseInt(cmd.getOptionValue("s")); } catch (NumberFormatException ex) { logger.warn(cmd.getOptionValue("s") + " is not formatted as number for the slicing parameter"); logger.warn("Slicing disabled"); } } if (cmd.hasOption("v")) { logger.info("version 3.0.4-SNAPSHOT"); System.exit(0); } ///////////////// Graph graph = Graph.create(); QueryProcessDQP exec = QueryProcessDQP.create(graph); exec.setGroupingEnabled(cmd.hasOption("g")); if (slice > 0) { exec.setSlice(slice); } Provider sProv = ProviderImplCostMonitoring.create(); exec.set(sProv); for (String url : endpoints) { try { exec.addRemote(new URL(url), WSImplem.REST); } catch (MalformedURLException ex) { logger.error(url + " is not a well-formed URL"); System.exit(1); } } StringBuffer fileData = new StringBuffer(1000); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(queryPath)); } catch (FileNotFoundException ex) { logger.error("Query file " + queryPath + " not found !"); System.exit(1); } char[] buf = new char[1024]; int numRead = 0; try { while ((numRead = reader.read(buf)) != -1) { String readData = String.valueOf(buf, 0, numRead); fileData.append(readData); buf = new char[1024]; } reader.close(); } catch (IOException ex) { logger.error("Error while reading query file " + queryPath); System.exit(1); } String sparqlQuery = fileData.toString(); // Query q = exec.compile(sparqlQuery, null); // System.out.println(q); StopWatch sw = new StopWatch(); sw.start(); Mappings map = exec.query(sparqlQuery); int dqpSize = map.size(); System.out.println("--------"); long time = sw.getTime(); System.out.println(time + " " + dqpSize); }
From source file:biomine.nodeimportancecompression.ImportanceCompressionReport.java
public static void main(String[] args) throws IOException, java.text.ParseException {
    opts.addOption("algorithm", true,
            "Used algorithm for compression. Possible values are 'brute-force', "
                    + "'brute-force-edges','brute-force-merges','randomized','randomized-merges',"
                    + "'randomized-edges'," + "'fast-brute-force',"
                    + "'fast-brute-force-merges','fast-brute-force-merge-edges'. Default is 'brute-force'.");
    opts.addOption("query", true, "Query nodes ids, separated by comma.");
    opts.addOption("queryfile", true, "Read query nodes from file.");
    opts.addOption("ratio", true, "Goal ratio");
    opts.addOption("importancefile", true, "Read importances straight from file");
    opts.addOption("keepedges", false, "Don't remove edges during merges");
    opts.addOption("connectivity", false, "Compute and output connectivities in edge oriented case");
    opts.addOption("paths", false, "Do path oriented compression");
    opts.addOption("edges", false, "Do edge oriented compression");
    // opts.addOption( "a",

    double sigma = 1.0;
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse(opts, args);
    } catch (ParseException e) {
        e.printStackTrace();
        System.exit(0);
    }

    String queryStr = cmd.getOptionValue("query");
    String[] queryNodeIDs = {};
    double[] queryNodeIMP = {};
    if (queryStr != null) {
        queryNodeIDs = queryStr.split(",");
        queryNodeIMP = new double[queryNodeIDs.length];
        for (int i = 0; i < queryNodeIDs.length; i++) {
            String s = queryNodeIDs[i];
            String[] es = s.split("=");
            queryNodeIMP[i] = 1;
            if (es.length == 2) {
                queryNodeIDs[i] = es[0];
                queryNodeIMP[i] = Double.parseDouble(es[1]);
            } else if (es.length > 2) {
                System.out.println("Too many '=' in querynode specification: " + s);
            }
        }
    }

    String queryFile = cmd.getOptionValue("queryfile");
    Map<String, Double> queryNodes = Collections.EMPTY_MAP;
    if (queryFile != null) {
        File in = new File(queryFile);
        BufferedReader read = new BufferedReader(new FileReader(in));
        queryNodes = readMap(read);
        read.close();
    }

    String impfile = cmd.getOptionValue("importancefile");
    Map<String, Double> importances = null;
    if (impfile != null) {
        File in = new File(impfile);
        BufferedReader read = new BufferedReader(new FileReader(in));
        importances = readMap(read);
        read.close();
    }

    String algoStr = cmd.getOptionValue("algorithm");
    CompressionAlgorithm algo = null;

    if (algoStr == null || algoStr.equals("brute-force")) {
        algo = new BruteForceCompression();
    } else if (algoStr.equals("brute-force-edges")) {
        algo = new BruteForceCompressionOnlyEdges();
    } else if (algoStr.equals("brute-force-merges")) {
        algo = new BruteForceCompressionOnlyMerges();
    } else if (algoStr.equals("fast-brute-force-merges")) {
        //algo = new FastBruteForceCompressionOnlyMerges();
        algo = new FastBruteForceCompression(true, false);
    } else if (algoStr.equals("fast-brute-force-edges")) {
        algo = new FastBruteForceCompression(false, true);
        //algo = new FastBruteForceCompressionOnlyEdges();
    } else if (algoStr.equals("fast-brute-force")) {
        algo = new FastBruteForceCompression(true, true);
    } else if (algoStr.equals("randomized-edges")) {
        algo = new RandomizedCompressionOnlyEdges(); //modified
    } else if (algoStr.equals("randomized")) {
        algo = new RandomizedCompression();
    } else if (algoStr.equals("randomized-merges")) {
        algo = new RandomizedCompressionOnlyMerges();
    } else {
        System.out.println("Unsupported algorithm: " + algoStr);
        printHelp();
    }

    String ratioStr = cmd.getOptionValue("ratio");
    double ratio = 0;
    if (ratioStr != null) {
        ratio = Double.parseDouble(ratioStr);
    } else {
        System.out.println("Goal ratio not specified");
        printHelp();
    }

    String infile = null;
    if (cmd.getArgs().length != 0) {
        infile = cmd.getArgs()[0];
    } else {
        printHelp();
    }

    BMGraph bmg = BMGraphUtils.readBMGraph(new File(infile));
    HashMap<BMNode, Double> queryBMNodes = new HashMap<BMNode, Double>();
    for (String id : queryNodes.keySet()) {
        queryBMNodes.put(bmg.getNode(id), queryNodes.get(id));
    }

    long startMillis = System.currentTimeMillis();
    ImportanceGraphWrapper wrap = QueryImportance.queryImportanceGraph(bmg, queryBMNodes);

    if (importances != null) {
        for (String id : importances.keySet()) {
            wrap.setImportance(bmg.getNode(id), importances.get(id));
        }
    }

    ImportanceMerger merger = null;
    if (cmd.hasOption("edges")) {
        merger = new ImportanceMergerEdges(wrap.getImportanceGraph());
    } else if (cmd.hasOption("paths")) {
        merger = new ImportanceMergerPaths(wrap.getImportanceGraph());
    } else {
        System.out.println("Specify either 'paths' or 'edges'.");
        System.exit(1);
    }

    if (cmd.hasOption("keepedges")) {
        merger.setKeepEdges(true);
    }

    algo.compress(merger, ratio);
    long endMillis = System.currentTimeMillis();

    // write importance
    {
        BufferedWriter wr = new BufferedWriter(new FileWriter("importance.txt", false));
        for (BMNode nod : bmg.getNodes()) {
            wr.write(nod + " " + wrap.getImportance(nod) + "\n");
        }
        wr.close();
    }

    // write sum of all pairs of node importance added by Fang
    /*
    {
        BufferedWriter wr = new BufferedWriter(new FileWriter("sum_of_all_pairs_importance.txt", true));
        ImportanceGraph orig = wrap.getImportanceGraph();
        double sum = 0;
        for (int i = 0; i <= orig.getMaxNodeId(); i++) {
            for (int j = i + 1; j <= orig.getMaxNodeId(); j++) {
                sum = sum + wrap.getImportance(i) * wrap.getImportance(j);
            }
        }
        wr.write("" + sum);
        wr.write("\n");
        wr.close();
    }
    */

    // write uncompressed edges
    {
        BufferedWriter wr = new BufferedWriter(new FileWriter("edges.txt", false));
        ImportanceGraph orig = wrap.getImportanceGraph();
        ImportanceGraph ucom = merger.getUncompressedGraph();
        for (int i = 0; i <= orig.getMaxNodeId(); i++) {
            String iname = wrap.intToNode(i).toString();
            HashSet<Integer> ne = new HashSet<Integer>();
            ne.addAll(orig.getNeighbors(i));
            ne.addAll(ucom.getNeighbors(i));
            for (int j : ne) {
                if (i < j)
                    continue;
                String jname = wrap.intToNode(j).toString();
                double a = orig.getEdgeWeight(i, j);
                double b = ucom.getEdgeWeight(i, j);
                wr.write(iname + " " + jname + " " + a + " " + b + " " + Math.abs(a - b));
                wr.write("\n");
            }
        }
        wr.close();
    }

    // write distance
    {
        // BufferedWriter wr = new BufferedWriter(new FileWriter("distance.txt", false));
        BufferedWriter wr = new BufferedWriter(new FileWriter("distance.txt", true)); //modified by Fang
        ImportanceGraph orig = wrap.getImportanceGraph();
        ImportanceGraph ucom = merger.getUncompressedGraph();
        double error = 0;
        for (int i = 0; i <= orig.getMaxNodeId(); i++) {
            HashSet<Integer> ne = new HashSet<Integer>();
            ne.addAll(orig.getNeighbors(i));
            ne.addAll(ucom.getNeighbors(i));
            for (int j : ne) {
                if (i <= j)
                    continue;
                double a = orig.getEdgeWeight(i, j);
                double b = ucom.getEdgeWeight(i, j);
                error += (a - b) * (a - b) * wrap.getImportance(i) * wrap.getImportance(j);
                // modified by Fang: multiply imp(u)imp(v)
            }
        }
        error = Math.sqrt(error);
        //////////error = Math.sqrt(error / 2); // modified by Fang: the error of each edge is counted twice
        wr.write("" + error);
        wr.write("\n");
        wr.close();
    }

    // write sizes
    {
        ImportanceGraph orig = wrap.getImportanceGraph();
        ImportanceGraph comp = merger.getCurrentGraph();
        // BufferedWriter wr = new BufferedWriter(new FileWriter("sizes.txt", false));
        BufferedWriter wr = new BufferedWriter(new FileWriter("sizes.txt", true)); //modified by Fang
        wr.write(orig.getNodeCount() + " " + orig.getEdgeCount() + " " + comp.getNodeCount() + " "
                + comp.getEdgeCount());
        wr.write("\n");
        wr.close();
    }

    // write time
    {
        System.out.println("writing time");
        BufferedWriter wr = new BufferedWriter(new FileWriter("time.txt", true)); //modified by Fang
        double secs = (endMillis - startMillis) * 0.001;
        wr.write("" + secs + "\n");
        wr.close();
    }

    // write change of connectivity for edge-oriented case // added by Fang
    {
        if (cmd.hasOption("connectivity")) {
            BufferedWriter wr = new BufferedWriter(new FileWriter("connectivity.txt", true));
            ImportanceGraph orig = wrap.getImportanceGraph();
            ImportanceGraph ucom = merger.getUncompressedGraph();
            double diff = 0;
            for (int i = 0; i <= orig.getMaxNodeId(); i++) {
                ProbDijkstra pdori = new ProbDijkstra(orig, i);
                ProbDijkstra pducom = new ProbDijkstra(ucom, i);
                for (int j = i + 1; j <= orig.getMaxNodeId(); j++) {
                    double oriconn = pdori.getProbTo(j);
                    double ucomconn = pducom.getProbTo(j);
                    diff = diff + (oriconn - ucomconn) * (oriconn - ucomconn) * wrap.getImportance(i)
                            * wrap.getImportance(j);
                }
            }
            diff = Math.sqrt(diff);
            wr.write("" + diff);
            wr.write("\n");
            wr.close();
        }
    }

    // write output graph
    {
        BMGraph output = bmg; //new BMGraph(bmg);
        int no = 0;
        BMNode[] nodes = new BMNode[merger.getGroups().size()];
        for (ArrayList<Integer> gr : merger.getGroups()) {
            BMNode bmgroup = new BMNode("Group", "" + (no + 1));
            bmgroup.setAttributes(new HashMap<String, String>());
            bmgroup.put("autoedges", "0");
            nodes[no] = bmgroup;
            no++;
            if (gr.size() == 0)
                continue;
            for (int x : gr) {
                BMNode nod = output.getNode(wrap.intToNode(x).toString());
                BMEdge belongs = new BMEdge(nod, bmgroup, "belongs_to");
                output.ensureHasEdge(belongs);
            }
            output.ensureHasNode(bmgroup);
        }
        for (int i = 0; i < nodes.length; i++) {
            for (int x : merger.getCurrentGraph().getNeighbors(i)) {
                if (x == i) {
                    nodes[x].put("selfedge", "" + merger.getCurrentGraph().getEdgeWeight(i, x));
                    //ge.put("goodness", "" + merger.getCurrentGraph().getEdgeWeight(i, x));
                    continue;
                }
                BMEdge ge = new BMEdge(nodes[x], nodes[i], "groupedge");
                ge.setAttributes(new HashMap<String, String>());
                ge.put("goodness", "" + merger.getCurrentGraph().getEdgeWeight(i, x));
                output.ensureHasEdge(ge);
            }
        }
        System.out.println(output.getGroupNodes());
        BMGraphUtils.writeBMGraph(output, "output.bmg");
    }
}
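Both readMap() call sites above close their reader only on the success path. A sketch of the query-file branch with try-with-resources, so the reader is released even when readMap() throws (the typed Collections.emptyMap() also avoids the raw EMPTY_MAP warning):

Map<String, Double> queryNodes = Collections.emptyMap();
if (queryFile != null) {
    try (BufferedReader read = new BufferedReader(new FileReader(new File(queryFile)))) {
        queryNodes = readMap(read);
    } // read.close() happens here, success or failure
}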
From source file:eu.smartfp7.foursquare.AttendanceCrawler.java
/**
 * The main takes an undefined number of cities as arguments, then initializes
 * the specific crawling of all the trending venues of these cities.
 * The trending venues must have been previously identified using the `DownloadPages`
 * program.
 *
 * Current valid cities are: london, amsterdam, goldcoast, sanfrancisco.
 */
public static void main(String[] args) throws Exception {
    Settings settings = Settings.getInstance();
    String folder = settings.getFolder();

    // We keep info and error logs, so that we know what happened in case
    // of incoherence in the time series.
    Map<String, FileWriter> info_logs = new HashMap<String, FileWriter>();
    Map<String, FileWriter> error_logs = new HashMap<String, FileWriter>();

    // For each city we monitor, we store the venue IDs that we got from
    // a previous crawl.
    Map<String, Collection<String>> city_venues = new HashMap<String, Collection<String>>();

    // Contains the epoch time when the last API call has been made for each
    // venue. Ensures that we get data only once each hour.
    Map<String, Long> venue_last_call = new HashMap<String, Long>();

    // Contains the epoch time when we last checked if time series were broken
    // for each city.
    // We do these checks once every day before the batch forecasting begins.
    Map<String, Long> sanity_checks = new HashMap<String, Long>();

    // We also keep in memory the number of checkins for the last hour for
    // each venue.
    Map<String, Integer> venue_last_checkin = new HashMap<String, Integer>();

    Map<Long, Integer> APICallsCount = new HashMap<Long, Integer>();
    DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    int total_venues = 0;
    long total_calls = 0;
    long time_spent_on_API = 0;

    for (String c : args) {
        settings.checkFileHierarchy(c);

        city_venues.put(c, loadVenues(c));
        total_venues += city_venues.get(c).size();

        info_logs.put(c, new FileWriter(folder + c + File.separator + "log" + File.separator + "info.log", true));
        error_logs.put(c, new FileWriter(folder + c + File.separator + "log" + File.separator + "error.log", true));

        Calendar cal = Calendar.getInstance();

        info_logs.get(c).write("[" + df.format(cal.getTime()) + "] Crawler initialization for " + c + ". "
                + city_venues.get(c).size() + " venues loaded.\n");
        info_logs.get(c).flush();

        // If we interrupted the program for some reason, we can get back
        // the in-memory data.
        // Important: the program must not be interrupted for more than one
        // hour, or we will lose time series data.
        for (String venue_id : city_venues.get(c)) {
            String ts_file = folder + c + File.separator + "attendances_crawl" + File.separator + venue_id
                    + ".ts";

            if (new File(ts_file).exists()) {
                BufferedReader buffer = new BufferedReader(new FileReader(ts_file));
                String mem = null, line = null;
                for (; (line = buffer.readLine()) != null; mem = line)
                    ;
                buffer.close();
                if (mem == null)
                    continue;
                String[] tmp = mem.split(",");
                venue_last_call.put(venue_id, df.parse(tmp[0]).getTime());
                venue_last_checkin.put(venue_id, Integer.parseInt(tmp[3]));

                VenueUtil.fixBrokenTimeSeriesVenue(new File(ts_file));
            } // if
        } // for

        sanity_checks.put(c, cal.getTimeInMillis());
    } // for

    if (total_venues > 5000) {
        System.out.println("Too many venues for a single API account (max 5000).\n"
                + "Please create a new Foursquare API account and use these credentials.\nExiting now.");
        return;
    }

    while (true) {
        for (String c : args) {
            // We create a FIFO queue and pop venue IDs one at a time.
            LinkedList<String> city_venues_buffer = new LinkedList<String>(city_venues.get(c));
            String venue_id = null;

            // Artificial wait to avoid processors looping at 100% of their capacity
            // when there are no more venues to crawl for the current hour.
            Thread.sleep(3000);

            while ((venue_id = city_venues_buffer.pollFirst()) != null) {
                // We get the current time according to the city's time zone
                Calendar cal = Calendar.getInstance();
                cal.add(Calendar.MILLISECOND,
                        TimeZone.getTimeZone(settings.getCityTimezone(c)).getOffset(cal.getTime().getTime())
                                - Calendar.getInstance().getTimeZone().getOffset(cal.getTime().getTime()));
                //TimeZone.getTimeZone("Europe/London").getOffset(cal.getTime().getTime()));

                long current_time = DateUtils.truncate(cal.getTime(), Calendar.HOUR).getTime();

                // We query Foursquare only once per hour per venue.
                if (venue_last_call.get(venue_id) != null
                        && current_time < venue_last_call.get(venue_id) + 3600000)
                    continue;

                intelligentWait(total_venues, cal.getTime().getTime(),
                        (total_calls == 0 ? 0 : Math.round(time_spent_on_API / total_calls)));

                Venue venue = null;
                try {
                    long beforeCall = System.currentTimeMillis();
                    venue = new Venue(getFoursquareVenueById(venue_id, c));

                    // If there is no last call, this is the beginning of the time series
                    // for this venue. We get the number of people "here now" to initialize
                    // the series.
                    if (venue_last_call.get(venue_id) == null) {
                        /** TODO: by doing this, we keep a representation of the venue dating from the
                         *        beginning of the specific crawl. we might want to change this and update
                         *        this file once in a while.
                         */
                        FileWriter info = new FileWriter(folder + c + File.separator + "foursquare_venues"
                                + File.separator + venue_id + ".info");
                        info.write(venue.getFoursquareJson());
                        info.close();

                        FileWriter out = new FileWriter(folder + c + File.separator + "attendances_crawl"
                                + File.separator + venue_id + ".ts");
                        out.write("Date,here_now,hour_checkins,total_checkins\n");
                        out.write(df.format(current_time) + "," + venue.getHereNow() + "," + venue.getHereNow()
                                + "," + venue.getCheckincount() + "\n");
                        out.close();
                    } else {
                        FileWriter out = new FileWriter(folder + c + File.separator + "attendances_crawl"
                                + File.separator + venue_id + ".ts", true);
                        int checks = venue.getCheckincount() - venue_last_checkin.get(venue_id);
                        out.write(df.format(current_time) + "," + venue.getHereNow() + ","
                                + Integer.toString(checks) + "," + venue.getCheckincount() + "\n");
                        out.close();
                    }

                    if (APICallsCount.get(current_time) == null)
                        APICallsCount.put(current_time, 1);
                    else
                        APICallsCount.put(current_time, APICallsCount.get(current_time) + 1);

                    total_calls++;

                    venue_last_call.put(venue_id, current_time);
                    venue_last_checkin.put(venue_id, venue.getCheckincount());

                    time_spent_on_API += System.currentTimeMillis() - beforeCall;
                } catch (Exception e) {
                    // If something bad happens (crawler not available, IO error, ...), we put the
                    // venue_id in the FIFO queue so that it gets reevaluated later.
                    //e.printStackTrace();
                    error_logs.get(c)
                            .write("[" + df.format(cal.getTime().getTime()) + "] Error with venue " + venue_id
                                    + " (" + e.getMessage() + "). " + APICallsCount.get(current_time)
                                    + " API calls so far this hour, " + city_venues_buffer.size()
                                    + " venues remaining in the buffer.\n");
                    error_logs.get(c).flush();

                    System.out.println("[" + df.format(cal.getTime().getTime()) + "] " + c + " -- "
                            + APICallsCount.get(current_time) + " API calls // " + city_venues_buffer.size()
                            + " venues remaining " + " (" + e.getMessage() + ")");

                    if (e instanceof FoursquareAPIException)
                        if (((FoursquareAPIException) e).getHttp_code().equals("400")
                                && ((FoursquareAPIException) e).getError_detail()
                                        .equals("Venue " + venue_id + " has been deleted")) {
                            city_venues.get(c).remove(venue_id);
                            removeVenue(venue_id, c);
                        } else
                            city_venues_buffer.add(venue_id);

                    continue;
                }
            } // while

            // Every day between 0am and 2am, we repair all the broken time series (if there
            // is something to repair).
            Calendar cal = Calendar.getInstance();
            if (city_venues_buffer.peekFirst() == null
                    && (cal.getTimeInMillis() - sanity_checks.get(c)) >= 86400000
                    && cal.get(Calendar.HOUR_OF_DAY) < 2) {
                VenueUtil.fixBrokenTimeSeriesCity(c, folder);
                sanity_checks.put(c, cal.getTimeInMillis());
                info_logs.get(c).write("[" + df.format(cal.getTime()) + "] Sanity check OK.\n");
                info_logs.get(c).flush();
            }
        } // for
    } // while
}