List of usage examples for java.util.Collection.size()
int size();
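size() returns the number of elements in the collection, or Integer.MAX_VALUE if the collection holds more than Integer.MAX_VALUE elements. A minimal, self-contained sketch of that contract (not drawn from the source files below):

import java.util.ArrayList;
import java.util.Collection;

public class SizeExample {
    public static void main(String[] args) {
        Collection<String> names = new ArrayList<>();
        System.out.println(names.size()); // 0: a fresh collection is empty
        names.add("alice");
        names.add("bob");
        System.out.println(names.size()); // 2
        names.remove("alice");
        System.out.println(names.size()); // 1: size() reflects removals immediately
    }
}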
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step6HITPreparator.java
public static void main(String[] args) throws Exception {
    // input dir - list of xml query containers (step5-linguistic-annotation/)
    System.err.println("Starting step 6 HIT Preparation");
    File inputDir = new File(args[0]);

    // output dir (note: File.delete() fails silently on a non-empty directory)
    File outputDir = new File(args[1]);
    if (outputDir.exists()) {
        outputDir.delete();
    }
    outputDir.mkdir();

    // never populated below, so the size() == 0 check admits every file
    List<String> queries = new ArrayList<>();

    // iterate over query containers
    int countClueWeb = 0;
    int countSentence = 0;
    for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));
        if (queries.contains(f.getName()) || queries.size() == 0) {
            // groups contain only non-empty documents
            Map<Integer, List<QueryResultContainer.SingleRankedResult>> groups = new HashMap<>();

            // split into groups according to the number of sentences (one group per 40 sentences)
            for (QueryResultContainer.SingleRankedResult rankedResult : queryResultContainer.rankedResults) {
                if (rankedResult.originalXmi != null) {
                    byte[] bytes = new BASE64Decoder()
                            .decodeBuffer(new ByteArrayInputStream(rankedResult.originalXmi.getBytes()));
                    JCas jCas = JCasFactory.createJCas();
                    XmiCasDeserializer.deserialize(new ByteArrayInputStream(bytes), jCas.getCas());

                    Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
                    int groupId = sentences.size() / 40;

                    if (!groups.containsKey(groupId)) {
                        groups.put(groupId, new ArrayList<>());
                    }
                    groups.get(groupId).add(rankedResult);
                    countClueWeb++;
                } else {
                    System.err.println("Empty document: " + rankedResult.clueWebID);
                }
            }

            for (Map.Entry<Integer, List<QueryResultContainer.SingleRankedResult>> entry : groups.entrySet()) {
                Integer groupId = entry.getKey();
                List<QueryResultContainer.SingleRankedResult> rankedResults = entry.getValue();

                // make sure the results are sorted by rank
                // DEBUG: for (QueryResultContainer.SingleRankedResult r : rankedResults) System.out.print(r.rank + "\t");
                Collections.sort(rankedResults, (o1, o2) -> o1.rank.compareTo(o2.rank));

                // iterate over results for one query and group
                for (int i = 0; i < rankedResults.size() && i < TOP_RESULTS_PER_GROUP; i++) {
                    QueryResultContainer.SingleRankedResult rankedResult = rankedResults.get(i);
                    int rank = rankedResult.rank;

                    MustacheFactory mf = new DefaultMustacheFactory();
                    Mustache mustache = mf.compile("template/template.html");

                    String queryId = queryResultContainer.qID;
                    String query = queryResultContainer.query;
                    // make the first letter uppercase
                    query = query.substring(0, 1).toUpperCase() + query.substring(1);

                    List<String> relevantInformationExamples = queryResultContainer.relevantInformationExamples;
                    List<String> irrelevantInformationExamples = queryResultContainer.irrelevantInformationExamples;

                    byte[] bytes = new BASE64Decoder()
                            .decodeBuffer(new ByteArrayInputStream(rankedResult.originalXmi.getBytes()));
                    JCas jCas = JCasFactory.createJCas();
                    XmiCasDeserializer.deserialize(new ByteArrayInputStream(bytes), jCas.getCas());

                    // collect sentences and paragraph boundaries; force a boundary
                    // every 120 sentences so no chunk grows unbounded
                    List<generators.Sentence> sentences = new ArrayList<>();
                    List<Integer> paragraphs = new ArrayList<>();
                    paragraphs.add(0);
                    for (WebParagraph webParagraph : JCasUtil.select(jCas, WebParagraph.class)) {
                        for (Sentence s : JCasUtil.selectCovered(Sentence.class, webParagraph)) {
                            String sentenceBegin = String.valueOf(s.getBegin());
                            generators.Sentence sentence = new generators.Sentence(s.getCoveredText(),
                                    sentenceBegin);
                            sentences.add(sentence);
                            countSentence++;
                        }
                        int sentenceID = paragraphs.get(paragraphs.size() - 1);
                        if (sentences.size() > 120) {
                            while (sentenceID < sentences.size()) {
                                if (!paragraphs.contains(sentenceID)) {
                                    paragraphs.add(sentenceID);
                                }
                                sentenceID = sentenceID + 120;
                            }
                        }
                        paragraphs.add(sentences.size());
                    }

                    System.err.println("Output dir: " + outputDir);

                    // write each chunk of at most 120 sentences to its own HTML file,
                    // bucketed into subdirectories 40/, 80/ and 120/ by chunk length
                    int startID = 0;
                    int endID;
                    for (int j = 0; j < paragraphs.size(); j++) {
                        endID = paragraphs.get(j);
                        int sentLength = endID - startID;
                        if (sentLength > 120 || j == paragraphs.size() - 1) {
                            if (sentLength > 120) {
                                endID = paragraphs.get(j - 1);
                                j--;
                            }
                            sentLength = endID - startID;
                            if (sentLength <= 40) {
                                groupId = 40;
                            } else if (sentLength <= 80) {
                                groupId = 80;
                            } else {
                                groupId = 120;
                            }

                            File folder = new File(outputDir + "/" + groupId);
                            if (!folder.exists()) {
                                System.err.println("creating directory: " + outputDir + "/" + groupId);
                                boolean result = false;
                                try {
                                    folder.mkdir();
                                    result = true;
                                } catch (SecurityException se) {
                                    // leave result == false; the write below will then fail visibly
                                }
                                if (result) {
                                    System.out.println("DIR created");
                                }
                            }

                            String newHtmlFile = folder.getAbsolutePath() + "/" + f.getName() + "_"
                                    + rankedResult.clueWebID + "_" + sentLength + ".html";
                            System.err.println("Printing a file: " + newHtmlFile);
                            File newHTML = new File(newHtmlFile);
                            int t = 0;
                            while (newHTML.exists()) {
                                newHTML = new File(folder.getAbsolutePath() + "/" + f.getName() + "_"
                                        + rankedResult.clueWebID + "_" + sentLength + "." + t + ".html");
                                t++;
                            }

                            mustache.execute(new PrintWriter(new FileWriter(newHTML)),
                                    new generators(query, relevantInformationExamples,
                                            irrelevantInformationExamples,
                                            sentences.subList(startID, endID), queryId, rank))
                                    .flush();
                            startID = endID;
                        }
                    }
                }
            }
        }
    }
    System.out.println("Printed " + countClueWeb + " documents with " + countSentence + " sentences");
}
From source file:com.tamingtext.classifier.bayes.ExtractTrainingData.java
public static void main(String[] args) {
    log.info("Command-line arguments: " + Arrays.toString(args));

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("Lucene index directory containing input data").withShortName("d").create();

    Option categoryOpt = obuilder.withLongName("categories").withRequired(true)
            .withArgument(abuilder.withName("file").withMinimum(1).withMaximum(1).create())
            .withDescription("File containing a list of categories").withShortName("c").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("Output directory").withShortName("o").create();

    Option categoryFieldsOpt = obuilder.withLongName("category-fields").withRequired(true)
            .withArgument(abuilder.withName("fields").withMinimum(1).withMaximum(1).create())
            .withDescription("Fields to match categories against (comma-delimited)").withShortName("cf")
            .create();

    Option textFieldsOpt = obuilder.withLongName("text-fields").withRequired(true)
            .withArgument(abuilder.withName("fields").withMinimum(1).withMaximum(1).create())
            .withDescription("Fields from which to extract training text (comma-delimited)").withShortName("tf")
            .create();

    Option useTermVectorsOpt = obuilder.withLongName("use-term-vectors")
            .withDescription("Extract term vectors containing preprocessed data "
                    + "instead of unprocessed, stored text values")
            .withShortName("tv").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(categoryOpt)
            .withOption(outputOpt).withOption(categoryFieldsOpt).withOption(textFieldsOpt)
            .withOption(useTermVectorsOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File inputDir = new File(cmdLine.getValue(inputOpt).toString());
        if (!inputDir.isDirectory()) {
            throw new IllegalArgumentException(inputDir + " does not exist or is not a directory");
        }

        File categoryFile = new File(cmdLine.getValue(categoryOpt).toString());
        if (!categoryFile.isFile()) {
            throw new IllegalArgumentException(categoryFile + " does not exist or is not a file");
        }

        File outputDir = new File(cmdLine.getValue(outputOpt).toString());
        outputDir.mkdirs();
        if (!outputDir.isDirectory()) {
            throw new IllegalArgumentException(outputDir + " is not a directory or could not be created");
        }

        Collection<String> categoryFields = stringToList(cmdLine.getValue(categoryFieldsOpt).toString());
        if (categoryFields.size() < 1) {
            throw new IllegalArgumentException("At least one category field must be specified.");
        }

        Collection<String> textFields = stringToList(cmdLine.getValue(textFieldsOpt).toString());
        if (textFields.size() < 1) {
            throw new IllegalArgumentException("At least one text field must be specified.");
        }

        boolean useTermVectors = cmdLine.hasOption(useTermVectorsOpt);

        extractTraininingData(inputDir, categoryFile, categoryFields, textFields, outputDir, useTermVectors);
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (IOException e) {
        log.error("IOException", e);
    } finally {
        closeWriters();
    }
}
From source file:io.apiman.tools.i18n.TemplateScanner.java
public static void main(String[] args) throws IOException {
    if (args == null || args.length != 1) {
        System.out.println("Template directory not provided (no path provided).");
        System.exit(1);
    }
    File templateDir = new File(args[0]);
    if (!templateDir.isDirectory()) {
        System.out.println("Template directory not provided (provided path is not a directory).");
        System.exit(1);
    }
    if (!new File(templateDir, "dash.html").isFile()) {
        System.out.println("Template directory not provided (dash.html not found).");
        System.exit(1);
    }
    File outputDir = new File(templateDir, "../../../../../../tools/i18n/target");
    if (!outputDir.isDirectory()) {
        System.out.println("Output directory not found: " + outputDir);
        System.exit(1);
    }
    File outputFile = new File(outputDir, "scanner-messages.properties");
    if (outputFile.isFile() && !outputFile.delete()) {
        System.out.println("Couldn't delete the old messages.properties: " + outputFile);
        System.exit(1);
    }

    System.out.println("Starting scan.");
    System.out.println("Scanning template directory: " + templateDir.getAbsolutePath());
    String[] extensions = { "html", "include" };
    Collection<File> files = FileUtils.listFiles(templateDir, extensions, true);
    TreeMap<String, String> strings = new TreeMap<>();
    for (File file : files) {
        System.out.println("\tScanning file: " + file);
        scanFile(file, strings);
    }
    outputMessages(strings, outputFile);
    System.out.println("Scan complete. Scanned " + files.size() + " files and discovered " + strings.size()
            + " translation strings.");
}
From source file:eu.annocultor.utils.OntologySubtractor.java
public static void main(String[] args) throws Exception {
    boolean copy = checkNoCopyOption(args);

    if (args.length == 2 || args.length == 3) {
        File sourceDir = new File(args[0]);
        File destinationDir = new File(args[1]);
        checkSrcAndDstDirs(sourceDir, destinationDir);

        Collection<String> filesWithDeletedStatements = listNameStamsForFilesWithDeletedStatements(sourceDir);
        if (filesWithDeletedStatements.isEmpty()) {
            System.out.println(
                    "Did not find any file *.*.*.deleted.rdf with statements to be deleted. Doing nothing and exiting.");
        } else {
            System.out.println(
                    "Found " + filesWithDeletedStatements.size() + " files with statements to be deleted");
            System.out.println(
                    "Copying all RDF files from " + sourceDir.getName() + " to " + destinationDir.getName());
            if (copy) {
                copyRdfFiles(sourceDir, destinationDir);
            }
            sutractAll(sourceDir, destinationDir, filesWithDeletedStatements);
        }
    } else {
        // wrong argument count: print usage from the bundled readme
        for (Object string : IOUtils.readLines(new AutoCloseInputStream(
                OntologySubtractor.class.getResourceAsStream("/subtractor/readme.txt")))) {
            System.out.println(string.toString());
        }
    }
}
From source file:com.evolveum.midpoint.testing.model.client.sample.Main.java
/**
 * @param args
 */
public static void main(String[] args) {
    try {
        ModelPortType modelPort = createModelPort(args);

        SystemConfigurationType configurationType = getConfiguration(modelPort);
        System.out.println("Got system configuration");
        // System.out.println(configurationType);

        UserType userAdministrator = searchUserByName(modelPort, "administrator");
        System.out.println("Got administrator user: " + userAdministrator.getOid());
        // System.out.println(userAdministrator);

        RoleType sailorRole = searchRoleByName(modelPort, "Sailor");
        System.out.println("Got Sailor role");
        // System.out.println(sailorRole);

        Collection<ResourceType> resources = listResources(modelPort);
        System.out.println("Resources (" + resources.size() + ")");
        // dump(resources);

        Collection<UserType> users = listUsers(modelPort);
        System.out.println("Users (" + users.size() + ")");
        // dump(users);

        Collection<TaskType> tasks = listTasks(modelPort);
        System.out.println("Tasks (" + tasks.size() + ")");
        // dump(tasks);
        // System.out.println("Next scheduled times: ");
        // for (TaskType taskType : tasks) {
        //     System.out.println(" - " + getOrig(taskType.getName()) + ": "
        //             + taskType.getNextRunStartTimestamp());
        // }

        String userGuybrushoid = createUserGuybrush(modelPort, sailorRole);
        System.out.println("Created user guybrush, OID: " + userGuybrushoid);

        UserType userGuybrush = getUser(modelPort, userGuybrushoid);
        System.out.println("Fetched user guybrush:");
        // System.out.println(userGuybrush);
        System.out.println("Users fullName: " + ModelClientUtil.getOrig(userGuybrush.getFullName()));

        String userLeChuckOid = createUserFromSystemResource(modelPort, "user-lechuck.xml");
        System.out.println("Created user lechuck, OID: " + userLeChuckOid);

        changeUserPassword(modelPort, userGuybrushoid, "MIGHTYpirate");
        System.out.println("Changed user password");

        changeUserGivenName(modelPort, userLeChuckOid, "CHUCK");
        System.out.println("Changed user given name");

        assignRoles(modelPort, userGuybrushoid, ROLE_PIRATE_OID, ROLE_CAPTAIN_OID);
        System.out.println("Assigned roles");

        unAssignRoles(modelPort, userGuybrushoid, ROLE_CAPTAIN_OID);
        System.out.println("Unassigned roles");

        Collection<RoleType> roles = listRequestableRoles(modelPort);
        System.out.println("Found " + roles.size() + " requestable roles");
        // System.out.println(roles);

        String seaSuperuserRole = createRoleFromSystemResource(modelPort, "role-sea-superuser.xml");
        System.out.println("Created role Sea Superuser, OID: " + seaSuperuserRole);

        assignRoles(modelPort, userLeChuckOid, seaSuperuserRole);
        System.out.println("Assigned role Sea Superuser to LeChuck");

        modifyRoleModifyInducement(modelPort, seaSuperuserRole);
        System.out.println("Modified role Sea Superuser - modified resource inducement");

        modifyRoleReplaceInducement(modelPort, seaSuperuserRole, 2, ROLE_CAPTAIN_OID);
        System.out.println("Modified role Sea Superuser - changed role inducement");

        reconcileUser(modelPort, userLeChuckOid);
        System.out.println("LeChuck reconciled.");

        // Comment out the following lines if you want to see what midPoint really did
        // ... because deleting the user will delete also all the traces (except logs and audit, of course).
        deleteUser(modelPort, userGuybrushoid);
        deleteUser(modelPort, userLeChuckOid);
        deleteRole(modelPort, seaSuperuserRole);
        System.out.println("Deleted user(s)");

    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
}
From source file:br.com.autonomiccs.cloudTraces.main.CloudTracesSimulator.java
public static void main(String[] args) {
    validateInputFile(args);

    String cloudTracesFile = args[0];
    Collection<VirtualMachine> virtualMachines = getAllVirtualMachinesFromCloudTraces(cloudTracesFile);
    logger.info(String.format("#VirtualMachines [%d] found on [%s].", virtualMachines.size(),
            cloudTracesFile));

    Map<Integer, List<VirtualMachine>> mapVirtualMachinesTaskExecutionByTime = createMapVirtualMachinesTaskExecutionByTime(
            virtualMachines);
    logger.info(String.format("#Times [%d] that have tasks being executed by VMs ",
            mapVirtualMachinesTaskExecutionByTime.size()));

    Cloud cloud = createCloudEnvirtonmentToStartsimulation();
    logger.info("Cloud configuration: " + cloud);

    List<Integer> timesToExecuteTasks = new ArrayList<>(mapVirtualMachinesTaskExecutionByTime.keySet());
    Collections.sort(timesToExecuteTasks);

    Integer firstTimeInTimeUnitOfUsedCloudData = timesToExecuteTasks.get(0);
    Integer lastTimeInTimeUnitOfUserCloudData = timesToExecuteTasks.get(timesToExecuteTasks.size() - 1);
    logger.info("First time: " + firstTimeInTimeUnitOfUsedCloudData);
    logger.info("Last time: " + lastTimeInTimeUnitOfUserCloudData);

    double timeUnitPerLoopIteration = getTimeUnitPerLoopIteration(firstTimeInTimeUnitOfUsedCloudData,
            lastTimeInTimeUnitOfUserCloudData);
    logger.info("The time unit converted to trace time: " + timeUnitPerLoopIteration);

    double currentTime = firstTimeInTimeUnitOfUsedCloudData;
    long highetResourceAllocation = Long.MIN_VALUE;
    String cloudStateHighestMemoryAllocation = "";
    while (currentTime < lastTimeInTimeUnitOfUserCloudData + 2 * timeUnitPerLoopIteration) {
        logger.debug("Current time of iteration: " + currentTime);
        // remember the cloud state at the peak of memory allocation
        if (cloud.getMemoryAllocatedInBytes() > highetResourceAllocation) {
            highetResourceAllocation = cloud.getMemoryAllocatedInBytes();
            cloudStateHighestMemoryAllocation = cloud.toString();
        }
        applyLoadOnCloudForCurrentTime(mapVirtualMachinesTaskExecutionByTime, cloud, currentTime);
        destroyVirtualMachinesIfNeeded(cloud, currentTime);
        logger.info(String.format("Time [%.3f], cloud state [%s] ", currentTime, cloud));
        executeManagement(cloud, currentTime);
        logClustersConfigurationsAndStdAtTime(cloud.getClusters(), currentTime);
        currentTime += timeUnitPerLoopIteration;
    }
    logger.info("Cloud configuration after simulation: " + cloud);
    logger.info("Cloud highestResourceUsage: " + cloudStateHighestMemoryAllocation);
}
From source file:edu.umn.msi.tropix.proteomics.tools.DTAToMzXML.java
public static void main(final String[] args) throws Exception {
    if (args.length < 1) {
        usage();
        System.exit(0);
    }
    Collection<File> files = null;
    if (args[0].equals("-files")) {
        if (args.length < 2) {
            out.println("No files specified.");
            usage();
            exit(-1);
        } else {
            files = new ArrayList<File>(args.length - 1);
            for (int i = 1; i < args.length; i++) {
                files.add(new File(args[i]));
            }
        }
    } else if (args[0].equals("-directory")) {
        File directory;
        if (args.length < 2) {
            directory = new File(System.getProperty("user.dir"));
        } else {
            directory = new File(args[1]);
        }
        files = FileUtilsFactory.getInstance().listFiles(directory, new String[] { "dta" }, false);
    } else {
        usage();
        exit(-1);
    }

    final InMemoryDTAListImpl dtaList = new InMemoryDTAListImpl();
    File firstFile = null;
    if (files.size() == 0) {
        out.println("No files found.");
        exit(-1);
    } else {
        firstFile = files.iterator().next();
    }
    for (final File file : files) {
        dtaList.add(FileUtils.readFileToByteArray(file), file.getName());
    }
    final DTAToMzXMLConverter dtaToMzXMLConverter = new DTAToMzXMLConverterImpl();
    final MzXML mzxml = dtaToMzXMLConverter.dtaToMzXML(dtaList, null);
    // name the output after the first input file, up to its first dot
    final String mzxmlName = firstFile.getName().substring(0, firstFile.getName().indexOf(".")) + ".mzXML";
    new MzXMLUtility().serialize(mzxml, mzxmlName);
}
From source file:de.unileipzig.ub.indexer.App.java
public static void main(String[] args) throws IOException {

    // create Options object
    Options options = new Options();

    options.addOption("h", "help", false, "display this help");
    options.addOption("f", "filename", true, "name of the JSON file whose content should be indexed");
    options.addOption("i", "index", true, "the name of the target index");
    options.addOption("d", "doctype", true, "the name of the doctype (title, local, ...)");
    options.addOption("t", "host", true, "elasticsearch hostname (default: 0.0.0.0)");
    options.addOption("p", "port", true, "transport port (that's NOT the http port, default: 9300)");
    options.addOption("c", "cluster", true, "cluster name (default: elasticsearch_mdma)");
    options.addOption("b", "bulksize", true, "number of docs sent in one request (default: 3000)");
    options.addOption("v", "verbose", false, "show processing speed while indexing");
    options.addOption("s", "status", false, "only show status of index for file");
    options.addOption("r", "repair", false, "attempt to repair recoverable inconsistencies on the go");
    options.addOption("e", "debug", false, "set logging level to debug");
    options.addOption("l", "logfile", true, "logfile - if not specified only log to stdout");
    options.addOption("m", "memcached", true, "host and port of memcached (default: localhost:11211)");
    options.addOption("z", "latest-flag-on", true,
            "enable latest flag according to field (within content, e.g. 001)");
    options.addOption("a", "flat", false, "flat-mode: do not check for inconsistencies");

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException ex) {
        logger.error(ex);
        System.exit(1);
    }

    // setup logging
    Properties systemProperties = System.getProperties();
    systemProperties.put("net.spy.log.LoggerImpl", "net.spy.memcached.compat.log.Log4JLogger");
    System.setProperties(systemProperties);
    Logger.getLogger("net.spy.memcached").setLevel(Level.ERROR);

    Properties props = new Properties();
    props.load(props.getClass().getResourceAsStream("/log4j.properties"));

    if (cmd.hasOption("debug")) {
        props.setProperty("log4j.logger.de.unileipzig", "DEBUG");
    }

    if (cmd.hasOption("logfile")) {
        props.setProperty("log4j.rootLogger", "INFO, stdout, F");
        props.setProperty("log4j.appender.F", "org.apache.log4j.FileAppender");
        props.setProperty("log4j.appender.F.File", cmd.getOptionValue("logfile"));
        props.setProperty("log4j.appender.F.layout", "org.apache.log4j.PatternLayout");
        props.setProperty("log4j.appender.F.layout.ConversionPattern", "%5p | %d | %F | %L | %m%n");
    }

    PropertyConfigurator.configure(props);

    InetAddress addr = InetAddress.getLocalHost();
    String memcachedHostAndPort = addr.getHostAddress() + ":11211";
    if (cmd.hasOption("m")) {
        memcachedHostAndPort = cmd.getOptionValue("m");
    }

    // setup caching (note: the client below connects to 0.0.0.0:11211 regardless
    // of memcachedHostAndPort)
    try {
        if (memcachedClient == null) {
            memcachedClient = new MemcachedClient(
                    new ConnectionFactoryBuilder().setFailureMode(FailureMode.Cancel).build(),
                    AddrUtil.getAddresses("0.0.0.0:11211"));
            try {
                // give client and server a moment (300ms) to connect
                Thread.sleep(300);
            } catch (InterruptedException ex) {
            }

            Collection availableServers = memcachedClient.getAvailableServers();
            logger.info(availableServers);
            if (availableServers.size() == 0) {
                logger.info("no memcached servers found");
                memcachedClient.shutdown();
                memcachedClient = null;
            } else {
                logger.info(availableServers.size() + " memcached server(s) detected, fine.");
            }
        }
    } catch (IOException ex) {
        logger.warn("couldn't create a connection, bailing out: " + ex.getMessage());
    }

    // process options
    if (cmd.hasOption("h")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("indexer", options, true);
        quit(0);
    }

    boolean verbose = false;
    if (cmd.hasOption("verbose")) {
        verbose = true;
    }

    // ES options
    String[] hosts = new String[] { "0.0.0.0" };
    int port = 9300;
    String clusterName = "elasticsearch_mdma";
    int bulkSize = 3000;

    if (cmd.hasOption("host")) {
        hosts = cmd.getOptionValues("host");
    }
    if (cmd.hasOption("port")) {
        port = Integer.parseInt(cmd.getOptionValue("port"));
    }
    if (cmd.hasOption("cluster")) {
        clusterName = cmd.getOptionValue("cluster");
    }
    if (cmd.hasOption("bulksize")) {
        bulkSize = Integer.parseInt(cmd.getOptionValue("bulksize"));
        if (bulkSize < 1 || bulkSize > 100000) {
            logger.error("bulksize must be between 1 and 100,000");
            quit(1);
        }
    }

    // ES Client
    final Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", "elasticsearch_mdma")
            .build();
    final TransportClient client = new TransportClient(settings);
    for (String host : hosts) {
        client.addTransportAddress(new InetSocketTransportAddress(host, port));
    }

    if (cmd.hasOption("filename") && cmd.hasOption("index") && cmd.hasOption("doctype")) {

        final String filename = cmd.getOptionValue("filename");

        final File _file = new File(filename);
        if (_file.length() == 0) {
            logger.info(_file.getAbsolutePath() + " is empty, skipping");
            quit(0); // file is empty
        }

        // for flat mode: leave a stampfile beside the json to
        // indicate previous successful processing
        File directory = new File(filename).getParentFile();
        File stampfile = new File(directory, DigestUtils.shaHex(filename) + ".indexed");

        long start = System.currentTimeMillis();
        long lineCount = 0;

        final String indexName = cmd.getOptionValue("index");
        final String docType = cmd.getOptionValue("doctype");
        BulkRequestBuilder bulkRequest = client.prepareBulk();

        try {
            if (cmd.hasOption("flat")) {
                // flat mode
                if (stampfile.exists()) {
                    logger.info("SKIPPING, since it seems this file has already been imported (found: "
                            + stampfile.getAbsolutePath() + ")");
                    quit(0);
                }
            } else {
                final String srcSHA1 = extractSrcSHA1(filename);
                logger.debug(filename + " srcsha1: " + srcSHA1);

                long docsInIndex = getIndexedRecordCount(client, indexName, srcSHA1);
                logger.debug(filename + " indexed: " + docsInIndex);

                long docsInFile = getLineCount(filename);
                logger.debug(filename + " lines: " + docsInFile);

                // in non-flat-mode, indexing would take care of inconsistencies
                if (docsInIndex == docsInFile) {
                    logger.info("UP-TO-DATE: " + filename + " (" + docsInIndex + ", " + srcSHA1 + ")");
                    client.close();
                    quit(0);
                }

                if (docsInIndex > 0) {
                    logger.warn("INCONSISTENCY DETECTED: " + filename + ": indexed:" + docsInIndex
                            + " lines:" + docsInFile);

                    if (!cmd.hasOption("r")) {
                        logger.warn("Please re-run indexer with --repair flag or delete residues first with: "
                                + "$ curl -XDELETE " + hosts[0] + ":9200/" + indexName
                                + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1 + "\" }}'");
                        client.close();
                        quit(1);
                    } else {
                        logger.info("Attempting to clear residues...");
                        // attempt to repair once
                        DeleteByQueryResponse dbqr = client.prepareDeleteByQuery(indexName)
                                .setQuery(termQuery("meta.srcsha1", srcSHA1)).execute().actionGet();

                        Iterator<IndexDeleteByQueryResponse> it = dbqr.iterator();
                        long deletions = 0;
                        while (it.hasNext()) {
                            IndexDeleteByQueryResponse response = it.next();
                            deletions += 1;
                        }

                        logger.info("Deleted residues of " + filename);
                        logger.info("Refreshing [" + indexName + "]");
                        RefreshResponse refreshResponse = client.admin().indices()
                                .refresh(new RefreshRequest(indexName)).actionGet();

                        long indexedAfterDelete = getIndexedRecordCount(client, indexName, srcSHA1);
                        logger.info(indexedAfterDelete + " docs remained");
                        if (indexedAfterDelete > 0) {
                            logger.warn("Not all residues cleaned. Try to fix this manually: $ curl -XDELETE "
                                    + hosts[0] + ":9200/" + indexName
                                    + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1
                                    + "\" }}'");
                            quit(1);
                        } else {
                            logger.info("Residues are gone. Now trying to reindex: " + filename);
                        }
                    }
                }
            }

            logger.info("INDEXING-REQUIRED: " + filename);
            if (cmd.hasOption("status")) {
                quit(0);
            }

            HashSet idsInBatch = new HashSet();
            String idField = null;
            if (cmd.hasOption("z")) {
                idField = cmd.getOptionValue("z");
            }

            final FileReader fr = new FileReader(filename);
            final BufferedReader br = new BufferedReader(fr);

            String line;
            // one line is one document
            while ((line = br.readLine()) != null) {

                // "Latest-Flag" machine
                // This gets obsolete with a "flat" index
                if (cmd.hasOption("z")) {
                    // flag that indicates whether the document
                    // about to be indexed will be the latest
                    boolean willBeLatest = true;

                    // check if there is a previous (lower meta.timestamp) document with
                    // the same identifier (whatever that may be - queried under "content")
                    final String contentIdentifier = getContentIdentifier(line, idField);
                    idsInBatch.add(contentIdentifier);

                    // assumed in meta.timestamp
                    final Long timestamp = Long.parseLong(getTimestamp(line));

                    logger.debug("Checking whether record is latest (line: " + lineCount + ")");
                    logger.debug(contentIdentifier + ", " + timestamp);

                    // get all docs which match the contentIdentifier
                    // by filter, which doesn't score
                    final TermFilterBuilder idFilter = new TermFilterBuilder("content." + idField,
                            contentIdentifier);
                    final TermFilterBuilder kindFilter = new TermFilterBuilder("meta.kind", docType);
                    final AndFilterBuilder afb = new AndFilterBuilder();
                    afb.add(idFilter).add(kindFilter);
                    final FilteredQueryBuilder fb = filteredQuery(matchAllQuery(), afb);

                    final SearchResponse searchResponse = client.prepareSearch(indexName)
                            .setSearchType(SearchType.DFS_QUERY_THEN_FETCH).setQuery(fb).setFrom(0)
                            .setSize(1200) // 3 years and 105 days assuming daily updates at the most
                            .setExplain(false).execute().actionGet();

                    final SearchHits searchHits = searchResponse.getHits();

                    logger.debug("docs with this id in the index: " + searchHits.getTotalHits());

                    for (final SearchHit hit : searchHits.getHits()) {
                        final String docId = hit.id();
                        final Map<String, Object> source = hit.sourceAsMap();
                        final Map meta = (Map) source.get("meta");
                        final Long docTimestamp = Long.parseLong(meta.get("timestamp").toString());

                        // if the indexed doc timestamp is lower than the current one,
                        // remove any latest flag
                        if (timestamp >= docTimestamp) {
                            source.remove("latest");
                            final ObjectMapper mapper = new ObjectMapper();
                            // put the updated doc back
                            IndexResponse response = client.prepareIndex(indexName, docType)
                                    .setCreate(false).setId(docId)
                                    .setSource(mapper.writeValueAsBytes(source))
                                    .execute(new ActionListener<IndexResponse>() {
                                        public void onResponse(IndexResponse rspns) {
                                            logger.debug("Removed latest flag from " + contentIdentifier
                                                    + ", " + docTimestamp + ", " + hit.id() + " since ("
                                                    + timestamp + " > " + docTimestamp + ")");
                                        }

                                        public void onFailure(Throwable thrwbl) {
                                            logger.error("Could not remove flag from " + hit.id() + ", "
                                                    + contentIdentifier);
                                        }
                                    });
                        } else {
                            logger.debug("Doc " + hit.id() + " is newer (" + docTimestamp + ")");
                            willBeLatest = false;
                        }
                    }

                    if (willBeLatest) {
                        line = setLatestFlag(line);
                        logger.info("Setting latest flag on " + contentIdentifier + ", " + timestamp);
                    }

                    // end of latest-flag machine
                    // beware - this will be correct as long as there
                    // are no dups within one bulk!
                }

                bulkRequest.add(client.prepareIndex(indexName, docType).setSource(line));
                lineCount++;
                logger.debug("Added line " + lineCount + " to BULK");
                logger.debug(line);

                if (lineCount % bulkSize == 0) {

                    if (idsInBatch.size() != bulkSize && cmd.hasOption("z")) {
                        logger.error("This batch has duplications in the ID. That's not bad for the index, "
                                + "just makes the latest flag fuzzy");
                        logger.error("Bulk size was: " + bulkSize + ", but " + idsInBatch.size()
                                + " IDs (only)");
                    }
                    idsInBatch.clear();

                    logger.debug("Issuing BULK request");

                    final long actionCount = bulkRequest.numberOfActions();
                    final BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                    final long tookInMillis = bulkResponse.getTookInMillis();

                    if (bulkResponse.hasFailures()) {
                        logger.fatal("FAILED, bulk not indexed. exiting now.");
                        Iterator<BulkItemResponse> it = bulkResponse.iterator();
                        while (it.hasNext()) {
                            BulkItemResponse bir = it.next();
                            if (bir.isFailed()) {
                                Failure failure = bir.getFailure();
                                logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage()
                                        + ", type: " + failure.getType() + ", index: "
                                        + failure.getIndex());
                            }
                        }
                        quit(1);
                    } else {
                        if (verbose) {
                            final double elapsed = System.currentTimeMillis() - start;
                            final double speed = (lineCount / elapsed * 1000);
                            logger.info("OK (" + filename + ") " + lineCount + " docs indexed ("
                                    + actionCount + "/" + tookInMillis + "ms" + "/"
                                    + String.format("%.2f", speed) + "r/s)");
                        }
                    }
                    bulkRequest = client.prepareBulk();
                }
            }

            // handle the remaining items
            final long actionCount = bulkRequest.numberOfActions();
            if (actionCount > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                final long tookInMillis = bulkResponse.getTookInMillis();

                if (bulkResponse.hasFailures()) {
                    logger.fatal("FAILED, bulk not indexed. exiting now.");
                    Iterator<BulkItemResponse> it = bulkResponse.iterator();
                    while (it.hasNext()) {
                        BulkItemResponse bir = it.next();
                        if (bir.isFailed()) {
                            Failure failure = bir.getFailure();
                            logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage()
                                    + ", type: " + failure.getType() + ", index: " + failure.getIndex());
                        }
                    }
                    quit(1);
                } else {
                    // trigger update now
                    RefreshResponse refreshResponse = client.admin().indices()
                            .refresh(new RefreshRequest(indexName)).actionGet();

                    if (verbose) {
                        final double elapsed = System.currentTimeMillis() - start;
                        final double speed = (lineCount / elapsed * 1000);
                        logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount
                                + "/" + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)");
                    }
                }
            }

            br.close();
            client.close();
            final double elapsed = (System.currentTimeMillis() - start) / 1000;
            final double speed = (lineCount / elapsed);
            logger.info("indexing (" + filename + ") " + lineCount + " docs took " + elapsed + "s (speed: "
                    + String.format("%.2f", speed) + "r/s)");

            if (cmd.hasOption("flat")) {
                try {
                    FileUtils.touch(stampfile);
                } catch (IOException ioe) {
                    logger.warn(".indexed files not created. Will reindex everything every time.");
                }
            }
        } catch (IOException e) {
            client.close();
            logger.error(e);
            quit(1);
        } finally {
            client.close();
        }
    }
    quit(0);
}
From source file:kindleclippings.quizlet.QuizletSync.java
public static void main(String[] args)
        throws IOException, JSONException, URISyntaxException, InterruptedException, BackingStoreException {

    ProgressMonitor progress = new ProgressMonitor(null, "QuizletSync", "loading Kindle clippings file", 0,
            100);
    progress.setMillisToPopup(0);
    progress.setMillisToDecideToPopup(0);
    progress.setProgress(0);

    try {
        Map<String, List<Clipping>> books = readClippingsFile();

        if (books == null)
            return;

        if (books.isEmpty()) {
            JOptionPane.showMessageDialog(null, "no clippings to be uploaded", "QuizletSync",
                    JOptionPane.OK_OPTION);
            return;
        }

        progress.setNote("checking Quizlet account");
        progress.setProgress(5);

        Preferences prefs = getPrefs();

        QuizletAPI api = new QuizletAPI(prefs.get("access_token", null));

        Collection<TermSet> sets = null;
        try {
            progress.setNote("checking Quizlet library");
            progress.setProgress(10);
            sets = api.getSets(prefs.get("user_id", null));
        } catch (IOException e) {
            if (e.toString().contains("401")) {
                // Not Authorized => Token has been revoked
                clearPrefs();
                prefs = getPrefs();
                api = new QuizletAPI(prefs.get("access_token", null));
                sets = api.getSets(prefs.get("user_id", null));
            } else {
                throw e;
            }
        }

        progress.setProgress(15);
        progress.setMaximum(15 + books.size());
        progress.setNote("uploading new notes");

        Map<String, TermSet> indexedSets = new HashMap<String, TermSet>(sets.size());
        for (TermSet t : sets) {
            indexedSets.put(t.getTitle(), t);
        }

        int pro = 15;
        int createdSets = 0;
        int createdTerms = 0;
        int updatedTerms = 0;
        for (List<Clipping> c : books.values()) {
            String book = c.get(0).getBook();
            progress.setNote(book);
            progress.setProgress(pro++);

            TermSet termSet = indexedSets.get(book);
            if (termSet == null) {
                if (c.size() < 2) {
                    System.err.println("ignored [" + book + "] (need at least two notes)");
                    continue;
                }
                addSet(api, book, c);
                createdSets++;
                createdTerms += c.size();
                continue;
            }
            // compare against existing terms
            for (Clipping cl : c) {
                if (!checkExistingTerm(cl, termSet)) {
                    addTerm(api, termSet, cl);
                    updatedTerms++;
                }
            }
        }
        progress.setProgress(pro++);

        if (createdSets == 0 && updatedTerms == 0) {
            JOptionPane.showMessageDialog(null, "Done.\nNo new data was uploaded", "QuizletSync",
                    JOptionPane.OK_OPTION);
        } else if (createdSets > 0) {
            JOptionPane.showMessageDialog(null,
                    String.format(
                            "Done.\nCreated %d new sets with %d cards, and added %d cards to existing sets",
                            createdSets, createdTerms, updatedTerms),
                    "QuizletSync", JOptionPane.OK_OPTION);
        } else {
            JOptionPane.showMessageDialog(null,
                    String.format("Done.\nAdded %d cards to existing sets", updatedTerms), "QuizletSync",
                    JOptionPane.OK_OPTION);
        }
    } finally {
        progress.close();
    }

    System.exit(0);
}
From source file:gov.nih.nci.ncicb.cadsr.common.persistence.dao.jdbc.JDBCAdminComponentDAO.java
public static void main(String[] args) {
    ServiceLocator locator = new SimpleServiceLocator();
    JDBCAdminComponentDAO jdbcAdminComponentDAO = new JDBCAdminComponentDAO(locator);

    /*
    int res = jdbcAdminComponentDAO.assignClassification(
        "99BA9DC8-2357-4E69-E034-080020C9C0E0",
        "29A8FB30-0AB1-11D6-A42F-0010A4C1E842"); // acId, csCsiId
    System.out.println("res = " + res);
    */

    /*
    int deleteRes = jdbcAdminComponentDAO.removeClassification("D66B85B6-4EDA-469B-E034-0003BA0B1A09");
    System.out.println("deleteRes = " + deleteRes);

    Collection csito = jdbcAdminComponentDAO.retrieveClassifications(
        "29A8FB19-0AB1-11D6-A42F-0010A4C1E842");
    */

    Collection contacts = jdbcAdminComponentDAO.getContacts("0B244855-6696-5A67-E044-0003BA8EB8F1");
    System.out.println(contacts.size());
}
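A pattern that recurs in several of the examples above is testing for emptiness with size() == 0 or size() < 1. A short sketch (not taken from any of the sources above) of the equivalent isEmpty() idiom, which states the intent directly and avoids a full element count on implementations whose size() is not O(1):

import java.util.ArrayList;
import java.util.Collection;

public class EmptyCheck {
    public static void main(String[] args) {
        Collection<String> items = new ArrayList<>();
        // the two checks below are equivalent for any Collection
        if (items.size() == 0) {
            System.out.println("empty (via size() == 0)");
        }
        if (items.isEmpty()) {
            System.out.println("empty (via isEmpty())");
        }
    }
}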