List of usage examples for java.util.Set.add
boolean add(E e);
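add returns true when the set did not already contain the element, so the return value itself can be used to detect duplicates without a separate contains check. A minimal, self-contained sketch of that behavior (the values are purely illustrative):

import java.util.HashSet;
import java.util.Set;

public class SetAddDemo {
    public static void main(String[] args) {
        Set<String> names = new HashSet<>();

        // add returns true when the element was not present yet
        System.out.println(names.add("alice")); // true
        System.out.println(names.add("bob"));   // true

        // adding a duplicate leaves the set unchanged and returns false
        System.out.println(names.add("alice")); // false
        System.out.println(names.size());       // 2
    }
}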
From source file:HSqlPrimerDesign.java
public static void main(String[] args) throws NoSuchFieldException, IllegalAccessException,
        ClassNotFoundException, InstantiationException, SQLException, FileNotFoundException {
    Class.forName(JDBC_DRIVER_HSQL).newInstance();
    conn = DriverManager.getConnection(DB_SERVER_URL, USER, PASS);
    PrintWriter log = new PrintWriter(new File("javalog.log"));
    DpalLoad.main(args);
    Dpal_Inst = DpalLoad.INSTANCE_WIN64;
    //        int[][] arr = calcHairpin("GGGGGGCCCCCCCCCCCCGGGGGGG", 4);
    //        if (arr.length <= 1) {
    //            System.out.println("No Hairpin's found");
    //        } else {
    //            System.out.println("Hairpin(s) found");
    //        }
    Statement stat = conn.createStatement();
    ResultSet call = stat.executeQuery("Select * From "
            + "Primerdb.primers where Cluster ='A1' and UniqueP =True and Bp = 20");
    Set<CharSequence> primers = new HashSet<>();
    while (call.next()) {
        primers.add(call.getString("Sequence"));
    }
    //        primers.stream().forEach(x -> {
    //            log.println(x);
    //            log.println(complementarity(x, x, Dpal_Inst));
    //            int[][] arr = calcHairpin((String) x, 4);
    //            if (arr.length <= 1) {
    //                log.println("No Hairpin's found");
    //            } else {
    //                log.println("Hairpin(s) found");
    //            }
    //            log.println();
    //            log.flush();
    //        });
}
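The example above collects the Sequence column of a JDBC ResultSet into a HashSet, so repeated sequences are stored only once. A reduced sketch of that pattern; the query and column name echo the Primerdb schema above, while the surrounding connection handling is assumed:

import java.sql.*;
import java.util.HashSet;
import java.util.Set;

public class DistinctSequences {
    // Collect one column of a query result into a Set; duplicates collapse automatically.
    static Set<String> loadSequences(Connection conn) throws SQLException {
        Set<String> primers = new HashSet<>();
        try (Statement stat = conn.createStatement();
             ResultSet rs = stat.executeQuery("Select Sequence From Primerdb.primers")) {
            while (rs.next()) {
                primers.add(rs.getString("Sequence")); // add ignores sequences already seen
            }
        }
        return primers;
    }
}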
From source file:de.jetwick.snacktory.HtmlFetcher.java
public static void main(String[] args) throws Exception {
    BufferedReader reader = new BufferedReader(new FileReader("urls.txt"));
    String line = null;
    Set<String> existing = new LinkedHashSet<String>();
    while ((line = reader.readLine()) != null) {
        int index1 = line.indexOf("\"");
        int index2 = line.indexOf("\"", index1 + 1);
        String url = line.substring(index1 + 1, index2);
        String domainStr = SHelper.extractDomain(url, true);
        String counterStr = "";
        // TODO more similarities
        if (existing.contains(domainStr))
            counterStr = "2";
        else
            existing.add(domainStr);
        String html = new HtmlFetcher().fetchAsString(url, 20000);
        String outFile = domainStr + counterStr + ".html";
        BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
        writer.write(html);
        writer.close();
    }
    reader.close();
}
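Here a LinkedHashSet remembers which domains have already produced an output file, so a second URL from the same domain gets a numbered file name. A trimmed sketch of just the contains/add bookkeeping, with the fetching and file writing left out and the domains hard-coded:

import java.util.LinkedHashSet;
import java.util.Set;

public class DomainFileNames {
    public static void main(String[] args) {
        Set<String> existing = new LinkedHashSet<>();
        String[] domains = {"example.com", "foo.org", "example.com"};

        for (String domain : domains) {
            // Suffix "2" when the domain was seen before, otherwise record it.
            String counter = existing.contains(domain) ? "2" : "";
            if (counter.isEmpty()) {
                existing.add(domain);
            }
            System.out.println(domain + counter + ".html");
        }
    }
}

Since add itself reports whether the element was new, the contains/add pair could also be collapsed into a single check such as if (!existing.add(domain)) counter = "2";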
From source file:Main.java
public static void main(String[] args) throws Exception {
    List<Car> carsList = new ArrayList<Car>();
    Set<Car> carsset = new HashSet<Car>();
    File fXmlFile = new File("cars.xml");
    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
    Document doc = dBuilder.parse(fXmlFile);
    doc.getDocumentElement().normalize();
    NodeList nList = doc.getElementsByTagName("cars");
    Car tempCar = null;
    for (int temp = 0; temp < nList.getLength(); temp++) {
        Node nNode = nList.item(temp);
        if (nNode.getNodeType() == Node.ELEMENT_NODE) {
            tempCar = new Car();
            Element eElement = (Element) nNode;
            tempCar.setModel(eElement.getElementsByTagName("model").item(0).getTextContent());
            tempCar.setVersion(eElement.getElementsByTagName("version").item(0).getTextContent());
            carsList.add(tempCar);
            carsset.add(tempCar);
        }
    }
    for (Car cs : carsset) {
        int count = 0;
        for (Car cl : carsList) {
            if (cs.equals(cl)) {
                count = count + 1;
            }
        }
        System.out.println(cs + "\t" + count);
    }
}
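The cars example adds every parsed Car to both a List (all occurrences) and a Set (distinct cars), then counts how often each distinct car appears; for the Set to deduplicate, Car must override equals and hashCode. A compact sketch of the same counting idea with plain strings instead of the Car class:

import java.util.*;

public class DistinctCounts {
    public static void main(String[] args) {
        List<String> all = Arrays.asList("golf", "polo", "golf", "golf");
        Set<String> distinct = new HashSet<>(all); // the constructor calls add for each element

        for (String model : distinct) {
            long count = all.stream().filter(model::equals).count();
            System.out.println(model + "\t" + count);
        }
    }
}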
From source file:com.act.lcms.db.analysis.IonDetectionAnalysis.java
public static void main(String[] args) throws Exception { Options opts = new Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build());/*from www . j a va 2 s . com*/ } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { System.err.format("Argument parsing failed: %s", e.getMessage()); HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } if (cl.hasOption("help")) { HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); return; } File lcmsDir = new File(cl.getOptionValue(OPTION_LCMS_FILE_DIRECTORY)); if (!lcmsDir.isDirectory()) { System.err.format("File at %s is not a directory", lcmsDir.getAbsolutePath()); HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } String plottingDirectory = cl.getOptionValue(OPTION_PLOTTING_DIR); // Get include and excluse ions from command line Set<String> includeIons; if (cl.hasOption(OPTION_INCLUDE_IONS)) { includeIons = new HashSet<>(Arrays.asList(cl.getOptionValues(OPTION_INCLUDE_IONS))); LOGGER.info("Including ions in search: %s", StringUtils.join(includeIons, ", ")); } else { includeIons = new HashSet<>(); includeIons.add(DEFAULT_ION); } try (DB db = DB.openDBFromCLI(cl)) { ScanFile.insertOrUpdateScanFilesInDirectory(db, lcmsDir); File inputPredictionCorpus = new File(cl.getOptionValue(OPTION_INPUT_PREDICTION_CORPUS)); Map<Double, Set<Pair<String, String>>> massChargeToChemicalAndIon = constructMassChargeToChemicalIonsFromInputFile( inputPredictionCorpus, includeIons, cl.hasOption(OPTION_LIST_OF_INCHIS_INPUT_FILE)); Pair<Set<Pair<String, Double>>, Map<String, Double>> values = constructFakeNameToMassChargeAndSetOfMassChargePairs( massChargeToChemicalAndIon.keySet()); Set<Pair<String, Double>> searchMZs = values.getLeft(); Map<String, Double> chemIDToMassCharge = values.getRight(); LOGGER.info("The number of mass charges are: %d", searchMZs.size()); Map<ScanData.KIND, List<LCMSWell>> wellTypeToLCMSWells = readInputExperimentalSetup(db, new File(cl.getOptionValue(OPTION_INPUT_POSITIVE_AND_NEGATIVE_CONTROL_WELLS_FILE))); // Get experimental setup ie. positive and negative wells from config file List<LCMSWell> positiveWells = wellTypeToLCMSWells.get(ScanData.KIND.POS_SAMPLE); List<LCMSWell> negativeWells = wellTypeToLCMSWells.get(ScanData.KIND.NEG_CONTROL); LOGGER.info("Number of positive wells is: %d", positiveWells.size()); LOGGER.info("Number of negative wells is: %d", negativeWells.size()); HashMap<Integer, Plate> plateCache = new HashMap<>(); String outputPrefix = cl.getOptionValue(OPTION_OUTPUT_PREFIX); IonDetectionAnalysis<LCMSWell> ionDetectionAnalysis = new IonDetectionAnalysis<LCMSWell>(lcmsDir, positiveWells, negativeWells, plottingDirectory, plateCache, searchMZs, db); ionDetectionAnalysis.runLCMSMiningAnalysisAndPlotResults(chemIDToMassCharge, massChargeToChemicalAndIon, outputPrefix, cl.hasOption(OPTION_NON_REPLICATE_ANALYSIS)); } }
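One recurring use of add in the analysis above is supplying a default: when no ions are passed on the command line, a single default ion is added to an otherwise empty set. A small sketch of that option-handling pattern; the DEFAULT_ION value and the argument shape are stand-ins, not taken from the original project:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class IonOptions {
    static final String DEFAULT_ION = "M+H"; // illustrative default, not the project's constant

    // Build the ion set from CLI values, falling back to a single default entry.
    static Set<String> includeIons(String[] cliValues) {
        Set<String> ions = new HashSet<>();
        if (cliValues != null && cliValues.length > 0) {
            ions.addAll(Arrays.asList(cliValues));
        } else {
            ions.add(DEFAULT_ION);
        }
        return ions;
    }

    public static void main(String[] args) {
        System.out.println(includeIons(args));
    }
}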
From source file:com.lightboxtechnologies.spectrum.SequenceFileExport.java
public static void main(String[] args) throws Exception { final Configuration conf = new Configuration(); final String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); String imageID;/*from w ww . j a v a2s . c o m*/ String outpath; String friendlyname; final Set<String> exts = new HashSet<String>(); if ("-f".equals(otherArgs[0])) { if (otherArgs.length != 4) { die(); } // load extensions from file final Path extpath = new Path(otherArgs[1]); InputStream in = null; try { in = extpath.getFileSystem(conf).open(extpath); Reader r = null; try { r = new InputStreamReader(in); BufferedReader br = null; try { br = new BufferedReader(r); String line; while ((line = br.readLine()) != null) { exts.add(line.trim().toLowerCase()); } br.close(); } finally { IOUtils.closeQuietly(br); } r.close(); } finally { IOUtils.closeQuietly(r); } in.close(); } finally { IOUtils.closeQuietly(in); } imageID = otherArgs[2]; friendlyname = otherArgs[3]; outpath = otherArgs[4]; } else { if (otherArgs.length < 3) { die(); } // read extensions from trailing args imageID = otherArgs[0]; friendlyname = otherArgs[1]; outpath = otherArgs[2]; // lowercase all file extensions for (int i = 2; i < otherArgs.length; ++i) { exts.add(otherArgs[i].toLowerCase()); } } conf.setStrings("extensions", exts.toArray(new String[exts.size()])); final Job job = SKJobFactory.createJobFromConf(imageID, friendlyname, "SequenceFileExport", conf); job.setJarByClass(SequenceFileExport.class); job.setMapperClass(SequenceFileExportMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(MapWritable.class); job.setInputFormatClass(FsEntryHBaseInputFormat.class); FsEntryHBaseInputFormat.setupJob(job, imageID); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); FileOutputFormat.setOutputPath(job, new Path(outpath)); System.exit(job.waitForCompletion(true) ? 0 : 1); }
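In the Hadoop export job, file extensions are normalized to lower case as they are added, whether they come from a file or from trailing arguments, and the resulting set is then flattened into a String[] for the job configuration. A minimal sketch of those collect, normalize, and export steps without the Hadoop wiring; the sample extensions are invented:

import java.util.HashSet;
import java.util.Set;

public class ExtensionSet {
    public static void main(String[] args) {
        Set<String> exts = new HashSet<>();
        String[] rawArgs = {"JPG", "png", "jpg", " gif "};

        for (String ext : rawArgs) {
            exts.add(ext.trim().toLowerCase()); // normalization makes "JPG" and "jpg" collide
        }

        // Counterpart of conf.setStrings("extensions", ...): export the set as an array.
        String[] asArray = exts.toArray(new String[0]);
        System.out.println(asArray.length + " distinct extensions");
    }
}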
From source file:gr.iti.mklab.bubing.parser.ITIHTMLParser.java
public static void main(String arg[]) throws IllegalArgumentException, IOException, URISyntaxException, JSAPException, NoSuchAlgorithmException { final SimpleJSAP jsap = new SimpleJSAP(ITIHTMLParser.class.getName(), "Produce the digest of a page: the page is downloaded or passed as argument by specifying a file", new Parameter[] { new UnflaggedOption("url", JSAP.STRING_PARSER, JSAP.REQUIRED, "The url of the page."), new Switch("crossAuthorityDuplicates", 'c', "cross-authority-duplicates"), new FlaggedOption("charBufferSize", JSAP.INTSIZE_PARSER, Integer.toString(CHAR_BUFFER_SIZE), JSAP.NOT_REQUIRED, 'b', "buffer", "The size of the parser character buffer (0 for dynamic sizing)."), new FlaggedOption("file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "file", "The page to be processed."), new FlaggedOption("digester", JSAP.STRING_PARSER, "MD5", JSAP.NOT_REQUIRED, 'd', "digester", "The digester to be used.") }); JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) System.exit(1);/*from w w w. j a v a 2 s .c o m*/ final String url = jsapResult.getString("url"); final String digester = jsapResult.getString("digester"); final boolean crossAuthorityDuplicates = jsapResult.userSpecified("crossAuthorityDuplicates"); final int charBufferSize = jsapResult.getInt("charBufferSize"); final ITIHTMLParser<Void> htmlParser = new ITIHTMLParser<Void>(BinaryParser.forName(digester), (TextProcessor<Void>) null, crossAuthorityDuplicates, charBufferSize); final SetLinkReceiver linkReceiver = new SetLinkReceiver(); final byte[] digest; if (!jsapResult.userSpecified("file")) { final URI uri = new URI(url); final HttpGet request = new HttpGet(uri); request.setConfig(RequestConfig.custom().setRedirectsEnabled(false).build()); digest = htmlParser.parse(uri, HttpClients.createDefault().execute(request), linkReceiver); } else { final String file = jsapResult.getString("file"); String content = IOUtils.toString(new InputStreamReader(new FileInputStream(file))); digest = htmlParser.parse(BURL.parse(url), new StringHttpMessages.HttpResponse(content), linkReceiver); } System.out.println("DigestHexString: " + Hex.encodeHexString(digest)); System.out.println("Links: " + linkReceiver.urls); Set<String> urlStrings = new ObjectOpenHashSet<String>(); for (URI link : linkReceiver.urls) urlStrings.add(link.toString()); if (urlStrings.size() != linkReceiver.urls.size()) System.out.println( "There are " + linkReceiver.urls.size() + " URIs but " + urlStrings.size() + " strings"); }
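The parser example converts each discovered URI to a string and adds it to a set, then compares the set size with the original link count to spot links that are distinct objects but collapse to the same string. The original uses an ObjectOpenHashSet (fastutil); a sketch with a plain java.util.HashSet shows the same size comparison, with made-up URIs:

import java.net.URI;
import java.util.*;

public class LinkStringCheck {
    public static void main(String[] args) throws Exception {
        Set<URI> links = new HashSet<>(Arrays.asList(
                new URI("http://example.com/a"),
                new URI("http://example.com/b")));

        Set<String> urlStrings = new HashSet<>();
        for (URI link : links) {
            urlStrings.add(link.toString()); // string duplicates are silently dropped
        }

        if (urlStrings.size() != links.size()) {
            System.out.println("There are " + links.size() + " URIs but "
                    + urlStrings.size() + " strings");
        }
    }
}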
From source file:edu.msu.cme.rdp.kmer.cli.FastKmerFilter.java
public static void main(String[] args) throws Exception { final KmerSet<Set<RefKmer>> kmerSet; final SeqReader queryReader; final SequenceType querySeqType; final File queryFile; final KmerStartsWriter out; final boolean translQuery; final int wordSize; final int translTable; final boolean alignedSeqs; final List<String> refLabels = new ArrayList(); final int maxThreads; final int trieWordSize; try {/* www .jav a 2 s. co m*/ CommandLine cmdLine = new PosixParser().parse(options, args); args = cmdLine.getArgs(); if (args.length < 3) { throw new Exception("Unexpected number of arguments"); } if (cmdLine.hasOption("out")) { out = new KmerStartsWriter(cmdLine.getOptionValue("out")); } else { out = new KmerStartsWriter(System.out); } if (cmdLine.hasOption("aligned")) { alignedSeqs = true; } else { alignedSeqs = false; } if (cmdLine.hasOption("transl-table")) { translTable = Integer.valueOf(cmdLine.getOptionValue("transl-table")); } else { translTable = 11; } if (cmdLine.hasOption("threads")) { maxThreads = Integer.valueOf(cmdLine.getOptionValue("threads")); } else { maxThreads = Runtime.getRuntime().availableProcessors(); } queryFile = new File(args[1]); wordSize = Integer.valueOf(args[0]); SequenceType refSeqType = null; querySeqType = SeqUtils.guessSequenceType(queryFile); queryReader = new SequenceReader(queryFile); if (querySeqType == SequenceType.Protein) { throw new Exception("Expected nucl query sequences"); } refSeqType = SeqUtils .guessSequenceType(new File(args[2].contains("=") ? args[2].split("=")[1] : args[2])); translQuery = refSeqType == SequenceType.Protein; if (translQuery && wordSize % 3 != 0) { throw new Exception("Word size must be a multiple of 3 for nucl ref seqs"); } if (translQuery) { trieWordSize = wordSize / 3; } else { trieWordSize = wordSize; } kmerSet = new KmerSet<Set<RefKmer>>();//new KmerTrie(trieWordSize, translQuery); for (int index = 2; index < args.length; index++) { String refName; String refFileName = args[index]; if (refFileName.contains("=")) { String[] lexemes = refFileName.split("="); refName = lexemes[0]; refFileName = lexemes[1]; } else { String tmpName = new File(refFileName).getName(); if (tmpName.contains(".")) { refName = tmpName.substring(0, tmpName.lastIndexOf(".")); } else { refName = tmpName; } } File refFile = new File(refFileName); if (refSeqType != SeqUtils.guessSequenceType(refFile)) { throw new Exception( "Reference file " + refFile + " contains " + SeqUtils.guessFileFormat(refFile) + " sequences but expected " + refSeqType + " sequences"); } SequenceReader seqReader = new SequenceReader(refFile); Sequence seq; while ((seq = seqReader.readNextSequence()) != null) { if (seq.getSeqName().startsWith("#")) { continue; } KmerGenerator kmers; try { if (translQuery) { //protein ref kmers = new ProtKmerGenerator(seq.getSeqString(), trieWordSize, alignedSeqs); } else { kmers = new NuclKmerGenerator(seq.getSeqString(), trieWordSize, alignedSeqs); } while (kmers.hasNext()) { Kmer temp = kmers.next(); long[] next = temp.getLongKmers(); Set<RefKmer> refKmers = kmerSet.get(next); if (refKmers == null) { refKmers = new HashSet(); kmerSet.add(next, refKmers); } RefKmer kmerRef = new RefKmer(); kmerRef.modelPos = kmers.getPosition(); kmerRef.refFileIndex = refLabels.size(); kmerRef.refSeqid = seq.getSeqName(); refKmers.add(kmerRef); } } catch (IllegalArgumentException ex) { //System.err.println(seq.getSeqName()+ " " + ex.getMessage()); } } seqReader.close(); refLabels.add(refName); } } catch (Exception e) { new HelpFormatter().printHelp( "KmerSearch 
<kmerSize> <query_file> [name=]<ref_file> ...\nkmerSize should be multiple of 3, (recommend 45, minimum 30, maximum 63) ", options); e.printStackTrace(); System.exit(1); throw new RuntimeException("Stupid jvm"); //While this will never get thrown it is required to make sure javac doesn't get confused about uninitialized variables } long startTime = System.currentTimeMillis(); long seqCount = 0; final int maxTasks = 25000; System.err.println("Starting kmer mapping at " + new Date()); System.err.println("* Number of threads: " + maxThreads); System.err.println("* References: " + refLabels); System.err.println("* Reads file: " + queryFile); System.err.println("* Kmer length: " + trieWordSize); System.err.println("* Kmer Refset Size: " + kmerSet.size()); final AtomicInteger processed = new AtomicInteger(); final AtomicInteger outstandingTasks = new AtomicInteger(); ExecutorService service = Executors.newFixedThreadPool(maxThreads); Sequence querySeq; while ((querySeq = queryReader.readNextSequence()) != null) { seqCount++; String seqString = querySeq.getSeqString(); if ((!translQuery && seqString.length() < wordSize) || (translQuery && seqString.length() < wordSize + 2)) { //System.err.println(querySeq.getSeqName() + "\t" + seqString.length()); continue; } final Sequence threadSeq = querySeq; Runnable r = new Runnable() { public void run() { try { processSeq(threadSeq, refLabels, kmerSet, out, wordSize, translQuery, translTable, false); processSeq(threadSeq, refLabels, kmerSet, out, wordSize, translQuery, translTable, true); processed.incrementAndGet(); outstandingTasks.decrementAndGet(); } catch (Exception e) { e.printStackTrace(); } } }; outstandingTasks.incrementAndGet(); service.submit(r); while (outstandingTasks.get() >= maxTasks) ; if ((processed.get() + 1) % 1000000 == 0) { System.err.println("Processed " + processed + " sequences in " + (System.currentTimeMillis() - startTime) + " ms"); } } service.shutdown(); service.awaitTermination(1, TimeUnit.DAYS); System.err.println("Finished Processed " + processed + " sequences in " + (System.currentTimeMillis() - startTime) + " ms"); out.close(); }
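The kmer indexing loop above uses a get-or-create pattern: look up the set of reference kmers for a key, create and register an empty HashSet if none exists, then add the new RefKmer to it. KmerSet and RefKmer are project-specific types; a sketch of the same pattern with a standard HashMap and String keys (the kmer/sequence pairs are invented):

import java.util.*;

public class GetOrCreate {
    public static void main(String[] args) {
        Map<String, Set<String>> index = new HashMap<>();
        String[][] hits = {{"ATG", "seq1"}, {"ATG", "seq2"}, {"GGC", "seq1"}};

        for (String[] hit : hits) {
            // Fetch the bucket for this kmer, creating and registering it on first sight.
            Set<String> bucket = index.get(hit[0]);
            if (bucket == null) {
                bucket = new HashSet<>();
                index.put(hit[0], bucket);
            }
            bucket.add(hit[1]); // duplicate sequence ids per kmer are collapsed by the set
        }
        System.out.println(index); // e.g. {ATG=[seq1, seq2], GGC=[seq1]}
    }
}

On Java 8 and later the null check can be replaced with index.computeIfAbsent(key, k -> new HashSet<>()).add(value).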
From source file:com.act.analysis.surfactant.AnalysisDriver.java
public static void main(String[] args) throws Exception { Options opts = new Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build());/* ww w .j ava 2 s .c o m*/ } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { System.err.format("Argument parsing failed: %s\n", e.getMessage()); HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } if (cl.hasOption("help")) { HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); return; } Set<String> seenOutputIds = new HashSet<>(); TSVWriter<String, String> tsvWriter = null; if (cl.hasOption(OPTION_OUTPUT_FILE)) { File outputFile = new File(cl.getOptionValue(OPTION_OUTPUT_FILE)); List<Map<String, String>> oldResults = null; if (outputFile.exists()) { System.err.format( "Output file already exists, reading old results and skipping processed molecules.\n"); TSVParser outputParser = new TSVParser(); outputParser.parse(outputFile); oldResults = outputParser.getResults(); for (Map<String, String> row : oldResults) { // TODO: verify that the last row was written cleanly/completely. seenOutputIds.add(row.get("id")); } } List<String> header = new ArrayList<>(); header.add("name"); header.add("id"); header.add("inchi"); header.add("label"); for (SurfactantAnalysis.FEATURES f : SurfactantAnalysis.FEATURES.values()) { header.add(f.toString()); } // TODO: make this API more auto-closable friendly. tsvWriter = new TSVWriter<>(header); tsvWriter.open(outputFile); if (oldResults != null) { System.out.format("Re-writing %d existing result rows\n", oldResults.size()); tsvWriter.append(oldResults); } } try { Map<SurfactantAnalysis.FEATURES, Double> analysisFeatures; LicenseManager.setLicenseFile(cl.getOptionValue(OPTION_LICENSE_FILE)); if (cl.hasOption(OPTION_INCHI)) { analysisFeatures = SurfactantAnalysis.performAnalysis(cl.getOptionValue(OPTION_INCHI), cl.hasOption(OPTION_DISPLAY)); Map<String, String> tsvFeatures = new HashMap<>(); // Convert features to strings to avoid some weird formatting issues. It's ugly, but it works. for (Map.Entry<SurfactantAnalysis.FEATURES, Double> entry : analysisFeatures.entrySet()) { tsvFeatures.put(entry.getKey().toString(), String.format("%.6f", entry.getValue())); } tsvFeatures.put("name", "direct-inchi-input"); if (tsvWriter != null) { tsvWriter.append(tsvFeatures); } } else if (cl.hasOption(OPTION_INPUT_FILE)) { TSVParser parser = new TSVParser(); parser.parse(new File(cl.getOptionValue(OPTION_INPUT_FILE))); int i = 0; List<Map<String, String>> inputRows = parser.getResults(); for (Map<String, String> row : inputRows) { i++; // Just for warning messages. if (!row.containsKey("name") || !row.containsKey("id") || !row.containsKey("inchi")) { System.err.format( "WARNING: TSV rows must contain at least name, id, and inchi, skipping row %d\n", i); continue; } if (seenOutputIds.contains(row.get("id"))) { System.out.format("Skipping input row with id already in output: %s\n", row.get("id")); continue; } System.out.format("Analysis for chemical %s\n", row.get("name")); try { analysisFeatures = SurfactantAnalysis.performAnalysis(row.get("inchi"), false); } catch (Exception e) { // Ignore exceptions for now. Sometimes the regression analysis or Chemaxon processing chokes unexpectedly. 
System.err.format("ERROR caught exception while processing '%s':\n", row.get("name")); System.err.format("%s\n", e.getMessage()); e.printStackTrace(System.err); System.err.println("Skipping..."); continue; } System.out.format("--- Done analysis for chemical %s\n", row.get("name")); // This is a duplicate of the OPTION_INCHI block code, but it's inside of a tight loop, so... Map<String, String> tsvFeatures = new HashMap<>(); for (Map.Entry<SurfactantAnalysis.FEATURES, Double> entry : analysisFeatures.entrySet()) { tsvFeatures.put(entry.getKey().toString(), String.format("%.6f", entry.getValue())); } tsvFeatures.put("name", row.get("name")); tsvFeatures.put("id", row.get("id")); tsvFeatures.put("inchi", row.get("inchi")); tsvFeatures.put("label", row.containsKey("label") ? row.get("label") : "?"); if (tsvWriter != null) { tsvWriter.append(tsvFeatures); // Flush every time in case we crash or get interrupted. The features must flow! tsvWriter.flush(); } } } else { throw new RuntimeException("Must specify inchi or input file"); } } finally { if (tsvWriter != null) { tsvWriter.close(); } } }
From source file:de.citec.sc.matoll.process.Matoll_CreateMax.java
public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException, InstantiationException, IllegalAccessException, ClassNotFoundException, Exception { String directory;/* www . jav a 2 s .c o m*/ String gold_standard_lexicon; String output_lexicon; String configFile; Language language; String output; Stopwords stopwords = new Stopwords(); HashMap<String, Double> maxima; maxima = new HashMap<String, Double>(); if (args.length < 3) { System.out.print("Usage: Matoll --mode=train/test <DIRECTORY> <CONFIG>\n"); return; } // Classifier classifier; directory = args[1]; configFile = args[2]; final Config config = new Config(); config.loadFromFile(configFile); gold_standard_lexicon = config.getGoldStandardLexicon(); String model_file = config.getModel(); output_lexicon = config.getOutputLexicon(); output = config.getOutput(); language = config.getLanguage(); LexiconLoader loader = new LexiconLoader(); Lexicon gold = loader.loadFromFile(gold_standard_lexicon); Set<String> uris = new HashSet<>(); // Map<Integer,String> sentence_list = new HashMap<>(); Map<Integer, Set<Integer>> mapping_words_sentences = new HashMap<>(); //consider only properties for (LexicalEntry entry : gold.getEntries()) { try { for (Sense sense : entry.getSenseBehaviours().keySet()) { String tmp_uri = sense.getReference().getURI().replace("http://dbpedia.org/ontology/", ""); if (!Character.isUpperCase(tmp_uri.charAt(0))) { uris.add(sense.getReference().getURI()); } } } catch (Exception e) { } ; } ModelPreprocessor preprocessor = new ModelPreprocessor(language); preprocessor.setCoreferenceResolution(false); Set<String> dep = new HashSet<>(); dep.add("prep"); dep.add("appos"); dep.add("nn"); dep.add("dobj"); dep.add("pobj"); dep.add("num"); preprocessor.setDEP(dep); List<File> list_files = new ArrayList<>(); if (config.getFiles().isEmpty()) { File folder = new File(directory); File[] files = folder.listFiles(); for (File file : files) { if (file.toString().contains(".ttl")) list_files.add(file); } } else { list_files.addAll(config.getFiles()); } System.out.println(list_files.size()); int sentence_counter = 0; Map<String, Set<Integer>> bag_words_uri = new HashMap<>(); Map<String, Integer> mapping_word_id = new HashMap<>(); for (File file : list_files) { Model model = RDFDataMgr.loadModel(file.toString()); for (Model sentence : getSentences(model)) { String reference = getReference(sentence); reference = reference.replace("http://dbpedia/", "http://dbpedia.org/"); if (uris.contains(reference)) { sentence_counter += 1; Set<Integer> words_ids = getBagOfWords(sentence, stopwords, mapping_word_id); //TODO: add sentence preprocessing String obj = getObject(sentence); String subj = getSubject(sentence); preprocessor.preprocess(sentence, subj, obj, language); //TODO: also return marker if object or subject of property (in SPARQL this has to be optional of course) String parsed_sentence = getParsedSentence(sentence); try (FileWriter fw = new FileWriter("mapping_sentences_to_ids_goldstandard.tsv", true); BufferedWriter bw = new BufferedWriter(fw); PrintWriter out = new PrintWriter(bw)) { out.println(sentence_counter + "\t" + parsed_sentence); } catch (IOException e) { e.printStackTrace(); } for (Integer word_id : words_ids) { if (mapping_words_sentences.containsKey(word_id)) { Set<Integer> tmp_set = mapping_words_sentences.get(word_id); tmp_set.add(sentence_counter); mapping_words_sentences.put(word_id, tmp_set); } else { Set<Integer> tmp_set = new HashSet<>(); tmp_set.add(sentence_counter); 
mapping_words_sentences.put(word_id, tmp_set); } } if (bag_words_uri.containsKey(reference)) { Set<Integer> tmp = bag_words_uri.get(reference); for (Integer w : words_ids) { tmp.add(w); } bag_words_uri.put(reference, tmp); } else { Set<Integer> tmp = new HashSet<>(); for (Integer w : words_ids) { tmp.add(w); } bag_words_uri.put(reference, tmp); } } } model.close(); } PrintWriter writer = new PrintWriter("bag_of_words_only_goldstandard.tsv"); StringBuilder string_builder = new StringBuilder(); for (String r : bag_words_uri.keySet()) { string_builder.append(r); for (Integer i : bag_words_uri.get(r)) { string_builder.append("\t"); string_builder.append(i); } string_builder.append("\n"); } writer.write(string_builder.toString()); writer.close(); writer = new PrintWriter("mapping_words_to_sentenceids_goldstandard.tsv"); string_builder = new StringBuilder(); for (Integer w : mapping_words_sentences.keySet()) { string_builder.append(w); for (int i : mapping_words_sentences.get(w)) { string_builder.append("\t"); string_builder.append(i); } string_builder.append("\n"); } writer.write(string_builder.toString()); writer.close(); }
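Matoll builds two inverted indexes with sets as map values, word id to sentence ids and URI to bag of word ids, each time fetching the set, adding to it, and putting it back. Because the set is mutable, the put-back is not strictly needed; a sketch of the same accumulation written a little more directly, with invented property URIs and word ids:

import java.util.*;

public class BagOfWords {
    public static void main(String[] args) {
        Map<String, Set<Integer>> bagWordsUri = new HashMap<>();

        addWords(bagWordsUri, "http://dbpedia.org/ontology/spouse", Arrays.asList(3, 7, 3));
        addWords(bagWordsUri, "http://dbpedia.org/ontology/spouse", Arrays.asList(7, 9));

        System.out.println(bagWordsUri); // {http://dbpedia.org/ontology/spouse=[3, 7, 9]}
    }

    static void addWords(Map<String, Set<Integer>> bag, String uri, Collection<Integer> wordIds) {
        Set<Integer> words = bag.computeIfAbsent(uri, k -> new HashSet<>());
        for (Integer id : wordIds) {
            words.add(id); // Set.add keeps each word id once per URI
        }
    }
}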
From source file:org.ala.hbase.RepoDataLoader.java
/** * This takes a list of infosource ids... * <p/>//w ww . java2 s . c o m * Usage: -stats or -reindex or -gList and list of infosourceId * * @param args */ public static void main(String[] args) throws Exception { //RepoDataLoader loader = new RepoDataLoader(); ApplicationContext context = SpringUtils.getContext(); RepoDataLoader loader = (RepoDataLoader) context.getBean(RepoDataLoader.class); long start = System.currentTimeMillis(); loader.loadInfoSources(); String filePath = repositoryDir; if (args.length > 0) { if (args[0].equalsIgnoreCase("-stats")) { loader.statsOnly = true; args = (String[]) ArrayUtils.subarray(args, 1, args.length); } if (args[0].equalsIgnoreCase("-reindex")) { loader.reindex = true; loader.indexer = context.getBean(PartialIndex.class); args = (String[]) ArrayUtils.subarray(args, 1, args.length); logger.info("**** -reindex: " + loader.reindex); logger.debug("reindex url: " + loader.reindexUrl); } if (args[0].equalsIgnoreCase("-gList")) { loader.gList = true; args = (String[]) ArrayUtils.subarray(args, 1, args.length); logger.info("**** -gList: " + loader.gList); } if (args[0].equalsIgnoreCase("-biocache")) { Hashtable<String, String> hashTable = new Hashtable<String, String>(); hashTable.put("accept", "application/json"); ObjectMapper mapper = new ObjectMapper(); mapper.getDeserializationConfig().set(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false); RestfulClient restfulClient = new RestfulClient(0); String fq = "&fq="; if (args.length > 1) { java.util.Date date = new java.util.Date(); if (args[1].equals("-lastWeek")) { date = DateUtils.addWeeks(date, -1); } else if (args[1].equals("-lastMonth")) { date = DateUtils.addMonths(date, -1); } else if (args[1].equals("-lastYear")) { date = DateUtils.addYears(date, -1); } else date = null; if (date != null) { SimpleDateFormat sfd = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); fq += "last_load_date:%5B" + sfd.format(date) + "%20TO%20*%5D"; } } Object[] resp = restfulClient .restGet("http://biocache.ala.org.au/ws/occurrences/search?q=multimedia:Image" + fq + "&facets=data_resource_uid&pageSize=0", hashTable); logger.info("The URL: " + "http://biocache.ala.org.au/ws/occurrences/search?q=multimedia:Image" + fq + "&facets=data_resource_uid&pageSize=0"); if ((Integer) resp[0] == HttpStatus.SC_OK) { String content = resp[1].toString(); logger.debug(resp[1]); if (content != null && content.length() > "[]".length()) { Map map = mapper.readValue(content, Map.class); try { List<java.util.LinkedHashMap<String, String>> list = ((List<java.util.LinkedHashMap<String, String>>) ((java.util.LinkedHashMap) ((java.util.ArrayList) map .get("facetResults")).get(0)).get("fieldResult")); Set<String> arg = new LinkedHashSet<String>(); for (int i = 0; i < list.size(); i++) { java.util.LinkedHashMap<String, String> value = list.get(i); String dataResource = getDataResource(value.get("fq")); Object provider = (loader.getUidInfoSourceMap().get(dataResource)); if (provider != null) { arg.add(provider.toString()); } } logger.info("Set of biocache infosource ids to load: " + arg); args = new String[] {}; args = arg.toArray(args); //handle the situation where biocache-service reports no data resources if (args.length < 1) { logger.error("No biocache data resources found. Unable to load."); System.exit(0); } } catch (Exception e) { logger.error("ERROR: exit process....." 
+ e); e.printStackTrace(); System.exit(0); } } } else { logger.warn("Unable to process url: "); } } } int filesRead = loader.load(filePath, args); //FIX ME - move to config long finish = System.currentTimeMillis(); logger.info(filesRead + " files scanned/loaded in: " + ((finish - start) / 60000) + " minutes " + ((finish - start) / 1000) + " seconds."); System.exit(1); }
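The biocache branch collects infosource ids into a LinkedHashSet so that each provider appears only once while the facet order returned by the web service is preserved, then converts the set back into the args array. A sketch of that filter-and-convert step with a made-up data-resource-to-provider lookup in place of loader.getUidInfoSourceMap():

import java.util.*;

public class ProviderIds {
    public static void main(String[] args) {
        // Hypothetical mapping from data resource uid to infosource id.
        Map<String, String> uidToInfosource = new HashMap<>();
        uidToInfosource.put("dr130", "1005");
        uidToInfosource.put("dr360", "1012");

        String[] facets = {"dr130", "dr360", "dr130", "dr999"};
        Set<String> arg = new LinkedHashSet<>();

        for (String dataResource : facets) {
            String provider = uidToInfosource.get(dataResource);
            if (provider != null) {
                arg.add(provider); // duplicates dropped, insertion order kept
            }
        }

        String[] loaderArgs = arg.toArray(new String[0]);
        System.out.println(Arrays.toString(loaderArgs)); // [1005, 1012]
    }
}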