List of usage examples for java.util.HashSet.size()
public int size()
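size() returns the number of elements currently in the set (its cardinality); it reflects add and remove operations immediately and runs in constant time. A minimal, self-contained sketch of the behavior shown by the examples below (the class name SizeDemo is illustrative only):

import java.util.HashSet;

public class SizeDemo {
    public static void main(String[] args) {
        HashSet<String> set = new HashSet<String>();
        System.out.println(set.size()); // 0 for an empty set
        set.add("a");
        set.add("b");
        set.add("a"); // duplicate element, not stored again
        System.out.println(set.size()); // 2
    }
}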
From source file:Main.java
public static void main(String[] a) {
    String elements[] = { "A", "B", "C", "D", "E" };
    HashSet<String> set = new HashSet<String>(Arrays.asList(elements));
    System.out.println(set.size());
}
From source file:Main.java
public static void main(String[] args) {
    HashSet<Integer> hSet = new HashSet<Integer>();
    System.out.println("Size of HashSet : " + hSet.size());
    hSet.add(new Integer("1"));
    hSet.add(new Integer("2"));
    hSet.add(new Integer("3"));
    System.out.println(hSet.size());
    hSet.remove(new Integer("1"));
    System.out.println(hSet.size());
}
From source file:Main.java
public static void main(String args[]) {
    HashSet<String> newset = new HashSet<String>();
    // populate hash set
    newset.add("Learning");
    newset.add("from");
    newset.add("java2s.com");
    System.out.println("Size of the set: " + newset.size());
}
From source file:org.aksw.gerbil.bat.annotator.FOXAnnotator.java
public static void main(String[] a) {
    String test = "The philosopher and mathematician Gottfried Wilhelm Leibniz was born in Leipzig.";
    HashSet<Annotation> set = new FOXAnnotator(SingletonWikipediaApi.getInstance()).solveA2W(test);
    LOGGER.info("Got {} annotations.", set.size());
}
From source file:ProducerAndConsumerTool.java
public static void main(String[] args) {
    ConsumerTool consumerTool = new ConsumerTool();
    String[] unknown = CommandLineSupport.setOptions(consumerTool, args);
    HashSet<String> set1 = new HashSet<String>(Arrays.asList(unknown));

    ProducerTool producerTool = new ProducerTool();
    unknown = CommandLineSupport.setOptions(producerTool, args);
    HashSet<String> set2 = new HashSet<String>(Arrays.asList(unknown));

    set1.retainAll(set2);
    if (set1.size() > 0) {
        System.out.println("Unknown options: " + set1);
        System.exit(-1);
    }

    consumerTool.run();
    producerTool.run();
}
From source file:ProducerAndConsumerTool.java
public static void main(String[] args) {
    ConsumerTool consumerTool = new ConsumerTool();
    String[] unknown = CommandLineSupport.setOptions(consumerTool, args);
    HashSet<String> set1 = new HashSet<String>(Arrays.asList(unknown));
    HashSet<String> set4 = new HashSet<String>(5);

    ProducerTool producerTool = new ProducerTool();
    unknown = CommandLineSupport.setOptions(producerTool, args);
    HashSet<String> set2 = new HashSet<String>(Arrays.asList(unknown));

    set1.retainAll(set2);
    if (set1.size() > 0) {
        System.out.println("Unknown options: " + set1);
        System.exit(-1);
    }

    /* consumerTool.run(); */
    producerTool.run();
}
From source file:mase.deprecated.SelectionBenchmark.java
public static void main(String[] args) {
    int L = 100, N = 10000;
    double[] truncationP = new double[] { 0.25, 0.50, 0.75 };
    int[] tournamentP = new int[] { 2, 5, 7, 10 };

    DescriptiveStatistics[] truncationStat = new DescriptiveStatistics[truncationP.length];
    for (int i = 0; i < truncationStat.length; i++) {
        truncationStat[i] = new DescriptiveStatistics();
    }
    DescriptiveStatistics[] tournamentStat = new DescriptiveStatistics[tournamentP.length];
    DescriptiveStatistics[] tournamentStat2 = new DescriptiveStatistics[tournamentP.length];
    for (int i = 0; i < tournamentStat.length; i++) {
        tournamentStat[i] = new DescriptiveStatistics();
        tournamentStat2[i] = new DescriptiveStatistics();
    }
    DescriptiveStatistics rouletteStat = new DescriptiveStatistics();
    DescriptiveStatistics rouletteStat2 = new DescriptiveStatistics();
    DescriptiveStatistics baseStat = new DescriptiveStatistics();

    for (int i = 0; i < N; i++) {
        // generate test vector
        double[] test = new double[L];
        for (int j = 0; j < L; j++) {
            test[j] = Math.random();
        }

        // truncation
        for (int p = 0; p < truncationP.length; p++) {
            double[] v = Arrays.copyOf(test, test.length);
            Arrays.sort(v);
            int nElites = (int) Math.ceil(truncationP[p] * test.length);
            double cutoff = v[test.length - nElites];
            double[] weights = new double[test.length];
            for (int k = 0; k < test.length; k++) {
                weights[k] = test[k] >= cutoff ? test[k] * (1 / truncationP[p]) : 0;
            }
            truncationStat[p].addValue(sum(weights));
        }

        // tournament
        for (int p = 0; p < tournamentP.length; p++) {
            double[] weights = new double[test.length];
            HashSet<Integer> added = new HashSet<Integer>();
            for (int k = 0; k < test.length; k++) {
                int idx = makeTournament(test, tournamentP[p]);
                weights[idx] += test[idx];
                added.add(idx);
            }
            tournamentStat2[p].addValue(added.size());
            tournamentStat[p].addValue(sum(weights));
        }

        // roulette
        double[] weights = new double[test.length];
        HashSet<Integer> added = new HashSet<Integer>();
        for (int k = 0; k < test.length; k++) {
            int idx = roulette(test);
            weights[idx] += test[idx];
            added.add(idx);
        }
        rouletteStat.addValue(sum(weights));
        rouletteStat2.addValue(added.size());

        // base
        baseStat.addValue(sum(test));
    }

    for (int p = 0; p < truncationP.length; p++) {
        System.out.println("Truncation\t" + truncationP[p] + "\t" + truncationStat[p].getMean() + "\t"
                + truncationStat[p].getStandardDeviation() + "\t" + ((int) Math.ceil(L * truncationP[p]))
                + "\t 0");
    }
    for (int p = 0; p < tournamentP.length; p++) {
        System.out.println("Tournament\t" + tournamentP[p] + "\t" + tournamentStat[p].getMean() + "\t"
                + tournamentStat[p].getStandardDeviation() + "\t" + tournamentStat2[p].getMean() + "\t"
                + tournamentStat2[p].getStandardDeviation());
    }
    System.out.println("Roulette\t\t" + rouletteStat.getMean() + "\t" + rouletteStat.getStandardDeviation()
            + "\t" + rouletteStat2.getMean() + "\t" + rouletteStat2.getStandardDeviation());
    System.out.println(
            "Base \t\t" + baseStat.getMean() + "\t" + baseStat.getStandardDeviation() + "\t " + L + "\t0");
}
From source file:org.ph.commonjoiner.CommonJoiner.java
public static void main(String[] args) {
    try {
        Options options = new Options();
        options.addOption("f", true, "archivo de entrada inicial.");
        options.addOption("g", true, "archivo de entrada a comparar.");
        options.addOption("o", true, "archivo de salida (omitir para salida por pantalla).");
        options.addOption("h", false, "muestra esta ayuda.");
        options.addOption("l", false, "muestra la licencia GPL v.3 al completo.");
        options.addOption("s", false, "muestra estadisticas al finalizar el proceso.");
        options.addOption("v", false, "muestra la version del software.");
        options.addOption("c", true,
                "copia los archivos coincidentes con los cargados en la lista mediante -f a la carpeta indicada.");

        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(options, args);

        if ((args.length == 1)) {
            if (cmd.hasOption("l"))
                System.out.println(GNULicense.FULLLICENSE);
            if (cmd.hasOption("v"))
                System.out.println(InfoCommonJoiner.INFO);
            if (cmd.hasOption("h"))
                printSmallHelp(options);
        } else {
            String entrada1, salida;

            // <editor-fold defaultstate="expanded" desc=" options for comparing the content of two files ">
            if (cmd.hasOption("f") && cmd.hasOption("g") && cmd.hasOption("o")) {
                entrada1 = cmd.getOptionValue("f");
                String entrada2 = cmd.getOptionValue("g");
                salida = cmd.getOptionValue("o");

                File fEntrada1 = new File(entrada1);
                File fEntrada2 = new File(entrada2);
                File fSalida = new File(salida);

                boolean okEntrada1 = fEntrada1.exists();
                boolean okEntrada2 = fEntrada2.exists();
                boolean okSalida = !(fSalida.exists());

                if (okEntrada1 && okEntrada2) {
                    String thisLine = null;
                    HashSet nums1 = new HashSet<Integer>();
                    ArrayList numComun = new ArrayList<Integer>();

                    System.out.println(InfoCommonJoiner.INTRO);

                    try {
                        Timer tF1 = new Timer("Inicio lectura archivo " + entrada1);
                        BufferedReader br1 = new BufferedReader(new FileReader(fEntrada1));
                        while ((thisLine = br1.readLine()) != null) {
                            nums1.add(Integer.parseInt(thisLine));
                        }
                        tF1.setT1();
                        br1.close();
                        System.out.println("Número de elementos únicos en '" + entrada1 + "': " + nums1.size());

                        Timer tF2 = new Timer("Inicio lectura archivo " + entrada2);
                        long readedLines = 0;
                        BufferedReader br2 = new BufferedReader(new FileReader(fEntrada2));
                        while ((thisLine = br2.readLine()) != null) {
                            readedLines++;
                            if (!nums1.add(Integer.parseInt(thisLine))) {
                                numComun.add(Integer.parseInt(thisLine));
                            }
                        }
                        tF2.setT1();
                        br2.close();
                        System.out.println("Número de elementos leídos en '" + entrada2 + "': " + readedLines);

                        Timer tF3 = new Timer("Inicio escritura de archivo");
                        if (!okSalida) {
                            System.out.println("Número de elementos comunes: " + numComun.size());
                            System.out.println("" + Arrays.toString(numComun.toArray()));
                        } else {
                            // write the output file
                            BufferedWriter bw = new BufferedWriter(new FileWriter(fSalida));
                            // sort the results
                            Collections.sort(numComun);
                            for (Object i : numComun) {
                                bw.write(i.toString());
                                bw.newLine();
                            }
                            bw.close();
                            tF3.setT1();
                            System.out.println("Número de elementos comunes escritos: " + numComun.size());
                        }

                        if (cmd.hasOption("s")) {
                            System.out.println("\nESTADISTICAS");
                            System.out.println(" Tiempo de operación de lectura de archivo " + entrada1 + ": "
                                    + tF1.getEllapsedTime() + "ns");
                            System.out.println(" Tiempo de operación de lectura de archivo " + entrada2 + ": "
                                    + tF2.getEllapsedTime() + "ns");
                            if (okSalida) {
                                System.out.println(" Tiempo de operación de escritura de archivo " + salida
                                        + ": " + tF3.getEllapsedTime() + "ns");
                            }
                            long tiempoTotal = tF1.getEllapsedTime() + tF2.getEllapsedTime()
                                    + ((okSalida) ? tF3.getEllapsedTime() : 0);
                            System.out.println(" Tiempo total de las operaciones " + ": " + tiempoTotal + "ms");
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }
            // </editor-fold>

            if (cmd.hasOption("c") && cmd.hasOption("f")) {
                boolean copiar = false;
                entrada1 = cmd.getOptionValue("f");
                salida = cmd.getOptionValue("c");

                File archivoEntrada = new File(entrada1);
                File carpetaActual = new File(".");
                File carpetaSalida = new File(salida);

                if (!carpetaSalida.exists()) {
                    System.out.println("Se creará la estructura de carpetas " + carpetaSalida);
                    carpetaSalida.mkdirs();
                }

                copiar = carpetaSalida.isDirectory();
                copiar = (copiar) ? archivoEntrada.canRead() : false;

                if (copiar) {
                    String thisLine;
                    ArrayList<Integer> nums1 = new ArrayList<>();

                    Timer tF1 = new Timer("Inicio lectura archivo " + entrada1);
                    BufferedReader br1 = new BufferedReader(new FileReader(archivoEntrada));
                    while ((thisLine = br1.readLine()) != null) {
                        nums1.add(Integer.parseInt(thisLine));
                    }
                    System.out.println("Archivo de entrada leído correctamente.");

                    boolean cpy;
                    int progreso = 0/*, totalArchivos = carpetaActual.listFiles().length*/;
                    int totalArchivos = nums1.size();
                    String f, subF;

                    for (File file : carpetaActual.listFiles()) {
                        cpy = false;
                        // progreso++;
                        f = (file.getName().contains("."))
                                ? file.getName().substring(0, file.getName().lastIndexOf("."))
                                : "";
                        if (f.length() > 0 && !file.isDirectory()) {
                            for (Integer i : nums1) {
                                if (f.endsWith(i.toString())) {
                                    subF = f.substring(f.length() - i.toString().length() - 1);
                                    try {
                                        if (Math.abs(Integer.parseInt(subF)) == i) {
                                            if (String.valueOf(Integer.parseInt(subF)).length() == subF.length()) {
                                                cpy = true;
                                                break;
                                            }
                                        }
                                    } catch (NumberFormatException ex) {
                                        // error -> alphanumeric string
                                        cpy = true;
                                    }
                                }
                            }
                        }
                        if (cpy) {
                            // progreso++;
                            // System.out.println(" " + file.getName() + " -> " + carpetaSalida.getPath() + "/" + file.getName());
                            // if(progreso == 1) System.out.print("" + );
                            if (progreso == 1)
                                System.out.print("Copiando archivos: ");
                            System.out.print("" + progreso + "/" + totalArchivos + " ");
                            // if((progreso * 1.0 / totalArchivos)*100 % 20 == 0) System.out.print("....." + ((progreso * 1.0 / totalArchivos)*100) + "%");
                            Archives.copy(file, carpetaSalida);
                        }
                    }
                    tF1.setT1();
                    System.out.println("\nOperación finalizada en " + tF1.getEllapsedTime() + " ns");
                } else {
                    System.out.println("No se ha podido realizar la operación. Posibles causas:");
                    System.out.println("Carpeta de salida: " + carpetaSalida.getName() + " -> "
                            + carpetaSalida.isDirectory());
                    System.out.println("Archivo de entrada: " + archivoEntrada.getName() + " -> "
                            + archivoEntrada.canRead());
                }
            } else {
                printSmallHelp(options);
            }
        }
    } catch (ParseException | IOException ex) {
        System.out.println("Error: " + ex.getMessage() + "\n" + ex.getCause());
    }
}
From source file:nl.systemsgenetics.genenetworkbackend.hpo.TestDiseaseGenePerformance.java
/**
 * @param args the command line arguments
 * @throws java.lang.Exception
 */
public static void main(String[] args) throws Exception {

    final File diseaseGeneHpoFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\HPO\\135\\ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes.txt");
    final File ncbiToEnsgMapFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\ensgNcbiId.txt");
    final File hgncToEnsgMapFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\ensgHgnc.txt");
    final File ensgSymbolMappingFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\ensgHgnc.txt");
    final File predictionMatrixFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\hpo_predictions_zscores.txt.gz");
    final File predictionMatrixCorrelationFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\hpo_predictions_pathwayCorrelation.txt");
    final File significantTermsFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\hpo_predictions_bonSigTerms.txt");
    final double correctedPCutoff = 0.05;
    final File hpoOboFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\HPO\\135\\hp.obo");
    final File hpoPredictionInfoFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\hpo_predictions_auc_bonferroni.txt");
    final File hposToExcludeFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\hpoToExclude.txt");
    final File skewnessFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\skewnessSummary.txt");
    final boolean randomize = true;
    final File annotationMatrixFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\PathwayMatrix\\ALL_SOURCES_ALL_FREQUENCIES_phenotype_to_genes.txt_matrix.txt.gz");
    final File backgroundForRandomize = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\PathwayMatrix\\Ensembl2Reactome_All_Levels.txt_genesInPathways.txt");
    //final File backgroundForRandomize = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\expressedReactomeGenes.txt");
    final boolean randomizeCustomBackground = true;

    Map<String, String> ensgSymbolMapping = loadEnsgToHgnc(ensgSymbolMappingFile);

    final File outputFile;
    final ArrayList<String> backgroundGenes;
    if (randomize) {
        if (randomizeCustomBackground) {
            System.err.println("First need to fix so ranking list contains all genes in background list");
            return;
            // backgroundGenes = loadBackgroundGenes(backgroundForRandomize);
            // outputFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\hpoDiseaseBenchmarkRandomizedCustomBackground.txt");
        } else {
            backgroundGenes = null;
            outputFile = new File(
                    "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\hpoDiseaseBenchmarkRandomizedExtraNorm.txt");
        }
    } else {
        backgroundGenes = null;
        outputFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\hpoDiseaseBenchmarkExtraNorm.txt");
    }

    final HashMap<String, ArrayList<String>> ncbiToEnsgMap = loadNcbiToEnsgMap(ncbiToEnsgMapFile);
    final HashMap<String, ArrayList<String>> hgncToEnsgMap = loadHgncToEnsgMap(hgncToEnsgMapFile);
    final HashSet<String> exludedHpo = loadHpoExclude(hposToExcludeFile);

    final SkewnessInfo skewnessInfo = new SkewnessInfo(skewnessFile);

    LinkedHashSet<String> significantTerms = loadSignificantTerms(significantTermsFile);

    DoubleMatrixDataset<String, String> predictionMatrix = DoubleMatrixDataset
            .loadDoubleData(predictionMatrixFile.getAbsolutePath());
    DoubleMatrixDataset<String, String> predictionMatrixSignificant = predictionMatrix
            .viewColSelection(significantTerms);

    DoubleMatrixDataset<String, String> predictionMatrixSignificantCorrelationMatrix = DoubleMatrixDataset
            .loadDoubleData(predictionMatrixCorrelationFile.getAbsolutePath());

    DiseaseGeneHpoData diseaseGeneHpoData = new DiseaseGeneHpoData(diseaseGeneHpoFile, ncbiToEnsgMap,
            hgncToEnsgMap, exludedHpo, new HashSet(predictionMatrix.getHashRows().keySet()), "OMIM");

    // NOTE if one would use a different background this needs to be updated
    HashSet<String> diseaseGenes = new HashSet<>(diseaseGeneHpoData.getDiseaseGenes());

    if (randomize) {
        diseaseGeneHpoData = diseaseGeneHpoData.getPermutation(1, backgroundGenes);
    }

    for (String gene : diseaseGenes) {
        if (!predictionMatrixSignificant.containsRow(gene)) {
            throw new Exception("Error: " + gene);
        }
    }

    int[] mapGeneIndexToDiseaseGeneIndex = new int[predictionMatrix.rows()];
    ArrayList<String> predictedGenes = predictionMatrix.getRowObjects();

    int g2 = 0;
    for (int g = 0; g < predictedGenes.size(); ++g) {
        mapGeneIndexToDiseaseGeneIndex[g] = diseaseGenes.contains(predictedGenes.get(g)) ? g2++ : -1;
    }

    DoubleMatrixDataset<String, String> annotationnMatrix = DoubleMatrixDataset
            .loadDoubleData(annotationMatrixFile.getAbsolutePath());
    DoubleMatrixDataset<String, String> annotationMatrixSignificant = annotationnMatrix
            .viewColSelection(significantTerms);

    HashMap<String, MeanSd> hpoMeanSds = calculatePathayMeansOfAnnotatedGenes(predictionMatrixSignificant,
            annotationMatrixSignificant);

    Map<String, PredictionInfo> predictionInfo = HpoFinder.loadPredictionInfo(hpoPredictionInfoFile);

    Ontology hpoOntology = HpoFinder.loadHpoOntology(hpoOboFile);

    HpoFinder hpoFinder = new HpoFinder(hpoOntology, predictionInfo);

    final int totalGenes = predictionMatrixSignificant.rows();
    final int totalDiseaseGenes = diseaseGenes.size();
    final double[] geneScores = new double[totalGenes];
    final double[] geneScoresDiseaseGenes = new double[totalDiseaseGenes];
    final NaturalRanking naturalRanking = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.MAXIMUM);

    CSVWriter writer = new CSVWriter(new FileWriter(outputFile), '\t', '\0', '\0', "\n");

    String[] outputLine = new String[16];
    int c = 0;
    outputLine[c++] = "Disease";
    outputLine[c++] = "Gene";
    outputLine[c++] = "Hgnc";
    outputLine[c++] = "Rank";
    outputLine[c++] = "RankAmongDiseaseGenes";
    outputLine[c++] = "Z-score";
    outputLine[c++] = "HPO_skewness";
    outputLine[c++] = "Other_mean_skewness";
    outputLine[c++] = "Other_max_skewness";
    outputLine[c++] = "HPO_phenotypic_match_score";
    outputLine[c++] = "HPO_count";
    outputLine[c++] = "HPO_sum_auc";
    outputLine[c++] = "HPO_mean_auc";
    outputLine[c++] = "HPO_median_auc";
    outputLine[c++] = "HPO_terms";
    outputLine[c++] = "HPO_terms_match_score";
    writer.writeNext(outputLine);

    Random random = new Random(1);

    Mean meanCalculator = new Mean();
    Median medianCalculator = new Median();

    for (DiseaseGeneHpoData.DiseaseGene diseaseGene : diseaseGeneHpoData.getDiseaseGeneHpos()) {

        String gene = diseaseGene.getGene();
        String disease = diseaseGene.getDisease();

        if (!predictionMatrixSignificant.containsRow(gene)) {
            continue;
        }

        Set<String> geneHpos = diseaseGeneHpoData.getDiseaseEnsgHpos(diseaseGene);

        LinkedHashSet<String> geneHposPredictable = new LinkedHashSet<>();
        for (String hpo : geneHpos) {
            geneHposPredictable
                    .addAll(hpoFinder.getTermsToNames(hpoFinder.getPredictableTerms(hpo, correctedPCutoff)));
        }

        if (geneHposPredictable.isEmpty()) {
            continue;
        }

        // if(geneHposPredictable.size() > 1){
        //     String hpoSelected = geneHposPredictable.toArray(new String[geneHposPredictable.size()])[random.nextInt(geneHposPredictable.size())];
        //     geneHposPredictable = new LinkedHashSet<>(1);
        //     geneHposPredictable.add(hpoSelected);
        // }

        DoubleMatrixDataset<String, String> predictionCaseTerms = predictionMatrixSignificant
                .viewColSelection(geneHposPredictable);
        DoubleMatrix2D predictionCaseTermsMatrix = predictionCaseTerms.getMatrix();

        double denominator = Math.sqrt(geneHposPredictable.size());

        for (int g = 0; g < totalGenes; ++g) {
            geneScores[g] = predictionCaseTermsMatrix.viewRow(g).zSum() / denominator;
            if (Double.isNaN(geneScores[g])) {
                geneScores[g] = 0;
            }
            g2 = mapGeneIndexToDiseaseGeneIndex[g];
            if (g2 >= 0) {
                geneScoresDiseaseGenes[g2] = geneScores[g];
            }
        }

        double[] geneRanks = naturalRanking.rank(geneScores);
        int diseaseGeneIndex = predictionMatrixSignificant.getRowIndex(gene);

        double[] geneRanksDiseaseGenes = naturalRanking.rank(geneScoresDiseaseGenes);
        int diseaseGeneIndexInDiseaseGenesOnly = mapGeneIndexToDiseaseGeneIndex[diseaseGeneIndex];

        double zscore = geneScores[diseaseGeneIndex];
        double rank = (totalGenes - geneRanks[diseaseGeneIndex]) + 1;
        double rankAmongDiseaseGenes = (totalDiseaseGenes
                - geneRanksDiseaseGenes[diseaseGeneIndexInDiseaseGenesOnly]) + 1;

        double hpoPhenotypicMatchScore = 0;
        StringBuilder individualMatchScore = new StringBuilder();
        boolean notFirst = false;
        int usedHpos = 0;

        double[] aucs = new double[geneHposPredictable.size()];
        double sumAucs = 0;

        int i = 0;
        for (String hpo : geneHposPredictable) {

            usedHpos++;

            MeanSd hpoMeanSd = hpoMeanSds.get(hpo);

            double hpoPredictionZ = predictionMatrixSignificant.getElement(gene, hpo);

            double hpoPredictionOutlierScore = ((hpoPredictionZ - hpoMeanSd.getMean()) / hpoMeanSd.getSd());

            if (notFirst) {
                individualMatchScore.append(';');
            }
            notFirst = true;

            individualMatchScore.append(hpoPredictionOutlierScore);

            hpoPhenotypicMatchScore += hpoPredictionOutlierScore;

            aucs[i++] = predictionInfo.get(hpo).getAuc();
            sumAucs += predictionInfo.get(hpo).getAuc();
        }

        double meanAuc = meanCalculator.evaluate(aucs);
        double medianAuc = medianCalculator.evaluate(aucs);

        if (usedHpos == 0) {
            hpoPhenotypicMatchScore = Double.NaN;
        } else {
            hpoPhenotypicMatchScore = hpoPhenotypicMatchScore / usedHpos;
        }

        String symbol = ensgSymbolMapping.get(gene);
        if (symbol == null) {
            symbol = "";
        }

        c = 0;
        outputLine[c++] = disease;
        outputLine[c++] = gene;
        outputLine[c++] = symbol;
        outputLine[c++] = String.valueOf(rank);
        outputLine[c++] = String.valueOf(rankAmongDiseaseGenes);
        outputLine[c++] = String.valueOf(zscore);
        outputLine[c++] = String.valueOf(skewnessInfo.getHpoSkewness(gene));
        outputLine[c++] = String.valueOf(skewnessInfo.getMeanSkewnessExHpo(gene));
        outputLine[c++] = String.valueOf(skewnessInfo.getMaxSkewnessExHpo(gene));
        outputLine[c++] = String.valueOf(hpoPhenotypicMatchScore);
        outputLine[c++] = String.valueOf(geneHposPredictable.size());
        outputLine[c++] = String.valueOf(sumAucs);
        outputLine[c++] = String.valueOf(meanAuc);
        outputLine[c++] = String.valueOf(medianAuc);
        outputLine[c++] = String.join(";", geneHposPredictable);
        outputLine[c++] = individualMatchScore.toString();
        writer.writeNext(outputLine);

    }

    writer.close();

}
From source file:de.unileipzig.ub.indexer.App.java
public static void main(String[] args) throws IOException {

    // create Options object
    Options options = new Options();

    options.addOption("h", "help", false, "display this help");
    options.addOption("f", "filename", true, "name of the JSON file whose content should be indexed");
    options.addOption("i", "index", true, "the name of the target index");
    options.addOption("d", "doctype", true, "the name of the doctype (title, local, ...)");
    options.addOption("t", "host", true, "elasticsearch hostname (default: 0.0.0.0)");
    options.addOption("p", "port", true, "transport port (that's NOT the http port, default: 9300)");
    options.addOption("c", "cluster", true, "cluster name (default: elasticsearch_mdma)");
    options.addOption("b", "bulksize", true, "number of docs sent in one request (default: 3000)");
    options.addOption("v", "verbose", false, "show processing speed while indexing");
    options.addOption("s", "status", false, "only show status of index for file");
    options.addOption("r", "repair", false, "attempt to repair recoverable inconsistencies on the go");
    options.addOption("e", "debug", false, "set logging level to debug");
    options.addOption("l", "logfile", true, "logfile - if not specified only log to stdout");
    options.addOption("m", "memcached", true, "host and port of memcached (default: localhost:11211)");
    options.addOption("z", "latest-flag-on", true,
            "enable latest flag according to field (within content, e.g. 001)");
    options.addOption("a", "flat", false, "flat-mode: do not check for inconsistencies");

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException ex) {
        logger.error(ex);
        System.exit(1);
    }

    // setup logging
    Properties systemProperties = System.getProperties();
    systemProperties.put("net.spy.log.LoggerImpl", "net.spy.memcached.compat.log.Log4JLogger");
    System.setProperties(systemProperties);
    Logger.getLogger("net.spy.memcached").setLevel(Level.ERROR);

    Properties props = new Properties();
    props.load(props.getClass().getResourceAsStream("/log4j.properties"));

    if (cmd.hasOption("debug")) {
        props.setProperty("log4j.logger.de.unileipzig", "DEBUG");
    }

    if (cmd.hasOption("logfile")) {
        props.setProperty("log4j.rootLogger", "INFO, stdout, F");
        props.setProperty("log4j.appender.F", "org.apache.log4j.FileAppender");
        props.setProperty("log4j.appender.F.File", cmd.getOptionValue("logfile"));
        props.setProperty("log4j.appender.F.layout", "org.apache.log4j.PatternLayout");
        props.setProperty("log4j.appender.F.layout.ConversionPattern", "%5p | %d | %F | %L | %m%n");
    }

    PropertyConfigurator.configure(props);

    InetAddress addr = InetAddress.getLocalHost();
    String memcachedHostAndPort = addr.getHostAddress() + ":11211";
    if (cmd.hasOption("m")) {
        memcachedHostAndPort = cmd.getOptionValue("m");
    }

    // setup caching
    try {
        if (memcachedClient == null) {
            memcachedClient = new MemcachedClient(
                    new ConnectionFactoryBuilder().setFailureMode(FailureMode.Cancel).build(),
                    AddrUtil.getAddresses("0.0.0.0:11211"));
            try {
                // give client and server 500ms
                Thread.sleep(300);
            } catch (InterruptedException ex) {
            }

            Collection availableServers = memcachedClient.getAvailableServers();
            logger.info(availableServers);
            if (availableServers.size() == 0) {
                logger.info("no memcached servers found");
                memcachedClient.shutdown();
                memcachedClient = null;
            } else {
                logger.info(availableServers.size() + " memcached server(s) detected, fine.");
            }
        }
    } catch (IOException ex) {
        logger.warn("couldn't create a connection, bailing out: " + ex.getMessage());
    }

    // process options
    if (cmd.hasOption("h")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("indexer", options, true);
        quit(0);
    }

    boolean verbose = false;
    if (cmd.hasOption("verbose")) {
        verbose = true;
    }

    // ES options
    String[] hosts = new String[] { "0.0.0.0" };
    int port = 9300;
    String clusterName = "elasticsearch_mdma";
    int bulkSize = 3000;

    if (cmd.hasOption("host")) {
        hosts = cmd.getOptionValues("host");
    }
    if (cmd.hasOption("port")) {
        port = Integer.parseInt(cmd.getOptionValue("port"));
    }
    if (cmd.hasOption("cluster")) {
        clusterName = cmd.getOptionValue("cluster");
    }
    if (cmd.hasOption("bulksize")) {
        bulkSize = Integer.parseInt(cmd.getOptionValue("bulksize"));
        if (bulkSize < 1 || bulkSize > 100000) {
            logger.error("bulksize must be between 1 and 100,000");
            quit(1);
        }
    }

    // ES Client
    final Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", "elasticsearch_mdma")
            .build();
    final TransportClient client = new TransportClient(settings);
    for (String host : hosts) {
        client.addTransportAddress(new InetSocketTransportAddress(host, port));
    }

    if (cmd.hasOption("filename") && cmd.hasOption("index") && cmd.hasOption("doctype")) {

        final String filename = cmd.getOptionValue("filename");

        final File _file = new File(filename);
        if (_file.length() == 0) {
            logger.info(_file.getAbsolutePath() + " is empty, skipping");
            quit(0); // file is empty
        }

        // for flat mode: leave a stampfile beside the json to
        // indicate previous successful processing
        File directory = new File(filename).getParentFile();
        File stampfile = new File(directory, DigestUtils.shaHex(filename) + ".indexed");

        long start = System.currentTimeMillis();
        long lineCount = 0;

        final String indexName = cmd.getOptionValue("index");
        final String docType = cmd.getOptionValue("doctype");

        BulkRequestBuilder bulkRequest = client.prepareBulk();

        try {
            if (cmd.hasOption("flat")) {
                // flat mode
                // .........
                if (stampfile.exists()) {
                    logger.info("SKIPPING, since it seems this file has already " + "been imported (found: "
                            + stampfile.getAbsolutePath() + ")");
                    quit(0);
                }
            } else {

                final String srcSHA1 = extractSrcSHA1(filename);

                logger.debug(filename + " srcsha1: " + srcSHA1);

                long docsInIndex = getIndexedRecordCount(client, indexName, srcSHA1);
                logger.debug(filename + " indexed: " + docsInIndex);

                long docsInFile = getLineCount(filename);
                logger.debug(filename + " lines: " + docsInFile);

                // in non-flat-mode, indexing would take care
                // of inconsistencies
                if (docsInIndex == docsInFile) {
                    logger.info("UP-TO DATE: " + filename + " (" + docsInIndex + ", " + srcSHA1 + ")");
                    client.close();
                    quit(0);
                }

                if (docsInIndex > 0) {
                    logger.warn("INCONSISTENCY DETECTED: " + filename + ": indexed:" + docsInIndex + " lines:"
                            + docsInFile);

                    if (!cmd.hasOption("r")) {
                        logger.warn(
                                "Please re-run indexer with --repair flag or delete residues first with: $ curl -XDELETE "
                                        + hosts[0] + ":9200/" + indexName
                                        + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1
                                        + "\" }}'");
                        client.close();
                        quit(1);
                    } else {
                        logger.info("Attempting to clear residues...");
                        // attempt to repair once
                        DeleteByQueryResponse dbqr = client.prepareDeleteByQuery(indexName)
                                .setQuery(termQuery("meta.srcsha1", srcSHA1)).execute().actionGet();

                        Iterator<IndexDeleteByQueryResponse> it = dbqr.iterator();
                        long deletions = 0;
                        while (it.hasNext()) {
                            IndexDeleteByQueryResponse response = it.next();
                            deletions += 1;
                        }
                        logger.info("Deleted residues of " + filename);
                        logger.info("Refreshing [" + indexName + "]");
                        RefreshResponse refreshResponse = client.admin().indices()
                                .refresh(new RefreshRequest(indexName)).actionGet();

                        long indexedAfterDelete = getIndexedRecordCount(client, indexName, srcSHA1);
                        logger.info(indexedAfterDelete + " docs remained");
                        if (indexedAfterDelete > 0) {
                            logger.warn("Not all residues cleaned. Try to fix this manually: $ curl -XDELETE "
                                    + hosts[0] + ":9200/" + indexName
                                    + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1 + "\" }}'");
                            quit(1);
                        } else {
                            logger.info("Residues are gone. Now trying to reindex: " + filename);
                        }
                    }
                }
            }

            logger.info("INDEXING-REQUIRED: " + filename);
            if (cmd.hasOption("status")) {
                quit(0);
            }

            HashSet idsInBatch = new HashSet();
            String idField = null;
            if (cmd.hasOption("z")) {
                idField = cmd.getOptionValue("z");
            }

            final FileReader fr = new FileReader(filename);
            final BufferedReader br = new BufferedReader(fr);

            String line;
            // one line is one document
            while ((line = br.readLine()) != null) {

                // "Latest-Flag" machine
                // This gets obsolete with a "flat" index
                if (cmd.hasOption("z")) {
                    // flag that indicates, whether the document
                    // about to be indexed will be the latest
                    boolean willBeLatest = true;

                    // check if there is a previous (lower meta.timestamp) document with
                    // the same identifier (whatever that may be - queried under "content")
                    final String contentIdentifier = getContentIdentifier(line, idField);
                    idsInBatch.add(contentIdentifier);

                    // assumed in meta.timestamp
                    final Long timestamp = Long.parseLong(getTimestamp(line));

                    logger.debug("Checking whether record is latest (line: " + lineCount + ")");
                    logger.debug(contentIdentifier + ", " + timestamp);

                    // get all docs, which match the contentIdentifier
                    // by filter, which doesn't score
                    final TermFilterBuilder idFilter = new TermFilterBuilder("content." + idField,
                            contentIdentifier);
                    final TermFilterBuilder kindFilter = new TermFilterBuilder("meta.kind", docType);
                    final AndFilterBuilder afb = new AndFilterBuilder();
                    afb.add(idFilter).add(kindFilter);
                    final FilteredQueryBuilder fb = filteredQuery(matchAllQuery(), afb);

                    final SearchResponse searchResponse = client.prepareSearch(indexName)
                            .setSearchType(SearchType.DFS_QUERY_THEN_FETCH).setQuery(fb).setFrom(0)
                            .setSize(1200) // 3 years and 105 days assuming daily updates at the most
                            .setExplain(false).execute().actionGet();

                    final SearchHits searchHits = searchResponse.getHits();

                    logger.debug("docs with this id in the index: " + searchHits.getTotalHits());

                    for (final SearchHit hit : searchHits.getHits()) {
                        final String docId = hit.id();
                        final Map<String, Object> source = hit.sourceAsMap();
                        final Map meta = (Map) source.get("meta");
                        final Long docTimestamp = Long.parseLong(meta.get("timestamp").toString());

                        // if the indexed doc timestamp is lower than the current one,
                        // remove any latest flag
                        if (timestamp >= docTimestamp) {
                            source.remove("latest");
                            final ObjectMapper mapper = new ObjectMapper();
                            // put the updated doc back
                            // IndexResponse response =
                            client.prepareIndex(indexName, docType).setCreate(false).setId(docId)
                                    .setSource(mapper.writeValueAsBytes(source))
                                    .execute(new ActionListener<IndexResponse>() {
                                        public void onResponse(IndexResponse rspns) {
                                            logger.debug("Removed latest flag from " + contentIdentifier + ", "
                                                    + docTimestamp + ", " + hit.id() + " since (" + timestamp
                                                    + " > " + docTimestamp + ")");
                                        }

                                        public void onFailure(Throwable thrwbl) {
                                            logger.error("Could not remove flag from " + hit.id() + ", "
                                                    + contentIdentifier);
                                        }
                                    });
                            // .execute()
                            // .actionGet();
                        } else {
                            logger.debug("Doc " + hit.id() + " is newer (" + docTimestamp + ")");
                            willBeLatest = false;
                        }
                    }

                    if (willBeLatest) {
                        line = setLatestFlag(line);
                        logger.info("Setting latest flag on " + contentIdentifier + ", " + timestamp);
                    }

                    // end of latest-flag machine
                    // beware - this will be correct as long as there
                    // are no dups within one bulk!
                }

                bulkRequest.add(client.prepareIndex(indexName, docType).setSource(line));
                lineCount++;
                logger.debug("Added line " + lineCount + " to BULK");
                logger.debug(line);

                if (lineCount % bulkSize == 0) {

                    if (idsInBatch.size() != bulkSize && cmd.hasOption("z")) {
                        logger.error(
                                "This batch has duplications in the ID. That's not bad for the index, just makes the latest flag fuzzy");
                        logger.error(
                                "Bulk size was: " + bulkSize + ", but " + idsInBatch.size() + " IDs (only)");
                    }
                    idsInBatch.clear();

                    logger.debug("Issuing BULK request");

                    final long actionCount = bulkRequest.numberOfActions();
                    final BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                    final long tookInMillis = bulkResponse.getTookInMillis();

                    if (bulkResponse.hasFailures()) {
                        logger.fatal("FAILED, bulk not indexed. exiting now.");
                        Iterator<BulkItemResponse> it = bulkResponse.iterator();
                        while (it.hasNext()) {
                            BulkItemResponse bir = it.next();
                            if (bir.isFailed()) {
                                Failure failure = bir.getFailure();
                                logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage()
                                        + ", type: " + failure.getType() + ", index: " + failure.getIndex());
                            }
                        }
                        quit(1);
                    } else {
                        if (verbose) {
                            final double elapsed = System.currentTimeMillis() - start;
                            final double speed = (lineCount / elapsed * 1000);
                            logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount
                                    + "/" + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)");
                        }
                    }

                    bulkRequest = client.prepareBulk();
                }
            }

            // handle the remaining items
            final long actionCount = bulkRequest.numberOfActions();
            if (actionCount > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                final long tookInMillis = bulkResponse.getTookInMillis();

                if (bulkResponse.hasFailures()) {
                    logger.fatal("FAILED, bulk not indexed. exiting now.");
                    Iterator<BulkItemResponse> it = bulkResponse.iterator();
                    while (it.hasNext()) {
                        BulkItemResponse bir = it.next();
                        if (bir.isFailed()) {
                            Failure failure = bir.getFailure();
                            logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage()
                                    + ", type: " + failure.getType() + ", index: " + failure.getIndex());
                        }
                    }
                    quit(1);
                } else {

                    // trigger update now
                    RefreshResponse refreshResponse = client.admin().indices()
                            .refresh(new RefreshRequest(indexName)).actionGet();

                    if (verbose) {
                        final double elapsed = System.currentTimeMillis() - start;
                        final double speed = (lineCount / elapsed * 1000);
                        logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount + "/"
                                + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)");
                    }
                }
            }

            br.close();
            client.close();

            final double elapsed = (System.currentTimeMillis() - start) / 1000;
            final double speed = (lineCount / elapsed);
            logger.info("indexing (" + filename + ") " + lineCount + " docs took " + elapsed + "s (speed: "
                    + String.format("%.2f", speed) + "r/s)");

            if (cmd.hasOption("flat")) {
                try {
                    FileUtils.touch(stampfile);
                } catch (IOException ioe) {
                    logger.warn(".indexed files not created. Will reindex everything every time.");
                }
            }
        } catch (IOException e) {
            client.close();
            logger.error(e);
            quit(1);
        } finally {
            client.close();
        }
    }
    quit(0);
}