Example usage for java.util HashSet size

List of usage examples for java.util HashSet size

Introduction

On this page you can find example usages of the java.util HashSet size() method.

Prototype

public int size() 

Document

Returns the number of elements in this set (its cardinality).
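
As a quick, self-contained sketch (not drawn from any of the source files below), note that the cardinality counts distinct elements only, so adding a duplicate leaves size() unchanged:

import java.util.HashSet;

public class SizeDemo {
    public static void main(String[] args) {
        HashSet<String> set = new HashSet<>();
        set.add("a");
        set.add("b");
        set.add("a"); // duplicate of an existing element, silently ignored

        System.out.println(set.size()); // prints 2, not 3
    }
}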

Usage

From source file:Main.java

public static void main(String[] a) {
    String elements[] = { "A", "B", "C", "D", "E" };
    HashSet<String> set = new HashSet<String>(Arrays.asList(elements));

    System.out.println(set.size());
}

From source file:Main.java

public static void main(String[] args) {
    HashSet<Integer> hSet = new HashSet<Integer>();
    System.out.println("Size of HashSet : " + hSet.size());

    hSet.add(new Integer("1"));
    hSet.add(new Integer("2"));
    hSet.add(new Integer("3"));

    System.out.println(hSet.size());

    hSet.remove(new Integer("1"));
    System.out.println(hSet.size());
}

From source file:Main.java

public static void main(String args[]) {
    HashSet<String> newset = new HashSet<String>();

    // populate hash set
    newset.add("Learning");
    newset.add("from");
    newset.add("java2s.com");

    System.out.println("Size of the set: " + newset.size());
}

From source file:org.aksw.gerbil.bat.annotator.FOXAnnotator.java

public static void main(String[] a) {
    String test = "The philosopher and mathematician Gottfried Wilhelm Leibniz was born in Leipzig.";
    HashSet<Annotation> set = new FOXAnnotator(SingletonWikipediaApi.getInstance()).solveA2W(test);
    LOGGER.info("Got {} annotations.", set.size());
}

From source file:ProducerAndConsumerTool.java

public static void main(String[] args) {

    ConsumerTool consumerTool = new ConsumerTool();
    String[] unknown = CommandLineSupport.setOptions(consumerTool, args);
    HashSet<String> set1 = new HashSet<String>(Arrays.asList(unknown));

    ProducerTool producerTool = new ProducerTool();
    unknown = CommandLineSupport.setOptions(producerTool, args);
    HashSet<String> set2 = new HashSet<String>(Arrays.asList(unknown));

    set1.retainAll(set2); // only the options unknown to both tools remain
    if (set1.size() > 0) {
        System.out.println("Unknown options: " + set1);
        System.exit(-1);
    }

    consumerTool.run();
    producerTool.run();

}

From source file:mase.deprecated.SelectionBenchmark.java

public static void main(String[] args) {

    int L = 100, N = 10000;
    double[] truncationP = new double[] { 0.25, 0.50, 0.75 };
    int[] tournamentP = new int[] { 2, 5, 7, 10 };

    DescriptiveStatistics[] truncationStat = new DescriptiveStatistics[truncationP.length];
    for (int i = 0; i < truncationStat.length; i++) {
        truncationStat[i] = new DescriptiveStatistics();
    }
    DescriptiveStatistics[] tournamentStat = new DescriptiveStatistics[tournamentP.length];
    DescriptiveStatistics[] tournamentStat2 = new DescriptiveStatistics[tournamentP.length];
    for (int i = 0; i < tournamentStat.length; i++) {
        tournamentStat[i] = new DescriptiveStatistics();
        tournamentStat2[i] = new DescriptiveStatistics();
    }
    DescriptiveStatistics rouletteStat = new DescriptiveStatistics();
    DescriptiveStatistics rouletteStat2 = new DescriptiveStatistics();
    DescriptiveStatistics baseStat = new DescriptiveStatistics();

    for (int i = 0; i < N; i++) {
        // generate test vector
        double[] test = new double[L];
        for (int j = 0; j < L; j++) {
            test[j] = Math.random();
        }

        // truncation
        for (int p = 0; p < truncationP.length; p++) {
            double[] v = Arrays.copyOf(test, test.length);
            Arrays.sort(v);
            int nElites = (int) Math.ceil(truncationP[p] * test.length);
            double cutoff = v[test.length - nElites];
            double[] weights = new double[test.length];
            for (int k = 0; k < test.length; k++) {
                weights[k] = test[k] >= cutoff ? test[k] * (1 / truncationP[p]) : 0;
            }
            truncationStat[p].addValue(sum(weights));
        }

        // tournament
        for (int p = 0; p < tournamentP.length; p++) {
            double[] weights = new double[test.length];
            HashSet<Integer> added = new HashSet<Integer>();
            for (int k = 0; k < test.length; k++) {
                int idx = makeTournament(test, tournamentP[p]);
                weights[idx] += test[idx];
                added.add(idx);
            }
            tournamentStat2[p].addValue(added.size());
            tournamentStat[p].addValue(sum(weights));
        }

        // roulette
        double[] weights = new double[test.length];
        HashSet<Integer> added = new HashSet<Integer>();
        for (int k = 0; k < test.length; k++) {
            int idx = roulette(test);
            weights[idx] += test[idx];
            added.add(idx);
        }
        rouletteStat.addValue(sum(weights));
        rouletteStat2.addValue(added.size());

        // base
        baseStat.addValue(sum(test));
    }

    for (int p = 0; p < truncationP.length; p++) {
        System.out.println("Truncation\t" + truncationP[p] + "\t" + truncationStat[p].getMean() + "\t"
                + truncationStat[p].getStandardDeviation() + "\t" + ((int) Math.ceil(L * truncationP[p]))
                + "\t 0");
    }
    for (int p = 0; p < tournamentP.length; p++) {
        System.out.println("Tournament\t" + tournamentP[p] + "\t" + tournamentStat[p].getMean() + "\t"
                + tournamentStat[p].getStandardDeviation() + "\t" + tournamentStat2[p].getMean() + "\t"
                + tournamentStat2[p].getStandardDeviation());
    }
    System.out.println("Roulette\t\t" + rouletteStat.getMean() + "\t" + rouletteStat.getStandardDeviation()
            + "\t" + rouletteStat2.getMean() + "\t" + rouletteStat2.getStandardDeviation());
    System.out.println(
            "Base    \t\t" + baseStat.getMean() + "\t" + baseStat.getStandardDeviation() + "\t " + L + "\t0");
}

From source file:org.ph.commonjoiner.CommonJoiner.java

public static void main(String[] args) {

    try {
        Options options = new Options();
        options.addOption("f", true, "archivo de entrada inicial.");
        options.addOption("g", true, "archivo de entrada a comparar.");
        options.addOption("o", true, "archivo de salida (omitir para salida por pantalla).");
        options.addOption("h", false, "muestra esta ayuda.");
        options.addOption("l", false, "muestra la licencia GPL v.3 al completo.");
        options.addOption("s", false, "muestra estadisticas al finalizar el proceso.");
        options.addOption("v", false, "muestra la version del software.");

        options.addOption("c", true,
                "copia los archivos concidentes con los cargados en la lista mediante -f a la carpeta indicada.");

        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(options, args);

        if (args.length == 1) {
            if (cmd.hasOption("l"))
                System.out.println(GNULicense.FULLLICENSE);
            if (cmd.hasOption("v"))
                System.out.println(InfoCommonJoiner.INFO);
            if (cmd.hasOption("h"))
                printSmallHelp(options);

        } else {
            String entrada1, salida;

            // <editor-fold defaultstate="expanded" desc=" opciones de comparacin de contenido de dos archivos ">
            if (cmd.hasOption("f") && cmd.hasOption("g") && cmd.hasOption("o")) {
                entrada1 = cmd.getOptionValue("f");
                String entrada2 = cmd.getOptionValue("g");
                salida = cmd.getOptionValue("o");

                File fEntrada1 = new File(entrada1);
                File fEntrada2 = new File(entrada2);
                File fSalida = new File(salida);

                boolean okEntrada1 = fEntrada1.exists();
                boolean okEntrada2 = fEntrada2.exists();
                boolean okSalida = !(fSalida.exists());

                if (okEntrada1 && okEntrada2) {
                    String thisLine = null;
                    HashSet<Integer> nums1 = new HashSet<>();
                    ArrayList<Integer> numComun = new ArrayList<>();

                    System.out.println(InfoCommonJoiner.INTRO);

                    try {
                        Timer tF1 = new Timer("Inicio lectura archivo " + entrada1);
                        BufferedReader br1 = new BufferedReader(new FileReader(fEntrada1));
                        while ((thisLine = br1.readLine()) != null) {
                            nums1.add(Integer.parseInt(thisLine));
                        }
                        tF1.setT1();
                        br1.close();
                        System.out.println("Number of unique elements in '" + entrada1 + "': " + nums1.size());

                        Timer tF2 = new Timer("Inicio lectura archivo " + entrada2);
                        long readedLines = 0;
                        BufferedReader br2 = new BufferedReader(new FileReader(fEntrada2));
                        while ((thisLine = br2.readLine()) != null) {
                            readedLines++;
                            if (!nums1.add(Integer.parseInt(thisLine))) {
                                numComun.add(Integer.parseInt(thisLine));
                            }
                        }
                        tF2.setT1();
                        br2.close();
                        System.out.println("Number of elements read from '" + entrada2 + "': " + readedLines);

                        Timer tF3 = new Timer("Inicio escritura de archivo");
                        if (!okSalida) {
                            System.out.println("Nmero de elementos comunes: " + numComun.size());
                            System.out.println("" + Arrays.toString(numComun.toArray()));
                        } else {
                            // write the output file
                            BufferedWriter bw = new BufferedWriter(new FileWriter(fSalida));
                            // sort the results
                            Collections.sort(numComun);

                            for (Integer i : numComun) {
                                bw.write(i.toString());
                                bw.newLine();
                            }
                            bw.close();
                            tF3.setT1();

                            System.out.println("Nmero de elementos comunes escritos: " + numComun.size());
                        }

                        if (cmd.hasOption("s")) {
                            System.out.println("\nESTADISTICAS");
                            System.out.println("  Tiempo de operacin de lectura de archivo " + entrada1 + ": "
                                    + tF1.getEllapsedTime() + "ns");
                            System.out.println("  Tiempo de operacin de lectura de archivo " + entrada2 + ": "
                                    + tF2.getEllapsedTime() + "ns");
                            if (okSalida) {
                                System.out.println("  Tiempo de operacin de escritura de archivo " + salida
                                        + ": " + tF3.getEllapsedTime() + "ns");
                            }
                            long tiempoTotal = tF1.getEllapsedTime() + tF2.getEllapsedTime()
                                    + ((okSalida) ? tF3.getEllapsedTime() : 0);
                            System.out
                                    .println("  Tiempo total de las operaciones " + ": " + tiempoTotal + "ms");
                        }

                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }

            // </editor-fold>

            if (cmd.hasOption("c") && cmd.hasOption("f")) {
                boolean copiar = false;
                entrada1 = cmd.getOptionValue("f");
                salida = cmd.getOptionValue("c");
                File archivoEntrada = new File(entrada1);
                File carpetaActual = new File(".");
                File carpetaSalida = new File(salida);

                if (!carpetaSalida.exists()) {
                    System.out.println("Se crear la estructura de carpetas " + carpetaSalida);
                    carpetaSalida.mkdirs();
                }

                copiar = carpetaSalida.isDirectory();
                copiar = copiar && archivoEntrada.canRead();

                if (copiar) {
                    String thisLine;
                    ArrayList<Integer> nums1 = new ArrayList<>();
                    Timer tF1 = new Timer("Inicio lectura archivo " + entrada1);
                    BufferedReader br1 = new BufferedReader(new FileReader(archivoEntrada));
                    while ((thisLine = br1.readLine()) != null) {
                        nums1.add(Integer.parseInt(thisLine));
                    }

                    System.out.println("Archivo de entrada ledo correctamente.");

                    boolean cpy;
                    int progreso = 0;
                    int totalArchivos = nums1.size();

                    String f, subF;
                    for (File file : carpetaActual.listFiles()) {
                        cpy = false;
                        f = (file.getName().contains("."))
                                ? file.getName().substring(0, file.getName().lastIndexOf("."))
                                : "";

                        if (f.length() > 0 && !file.isDirectory()) {
                            for (Integer i : nums1) {
                                if (f.endsWith(i.toString())) {
                                    subF = f.substring(f.length() - i.toString().length() - 1);

                                    try {
                                        if (Math.abs(Integer.parseInt(subF)) == i) {
                                            if (String.valueOf(Integer.parseInt(subF)).length() == subF
                                                    .length()) {
                                                cpy = true;
                                                break;
                                            }
                                        }
                                    } catch (NumberFormatException ex) {
                                        // error -> cadena alfanumrica
                                        cpy = true;
                                    }
                                }
                            }
                        }

                        if (cpy) {
                            progreso++;
                            if (progreso == 1)
                                System.out.print("Copying files: ");
                            System.out.print(progreso + "/" + totalArchivos + "     ");
                            Archives.copy(file, carpetaSalida);
                        }
                    }

                    tF1.setT1();
                    System.out.println("\nOperacin finalizada en " + tF1.getEllapsedTime() + " ns");
                } else {
                    System.out.println("No se ha podido realizar la operacin. Posibles causas:");
                    System.out.println("Carpeta de salida: " + carpetaSalida.getName() + " -> "
                            + carpetaSalida.isDirectory());
                    System.out.println("Archivo de entrada: " + archivoEntrada.getName() + " -> "
                            + archivoEntrada.canRead());
                }

            } else {
                printSmallHelp(options);
            }
        }

    } catch (ParseException | IOException ex) {
        System.out.println("Error: " + ex.getMessage() + "\n" + ex.getCause());
    }

}

From source file:nl.systemsgenetics.genenetworkbackend.hpo.TestDiseaseGenePerformance.java

/**
 * @param args the command line arguments
 * @throws java.lang.Exception
 */
public static void main(String[] args) throws Exception {

    final File diseaseGeneHpoFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\HPO\\135\\ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes.txt");
    final File ncbiToEnsgMapFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\ensgNcbiId.txt");
    final File hgncToEnsgMapFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\ensgHgnc.txt");
    final File ensgSymbolMappingFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\ensgHgnc.txt");
    final File predictionMatrixFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\hpo_predictions_zscores.txt.gz");
    final File predictionMatrixCorrelationFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\hpo_predictions_pathwayCorrelation.txt");
    final File significantTermsFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\hpo_predictions_bonSigTerms.txt");
    final double correctedPCutoff = 0.05;
    final File hpoOboFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\HPO\\135\\hp.obo");
    final File hpoPredictionInfoFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\hpo_predictions_auc_bonferroni.txt");
    final File hposToExcludeFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\hpoToExclude.txt");
    final File skewnessFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\predictions\\skewnessSummary.txt");
    final boolean randomize = true;
    final File annotationMatrixFile = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\PathwayMatrix\\ALL_SOURCES_ALL_FREQUENCIES_phenotype_to_genes.txt_matrix.txt.gz");
    final File backgroundForRandomize = new File(
            "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\Data31995Genes05-12-2017\\PCA_01_02_2018\\PathwayMatrix\\Ensembl2Reactome_All_Levels.txt_genesInPathways.txt");
    //final File backgroundForRandomize = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\expressedReactomeGenes.txt");
    final boolean randomizeCustomBackground = true;

    Map<String, String> ensgSymbolMapping = loadEnsgToHgnc(ensgSymbolMappingFile);

    final File outputFile;
    final ArrayList<String> backgroundGenes;
    if (randomize) {

        if (randomizeCustomBackground) {
            System.err.println("First need to fix so ranking list contains all genes in background list");
            return;
            //            backgroundGenes = loadBackgroundGenes(backgroundForRandomize);
            //            outputFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\hpoDiseaseBenchmarkRandomizedCustomBackground.txt");
        } else {
            backgroundGenes = null;
            outputFile = new File(
                    "C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\hpoDiseaseBenchmarkRandomizedExtraNorm.txt");
        }

    } else {
        backgroundGenes = null;
        outputFile = new File("C:\\UMCG\\Genetica\\Projects\\GeneNetwork\\hpoDiseaseBenchmarkExtraNorm.txt");
    }

    final HashMap<String, ArrayList<String>> ncbiToEnsgMap = loadNcbiToEnsgMap(ncbiToEnsgMapFile);
    final HashMap<String, ArrayList<String>> hgncToEnsgMap = loadHgncToEnsgMap(hgncToEnsgMapFile);
    final HashSet<String> exludedHpo = loadHpoExclude(hposToExcludeFile);

    final SkewnessInfo skewnessInfo = new SkewnessInfo(skewnessFile);

    LinkedHashSet<String> significantTerms = loadSignificantTerms(significantTermsFile);

    DoubleMatrixDataset<String, String> predictionMatrix = DoubleMatrixDataset
            .loadDoubleData(predictionMatrixFile.getAbsolutePath());
    DoubleMatrixDataset<String, String> predictionMatrixSignificant = predictionMatrix
            .viewColSelection(significantTerms);

    DoubleMatrixDataset<String, String> predictionMatrixSignificantCorrelationMatrix = DoubleMatrixDataset
            .loadDoubleData(predictionMatrixCorrelationFile.getAbsolutePath());

    DiseaseGeneHpoData diseaseGeneHpoData = new DiseaseGeneHpoData(diseaseGeneHpoFile, ncbiToEnsgMap,
            hgncToEnsgMap, exludedHpo, new HashSet<>(predictionMatrix.getHashRows().keySet()), "OMIM");

    // NOTE: if one were to use a different background, this needs to be updated
    HashSet<String> diseaseGenes = new HashSet<>(diseaseGeneHpoData.getDiseaseGenes());

    if (randomize) {
        diseaseGeneHpoData = diseaseGeneHpoData.getPermutation(1, backgroundGenes);
    }

    for (String gene : diseaseGenes) {
        if (!predictionMatrixSignificant.containsRow(gene)) {
            throw new Exception("Error: " + gene);
        }
    }

    int[] mapGeneIndexToDiseaseGeneIndex = new int[predictionMatrix.rows()];
    ArrayList<String> predictedGenes = predictionMatrix.getRowObjects();

    int g2 = 0;
    for (int g = 0; g < predictedGenes.size(); ++g) {
        mapGeneIndexToDiseaseGeneIndex[g] = diseaseGenes.contains(predictedGenes.get(g)) ? g2++ : -1;
    }

    DoubleMatrixDataset<String, String> annotationnMatrix = DoubleMatrixDataset
            .loadDoubleData(annotationMatrixFile.getAbsolutePath());
    DoubleMatrixDataset<String, String> annotationMatrixSignificant = annotationnMatrix
            .viewColSelection(significantTerms);

    HashMap<String, MeanSd> hpoMeanSds = calculatePathayMeansOfAnnotatedGenes(predictionMatrixSignificant,
            annotationMatrixSignificant);

    Map<String, PredictionInfo> predictionInfo = HpoFinder.loadPredictionInfo(hpoPredictionInfoFile);

    Ontology hpoOntology = HpoFinder.loadHpoOntology(hpoOboFile);

    HpoFinder hpoFinder = new HpoFinder(hpoOntology, predictionInfo);

    final int totalGenes = predictionMatrixSignificant.rows();
    final int totalDiseaseGenes = diseaseGenes.size();
    final double[] geneScores = new double[totalGenes];
    final double[] geneScoresDiseaseGenes = new double[totalDiseaseGenes];
    final NaturalRanking naturalRanking = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.MAXIMUM);

    CSVWriter writer = new CSVWriter(new FileWriter(outputFile), '\t', '\0', '\0', "\n");

    String[] outputLine = new String[16];
    int c = 0;
    outputLine[c++] = "Disease";
    outputLine[c++] = "Gene";
    outputLine[c++] = "Hgnc";
    outputLine[c++] = "Rank";
    outputLine[c++] = "RankAmongDiseaseGenes";
    outputLine[c++] = "Z-score";
    outputLine[c++] = "HPO_skewness";
    outputLine[c++] = "Other_mean_skewness";
    outputLine[c++] = "Other_max_skewness";
    outputLine[c++] = "HPO_phenotypic_match_score";
    outputLine[c++] = "HPO_count";
    outputLine[c++] = "HPO_sum_auc";
    outputLine[c++] = "HPO_mean_auc";
    outputLine[c++] = "HPO_median_auc";
    outputLine[c++] = "HPO_terms";
    outputLine[c++] = "HPO_terms_match_score";
    writer.writeNext(outputLine);

    Random random = new Random(1);

    Mean meanCalculator = new Mean();
    Median medianCalculator = new Median();

    for (DiseaseGeneHpoData.DiseaseGene diseaseGene : diseaseGeneHpoData.getDiseaseGeneHpos()) {

        String gene = diseaseGene.getGene();
        String disease = diseaseGene.getDisease();

        if (!predictionMatrixSignificant.containsRow(gene)) {
            continue;
        }

        Set<String> geneHpos = diseaseGeneHpoData.getDiseaseEnsgHpos(diseaseGene);

        LinkedHashSet<String> geneHposPredictable = new LinkedHashSet<>();

        for (String hpo : geneHpos) {
            geneHposPredictable
                    .addAll(hpoFinder.getTermsToNames(hpoFinder.getPredictableTerms(hpo, correctedPCutoff)));
        }

        if (geneHposPredictable.isEmpty()) {
            continue;
        }

        //         if(geneHposPredictable.size() > 1){
        //            String hpoSelected = geneHposPredictable.toArray(new String[geneHposPredictable.size()])[random.nextInt(geneHposPredictable.size())];
        //            geneHposPredictable = new LinkedHashSet<>(1);
        //            geneHposPredictable.add(hpoSelected);
        //         }
        DoubleMatrixDataset<String, String> predictionCaseTerms = predictionMatrixSignificant
                .viewColSelection(geneHposPredictable);
        DoubleMatrix2D predictionCaseTermsMatrix = predictionCaseTerms.getMatrix();

        double denominator = Math.sqrt(geneHposPredictable.size());

        for (int g = 0; g < totalGenes; ++g) {
            geneScores[g] = predictionCaseTermsMatrix.viewRow(g).zSum() / denominator;
            if (Double.isNaN(geneScores[g])) {
                geneScores[g] = 0;
            }

            g2 = mapGeneIndexToDiseaseGeneIndex[g];
            if (g2 >= 0) {
                geneScoresDiseaseGenes[g2] = geneScores[g];
            }

        }

        double[] geneRanks = naturalRanking.rank(geneScores);
        int diseaseGeneIndex = predictionMatrixSignificant.getRowIndex(gene);

        double[] geneRanksDiseaseGenes = naturalRanking.rank(geneScoresDiseaseGenes);
        int diseaseGeneIndexInDiseaseGenesOnly = mapGeneIndexToDiseaseGeneIndex[diseaseGeneIndex];

        double zscore = geneScores[diseaseGeneIndex];
        double rank = (totalGenes - geneRanks[diseaseGeneIndex]) + 1;
        double rankAmongDiseaseGenes = (totalDiseaseGenes
                - geneRanksDiseaseGenes[diseaseGeneIndexInDiseaseGenesOnly]) + 1;

        double hpoPhenotypicMatchScore = 0;
        StringBuilder individualMatchScore = new StringBuilder();
        boolean notFirst = false;
        int usedHpos = 0;

        double[] aucs = new double[geneHposPredictable.size()];
        double sumAucs = 0;

        int i = 0;
        for (String hpo : geneHposPredictable) {

            usedHpos++;

            MeanSd hpoMeanSd = hpoMeanSds.get(hpo);

            double hpoPredictionZ = predictionMatrixSignificant.getElement(gene, hpo);

            double hpoPredictionOutlierScore = ((hpoPredictionZ - hpoMeanSd.getMean()) / hpoMeanSd.getSd());

            if (notFirst) {
                individualMatchScore.append(';');
            }
            notFirst = true;

            individualMatchScore.append(hpoPredictionOutlierScore);

            hpoPhenotypicMatchScore += hpoPredictionOutlierScore;

            aucs[i++] = predictionInfo.get(hpo).getAuc();
            sumAucs += predictionInfo.get(hpo).getAuc();

        }

        double meanAuc = meanCalculator.evaluate(aucs);
        double medianAuc = medianCalculator.evaluate(aucs);

        if (usedHpos == 0) {
            hpoPhenotypicMatchScore = Double.NaN;
        } else {
            hpoPhenotypicMatchScore = hpoPhenotypicMatchScore / usedHpos;
        }

        String symbol = ensgSymbolMapping.get(gene);
        if (symbol == null) {
            symbol = "";
        }

        c = 0;
        outputLine[c++] = disease;
        outputLine[c++] = gene;
        outputLine[c++] = symbol;
        outputLine[c++] = String.valueOf(rank);
        outputLine[c++] = String.valueOf(rankAmongDiseaseGenes);
        outputLine[c++] = String.valueOf(zscore);
        outputLine[c++] = String.valueOf(skewnessInfo.getHpoSkewness(gene));
        outputLine[c++] = String.valueOf(skewnessInfo.getMeanSkewnessExHpo(gene));
        outputLine[c++] = String.valueOf(skewnessInfo.getMaxSkewnessExHpo(gene));
        outputLine[c++] = String.valueOf(hpoPhenotypicMatchScore);
        outputLine[c++] = String.valueOf(geneHposPredictable.size());
        outputLine[c++] = String.valueOf(sumAucs);
        outputLine[c++] = String.valueOf(meanAuc);
        outputLine[c++] = String.valueOf(medianAuc);
        outputLine[c++] = String.join(";", geneHposPredictable);
        outputLine[c++] = individualMatchScore.toString();
        writer.writeNext(outputLine);

    }

    writer.close();

}

From source file:de.unileipzig.ub.indexer.App.java

public static void main(String[] args) throws IOException {

    // create Options object
    Options options = new Options();

    options.addOption("h", "help", false, "display this help");

    options.addOption("f", "filename", true, "name of the JSON file whose content should be indexed");
    options.addOption("i", "index", true, "the name of the target index");
    options.addOption("d", "doctype", true, "the name of the doctype (title, local, ...)");

    options.addOption("t", "host", true, "elasticsearch hostname (default: 0.0.0.0)");
    options.addOption("p", "port", true, "transport port (that's NOT the http port, default: 9300)");
    options.addOption("c", "cluster", true, "cluster name (default: elasticsearch_mdma)");

    options.addOption("b", "bulksize", true, "number of docs sent in one request (default: 3000)");
    options.addOption("v", "verbose", false, "show processing speed while indexing");
    options.addOption("s", "status", false, "only show status of index for file");

    options.addOption("r", "repair", false, "attempt to repair recoverable inconsistencies on the go");
    options.addOption("e", "debug", false, "set logging level to debug");
    options.addOption("l", "logfile", true, "logfile - in not specified only log to stdout");

    options.addOption("m", "memcached", true, "host and port of memcached (default: localhost:11211)");
    options.addOption("z", "latest-flag-on", true,
            "enable latest flag according to field (within content, e.g. 001)");
    options.addOption("a", "flat", false, "flat-mode: do not check for inconsistencies");

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException ex) {
        logger.error(ex);
        System.exit(1);
    }

    // setup logging
    Properties systemProperties = System.getProperties();
    systemProperties.put("net.spy.log.LoggerImpl", "net.spy.memcached.compat.log.Log4JLogger");
    System.setProperties(systemProperties);
    Logger.getLogger("net.spy.memcached").setLevel(Level.ERROR);

    Properties props = new Properties();
    props.load(props.getClass().getResourceAsStream("/log4j.properties"));

    if (cmd.hasOption("debug")) {
        props.setProperty("log4j.logger.de.unileipzig", "DEBUG");
    }

    if (cmd.hasOption("logfile")) {
        props.setProperty("log4j.rootLogger", "INFO, stdout, F");
        props.setProperty("log4j.appender.F", "org.apache.log4j.FileAppender");
        props.setProperty("log4j.appender.F.File", cmd.getOptionValue("logfile"));
        props.setProperty("log4j.appender.F.layout", "org.apache.log4j.PatternLayout");
        props.setProperty("log4j.appender.F.layout.ConversionPattern", "%5p | %d | %F | %L | %m%n");
    }

    PropertyConfigurator.configure(props);

    InetAddress addr = InetAddress.getLocalHost();
    String memcachedHostAndPort = addr.getHostAddress() + ":11211";
    if (cmd.hasOption("m")) {
        memcachedHostAndPort = cmd.getOptionValue("m");
    }

    // setup caching
    try {
        if (memcachedClient == null) {
            memcachedClient = new MemcachedClient(
                    new ConnectionFactoryBuilder().setFailureMode(FailureMode.Cancel).build(),
                    AddrUtil.getAddresses("0.0.0.0:11211"));
            try {
                // give client and server 300ms to establish the connection
                Thread.sleep(300);
            } catch (InterruptedException ex) {
            }

            Collection availableServers = memcachedClient.getAvailableServers();
            logger.info(availableServers);
            if (availableServers.size() == 0) {
                logger.info("no memcached servers found");
                memcachedClient.shutdown();
                memcachedClient = null;
            } else {
                logger.info(availableServers.size() + " memcached server(s) detected, fine.");
            }
        }
    } catch (IOException ex) {
        logger.warn("couldn't create a connection, bailing out: " + ex.getMessage());
    }

    // process options

    if (cmd.hasOption("h")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("indexer", options, true);
        quit(0);
    }

    boolean verbose = false;
    if (cmd.hasOption("verbose")) {
        verbose = true;
    }

    // ES options
    String[] hosts = new String[] { "0.0.0.0" };
    int port = 9300;
    String clusterName = "elasticsearch_mdma";
    int bulkSize = 3000;

    if (cmd.hasOption("host")) {
        hosts = cmd.getOptionValues("host");
    }
    if (cmd.hasOption("port")) {
        port = Integer.parseInt(cmd.getOptionValue("port"));
    }
    if (cmd.hasOption("cluster")) {
        clusterName = cmd.getOptionValue("cluster");
    }
    if (cmd.hasOption("bulksize")) {
        bulkSize = Integer.parseInt(cmd.getOptionValue("bulksize"));
        if (bulkSize < 1 || bulkSize > 100000) {
            logger.error("bulksize must be between 1 and 100,000");
            quit(1);
        }
    }

    // ES Client
    final Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", clusterName)
            .build();
    final TransportClient client = new TransportClient(settings);
    for (String host : hosts) {
        client.addTransportAddress(new InetSocketTransportAddress(host, port));
    }

    if (cmd.hasOption("filename") && cmd.hasOption("index") && cmd.hasOption("doctype")) {

        final String filename = cmd.getOptionValue("filename");

        final File _file = new File(filename);
        if (_file.length() == 0) {
            logger.info(_file.getAbsolutePath() + " is empty, skipping");
            quit(0); // file is empty
        }

        // for flat mode: leave a stampfile beside the json to 
        // indicate previous successful processing
        File directory = new File(filename).getParentFile();
        File stampfile = new File(directory, DigestUtils.shaHex(filename) + ".indexed");

        long start = System.currentTimeMillis();
        long lineCount = 0;

        final String indexName = cmd.getOptionValue("index");
        final String docType = cmd.getOptionValue("doctype");
        BulkRequestBuilder bulkRequest = client.prepareBulk();

        try {
            if (cmd.hasOption("flat")) {
                // flat mode
                // .........
                if (stampfile.exists()) {
                    logger.info("SKIPPING, since it seems this file has already " + "been imported (found: "
                            + stampfile.getAbsolutePath() + ")");
                    quit(0);
                }
            } else {

                final String srcSHA1 = extractSrcSHA1(filename);

                logger.debug(filename + " srcsha1: " + srcSHA1);

                long docsInIndex = getIndexedRecordCount(client, indexName, srcSHA1);
                logger.debug(filename + " indexed: " + docsInIndex);

                long docsInFile = getLineCount(filename);
                logger.debug(filename + " lines: " + docsInFile);

                // in non-flat-mode, indexing would take care
                // of inconsistencies
                if (docsInIndex == docsInFile) {
                    logger.info("UP-TO DATE: " + filename + " (" + docsInIndex + ", " + srcSHA1 + ")");
                    client.close();
                    quit(0);
                }

                if (docsInIndex > 0) {
                    logger.warn("INCONSISTENCY DETECTED: " + filename + ": indexed:" + docsInIndex + " lines:"
                            + docsInFile);

                    if (!cmd.hasOption("r")) {
                        logger.warn(
                                "Please re-run indexer with --repair flag or delete residues first with: $ curl -XDELETE "
                                        + hosts[0] + ":9200/" + indexName
                                        + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1
                                        + "\" }}'");
                        client.close();
                        quit(1);
                    } else {
                        logger.info("Attempting to clear residues...");
                        // attempt to repair once
                        DeleteByQueryResponse dbqr = client.prepareDeleteByQuery(indexName)
                                .setQuery(termQuery("meta.srcsha1", srcSHA1)).execute().actionGet();

                        Iterator<IndexDeleteByQueryResponse> it = dbqr.iterator();
                        long deletions = 0;
                        while (it.hasNext()) {
                            IndexDeleteByQueryResponse response = it.next();
                            deletions += 1;
                        }
                        logger.info("Deleted residues of " + filename);
                        logger.info("Refreshing [" + indexName + "]");
                        RefreshResponse refreshResponse = client.admin().indices()
                                .refresh(new RefreshRequest(indexName)).actionGet();

                        long indexedAfterDelete = getIndexedRecordCount(client, indexName, srcSHA1);
                        logger.info(indexedAfterDelete + " docs remained");
                        if (indexedAfterDelete > 0) {
                            logger.warn("Not all residues cleaned. Try to fix this manually: $ curl -XDELETE "
                                    + hosts[0] + ":9200/" + indexName
                                    + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1 + "\" }}'");
                            quit(1);
                        } else {
                            logger.info("Residues are gone. Now trying to reindex: " + filename);
                        }
                    }
                }
            }

            logger.info("INDEXING-REQUIRED: " + filename);
            if (cmd.hasOption("status")) {
                quit(0);
            }

            HashSet<String> idsInBatch = new HashSet<String>();

            String idField = null;
            if (cmd.hasOption("z")) {
                idField = cmd.getOptionValue("z");
            }

            final FileReader fr = new FileReader(filename);
            final BufferedReader br = new BufferedReader(fr);

            String line;
            // one line is one document
            while ((line = br.readLine()) != null) {

                // "Latest-Flag" machine
                // This gets obsolete with a "flat" index
                if (cmd.hasOption("z")) {
                    // flag that indicates, whether the document
                    // about to be indexed will be the latest
                    boolean willBeLatest = true;

                    // check if there is a previous (lower meta.timestamp) document with 
                    // the same identifier (whatever that may be - queried under "content")
                    final String contentIdentifier = getContentIdentifier(line, idField);
                    idsInBatch.add(contentIdentifier);

                    // assumed in meta.timestamp
                    final Long timestamp = Long.parseLong(getTimestamp(line));

                    logger.debug("Checking whether record is latest (line: " + lineCount + ")");
                    logger.debug(contentIdentifier + ", " + timestamp);

                    // get all docs, which match the contentIdentifier
                    // by filter, which doesn't score
                    final TermFilterBuilder idFilter = new TermFilterBuilder("content." + idField,
                            contentIdentifier);
                    final TermFilterBuilder kindFilter = new TermFilterBuilder("meta.kind", docType);
                    final AndFilterBuilder afb = new AndFilterBuilder();
                    afb.add(idFilter).add(kindFilter);
                    final FilteredQueryBuilder fb = filteredQuery(matchAllQuery(), afb);

                    final SearchResponse searchResponse = client.prepareSearch(indexName)
                            .setSearchType(SearchType.DFS_QUERY_THEN_FETCH).setQuery(fb).setFrom(0)
                            .setSize(1200) // 3 years and 105 days assuming daily updates at the most
                            .setExplain(false).execute().actionGet();

                    final SearchHits searchHits = searchResponse.getHits();

                    logger.debug("docs with this id in the index: " + searchHits.getTotalHits());

                    for (final SearchHit hit : searchHits.getHits()) {
                        final String docId = hit.id();
                        final Map<String, Object> source = hit.sourceAsMap();
                        final Map meta = (Map) source.get("meta");
                        final Long docTimestamp = Long.parseLong(meta.get("timestamp").toString());
                        // if the indexed doc timestamp is lower than the current one,
                        // remove any latest flag
                        if (timestamp >= docTimestamp) {
                            source.remove("latest");
                            final ObjectMapper mapper = new ObjectMapper();
                            // put the updated doc back
                            // IndexResponse response = 
                            client.prepareIndex(indexName, docType).setCreate(false).setId(docId)
                                    .setSource(mapper.writeValueAsBytes(source))
                                    .execute(new ActionListener<IndexResponse>() {
                                        public void onResponse(IndexResponse rspns) {
                                            logger.debug("Removed latest flag from " + contentIdentifier + ", "
                                                    + docTimestamp + ", " + hit.id() + " since (" + timestamp
                                                    + " > " + docTimestamp + ")");
                                        }

                                        public void onFailure(Throwable thrwbl) {
                                            logger.error("Could not remove flag from " + hit.id() + ", "
                                                    + contentIdentifier);
                                        }
                                    });
                            // .execute()
                            //.actionGet();
                        } else {
                            logger.debug("Doc " + hit.id() + " is newer (" + docTimestamp + ")");
                            willBeLatest = false;
                        }
                    }

                    if (willBeLatest) {
                        line = setLatestFlag(line);
                        logger.info("Setting latest flag on " + contentIdentifier + ", " + timestamp);
                    }

                    // end of latest-flag machine
                    // beware - this will be correct as long as there
                    // are no dups within one bulk!
                }

                bulkRequest.add(client.prepareIndex(indexName, docType).setSource(line));
                lineCount++;
                logger.debug("Added line " + lineCount + " to BULK");
                logger.debug(line);

                if (lineCount % bulkSize == 0) {

                    if (idsInBatch.size() != bulkSize && cmd.hasOption("z")) {
                        logger.error(
                                "This batch has duplicate IDs. That's not bad for the index, it just makes the latest flag fuzzy");
                        logger.error(
                                "Bulk size was: " + bulkSize + ", but only " + idsInBatch.size() + " unique IDs");
                    }
                    idsInBatch.clear();

                    logger.debug("Issuing BULK request");

                    final long actionCount = bulkRequest.numberOfActions();
                    final BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                    final long tookInMillis = bulkResponse.getTookInMillis();

                    if (bulkResponse.hasFailures()) {
                        logger.fatal("FAILED, bulk not indexed. exiting now.");
                        Iterator<BulkItemResponse> it = bulkResponse.iterator();
                        while (it.hasNext()) {
                            BulkItemResponse bir = it.next();
                            if (bir.isFailed()) {
                                Failure failure = bir.getFailure();
                                logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage()
                                        + ", type: " + failure.getType() + ", index: " + failure.getIndex());
                            }
                        }
                        quit(1);
                    } else {
                        if (verbose) {
                            final double elapsed = System.currentTimeMillis() - start;
                            final double speed = (lineCount / elapsed * 1000);
                            logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount
                                    + "/" + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)");
                        }
                    }
                    bulkRequest = client.prepareBulk();
                }
            }

            // handle the remaining items
            final long actionCount = bulkRequest.numberOfActions();
            if (actionCount > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                final long tookInMillis = bulkResponse.getTookInMillis();

                if (bulkResponse.hasFailures()) {
                    logger.fatal("FAILED, bulk not indexed. exiting now.");
                    Iterator<BulkItemResponse> it = bulkResponse.iterator();
                    while (it.hasNext()) {
                        BulkItemResponse bir = it.next();
                        if (bir.isFailed()) {
                            Failure failure = bir.getFailure();
                            logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage()
                                    + ", type: " + failure.getType() + ", index: " + failure.getIndex());
                        }
                    }
                    quit(1);
                } else {

                    // trigger update now
                    RefreshResponse refreshResponse = client.admin().indices()
                            .refresh(new RefreshRequest(indexName)).actionGet();

                    if (verbose) {
                        final double elapsed = System.currentTimeMillis() - start;
                        final double speed = (lineCount / elapsed * 1000);
                        logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount + "/"
                                + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)");
                    }

                }

            }

            br.close();
            client.close();
            final double elapsed = (System.currentTimeMillis() - start) / 1000.0; // avoid integer division
            final double speed = (lineCount / elapsed);
            logger.info("indexing (" + filename + ") " + lineCount + " docs took " + elapsed + "s (speed: "
                    + String.format("%.2f", speed) + "r/s)");
            if (cmd.hasOption("flat")) {
                try {
                    FileUtils.touch(stampfile);
                } catch (IOException ioe) {
                    logger.warn(".indexed files not created. Will reindex everything everytime.");
                }
            }
        } catch (IOException e) {
            client.close();
            logger.error(e);
            quit(1);
        } finally {
            client.close();
        }
    }
    quit(0);
}