List of usage examples for java.util.stream.Collectors.groupingBy
public static <T, K> Collector<T, ?, Map<K, List<T>>> groupingBy(Function<? super T, ? extends K> classifier)
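The classifier function maps each stream element to a key; elements that map to the same key are collected into a List under that key in the resulting Map. Before the project-specific examples below, here is a minimal, self-contained sketch of this one-argument overload. It is not taken from the source files that follow; the Person record and the sample data are hypothetical and used only for illustration.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GroupingByExample {
    // Hypothetical record used only for this sketch.
    record Person(String city, String name) {}

    public static void main(String[] args) {
        List<Person> people = List.of(
                new Person("Boston", "Ann"),
                new Person("Boston", "Bob"),
                new Person("Denver", "Carl"));

        // The classifier Person::city maps each element to its key;
        // the map values are Lists of the elements sharing that key.
        Map<String, List<Person>> byCity = people.stream()
                .collect(Collectors.groupingBy(Person::city));

        System.out.println(byCity); // {Boston=[Ann, Bob], Denver=[Carl]} (toString of the records)
    }
}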
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void trainAndTest(String trainDir, String testDir)
        throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    SBURoleTrain trainer = new SBURoleTrain(trainDir.concat("/train.ser"), isMultiClass);
    ArrayList<Sentence> trainData = (ArrayList<Sentence>) FileUtil
            .deserializeFromFile(trainDir.concat("/train.ser"));
    if (isMultiClass) {
        trainer.trainMultiClassClassifier(trainDir);
    } else {
        trainer.trainBinaryClassifier(trainDir);
    }
    FileUtil.serializeToFile(trainData, trainDir.concat("/train.ser"));

    SBURolePredict predict = new SBURolePredict(trainDir, testDir.concat("/test.arggold.ser"), isMultiClass);
    predict.performPrediction(testDir.concat("/test.arggold.ser"));

    ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
            .deserializeFromFile(testDir.concat("/test.argpredict.ser"));
    // Group the predicted sentences by their process name.
    Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
            .collect(Collectors.groupingBy(Sentence::getProcessName));

    ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
    SentenceUtil.flushDataToJSON(jsonData, testDir.concat("/test.srlout.json"), false);
    SentenceUtil.flushDataToJSON(jsonData, testDir.concat("/test.srlpredict.json"), true);
}
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void knowledgeExtractor()
        throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    boolean dirCreated = FileUtil.mkDir(outputDir);
    dirCreated = FileUtil.mkDir(outputDir.concat("/train"));
    dirCreated = FileUtil.mkDir(outputDir.concat("/test"));
    if (dirCreated) { // this is not a good check; leave it for now
        // TRAINING
        sentences = (ArrayList<Sentence>) sentences.stream().filter(data -> data.isAnnotated())
                .collect(Collectors.toList());
        FileUtil.serializeToFile(sentences, outputDir.concat("/train/train.ser"));
        SBURoleTrain trainer = new SBURoleTrain(outputDir.concat("/train/train.ser"), isMultiClass);
        trainer.train(outputDir.concat("/train"));
        FileUtil.serializeToFile(sentences, outputDir.concat("/train/train.ser"));

        // Read the knowledge sentences using the SPOCK data reader
        SpockDataReader reader = new SpockDataReader(testingFileName, configFileName, true); // process, config, isTesting
        reader.readData();
        ArrayList<Sentence> testSentences = reader.getSentences();
        FileUtil.serializeToFile(testSentences, outputDir.concat("/test/test.ser"));

        SBURolePredict predict = new SBURolePredict(outputDir.concat("/train"),
                outputDir.concat("/test/test.ser"), isMultiClass);
        predict.knownAnnotation = false;
        predict.performPrediction(outputDir.concat("/test/test.ser"));

        ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
                .deserializeFromFile(outputDir.concat("/test/predict.ser"));
        // Group the predicted sentences by their process name.
        Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
                .collect(Collectors.groupingBy(Sentence::getProcessName));
        ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
        SentenceUtil.flushDataToJSON(jsonData, outputDir.concat("/test/srlpredict.json"), true);
    }
}
From source file:sbu.srl.rolextract.SpockDataReader.java
public void readData() throws FileNotFoundException, IOException {
    List<String[]> data = new ArrayList<>();
    data = FileUtil.readDataObject(processFileName, "\t");
    mapFieldIdx(data.get(0));
    data = data.subList(1, data.size());

    // Group the data rows by their sentence column so that all role
    // annotations belonging to the same sentence are processed together.
    final Map<String, List<String[]>> sentenceMap = data.stream()
            .collect(Collectors.groupingBy(row -> row[fieldIdxMap.get("sentence")]));

    String[] roles = fieldMap.get("role").split(":");
    int totalUniqueSentence = sentenceMap.keySet().size();
    int sentProcessed = 0;
    System.out.println("TOTAL UNIQUE SENTENCE : " + totalUniqueSentence);

    for (String sentenceStr : sentenceMap.keySet()) {
        Sentence sentence = new Sentence(sentenceStr);
        boolean isAnnotated = false;
        sentence.setRawText(sentenceStr);
        sentence.setProcess(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("process")]);
        HashMap<String, ArrayList<ArgumentSpan>> roleAnnotationSpan = new HashMap<String, ArrayList<ArgumentSpan>>();
        for (String[] sentenceData : sentenceMap.get(sentenceStr)) {
            // Adding argument spans here
            for (int i = 0; i < roles.length; i++) {
                int roleColumnIdx = fieldIdxMap.get(roles[i]);
                ArrayList<ArgumentSpan> spans = new ArrayList<ArgumentSpan>();
                if (sentenceData[roleColumnIdx].length() > 0) { // the role filler is not empty
                    // Set role filler
                    List<String> tokens = StanfordTokenizerSingleton.getInstance()
                            .tokenize(sentenceData[roleColumnIdx].trim());
                    List<String> tokenizedRawText = StanfordTokenizerSingleton.getInstance()
                            .tokenize(sentence.getRawText());
                    String[] pattern = new String[tokens.size()];
                    tokens.toArray(pattern);
                    ArrayList<Integer> matchIdxs = getIdxMatchesv2(pattern,
                            tokenizedRawText.toArray(new String[tokenizedRawText.size()]));
                    DependencyTree tree = StanfordDepParserSingleton.getInstance().parse(sentence.getRawText());
                    ArrayList<DependencyNode> arrDepNodes = new ArrayList<DependencyNode>();
                    if (matchIdxs != null) {
                        for (int j = 1; j <= tree.lastKey(); j++) {
                            if (matchIdxs.contains(j)) {
                                arrDepNodes.add(tree.get(j));
                            }
                        }
                    }
                    ArgumentSpan span = new ArgumentSpan(arrDepNodes, roles[i]);
                    int annotationIdx = fieldIdxMap.get("is" + roles[i]);
                    if (sentenceData[annotationIdx].length() > 0) { // the row contains an annotation
                        if (matchIdxs != null) {
                            isAnnotated = true;
                            if (sentenceData[annotationIdx].equalsIgnoreCase("1")) {
                                span.setAnnotatedLabel("1");
                            } else {
                                span.setAnnotatedLabel("-1");
                            }
                        }
                        span.setPattern(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("pattern")]);
                        spans.add(span);
                    }
                    // IF THIS IS A TESTING FILE THEN LABEL IT AS -1
                    if (isTestingFile) {
                        if (matchIdxs != null) {
                            isAnnotated = true;
                            span.setAnnotatedLabel("-1");
                            span.setPattern(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("pattern")]);
                            spans.add(span);
                        }
                    }
                }
                if (roleAnnotationSpan.get(roles[i]) == null) {
                    roleAnnotationSpan.put(roles[i], spans);
                } else {
                    ArrayList<ArgumentSpan> existingSpans = roleAnnotationSpan.get(roles[i]);
                    existingSpans.addAll(spans);
                    roleAnnotationSpan.put(roles[i], existingSpans);
                }
            }
        }
        if (skipNotAnnotated) {
            if (isAnnotated) {
                sentence.setAnnotated(isAnnotated);
                sentence.setRoleArgAnnotation(roleAnnotationSpan);
                sentences.add(sentence);
            }
        } else {
            sentence.setRoleArgAnnotation(roleAnnotationSpan);
            sentences.add(sentence);
        }
        System.out.println("Sentence processed : " + (++sentProcessed));
    }

    // SET THE ID
    for (int i = 0; i < sentences.size(); i++) {
        sentences.get(i).setId(i);
        int argId = 0;
        ArrayList<ArgumentSpan> spans = sentences.get(i).getAllAnnotatedArgumentSpan();
        for (ArgumentSpan span : spans) {
            span.setId(argId++);
        }
    }
}
From source file:sbu.srl.rolextract.SpockDataReader.java
public void dumpFrameElements(String fileName) throws FileNotFoundException {
    PrintWriter writer = new PrintWriter(fileName);
    Set<String> labels = getRoleLabels();
    Map<String, List<Sentence>> processSentPair = sentences.stream()
            .collect(Collectors.groupingBy(s -> s.getProcessName()));
    for (String process : processSentPair.keySet()) {
        writer.println(process + "\t" + String.join(":", labels));
    }
    writer.close();
}
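Note that the lambda classifier s -> s.getProcessName() used here and in the next example is equivalent to the method reference Sentence::getProcessName used in the ArgumentClassifier examples above; both produce a Map keyed by process name with a List of the sentences belonging to each process.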
From source file:sbu.srl.rolextract.SpockDataReader.java
public void generateLexicalUnitFile(String dirName, int frameStartID, int luStartID) throws IOException {
    boolean success = FileUtil.mkDir(dirName);
    if (success) {
        // iterate through each process, give them ID
        int frameID = frameStartID;
        int luID = luStartID;
        Map<String, List<Sentence>> procSentPair = sentences.stream()
                .collect(Collectors.groupingBy(s -> s.getProcessName()));
        for (String process : procSentPair.keySet()) {
            System.out.println(process + "frameID " + (frameID));
            List<Sentence> sentenceArr = procSentPair.get(process);
            for (int i = 0; i < sentenceArr.size(); i++) {
                Sentence currentSent = sentenceArr.get(i);
                String lu = currentSent.getLexicalUnitFrame();
                System.out.println("luID " + (luID));
                // Create file here
                PrintWriter xmlWriter = new PrintWriter(dirName + "/lu" + luID + ".xml");
                xmlWriter.println("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
                        + "<?xml-stylesheet type=\"text/xsl\" href=\"lexUnit.xsl\"?>\n"
                        + "<lexUnit status=\"Finished_Initial\" POS=\"N\" name=\"" + lu + "\" ID=\"" + luID
                        + "\" frame=\"" + process + "\" frameID=\"" + frameID
                        + "\" totalAnnotated=\"13\" xsi:schemaLocation=\"../schema/lexUnit.xsd\" xmlns=\"http://framenet.icsi.berkeley.edu\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n"
                        + "    <header>\n"
                        + "        <frame>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"9400D3\" type=\"Core\" abbrev=\"res\" name=\"result\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"00008B\" type=\"Core\" abbrev=\"trig\" name=\"trigger\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"FFA500\" type=\"Core\" abbrev=\"ena\" name=\"enabler\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"0000FF\" type=\"Core\" abbrev=\"und\" name=\"undergoer\"/>\n"
                        + "        </frame>\n"
                        + "    </header>\n"
                        + "</lexUnit>");
                xmlWriter.close();
                luID++;
            }
            frameID++;
        }
    }
    // extract the lexical unit, give them ID
    // create the XML file as well
}