List of usage examples for java.util.stream.Collectors.groupingBy
public static <T, K> Collector<T, ?, Map<K, List<T>>> groupingBy(Function<? super T, ? extends K> classifier)
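The classifier function maps each stream element to a key; elements that map to the same key are collected into a List under that key in the resulting Map. Before the project-specific examples below, here is a minimal, self-contained sketch of this one-argument overload. It is not taken from the source files that follow; the Person record and the sample data are hypothetical and used only for illustration.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GroupingByExample {
    // Hypothetical record used only for this sketch.
    record Person(String city, String name) {}

    public static void main(String[] args) {
        List<Person> people = List.of(
                new Person("Boston", "Ann"),
                new Person("Boston", "Bob"),
                new Person("Denver", "Carl"));

        // The classifier Person::city maps each element to its key;
        // the map values are Lists of the elements sharing that key.
        Map<String, List<Person>> byCity = people.stream()
                .collect(Collectors.groupingBy(Person::city));

        System.out.println(byCity); // {Boston=[Ann, Bob], Denver=[Carl]} (toString of the records)
    }
}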
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void trainAndTest(String trainDir, String testDir)
        throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    SBURoleTrain trainer = new SBURoleTrain(trainDir.concat("/train.ser"), isMultiClass);
    ArrayList<Sentence> trainData = (ArrayList<Sentence>) FileUtil
            .deserializeFromFile(trainDir.concat("/train.ser"));
    if (isMultiClass) {
        trainer.trainMultiClassClassifier(trainDir);
    } else {
        trainer.trainBinaryClassifier(trainDir);
    }
    FileUtil.serializeToFile(trainData, trainDir.concat("/train.ser"));

    SBURolePredict predict = new SBURolePredict(trainDir, testDir.concat("/test.arggold.ser"), isMultiClass);
    predict.performPrediction(testDir.concat("/test.arggold.ser"));

    ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
            .deserializeFromFile(testDir.concat("/test.argpredict.ser"));
    // Group the predicted sentences by their process name.
    Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
            .collect(Collectors.groupingBy(Sentence::getProcessName));

    ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
    SentenceUtil.flushDataToJSON(jsonData, testDir.concat("/test.srlout.json"), false);
    SentenceUtil.flushDataToJSON(jsonData, testDir.concat("/test.srlpredict.json"), true);
}
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void knowledgeExtractor()
        throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    boolean dirCreated = FileUtil.mkDir(outputDir);
    dirCreated = FileUtil.mkDir(outputDir.concat("/train"));
    dirCreated = FileUtil.mkDir(outputDir.concat("/test"));
    if (dirCreated) { // this is not a good check; leave it for now
        // TRAINING
        sentences = (ArrayList<Sentence>) sentences.stream().filter(data -> data.isAnnotated())
                .collect(Collectors.toList());
        FileUtil.serializeToFile(sentences, outputDir.concat("/train/train.ser"));
        SBURoleTrain trainer = new SBURoleTrain(outputDir.concat("/train/train.ser"), isMultiClass);
        trainer.train(outputDir.concat("/train"));
        FileUtil.serializeToFile(sentences, outputDir.concat("/train/train.ser"));

        // Read the knowledge sentences using the SPOCK data reader
        SpockDataReader reader = new SpockDataReader(testingFileName, configFileName, true); // process, config, isTesting
        reader.readData();
        ArrayList<Sentence> testSentences = reader.getSentences();
        FileUtil.serializeToFile(testSentences, outputDir.concat("/test/test.ser"));

        SBURolePredict predict = new SBURolePredict(outputDir.concat("/train"),
                outputDir.concat("/test/test.ser"), isMultiClass);
        predict.knownAnnotation = false;
        predict.performPrediction(outputDir.concat("/test/test.ser"));

        ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
                .deserializeFromFile(outputDir.concat("/test/predict.ser"));
        // Group the predicted sentences by their process name.
        Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
                .collect(Collectors.groupingBy(Sentence::getProcessName));
        ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
        SentenceUtil.flushDataToJSON(jsonData, outputDir.concat("/test/srlpredict.json"), true);
    }
}
From source file:sbu.srl.rolextract.SpockDataReader.java
public void readData() throws FileNotFoundException, IOException {
    List<String[]> data = new ArrayList<>();
    data = FileUtil.readDataObject(processFileName, "\t");
    mapFieldIdx(data.get(0));
    data = data.subList(1, data.size());

    // Group the data rows by their sentence column so that all role
    // annotations belonging to the same sentence are processed together.
    final Map<String, List<String[]>> sentenceMap = data.stream()
            .collect(Collectors.groupingBy(row -> row[fieldIdxMap.get("sentence")]));

    String[] roles = fieldMap.get("role").split(":");
    int totalUniqueSentence = sentenceMap.keySet().size();
    int sentProcessed = 0;
    System.out.println("TOTAL UNIQUE SENTENCE : " + totalUniqueSentence);

    for (String sentenceStr : sentenceMap.keySet()) {
        Sentence sentence = new Sentence(sentenceStr);
        boolean isAnnotated = false;
        sentence.setRawText(sentenceStr);
        sentence.setProcess(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("process")]);
        HashMap<String, ArrayList<ArgumentSpan>> roleAnnotationSpan = new HashMap<String, ArrayList<ArgumentSpan>>();
        for (String[] sentenceData : sentenceMap.get(sentenceStr)) {
            // Adding argument spans here
            for (int i = 0; i < roles.length; i++) {
                int roleColumnIdx = fieldIdxMap.get(roles[i]);
                ArrayList<ArgumentSpan> spans = new ArrayList<ArgumentSpan>();
                if (sentenceData[roleColumnIdx].length() > 0) { // the role filler is not empty
                    // Set role filler
                    List<String> tokens = StanfordTokenizerSingleton.getInstance()
                            .tokenize(sentenceData[roleColumnIdx].trim());
                    List<String> tokenizedRawText = StanfordTokenizerSingleton.getInstance()
                            .tokenize(sentence.getRawText());
                    String[] pattern = new String[tokens.size()];
                    tokens.toArray(pattern);
                    ArrayList<Integer> matchIdxs = getIdxMatchesv2(pattern,
                            tokenizedRawText.toArray(new String[tokenizedRawText.size()]));
                    DependencyTree tree = StanfordDepParserSingleton.getInstance().parse(sentence.getRawText());
                    ArrayList<DependencyNode> arrDepNodes = new ArrayList<DependencyNode>();
                    if (matchIdxs != null) {
                        for (int j = 1; j <= tree.lastKey(); j++) {
                            if (matchIdxs.contains(j)) {
                                arrDepNodes.add(tree.get(j));
                            }
                        }
                    }
                    ArgumentSpan span = new ArgumentSpan(arrDepNodes, roles[i]);
                    int annotationIdx = fieldIdxMap.get("is" + roles[i]);
                    if (sentenceData[annotationIdx].length() > 0) { // the row contains an annotation
                        if (matchIdxs != null) {
                            isAnnotated = true;
                            if (sentenceData[annotationIdx].equalsIgnoreCase("1")) {
                                span.setAnnotatedLabel("1");
                            } else {
                                span.setAnnotatedLabel("-1");
                            }
                        }
                        span.setPattern(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("pattern")]);
                        spans.add(span);
                    }
                    // IF THIS IS A TESTING FILE THEN LABEL IT AS -1
                    if (isTestingFile) {
                        if (matchIdxs != null) {
                            isAnnotated = true;
                            span.setAnnotatedLabel("-1");
                            span.setPattern(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("pattern")]);
                            spans.add(span);
                        }
                    }
                }
                if (roleAnnotationSpan.get(roles[i]) == null) {
                    roleAnnotationSpan.put(roles[i], spans);
                } else {
                    ArrayList<ArgumentSpan> existingSpans = roleAnnotationSpan.get(roles[i]);
                    existingSpans.addAll(spans);
                    roleAnnotationSpan.put(roles[i], existingSpans);
                }
            }
        }
        if (skipNotAnnotated) {
            if (isAnnotated) {
                sentence.setAnnotated(isAnnotated);
                sentence.setRoleArgAnnotation(roleAnnotationSpan);
                sentences.add(sentence);
            }
        } else {
            sentence.setRoleArgAnnotation(roleAnnotationSpan);
            sentences.add(sentence);
        }
        System.out.println("Sentence processed : " + (++sentProcessed));
    }

    // SET THE ID
    for (int i = 0; i < sentences.size(); i++) {
        sentences.get(i).setId(i);
        int argId = 0;
        ArrayList<ArgumentSpan> spans = sentences.get(i).getAllAnnotatedArgumentSpan();
        for (ArgumentSpan span : spans) {
            span.setId(argId++);
        }
    }
}
From source file:sbu.srl.rolextract.SpockDataReader.java
public void dumpFrameElements(String fileName) throws FileNotFoundException {
    PrintWriter writer = new PrintWriter(fileName);
    Set<String> labels = getRoleLabels();
    Map<String, List<Sentence>> processSentPair = sentences.stream()
            .collect(Collectors.groupingBy(s -> s.getProcessName()));
    for (String process : processSentPair.keySet()) {
        writer.println(process + "\t" + String.join(":", labels));
    }
    writer.close();
}
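Note that the lambda classifier s -> s.getProcessName() used here and in the next example is equivalent to the method reference Sentence::getProcessName used in the ArgumentClassifier examples above; both produce a Map keyed by process name with a List of the sentences belonging to each process.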
From source file:sbu.srl.rolextract.SpockDataReader.java
public void generateLexicalUnitFile(String dirName, int frameStartID, int luStartID) throws IOException {
    boolean success = FileUtil.mkDir(dirName);
    if (success) {
        // iterate through each process, give them ID
        int frameID = frameStartID;
        int luID = luStartID;
        Map<String, List<Sentence>> procSentPair = sentences.stream()
                .collect(Collectors.groupingBy(s -> s.getProcessName()));
        for (String process : procSentPair.keySet()) {
            System.out.println(process + "frameID " + (frameID));
            List<Sentence> sentenceArr = procSentPair.get(process);
            for (int i = 0; i < sentenceArr.size(); i++) {
                Sentence currentSent = sentenceArr.get(i);
                String lu = currentSent.getLexicalUnitFrame();
                System.out.println("luID " + (luID));
                // Create file here
                PrintWriter xmlWriter = new PrintWriter(dirName + "/lu" + luID + ".xml");
                xmlWriter.println("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
                        + "<?xml-stylesheet type=\"text/xsl\" href=\"lexUnit.xsl\"?>\n"
                        + "<lexUnit status=\"Finished_Initial\" POS=\"N\" name=\"" + lu + "\" ID=\"" + luID
                        + "\" frame=\"" + process + "\" frameID=\"" + frameID
                        + "\" totalAnnotated=\"13\" xsi:schemaLocation=\"../schema/lexUnit.xsd\" xmlns=\"http://framenet.icsi.berkeley.edu\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n"
                        + "    <header>\n"
                        + "        <frame>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"9400D3\" type=\"Core\" abbrev=\"res\" name=\"result\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"00008B\" type=\"Core\" abbrev=\"trig\" name=\"trigger\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"FFA500\" type=\"Core\" abbrev=\"ena\" name=\"enabler\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"0000FF\" type=\"Core\" abbrev=\"und\" name=\"undergoer\"/>\n"
                        + "        </frame>\n"
                        + "    </header>\n"
                        + "</lexUnit>");
                xmlWriter.close();
                luID++;
            }
            frameID++;
        }
    }
    // extract the lexical unit, give them ID
    // create the XML file as well
}