List of usage examples for java.util.stream.Collectors.groupingBy
public static <T, K> Collector<T, ?, Map<K, List<T>>> groupingBy(Function<? super T, ? extends K> classifier)
From source file:org.wso2.is.portal.user.client.api.ChallengeQuestionManagerClientServiceImpl.java
@Override public List<ChallengeQuestionSetEntry> getChallengeQuestionList(String userUniqueId) throws IdentityRecoveryException, IdentityStoreException, UserNotFoundException { List<ChallengeQuestionSetEntry> challengeQuestionSetEntryList = new ArrayList<ChallengeQuestionSetEntry>(); if (challengeQuestionManager == null || realmService == null) { throw new IdentityRecoveryException("Challenge question manager or Realm service is not available."); }//from w w w.j a v a2 s . c om User user = realmService.getIdentityStore().getUser(userUniqueId); List<ChallengeQuestion> challengeQuestions = challengeQuestionManager.getAllChallengeQuestionsForUser(user); Map<String, List<ChallengeQuestion>> groupedChallengeQuestionMap = challengeQuestions.stream() .collect(Collectors.groupingBy(ChallengeQuestion::getQuestionSetId)); for (Map.Entry<String, List<ChallengeQuestion>> entry : groupedChallengeQuestionMap.entrySet()) { ChallengeQuestionSetEntry challengeQuestionSetEntry = new ChallengeQuestionSetEntry(); challengeQuestionSetEntry.setChallengeQuestionSetId(encodeChallengeQuestionSetId(entry.getKey())); List<ChallengeQuestion> encodedSetIdChallengeQuestionsList = entry.getValue().stream() .map(challengeQuestion -> { challengeQuestion.setQuestionSetId( encodeChallengeQuestionSetId(challengeQuestion.getQuestionSetId())); return challengeQuestion; }).collect(Collectors.toList()); challengeQuestionSetEntry.setChallengeQuestionList(encodedSetIdChallengeQuestionsList); challengeQuestionSetEntryList.add(challengeQuestionSetEntry); } return challengeQuestionSetEntryList; }
From source file:pl.prutkowski.java.playground.java8.TestCollectors.java
/** * @param args the command line arguments *///from w w w . j a v a 2 s. c om public static void main(String[] args) { Map<String, Integer> monthByLen = months.stream() .collect(Collectors.toMap(String::toUpperCase, m -> StringUtils.countMatches(m, "e"))); monthByLen.forEach((month, eCount) -> System.out.println(month + " -> " + eCount)); System.out.println("---------------------------------"); Map<Object, List<String>> monthByLen2 = months.stream() .collect(Collectors.groupingBy(m -> StringUtils.countMatches(m, "e"))); monthByLen2.forEach((count, groupedMonths) -> System.out.println(count + " -> " + groupedMonths)); System.out.println("---------------------------------"); Double averageLength = months.stream().collect(Collectors.averagingDouble(String::length)); System.out.println("Average length: " + averageLength); System.out.println("---------------------------------"); Double max = months.stream().collect(Collectors.summarizingDouble(String::length)).getMax(); System.out.println("Max length: " + max); System.out.println("---------------------------------"); String reduced = months.stream().collect(Collectors.reducing((m1, m2) -> (m1 + ", " + m2))).get(); System.out.println("Reduced: " + reduced); System.out.println("---------------------------------"); System.out.println(String.join(", ", months)); System.out.println("---------------------------------"); List<String> monthsWithZ = months.stream().filter(m -> m.contains("z")).collect(new ListCollector<>()); System.out.println(monthsWithZ); }
From source file:sbu.srl.rolextract.ArgumentClassifier.java
/**
 * Trains and predicts on every cross-validation fold found under
 * {@code outputDir}, then writes the predictions of each fold as JSON files
 * into the fold's {@code test} directory.
 *
 * <p>For fold {@code k} (1-based) the directories {@code <outputDir>/fold-k/train}
 * and {@code <outputDir>/fold-k/test} are used. The predicted sentences are read
 * back from {@code test.argpredict.ser}, grouped by process name and flushed to
 * the JSON output files.
 *
 * @param outputDir       root directory containing the {@code fold-k} directories
 * @param crossValidation number of folds to process
 */
public void performCrossValidation(String outputDir, int crossValidation)
        throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    for (int fold = 1; fold <= crossValidation; fold++) {
        File trainFoldDir = new File(outputDir.concat("/fold-" + fold + "/train"));
        File testFoldDir = new File(outputDir.concat("/fold-" + fold + "/test"));
        String trainPath = trainFoldDir.getAbsolutePath();
        String testPath = testFoldDir.getAbsolutePath();
        String goldFile = testPath + "/test.arggold.ser";

        SBURoleTrain trainer = new SBURoleTrain(trainPath + "/train.ser", isMultiClass);
        trainer.train(trainPath);

        SBURolePredict predict = new SBURolePredict(trainPath, goldFile, isMultiClass);
        predict.performPrediction(goldFile);

        ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
                .deserializeFromFile(testPath + "/test.argpredict.ser");
        Map<String, List<Sentence>> sentencesByProcess = predictedSentences.stream()
                .collect(Collectors.groupingBy(Sentence::getProcessName));
        ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(sentencesByProcess);

        SentenceUtil.flushDataToJSON(jsonData, testPath + "/test.srlout.json", false);
        String[] predictionFiles = {
            "/test.srlpredict.json",
            "/test.ilppredict.json",
            "/test.semaforpredict.json",
            "/test.easysrlpredict.json",
        };
        for (String predictionFile : predictionFiles) {
            SentenceUtil.flushDataToJSON(jsonData, testPath + predictionFile, true);
        }

        // A separate EasySRL prediction step used to run here and rebuild jsonData; it is
        // disabled, so this second flush re-writes the same data to the same file.
        SentenceUtil.flushDataToJSON(jsonData, testPath + "/test.easysrlpredict.json", true);
    }
}
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void performAblation(String outputDir, int crossValidation) throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, InterruptedException { ArrayList<String> triedFeatures = new ArrayList<String>( Arrays.asList(FileUtil.readLinesFromFile("./configSBUProcRel/features.ori"))); List<String> ablationFeatures = getAblationFeatures("./configSBUProcRel/features.ablation"); for (int idxAblation = 0; idxAblation < ablationFeatures.size(); idxAblation++) { System.out.println("Removing features : " + ablationFeatures.get(idxAblation)); Thread.sleep(3000);//from ww w . java 2 s. c o m List<String> removedFeatures = Arrays.asList(ablationFeatures.get(idxAblation).split(",")); triedFeatures.removeAll(removedFeatures); FileUtil.dumpToFile(triedFeatures, "./configSBUProcRel/features"); for (int idxFold = 1; idxFold <= crossValidation; idxFold++) { File trainFoldDir = new File(outputDir.concat("/fold-").concat("" + idxFold).concat("/train")); File testFoldDir = new File(outputDir.concat("/fold-").concat("" + idxFold).concat("/test")); SBURoleTrain trainer = new SBURoleTrain(trainFoldDir.getAbsolutePath().concat("/train.ser"), isMultiClass); trainer.train(trainFoldDir.getAbsolutePath()); SBURolePredict predict = new SBURolePredict(trainFoldDir.getAbsolutePath(), testFoldDir.getAbsolutePath().concat("/test.arggold.ser"), isMultiClass); predict.performPrediction(testFoldDir.getAbsolutePath().concat("/test.arggold.ser")); ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil .deserializeFromFile(testFoldDir.getAbsolutePath().concat("/test.argpredict.ser")); Map<String, List<Sentence>> groupByProcess = predictedSentences.stream() .collect(Collectors.groupingBy(Sentence::getProcessName)); ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess); SentenceUtil.flushDataToJSON(jsonData, 
testFoldDir.getAbsolutePath().concat("/test.srlout.json"), false); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlpredict.json"), true); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.ilppredict.json"), true); // dummy SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.semaforpredict.json"), true);// dummy SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.easysrlpredict.json"), true);// dummy } // copy all data to ILP's data folder // cp -r outputDir /home/slouvan/NetBeansProjects/ILP/data/ try { ProcessBuilder pb = new ProcessBuilder( "/home/slouvan/NetBeansProjects/SRL-Integrated/script/cpDir.sh", outputDir, "/home/slouvan/NetBeansProjects/ILP/data/"); //pb.environment().put("param1", ) Process p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. StdUtil.printOutput(p); pb = new ProcessBuilder("/usr/bin/python", "/home/slouvan/NetBeansProjects/ILP/evaluate.py"); p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. StdUtil.printOutput(p); System.out.println("Script executed successfully"); } catch (Exception e) { e.printStackTrace(); } String[] lines = FileUtil.readLinesFromFile("/home/slouvan/NetBeansProjects/ILP/stats.txt"); PrintWriter out = new PrintWriter( new BufferedWriter(new FileWriter(GlobalV.PROJECT_DIR + "/ablationNew.txt", true))); //more code out.println((new Date()).toString() + " Removed features " + removedFeatures); out.println("Eval : " + Arrays.toString(lines)); out.close(); triedFeatures.addAll(removedFeatures); } }
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void performGreedySearch(String outputDir, int crossValidation) throws FileNotFoundException, IOException, ClassNotFoundException, NoSuchMethodException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, InterruptedException { // availFeatures = Get all available features) List<String> availableFeatures = new ArrayList<String>( Arrays.asList(FileUtil.readLinesFromFile("./configSBUProcRel/features"))); int nbFeat = availableFeatures.size(); ArrayList<String> triedFeatures = Lists.newArrayList(); while (triedFeatures.size() < nbFeat) { double maxF1 = 0.0; String bestFeat = ""; for (int i = 0; i < availableFeatures.size(); i++) { String nextFeat = availableFeatures.get(i); System.out.println("Trying with " + nextFeat); Thread.sleep(5000);// ww w. ja v a 2s. co m triedFeatures.add(nextFeat); FileUtil.dumpToFile(triedFeatures, "./configSBUProcRel/features"); for (int j = 1; j <= 1; j++) { File trainFoldDir = new File(outputDir.concat("/fold-").concat("" + j).concat("/train")); File testFoldDir = new File(outputDir.concat("/fold-").concat("" + j).concat("/test")); SBURoleTrain trainer = new SBURoleTrain(trainFoldDir.getAbsolutePath().concat("/train.ser"), isMultiClass); trainer.train(trainFoldDir.getAbsolutePath()); SBURolePredict predict = new SBURolePredict(trainFoldDir.getAbsolutePath(), testFoldDir.getAbsolutePath().concat("/test.arggold.ser"), isMultiClass); predict.performPrediction(testFoldDir.getAbsolutePath().concat("/test.arggold.ser")); ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil .deserializeFromFile(testFoldDir.getAbsolutePath().concat("/test.argpredict.ser")); Map<String, List<Sentence>> groupByProcess = predictedSentences.stream() .collect(Collectors.groupingBy(Sentence::getProcessName)); ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlout.json"), false); 
SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlpredict.json"), true); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.ilppredict.json"), true); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.semaforpredict.json"), true); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.easysrlpredict.json"), true); } // copy all data to ILP's data folder // cp -r outputDir /home/slouvan/NetBeansProjects/ILP/data/ try { ProcessBuilder pb = new ProcessBuilder( "/home/slouvan/NetBeansProjects/SRL-Integrated/script/cpDir.sh", outputDir, "/home/slouvan/NetBeansProjects/ILP/data/"); //pb.environment().put("param1", ) Process p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. StdUtil.printOutput(p); pb = new ProcessBuilder("/usr/bin/python", "/home/slouvan/NetBeansProjects/ILP/evaluate.py"); p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. 
StdUtil.printOutput(p); System.out.println("Script executed successfully"); } catch (Exception e) { e.printStackTrace(); } String[] lines = FileUtil.readLinesFromFile("/home/slouvan/NetBeansProjects/ILP/f1.txt"); double currentF1 = Double.parseDouble(lines[0]); if (currentF1 > maxF1) { maxF1 = currentF1; bestFeat = nextFeat; } triedFeatures.remove(nextFeat); } triedFeatures.add(bestFeat); System.out.println("Features used : " + triedFeatures); System.out.println( "Best feature at length " + triedFeatures.size() + " is " + bestFeat + " currentF1 : " + maxF1); availableFeatures.remove(bestFeat); PrintWriter out = new PrintWriter( new BufferedWriter(new FileWriter(GlobalV.PROJECT_DIR + "/ablation.txt", true))); out.println("Features used : " + triedFeatures); //more code out.println((new Date()).toString() + " Best feature at length " + triedFeatures.size() + " is " + bestFeat + " currentF1 : " + maxF1); System.out.println("Tried features length : " + triedFeatures.size() + " NbFeat :" + nbFeat); out.close(); //more code } // for each feat from availFeat // add nextFEat to triedFeat // set the feature config file // doCrossVal, output dummy semafor etc // measureF1 {python here} output to a file, read that file // updateMax // remove nextFeat // print best F1 here // add bestFeat to triedFeat }
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void performedFeatureAddition(String outputDir, int crossValidation) throws FileNotFoundException, IOException, ClassNotFoundException, NoSuchMethodException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, InterruptedException { List<String> ablationFeatures = getAblationFeatures("./configSBUProcRel/features.ablation"); ArrayList<String> stepwiseFeatures = new ArrayList<String>(); for (int idxAblation = 0; idxAblation < ablationFeatures.size(); idxAblation++) { double maxF1 = Double.MIN_VALUE; ArrayList<String> currentBestFeat = new ArrayList<String>(); String[] metricsBest = null; for (int idxFeat = 0; idxFeat < ablationFeatures.size(); idxFeat++) { Thread.sleep(3000);/*from w ww .j a v a2 s . c o m*/ ArrayList<String> addedFeatures = new ArrayList<String>(); addedFeatures.addAll(Arrays.asList(ablationFeatures.get(idxFeat).split(","))); //(ArrayList<String>) Arrays.asList(ablationFeatures.get(idxAblation).split(",")); boolean triedFeatures = false; for (int i = 0; i < addedFeatures.size(); i++) { if (stepwiseFeatures.contains(addedFeatures.get(i))) { triedFeatures = true; } } if (triedFeatures) { continue; } System.out.println("Adding features : " + ablationFeatures.get(idxFeat)); stepwiseFeatures.addAll(addedFeatures); FileUtil.dumpToFile(stepwiseFeatures, "./configSBUProcRel/features"); for (int idxFold = 1; idxFold <= crossValidation; idxFold++) { File trainFoldDir = new File(outputDir.concat("/fold-").concat("" + idxFold).concat("/train")); File testFoldDir = new File(outputDir.concat("/fold-").concat("" + idxFold).concat("/test")); SBURoleTrain trainer = new SBURoleTrain(trainFoldDir.getAbsolutePath().concat("/train.ser"), isMultiClass); trainer.train(trainFoldDir.getAbsolutePath()); SBURolePredict predict = new SBURolePredict(trainFoldDir.getAbsolutePath(), testFoldDir.getAbsolutePath().concat("/test.arggold.ser"), isMultiClass); predict.performPrediction(testFoldDir.getAbsolutePath().concat("/test.arggold.ser")); 
ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil .deserializeFromFile(testFoldDir.getAbsolutePath().concat("/test.argpredict.ser")); Map<String, List<Sentence>> groupByProcess = predictedSentences.stream() .collect(Collectors.groupingBy(Sentence::getProcessName)); ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlout.json"), false); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlpredict.json"), true); SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.ilppredict.json"), true); // dummy SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.semaforpredict.json"), true);// dummy SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.easysrlpredict.json"), true);// dummy SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.fgpredict.json"), true);// dummy } // copy all data to ILP's data folder // cp -r outputDir /home/slouvan/NetBeansProjects/ILP/data/ copyAndEval(outputDir); String[] lines = FileUtil.readLinesFromFile("/home/slouvan/NetBeansProjects/ILP/stats.txt"); double currentF1 = Double.parseDouble(lines[0].split("\t")[2]); if (currentF1 > maxF1) { maxF1 = currentF1; currentBestFeat = addedFeatures; metricsBest = lines; } stepwiseFeatures.removeAll(addedFeatures); } PrintWriter out = new PrintWriter( new BufferedWriter(new FileWriter(GlobalV.PROJECT_DIR + "/additionNew.txt", true))); out.println((new Date()).toString() + " Best features at this stage is " + currentBestFeat); out.println("Eval : " + Arrays.toString(metricsBest)); stepwiseFeatures.addAll(currentBestFeat); out.println("All current features :" + stepwiseFeatures); out.close(); } }
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void generateDevSet(String outputDir, int nbFold, List<String> processes) throws FileNotFoundException, IOException { sentences = (ArrayList<Sentence>) sentences.stream().filter(s -> processes.contains(s.getProcessName())) .collect(Collectors.toList()); Map<String, List<Sentence>> processSentPair = sentences.stream() .collect(Collectors.groupingBy(s -> s.getProcessName())); int partitionSize = sentences.size() / nbFold; int blockSize = 0; int currentFoldCnt = 1; ArrayList<Sentence> trainingData = new ArrayList<Sentence>(); ArrayList<Sentence> testingData = new ArrayList<Sentence>(); HashMap<String, String> testProcessName = new HashMap<String, String>(); HashMap<String, String> trainingProcessName = new HashMap<String, String>(); for (String testingProcess : processSentPair.keySet()) { System.out.println(//from ww w . j a v a2s . co m "Process " + testingProcess + " Nb Sentence :" + processSentPair.get(testingProcess).size()); // if foldNumber is equal to totalFold then // keep adding to testData if (currentFoldCnt == nbFold) { System.out.println("Processing last fold"); testingData.addAll(processSentPair.get(testingProcess)); testProcessName.put(testingProcess, testingProcess); } // if the block counter still less than partition size AND foldNumber is less than totalFold // keep adding to testingData else if (blockSize < partitionSize && currentFoldCnt < nbFold) { System.out.println("Has not reached the boundary, keep adding testing data"); blockSize += processSentPair.get(testingProcess).size(); testingData.addAll(processSentPair.get(testingProcess)); testProcessName.put(testingProcess, testingProcess); System.out.println("BLOCK SIZE : " + blockSize); } else { System.out.println("Boundary reached, get the training data and flush everything"); for (String trainingProcess : processSentPair.keySet()) { if (testProcessName.get(trainingProcess) == null) { trainingData.addAll(processSentPair.get(trainingProcess)); trainingProcessName.put(trainingProcess, 
trainingProcess); } } System.out.println("Flushing fold " + currentFoldCnt); // serialize training & testing processes String trainingProcessesStr = Joiner.on("\t").join(trainingProcessName.keySet().iterator()); String testingProcessessStr = Joiner.on("\t").join(testProcessName.keySet().iterator()); FileUtil.dumpToFile(trainingProcessesStr, outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train_process_name")); FileUtil.dumpToFile(testingProcessessStr, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test_process_name")); System.out.println("Nb Sentence in train" + trainingData.size()); System.out.println("Nb Sentence in test" + testingData.size()); FileUtil.serializeToFile(trainingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train.ser")); // ============================================== SEMAFOR ============================================================================================================================================== // ============================================================================================================================================================================================ SpockDataReader.generateSEMAFORFrameAnnotation(trainingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.frame.elements.sbu"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"), semOffset); // DUMP REQUIRED DATA FOR SEMAFOR SpockDataReader.dumpRawSentences(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.sentence.sbu")); SpockDataReader.dumpSentenceLexTargetIdxs(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.process.target")); // EXECUTE ./runMalt.sh here try { ProcessBuilder pb = new ProcessBuilder(MALT_PARSER_PATH, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." 
+ currentFoldCnt + ".train.sentence.sbu"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train")); //pb.environment().put("param1", ) Process p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. StdUtil.printOutput(p); System.out.println("Script executed successfully"); AllAnnotationsMergingWithoutNE.mergeAllAnnotations( outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tokenized"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/conll"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tmp"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.all.lemma.tags.sbu")); } catch (Exception e) { e.printStackTrace(); } // ============================================================================================================================================================================================ // ============================================== END OF SEMAFOR ========================================================================================== FileUtil.serializeToFile(testingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test.arggold.ser")); trainingData.clear(); testingData.clear(); blockSize = 0; currentFoldCnt++; testProcessName.clear(); trainingProcessName.clear(); } } // handle for the last fold"" for (String trainingProcess : processSentPair.keySet()) { if (testProcessName.get(trainingProcess) == null) { trainingData.addAll(processSentPair.get(trainingProcess)); trainingProcessName.put(trainingProcess, trainingProcess); } } // serialize training & testing processes System.out.println("Flushing fold " + currentFoldCnt); String trainingProcessesStr = Joiner.on("\t").join(trainingProcessName.keySet().iterator()); String testingProcessessStr = Joiner.on("\t").join(testProcessName.keySet().iterator()); FileUtil.dumpToFile(trainingProcessesStr, outputDir.concat("/fold-" + 
currentFoldCnt).concat("/train/train_process_name")); FileUtil.dumpToFile(testingProcessessStr, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test_process_name")); System.out.println("Nb Sentence in train" + trainingData.size()); System.out.println("Nb Sentence in test" + testingData.size()); FileUtil.serializeToFile(trainingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train.ser")); // ============================================== SEMAFOR ============================================================================================================================================== // ============================================================================================================================================================================================ SpockDataReader.generateSEMAFORFrameAnnotation(trainingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.frame.elements.sbu"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"), semOffset); // DUMP REQUIRED DATA FOR SEMAFOR SpockDataReader.dumpRawSentences(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.sentence.sbu")); SpockDataReader.dumpSentenceLexTargetIdxs(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.process.target")); // EXECUTE ./runMalt.sh here try { ProcessBuilder pb = new ProcessBuilder(MALT_PARSER_PATH, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train")); //pb.environment().put("param1", ) Process p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. 
StdUtil.printOutput(p); System.out.println("Script executed successfully"); AllAnnotationsMergingWithoutNE.mergeAllAnnotations( outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tokenized"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/conll"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tmp"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.all.lemma.tags.sbu")); } catch (Exception e) { e.printStackTrace(); } // ============================================================================================================================================================================================ // ============================================== END OF SEMAFOR ========================================================================================== FileUtil.serializeToFile(testingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test.arggold.ser")); }
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void distributeCrossValidationByProcess(String outputDir, int nbFold) throws FileNotFoundException, IOException, InterruptedException { // /* www .j a va 2 s . c o m*/ Map<String, List<Sentence>> processSentPair = sentences.stream() .collect(Collectors.groupingBy(s -> s.getProcessName())); int partitionSize = sentences.size() / nbFold; int blockSize = 0; int currentFoldCnt = 1; Thread.sleep(10000); System.out.println("Total sentences : " + sentences.size()); ArrayList<Sentence> trainingData = new ArrayList<Sentence>(); ArrayList<Sentence> testingData = new ArrayList<Sentence>(); HashMap<String, String> testProcessName = new HashMap<String, String>(); HashMap<String, String> trainingProcessName = new HashMap<String, String>(); for (String testingProcess : processSentPair.keySet()) { System.out.println( "Process " + testingProcess + " Nb Sentence :" + processSentPair.get(testingProcess).size()); // if foldNumber is equal to totalFold then // keep adding to testData if (currentFoldCnt == nbFold) { System.out.println("Processing last fold"); testingData.addAll(processSentPair.get(testingProcess)); testProcessName.put(testingProcess, testingProcess); } // if the block counter still less than partition size AND foldNumber is less than totalFold // keep adding to testingData else if (blockSize < partitionSize && currentFoldCnt < nbFold) { System.out.println("Has not reached the boundary, keep adding testing data"); blockSize += processSentPair.get(testingProcess).size(); testingData.addAll(processSentPair.get(testingProcess)); testProcessName.put(testingProcess, testingProcess); System.out.println("BLOCK SIZE : " + blockSize); } else { System.out.println("Boundary reached, get the training data and flush everything"); for (String trainingProcess : processSentPair.keySet()) { if (testProcessName.get(trainingProcess) == null) { trainingData.addAll(processSentPair.get(trainingProcess)); trainingProcessName.put(trainingProcess, trainingProcess); } } 
System.out.println("Flushing fold " + currentFoldCnt); // serialize training & testing processes String trainingProcessesStr = Joiner.on("\t").join(trainingProcessName.keySet().iterator()); String testingProcessessStr = Joiner.on("\t").join(testProcessName.keySet().iterator()); FileUtil.dumpToFile(trainingProcessesStr, outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train_process_name")); FileUtil.dumpToFile(testingProcessessStr, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test_process_name")); System.out.println("Nb Sentence in train" + trainingData.size()); System.out.println("Nb Sentence in test" + testingData.size()); FileUtil.serializeToFile(trainingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train.ser")); // ============================================== SEMAFOR ============================================================================================================================================== // ============================================================================================================================================================================================ SpockDataReader.generateSEMAFORFrameAnnotation(trainingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.frame.elements.sbu"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"), semOffset); // DUMP REQUIRED DATA FOR SEMAFOR SpockDataReader.dumpRawSentences(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.sentence.sbu")); SpockDataReader.dumpSentenceLexTargetIdxs(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.process.target")); // EXECUTE ./runMalt.sh here try { ProcessBuilder pb = new ProcessBuilder(MALT_PARSER_PATH, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." 
+ currentFoldCnt + ".train.sentence.sbu"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train")); //pb.environment().put("param1", ) Process p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. StdUtil.printOutput(p); System.out.println("Script executed successfully"); AllAnnotationsMergingWithoutNE.mergeAllAnnotations( outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tokenized"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/conll"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tmp"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.all.lemma.tags.sbu")); } catch (Exception e) { e.printStackTrace(); } // ============================================================================================================================================================================================ // ============================================== END OF SEMAFOR ========================================================================================== FileUtil.serializeToFile(testingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test.arggold.ser")); trainingData.clear(); testingData.clear(); blockSize = 0; currentFoldCnt++; testProcessName.clear(); trainingProcessName.clear(); } } // handle for the last fold"" for (String trainingProcess : processSentPair.keySet()) { if (testProcessName.get(trainingProcess) == null) { trainingData.addAll(processSentPair.get(trainingProcess)); trainingProcessName.put(trainingProcess, trainingProcess); } } // serialize training & testing processes System.out.println("Flushing fold " + currentFoldCnt); String trainingProcessesStr = Joiner.on("\t").join(trainingProcessName.keySet().iterator()); String testingProcessessStr = Joiner.on("\t").join(testProcessName.keySet().iterator()); FileUtil.dumpToFile(trainingProcessesStr, outputDir.concat("/fold-" + 
currentFoldCnt).concat("/train/train_process_name")); FileUtil.dumpToFile(testingProcessessStr, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test_process_name")); System.out.println("Nb Sentence in train" + trainingData.size()); System.out.println("Nb Sentence in test" + testingData.size()); FileUtil.serializeToFile(trainingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train.ser")); // ============================================== SEMAFOR ============================================================================================================================================== // ============================================================================================================================================================================================ SpockDataReader.generateSEMAFORFrameAnnotation(trainingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.frame.elements.sbu"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"), semOffset); // DUMP REQUIRED DATA FOR SEMAFOR SpockDataReader.dumpRawSentences(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.sentence.sbu")); SpockDataReader.dumpSentenceLexTargetIdxs(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.process.target")); // EXECUTE ./runMalt.sh here try { ProcessBuilder pb = new ProcessBuilder(MALT_PARSER_PATH, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train")); //pb.environment().put("param1", ) Process p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. 
StdUtil.printOutput(p); System.out.println("Script executed successfully"); AllAnnotationsMergingWithoutNE.mergeAllAnnotations( outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tokenized"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/conll"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tmp"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.all.lemma.tags.sbu")); } catch (Exception e) { e.printStackTrace(); } // ============================================================================================================================================================================================ // ============================================== END OF SEMAFOR ========================================================================================== FileUtil.serializeToFile(testingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test.arggold.ser")); }
From source file:sbu.srl.rolextract.ArgumentClassifier.java
public void distributeTrainTest() throws FileNotFoundException, IOException, InterruptedException, ClassNotFoundException { //sentences = (ArrayList<Sentence>) FileUtil.deserializeFromFile("./data/training_4_roles.ser"); Map<String, List<Sentence>> processSentPair = sentences.stream() .collect(Collectors.groupingBy(s -> s.getProcessName())); int blockSize = 0; int currentFoldCnt = 1; Thread.sleep(10000);//ww w. j av a2s .co m System.out.println("Total sentences : " + sentences.size()); ArrayList<Sentence> trainingData = new ArrayList<Sentence>(); ArrayList<Sentence> testingData = new ArrayList<Sentence>(); HashMap<String, String> testProcessName = new HashMap<String, String>(); HashMap<String, String> trainingProcessName = new HashMap<String, String>(); for (String trainingProcess : processSentPair.keySet()) { if (testProcessName.get(trainingProcess) == null) { trainingData.addAll(processSentPair.get(trainingProcess)); trainingProcessName.put(trainingProcess, trainingProcess); } } // serialize training & testing processes String trainingProcessesStr = Joiner.on("\t").join(trainingProcessName.keySet().iterator()); FileUtil.dumpToFile(trainingProcessesStr, outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train_process_name")); System.out.println("Nb Sentence in train" + trainingData.size()); FileUtil.serializeToFile(trainingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train.ser")); // ============================================== SEMAFOR ============================================================================================================================================== // ============================================================================================================================================================================================ SpockDataReader.generateSEMAFORFrameAnnotation(trainingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." 
+ currentFoldCnt + ".train.sentences.frame.elements.sbu"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"), semOffset); // DUMP REQUIRED DATA FOR SEMAFOR // ============================================== TESTING ======================================================================= SpockDataReader testDataReader = new SpockDataReader(testingFileName, configFileName, true); testDataReader.readData(); ArrayList<Sentence> testingSentences = testDataReader.getSentences();//= (ArrayList<Sentence>)FileUtil.deserializeFromFile("/home/slouvan/NetBeansProjects/SRL-Integrated/thousand_sentences.ser"); FileUtil.serializeToFile(testingSentences, "/home/slouvan/NetBeansProjects/SRL-Integrated/thousand_sentences.ser"); Map<String, List<Sentence>> testProcessSentPair = testingSentences.stream() .collect(Collectors.groupingBy(s -> s.getProcessName())); for (String testingProcess : testProcessSentPair.keySet()) { testProcessName.put(testingProcess, testingProcess); testingData.addAll(testProcessSentPair.get(testingProcess)); } String testingProcessessStr = Joiner.on("\t").join(testProcessName.keySet().iterator()); System.out.println("Nb Sentence in test" + testingData.size()); FileUtil.dumpToFile(testingProcessessStr, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test_process_name")); SpockDataReader.dumpRawSentences(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.sentence.sbu")); SpockDataReader.dumpSentenceLexTargetIdxs(testingData, outputDir.concat("/fold-" + currentFoldCnt) .concat("/test/cv." + currentFoldCnt + ".test.process.target")); // EXECUTE ./runMalt.sh here try { ProcessBuilder pb = new ProcessBuilder(MALT_PARSER_PATH, outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." 
+ currentFoldCnt + ".train.sentence.sbu"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train")); //pb.environment().put("param1", ) Process p = pb.start(); // Start the process. p.waitFor(); // Wait for the process to finish. StdUtil.printOutput(p); System.out.println("Script executed successfully"); AllAnnotationsMergingWithoutNE.mergeAllAnnotations( outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tokenized"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/conll"), outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tmp"), outputDir.concat("/fold-" + currentFoldCnt) .concat("/train/cv." + currentFoldCnt + ".train.sentences.all.lemma.tags.sbu")); } catch (Exception e) { e.printStackTrace(); } // ============================================================================================================================================================================================ // ============================================== END OF SEMAFOR ========================================================================================== FileUtil.serializeToFile(testingData, outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test.arggold.ser")); }
From source file:sbu.srl.rolextract.ArgumentClassifier.java
/**
 * Runs one randomly shuffled train/test experiment inside {@code outputDir/fold-1}.
 *
 * <p>The first {@code trainPctg * sentences.size()} shuffled sentences (truncated to an int)
 * are serialized as training data and the rest as test data. A {@code SBURoleTrain} is then
 * trained (multi-class or binary depending on {@code isMultiClass}), a {@code SBURolePredict}
 * is run on the test file, and the predicted sentences, grouped by process name, are written
 * as two JSON files.
 *
 * @param trainPctg fraction of the sentences used for training
 */
public void doTrainClassify(double trainPctg) throws IOException, FileNotFoundException, ClassNotFoundException,
        NoSuchMethodException, IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    setupCrossValidationEnvironment(outputDir, 1);
    Collections.shuffle(sentences, new Random(System.nanoTime()));

    // Split the shuffled sentences at the requested ratio.
    final int splitIndex = (int) (trainPctg * sentences.size());
    ArrayList<Sentence> trainSet = new ArrayList<>(sentences.subList(0, splitIndex));
    ArrayList<Sentence> testSet = new ArrayList<>(sentences.subList(splitIndex, sentences.size()));

    final String foldRoot = outputDir.concat("/fold-1");
    final String trainSerFile = foldRoot.concat("/train/train.ser");
    FileUtil.serializeToFile(trainSet, trainSerFile);
    FileUtil.serializeToFile(testSet, foldRoot.concat("/test/test.arggold.ser"));

    final String trainPath = new File(foldRoot.concat("/train")).getAbsolutePath();
    final String testPath = new File(foldRoot.concat("/test")).getAbsolutePath();
    final String goldFile = testPath.concat("/test.arggold.ser");

    // Train the role classifier on the serialized training set.
    SBURoleTrain roleTrainer = new SBURoleTrain(trainPath.concat("/train.ser"), isMultiClass);
    if (!isMultiClass) {
        roleTrainer.trainBinaryClassifier(trainPath);
    } else {
        roleTrainer.trainMultiClassClassifier(trainPath);
    }
    // The training set is written a second time after training, as in the original flow.
    FileUtil.serializeToFile(trainSet, trainSerFile);

    // Predict on the gold test file, then load the predictions written next to it.
    SBURolePredict rolePredictor = new SBURolePredict(trainPath, goldFile, isMultiClass);
    rolePredictor.performPrediction(goldFile);
    ArrayList<Sentence> predicted = (ArrayList<Sentence>) FileUtil
            .deserializeFromFile(testPath.concat("/test.argpredict.ser"));

    Map<String, List<Sentence>> predictedByProcess = predicted.stream()
            .collect(Collectors.groupingBy(Sentence::getProcessName));
    ArrayList<JSONData> json = SentenceUtil.generateJSONData(predictedByProcess);
    SentenceUtil.flushDataToJSON(json, testPath.concat("/test.srlout.json"), false);
    SentenceUtil.flushDataToJSON(json, testPath.concat("/test.srlpredict.json"), true);
}