List of usage examples for java.util ArrayList forEach
@Override public void forEach(Consumer<? super E> action)
From source file:structuredPredictionNLG.SFX.java
/** * * @param wordSequence// www . j a v a2 s.c o m * @return */ public ArrayList<String> getPredictedAttrList(ArrayList<Action> wordSequence) { ArrayList<Action> cleanActionList = new ArrayList<>(); wordSequence.stream().filter((action) -> (!action.getWord().equals(Action.TOKEN_START) && !action.getWord().equals(Action.TOKEN_END))).forEachOrdered((action) -> { cleanActionList.add(action); }); ArrayList<String> predictedAttrList = new ArrayList<>(); cleanActionList.forEach((action) -> { if (predictedAttrList.isEmpty()) { predictedAttrList.add(action.getAttribute()); } else if (!predictedAttrList.get(predictedAttrList.size() - 1).equals(action.getAttribute())) { predictedAttrList.add(action.getAttribute()); } }); return predictedAttrList; }
From source file:structuredPredictionNLG.SFX.java
/** * * @param printResults/*from w ww. j a v a 2s . c om*/ * @param resultsFile */ public void parseWenFiles(boolean printResults, String resultsFile) { wenEvaluationReferenceSequences = new HashMap<>(); wenEvaluationReferences = new HashMap<>(); HashMap<String, String> daToGen = new HashMap<>(); HashMap<String, String> predictedWordSequences_overAllPredicates = new HashMap<>(); HashMap<String, HashMap<String, String>> predictedWordSequences_perPredicates = new HashMap<>(); HashMap<String, ArrayList<String>> finalReferencesWordSequences = new HashMap<>(); HashMap<String, Double> attrCoverage = new HashMap<>(); HashMap<String, String> MRtoAbstractMR = new HashMap<>(); HashMap<String, String> abstractMRtoMR = new HashMap<>(); HashMap<String, String> abstractMRtoText = new HashMap<>(); try (BufferedReader br = new BufferedReader(new FileReader(resultsFile))) { String s; boolean inGen = false; boolean inRef = false; boolean firstRef = true; ArrayList<ArrayList<Sequence<IString>>> finalReferences = new ArrayList<>(); ArrayList<ScoredFeaturizedTranslation<IString, String>> generations = new ArrayList<>(); ArrayList<Sequence<IString>> references = new ArrayList<>(); ArrayList<String> referencesStrings = new ArrayList<>(); String da = ""; String predicate = ""; String abstractMR = ""; String predictedWordSequence = ""; String MRstr = ""; int gens = 0; int c = 0; HashMap<String, HashSet<String>> attributeValues = new HashMap<>(); while ((s = br.readLine()) != null) { if (s.startsWith("DA")) { inGen = false; inRef = false; firstRef = true; da = s.substring(s.indexOf(':') + 1).trim(); c++; MRstr = s.substring(s.indexOf(':') + 1).replaceAll(",", ";") .replaceAll("no or yes", "yes or no").replaceAll("ave ; presidio", "ave and presidio") .replaceAll("point ; ste", "point and ste").trim(); predicate = MRstr.substring(0, MRstr.indexOf('(')); abstractMR = predicate + ":"; String attributesStr = MRstr.substring(MRstr.indexOf('(') + 1, MRstr.length() - 1); attributeValues = new HashMap<>(); if (!attributesStr.isEmpty()) { HashMap<String, Integer> attrXIndeces = new HashMap<>(); String[] args = attributesStr.split(";"); if (attributesStr.contains("|")) { System.exit(0); } for (String arg : args) { String attr; String value = ""; if (arg.contains("=")) { String[] subAttr = arg.split("="); value = subAttr[1].toLowerCase(); attr = subAttr[0].toLowerCase().replaceAll("_", ""); if (value.startsWith("\'")) { value = value.substring(1, value.length() - 1); } if (value.equals("true")) { value = "yes"; } if (value.equals("false")) { value = "no"; } if (value.equals("dontcare")) { value = "dont_care"; } if (value.equals("no") || value.equals("yes") || value.equals("yes or no") || value.equals("none") || value.equals("empty")) { attr += "_" + value.replaceAll(" ", "_"); value = attr; } if (value.equals("dont_care")) { String v = value; value = attr; attr = v; } } else { attr = arg.replaceAll("_", ""); } if (!attributeValues.containsKey(attr)) { attributeValues.put(attr, new HashSet<String>()); } if (value.isEmpty()) { value = attr; } if (value.startsWith("\'")) { value = value.substring(1, value.length() - 1); } if (value.toLowerCase().startsWith("x")) { int index = 0; if (!attrXIndeces.containsKey(attr)) { attrXIndeces.put(attr, 1); } else { index = attrXIndeces.get(attr); attrXIndeces.put(attr, index + 1); } value = "x" + index; } if (value.isEmpty()) { System.exit(0); } attributeValues.get(attr).add(value.trim().toLowerCase()); } } } else if (s.startsWith("Gen")) { gens = 0; inGen = true; } else if (s.startsWith("Ref")) { inRef = true; references = new ArrayList<>(); } else if (inGen) { if (s.trim().isEmpty()) { inGen = false; } else { predictedWordSequence = s.trim().toLowerCase(); Sequence<IString> translation = IStrings .tokenize(NISTTokenizer.tokenize(predictedWordSequence)); ScoredFeaturizedTranslation<IString, String> tran = new ScoredFeaturizedTranslation<>( translation, null, 0); generations.add(tran); inGen = false; gens++; da = ""; } } else if (inRef) { if (s.trim().isEmpty()) { for (int i = 0; i < gens; i++) { finalReferences.add(references); finalReferencesWordSequences.put(MRstr, referencesStrings); } wenEvaluationReferenceSequences.put(MRstr, references); wenEvaluationReferences.put(MRstr, referencesStrings); references = new ArrayList<>(); referencesStrings = new ArrayList<>(); inRef = false; da = ""; } else { String cleanedWords = s.trim().replaceAll("\\?", " \\? ").replaceAll(":", " : ") .replaceAll("\\.", " \\. ").replaceAll(",", " , ").replaceAll(" ", " ").trim(); referencesStrings.add(cleanedWords); references.add(IStrings.tokenize(NISTTokenizer.tokenize(cleanedWords))); if (firstRef) { ArrayList<String> attrs = new ArrayList<>(attributeValues.keySet()); Collections.sort(attrs); HashMap<String, Integer> xCounts = new HashMap<>(); attrs.forEach((attr) -> { xCounts.put(attr, 0); }); for (String attr : attrs) { abstractMR += attr + "={"; ArrayList<String> values = new ArrayList<>(attributeValues.get(attr)); Collections.sort(values); for (String value : values) { if (attr.equals("name") || attr.equals("type") || attr.equals("pricerange") || attr.equals("price") || attr.equals("phone") || attr.equals("address") || attr.equals("postcode") || attr.equals("area") || attr.equals("near") || attr.equals("food") || attr.equals("goodformeal") || attr.equals("count")) { abstractMR += Action.TOKEN_X + attr + "_" + xCounts.get(attr) + ","; xCounts.put(attr, xCounts.get(attr) + 1); } else { abstractMR += value + ","; } } abstractMR += "}"; } MRtoAbstractMR.put(da + c, abstractMR); abstractMRtoMR.put(abstractMR, da); abstractMRtoText.put(abstractMR, predictedWordSequence); predictedWordSequences_overAllPredicates.put(MRstr, predictedWordSequence); if (!predictedWordSequences_perPredicates.containsKey(predicate)) { predictedWordSequences_perPredicates.put(predicate, new HashMap<String, String>()); } predictedWordSequences_perPredicates.get(predicate).put(MRstr, predictedWordSequence); if (!daToGen.containsKey(da)) { daToGen.put(da.toLowerCase(), s.trim()); } int mentioned = 0; int total = 0; ArrayList<String> errors = new ArrayList<String>(); String gen = " " + s.trim().toLowerCase() + " "; for (String attr : attributeValues.keySet()) { for (String value : attributeValues.get(attr)) { String searchValue = value; boolean ment = false; if (attr.startsWith("hasinternet") && gen.contains("internet")) { ment = true; } else if (attr.startsWith("dogsallowed") && gen.contains("dog")) { ment = true; } else if (attr.startsWith("kidsallowed") && (gen.contains("kid") || gen.contains("child"))) { ment = true; } else if (attr.startsWith("goodformeal") && (gen.contains(searchValue) || gen.contains("meal") || gen.contains("breakfast"))) { ment = true; } else if (attr.startsWith("acceptscreditcards") && gen.contains("credit")) { ment = true; } if (searchValue.startsWith("dont_care") || searchValue.startsWith("none") || searchValue.startsWith("yes or no") || searchValue.startsWith("no or yes")) { searchValue = attr; } else if (searchValue.contains("or")) { String[] values = searchValue.split(" or "); for (String v : values) { if (gen.contains(v)) { ment = true; } } } if (searchValue.startsWith("pricerange") && (gen.contains(searchValue) || gen.contains("price") || gen.contains("range"))) { ment = true; } else if (searchValue.startsWith("area") && (gen.contains(searchValue) || gen.contains("location") || gen.contains("part") || gen.contains("where"))) { ment = true; } else if (searchValue.startsWith("near") && (gen.contains(searchValue) || gen.contains("location") || gen.contains("part") || gen.contains("where"))) { ment = true; } else if (gen.contains(searchValue)) { ment = true; } if (ment) { mentioned++; } else { errors.add("not mentioned -> " + attr + ":" + attributeValues.get(attr)); } total++; } } double err = 1.0 - (mentioned / (double) total); if (Double.isNaN(err)) { err = 0.0; } attrCoverage.put(predictedWordSequence, err); } } firstRef = false; } } if (printResults) { double avgErr = 0.0; avgErr = attrCoverage.values().stream().map((err) -> err).reduce(avgErr, (accumulator, _item) -> accumulator + _item); avgErr /= attrCoverage.size(); System.out.println(finalReferences.size() + "\t" + generations.size() + "\t" + attrCoverage.size()); BLEUMetric BLEU = new BLEUMetric(finalReferences, 4, false); NISTMetric NIST = new NISTMetric(finalReferences); System.out.println("WEN BLEU: \t" + BLEU.score(generations)); double avgRougeScore = 0.0; String detailedRes = ""; avgRougeScore = getTestingData().stream().map((di) -> { double maxRouge = 0.0; String pWS = predictedWordSequences_overAllPredicates .get(di.getMeaningRepresentation().getMRstr()).replaceAll("\\?", " \\? ") .replaceAll(":", " : ").replaceAll("\\.", " \\. ").replaceAll(",", " , ") .replaceAll(" ", " ").trim(); for (String ref : finalReferencesWordSequences.get(di.getMeaningRepresentation().getMRstr())) { double rouge = Rouge.ROUGE_N(pWS, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } return maxRouge; }).map((maxRouge) -> maxRouge).reduce(avgRougeScore, (accumulator, _item) -> accumulator + _item); System.out.println("WEN ROUGE: \t" + (avgRougeScore / getTestingData().size())); System.out.println("WEN NIST: \t" + NIST.score(generations)); System.out.println("WEN ERR: \t" + avgErr); System.out.println("WEN TOTAL: \t" + getTestingData().size()); System.out.println("///////////////////"); double uniqueMRsInTestAndNotInTrainAllPredWordBLEU = 0.0; double uniqueMRsInTestAndNotInTrainAllPredWordROUGE = 0.0; double uniqueMRsInTestAndNotInTrainAllPredWordCOVERAGEERR = 0.0; double uniqueMRsInTestAndNotInTrainAllPredWordBRC = 0.0; detailedRes = ""; ArrayList<DatasetInstance> abstractMRList = new ArrayList<>(); HashSet<String> reportedAbstractMRs = new HashSet<>(); for (DatasetInstance di : getTestingData()) { if (!reportedAbstractMRs.contains(di.getMeaningRepresentation().getAbstractMR())) { reportedAbstractMRs.add(di.getMeaningRepresentation().getAbstractMR()); boolean isInTraining = false; for (DatasetInstance di2 : getTrainingData()) { if (di2.getMeaningRepresentation().getAbstractMR() .equals(di.getMeaningRepresentation().getAbstractMR())) { isInTraining = true; } } if (!isInTraining) { for (DatasetInstance di2 : getValidationData()) { if (di2.getMeaningRepresentation().getAbstractMR() .equals(di.getMeaningRepresentation().getAbstractMR())) { isInTraining = true; } } } if (!isInTraining) { abstractMRList.add(di); } } } for (DatasetInstance di : abstractMRList) { Double bestROUGE = -100.0; Double bestBLEU = -100.0; Double bestCover = -100.0; Double bestHarmonicMean = -100.0; String predictedString = predictedWordSequences_overAllPredicates .get(di.getMeaningRepresentation().getMRstr()); reportedAbstractMRs.add(di.getMeaningRepresentation().getAbstractMR()); double maxRouge = 0.0; String pWS = predictedString.replaceAll("\\?", " \\? ").replaceAll(":", " : ") .replaceAll("\\.", " \\. ").replaceAll(",", " , ").replaceAll(" ", " ").trim(); for (String ref : finalReferencesWordSequences.get(di.getMeaningRepresentation().getMRstr())) { double rouge = Rouge.ROUGE_N(pWS, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } double BLEUSmooth = BLEUMetric.computeLocalSmoothScore(pWS, finalReferencesWordSequences.get(di.getMeaningRepresentation().getMRstr()), 4); double cover = 1.0 - attrCoverage.get(predictedString); double harmonicMean = 3.0 / (1.0 / BLEUSmooth + 1.0 / maxRouge + 1.0 / cover); if (harmonicMean > bestHarmonicMean) { bestROUGE = maxRouge; bestBLEU = BLEUSmooth; bestCover = cover; bestHarmonicMean = harmonicMean; } uniqueMRsInTestAndNotInTrainAllPredWordBLEU += bestBLEU; uniqueMRsInTestAndNotInTrainAllPredWordROUGE += bestROUGE; uniqueMRsInTestAndNotInTrainAllPredWordCOVERAGEERR += bestCover; uniqueMRsInTestAndNotInTrainAllPredWordBRC += bestHarmonicMean; } uniqueMRsInTestAndNotInTrainAllPredWordBLEU /= abstractMRList.size(); uniqueMRsInTestAndNotInTrainAllPredWordROUGE /= abstractMRList.size(); uniqueMRsInTestAndNotInTrainAllPredWordCOVERAGEERR /= abstractMRList.size(); uniqueMRsInTestAndNotInTrainAllPredWordBRC /= abstractMRList.size(); System.out.println("WEN UNIQUE (NOT IN TRAIN) WORD ALL PRED BLEU: \t" + uniqueMRsInTestAndNotInTrainAllPredWordBLEU); System.out.println("WEN UNIQUE (NOT IN TRAIN) WORD ALL PRED ROUGE: \t" + uniqueMRsInTestAndNotInTrainAllPredWordROUGE); System.out.println("WEN UNIQUE (NOT IN TRAIN) WORD ALL PRED COVERAGE ERROR: \t" + (1.0 - uniqueMRsInTestAndNotInTrainAllPredWordCOVERAGEERR)); System.out.println("WEN UNIQUE (NOT IN TRAIN) WORD ALL PRED BRC: \t" + uniqueMRsInTestAndNotInTrainAllPredWordBRC); abstractMRList.forEach((di) -> { System.out.println(di.getMeaningRepresentation().getAbstractMR() + "\t" + di.getMeaningRepresentation().getMRstr()); }); System.out.println("TOTAL SET SIZE: \t" + abstractMRList.size()); //System.out.println(abstractMRList); //System.out.println(detailedRes); //System.out.println(abstractMRList); //System.out.println(detailedRes); System.out.println("///////////////////"); ArrayList<String> bestPredictedStrings = new ArrayList<>(); ArrayList<String> bestPredictedStringsMRs = new ArrayList<>(); double uniqueAllPredWordBLEU = 0.0; double uniqueAllPredWordROUGE = 0.0; double uniqueAllPredWordCOVERAGEERR = 0.0; double uniqueAllPredWordBRC = 0.0; reportedAbstractMRs = new HashSet<>(); for (DatasetInstance di : getTestingData()) { //for (String predictedString : predictedWordSequences_overAllPredicates.get(di)) { if (!reportedAbstractMRs.contains(di.getMeaningRepresentation().getAbstractMR())) { String bestPredictedString = ""; Double bestROUGE = -100.0; Double bestBLEU = -100.0; Double bestCover = -100.0; Double bestHarmonicMean = -100.0; String predictedString = predictedWordSequences_overAllPredicates .get(di.getMeaningRepresentation().getMRstr()); reportedAbstractMRs.add(di.getMeaningRepresentation().getAbstractMR()); double maxRouge = 0.0; String pWS = predictedString.replaceAll("\\?", " \\? ").replaceAll(":", " : ") .replaceAll("\\.", " \\. ").replaceAll(",", " , ").replaceAll(" ", " ").trim(); for (String ref : finalReferencesWordSequences .get(di.getMeaningRepresentation().getMRstr())) { double rouge = Rouge.ROUGE_N(pWS, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } double BLEUSmooth = BLEUMetric.computeLocalSmoothScore(pWS, finalReferencesWordSequences.get(di.getMeaningRepresentation().getMRstr()), 4); double cover = 1.0 - attrCoverage.get(predictedString); double harmonicMean = 3.0 / (1.0 / BLEUSmooth + 1.0 / maxRouge + 1.0 / cover); if (harmonicMean > bestHarmonicMean) { bestPredictedString = predictedString; bestROUGE = maxRouge; bestBLEU = BLEUSmooth; bestCover = cover; bestHarmonicMean = harmonicMean; } bestPredictedStrings.add(bestPredictedString); bestPredictedStringsMRs.add(di.getMeaningRepresentation().getMRstr()); uniqueAllPredWordBLEU += bestBLEU; uniqueAllPredWordROUGE += bestROUGE; uniqueAllPredWordCOVERAGEERR += bestCover; uniqueAllPredWordBRC += bestHarmonicMean; } //} } uniqueAllPredWordBLEU /= reportedAbstractMRs.size(); uniqueAllPredWordROUGE /= reportedAbstractMRs.size(); uniqueAllPredWordCOVERAGEERR /= reportedAbstractMRs.size(); uniqueAllPredWordBRC /= reportedAbstractMRs.size(); System.out.println("WEN UNIQUE WORD ALL PRED BLEU: \t" + uniqueAllPredWordBLEU); System.out.println("WEN UNIQUE WORD ALL PRED ROUGE: \t" + uniqueAllPredWordROUGE); System.out.println( "WEN UNIQUE WORD ALL PRED COVERAGE ERROR: \t" + (1.0 - uniqueAllPredWordCOVERAGEERR)); System.out.println("WEN UNIQUE WORD ALL PRED BRC: \t" + uniqueAllPredWordBRC); System.out.println(detailedRes); System.out.println("WEN TOTAL: \t" + reportedAbstractMRs.size()); //////////////////////// for (String pred : getPredicates()) { detailedRes = ""; bestPredictedStrings = new ArrayList<>(); bestPredictedStringsMRs = new ArrayList<>(); double uniquePredWordBLEU = 0.0; double uniquePredWordROUGE = 0.0; double uniquePredWordCOVERAGEERR = 0.0; double uniquePredWordBRC = 0.0; reportedAbstractMRs = new HashSet<>(); for (DatasetInstance di : getTestingData()) { if (di.getMeaningRepresentation().getPredicate().equals(pred) && !reportedAbstractMRs.contains(di.getMeaningRepresentation().getAbstractMR())) { String bestPredictedString = ""; Double bestROUGE = -100.0; Double bestBLEU = -100.0; Double bestCover = -100.0; Double bestHarmonicMean = -100.0; String predictedString = predictedWordSequences_overAllPredicates .get(di.getMeaningRepresentation().getMRstr()); reportedAbstractMRs.add(di.getMeaningRepresentation().getAbstractMR()); double maxRouge = 0.0; String pWS = predictedString.replaceAll("\\?", " \\? ").replaceAll(":", " : ") .replaceAll("\\.", " \\. ").replaceAll(",", " , ").replaceAll(" ", " ").trim(); for (String ref : finalReferencesWordSequences .get(di.getMeaningRepresentation().getMRstr())) { double rouge = Rouge.ROUGE_N(pWS, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } double BLEUSmooth = BLEUMetric.computeLocalSmoothScore(pWS, finalReferencesWordSequences.get(di.getMeaningRepresentation().getMRstr()), 4); double cover = 1.0 - attrCoverage.get(predictedString); double harmonicMean = 3.0 / (1.0 / BLEUSmooth + 1.0 / maxRouge + 1.0 / cover); if (harmonicMean > bestHarmonicMean) { bestPredictedString = predictedString; bestROUGE = maxRouge; bestBLEU = BLEUSmooth; bestCover = cover; bestHarmonicMean = harmonicMean; } bestPredictedStrings.add(bestPredictedString); bestPredictedStringsMRs.add(di.getMeaningRepresentation().getMRstr()); uniquePredWordBLEU += bestBLEU; uniquePredWordROUGE += bestROUGE; uniquePredWordCOVERAGEERR += bestCover; uniquePredWordBRC += bestHarmonicMean; } } uniquePredWordBLEU /= reportedAbstractMRs.size(); uniquePredWordROUGE /= reportedAbstractMRs.size(); uniquePredWordCOVERAGEERR /= reportedAbstractMRs.size(); uniquePredWordBRC /= reportedAbstractMRs.size(); System.out.println("WEN UNIQUE WORD " + pred + " BLEU: \t" + uniquePredWordBLEU); System.out.println("WEN UNIQUE WORD " + pred + " ROUGE: \t" + uniquePredWordROUGE); System.out.println( "WEN UNIQUE WORD " + pred + " COVERAGE ERROR: \t" + (1.0 - uniquePredWordCOVERAGEERR)); System.out.println("WEN UNIQUE WORD " + pred + " BRC: \t" + uniquePredWordBRC); System.out.println(detailedRes); System.out.println("WEN TOTAL " + pred + ": \t" + reportedAbstractMRs.size()); } } } catch (FileNotFoundException ex) { Logger.getLogger(Bagel.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(Bagel.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:structuredPredictionNLG.SFX.java
/** * * @param trainingData/*from w w w .j av a2 s .c o m*/ */ public void createRandomAlignments(ArrayList<DatasetInstance> trainingData) { HashMap<String, HashMap<ArrayList<Action>, HashMap<Action, Integer>>> punctPatterns = new HashMap<>(); getPredicates().forEach((predicate) -> { punctPatterns.put(predicate, new HashMap<ArrayList<Action>, HashMap<Action, Integer>>()); }); HashMap<DatasetInstance, ArrayList<Action>> punctRealizations = new HashMap<DatasetInstance, ArrayList<Action>>(); HashMap<ArrayList<Action>, ArrayList<Action>> calculatedRealizationsCache = new HashMap<>(); trainingData.stream().map((di) -> { HashSet<ArrayList<Action>> initRealizations = new HashSet<>(); if (!calculatedRealizationsCache.containsKey(di.getDirectReferenceSequence())) { initRealizations.add(di.getDirectReferenceSequence()); } initRealizations.stream().map((realization) -> { HashMap<String, HashSet<String>> values = new HashMap<>(); di.getMeaningRepresentation().getAttributeValues().keySet().forEach((attr) -> { values.put(attr, new HashSet<>(di.getMeaningRepresentation().getAttributeValues().get(attr))); }); ArrayList<Action> randomRealization = new ArrayList<Action>(); realization.forEach((a) -> { if (a.getAttribute().equals(Action.TOKEN_PUNCT)) { randomRealization.add(new Action(a.getWord(), a.getAttribute())); } else { randomRealization.add(new Action(a.getWord(), "")); } }); HashSet<String> unalignedAttrs = new HashSet<>(); if (values.keySet().isEmpty()) { for (int i = 0; i < randomRealization.size(); i++) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!getAttributes().get(di.getMeaningRepresentation().getPredicate()) .contains("empty")) { getAttributes().get(di.getMeaningRepresentation().getPredicate()).add("empty"); } randomRealization.get(i).setAttribute("empty=empty"); } } } else { values.keySet().forEach((attr) -> { values.get(attr).forEach((value) -> { if ((!(value.matches("\"[xX][0-9]+\"") || value.matches("[xX][0-9]+") || value.startsWith(Action.TOKEN_X))) && !value.isEmpty()) { String valueToCheck = value; if (valueToCheck.equals("no") || valueToCheck.equals("yes") || valueToCheck.equals("yes or no") || valueToCheck.equals("none") //|| valueToCheck.equals("dont_care") || valueToCheck.equals("empty")) { valueToCheck = attr + ":" + value; unalignedAttrs.add(attr + "=" + value); } if (valueToCheck.equals(attr)) { unalignedAttrs.add(attr + "=" + value); } if (!valueToCheck.equals("empty:empty") && getValueAlignments().containsKey(valueToCheck)) { unalignedAttrs.add(attr + "=" + valueToCheck); } } else { unalignedAttrs.add(attr + "=" + value); } }); }); unalignedAttrs.forEach((attrValue) -> { int index = getRandomGen().nextInt(randomRealization.size()); boolean change = false; while (!change) { if (!randomRealization.get(index).getAttribute().equals(Action.TOKEN_PUNCT)) { randomRealization.get(index).setAttribute(attrValue.toLowerCase().trim()); change = true; } else { index = getRandomGen().nextInt(randomRealization.size()); } } }); String previousAttr = ""; for (int i = 0; i < randomRealization.size(); i++) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } else { previousAttr = ""; } } //System.out.println("1: " + randomRealization); previousAttr = ""; for (int i = randomRealization.size() - 1; i >= 0; i--) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } else { previousAttr = ""; } } //System.out.println("2: " + randomRealization); previousAttr = ""; for (int i = 0; i < randomRealization.size(); i++) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } } //System.out.println("3: " + randomRealization); previousAttr = ""; for (int i = randomRealization.size() - 1; i >= 0; i--) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } } //System.out.println("4: " + randomRealization); } //FIX WRONG @PUNCT@ String previousAttr = ""; for (int i = randomRealization.size() - 1; i >= 0; i--) { if (randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT) && !randomRealization.get(i).getWord().matches("[,.?!;:']")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } } ArrayList<Action> cleanRandomRealization = new ArrayList<>(); randomRealization.stream().filter((a) -> (!a.getAttribute().equals(Action.TOKEN_PUNCT))) .forEachOrdered((a) -> { cleanRandomRealization.add(a); }); //ADD END TOKENS ArrayList<Action> endRandomRealization = new ArrayList<>(); previousAttr = ""; for (int i = 0; i < cleanRandomRealization.size(); i++) { Action a = cleanRandomRealization.get(i); if (!previousAttr.isEmpty() && !a.getAttribute().equals(previousAttr)) { endRandomRealization.add(new Action(Action.TOKEN_END, previousAttr)); } endRandomRealization.add(a); previousAttr = a.getAttribute(); } endRandomRealization.add(new Action(Action.TOKEN_END, previousAttr)); endRandomRealization.add(new Action(Action.TOKEN_END, Action.TOKEN_END)); calculatedRealizationsCache.put(realization, endRandomRealization); //System.out.println(di.getMeaningRepresentation().getPredicate() + ": " + endRandomRealization); ArrayList<String> attrValues = new ArrayList<String>(); endRandomRealization.forEach((a) -> { if (attrValues.isEmpty()) { attrValues.add(a.getAttribute()); } else if (!attrValues.get(attrValues.size() - 1).equals(a.getAttribute())) { attrValues.add(a.getAttribute()); } }); if (attrValues.size() > getMaxContentSequenceLength()) { setMaxContentSequenceLength(attrValues.size()); } ArrayList<Action> punctRealization = new ArrayList<>(); punctRealization.addAll(randomRealization); previousAttr = ""; for (int i = 0; i < punctRealization.size(); i++) { if (!punctRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { if (!punctRealization.get(i).getAttribute().equals(previousAttr) && !previousAttr.isEmpty()) { punctRealization.add(i, new Action(Action.TOKEN_END, previousAttr)); i++; } previousAttr = punctRealization.get(i).getAttribute(); } } if (!punctRealization.get(punctRealization.size() - 1).getWord().equals(Action.TOKEN_END)) { punctRealization.add(new Action(Action.TOKEN_END, previousAttr)); } return punctRealization; }).map((punctRealization) -> { punctRealizations.put(di, punctRealization); return punctRealization; }).forEachOrdered((punctRealization) -> { for (int i = 0; i < punctRealization.size(); i++) { Action a = punctRealization.get(i); if (a.getAttribute().equals(Action.TOKEN_PUNCT)) { boolean legal = true; ArrayList<Action> surroundingActions = new ArrayList<>(); /*if (i - 3 >= 0) { surroundingActions.add(punctRealization.get(i - 3)); } else { surroundingActions.add(null); }*/ if (i - 2 >= 0) { surroundingActions.add(punctRealization.get(i - 2)); } else { surroundingActions.add(null); } if (i - 1 >= 0) { surroundingActions.add(punctRealization.get(i - 1)); } else { legal = false; } boolean oneMore = false; if (i + 1 < punctRealization.size()) { surroundingActions.add(punctRealization.get(i + 1)); if (!punctRealization.get(i + 1).getAttribute().equals(Action.TOKEN_END)) { oneMore = true; } } else { legal = false; } if (oneMore && i + 2 < punctRealization.size()) { surroundingActions.add(punctRealization.get(i + 2)); } else { surroundingActions.add(null); } if (legal) { if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .containsKey(surroundingActions)) { punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .put(surroundingActions, new HashMap<Action, Integer>()); } if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .get(surroundingActions).containsKey(a)) { punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .get(surroundingActions).put(a, 1); } else { punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .get(surroundingActions) .put(a, punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .get(surroundingActions).get(a) + 1); } } } } }); return di; }).map((di) -> { di.setDirectReferenceSequence(calculatedRealizationsCache.get(di.getDirectReferenceSequence())); return di; }); punctRealizations.keySet().forEach((di) -> { ArrayList<Action> punctRealization = punctRealizations.get(di); punctPatterns.get(di.getMeaningRepresentation().getPredicate()).keySet().forEach((surrounds) -> { int beforeNulls = 0; if (surrounds.get(0) == null) { beforeNulls++; } if (surrounds.get(1) == null) { beforeNulls++; } for (int i = 0 - beforeNulls; i < punctRealization.size(); i++) { boolean matches = true; int m = 0; for (int s = 0; s < surrounds.size(); s++) { if (surrounds.get(s) != null) { if (i + s < punctRealization.size()) { if (!punctRealization.get(i + s).getWord().equals(surrounds.get(s) .getWord()) /*|| !cleanActionList.get(i).getAttribute().equals(surrounds.get(s).getAttribute())*/) { matches = false; s = surrounds.size(); } else { m++; } } else { matches = false; s = surrounds.size(); } } else if (s < 2 && i + s >= 0) { matches = false; s = surrounds.size(); } else if (s >= 2 && i + s < punctRealization.size()) { matches = false; s = surrounds.size(); } } if (matches && m > 0) { Action a = new Action("", ""); if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds) .containsKey(a)) { punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds).put(a, 1); } else { punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds).put(a, punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds) .get(a) + 1); } } } }); }); punctPatterns.keySet().forEach((predicate) -> { punctPatterns.get(predicate).keySet().forEach((punct) -> { Action bestAction = null; int bestCount = 0; for (Action a : punctPatterns.get(predicate).get(punct).keySet()) { if (punctPatterns.get(predicate).get(punct).get(a) > bestCount) { bestAction = a; bestCount = punctPatterns.get(predicate).get(punct).get(a); } else if (punctPatterns.get(predicate).get(punct).get(a) == bestCount && bestAction.getWord().isEmpty()) { bestAction = a; } } if (!getPunctuationPatterns().containsKey(predicate)) { getPunctuationPatterns().put(predicate, new HashMap<ArrayList<Action>, Action>()); } if (!bestAction.getWord().isEmpty()) { getPunctuationPatterns().get(predicate).put(punct, bestAction); } }); }); }
From source file:structuredPredictionNLG.SFX.java
/** * * @param trainingData/*from w ww. j av a 2 s . c o m*/ */ @Override public void createNaiveAlignments(ArrayList<DatasetInstance> trainingData) { HashMap<String, HashMap<ArrayList<Action>, HashMap<Action, Integer>>> punctPatterns = new HashMap<>(); getPredicates().forEach((predicate) -> { punctPatterns.put(predicate, new HashMap<ArrayList<Action>, HashMap<Action, Integer>>()); }); HashMap<DatasetInstance, ArrayList<Action>> punctRealizations = new HashMap<DatasetInstance, ArrayList<Action>>(); trainingData.stream().map((di) -> { HashMap<ArrayList<Action>, ArrayList<Action>> calculatedRealizationsCache = new HashMap<>(); HashSet<ArrayList<Action>> initRealizations = new HashSet<>(); if (!calculatedRealizationsCache.containsKey(di.getDirectReferenceSequence())) { initRealizations.add(di.getDirectReferenceSequence()); } initRealizations.stream().map((realization) -> { HashMap<String, HashSet<String>> values = new HashMap<>(); di.getMeaningRepresentation().getAttributeValues().keySet().forEach((attr) -> { values.put(attr, new HashSet<>(di.getMeaningRepresentation().getAttributeValues().get(attr))); }); ArrayList<Action> randomRealization = new ArrayList<>(); for (int i = 0; i < realization.size(); i++) { Action a = realization.get(i); if (a.getAttribute().equals(Action.TOKEN_PUNCT)) { randomRealization.add(new Action(a.getWord(), a.getAttribute())); } else { randomRealization.add(new Action(a.getWord(), "")); } } if (values.keySet().isEmpty()) { for (int i = 0; i < randomRealization.size(); i++) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!getAttributes().get(di.getMeaningRepresentation().getPredicate()) .contains("empty")) { getAttributes().get(di.getMeaningRepresentation().getPredicate()).add("empty"); } randomRealization.get(i).setAttribute("empty=empty"); } } } else { HashMap<Double, HashMap<String, ArrayList<Integer>>> indexAlignments = new HashMap<>(); HashSet<String> noValueAttrs = new HashSet<String>(); values.keySet().forEach((attr) -> { values.get(attr).stream().filter( (value) -> ((!(value.matches("\"[xX][0-9]+\"") || value.matches("[xX][0-9]+") || value.startsWith(Action.TOKEN_X))) && !value.isEmpty())) .map((value) -> { String valueToCheck = value; if (valueToCheck.equals("no") || valueToCheck.equals("yes") || valueToCheck.equals("yes or no") || valueToCheck.equals("none") //|| attr.equals("dont_care") || valueToCheck.equals("empty")) { valueToCheck = attr + ":" + value; noValueAttrs.add(attr + "=" + value); } if (valueToCheck.equals(attr)) { noValueAttrs.add(attr + "=" + value); } return valueToCheck; }) .filter((valueToCheck) -> (!valueToCheck.equals("empty:empty") && getValueAlignments().containsKey(valueToCheck))) .forEachOrdered((valueToCheck) -> { for (ArrayList<String> align : getValueAlignments().get(valueToCheck) .keySet()) { int n = align.size(); for (int i = 0; i <= randomRealization.size() - n; i++) { ArrayList<String> compare = new ArrayList<String>(); ArrayList<Integer> indexAlignment = new ArrayList<Integer>(); for (int j = 0; j < n; j++) { compare.add(randomRealization.get(i + j).getWord()); indexAlignment.add(i + j); } if (compare.equals(align)) { if (!indexAlignments.containsKey( getValueAlignments().get(valueToCheck).get(align))) { indexAlignments.put( getValueAlignments().get(valueToCheck).get(align), new HashMap()); } indexAlignments .get(getValueAlignments().get(valueToCheck).get(align)) .put(attr + "=" + valueToCheck, indexAlignment); } } } }); }); ArrayList<Double> similarities = new ArrayList<>(indexAlignments.keySet()); Collections.sort(similarities); HashSet<String> assignedAttrValues = new HashSet<String>(); HashSet<Integer> assignedIntegers = new HashSet<Integer>(); for (int i = similarities.size() - 1; i >= 0; i--) { for (String attrValue : indexAlignments.get(similarities.get(i)).keySet()) { if (!assignedAttrValues.contains(attrValue)) { boolean isUnassigned = true; for (Integer index : indexAlignments.get(similarities.get(i)).get(attrValue)) { if (assignedIntegers.contains(index)) { isUnassigned = false; } } if (isUnassigned) { assignedAttrValues.add(attrValue); for (Integer index : indexAlignments.get(similarities.get(i)).get(attrValue)) { assignedIntegers.add(index); randomRealization.get(index).setAttribute(attrValue.toLowerCase().trim()); } } } } } //System.out.println("-1: " + randomRealization); randomRealization.stream().filter((a) -> (a.getWord().startsWith(Action.TOKEN_X))) .forEachOrdered((a) -> { String attr = a.getWord().substring(3, a.getWord().lastIndexOf('_')).toLowerCase() .trim(); a.setAttribute(attr + "=" + a.getWord()); }); HashSet<String> unalignedNoValueAttrs = new HashSet<>(); noValueAttrs.forEach((noValueAttr) -> { boolean assigned = false; for (Action a : randomRealization) { if (a.getAttribute().equals(noValueAttr)) { assigned = true; } } if (!assigned) { unalignedNoValueAttrs.add(noValueAttr); } }); boolean isAllEmpty = true; boolean hasSpace = false; for (int i = 0; i < randomRealization.size(); i++) { if (!randomRealization.get(i).getAttribute().isEmpty() && !randomRealization.get(i).getAttribute().equals("[]") && !randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { isAllEmpty = false; } if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { hasSpace = true; } } if (isAllEmpty && hasSpace && !unalignedNoValueAttrs.isEmpty()) { unalignedNoValueAttrs.forEach((attrValue) -> { int index = getRandomGen().nextInt(randomRealization.size()); boolean change = false; while (!change) { if (!randomRealization.get(index).getAttribute().equals(Action.TOKEN_PUNCT)) { randomRealization.get(index).setAttribute(attrValue.toLowerCase().trim()); change = true; } else { index = getRandomGen().nextInt(randomRealization.size()); } } }); } //System.out.println(isAllEmpty + " " + hasSpace + " " + unalignedNoValueAttrs); //System.out.println(">> " + noValueAttrs); //System.out.println(">> " + values); //System.out.println("0: " + randomRealization); String previousAttr = ""; int start = -1; for (int i = 0; i < randomRealization.size(); i++) { if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT) && !randomRealization.get(i).getAttribute().isEmpty() && !randomRealization.get(i).getAttribute().equals("[]")) { if (start != -1) { int middle = (start + i - 1) / 2 + 1; for (int j = start; j < middle; j++) { if (randomRealization.get(j).getAttribute().isEmpty() || randomRealization.get(j).getAttribute().equals("[]")) { randomRealization.get(j).setAttribute(previousAttr); } } for (int j = middle; j < i; j++) { if (randomRealization.get(j).getAttribute().isEmpty() || randomRealization.get(j).getAttribute().equals("[]")) { randomRealization.get(j) .setAttribute(randomRealization.get(i).getAttribute()); } } } start = i; previousAttr = randomRealization.get(i).getAttribute(); } else { previousAttr = ""; } } //System.out.println("1: " + randomRealization); previousAttr = ""; for (int i = randomRealization.size() - 1; i >= 0; i--) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } else { previousAttr = ""; } } //System.out.println("2: " + randomRealization); previousAttr = ""; for (int i = 0; i < randomRealization.size(); i++) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } } //System.out.println("3: " + randomRealization); previousAttr = ""; for (int i = randomRealization.size() - 1; i >= 0; i--) { if (randomRealization.get(i).getAttribute().isEmpty() || randomRealization.get(i).getAttribute().equals("[]")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } } //System.out.println("4: " + randomRealization); } //FIX WRONG @PUNCT@ String previousAttr = ""; for (int i = randomRealization.size() - 1; i >= 0; i--) { if (randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT) && !randomRealization.get(i).getWord().matches("[,.?!;:']")) { if (!previousAttr.isEmpty()) { randomRealization.get(i).setAttribute(previousAttr); } } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { previousAttr = randomRealization.get(i).getAttribute(); } } ArrayList<Action> cleanRandomRealization = new ArrayList<>(); randomRealization.stream().filter((a) -> (!a.getAttribute().equals(Action.TOKEN_PUNCT))) .forEachOrdered((a) -> { cleanRandomRealization.add(a); }); //ADD END TOKENS ArrayList<Action> endRandomRealization = new ArrayList<>(); previousAttr = ""; for (int i = 0; i < cleanRandomRealization.size(); i++) { Action a = cleanRandomRealization.get(i); if (!previousAttr.isEmpty() && !a.getAttribute().equals(previousAttr)) { endRandomRealization.add(new Action(Action.TOKEN_END, previousAttr)); } endRandomRealization.add(a); previousAttr = a.getAttribute(); } endRandomRealization.add(new Action(Action.TOKEN_END, previousAttr)); endRandomRealization.add(new Action(Action.TOKEN_END, Action.TOKEN_END)); calculatedRealizationsCache.put(realization, endRandomRealization); //System.out.println(di.getMeaningRepresentation().getPredicate() + ": " + endRandomRealization); ArrayList<String> attrValues = new ArrayList<String>(); endRandomRealization.forEach((a) -> { if (attrValues.isEmpty()) { attrValues.add(a.getAttribute()); } else if (!attrValues.get(attrValues.size() - 1).equals(a.getAttribute())) { attrValues.add(a.getAttribute()); } }); if (attrValues.size() > getMaxContentSequenceLength()) { setMaxContentSequenceLength(attrValues.size()); } ArrayList<Action> punctRealization = new ArrayList<>(); punctRealization.addAll(randomRealization); previousAttr = ""; for (int i = 0; i < punctRealization.size(); i++) { if (!punctRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) { if (!punctRealization.get(i).getAttribute().equals(previousAttr) && !previousAttr.isEmpty()) { punctRealization.add(i, new Action(Action.TOKEN_END, previousAttr)); i++; } previousAttr = punctRealization.get(i).getAttribute(); } } if (!punctRealization.get(punctRealization.size() - 1).getWord().equals(Action.TOKEN_END)) { punctRealization.add(new Action(Action.TOKEN_END, previousAttr)); } return punctRealization; }).map((punctRealization) -> { punctRealizations.put(di, punctRealization); return punctRealization; }).forEachOrdered((punctRealization) -> { for (int i = 0; i < punctRealization.size(); i++) { Action a = punctRealization.get(i); if (a.getAttribute().equals(Action.TOKEN_PUNCT)) { boolean legal = true; ArrayList<Action> surroundingActions = new ArrayList<>(); if (i - 2 >= 0) { surroundingActions.add(punctRealization.get(i - 2)); } else { surroundingActions.add(null); } if (i - 1 >= 0) { surroundingActions.add(punctRealization.get(i - 1)); } else { legal = false; } boolean oneMore = false; if (i + 1 < punctRealization.size()) { surroundingActions.add(punctRealization.get(i + 1)); if (!punctRealization.get(i + 1).getAttribute().equals(Action.TOKEN_END)) { oneMore = true; } } else { legal = false; } if (oneMore && i + 2 < punctRealization.size()) { surroundingActions.add(punctRealization.get(i + 2)); } else { surroundingActions.add(null); } if (legal) { if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .containsKey(surroundingActions)) { punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .put(surroundingActions, new HashMap<Action, Integer>()); } if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .get(surroundingActions).containsKey(a)) { punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .get(surroundingActions).put(a, 1); } else { punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .get(surroundingActions) .put(a, punctPatterns.get(di.getMeaningRepresentation().getPredicate()) .get(surroundingActions).get(a) + 1); } } } } }); di.setDirectReferenceSequence(calculatedRealizationsCache.get(di.getDirectReferenceSequence())); return di; }).forEachOrdered((di) -> { HashSet<String> attrValuesToBeMentioned = new HashSet<>(); di.getMeaningRepresentation().getAttributeValues().keySet().forEach((attribute) -> { int a = 0; for (String value : di.getMeaningRepresentation().getAttributeValues().get(attribute)) { if (value.startsWith("\"x")) { value = "x" + a; a++; } else if (value.startsWith("\"")) { value = value.substring(1, value.length() - 1).replaceAll(" ", "_"); } attrValuesToBeMentioned.add(attribute + "=" + value); } }); di.getDirectReferenceSequence().stream().map((key) -> { attrValuesToBeMentioned.remove(key.getAttribute()); return key; }); }); punctRealizations.keySet().forEach((di) -> { ArrayList<Action> punctRealization = punctRealizations.get(di); punctPatterns.get(di.getMeaningRepresentation().getPredicate()).keySet().forEach((surrounds) -> { int beforeNulls = 0; if (surrounds.get(0) == null) { beforeNulls++; } if (surrounds.get(1) == null) { beforeNulls++; } for (int i = 0 - beforeNulls; i < punctRealization.size(); i++) { boolean matches = true; int m = 0; for (int s = 0; s < surrounds.size(); s++) { if (surrounds.get(s) != null) { if (i + s < punctRealization.size()) { if (!punctRealization.get(i + s).getWord().equals(surrounds.get(s) .getWord()) /*|| !cleanActionList.get(i).getAttribute().equals(surrounds.get(s).getAttribute())*/) { matches = false; s = surrounds.size(); } else { m++; } } else { matches = false; s = surrounds.size(); } } else if (s < 2 && i + s >= 0) { matches = false; s = surrounds.size(); } else if (s >= 2 && i + s < punctRealization.size()) { matches = false; s = surrounds.size(); } } if (matches && m > 0) { Action a = new Action("", ""); if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds) .containsKey(a)) { punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds).put(a, 1); } else { punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds).put(a, punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds) .get(a) + 1); } } } }); }); punctPatterns.keySet().forEach((predicate) -> { punctPatterns.get(predicate).keySet().forEach((punct) -> { Action bestAction = null; int bestCount = 0; for (Action a : punctPatterns.get(predicate).get(punct).keySet()) { if (punctPatterns.get(predicate).get(punct).get(a) > bestCount) { bestAction = a; bestCount = punctPatterns.get(predicate).get(punct).get(a); } else if (punctPatterns.get(predicate).get(punct).get(a) == bestCount && bestAction.getWord().isEmpty()) { bestAction = a; } } if (!getPunctuationPatterns().containsKey(predicate)) { getPunctuationPatterns().put(predicate, new HashMap<ArrayList<Action>, Action>()); } if (!bestAction.getWord().isEmpty()) { getPunctuationPatterns().get(predicate).put(punct, bestAction); } }); }); }
From source file:structuredPredictionNLG.SFX.java
/** * Populates the predicate, attribute, attribute/value pair, and value alignment collections * @param dataFile The dataset file./*from w w w. j a va2 s . c o m*/ */ public void createLists(File dataFile) { try { // Initialize the collections setPredicates(new ArrayList<>()); setAttributes(new HashMap<>()); setAttributeValuePairs(new HashMap<>()); setValueAlignments(new HashMap<>()); // Obtain the dataset portion of the file String dataPart = new String(); boolean begin = false; try (BufferedReader br = new BufferedReader(new FileReader(dataFile))) { String s; while ((s = br.readLine()) != null) { if (s.startsWith("[")) { begin = true; } if (begin) { dataPart += s; } } } catch (FileNotFoundException ex) { Logger.getLogger(Bagel.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(Bagel.class.getName()).log(Level.SEVERE, null, ex); } // Parse the dataset with JSON JSONArray overArray = new JSONArray(dataPart); for (int o = 0; o < overArray.length(); o++) { // "dial" notes each seperate dialog JSONArray arr = overArray.getJSONObject(o).getJSONArray("dial"); for (int i = 0; i < arr.length(); i++) { String MRstr; String ref; // "dact" notes every meaning representation MRstr = arr.getJSONObject(i).getJSONObject("S").getString("dact"); // "ref" notes every corresponding reference ref = arr.getJSONObject(i).getJSONObject("S").getString("ref").replaceAll("-s", "s"); //We split some composite words (based on Wen et al's (2016) code) ref = (" " + ref + " ").replaceAll(" it's ", " it is ").replaceAll(" don't ", " do not ") .replaceAll(" doesn't ", " does not ").replaceAll(" didn't ", " did not ") .replaceAll(" you'd ", " you would ").replaceAll(" you're ", " you are ") .replaceAll(" you'll ", " you will ").replaceAll(" i'm ", " i am ") .replaceAll(" they're ", " they are ").replaceAll(" that's ", " that is ") .replaceAll(" what's ", " what is ").replaceAll(" couldn't ", " could not ") .replaceAll(" i've ", " i have ").replaceAll(" we've ", " we have ") .replaceAll(" can't ", " cannot ").replaceAll(" i'd ", " i would ") .replaceAll(" i'd ", " i would ").replaceAll(" aren't ", " are not ") .replaceAll(" isn't ", " is not ").replaceAll(" wasn't ", " was not ") .replaceAll(" weren't ", " were not ").replaceAll(" won't ", " will not ") .replaceAll(" there's ", " there is ").replaceAll(" there're ", " there are ") .replaceAll(" \\. \\. ", " \\. ").replaceAll(" restaurants ", " restaurant -s ") .replaceAll(" hotels ", " hotel -s ").replaceAll(" laptops ", " laptop -s ") .replaceAll(" cheaper ", " cheap -er ").replaceAll(" dinners ", " dinner -s ") .replaceAll(" lunches ", " lunch -s ").replaceAll(" breakfasts ", " breakfast -s ") .replaceAll(" expensively ", " expensive -ly ") .replaceAll(" moderately ", " moderate -ly ").replaceAll(" cheaply ", " cheap -ly ") .replaceAll(" prices ", " price -s ").replaceAll(" places ", " place -s ") .replaceAll(" venues ", " venue -s ").replaceAll(" ranges ", " range -s ") .replaceAll(" meals ", " meal -s ").replaceAll(" locations ", " location -s ") .replaceAll(" areas ", " area -s ").replaceAll(" policies ", " policy -s ") .replaceAll(" children ", " child -s ").replaceAll(" kids ", " kid -s ") .replaceAll(" kidfriendly ", " kid friendly ").replaceAll(" cards ", " card -s ") .replaceAll(" st ", " street ").replaceAll(" ave ", " avenue ") .replaceAll(" upmarket ", " expensive ").replaceAll(" inpricey ", " cheap ") .replaceAll(" inches ", " inch -s ").replaceAll(" uses ", " use -s ") .replaceAll(" dimensions ", " dimension -s ") .replaceAll(" driverange ", " drive range ").replaceAll(" includes ", " include -s ") .replaceAll(" computers ", " computer -s ").replaceAll(" machines ", " machine -s ") .replaceAll(" ecorating ", " eco rating ").replaceAll(" families ", " family -s ") .replaceAll(" ratings ", " rating -s ").replaceAll(" constraints ", " constraint -s ") .replaceAll(" pricerange ", " price range ") .replaceAll(" batteryrating ", " battery rating ") .replaceAll(" requirements ", " requirement -s ").replaceAll(" drives ", " drive -s ") .replaceAll(" specifications ", " specification -s ") .replaceAll(" weightrange ", " weight range ").replaceAll(" harddrive ", " hard drive ") .replaceAll(" batterylife ", " battery life ") .replaceAll(" businesses ", " business -s ").replaceAll(" hours ", " hour -s ") .replaceAll(" accessories ", " accessory -s ").replaceAll(" ports ", " port -s ") .replaceAll(" televisions ", " television -s ") .replaceAll(" restrictions ", " restriction -s ") .replaceAll(" extremely ", " extreme -ly ").replaceAll(" actually ", " actual -ly ") .replaceAll(" typically ", " typical -ly ").replaceAll(" drivers ", " driver -s ") .replaceAll(" teh ", " the ").replaceAll(" definitely ", " definite -ly ") .replaceAll(" factors ", " factor -s ").replaceAll(" truly ", " true -ly ") .replaceAll(" mostly ", " most -ly ").replaceAll(" nicely ", " nice -ly ") .replaceAll(" surely ", " sure -ly ").replaceAll(" certainly ", " certain -ly ") .replaceAll(" totally ", " total -ly ").replaceAll(" \\# ", " number ") .replaceAll(" \\& ", " and ").replaceAll(" avenue ", " ave ").replaceAll(" -s ", " s ") .trim(); // If the MR concerns one of the following predicates, and a ref is available if ((MRstr.startsWith("inform(") || MRstr.startsWith("inform_only") || MRstr.startsWith("inform_no_match(") || MRstr.startsWith("?confirm(") || MRstr.startsWith("?select(") || MRstr.startsWith("?request(") || MRstr.startsWith("?reqmore(") || MRstr.startsWith("goodbye(")) && !ref.isEmpty()) { // Obtain the predicate String predicate = MRstr.substring(0, MRstr.indexOf('(')); if (!getPredicates().contains(predicate) && predicate != null) { getPredicates().add(predicate); if (!getAttributes().containsKey(predicate)) { getAttributes().put(predicate, new HashSet<String>()); } if (!getDatasetInstances().containsKey(predicate)) { getDatasetInstances().put(predicate, new ArrayList<DatasetInstance>()); } } // Obtain the attributes String attributesStr = MRstr.substring(MRstr.indexOf('(') + 1, MRstr.length() - 1); HashMap<String, HashSet<String>> attributeValues = new HashMap<>(); // Track the indexes used for variables identifiers (seperately for each attribute) HashMap<String, Integer> attrXIndeces = new HashMap<>(); if (!attributesStr.isEmpty()) { // Parse the attributes and their values String[] args = attributesStr.split(";"); for (String arg : args) { String attr; String value = ""; // If the attribute has corresponding values if (arg.contains("=")) { String[] subAttr = arg.split("="); value = subAttr[1].toLowerCase(); attr = subAttr[0].toLowerCase().replaceAll("_", ""); if (value.startsWith("\'")) { value = value.substring(1, value.length() - 1); } // Normalize some closed set values if (value.equals("true")) { value = "yes"; } if (value.equals("false")) { value = "no"; } if (value.equals("dontcare")) { value = "dont_care"; } if ((" " + value + " ").contains(" avenue ")) { value = (" " + value + " ").replace(" avenue ", " ave ").trim(); } // Treat these values as seperate attributes since they are expressed quite differently if (value.equals("no") || value.equals("yes") || value.equals("yes or no") || value.equals("none") || value.equals("empty")) { attr += "_" + value.replaceAll(" ", "_"); value = attr; } // Treat "dont_care" instances, as if "dont_care" is the attribute, and the original attribute is the value // We do this because the phrasing is very similar between different "dont_care" realizations if (value.equals("dont_care")) { String v = value; value = attr; attr = v; } } else { attr = arg.replaceAll("_", ""); } if (!getAttributes().get(predicate).contains(attr)) { getAttributes().get(predicate).add(attr); } if (!attributeValues.containsKey(attr)) { attributeValues.put(attr, new HashSet<String>()); } // If the attribute has no corresponding value, we encode it by using the attibute identifier as the value if (value.isEmpty()) { value = attr; } // If the value is a variable, we name it as {@X@ + attribute identifier + variable index (for this attribute)} // This occurs when values are already set as variables in the MR, before any delexicalization happens if (value.toLowerCase().startsWith("x")) { int index = 0; if (!attrXIndeces.containsKey(attr)) { attrXIndeces.put(attr, 1); } else { index = attrXIndeces.get(attr); attrXIndeces.put(attr, index + 1); } value = "x" + index; } attributeValues.get(attr).add(value.trim().toLowerCase()); } } // Delexicalizing the attribute/value pairs HashMap<String, HashSet<String>> delexicalizedAttributeValues = new HashMap<>(); HashMap<String, HashMap<String, Integer>> attrValuePriorities = new HashMap<>(); int maximumPriority = 0; /* Delixalization of values needs to happen incrementally with priority given to the values of greater lenth, to avoid overlap of values in the reference * e.g. for the MR: inform{name="inn on castro", near="castro"}, with the reference "inn on castro is a nice restaurant", * we need to first align and delexicalize the "inn on castro" value, before the "castro" value * (in this case because "castro" doesn't appear in the reference, but even if it appeared later the priorities would help align it with the correct one) */ // We begin by determining which values may require delexicalization, and which not for (String attr : attributeValues.keySet()) { if (!attr.isEmpty()) { delexicalizedAttributeValues.put(attr, new HashSet<String>()); attrValuePriorities.put(attr, new HashMap<String, Integer>()); for (String value : attributeValues.get(attr)) { if (!value.equals("none") && !value.equals("empty") && !value.equals("yes") && !value.equals("yes or no") && !value.equals("no") && !value.equals(attr)) { // Initially priorities are given according to value order attrValuePriorities.get(attr).put(value, maximumPriority); maximumPriority++; } else { // No delexicalization is needed here delexicalizedAttributeValues.get(attr).add(value); } } } } // We shift the priorities of different values, according to their perspective lengths (i.e. longer values have higher priority) boolean change = true; while (change) { change = false; for (String attr1 : attrValuePriorities.keySet()) { for (String value1 : attrValuePriorities.get(attr1).keySet()) { for (String attr2 : attrValuePriorities.keySet()) { for (String value2 : attrValuePriorities.get(attr2).keySet()) { if (!value1.equals(value2) && value1.contains(value2) && attrValuePriorities.get(attr1).get( value1) > attrValuePriorities.get(attr2).get(value2)) { int prio1 = attrValuePriorities.get(attr1).get(value1); int prio2 = attrValuePriorities.get(attr2).get(value2); attrValuePriorities.get(attr1).put(value1, prio2); attrValuePriorities.get(attr2).put(value2, prio1); change = true; } } } } } } // Map between variables and their lexicalized values, required for relexicalization during postprocessing after the sentence generation HashMap<String, String> delexicalizationMap = new HashMap<>(); ref = " " + ref + " "; // Delexicalization occurs, in order of priority for (int priority = 0; priority < maximumPriority; priority++) { for (String attr : attrValuePriorities.keySet()) { if (!attrXIndeces.containsKey(attr)) { attrXIndeces.put(attr, 0); } for (String value : attrValuePriorities.get(attr).keySet()) { if (attrValuePriorities.get(attr).get(value) == priority) { // If the value doesn't appear verbatim in the reference, and the value is not composed of multiple subvalues (i.e. doesn't contain connectives) if (!ref.contains(" " + value + " ") && !value.contains(" and ") && !value.contains(" or ")) { if (value.equals("restaurant") && ref.contains(" place ")) { ref = ref.replace(" place ", " " + Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr) + " "); ref = ref.replaceAll(" ", " "); delexicalizedAttributeValues.get(attr) .add(Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr)); delexicalizationMap.put( Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr), "place"); attrXIndeces.put(attr, attrXIndeces.get(attr) + 1); } else { delexicalizedAttributeValues.get(attr).add(value); } // If the value doesn't appear verbatim in the reference, but the value is composed of multiple sub-values } else if (!ref.contains(" " + value + " ") && (value.contains(" and ") || value.contains(" or "))) { // We first check if the value appears verbatim when we switch "and" with "or" and vice versa // We do this due to some inconsistencies in the dataset on how conjuctions are treated String tempValue = value; if (value.contains(" and ")) { tempValue = value.replace(" and ", " or "); } else if (value.contains(" or ")) { tempValue = value.replace(" or ", " and "); } if (ref.contains(" " + tempValue + " ")) { ref = ref.replace(" " + tempValue + " ", " " + Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr) + " "); ref = ref.replaceAll(" ", " "); delexicalizedAttributeValues.get(attr) .add(Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr)); delexicalizationMap.put( Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr), value); attrXIndeces.put(attr, attrXIndeces.get(attr) + 1); } else { // We split the conjunction into the seperate values; so far the code supports only 2 sub-values String[] values = new String[2]; if (value.contains(" and ")) { values = value.split(" and "); } else if (value.contains(" or ")) { values = value.split(" or "); } // And check if the conjunction appears verbatim when we switch the position of the sub-values String newValue1 = values[1] + " and " + values[0]; String newValue2 = values[1] + " or " + values[0]; if (ref.contains(" " + newValue1 + " ")) { ref = ref.replace(" " + newValue1 + " ", " " + Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr) + " "); ref = ref.replaceAll(" ", " "); delexicalizedAttributeValues.get(attr).add( Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr)); delexicalizationMap.put( Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr), value); attrXIndeces.put(attr, attrXIndeces.get(attr) + 1); } else if (ref.contains(" " + newValue2 + " ")) { ref = ref.replace(" " + newValue2 + " ", " " + Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr) + " "); ref = ref.replaceAll(" ", " "); delexicalizedAttributeValues.get(attr).add( Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr)); delexicalizationMap.put( Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr), value); attrXIndeces.put(attr, attrXIndeces.get(attr) + 1); } } // If the value appears verbatim in the reference, delexicalize it } else { ref = ref.replace(" " + value + " ", " " + Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr) + " "); ref = ref.replaceAll(" ", " "); delexicalizedAttributeValues.get(attr) .add(Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr)); delexicalizationMap.put( Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr), value); attrXIndeces.put(attr, attrXIndeces.get(attr) + 1); } } } } } ref = ref.trim(); // We construct the MeaningRepresentation MeaningRepresentation MR = new MeaningRepresentation(predicate, delexicalizedAttributeValues, MRstr, delexicalizationMap); // Sequences of attribute/values pairs and words in the order we observe this in the reference ArrayList<String> observedAttrValueSequence = new ArrayList<>(); ArrayList<String> observedWordSequence = new ArrayList<>(); // The observed word sequence does not include punctuation String[] words = ref.replaceAll("([,.?!;:'])", " $1").split(" "); // We construct the observed word sequence (and fix some orthographical errors along the way) for (int w = 0; w < words.length; w++) { if (!words[w].trim().isEmpty()) { if (!words[w].trim().isEmpty() && (observedWordSequence.isEmpty() || !words[w].trim().equals( observedWordSequence.get(observedWordSequence.size() - 1)))) { if (words[w].trim().equals("s") && (observedWordSequence .get(observedWordSequence.size() - 1).equals("child"))) { observedWordSequence.set(observedWordSequence.size() - 1, "children"); } else if (words[w].trim().equals("addres") || words[w].trim().equals("adress")) { observedWordSequence.add("address"); } else if (words[w].trim().equals("mathch")) { observedWordSequence.add("match"); } else if (words[w].trim().equals("prefered")) { observedWordSequence.add("preferred"); } else if (words[w].trim().equals("relevent")) { observedWordSequence.add("relevant"); } else if (words[w].trim().equals("alloed")) { observedWordSequence.add("allowed"); } else if (words[w].trim().equals("avalible") || words[w].trim().equals("avalable")) { observedWordSequence.add("available"); } else if (words[w].trim().equals("tha") || words[w].trim().equals("te")) { observedWordSequence.add("the"); } else if (words[w].trim().equals("internect")) { observedWordSequence.add("internet"); } else if (words[w].trim().equals("wether")) { observedWordSequence.add("whether"); } else if (words[w].trim().equals("aplogize")) { observedWordSequence.add("apologize"); } else if (words[w].trim().equals("accomodations")) { observedWordSequence.add("accommodations"); } else if (words[w].trim().equals("whould")) { observedWordSequence.add("would"); } else if (words[w].trim().equals("aceepted")) { observedWordSequence.add("accepted"); } else if (words[w].trim().equals("postode")) { observedWordSequence.add("postcode"); } else if (words[w].trim().equals("ive")) { observedWordSequence.add("i"); observedWordSequence.add("have"); } else if (words[w].trim().equals("waht")) { observedWordSequence.add("what"); } else if (words[w].trim().equals("neighborhood")) { observedWordSequence.add("neighbourhood"); } else if (words[w].trim().equals("prefernce")) { observedWordSequence.add("preference"); } else if (words[w].trim().equals("dont")) { observedWordSequence.add("don't"); } else if (words[w].trim().equals("isnt")) { observedWordSequence.add("isn't"); } else if (words[w].trim().equals("intenet") || words[w].trim().equals("internetn")) { observedWordSequence.add("internet"); } else if (words[w].trim().equals("cannote")) { observedWordSequence.add("cannot"); } else if (words[w].trim().equals("notels")) { observedWordSequence.add("hotels"); } else if (words[w].trim().equals("phne")) { observedWordSequence.add("phone"); } else if (words[w].trim().equals("taht")) { observedWordSequence.add("that"); } else if (words[w].trim().equals("postdocde")) { observedWordSequence.add("postcode"); } else if (words[w].trim().equals("accpects")) { observedWordSequence.add("accepts"); } else if (words[w].trim().equals("doesn") || words[w].trim().equals("doesnt") || words[w].trim().equals("doesn")) { observedWordSequence.add("doesn't"); } else if (words[w].trim().equals("restaurnats")) { observedWordSequence.add("restarnauts"); } else if (words[w].trim().equals("ther") || words[w].trim().equals("thers")) { observedWordSequence.add("there"); // The dataset treats the suffixes "s" and "-ly" as separate words // We combine these suffixes with their preceding words but keep a cache with these changes to revert them before evaluation (we have to do this so that the token-based evaluation metrics are calculated in a consistent manner with Wen et al.'s) } else if (words[w].trim().equals("s")) { if (observedWordSequence.isEmpty()) { observedWordSequence.add(words[w].trim().toLowerCase()); } else if (observedWordSequence.get(observedWordSequence.size() - 1) .startsWith(Action.TOKEN_X)) { observedWordSequence.add(words[w].trim().toLowerCase()); } else { getCompositeSuffixesInData().put( observedWordSequence.get(observedWordSequence.size() - 1) + "s", observedWordSequence.get(observedWordSequence.size() - 1) + " s"); observedWordSequence.set(observedWordSequence.size() - 1, observedWordSequence.get(observedWordSequence.size() - 1) + "s"); } } else if (words[w].trim().equals("-ly")) { if (observedWordSequence.isEmpty()) { observedWordSequence.add(words[w].trim().toLowerCase()); } else if (observedWordSequence.get(observedWordSequence.size() - 1) .startsWith(Action.TOKEN_X)) { observedWordSequence.add(words[w].trim().toLowerCase()); } else { getCompositeSuffixesInData().put( observedWordSequence.get(observedWordSequence.size() - 1) + "ly", observedWordSequence.get(observedWordSequence.size() - 1) + " -ly"); observedWordSequence.set(observedWordSequence.size() - 1, observedWordSequence.get(observedWordSequence.size() - 1) + "ly"); } } else { observedWordSequence.add(words[w].trim().toLowerCase()); } } } } //Probably deprecated, need to do some more tests MR.getAttributeValues().keySet().forEach((attr) -> { MR.getAttributeValues().get(attr).stream() .filter((value) -> (attr.equals("name") && value.equals("none"))) .forEachOrdered((value) -> { observedAttrValueSequence.add(0, attr.toLowerCase() + "=" + value.toLowerCase()); }); }); observedAttrValueSequence.add(Action.TOKEN_END); // We store the maximum observed word sequence length, to use as a limit during generation if (observedWordSequence.size() > getMaxWordSequenceLength()) { setMaxWordSequenceLength(observedWordSequence.size()); } // We initialize the alignments between words and attribute/value pairs ArrayList<String> wordToAttrValueAlignment = new ArrayList<>(); // And populate them with "unaligned" tokens (i.e. "[]") and punctuation alignments; we do the latter so we know to filter out punctuation when estimating the alignments in later stages observedWordSequence.forEach((word) -> { if (word.trim().matches("[,.?!;:']")) { wordToAttrValueAlignment.add(Action.TOKEN_PUNCT); } else { wordToAttrValueAlignment.add("[]"); } }); // And using both word sequence and initial alignments, we construct a draft sequence of word actions corresponding to the reference ArrayList<Action> directReferenceSequence = new ArrayList<>(); for (int r = 0; r < observedWordSequence.size(); r++) { directReferenceSequence .add(new Action(observedWordSequence.get(r), wordToAttrValueAlignment.get(r))); } // Finally, we construct the DatasetInstance DatasetInstance DI = new DatasetInstance(MR, directReferenceSequence, postProcessRef(MR, directReferenceSequence)); // We add the evaluation references of all previously constructed DatasetInstances (that are identical to this one) as available evaluation references getDatasetInstances().get(predicate).stream() .filter((existingDI) -> (existingDI.getMeaningRepresentation().getAbstractMR() .equals(DI.getMeaningRepresentation().getAbstractMR()))) .map((existingDI) -> { existingDI.getEvaluationReferences().addAll(DI.getEvaluationReferences()); return existingDI; }).forEachOrdered((existingDI) -> { // We add the direct reference of this DatasetInstance as an available evaluation reference to all previously constructed DatasetInstance that are identical to this one DI.getEvaluationReferences().addAll(existingDI.getEvaluationReferences()); }); getDatasetInstances().get(predicate).add(DI); // Calculate the possible alignments between (non-delexicalized) attribute values and reference subphrases // We do this by comparing the values with n-gram subphrases of the reference, using character-level Levenshtein distance // These are used during the estimation of naive alignments, but also for tracking which values have possibly been expressed during generation HashMap<String, HashMap<String, Double>> observedValueAlignments = new HashMap<>(); MR.getAttributeValues().keySet().forEach((attr) -> { MR.getAttributeValues().get(attr).stream() .filter((value) -> (!value.equals("name=none") && !value.startsWith(Action.TOKEN_X) && !(value.matches("\"[xX][0-9]+\"") || value.matches("[xX][0-9]+")))) .forEachOrdered((value) -> { String valueToCompare = value; if (value.equals("no") || value.equals("yes") || value.equals("yes or no") || value.equals("none") || value.equals("empty")) { // If the value is boolean or non-existant, we also compare using the attribute name valueToCompare = attr; observedValueAlignments.put(valueToCompare + ":" + value, new HashMap<String, Double>()); } else { observedValueAlignments.put(valueToCompare, new HashMap<String, Double>()); } //For all n-grams in the referenec for (int n = 1; n < observedWordSequence.size(); n++) { //Calculate the similaritie between them and valueToCompare for (int r = 0; r <= observedWordSequence.size() - n; r++) { boolean compareAgainstNGram = true; for (int j = 0; j < n; j++) { if (observedWordSequence.get(r + j).startsWith(Action.TOKEN_X) || wordToAttrValueAlignment.get(r + j) .equals(Action.TOKEN_PUNCT) || StringNLPUtilities .isArticle(observedWordSequence.get(r + j)) || observedWordSequence.get(r + j) .equalsIgnoreCase("and") || observedWordSequence.get(r + j) .equalsIgnoreCase("or")) { // We ignore n-grams that contain variables, punctuation, articles, or conjuctions // In other words, we do not allow values to align with such n-grams compareAgainstNGram = false; } } if (compareAgainstNGram) { String align = ""; String compare = ""; String backwardCompare = ""; for (int j = 0; j < n; j++) { // The coordinates of the alignment align += (r + j) + " "; compare += observedWordSequence.get(r + j); backwardCompare = observedWordSequence.get(r + j) + backwardCompare; } align = align.trim(); // Calculate the character-level distance between the value and the nGram (in its original and reversed order) Double distance = Levenshtein.getSimilarity( valueToCompare.toLowerCase(), compare.toLowerCase(), true); Double backwardDistance = Levenshtein.getSimilarity( valueToCompare.toLowerCase(), backwardCompare.toLowerCase(), true); // We keep the best distance score; note that the Levenshtein distance is normalized so that greater is better if (backwardDistance > distance) { distance = backwardDistance; } // We ignore all nGrams that are less similar than a threshold if (distance > 0.3) { if (value.equals("no") || value.equals("yes") || value.equals("yes or no") || value.equals("none") || value.equals("empty")) { observedValueAlignments .get(valueToCompare + ":" + value) .put(align, distance); } else { observedValueAlignments.get(valueToCompare).put(align, distance); } } } } } }); }); // We filter out any values that haven't been aligned HashSet<String> toRemove = new HashSet<>(); for (String value : observedValueAlignments.keySet()) { if (observedValueAlignments.get(value).isEmpty()) { toRemove.add(value); } } for (String value : toRemove) { observedValueAlignments.remove(value); } // We keep the best aligned nGrams; since we do not want the aligned nGrams to be overlapping, we remove any overlapping alignments after we pick each one while (!observedValueAlignments.keySet().isEmpty()) { // Find the best aligned nGram Double max = Double.NEGATIVE_INFINITY; String[] bestAlignment = new String[2]; for (String value : observedValueAlignments.keySet()) { for (String alignment : observedValueAlignments.get(value).keySet()) { if (observedValueAlignments.get(value).get(alignment) > max) { max = observedValueAlignments.get(value).get(alignment); bestAlignment[0] = value; bestAlignment[1] = alignment; } } } // Find the subphrase that corresponds to the best aligned nGram, according to the coordinates ArrayList<String> alignedStr = new ArrayList<>(); String[] coords = bestAlignment[1].split(" "); if (coords.length == 1) { alignedStr.add(observedWordSequence.get(Integer.parseInt(coords[0].trim()))); } else { for (int a = Integer.parseInt(coords[0].trim()); a <= Integer .parseInt(coords[coords.length - 1].trim()); a++) { alignedStr.add(observedWordSequence.get(a)); } } // Store the best aligned nGram if (!getValueAlignments().containsKey(bestAlignment[0])) { getValueAlignments().put(bestAlignment[0], new HashMap<ArrayList<String>, Double>()); } getValueAlignments().get(bestAlignment[0]).put(alignedStr, max); // And remove it from the observed ones for this instance observedValueAlignments.remove(bestAlignment[0]); // And also remove any other aligned nGrams that are overlapping with the best aligned nGram observedValueAlignments.keySet().forEach((value) -> { HashSet<String> alignmentsToBeRemoved = new HashSet<>(); observedValueAlignments.get(value).keySet().forEach((alignment) -> { String[] othCoords = alignment.split(" "); if (Integer.parseInt(coords[0].trim()) <= Integer.parseInt(othCoords[0].trim()) && (Integer.parseInt(coords[coords.length - 1].trim()) >= Integer .parseInt(othCoords[0].trim())) || (Integer.parseInt(othCoords[0].trim()) <= Integer .parseInt(coords[0].trim()) && Integer.parseInt( othCoords[othCoords.length - 1].trim()) >= Integer .parseInt(coords[0].trim()))) { alignmentsToBeRemoved.add(alignment); } }); alignmentsToBeRemoved.forEach((alignment) -> { observedValueAlignments.get(value).remove(alignment); }); }); // We filter out any values that are no logner aligned (due to overlapping conflicts) toRemove = new HashSet<>(); for (String value : observedValueAlignments.keySet()) { if (observedValueAlignments.get(value).isEmpty()) { toRemove.add(value); } } for (String value : toRemove) { observedValueAlignments.remove(value); } } getObservedAttrValueSequences().add(observedAttrValueSequence); } } } } catch (JSONException ex) { } }
From source file:structuredPredictionNLG.SFX.java
/** * * @param classifierAttrs/*from w w w . j a va 2 s. c o m*/ * @param classifierWords * @param testingData * @param epoch * @return */ @Override public Double evaluateGeneration(HashMap<String, JAROW> classifierAttrs, HashMap<String, HashMap<String, JAROW>> classifierWords, ArrayList<DatasetInstance> testingData, int epoch) { System.out.println("Evaluate argument generation "); ArrayList<ScoredFeaturizedTranslation<IString, String>> generations = new ArrayList<>(); HashMap<DatasetInstance, ArrayList<Action>> generationActions = new HashMap<>(); ArrayList<ArrayList<Sequence<IString>>> finalReferences = new ArrayList<>(); HashMap<DatasetInstance, ArrayList<String>> finalReferencesWordSequences = new HashMap<>(); HashMap<DatasetInstance, String> predictedWordSequences_overAllPredicates = new HashMap<>(); ArrayList<String> allPredictedWordSequences = new ArrayList<>(); ArrayList<String> allPredictedMRStr = new ArrayList<>(); ArrayList<ArrayList<String>> allPredictedReferences = new ArrayList<>(); HashMap<String, Double> attrCoverage = new HashMap<>(); HashMap<String, HashSet<String>> abstractMRsToMRs = new HashMap<>(); for (DatasetInstance di : testingData) { String predicate = di.getMeaningRepresentation().getPredicate(); ArrayList<Action> predictedActionList = new ArrayList<>(); ArrayList<Action> predictedWordList = new ArrayList<>(); //PHRASE GENERATION EVALUATION String predictedAttr = ""; ArrayList<String> predictedAttrValues = new ArrayList<>(); HashSet<String> attrValuesToBeMentioned = new HashSet<>(); HashSet<String> attrValuesAlreadyMentioned = new HashSet<>(); for (String attribute : di.getMeaningRepresentation().getAttributeValues().keySet()) { for (String value : di.getMeaningRepresentation().getAttributeValues().get(attribute)) { attrValuesToBeMentioned.add(attribute.toLowerCase() + "=" + value.toLowerCase()); } } if (attrValuesToBeMentioned.isEmpty()) { attrValuesToBeMentioned.add("empty=empty"); } while (!predictedAttr.equals(Action.TOKEN_END) && predictedAttrValues.size() < getMaxContentSequenceLength()) { if (!predictedAttr.isEmpty()) { attrValuesToBeMentioned.remove(predictedAttr); } if (!attrValuesToBeMentioned.isEmpty()) { Instance attrTrainingVector = createContentInstance(predicate, "@TOK@", predictedAttrValues, attrValuesAlreadyMentioned, attrValuesToBeMentioned, di.getMeaningRepresentation(), getAvailableContentActions()); if (attrTrainingVector != null) { Prediction predictAttr = classifierAttrs.get(predicate).predict(attrTrainingVector); if (predictAttr.getLabel() != null) { predictedAttr = predictAttr.getLabel().trim(); if (!classifierAttrs.get(predicate).getCurrentWeightVectors().keySet() .containsAll(di.getMeaningRepresentation().getAttributeValues().keySet())) { System.out.println("MR ATTR NOT IN CLASSIFIERS"); System.out .println(classifierAttrs.get(predicate).getCurrentWeightVectors().keySet()); } String predictedValue = ""; if (!predictedAttr.equals(Action.TOKEN_END)) { predictedValue = chooseNextValue(predictedAttr, attrValuesToBeMentioned); HashSet<String> rejectedAttrs = new HashSet<>(); while (predictedValue.isEmpty() && (!predictedAttr.equals(Action.TOKEN_END) || (predictedAttrValues.isEmpty() && classifierAttrs.get(predicate).getCurrentWeightVectors().keySet() .containsAll(di.getMeaningRepresentation() .getAttributeValues().keySet())))) { rejectedAttrs.add(predictedAttr); predictedAttr = Action.TOKEN_END; double maxScore = -Double.MAX_VALUE; for (String attr : predictAttr.getLabel2Score().keySet()) { if (!rejectedAttrs.contains(attr) && (Double .compare(predictAttr.getLabel2Score().get(attr), maxScore) > 0)) { maxScore = predictAttr.getLabel2Score().get(attr); predictedAttr = attr; } } if (!predictedAttr.equals(Action.TOKEN_END)) { predictedValue = chooseNextValue(predictedAttr, attrValuesToBeMentioned); } } } if (!predictedAttr.equals(Action.TOKEN_END)) { predictedAttr += "=" + predictedValue; } predictedAttrValues.add(predictedAttr); if (!predictedAttr.isEmpty()) { attrValuesAlreadyMentioned.add(predictedAttr); attrValuesToBeMentioned.remove(predictedAttr); } } else { predictedAttr = Action.TOKEN_END; predictedAttrValues.add(predictedAttr); } } else { predictedAttr = Action.TOKEN_END; predictedAttrValues.add(predictedAttr); } } else { predictedAttr = Action.TOKEN_END; predictedAttrValues.add(predictedAttr); } } //WORD SEQUENCE EVALUATION predictedAttr = ""; ArrayList<String> predictedAttributes = new ArrayList<>(); attrValuesToBeMentioned = new HashSet<>(); attrValuesAlreadyMentioned = new HashSet<>(); HashMap<String, ArrayList<String>> valuesToBeMentioned = new HashMap<>(); for (String attribute : di.getMeaningRepresentation().getAttributeValues().keySet()) { for (String value : di.getMeaningRepresentation().getAttributeValues().get(attribute)) { attrValuesToBeMentioned.add(attribute.toLowerCase() + "=" + value.toLowerCase()); } valuesToBeMentioned.put(attribute, new ArrayList<>(di.getMeaningRepresentation().getAttributeValues().get(attribute))); } if (attrValuesToBeMentioned.isEmpty()) { attrValuesToBeMentioned.add("empty=empty"); } HashSet<String> attrValuesToBeMentionedCopy = new HashSet<>(attrValuesToBeMentioned); int a = -1; for (String attrValue : predictedAttrValues) { a++; if (!attrValue.equals(Action.TOKEN_END)) { String attribute = attrValue.split("=")[0]; predictedAttributes.add(attrValue); //GENERATE PHRASES if (!attribute.equals(Action.TOKEN_END)) { if (classifierWords.get(predicate).containsKey(attribute)) { ArrayList<String> nextAttributesForInstance = new ArrayList<>( predictedAttrValues.subList(a + 1, predictedAttrValues.size())); String predictedWord = ""; boolean isValueMentioned = false; String valueTBM = ""; if (attrValue.contains("=")) { valueTBM = attrValue.substring(attrValue.indexOf('=') + 1); } if (valueTBM.isEmpty()) { isValueMentioned = true; } ArrayList<String> subPhrase = new ArrayList<>(); while (!predictedWord.equals(Action.TOKEN_END) && predictedWordList.size() < getMaxWordSequenceLength()) { ArrayList<String> predictedAttributesForInstance = new ArrayList<>(); for (int i = 0; i < predictedAttributes.size() - 1; i++) { predictedAttributesForInstance.add(predictedAttributes.get(i)); } if (!predictedAttributes.get(predictedAttributes.size() - 1).equals(attrValue)) { predictedAttributesForInstance .add(predictedAttributes.get(predictedAttributes.size() - 1)); } Instance wordTrainingVector = createWordInstance(predicate, new Action("@TOK@", attrValue), predictedAttributesForInstance, predictedActionList, nextAttributesForInstance, attrValuesAlreadyMentioned, attrValuesToBeMentioned, isValueMentioned, getAvailableWordActions().get(predicate)); if (wordTrainingVector != null && classifierWords.get(predicate) != null) { if (classifierWords.get(predicate).get(attribute) != null) { Prediction predictWord = classifierWords.get(predicate).get(attribute) .predict(wordTrainingVector); if (predictWord.getLabel() != null) { predictedWord = predictWord.getLabel().trim(); while (predictedWord.equals(Action.TOKEN_END) && !predictedActionList.isEmpty() && predictedActionList.get(predictedActionList.size() - 1) .getWord().equals(Action.TOKEN_END)) { double maxScore = -Double.MAX_VALUE; for (String word : predictWord.getLabel2Score().keySet()) { if (!word.equals(Action.TOKEN_END) && (Double.compare( predictWord.getLabel2Score().get(word), maxScore) > 0)) { maxScore = predictWord.getLabel2Score().get(word); predictedWord = word; } } } predictedActionList.add(new Action(predictedWord, attrValue)); if (!predictedWord.equals(Action.TOKEN_START) && !predictedWord.equals(Action.TOKEN_END)) { subPhrase.add(predictedWord); predictedWordList.add(new Action(predictedWord, attrValue)); } } else { predictedWord = Action.TOKEN_END; predictedActionList.add(new Action(predictedWord, attrValue)); } } else { predictedWord = Action.TOKEN_END; predictedActionList.add(new Action(predictedWord, attrValue)); } } if (!isValueMentioned) { if (!predictedWord.equals(Action.TOKEN_END)) { if (predictedWord.startsWith(Action.TOKEN_X) && (valueTBM.matches("\"[xX][0-9]+\"") || valueTBM.matches("[xX][0-9]+") || valueTBM.startsWith(Action.TOKEN_X))) { isValueMentioned = true; } else if (!predictedWord.startsWith(Action.TOKEN_X) && !(valueTBM.matches("\"[xX][0-9]+\"") || valueTBM.matches("[xX][0-9]+") || valueTBM.startsWith(Action.TOKEN_X))) { String valueToCheck = valueTBM; if (valueToCheck.equals("no") || valueToCheck.equals("yes") || valueToCheck.equals("yes or no") || valueToCheck.equals("none") //|| valueToCheck.equals("dont_care") || valueToCheck.equals("empty")) { if (attribute.contains("=")) { valueToCheck = attribute.replace("=", ":"); } else { valueToCheck = attribute + ":" + valueTBM; } } if (!valueToCheck.equals("empty:empty") && getValueAlignments().containsKey(valueToCheck)) { for (ArrayList<String> alignedStr : getValueAlignments() .get(valueToCheck).keySet()) { if (endsWith(subPhrase, alignedStr)) { isValueMentioned = true; break; } } } } } if (isValueMentioned) { attrValuesAlreadyMentioned.add(attrValue); attrValuesToBeMentioned.remove(attrValue); } } String mentionedAttrValue = ""; if (!predictedWord.startsWith(Action.TOKEN_X)) { for (String attrValueTBM : attrValuesToBeMentioned) { if (attrValueTBM.contains("=")) { String value = attrValueTBM.substring(attrValueTBM.indexOf('=') + 1); if (!(value.matches("\"[xX][0-9]+\"") || value.matches("[xX][0-9]+") || value.startsWith(Action.TOKEN_X))) { String valueToCheck = value; if (valueToCheck.equals("no") || valueToCheck.equals("yes") || valueToCheck.equals("yes or no") || valueToCheck.equals("none") //|| valueToCheck.equals("dont_care") || valueToCheck.equals("empty")) { valueToCheck = attrValueTBM.replace("=", ":"); } if (!valueToCheck.equals("empty:empty") && getValueAlignments().containsKey(valueToCheck)) { for (ArrayList<String> alignedStr : getValueAlignments() .get(valueToCheck).keySet()) { if (endsWith(subPhrase, alignedStr)) { mentionedAttrValue = attrValueTBM; break; } } } } } } } if (!mentionedAttrValue.isEmpty()) { attrValuesAlreadyMentioned.add(mentionedAttrValue); attrValuesToBeMentioned.remove(mentionedAttrValue); } } if (predictedWordList.size() >= getMaxWordSequenceLength() && !predictedActionList .get(predictedActionList.size() - 1).getWord().equals(Action.TOKEN_END)) { predictedWord = Action.TOKEN_END; predictedActionList.add(new Action(predictedWord, attrValue)); } } else { String predictedWord = Action.TOKEN_END; predictedActionList.add(new Action(predictedWord, attrValue)); } } } } ArrayList<String> predictedAttrs = new ArrayList<>(); predictedAttrValues.forEach((attributeValuePair) -> { predictedAttrs.add(attributeValuePair.split("=")[0]); }); String predictedWordSequence = postProcessWordSequence(di, predictedActionList); ArrayList<String> predictedAttrList = getPredictedAttrList(predictedActionList); if (attrValuesToBeMentionedCopy.size() != 0.0) { double missingAttrs = 0.0; missingAttrs = attrValuesToBeMentionedCopy.stream() .filter((attr) -> (!predictedAttrList.contains(attr))).map((_item) -> 1.0) .reduce(missingAttrs, (accumulator, _item) -> accumulator + _item); double attrSize = attrValuesToBeMentionedCopy.size(); attrCoverage.put(predictedWordSequence, missingAttrs / attrSize); } allPredictedWordSequences.add(predictedWordSequence); allPredictedMRStr.add(di.getMeaningRepresentation().getMRstr()); predictedWordSequences_overAllPredicates.put(di, predictedWordSequence); if (!abstractMRsToMRs.containsKey(di.getMeaningRepresentation().getAbstractMR())) { abstractMRsToMRs.put(di.getMeaningRepresentation().getAbstractMR(), new HashSet<String>()); } abstractMRsToMRs.get(di.getMeaningRepresentation().getAbstractMR()) .add(di.getMeaningRepresentation().getMRstr()); Sequence<IString> translation = IStrings .tokenize(NISTTokenizer.tokenize(predictedWordSequence.toLowerCase())); ScoredFeaturizedTranslation<IString, String> tran = new ScoredFeaturizedTranslation<>(translation, null, 0); generations.add(tran); generationActions.put(di, predictedActionList); ArrayList<Sequence<IString>> references = new ArrayList<>(); ArrayList<String> referencesStrings = new ArrayList<>(); if (getPerformEvaluationOn().equals("valid") || getPerformEvaluationOn().equals("train")) { for (String ref : di.getEvaluationReferences()) { referencesStrings.add(ref); references.add(IStrings.tokenize(NISTTokenizer.tokenize(ref))); } } else { references = wenEvaluationReferenceSequences.get(di.getMeaningRepresentation().getMRstr()); referencesStrings = wenEvaluationReferences.get(di.getMeaningRepresentation().getMRstr()); if (references == null) { references = new ArrayList<>(); referencesStrings = new ArrayList<>(); for (String ref : di.getEvaluationReferences()) { referencesStrings.add(ref); references.add(IStrings.tokenize(NISTTokenizer.tokenize(ref))); } } } allPredictedReferences.add(referencesStrings); finalReferencesWordSequences.put(di, referencesStrings); finalReferences.add(references); } BLEUMetric BLEU = new BLEUMetric(finalReferences, 4, false); Double bleuScore = BLEU.score(generations); double finalCoverageError = 0.0; finalCoverageError = attrCoverage.values().stream().map((c) -> c).reduce(finalCoverageError, (accumulator, _item) -> accumulator + _item); finalCoverageError /= attrCoverage.size(); for (int i = 0; i < allPredictedWordSequences.size(); i++) { double maxRouge = 0.0; String predictedWordSequence = allPredictedWordSequences.get(i).replaceAll("\\?", " \\? ") .replaceAll(":", " : ").replaceAll("\\.", " \\. ").replaceAll(",", " , ").replaceAll(" ", " ") .trim(); for (String ref : allPredictedReferences.get(i)) { double rouge = Rouge.ROUGE_N(predictedWordSequence, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } //System.out.println(allPredictedMRStr.get(i) + "\t" + maxRouge + "\t" + allPredictedWordSequences.get(i) + "\t" + refs); } double avgRougeScore = 0.0; String detailedRes = ""; avgRougeScore = testingData.stream().map((di) -> { double maxRouge = 0.0; if (!finalReferencesWordSequences.containsKey(di)) { System.out.println(di.getMeaningRepresentation().getAbstractMR()); } String predictedWordSequence = predictedWordSequences_overAllPredicates.get(di) .replaceAll("\\?", " \\? ").replaceAll(":", " : ").replaceAll("\\.", " \\. ") .replaceAll(",", " , ").replaceAll(" ", " ").trim(); for (String ref : finalReferencesWordSequences.get(di)) { double rouge = Rouge.ROUGE_N(predictedWordSequence, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } return maxRouge; }).map((maxRouge) -> maxRouge).reduce(avgRougeScore, (accumulator, _item) -> accumulator + _item); System.out.println("BLEU: \t" + bleuScore); //System.out.println("g: " + generations); //System.out.println("attr: " + predictedAttrLists); //System.out.println("BLEU smooth: \t" + bleuSmoothScore); //System.out.println("g: " + generations); //System.out.println("attr: " + predictedAttrLists); //System.out.println("BLEU smooth: \t" + bleuSmoothScore); System.out.println("ROUGE: \t" + (avgRougeScore / allPredictedWordSequences.size())); System.out.println("COVERAGE ERROR: \t" + finalCoverageError); System.out.println("BRC: \t" + ((avgRougeScore / allPredictedWordSequences.size()) + bleuScore + (1.0 - finalCoverageError)) / 3.0); if (isCalculateResultsPerPredicate()) { //////////////////////// //ArrayList<String> bestPredictedStrings = new ArrayList<>(); //ArrayList<String> bestPredictedStringsMRs = new ArrayList<>(); double uniqueMRsInTestAndNotInTrainAllPredWordBLEU = 0.0; double uniqueMRsInTestAndNotInTrainAllPredWordROUGE = 0.0; double uniqueMRsInTestAndNotInTrainAllPredWordCOVERAGEERR = 0.0; double uniqueMRsInTestAndNotInTrainAllPredWordBRC = 0.0; detailedRes = ""; ArrayList<DatasetInstance> abstractMRList = new ArrayList<>(); HashSet<String> reportedAbstractMRs = new HashSet<>(); testingData.stream() .filter((di) -> (!reportedAbstractMRs.contains(di.getMeaningRepresentation().getAbstractMR()))) .map((di) -> { reportedAbstractMRs.add(di.getMeaningRepresentation().getAbstractMR()); return di; }).forEachOrdered((di) -> { boolean isInTraining = false; for (DatasetInstance di2 : getTrainingData()) { if (di2.getMeaningRepresentation().getAbstractMR() .equals(di.getMeaningRepresentation().getAbstractMR())) { isInTraining = true; } } if (!isInTraining) { for (DatasetInstance di2 : getValidationData()) { if (di2.getMeaningRepresentation().getAbstractMR() .equals(di.getMeaningRepresentation().getAbstractMR())) { isInTraining = true; } } } if (!isInTraining) { abstractMRList.add(di); } }); for (DatasetInstance di : abstractMRList) { Double bestROUGE = -100.0; Double bestBLEU = -100.0; Double bestCover = -100.0; Double bestHarmonicMean = -100.0; String predictedString = predictedWordSequences_overAllPredicates.get(di); reportedAbstractMRs.add(di.getMeaningRepresentation().getAbstractMR()); double maxRouge = 0.0; String predictedWordSequence = predictedString.replaceAll("\\?", " \\? ").replaceAll(":", " : ") .replaceAll("\\.", " \\. ").replaceAll(",", " , ").replaceAll(" ", " ").trim(); for (String ref : finalReferencesWordSequences.get(di)) { double rouge = Rouge.ROUGE_N(predictedWordSequence, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } double BLEUSmooth = BLEUMetric.computeLocalSmoothScore(predictedWordSequence, finalReferencesWordSequences.get(di), 4); double cover = 1.0 - attrCoverage.get(predictedString); double harmonicMean = 3.0 / (1.0 / BLEUSmooth + 1.0 / maxRouge + 1.0 / cover); if (harmonicMean > bestHarmonicMean) { bestROUGE = maxRouge; bestBLEU = BLEUSmooth; bestCover = cover; bestHarmonicMean = harmonicMean; } uniqueMRsInTestAndNotInTrainAllPredWordBLEU += bestBLEU; uniqueMRsInTestAndNotInTrainAllPredWordROUGE += bestROUGE; uniqueMRsInTestAndNotInTrainAllPredWordCOVERAGEERR += bestCover; uniqueMRsInTestAndNotInTrainAllPredWordBRC += bestHarmonicMean; } uniqueMRsInTestAndNotInTrainAllPredWordBLEU /= abstractMRList.size(); uniqueMRsInTestAndNotInTrainAllPredWordROUGE /= abstractMRList.size(); uniqueMRsInTestAndNotInTrainAllPredWordCOVERAGEERR /= abstractMRList.size(); uniqueMRsInTestAndNotInTrainAllPredWordBRC /= abstractMRList.size(); System.out.println( "UNIQUE (NOT IN TRAIN) WORD ALL PRED BLEU: \t" + uniqueMRsInTestAndNotInTrainAllPredWordBLEU); System.out.println( "UNIQUE (NOT IN TRAIN) WORD ALL PRED ROUGE: \t" + uniqueMRsInTestAndNotInTrainAllPredWordROUGE); System.out.println("UNIQUE (NOT IN TRAIN) WORD ALL PRED COVERAGE ERROR: \t" + (1.0 - uniqueMRsInTestAndNotInTrainAllPredWordCOVERAGEERR)); System.out.println( "UNIQUE (NOT IN TRAIN) WORD ALL PRED BRC: \t" + uniqueMRsInTestAndNotInTrainAllPredWordBRC); abstractMRList.forEach((di) -> { System.out.println(di.getMeaningRepresentation().getAbstractMR() + "\t" + predictedWordSequences_overAllPredicates.get(di)); }); System.out.println("TOTAL SET SIZE: \t" + abstractMRList.size()); //System.out.println(abstractMRList); //System.out.println(detailedRes); } ArrayList<String> bestPredictedStrings = new ArrayList<>(); ArrayList<String> bestPredictedStringsMRs = new ArrayList<>(); double uniqueAllPredWordBLEU = 0.0; double uniqueAllPredWordROUGE = 0.0; double uniqueAllPredWordCOVERAGEERR = 0.0; double uniqueAllPredWordBRC = 0.0; HashSet<String> reportedAbstractMRs = new HashSet<>(); for (DatasetInstance di : testingData) { if (!reportedAbstractMRs.contains(di.getMeaningRepresentation().getAbstractMR())) { String bestPredictedString = ""; Double bestROUGE = -100.0; Double bestBLEU = -100.0; Double bestCover = -100.0; Double bestHarmonicMean = -100.0; String predictedString = predictedWordSequences_overAllPredicates.get(di); reportedAbstractMRs.add(di.getMeaningRepresentation().getAbstractMR()); double maxRouge = 0.0; String predictedWordSequence = predictedString.replaceAll("\\?", " \\? ").replaceAll(":", " : ") .replaceAll("\\.", " \\. ").replaceAll(",", " , ").replaceAll(" ", " ").trim(); for (String ref : finalReferencesWordSequences.get(di)) { double rouge = Rouge.ROUGE_N(predictedWordSequence, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } double BLEUSmooth = BLEUMetric.computeLocalSmoothScore(predictedWordSequence, finalReferencesWordSequences.get(di), 4); double cover = 1.0 - attrCoverage.get(predictedString); double harmonicMean = 3.0 / (1.0 / BLEUSmooth + 1.0 / maxRouge + 1.0 / cover); if (harmonicMean > bestHarmonicMean) { bestPredictedString = predictedString; bestROUGE = maxRouge; bestBLEU = BLEUSmooth; bestCover = cover; bestHarmonicMean = harmonicMean; } bestPredictedStrings.add(bestPredictedString); bestPredictedStringsMRs.add(di.getMeaningRepresentation().getMRstr()); uniqueAllPredWordBLEU += bestBLEU; uniqueAllPredWordROUGE += bestROUGE; uniqueAllPredWordCOVERAGEERR += bestCover; uniqueAllPredWordBRC += bestHarmonicMean; } //} } if (isCalculateResultsPerPredicate()) { uniqueAllPredWordBLEU /= reportedAbstractMRs.size(); uniqueAllPredWordROUGE /= reportedAbstractMRs.size(); uniqueAllPredWordCOVERAGEERR /= reportedAbstractMRs.size(); uniqueAllPredWordBRC /= reportedAbstractMRs.size(); System.out.println("UNIQUE WORD ALL PRED BLEU: \t" + uniqueAllPredWordBLEU); System.out.println("UNIQUE WORD ALL PRED ROUGE: \t" + uniqueAllPredWordROUGE); System.out.println("UNIQUE WORD ALL PRED COVERAGE ERROR: \t" + (1.0 - uniqueAllPredWordCOVERAGEERR)); System.out.println("UNIQUE WORD ALL PRED BRC: \t" + uniqueAllPredWordBRC); System.out.println(detailedRes); System.out.println("TOTAL: \t" + reportedAbstractMRs.size()); //////////////////////// for (String predicate : getPredicates()) { detailedRes = ""; bestPredictedStrings = new ArrayList<>(); bestPredictedStringsMRs = new ArrayList<>(); double uniquePredWordBLEU = 0.0; double uniquePredWordROUGE = 0.0; double uniquePredWordCOVERAGEERR = 0.0; double uniquePredWordBRC = 0.0; reportedAbstractMRs = new HashSet<>(); for (DatasetInstance di : testingData) { if (di.getMeaningRepresentation().getPredicate().equals(predicate) && !reportedAbstractMRs.contains(di.getMeaningRepresentation().getAbstractMR())) { String bestPredictedString = ""; Double bestROUGE = -100.0; Double bestBLEU = -100.0; Double bestCover = -100.0; Double bestHarmonicMean = -100.0; String predictedString = predictedWordSequences_overAllPredicates.get(di); reportedAbstractMRs.add(di.getMeaningRepresentation().getAbstractMR()); double maxRouge = 0.0; String predictedWordSequence = predictedString.replaceAll("\\?", " \\? ") .replaceAll(":", " : ").replaceAll("\\.", " \\. ").replaceAll(",", " , ") .replaceAll(" ", " ").trim(); for (String ref : finalReferencesWordSequences.get(di)) { double rouge = Rouge.ROUGE_N(predictedWordSequence, ref, 4); if (rouge > maxRouge) { maxRouge = rouge; } } double BLEUSmooth = BLEUMetric.computeLocalSmoothScore(predictedWordSequence, finalReferencesWordSequences.get(di), 4); double cover = 1.0 - attrCoverage.get(predictedString); double harmonicMean = 3.0 / (1.0 / BLEUSmooth + 1.0 / maxRouge + 1.0 / cover); if (harmonicMean > bestHarmonicMean) { bestPredictedString = predictedString; bestROUGE = maxRouge; bestBLEU = BLEUSmooth; bestCover = cover; bestHarmonicMean = harmonicMean; } bestPredictedStrings.add(bestPredictedString); bestPredictedStringsMRs.add(di.getMeaningRepresentation().getMRstr()); uniquePredWordBLEU += bestBLEU; uniquePredWordROUGE += bestROUGE; uniquePredWordCOVERAGEERR += bestCover; uniquePredWordBRC += bestHarmonicMean; } } uniquePredWordBLEU /= reportedAbstractMRs.size(); uniquePredWordROUGE /= reportedAbstractMRs.size(); uniquePredWordCOVERAGEERR /= reportedAbstractMRs.size(); uniquePredWordBRC /= reportedAbstractMRs.size(); System.out.println("UNIQUE WORD " + predicate + " BLEU: \t" + uniquePredWordBLEU); System.out.println("UNIQUE WORD " + predicate + " ROUGE: \t" + uniquePredWordROUGE); System.out.println( "UNIQUE WORD " + predicate + " COVERAGE ERROR: \t" + (1.0 - uniquePredWordCOVERAGEERR)); System.out.println("UNIQUE WORD " + predicate + " BRC: \t" + uniquePredWordBRC); System.out.println(detailedRes); System.out.println("TOTAL " + predicate + ": \t" + reportedAbstractMRs.size()); } } if (isCalculateResultsPerPredicate()) { BufferedWriter bw = null; File f = null; try { f = new File("results/random_SFX" + getDataset() + "TextsAfter" + (epoch) + "_" + JLOLS.sentenceCorrectionFurtherSteps + "_" + JLOLS.p + "epochsTESTINGDATA.txt"); } catch (NullPointerException e) { } try { bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f))); } catch (FileNotFoundException e) { } try { bw.write("BLEU:" + bleuScore); bw.write("\n"); } catch (IOException e) { } for (int i = 0; i < bestPredictedStrings.size(); i++) { try { String mr = bestPredictedStringsMRs.get(i); bw.write("MR;" + mr.replaceAll(";", ",") + ";"); if (getDataset().equals("hotel")) { bw.write("LOLS_SFHOT;"); } else { bw.write("LOLS_SFRES;"); } bw.write("\n"); } catch (IOException e) { } } try { bw.close(); } catch (IOException e) { } } return bleuScore; }
From source file:structuredPredictionNLG.SFX.java
/** * * @param predicate//from w w w . ja v a 2 s . co m * @param currentAttrValue * @param costs * @param generatedAttributes * @param previousGeneratedWords * @param nextGeneratedAttributes * @param attrValuesAlreadyMentioned * @param attrValuesThatFollow * @param wasValueMentioned * @param availableWordActions * @return */ @Override public Instance createWordInstanceWithCosts(String predicate, String currentAttrValue, TObjectDoubleHashMap<String> costs, ArrayList<String> generatedAttributes, ArrayList<Action> previousGeneratedWords, ArrayList<String> nextGeneratedAttributes, HashSet<String> attrValuesAlreadyMentioned, HashSet<String> attrValuesThatFollow, boolean wasValueMentioned, HashMap<String, HashSet<Action>> availableWordActions) { String currentAttr = currentAttrValue; String currentValue = ""; if (currentAttr.contains("=")) { currentAttr = currentAttrValue.substring(0, currentAttrValue.indexOf('=')); currentValue = currentAttrValue.substring(currentAttrValue.indexOf('=') + 1); } if (currentValue.contains(":")) { currentValue = currentAttrValue.substring(currentAttrValue.indexOf(':') + 1); } if (currentValue.isEmpty()) { //System.exit(0); } TObjectDoubleHashMap<String> generalFeatures = new TObjectDoubleHashMap<>(); HashMap<String, TObjectDoubleHashMap<String>> valueSpecificFeatures = new HashMap<>(); for (Action action : availableWordActions.get(currentAttr)) { valueSpecificFeatures.put(action.getAction(), new TObjectDoubleHashMap<String>()); } /*if (gWords.get(wIndex).getWord().equals(Action.TOKEN_END)) { System.out.println("!!! "+ gWords.subList(0, wIndex + 1)); }*/ ArrayList<Action> generatedWords = new ArrayList<>(); ArrayList<Action> generatedWordsInSameAttrValue = new ArrayList<>(); ArrayList<String> generatedPhrase = new ArrayList<>(); for (int i = 0; i < previousGeneratedWords.size(); i++) { Action a = previousGeneratedWords.get(i); if (!a.getWord().equals(Action.TOKEN_START) && !a.getWord().equals(Action.TOKEN_END)) { generatedWords.add(a); generatedPhrase.add(a.getWord()); if (a.getAttribute().equals(currentAttrValue)) { generatedWordsInSameAttrValue.add(a); } } } //Previous word features for (int j = 1; j <= 1; j++) { String previousWord = "@@"; if (generatedWords.size() - j >= 0) { previousWord = generatedWords.get(generatedWords.size() - j).getWord().trim(); } generalFeatures.put("feature_word_" + j + "_" + previousWord.toLowerCase(), 1.0); } String prevWord = "@@"; if (generatedWords.size() - 1 >= 0) { prevWord = generatedWords.get(generatedWords.size() - 1).getWord().trim(); } String prev2Word = "@@"; if (generatedWords.size() - 2 >= 0) { prev2Word = generatedWords.get(generatedWords.size() - 2).getWord().trim(); } String prev3Word = "@@"; if (generatedWords.size() - 3 >= 0) { prev3Word = generatedWords.get(generatedWords.size() - 3).getWord().trim(); } String prev4Word = "@@"; if (generatedWords.size() - 4 >= 0) { prev4Word = generatedWords.get(generatedWords.size() - 4).getWord().trim(); } String prev5Word = "@@"; if (generatedWords.size() - 5 >= 0) { prev5Word = generatedWords.get(generatedWords.size() - 5).getWord().trim(); } String prevBigram = prev2Word + "|" + prevWord; String prevTrigram = prev3Word + "|" + prev2Word + "|" + prevWord; String prev4gram = prev4Word + "|" + prev3Word + "|" + prev2Word + "|" + prevWord; String prev5gram = prev5Word + "|" + prev4Word + "|" + prev3Word + "|" + prev2Word + "|" + prevWord; generalFeatures.put("feature_word_bigram_" + prevBigram.toLowerCase(), 1.0); generalFeatures.put("feature_word_trigram_" + prevTrigram.toLowerCase(), 1.0); generalFeatures.put("feature_word_4gram_" + prev4gram.toLowerCase(), 1.0); generalFeatures.put("feature_word_5gram_" + prev5gram.toLowerCase(), 1.0); /*String bigramWord54 = prev5Word + "|" + prev4Word; String bigramWord43 = prev4Word + "|" + prev3Word; String bigramWord32 = prev3Word + "|" + prev2Word; generalFeatures.put("feature_word_bigramWord54_" + bigramWord54, 1.0); generalFeatures.put("feature_word_bigramWord43_" + bigramWord43, 1.0); generalFeatures.put("feature_word_bigramWord32_" + bigramWord32, 1.0); String bigramWordSkip53 = prev5Word + "|" + prev3Word; String bigramWordSkip42 = prev4Word + "|" + prev2Word; String bigramWordSkip31 = prev3Word + "|" + prevWord; generalFeatures.put("feature_word_bigramWordSkip53_" + bigramWordSkip53, 1.0); generalFeatures.put("feature_word_bigramWordSkip42_" + bigramWordSkip42, 1.0); generalFeatures.put("feature_word_bigramWordSkip31_" + bigramWordSkip31, 1.0); String trigramWord543 = prev5Word + "|" + prev4Word + "|" + prev3Word; String trigramWord432 = prev4Word + "|" + prev3Word + "|" + prev2Word; generalFeatures.put("feature_word_trigramWord543_" + trigramWord543, 1.0); generalFeatures.put("feature_word_trigramWord432_" + trigramWord432, 1.0); String trigramWordSkip542 = prev5Word + "|" + prev4Word + "|" + prev2Word; String trigramWordSkip532 = prev5Word + "|" + prev3Word + "|" + prev2Word; String trigramWordSkip431 = prev4Word + "|" + prev3Word + "|" + prevWord; String trigramWordSkip421 = prev4Word + "|" + prev2Word + "|" + prevWord; generalFeatures.put("feature_word_trigramWordSkip542_" + trigramWordSkip542, 1.0); generalFeatures.put("feature_word_trigramWordSkip532_" + trigramWordSkip532, 1.0); generalFeatures.put("feature_word_trigramWordSkip431_" + trigramWordSkip431, 1.0); generalFeatures.put("feature_word_trigramWordSkip421_" + trigramWordSkip421, 1.0);*/ //Previous words in same as current attrValue features /*if (generatedWordsInSameAttrValue.isEmpty()) { generalFeatures.put("feature_currentAttrValueWord_isEmpty", 1.0); } for (int j = 1; j <= 1; j++) { String previousCurrentAttrValueWord = "@@"; if (generatedWordsInSameAttrValue.size() - j >= 0) { previousCurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - j).getWord().trim(); } generalFeatures.put("feature_currentAttrValueWord_" + j + "_" + previousCurrentAttrValueWord.toLowerCase(), 1.0); } String prevCurrentAttrValueWord = "@@"; if (generatedWordsInSameAttrValue.size() - 1 >= 0) { prevCurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 1).getWord().trim(); } String prev2CurrentAttrValueWord = "@@"; if (generatedWordsInSameAttrValue.size() - 2 >= 0) { prev2CurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 2).getWord().trim(); } String prev3CurrentAttrValueWord = "@@"; if (generatedWordsInSameAttrValue.size() - 3 >= 0) { prev3CurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 3).getWord().trim(); } String prev4CurrentAttrValueWord = "@@"; if (generatedWordsInSameAttrValue.size() - 4 >= 0) { prev4CurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 4).getWord().trim(); } String prev5CurrentAttrValueWord = "@@"; if (generatedWordsInSameAttrValue.size() - 5 >= 0) { prev5CurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 5).getWord().trim(); } String prevCurrentAttrValueBigram = prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord; String prevCurrentAttrValueTrigram = prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord; String prevCurrentAttrValue4gram = prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord; String prevCurrentAttrValue5gram = prev5CurrentAttrValueWord + "|" + prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord; generalFeatures.put("feature_currentAttrValueWord_bigram_" + prevCurrentAttrValueBigram.toLowerCase(), 1.0); generalFeatures.put("feature_currentAttrValueWord_trigram_" + prevCurrentAttrValueTrigram.toLowerCase(), 1.0); generalFeatures.put("feature_currentAttrValueWord_4gram_" + prevCurrentAttrValue4gram.toLowerCase(), 1.0); generalFeatures.put("feature_currentAttrValueWord_5gram_" + prevCurrentAttrValue5gram.toLowerCase(), 1.0);*/ /*String bigramCurrentAttrValueWord54 = prev5CurrentAttrValueWord + "|" + prev4CurrentAttrValueWord; String bigramCurrentAttrValueWord43 = prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord; String bigramCurrentAttrValueWord32 = prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord; generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWord54_" + bigramCurrentAttrValueWord54, 1.0); generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWord43_" + bigramCurrentAttrValueWord43, 1.0); generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWord32_" + bigramCurrentAttrValueWord32, 1.0); String bigramCurrentAttrValueWordSkip53 = prev5CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord; String bigramCurrentAttrValueWordSkip42 = prev4CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord; String bigramCurrentAttrValueWordSkip31 = prev3CurrentAttrValueWord + "|" + prevCurrentAttrValueWord; generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWordSkip53_" + bigramCurrentAttrValueWordSkip53, 1.0); generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWordSkip42_" + bigramCurrentAttrValueWordSkip42, 1.0); generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWordSkip31_" + bigramCurrentAttrValueWordSkip31, 1.0); String trigramCurrentAttrValueWord543 = prev5CurrentAttrValueWord + "|" + prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord; String trigramCurrentAttrValueWord432 = prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord; generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWord543_" + trigramCurrentAttrValueWord543, 1.0); generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWord432_" + trigramCurrentAttrValueWord432, 1.0); String trigramCurrentAttrValueWordSkip542 = prev5CurrentAttrValueWord + "|" + prev4CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord; String trigramCurrentAttrValueWordSkip532 = prev5CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord; String trigramCurrentAttrValueWordSkip431 = prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prevCurrentAttrValueWord; String trigramCurrentAttrValueWordSkip421 = prev4CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord; generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWordSkip542_" + trigramCurrentAttrValueWordSkip542, 1.0); generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWordSkip532_" + trigramCurrentAttrValueWordSkip532, 1.0); generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWordSkip431_" + trigramCurrentAttrValueWordSkip431, 1.0); generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWordSkip421_" + trigramCurrentAttrValueWordSkip421, 1.0);*/ //Previous Attr|Word features for (int j = 1; j <= 1; j++) { String previousAttrWord = "@@"; if (generatedWords.size() - j >= 0) { if (generatedWords.get(generatedWords.size() - j).getAttribute().contains("=")) { previousAttrWord = generatedWords.get(generatedWords.size() - j).getAttribute().trim() .substring(0, generatedWords.get(generatedWords.size() - j).getAttribute().indexOf('=')) + "|" + generatedWords.get(generatedWords.size() - j).getWord().trim(); } else { previousAttrWord = generatedWords.get(generatedWords.size() - j).getAttribute().trim() + "|" + generatedWords.get(generatedWords.size() - j).getWord().trim(); } } generalFeatures.put("feature_attrWord_" + j + "_" + previousAttrWord.toLowerCase(), 1.0); } String prevAttrWord = "@@"; if (generatedWords.size() - 1 >= 0) { if (generatedWords.get(generatedWords.size() - 1).getAttribute().contains("=")) { prevAttrWord = generatedWords.get(generatedWords.size() - 1).getAttribute().trim().substring(0, generatedWords.get(generatedWords.size() - 1).getAttribute().indexOf('=')) + ":" + generatedWords.get(generatedWords.size() - 1).getWord().trim(); } else { prevAttrWord = generatedWords.get(generatedWords.size() - 1).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 1).getWord().trim(); } } String prev2AttrWord = "@@"; if (generatedWords.size() - 2 >= 0) { if (generatedWords.get(generatedWords.size() - 2).getAttribute().contains("=")) { prev2AttrWord = generatedWords.get(generatedWords.size() - 2).getAttribute().trim().substring(0, generatedWords.get(generatedWords.size() - 2).getAttribute().indexOf('=')) + ":" + generatedWords.get(generatedWords.size() - 2).getWord().trim(); } else { prev2AttrWord = generatedWords.get(generatedWords.size() - 2).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 2).getWord().trim(); } } String prev3AttrWord = "@@"; if (generatedWords.size() - 3 >= 0) { if (generatedWords.get(generatedWords.size() - 3).getAttribute().contains("=")) { prev3AttrWord = generatedWords.get(generatedWords.size() - 3).getAttribute().trim().substring(0, generatedWords.get(generatedWords.size() - 3).getAttribute().indexOf('=')) + ":" + generatedWords.get(generatedWords.size() - 3).getWord().trim(); } else { prev3AttrWord = generatedWords.get(generatedWords.size() - 3).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 3).getWord().trim(); } } String prev4AttrWord = "@@"; if (generatedWords.size() - 4 >= 0) { if (generatedWords.get(generatedWords.size() - 4).getAttribute().contains("=")) { prev4AttrWord = generatedWords.get(generatedWords.size() - 4).getAttribute().trim().substring(0, generatedWords.get(generatedWords.size() - 4).getAttribute().indexOf('=')) + ":" + generatedWords.get(generatedWords.size() - 4).getWord().trim(); } else { prev4AttrWord = generatedWords.get(generatedWords.size() - 4).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 4).getWord().trim(); } } String prev5AttrWord = "@@"; if (generatedWords.size() - 5 >= 0) { if (generatedWords.get(generatedWords.size() - 5).getAttribute().contains("=")) { prev5AttrWord = generatedWords.get(generatedWords.size() - 5).getAttribute().trim().substring(0, generatedWords.get(generatedWords.size() - 5).getAttribute().indexOf('=')) + ":" + generatedWords.get(generatedWords.size() - 5).getWord().trim(); } else { prev5AttrWord = generatedWords.get(generatedWords.size() - 5).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 5).getWord().trim(); } } String prevAttrWordBigram = prev2AttrWord + "|" + prevAttrWord; String prevAttrWordTrigram = prev3AttrWord + "|" + prev2AttrWord + "|" + prevAttrWord; String prevAttrWord4gram = prev4AttrWord + "|" + prev3AttrWord + "|" + prev2AttrWord + "|" + prevAttrWord; String prevAttrWord5gram = prev5AttrWord + "|" + prev4AttrWord + "|" + prev3AttrWord + "|" + prev2AttrWord + "|" + prevAttrWord; generalFeatures.put("feature_attrWord_bigram_" + prevAttrWordBigram.toLowerCase(), 1.0); generalFeatures.put("feature_attrWord_trigram_" + prevAttrWordTrigram.toLowerCase(), 1.0); generalFeatures.put("feature_attrWord_4gram_" + prevAttrWord4gram.toLowerCase(), 1.0); generalFeatures.put("feature_attrWord_5gram_" + prevAttrWord5gram.toLowerCase(), 1.0); /*String bigramAttrWord54 = prev5AttrWord + "|" + prev4AttrWord; String bigramAttrWord43 = prev4AttrWord + "|" + prev3AttrWord; String bigramAttrWord32 = prev3AttrWord + "|" + prev2AttrWord; generalFeatures.put("feature_attrWord_bigramAttrWord54_" + bigramAttrWord54, 1.0); generalFeatures.put("feature_attrWord_bigramAttrWord43_" + bigramAttrWord43, 1.0); generalFeatures.put("feature_attrWord_bigramAttrWord32_" + bigramAttrWord32, 1.0); String bigramAttrWordSkip53 = prev5AttrWord + "|" + prev3AttrWord; String bigramAttrWordSkip42 = prev4AttrWord + "|" + prev2AttrWord; String bigramAttrWordSkip31 = prev3AttrWord + "|" + prevAttrWord; generalFeatures.put("feature_attrWord_bigramAttrWordSkip53_" + bigramAttrWordSkip53, 1.0); generalFeatures.put("feature_attrWord_bigramAttrWordSkip42_" + bigramAttrWordSkip42, 1.0); generalFeatures.put("feature_attrWord_bigramAttrWordSkip31_" + bigramAttrWordSkip31, 1.0); String trigramAttrWord543 = prev5AttrWord + "|" + prev4AttrWord + "|" + prev3AttrWord; String trigramAttrWord432 = prev4AttrWord + "|" + prev3AttrWord + "|" + prev2AttrWord; generalFeatures.put("feature_attrWord_trigramAttrWord543_" + trigramAttrWord543, 1.0); generalFeatures.put("feature_attrWord_trigramAttrWord432_" + trigramAttrWord432, 1.0); String trigramAttrWordSkip542 = prev5AttrWord + "|" + prev4AttrWord + "|" + prev2AttrWord; String trigramAttrWordSkip532 = prev5AttrWord + "|" + prev3AttrWord + "|" + prev2AttrWord; String trigramAttrWordSkip431 = prev4AttrWord + "|" + prev3AttrWord + "|" + prevAttrWord; String trigramAttrWordSkip421 = prev4AttrWord + "|" + prev2AttrWord + "|" + prevAttrWord; generalFeatures.put("feature_attrWord_trigramAttrWordSkip542_" + trigramAttrWordSkip542, 1.0); generalFeatures.put("feature_attrWord_trigramAttrWordSkip532_" + trigramAttrWordSkip532, 1.0); generalFeatures.put("feature_attrWord_trigramAttrWordSkip431_" + trigramAttrWordSkip431, 1.0); generalFeatures.put("feature_attrWord_trigramAttrWordSkip421_" + trigramAttrWordSkip421, 1.0);*/ //Previous AttrValue|Word features for (int j = 1; j <= 1; j++) { String previousAttrWord = "@@"; if (generatedWords.size() - j >= 0) { previousAttrWord = generatedWords.get(generatedWords.size() - j).getAttribute().trim() + "|" + generatedWords.get(generatedWords.size() - j).getWord().trim(); } generalFeatures.put("feature_attrValueWord_" + j + "_" + previousAttrWord.toLowerCase(), 1.0); } String prevAttrValueWord = "@@"; if (generatedWords.size() - 1 >= 0) { prevAttrValueWord = generatedWords.get(generatedWords.size() - 1).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 1).getWord().trim(); } String prev2AttrValueWord = "@@"; if (generatedWords.size() - 2 >= 0) { prev2AttrValueWord = generatedWords.get(generatedWords.size() - 2).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 2).getWord().trim(); } String prev3AttrValueWord = "@@"; if (generatedWords.size() - 3 >= 0) { prev3AttrValueWord = generatedWords.get(generatedWords.size() - 3).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 3).getWord().trim(); } String prev4AttrValueWord = "@@"; if (generatedWords.size() - 4 >= 0) { prev4AttrValueWord = generatedWords.get(generatedWords.size() - 4).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 4).getWord().trim(); } String prev5AttrValueWord = "@@"; if (generatedWords.size() - 5 >= 0) { prev5AttrValueWord = generatedWords.get(generatedWords.size() - 5).getAttribute().trim() + ":" + generatedWords.get(generatedWords.size() - 5).getWord().trim(); } String prevAttrValueWordBigram = prev2AttrValueWord + "|" + prevAttrValueWord; String prevAttrValueWordTrigram = prev3AttrValueWord + "|" + prev2AttrValueWord + "|" + prevAttrValueWord; String prevAttrValueWord4gram = prev4AttrValueWord + "|" + prev3AttrValueWord + "|" + prev2AttrValueWord + "|" + prevAttrValueWord; String prevAttrValueWord5gram = prev5AttrValueWord + "|" + prev4AttrValueWord + "|" + prev3AttrValueWord + "|" + prev2AttrValueWord + "|" + prevAttrValueWord; generalFeatures.put("feature_attrValueWord_bigram_" + prevAttrValueWordBigram.toLowerCase(), 1.0); generalFeatures.put("feature_attrValueWord_trigram_" + prevAttrValueWordTrigram.toLowerCase(), 1.0); generalFeatures.put("feature_attrValueWord_4gram_" + prevAttrValueWord4gram.toLowerCase(), 1.0); generalFeatures.put("feature_attrValueWord_5gram_" + prevAttrValueWord5gram.toLowerCase(), 1.0); /*String bigramAttrValueWord54 = prev5AttrValueWord + "|" + prev4AttrValueWord; String bigramAttrValueWord43 = prev4AttrValueWord + "|" + prev3AttrValueWord; String bigramAttrValueWord32 = prev3AttrValueWord + "|" + prev2AttrValueWord; generalFeatures.put("feature_attrValueWord_bigramAttrValueWord54_" + bigramAttrValueWord54, 1.0); generalFeatures.put("feature_attrValueWord_bigramAttrValueWord43_" + bigramAttrValueWord43, 1.0); generalFeatures.put("feature_attrValueWord_bigramAttrValueWord32_" + bigramAttrValueWord32, 1.0); String bigramAttrValueWordSkip53 = prev5AttrValueWord + "|" + prev3AttrValueWord; String bigramAttrValueWordSkip42 = prev4AttrValueWord + "|" + prev2AttrValueWord; String bigramAttrValueWordSkip31 = prev3AttrValueWord + "|" + prevAttrValueWord; generalFeatures.put("feature_attrValueWord_bigramAttrValueWordSkip53_" + bigramAttrValueWordSkip53, 1.0); generalFeatures.put("feature_attrValueWord_bigramAttrValueWordSkip42_" + bigramAttrValueWordSkip42, 1.0); generalFeatures.put("feature_attrValueWord_bigramAttrValueWordSkip31_" + bigramAttrValueWordSkip31, 1.0); String trigramAttrValueWord543 = prev5AttrValueWord + "|" + prev4AttrValueWord + "|" + prev3AttrValueWord; String trigramAttrValueWord432 = prev4AttrValueWord + "|" + prev3AttrValueWord + "|" + prev2AttrValueWord; generalFeatures.put("feature_attrValueWord_trigramAttrValueWord543_" + trigramAttrValueWord543, 1.0); generalFeatures.put("feature_attrValueWord_trigramAttrValueWord432_" + trigramAttrValueWord432, 1.0); String trigramAttrValueWordSkip542 = prev5AttrValueWord + "|" + prev4AttrValueWord + "|" + prev2AttrValueWord; String trigramAttrValueWordSkip532 = prev5AttrValueWord + "|" + prev3AttrValueWord + "|" + prev2AttrValueWord; String trigramAttrValueWordSkip431 = prev4AttrValueWord + "|" + prev3AttrValueWord + "|" + prevAttrValueWord; String trigramAttrValueWordSkip421 = prev4AttrValueWord + "|" + prev2AttrValueWord + "|" + prevAttrValueWord; generalFeatures.put("feature_attrValueWord_trigramAttrValueWordSkip542_" + trigramAttrValueWordSkip542, 1.0); generalFeatures.put("feature_attrValueWord_trigramAttrValueWordSkip532_" + trigramAttrValueWordSkip532, 1.0); generalFeatures.put("feature_attrValueWord_trigramAttrValueWordSkip431_" + trigramAttrValueWordSkip431, 1.0); generalFeatures.put("feature_attrValueWord_trigramAttrValueWordSkip421_" + trigramAttrValueWordSkip421, 1.0);*/ //Previous attrValue features int attributeSize = generatedAttributes.size(); for (int j = 1; j <= 1; j++) { String previousAttrValue = "@@"; if (attributeSize - j >= 0) { previousAttrValue = generatedAttributes.get(attributeSize - j).trim(); } generalFeatures.put("feature_attrValue_" + j + "_" + previousAttrValue, 1.0); } String prevAttrValue = "@@"; if (attributeSize - 1 >= 0) { prevAttrValue = generatedAttributes.get(attributeSize - 1).trim(); } String prev2AttrValue = "@@"; if (attributeSize - 2 >= 0) { prev2AttrValue = generatedAttributes.get(attributeSize - 2).trim(); } String prev3AttrValue = "@@"; if (attributeSize - 3 >= 0) { prev3AttrValue = generatedAttributes.get(attributeSize - 3).trim(); } String prev4AttrValue = "@@"; if (attributeSize - 4 >= 0) { prev4AttrValue = generatedAttributes.get(attributeSize - 4).trim(); } String prev5AttrValue = "@@"; if (attributeSize - 5 >= 0) { prev5AttrValue = generatedAttributes.get(attributeSize - 5).trim(); } String prevAttrBigramValue = prev2AttrValue + "|" + prevAttrValue; String prevAttrTrigramValue = prev3AttrValue + "|" + prev2AttrValue + "|" + prevAttrValue; String prevAttr4gramValue = prev4AttrValue + "|" + prev3AttrValue + "|" + prev2AttrValue + "|" + prevAttrValue; String prevAttr5gramValue = prev5AttrValue + "|" + prev4AttrValue + "|" + prev3AttrValue + "|" + prev2AttrValue + "|" + prevAttrValue; generalFeatures.put("feature_attrValue_bigram_" + prevAttrBigramValue.toLowerCase(), 1.0); generalFeatures.put("feature_attrValue_trigram_" + prevAttrTrigramValue.toLowerCase(), 1.0); generalFeatures.put("feature_attrValue_4gram_" + prevAttr4gramValue.toLowerCase(), 1.0); generalFeatures.put("feature_attrValue_5gram_" + prevAttr5gramValue.toLowerCase(), 1.0); /*String bigramAttrValue54 = prev5AttrValue + "|" + prev4AttrValue; String bigramAttrValue43 = prev4AttrValue + "|" + prev3AttrValue; String bigramAttrValue32 = prev3AttrValue + "|" + prev2AttrValue; generalFeatures.put("feature_attrValue_bigramAttrValue54_" + bigramAttrValue54, 1.0); generalFeatures.put("feature_attrValue_bigramAttrValue43_" + bigramAttrValue43, 1.0); generalFeatures.put("feature_attrValue_bigramAttrValue32_" + bigramAttrValue32, 1.0); String bigramAttrValueSkip53 = prev5AttrValue + "|" + prev3AttrValue; String bigramAttrValueSkip42 = prev4AttrValue + "|" + prev2AttrValue; String bigramAttrValueSkip31 = prev3AttrValue + "|" + prevAttrValue; generalFeatures.put("feature_attrValue_bigramAttrValueSkip53_" + bigramAttrValueSkip53, 1.0); generalFeatures.put("feature_attrValue_bigramAttrValueSkip42_" + bigramAttrValueSkip42, 1.0); generalFeatures.put("feature_attrValue_bigramAttrValueSkip31_" + bigramAttrValueSkip31, 1.0); String trigramAttrValue543 = prev5AttrValue + "|" + prev4AttrValue + "|" + prev3AttrValue; String trigramAttrValue432 = prev4AttrValue + "|" + prev3AttrValue + "|" + prev2AttrValue; generalFeatures.put("feature_attrValue_trigramAttrValue543_" + trigramAttrValue543, 1.0); generalFeatures.put("feature_attrValue_trigramAttrValue432_" + trigramAttrValue432, 1.0); String trigramAttrValueSkip542 = prev5AttrValue + "|" + prev4AttrValue + "|" + prev2AttrValue; String trigramAttrValueSkip532 = prev5AttrValue + "|" + prev3AttrValue + "|" + prev2AttrValue; String trigramAttrValueSkip431 = prev4AttrValue + "|" + prev3AttrValue + "|" + prevAttrValue; String trigramAttrValueSkip421 = prev4AttrValue + "|" + prev2AttrValue + "|" + prevAttrValue; generalFeatures.put("feature_attrValue_trigramAttrValueSkip542_" + trigramAttrValueSkip542, 1.0); generalFeatures.put("feature_attrValue_trigramAttrValueSkip532_" + trigramAttrValueSkip532, 1.0); generalFeatures.put("feature_attrValue_trigramAttrValueSkip431_" + trigramAttrValueSkip431, 1.0); generalFeatures.put("feature_attrValue_trigramAttrValueSkip421_" + trigramAttrValueSkip421, 1.0);*/ //Previous attr features for (int j = 1; j <= 1; j++) { String previousAttr = "@@"; if (attributeSize - j >= 0) { if (generatedAttributes.get(attributeSize - j).contains("=")) { previousAttr = generatedAttributes.get(attributeSize - j).trim().substring(0, generatedAttributes.get(attributeSize - j).indexOf('=')); } else { previousAttr = generatedAttributes.get(attributeSize - j).trim(); } } generalFeatures.put("feature_attr_" + j + "_" + previousAttr, 1.0); } String prevAttr = "@@"; if (attributeSize - 1 >= 0) { if (generatedAttributes.get(attributeSize - 1).contains("=")) { prevAttr = generatedAttributes.get(attributeSize - 1).trim().substring(0, generatedAttributes.get(attributeSize - 1).indexOf('=')); } else { prevAttr = generatedAttributes.get(attributeSize - 1).trim(); } } String prev2Attr = "@@"; if (attributeSize - 2 >= 0) { if (generatedAttributes.get(attributeSize - 2).contains("=")) { prev2Attr = generatedAttributes.get(attributeSize - 2).trim().substring(0, generatedAttributes.get(attributeSize - 2).indexOf('=')); } else { prev2Attr = generatedAttributes.get(attributeSize - 2).trim(); } } String prev3Attr = "@@"; if (attributeSize - 3 >= 0) { if (generatedAttributes.get(attributeSize - 3).contains("=")) { prev3Attr = generatedAttributes.get(attributeSize - 3).trim().substring(0, generatedAttributes.get(attributeSize - 3).indexOf('=')); } else { prev3Attr = generatedAttributes.get(attributeSize - 3).trim(); } } String prev4Attr = "@@"; if (attributeSize - 4 >= 0) { if (generatedAttributes.get(attributeSize - 4).contains("=")) { prev4Attr = generatedAttributes.get(attributeSize - 4).trim().substring(0, generatedAttributes.get(attributeSize - 4).indexOf('=')); } else { prev4Attr = generatedAttributes.get(attributeSize - 4).trim(); } } String prev5Attr = "@@"; if (attributeSize - 5 >= 0) { if (generatedAttributes.get(attributeSize - 5).contains("=")) { prev5Attr = generatedAttributes.get(attributeSize - 5).trim().substring(0, generatedAttributes.get(attributeSize - 5).indexOf('=')); } else { prev5Attr = generatedAttributes.get(attributeSize - 5).trim(); } } String prevAttrBigram = prev2Attr + "|" + prevAttr; String prevAttrTrigram = prev3Attr + "|" + prev2Attr + "|" + prevAttr; String prevAttr4gram = prev4Attr + "|" + prev3Attr + "|" + prev2Attr + "|" + prevAttr; String prevAttr5gram = prev5Attr + "|" + prev4Attr + "|" + prev3Attr + "|" + prev2Attr + "|" + prevAttr; generalFeatures.put("feature_attr_bigram_" + prevAttrBigram.toLowerCase(), 1.0); generalFeatures.put("feature_attr_trigram_" + prevAttrTrigram.toLowerCase(), 1.0); generalFeatures.put("feature_attr_4gram_" + prevAttr4gram.toLowerCase(), 1.0); generalFeatures.put("feature_attr_5gram_" + prevAttr5gram.toLowerCase(), 1.0); /*String bigramAttr54 = prev5Attr + "|" + prev4Attr; String bigramAttr43 = prev4Attr + "|" + prev3Attr; String bigramAttr32 = prev3Attr + "|" + prev2Attr; generalFeatures.put("feature_attr_bigramAttr54_" + bigramAttr54, 1.0); generalFeatures.put("feature_attr_bigramAttr43_" + bigramAttr43, 1.0); generalFeatures.put("feature_attr_bigramAttr32_" + bigramAttr32, 1.0); String bigramAttrSkip53 = prev5Attr + "|" + prev3Attr; String bigramAttrSkip42 = prev4Attr + "|" + prev2Attr; String bigramAttrSkip31 = prev3Attr + "|" + prevAttr; generalFeatures.put("feature_attr_bigramAttrSkip53_" + bigramAttrSkip53, 1.0); generalFeatures.put("feature_attr_bigramAttrSkip42_" + bigramAttrSkip42, 1.0); generalFeatures.put("feature_attr_bigramAttrSkip31_" + bigramAttrSkip31, 1.0); String trigramAttr543 = prev5Attr + "|" + prev4Attr + "|" + prev3Attr; String trigramAttr432 = prev4Attr + "|" + prev3Attr + "|" + prev2Attr; generalFeatures.put("feature_attr_trigramAttr543_" + trigramAttr543, 1.0); generalFeatures.put("feature_attr_trigramAttr432_" + trigramAttr432, 1.0); String trigramAttrSkip542 = prev5Attr + "|" + prev4Attr + "|" + prev2Attr; String trigramAttrSkip532 = prev5Attr + "|" + prev3Attr + "|" + prev2Attr; String trigramAttrSkip431 = prev4Attr + "|" + prev3Attr + "|" + prevAttr; String trigramAttrSkip421 = prev4Attr + "|" + prev2Attr + "|" + prevAttr; generalFeatures.put("feature_attr_trigramAttrSkip542_" + trigramAttrSkip542, 1.0); generalFeatures.put("feature_attr_trigramAttrSkip532_" + trigramAttrSkip532, 1.0); generalFeatures.put("feature_attr_trigramAttrSkip431_" + trigramAttrSkip431, 1.0); generalFeatures.put("feature_attr_trigramAttrSkip421_" + trigramAttrSkip421, 1.0);*/ //Next attr features for (int j = 0; j < 1; j++) { String nextAttr = "@@"; if (j < nextGeneratedAttributes.size()) { if (nextGeneratedAttributes.get(j).contains("=")) { nextAttr = nextGeneratedAttributes.get(j).trim().substring(0, nextGeneratedAttributes.get(j).indexOf('=')); } else { nextAttr = nextGeneratedAttributes.get(j).trim(); } } generalFeatures.put("feature_nextAttr_" + j + "_" + nextAttr, 1.0); } String nextAttr = "@@"; if (0 < nextGeneratedAttributes.size()) { if (nextGeneratedAttributes.get(0).contains("=")) { nextAttr = nextGeneratedAttributes.get(0).trim().substring(0, nextGeneratedAttributes.get(0).indexOf('=')); } else { nextAttr = nextGeneratedAttributes.get(0).trim(); } } String next2Attr = "@@"; if (1 < nextGeneratedAttributes.size()) { if (nextGeneratedAttributes.get(1).contains("=")) { next2Attr = nextGeneratedAttributes.get(1).trim().substring(0, nextGeneratedAttributes.get(1).indexOf('=')); } else { next2Attr = nextGeneratedAttributes.get(1).trim(); } } String next3Attr = "@@"; if (2 < nextGeneratedAttributes.size()) { if (nextGeneratedAttributes.get(2).contains("=")) { next3Attr = nextGeneratedAttributes.get(2).trim().substring(0, nextGeneratedAttributes.get(2).indexOf('=')); } else { next3Attr = nextGeneratedAttributes.get(2).trim(); } } String next4Attr = "@@"; if (3 < nextGeneratedAttributes.size()) { if (nextGeneratedAttributes.get(3).contains("=")) { next4Attr = nextGeneratedAttributes.get(3).trim().substring(0, nextGeneratedAttributes.get(3).indexOf('=')); } else { next4Attr = nextGeneratedAttributes.get(3).trim(); } } String next5Attr = "@@"; if (4 < nextGeneratedAttributes.size()) { if (nextGeneratedAttributes.get(4).contains("=")) { next5Attr = nextGeneratedAttributes.get(4).trim().substring(0, nextGeneratedAttributes.get(4).indexOf('=')); } else { next5Attr = nextGeneratedAttributes.get(4).trim(); } } String nextAttrBigram = nextAttr + "|" + next2Attr; String nextAttrTrigram = nextAttr + "|" + next2Attr + "|" + next3Attr; String nextAttr4gram = nextAttr + "|" + next2Attr + "|" + next3Attr + "|" + next4Attr; String nextAttr5gram = nextAttr + "|" + next2Attr + "|" + next3Attr + "|" + next4Attr + "|" + next5Attr; generalFeatures.put("feature_nextAttr_bigram_" + nextAttrBigram.toLowerCase(), 1.0); generalFeatures.put("feature_nextAttr_trigram_" + nextAttrTrigram.toLowerCase(), 1.0); generalFeatures.put("feature_nextAttr_4gram_" + nextAttr4gram.toLowerCase(), 1.0); generalFeatures.put("feature_nextAttr_5gram_" + nextAttr5gram.toLowerCase(), 1.0); //Next attrValue features for (int j = 0; j < 1; j++) { String nextAttrValue = "@@"; if (j < nextGeneratedAttributes.size()) { nextAttrValue = nextGeneratedAttributes.get(j).trim(); } generalFeatures.put("feature_nextAttrValue_" + j + "_" + nextAttrValue, 1.0); } String nextAttrValue = "@@"; if (0 < nextGeneratedAttributes.size()) { nextAttrValue = nextGeneratedAttributes.get(0).trim(); } String next2AttrValue = "@@"; if (1 < nextGeneratedAttributes.size()) { next2AttrValue = nextGeneratedAttributes.get(1).trim(); } String next3AttrValue = "@@"; if (2 < nextGeneratedAttributes.size()) { next3AttrValue = nextGeneratedAttributes.get(2).trim(); } String next4AttrValue = "@@"; if (3 < nextGeneratedAttributes.size()) { next4AttrValue = nextGeneratedAttributes.get(3).trim(); } String next5AttrValue = "@@"; if (4 < nextGeneratedAttributes.size()) { next5AttrValue = nextGeneratedAttributes.get(4).trim(); } String nextAttrValueBigram = nextAttrValue + "|" + next2AttrValue; String nextAttrValueTrigram = nextAttrValue + "|" + next2AttrValue + "|" + next3AttrValue; String nextAttrValue4gram = nextAttrValue + "|" + next2AttrValue + "|" + next3AttrValue + "|" + next4AttrValue; String nextAttrValue5gram = nextAttrValue + "|" + next2AttrValue + "|" + next3AttrValue + "|" + next4AttrValue + "|" + next5AttrValue; generalFeatures.put("feature_nextAttrValue_bigram_" + nextAttrValueBigram.toLowerCase(), 1.0); generalFeatures.put("feature_nextAttrValue_trigram_" + nextAttrValueTrigram.toLowerCase(), 1.0); generalFeatures.put("feature_nextAttrValue_4gram_" + nextAttrValue4gram.toLowerCase(), 1.0); generalFeatures.put("feature_nextAttrValue_5gram_" + nextAttrValue5gram.toLowerCase(), 1.0); //If values have already been generated or not generalFeatures.put("feature_valueToBeMentioned_" + currentValue.toLowerCase(), 1.0); if (wasValueMentioned) { generalFeatures.put("feature_wasValueMentioned_true", 1.0); } else { //generalFeatures.put("feature_wasValueMentioned_false", 1.0); } HashSet<String> valuesThatFollow = new HashSet<>(); attrValuesThatFollow.stream().map((attrValue) -> { generalFeatures.put("feature_attrValuesThatFollow_" + attrValue.toLowerCase(), 1.0); return attrValue; }).forEachOrdered((attrValue) -> { if (attrValue.contains("=")) { String v = attrValue.substring(attrValue.indexOf('=') + 1); if (v.matches("[xX][0-9]+")) { String attr = attrValue.substring(0, attrValue.indexOf('=')); valuesThatFollow.add(Action.TOKEN_X + attr + "_" + v.substring(1)); } else { valuesThatFollow.add(v); } generalFeatures.put( "feature_attrsThatFollow_" + attrValue.substring(0, attrValue.indexOf('=')).toLowerCase(), 1.0); } else { generalFeatures.put("feature_attrsThatFollow_" + attrValue.toLowerCase(), 1.0); } }); if (valuesThatFollow.isEmpty()) { generalFeatures.put("feature_noAttrsFollow", 1.0); } else { generalFeatures.put("feature_noAttrsFollow", 0.0); } HashSet<String> mentionedValues = new HashSet<>(); attrValuesAlreadyMentioned.stream().map((attrValue) -> { generalFeatures.put("feature_attrValuesAlreadyMentioned_" + attrValue.toLowerCase(), 1.0); return attrValue; }).forEachOrdered((attrValue) -> { if (attrValue.contains("=")) { generalFeatures.put("feature_attrsAlreadyMentioned_" + attrValue.substring(0, attrValue.indexOf('=')).toLowerCase(), 1.0); String v = attrValue.substring(attrValue.indexOf('=') + 1); if (v.matches("[xX][0-9]+")) { String attr = attrValue.substring(0, attrValue.indexOf('=')); mentionedValues.add(Action.TOKEN_X + attr + "_" + v.substring(1)); } else { mentionedValues.add(v); } } else { generalFeatures.put("feature_attrsAlreadyMentioned_" + attrValue.toLowerCase(), 1.0); } }); /*System.out.println("currentAttrValue: " + currentAttrValue); System.out.println("5W: " + prev5gram); System.out.println("5AW: " + prevAttrWord5gram); System.out.println("5A: " + prevAttr5gram); System.out.println("VM: " + wasValueMentioned); System.out.println("A_TF: " + attrValuesThatFollow); System.out.println("==============================");*/ if (currentValue.equals("no") || currentValue.equals("yes") || currentValue.equals("yes or no") || currentValue.equals("none") || currentValue.equals("empty") //|| currentValue.equals("dont_care") ) { generalFeatures.put("feature_emptyValue", 1.0); } //Word specific features (and also global features) for (Action action : availableWordActions.get(currentAttr)) { //Is word same as previous word if (prevWord.equals(action.getWord())) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_sameAsPreviousWord", 1.0); valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_sameAsPreviousWord", 1.0); } else { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_notSameAsPreviousWord", 1.0); valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_notSameAsPreviousWord", 1.0); } //Has word appeared in the same attrValue before generatedWords.forEach((previousAction) -> { if (previousAction.getWord().equals(action.getWord()) && previousAction.getAttribute().equals(currentAttrValue)) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_appearedInSameAttrValue", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_appearedInSameAttrValue", 1.0); } else { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_notAppearedInSameAttrValue", 1.0); //valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_notAppearedInSameAttrValue", 1.0); } }); //Has word appeared before generatedWords.forEach((previousAction) -> { if (previousAction.getWord().equals(action.getWord())) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_appeared", 1.0); valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_appeared", 1.0); } else { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_notAppeared", 1.0); //valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_notAppeared", 1.0); } }); if (currentValue.equals("no") || currentValue.equals("yes") || currentValue.equals("yes or no") || currentValue.equals("none") || currentValue.equals("empty") //|| currentValue.equals("dont_care") ) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_emptyValue", 1.0); valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_emptyValue", 1.0); } else { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_notEmptyValue", 1.0); //valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_notEmptyValue", 1.0); } HashSet<String> keys = new HashSet<>(valueSpecificFeatures.get(action.getAction()).keySet()); keys.forEach((feature1) -> { keys.stream() .filter((feature2) -> (valueSpecificFeatures.get(action.getAction()).get(feature1) == 1.0 && valueSpecificFeatures.get(action.getAction()).get(feature2) == 1.0 && feature1.compareTo(feature2) < 0)) .forEachOrdered((feature2) -> { valueSpecificFeatures.get(action.getAction()).put(feature1 + "&&" + feature2, 1.0); }); }); if (!action.getWord().startsWith(Action.TOKEN_X) && !currentValue.equals("no") && !currentValue.equals("yes") && !currentValue.equals("yes or no") && !currentValue.equals("none") && !currentValue.equals("empty") //&& !currentValue.equals("dont_care") ) { for (String value : getValueAlignments().keySet()) { for (ArrayList<String> alignedStr : getValueAlignments().get(value).keySet()) { if (alignedStr.get(0).equals(action.getWord())) { if (mentionedValues.contains(value)) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_beginsValue_alreadyMentioned", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_beginsValue_alreadyMentioned", 1.0); } else if (currentValue.equals(value)) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_beginsValue_current", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_beginsValue_current", 1.0); } else if (valuesThatFollow.contains(value)) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_beginsValue_thatFollows", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_beginsValue_thatFollows", 1.0); } else { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_beginsValue_notInMR", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_beginsValue_notInMR", 1.0); } } else { for (int i = 1; i < alignedStr.size(); i++) { if (alignedStr.get(i).equals(action.getWord())) { if (endsWith(generatedPhrase, new ArrayList<String>(alignedStr.subList(0, i + 1)))) { if (mentionedValues.contains(value)) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_inValue_alreadyMentioned", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_inValue_alreadyMentioned", 1.0); } else if (currentValue.equals(value)) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_inValue_current", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_inValue_current", 1.0); } else if (valuesThatFollow.contains(value)) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_inValue_thatFollows", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_inValue_thatFollows", 1.0); } else { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_inValue_notInMR", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_inValue_notInMR", 1.0); } } else { /*if (mentionedValues.contains(value)) { valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue_alreadyMentioned", 1.0); } else if (currentValue.equals(value)) { valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue_current", 1.0); } else if (valuesThatFollow.contains(value)) { valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue_thatFollows", 1.0); } else { valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue_notInMR", 1.0); }*/ //valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_outOfValue", 1.0); } } } } } } if (action.getWord().equals(Action.TOKEN_END)) { if (generatedWordsInSameAttrValue.isEmpty()) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_closingEmptyAttr", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_closingEmptyAttr", 1.0); } if (!wasValueMentioned) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_closingAttrWithValueNotMentioned", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_closingAttrWithValueNotMentioned", 1.0); } // if (!prevCurrentAttrValueWord.equals("@@")) { if (!prevWord.equals("@@")) { boolean alignmentIsOpen = false; for (String value : getValueAlignments().keySet()) { for (ArrayList<String> alignedStr : getValueAlignments().get(value).keySet()) { for (int i = 0; i < alignedStr.size() - 1; i++) { if (alignedStr.get(i).equals(prevWord) && endsWith(generatedPhrase, new ArrayList<>(alignedStr.subList(0, i + 1)))) { alignmentIsOpen = true; } } } } if (alignmentIsOpen) { // valueSpecificFeatures.get(action.getAction()).put("feature_specific_closingAttrWhileValueIsNotConcluded", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_closingAttrWhileValueIsNotConcluded", 1.0); } } } } else if (currentValue.equals("no") || currentValue.equals("yes") || currentValue.equals("yes or no") || currentValue.equals("none") || currentValue.equals("empty") //|| currentValue.equals("dont_care") ) { valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_XValue_notInMR", 1.0); } else { String currentValueVariant = ""; if (currentValue.matches("[xX][0-9]+")) { currentValueVariant = Action.TOKEN_X + currentAttr + "_" + currentValue.substring(1); } if (mentionedValues.contains(action.getWord())) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_XValue_alreadyMentioned", 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_specific_XValue_alreadyMentioned", 1.0); } else if (currentValueVariant.equals(action.getWord()) && !currentValueVariant.isEmpty()) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_XValue_current", 1.0); valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_XValue_current", 1.0); } else if (valuesThatFollow.contains(action.getWord())) { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_XValue_thatFollows", 1.0); valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_XValue_thatFollows", 1.0); } else { //valueSpecificFeatures.get(action.getAction()).put("feature_specific_XValue_notInMR", 1.0); valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_XValue_notInMR", 1.0); } } /*for (int i : nGrams.keySet()) { for (String nGram : nGrams.get(i)) { if (i == 2) { if (nGram.startsWith(prevWord + "|") && nGram.endsWith("|" + action.getAction())) { valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPreviousWord", 1.0); } } else if (i == 3) { if (nGram.startsWith(prevBigram + "|") && nGram.endsWith("|" + action.getAction())) { valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPreviousBigram", 1.0); } } else if (i == 4) { if (nGram.startsWith(prevTrigram + "|") && nGram.endsWith("|" + action.getAction())) { valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPreviousTrigram", 1.0); } } else if (i == 5) { if (nGram.startsWith(prev4gram + "|") && nGram.endsWith("|" + action.getAction())) { valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPrevious4gram", 1.0); } } else if (i == 6) { if (nGram.startsWith(prev5gram + "|") && nGram.endsWith("|" + action.getAction())) { valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPrevious5gram", 1.0); } } } }*/ //valueSpecificFeatures.get(action.getAction()).put("global_feature_abstractMR_" + mr.getAbstractMR(), 1.0); valueSpecificFeatures.get(action.getAction()) .put("global_feature_currentValue_" + currentValue.toLowerCase(), 1.0); ArrayList<String> fullGramLM = new ArrayList<>(); for (int i = 0; i < generatedWords.size(); i++) { fullGramLM.add(generatedWords.get(i).getWord()); } ArrayList<String> prev5wordGramLM = new ArrayList<>(); int j = 0; for (int i = generatedWords.size() - 1; (i >= 0 && j < 5); i--) { prev5wordGramLM.add(0, generatedWords.get(i).getWord()); j++; } prev5wordGramLM.add(action.getWord()); while (prev5wordGramLM.size() < 4) { prev5wordGramLM.add(0, "@@"); } double afterLMScorePerPred5Gram = getWordLMsPerPredicate().get(predicate) .getProbability(prev5wordGramLM); valueSpecificFeatures.get(action.getAction()).put("global_feature_LMWord_perPredicate_5gram_score", afterLMScorePerPred5Gram); double afterLMScorePerPred = getWordLMsPerPredicate().get(predicate).getProbability(fullGramLM); valueSpecificFeatures.get(action.getAction()).put("global_feature_LMWord_perPredicate_score", afterLMScorePerPred); } /*HashSet<String> keys = new HashSet<>(generalFeatures.keySet()); for (String feature1 : keys) { if (generalFeatures.get(feature1) == 1.0) { generalFeatures.put("global_feature_attr_" + currentValue.toLowerCase() + "&&" + feature1, 1.0); } }*/ //generalFeatures.put("feature_abstractMR_" + mr.getAbstractMR(), 1.0); /*HashSet<String> keys = new HashSet<>(generalFeatures.keySet()); for (String feature1 : keys) { for (String feature2 : keys) { if (generalFeatures.get(feature1) == 1.0 && generalFeatures.get(feature2) == 1.0 && feature1.compareTo(feature2) < 0) { generalFeatures.put(feature1 + "&&" + feature2, 1.0); } } }*/ return new Instance(generalFeatures, valueSpecificFeatures, costs); }