Example usage for java.util HashSet forEach

List of usage examples for java.util HashSet forEach


On this page you can find example usages of java.util.HashSet.forEach.


default void forEach(Consumer<? super T> action) 

Source Link


Performs the given action for each element of the Iterable until all elements have been processed or the action throws an exception.


From source file:sh.isaac.convert.rxnorm.standard.RxNormMojo.java

/**
 * Process CUI rows.
 *
 * @param conceptData the concept data
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws SQLException the SQL exception
 * @throws PropertyVetoException the property veto exception
 */
private void processCUIRows(ArrayList<RXNCONSO> conceptData)
        throws IOException, SQLException, PropertyVetoException {
    final String rxCui = conceptData.get(0).rxcui;
    final HashSet<String> uniqueTTYs = new HashSet<>();
    final HashSet<String> uniqueSABs = new HashSet<>();

    // ensure all the same CUI, gather the TTYs involved
    conceptData.stream().map((row) -> {
        return row;
    }).map((row) -> {
        return row;
    }).filter((row) -> (!row.rxcui.equals(rxCui))).forEachOrdered((_item) -> {
        throw new RuntimeException("Oops");

    ComponentReference cuiConcept;

    if ((uniqueSABs.size() == 1) && uniqueSABs.iterator().next().equals(this.sctSab)) {
        // This is a SCT only concept - we don't want to create it.  But we might need to put some relationships or associations here.
        final String sctId = conceptData.get(0).code;

        if (sctId == null) {
            throw new RuntimeException("Unexpected");

        cuiConcept = ComponentReference.fromConcept(this.sctIdToUUID.get(sctId));

        // Add the RxCUI UUID
        this.importUtil.addUUID(cuiConcept.getPrimordialUuid(), createCUIConceptUUID(rxCui));

        // TODO need to look at what else I should be grabbing - the RXCUI for example should be attached.  What else?
    } else {
        // just creating the reference here, with the UUID - because we don't know if it should be active or inactive yet.
        // create the real concept later.
        cuiConcept = ComponentReference.fromConcept(createCUIConceptUUID(rxCui));

        long conceptTime = Integer.MAX_VALUE;

        // Activate the concept if any description is active
        Status conceptState = Status.INACTIVE;

        this.importUtil.addStringAnnotation(cuiConcept, rxCui,
                this.ptUMLSAttributes.getProperty("RXCUI").getUUID(), Status.ACTIVE);

        final ArrayList<ValuePropertyPairWithSAB> cuiDescriptions = new ArrayList<>();
        final HashSet<String> sabs = new HashSet<>();

        for (final RXNCONSO atom : conceptData) {
            if (atom.sab.equals(this.sctSab)) {

            // Add attributes from SAT table
            this.descSat.setString(1, rxCui);
            this.descSat.setString(2, atom.rxaui);

            final ArrayList<RXNSAT> satData;
            boolean disableDescription;
            Long descriptionTime;

            try (ResultSet rs = this.descSat.executeQuery()) {
                satData = new ArrayList<>();
                disableDescription = false;
                descriptionTime = null;

                while (rs.next()) {
                    final RXNSAT current = new RXNSAT(rs);


                    if ("RXN_OBSOLETED".equals(current.atn)) {
                        disableDescription = true;

                    if ("RXN_ACTIVATED".equals(current.atn)) {
                        try {
                            final long time = this.dateParse.parse(current.atv).getTime();

                            descriptionTime = time;

                            if (time < conceptTime) {
                                conceptTime = time;
                        } catch (final ParseException e) {
                            throw new RuntimeException("Can't parse date?");

            final ValuePropertyPairWithSAB desc = new ValuePropertyPairWithSAB(atom.str,
                    this.ptDescriptions.getProperty(atom.tty), atom.sab, satData);

            if (disableDescription) {
            } else {
                // if any description is active, concept is still active
                conceptState = Status.ACTIVE;

            if (descriptionTime != null) {


            // used for sorting description to figure out what to use for FULLY_QUALIFIED_NAME
            desc.addStringAttribute(this.ptUMLSAttributes.getProperty("RXAUI").getUUID(), atom.rxaui);

            if (StringUtils.isNotBlank(atom.code) && !atom.code.equals("NOCODE")) {
                desc.addStringAttribute(this.ptUMLSAttributes.getProperty("CODE").getUUID(), atom.code);

            if (StringUtils.isNotBlank(atom.saui)) {
                desc.addStringAttribute(this.ptUMLSAttributes.getProperty("SAUI").getUUID(), atom.saui);

            if (StringUtils.isNotBlank(atom.scui)) {
                desc.addStringAttribute(this.ptUMLSAttributes.getProperty("SCUI").getUUID(), atom.scui);

            if (StringUtils.isNotBlank(atom.suppress)) {

            if (StringUtils.isNotBlank(atom.cvf)) {
                if (atom.cvf.equals("4096")) {
                } else {
                    throw new RuntimeException("Unexpected value in RXNCONSO cvf column '" + atom.cvf + "'");

            if (!atom.lat.equals("ENG")) {
                ConsoleUtil.printErrorln("Non-english lang settings not handled yet!");

            // TODO - at this point, sometime in the future, we may make attributes out of the relationships that occur between the AUIs
            // and store them on the descriptions, since OTF doesn't allow relationships between descriptions
            // TODO am I supposed to be using sabs?

        // sanity check on descriptions - make sure we only have one that is of type synonym with the preferred flag
        final ArrayList<String> items = new ArrayList<>();

                .filter((vpp) -> ((vpp.getProperty().getPropertySubType() >= BPT_Descriptions.SYNONYM)
                        && (vpp.getProperty().getPropertySubType() <= (BPT_Descriptions.SYNONYM + 20))))
                .forEachOrdered((vpp) -> {
                    items.add(vpp.getProperty().getSourcePropertyNameFQN() + " "
                            + vpp.getProperty().getPropertySubType());
                }); // Numbers come from the rankings down below in makeDescriptionType(...)

        final HashSet<String> ranksLookedAt = new HashSet<>();


        boolean oneNotInList = false;

        if (items.size() > 1) {
            for (final String s : items) {
                if (!ranksLookedAt.contains(s.substring(s.length() - 3, s.length()))) {
                    oneNotInList = true;

        if (oneNotInList) {
                    "Need to rank multiple synonym types that are each marked preferred, determine if ranking is appropriate!");
            items.forEach((s) -> {

        final List<SemanticChronology> addedDescriptions = this.importUtil.addDescriptions(cuiConcept,

        if (addedDescriptions.size() != cuiDescriptions.size()) {
            throw new RuntimeException("oops");

        final HashSet<String> uniqueUMLSCUI = new HashSet<>();

        for (int i = 0; i < cuiDescriptions.size(); i++) {
            final SemanticChronology desc = addedDescriptions.get(i);
            final ValuePropertyPairWithSAB descPP = cuiDescriptions.get(i);
            final BiFunction<String, String, Boolean> functions = (atn, atv) -> {
                // Pull these up to the concept.
                if ("UMLSCUI".equals(atn)) {
                    return true;

                return false;

            // TODO should I be passing in item code here?
            processSAT(ComponentReference.fromChronology(desc), descPP.getSatData(), null, descPP.getSab(),

        // pulling up the UMLS CUIs.
        // uniqueUMLSCUI is populated during processSAT
        uniqueUMLSCUI.forEach((umlsCui) -> {
            final UUID itemUUID = ConverterUUID.createNamespaceUUIDFromString("UMLSCUI" + umlsCui);

            this.importUtil.addStringAnnotation(cuiConcept, itemUUID, umlsCui,
                    this.ptTermAttributes.getProperty("UMLSCUI").getUUID(), Status.ACTIVE);
        ValuePropertyPairWithAttributes.processAttributes(this.importUtil, cuiDescriptions, addedDescriptions);

        // there are no attributes in rxnorm without an AUI.
        //       try
        //       {
        this.importUtil.addRefsetMembership(cuiConcept, this.allCUIRefsetConcept.getPrimordialUuid(),
                Status.ACTIVE, null);

        //       }
        //       catch (RuntimeException e)
        //       {
        //               if (e.toString().contains("duplicate UUID"))
        //               {
        //                       //ok - this can happen due to multiple merges onto an existing SCT concept
        //               }
        //               else
        //               {
        //                       throw e;
        //               }
        //       }
        // add semantic types
        this.semanticTypeStatement.setString(1, rxCui);

        final ResultSet rs = this.semanticTypeStatement.executeQuery();

        processSemanticTypes(cuiConcept, rs);

        if (conceptTime < 0) {
            throw new RuntimeException("oops");

        this.importUtil.createConcept(cuiConcept.getPrimordialUuid(), conceptTime, conceptState, null);

    final HashSet<UUID> parents = new HashSet<>();

    this.cuiRelStatementForward.setString(1, rxCui);
            REL.read(null, this.cuiRelStatementForward.executeQuery(), true, this.allowedCUIsForSABs,
                    this.skippedRelForNotMatchingCUIFilter, true, (string -> reverseRel(string)))));
    this.cuiRelStatementBackward.setString(1, rxCui);
            REL.read(null, this.cuiRelStatementBackward.executeQuery(), false, this.allowedCUIsForSABs,
                    this.skippedRelForNotMatchingCUIFilter, true, (string -> reverseRel(string)))));

    // Have to add multiple parents at once, no place to keep all the other details.  Load those as associations for now.
    if (parents.size() > 0) {
        ComponentReference.fromChronology(this.importUtil.addParent(cuiConcept, null,
                parents.toArray(new UUID[parents.size()]), null, null));

From source file:structuredPredictionNLG.SFX.java

/**
 * During this method, we calculate the alignments (naive or random), the language models, the available content and word actions, and finally the feature vectors.
 */
public void createTrainingData() {
    //setTrainingData(new ArrayList<>(getTrainingData().subList(0, 50)));
    //setTestingData(new ArrayList<>(getTrainingData()));

    // Calculate alignments between the words of the sentence and the attribute/values
    if (getUseAlignments().equals("naive")) {
    } else {

    // Create (or load from cache) the content and word language models per predicate
    if (isResetStoredCaches() || !loadLMs()) {
        HashMap<String, ArrayList<ArrayList<String>>> LMWordTrainingPerPred = new HashMap<>();
        HashMap<String, ArrayList<ArrayList<String>>> LMAttrTrainingPerPred = new HashMap<>();
        getTrainingData().stream().map((di) -> {
            if (!LMWordTrainingPerPred.containsKey(di.getMeaningRepresentation().getPredicate())) {
                        new ArrayList<ArrayList<String>>());
                        new ArrayList<ArrayList<String>>());
            return di;
        }).forEachOrdered((di) -> {
            HashSet<ArrayList<Action>> seqs = new HashSet<>();
            seqs.forEach((seq) -> {
                ArrayList<String> wordSeq = new ArrayList<>();
                ArrayList<String> attrSeq = new ArrayList<>();

                // We add some empty tokens at the start of each sequence
                for (int i = 0; i < seq.size(); i++) {
                    if (!seq.get(i).getAttribute().equals(Action.TOKEN_END)
                            && !seq.get(i).getWord().equals(Action.TOKEN_END)) {
                    if (attrSeq.isEmpty()) {
                    } else if (!attrSeq.get(attrSeq.size() - 1).equals(seq.get(i).getAttribute())) {

        setWordLMsPerPredicate(new HashMap<>());
        setContentLMsPerPredicate(new HashMap<>());
        LMWordTrainingPerPred.keySet().stream().map((pred) -> {
            SimpleLM simpleWordLM = new SimpleLM(3);
            getWordLMsPerPredicate().put(pred, simpleWordLM);
            return pred;
        }).forEachOrdered((pred) -> {
            SimpleLM simpleAttrLM = new SimpleLM(3);
            getContentLMsPerPredicate().put(pred, simpleAttrLM);

    // Go through the sequences in the data and populate the available content and word action dictionaries
    // We populate a distinct word dictionary for each attribute, and populate it with the words of word sequences whose corresponding content sequences contain that attribute
    HashMap<String, HashSet<String>> availableContentActions = new HashMap<>();
    HashMap<String, HashMap<String, HashSet<Action>>> availableWordActions = new HashMap<>();
    getTrainingData().forEach((DI) -> {
        String predicate = DI.getMeaningRepresentation().getPredicate();
        if (!availableContentActions.containsKey(predicate)) {
            availableContentActions.put(predicate, new HashSet<String>());
        if (!availableWordActions.containsKey(predicate)) {
            availableWordActions.put(predicate, new HashMap<String, HashSet<Action>>());
        ArrayList<Action> realization = DI.getDirectReferenceSequence();
        realization.stream().filter((a) -> (!a.getAttribute().equals(Action.TOKEN_END)))
                .forEachOrdered((Action a) -> {
                    String attr;
                    if (a.getAttribute().contains("=")) {
                        attr = a.getAttribute().substring(0, a.getAttribute().indexOf('='));
                    } else {
                        attr = a.getAttribute();
                    if (!availableWordActions.get(predicate).containsKey(attr)) {
                        availableWordActions.get(predicate).put(attr, new HashSet<Action>());
                        availableWordActions.get(predicate).get(attr).add(new Action(Action.TOKEN_END, attr));
                    if (!a.getWord().equals(Action.TOKEN_START) && !a.getWord().equals(Action.TOKEN_END)
                            && !a.getWord().matches("([,.?!;:'])")) {
                        if (a.getWord().startsWith(Action.TOKEN_X)) {
                            if (a.getWord().substring(3, a.getWord().lastIndexOf('_')).toLowerCase().trim()
                                    .equals(attr)) {
                                        .add(new Action(a.getWord(), attr));
                        } else {
                            availableWordActions.get(predicate).get(attr).add(new Action(a.getWord(), attr));

    //When using random alignments we do not consider the value alignments either
    if (getUseAlignments().equals("random")) {
        setValueAlignments(new HashMap<>());

    // Infer the feature vectors of the training data
    if (isResetStoredCaches() || !loadTrainingData(getTrainingData().size())) {
        System.out.print("Create training data...");
        Object[] results = inferFeatureAndCostVectors();

        ConcurrentHashMap<DatasetInstance, HashMap<String, ArrayList<Instance>>> getPredicateContentTrainingDataBefore = (ConcurrentHashMap<DatasetInstance, HashMap<String, ArrayList<Instance>>>) results[0];
        ConcurrentHashMap<DatasetInstance, HashMap<String, HashMap<String, ArrayList<Instance>>>> getPredicateWordTrainingDataBefore = (ConcurrentHashMap<DatasetInstance, HashMap<String, HashMap<String, ArrayList<Instance>>>>) results[1];

        // Reorganize the feature/cost vector collections 
        // Initially they are mapped according to DatasetInstance (since it helps with parallel processing) but we prefer them mapped by predicate for training
        setPredicateContentTrainingData(new HashMap<>());
        getTrainingData().forEach((di) -> {
            getPredicateContentTrainingDataBefore.get(di).keySet().stream().map((predicate) -> {
                if (!getPredicateContentTrainingData().containsKey(predicate)) {
                    getPredicateContentTrainingData().put(predicate, new ArrayList<Instance>());
                return predicate;
            }).forEachOrdered((predicate) -> {
        setPredicateWordTrainingData(new HashMap<>());
        getTrainingData().forEach((di) -> {
            getPredicateWordTrainingDataBefore.get(di).keySet().stream().map((predicate) -> {
                if (!getPredicateWordTrainingData().containsKey(predicate)) {
                    getPredicateWordTrainingData().put(predicate, new HashMap<String, ArrayList<Instance>>());
                return predicate;
            }).forEachOrdered((predicate) -> {
                getPredicateWordTrainingDataBefore.get(di).get(predicate).keySet().stream().map((attribute) -> {
                    if (!getPredicateWordTrainingData().get(predicate).containsKey(attribute)) {
                        getPredicateWordTrainingData().get(predicate).put(attribute, new ArrayList<Instance>());
                    return attribute;
                }).forEachOrdered((attribute) -> {

From source file:structuredPredictionNLG.SFX.java

/**
 * @param predicate
 * @param currentAttrValue
 * @param costs
 * @param generatedAttributes
 * @param previousGeneratedWords
 * @param nextGeneratedAttributes
 * @param attrValuesAlreadyMentioned
 * @param attrValuesThatFollow
 * @param wasValueMentioned
 * @param availableWordActions
 * @return
 */
public Instance createWordInstanceWithCosts(String predicate, String currentAttrValue,
        TObjectDoubleHashMap<String> costs, ArrayList<String> generatedAttributes,
        ArrayList<Action> previousGeneratedWords, ArrayList<String> nextGeneratedAttributes,
        HashSet<String> attrValuesAlreadyMentioned, HashSet<String> attrValuesThatFollow,
        boolean wasValueMentioned, HashMap<String, HashSet<Action>> availableWordActions) {
    String currentAttr = currentAttrValue;
    String currentValue = "";
    if (currentAttr.contains("=")) {
        currentAttr = currentAttrValue.substring(0, currentAttrValue.indexOf('='));
        currentValue = currentAttrValue.substring(currentAttrValue.indexOf('=') + 1);
    if (currentValue.contains(":")) {
        currentValue = currentAttrValue.substring(currentAttrValue.indexOf(':') + 1);
    if (currentValue.isEmpty()) {

    TObjectDoubleHashMap<String> generalFeatures = new TObjectDoubleHashMap<>();
    HashMap<String, TObjectDoubleHashMap<String>> valueSpecificFeatures = new HashMap<>();
    for (Action action : availableWordActions.get(currentAttr)) {
        valueSpecificFeatures.put(action.getAction(), new TObjectDoubleHashMap<String>());

    /*if (gWords.get(wIndex).getWord().equals(Action.TOKEN_END)) {
    System.out.println("!!! "+ gWords.subList(0, wIndex + 1));
    ArrayList<Action> generatedWords = new ArrayList<>();
    ArrayList<Action> generatedWordsInSameAttrValue = new ArrayList<>();
    ArrayList<String> generatedPhrase = new ArrayList<>();
    for (int i = 0; i < previousGeneratedWords.size(); i++) {
        Action a = previousGeneratedWords.get(i);
        if (!a.getWord().equals(Action.TOKEN_START) && !a.getWord().equals(Action.TOKEN_END)) {
            if (a.getAttribute().equals(currentAttrValue)) {

    //Previous word features
    for (int j = 1; j <= 1; j++) {
        String previousWord = "@@";
        if (generatedWords.size() - j >= 0) {
            previousWord = generatedWords.get(generatedWords.size() - j).getWord().trim();
        generalFeatures.put("feature_word_" + j + "_" + previousWord.toLowerCase(), 1.0);
    String prevWord = "@@";
    if (generatedWords.size() - 1 >= 0) {
        prevWord = generatedWords.get(generatedWords.size() - 1).getWord().trim();
    String prev2Word = "@@";
    if (generatedWords.size() - 2 >= 0) {
        prev2Word = generatedWords.get(generatedWords.size() - 2).getWord().trim();
    String prev3Word = "@@";
    if (generatedWords.size() - 3 >= 0) {
        prev3Word = generatedWords.get(generatedWords.size() - 3).getWord().trim();
    String prev4Word = "@@";
    if (generatedWords.size() - 4 >= 0) {
        prev4Word = generatedWords.get(generatedWords.size() - 4).getWord().trim();
    String prev5Word = "@@";
    if (generatedWords.size() - 5 >= 0) {
        prev5Word = generatedWords.get(generatedWords.size() - 5).getWord().trim();

    String prevBigram = prev2Word + "|" + prevWord;
    String prevTrigram = prev3Word + "|" + prev2Word + "|" + prevWord;
    String prev4gram = prev4Word + "|" + prev3Word + "|" + prev2Word + "|" + prevWord;
    String prev5gram = prev5Word + "|" + prev4Word + "|" + prev3Word + "|" + prev2Word + "|" + prevWord;

    generalFeatures.put("feature_word_bigram_" + prevBigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_word_trigram_" + prevTrigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_word_4gram_" + prev4gram.toLowerCase(), 1.0);
    generalFeatures.put("feature_word_5gram_" + prev5gram.toLowerCase(), 1.0);

    /*String bigramWord54 = prev5Word + "|" + prev4Word;
    String bigramWord43 = prev4Word + "|" + prev3Word;
    String bigramWord32 = prev3Word + "|" + prev2Word;
    generalFeatures.put("feature_word_bigramWord54_" + bigramWord54, 1.0);
    generalFeatures.put("feature_word_bigramWord43_" + bigramWord43, 1.0);
    generalFeatures.put("feature_word_bigramWord32_" + bigramWord32, 1.0);
    String bigramWordSkip53 = prev5Word + "|" + prev3Word;
    String bigramWordSkip42 = prev4Word + "|" + prev2Word;
    String bigramWordSkip31 = prev3Word + "|" + prevWord;
    generalFeatures.put("feature_word_bigramWordSkip53_" + bigramWordSkip53, 1.0);
    generalFeatures.put("feature_word_bigramWordSkip42_" + bigramWordSkip42, 1.0);
    generalFeatures.put("feature_word_bigramWordSkip31_" + bigramWordSkip31, 1.0);
    String trigramWord543 = prev5Word + "|" + prev4Word + "|" + prev3Word;
    String trigramWord432 = prev4Word + "|" + prev3Word + "|" + prev2Word;
    generalFeatures.put("feature_word_trigramWord543_" + trigramWord543, 1.0);
    generalFeatures.put("feature_word_trigramWord432_" + trigramWord432, 1.0);
    String trigramWordSkip542 = prev5Word + "|" + prev4Word + "|" + prev2Word;
    String trigramWordSkip532 = prev5Word + "|" + prev3Word + "|" + prev2Word;
    String trigramWordSkip431 = prev4Word + "|" + prev3Word + "|" + prevWord;
    String trigramWordSkip421 = prev4Word + "|" + prev2Word + "|" + prevWord;
    generalFeatures.put("feature_word_trigramWordSkip542_" + trigramWordSkip542, 1.0);
    generalFeatures.put("feature_word_trigramWordSkip532_" + trigramWordSkip532, 1.0);
    generalFeatures.put("feature_word_trigramWordSkip431_" + trigramWordSkip431, 1.0);
    generalFeatures.put("feature_word_trigramWordSkip421_" + trigramWordSkip421, 1.0);*/
    //Previous words in same as current attrValue features
    /*if (generatedWordsInSameAttrValue.isEmpty()) {
    generalFeatures.put("feature_currentAttrValueWord_isEmpty", 1.0);
    for (int j = 1; j <= 1; j++) {
    String previousCurrentAttrValueWord = "@@";
    if (generatedWordsInSameAttrValue.size() - j >= 0) {
        previousCurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - j).getWord().trim();
    generalFeatures.put("feature_currentAttrValueWord_" + j + "_" + previousCurrentAttrValueWord.toLowerCase(), 1.0);
    String prevCurrentAttrValueWord = "@@";
    if (generatedWordsInSameAttrValue.size() - 1 >= 0) {
    prevCurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 1).getWord().trim();
    String prev2CurrentAttrValueWord = "@@";
    if (generatedWordsInSameAttrValue.size() - 2 >= 0) {
    prev2CurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 2).getWord().trim();
    String prev3CurrentAttrValueWord = "@@";
    if (generatedWordsInSameAttrValue.size() - 3 >= 0) {
    prev3CurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 3).getWord().trim();
    String prev4CurrentAttrValueWord = "@@";
    if (generatedWordsInSameAttrValue.size() - 4 >= 0) {
    prev4CurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 4).getWord().trim();
    String prev5CurrentAttrValueWord = "@@";
    if (generatedWordsInSameAttrValue.size() - 5 >= 0) {
    prev5CurrentAttrValueWord = generatedWordsInSameAttrValue.get(generatedWordsInSameAttrValue.size() - 5).getWord().trim();
    String prevCurrentAttrValueBigram = prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord;
    String prevCurrentAttrValueTrigram = prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord;
    String prevCurrentAttrValue4gram = prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord;
    String prevCurrentAttrValue5gram = prev5CurrentAttrValueWord + "|" + prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord;
    generalFeatures.put("feature_currentAttrValueWord_bigram_" + prevCurrentAttrValueBigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_currentAttrValueWord_trigram_" + prevCurrentAttrValueTrigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_currentAttrValueWord_4gram_" + prevCurrentAttrValue4gram.toLowerCase(), 1.0);
    generalFeatures.put("feature_currentAttrValueWord_5gram_" + prevCurrentAttrValue5gram.toLowerCase(), 1.0);*/

    /*String bigramCurrentAttrValueWord54 = prev5CurrentAttrValueWord + "|" + prev4CurrentAttrValueWord;
           String bigramCurrentAttrValueWord43 = prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord;
           String bigramCurrentAttrValueWord32 = prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord;
           generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWord54_" + bigramCurrentAttrValueWord54, 1.0);
           generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWord43_" + bigramCurrentAttrValueWord43, 1.0);
           generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWord32_" + bigramCurrentAttrValueWord32, 1.0);
           String bigramCurrentAttrValueWordSkip53 = prev5CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord;
           String bigramCurrentAttrValueWordSkip42 = prev4CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord;
           String bigramCurrentAttrValueWordSkip31 = prev3CurrentAttrValueWord + "|" + prevCurrentAttrValueWord;
           generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWordSkip53_" + bigramCurrentAttrValueWordSkip53, 1.0);
           generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWordSkip42_" + bigramCurrentAttrValueWordSkip42, 1.0);
           generalFeatures.put("feature_currentAttrValueWord_bigramCurrentAttrValueWordSkip31_" + bigramCurrentAttrValueWordSkip31, 1.0);
           String trigramCurrentAttrValueWord543 = prev5CurrentAttrValueWord + "|" + prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord;
           String trigramCurrentAttrValueWord432 = prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord;
           generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWord543_" + trigramCurrentAttrValueWord543, 1.0);
           generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWord432_" + trigramCurrentAttrValueWord432, 1.0);
           String trigramCurrentAttrValueWordSkip542 = prev5CurrentAttrValueWord + "|" + prev4CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord;
           String trigramCurrentAttrValueWordSkip532 = prev5CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord;
           String trigramCurrentAttrValueWordSkip431 = prev4CurrentAttrValueWord + "|" + prev3CurrentAttrValueWord + "|" + prevCurrentAttrValueWord;
           String trigramCurrentAttrValueWordSkip421 = prev4CurrentAttrValueWord + "|" + prev2CurrentAttrValueWord + "|" + prevCurrentAttrValueWord;
           generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWordSkip542_" + trigramCurrentAttrValueWordSkip542, 1.0);
           generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWordSkip532_" + trigramCurrentAttrValueWordSkip532, 1.0);
           generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWordSkip431_" + trigramCurrentAttrValueWordSkip431, 1.0);
           generalFeatures.put("feature_currentAttrValueWord_trigramCurrentAttrValueWordSkip421_" + trigramCurrentAttrValueWordSkip421, 1.0);*/
    //Previous Attr|Word features
    for (int j = 1; j <= 1; j++) {
        String previousAttrWord = "@@";
        if (generatedWords.size() - j >= 0) {
            if (generatedWords.get(generatedWords.size() - j).getAttribute().contains("=")) {
                previousAttrWord = generatedWords.get(generatedWords.size() - j).getAttribute().trim()
                        .substring(0, generatedWords.get(generatedWords.size() - j).getAttribute().indexOf('='))
                        + "|" + generatedWords.get(generatedWords.size() - j).getWord().trim();
            } else {
                previousAttrWord = generatedWords.get(generatedWords.size() - j).getAttribute().trim() + "|"
                        + generatedWords.get(generatedWords.size() - j).getWord().trim();
        generalFeatures.put("feature_attrWord_" + j + "_" + previousAttrWord.toLowerCase(), 1.0);
    String prevAttrWord = "@@";
    if (generatedWords.size() - 1 >= 0) {
        if (generatedWords.get(generatedWords.size() - 1).getAttribute().contains("=")) {
            prevAttrWord = generatedWords.get(generatedWords.size() - 1).getAttribute().trim().substring(0,
                    generatedWords.get(generatedWords.size() - 1).getAttribute().indexOf('=')) + ":"
                    + generatedWords.get(generatedWords.size() - 1).getWord().trim();
        } else {
            prevAttrWord = generatedWords.get(generatedWords.size() - 1).getAttribute().trim() + ":"
                    + generatedWords.get(generatedWords.size() - 1).getWord().trim();

    String prev2AttrWord = "@@";
    if (generatedWords.size() - 2 >= 0) {
        if (generatedWords.get(generatedWords.size() - 2).getAttribute().contains("=")) {
            prev2AttrWord = generatedWords.get(generatedWords.size() - 2).getAttribute().trim().substring(0,
                    generatedWords.get(generatedWords.size() - 2).getAttribute().indexOf('=')) + ":"
                    + generatedWords.get(generatedWords.size() - 2).getWord().trim();
        } else {
            prev2AttrWord = generatedWords.get(generatedWords.size() - 2).getAttribute().trim() + ":"
                    + generatedWords.get(generatedWords.size() - 2).getWord().trim();
    String prev3AttrWord = "@@";
    if (generatedWords.size() - 3 >= 0) {
        if (generatedWords.get(generatedWords.size() - 3).getAttribute().contains("=")) {
            prev3AttrWord = generatedWords.get(generatedWords.size() - 3).getAttribute().trim().substring(0,
                    generatedWords.get(generatedWords.size() - 3).getAttribute().indexOf('=')) + ":"
                    + generatedWords.get(generatedWords.size() - 3).getWord().trim();
        } else {
            prev3AttrWord = generatedWords.get(generatedWords.size() - 3).getAttribute().trim() + ":"
                    + generatedWords.get(generatedWords.size() - 3).getWord().trim();
    String prev4AttrWord = "@@";
    if (generatedWords.size() - 4 >= 0) {
        if (generatedWords.get(generatedWords.size() - 4).getAttribute().contains("=")) {
            prev4AttrWord = generatedWords.get(generatedWords.size() - 4).getAttribute().trim().substring(0,
                    generatedWords.get(generatedWords.size() - 4).getAttribute().indexOf('=')) + ":"
                    + generatedWords.get(generatedWords.size() - 4).getWord().trim();
        } else {
            prev4AttrWord = generatedWords.get(generatedWords.size() - 4).getAttribute().trim() + ":"
                    + generatedWords.get(generatedWords.size() - 4).getWord().trim();
    String prev5AttrWord = "@@";
    if (generatedWords.size() - 5 >= 0) {
        if (generatedWords.get(generatedWords.size() - 5).getAttribute().contains("=")) {
            prev5AttrWord = generatedWords.get(generatedWords.size() - 5).getAttribute().trim().substring(0,
                    generatedWords.get(generatedWords.size() - 5).getAttribute().indexOf('=')) + ":"
                    + generatedWords.get(generatedWords.size() - 5).getWord().trim();
        } else {
            prev5AttrWord = generatedWords.get(generatedWords.size() - 5).getAttribute().trim() + ":"
                    + generatedWords.get(generatedWords.size() - 5).getWord().trim();

    String prevAttrWordBigram = prev2AttrWord + "|" + prevAttrWord;
    String prevAttrWordTrigram = prev3AttrWord + "|" + prev2AttrWord + "|" + prevAttrWord;
    String prevAttrWord4gram = prev4AttrWord + "|" + prev3AttrWord + "|" + prev2AttrWord + "|" + prevAttrWord;
    String prevAttrWord5gram = prev5AttrWord + "|" + prev4AttrWord + "|" + prev3AttrWord + "|" + prev2AttrWord
            + "|" + prevAttrWord;

    generalFeatures.put("feature_attrWord_bigram_" + prevAttrWordBigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrWord_trigram_" + prevAttrWordTrigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrWord_4gram_" + prevAttrWord4gram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrWord_5gram_" + prevAttrWord5gram.toLowerCase(), 1.0);

    /*String bigramAttrWord54 = prev5AttrWord + "|" + prev4AttrWord;
    String bigramAttrWord43 = prev4AttrWord + "|" + prev3AttrWord;
    String bigramAttrWord32 = prev3AttrWord + "|" + prev2AttrWord;
    generalFeatures.put("feature_attrWord_bigramAttrWord54_" + bigramAttrWord54, 1.0);
    generalFeatures.put("feature_attrWord_bigramAttrWord43_" + bigramAttrWord43, 1.0);
    generalFeatures.put("feature_attrWord_bigramAttrWord32_" + bigramAttrWord32, 1.0);
    String bigramAttrWordSkip53 = prev5AttrWord + "|" + prev3AttrWord;
    String bigramAttrWordSkip42 = prev4AttrWord + "|" + prev2AttrWord;
    String bigramAttrWordSkip31 = prev3AttrWord + "|" + prevAttrWord;
    generalFeatures.put("feature_attrWord_bigramAttrWordSkip53_" + bigramAttrWordSkip53, 1.0);
    generalFeatures.put("feature_attrWord_bigramAttrWordSkip42_" + bigramAttrWordSkip42, 1.0);
    generalFeatures.put("feature_attrWord_bigramAttrWordSkip31_" + bigramAttrWordSkip31, 1.0);
    String trigramAttrWord543 = prev5AttrWord + "|" + prev4AttrWord + "|" + prev3AttrWord;
    String trigramAttrWord432 = prev4AttrWord + "|" + prev3AttrWord + "|" + prev2AttrWord;
    generalFeatures.put("feature_attrWord_trigramAttrWord543_" + trigramAttrWord543, 1.0);
    generalFeatures.put("feature_attrWord_trigramAttrWord432_" + trigramAttrWord432, 1.0);
    String trigramAttrWordSkip542 = prev5AttrWord + "|" + prev4AttrWord + "|" + prev2AttrWord;
    String trigramAttrWordSkip532 = prev5AttrWord + "|" + prev3AttrWord + "|" + prev2AttrWord;
    String trigramAttrWordSkip431 = prev4AttrWord + "|" + prev3AttrWord + "|" + prevAttrWord;
    String trigramAttrWordSkip421 = prev4AttrWord + "|" + prev2AttrWord + "|" + prevAttrWord;
    generalFeatures.put("feature_attrWord_trigramAttrWordSkip542_" + trigramAttrWordSkip542, 1.0);
    generalFeatures.put("feature_attrWord_trigramAttrWordSkip532_" + trigramAttrWordSkip532, 1.0);
    generalFeatures.put("feature_attrWord_trigramAttrWordSkip431_" + trigramAttrWordSkip431, 1.0);
    generalFeatures.put("feature_attrWord_trigramAttrWordSkip421_" + trigramAttrWordSkip421, 1.0);*/
    //Previous AttrValue|Word features
    for (int j = 1; j <= 1; j++) {
        String previousAttrWord = "@@";
        if (generatedWords.size() - j >= 0) {
            previousAttrWord = generatedWords.get(generatedWords.size() - j).getAttribute().trim() + "|"
                    + generatedWords.get(generatedWords.size() - j).getWord().trim();
        generalFeatures.put("feature_attrValueWord_" + j + "_" + previousAttrWord.toLowerCase(), 1.0);
    String prevAttrValueWord = "@@";
    if (generatedWords.size() - 1 >= 0) {
        prevAttrValueWord = generatedWords.get(generatedWords.size() - 1).getAttribute().trim() + ":"
                + generatedWords.get(generatedWords.size() - 1).getWord().trim();
    String prev2AttrValueWord = "@@";
    if (generatedWords.size() - 2 >= 0) {
        prev2AttrValueWord = generatedWords.get(generatedWords.size() - 2).getAttribute().trim() + ":"
                + generatedWords.get(generatedWords.size() - 2).getWord().trim();
    String prev3AttrValueWord = "@@";
    if (generatedWords.size() - 3 >= 0) {
        prev3AttrValueWord = generatedWords.get(generatedWords.size() - 3).getAttribute().trim() + ":"
                + generatedWords.get(generatedWords.size() - 3).getWord().trim();
    String prev4AttrValueWord = "@@";
    if (generatedWords.size() - 4 >= 0) {
        prev4AttrValueWord = generatedWords.get(generatedWords.size() - 4).getAttribute().trim() + ":"
                + generatedWords.get(generatedWords.size() - 4).getWord().trim();
    String prev5AttrValueWord = "@@";
    if (generatedWords.size() - 5 >= 0) {
        prev5AttrValueWord = generatedWords.get(generatedWords.size() - 5).getAttribute().trim() + ":"
                + generatedWords.get(generatedWords.size() - 5).getWord().trim();

    String prevAttrValueWordBigram = prev2AttrValueWord + "|" + prevAttrValueWord;
    String prevAttrValueWordTrigram = prev3AttrValueWord + "|" + prev2AttrValueWord + "|" + prevAttrValueWord;
    String prevAttrValueWord4gram = prev4AttrValueWord + "|" + prev3AttrValueWord + "|" + prev2AttrValueWord
            + "|" + prevAttrValueWord;
    String prevAttrValueWord5gram = prev5AttrValueWord + "|" + prev4AttrValueWord + "|" + prev3AttrValueWord
            + "|" + prev2AttrValueWord + "|" + prevAttrValueWord;

    generalFeatures.put("feature_attrValueWord_bigram_" + prevAttrValueWordBigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrValueWord_trigram_" + prevAttrValueWordTrigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrValueWord_4gram_" + prevAttrValueWord4gram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrValueWord_5gram_" + prevAttrValueWord5gram.toLowerCase(), 1.0);

    /*String bigramAttrValueWord54 = prev5AttrValueWord + "|" + prev4AttrValueWord;
    String bigramAttrValueWord43 = prev4AttrValueWord + "|" + prev3AttrValueWord;
    String bigramAttrValueWord32 = prev3AttrValueWord + "|" + prev2AttrValueWord;
    generalFeatures.put("feature_attrValueWord_bigramAttrValueWord54_" + bigramAttrValueWord54, 1.0);
    generalFeatures.put("feature_attrValueWord_bigramAttrValueWord43_" + bigramAttrValueWord43, 1.0);
    generalFeatures.put("feature_attrValueWord_bigramAttrValueWord32_" + bigramAttrValueWord32, 1.0);
    String bigramAttrValueWordSkip53 = prev5AttrValueWord + "|" + prev3AttrValueWord;
    String bigramAttrValueWordSkip42 = prev4AttrValueWord + "|" + prev2AttrValueWord;
    String bigramAttrValueWordSkip31 = prev3AttrValueWord + "|" + prevAttrValueWord;
    generalFeatures.put("feature_attrValueWord_bigramAttrValueWordSkip53_" + bigramAttrValueWordSkip53, 1.0);
    generalFeatures.put("feature_attrValueWord_bigramAttrValueWordSkip42_" + bigramAttrValueWordSkip42, 1.0);
    generalFeatures.put("feature_attrValueWord_bigramAttrValueWordSkip31_" + bigramAttrValueWordSkip31, 1.0);
    String trigramAttrValueWord543 = prev5AttrValueWord + "|" + prev4AttrValueWord + "|" + prev3AttrValueWord;
    String trigramAttrValueWord432 = prev4AttrValueWord + "|" + prev3AttrValueWord + "|" + prev2AttrValueWord;
    generalFeatures.put("feature_attrValueWord_trigramAttrValueWord543_" + trigramAttrValueWord543, 1.0);
    generalFeatures.put("feature_attrValueWord_trigramAttrValueWord432_" + trigramAttrValueWord432, 1.0);
    String trigramAttrValueWordSkip542 = prev5AttrValueWord + "|" + prev4AttrValueWord + "|" + prev2AttrValueWord;
    String trigramAttrValueWordSkip532 = prev5AttrValueWord + "|" + prev3AttrValueWord + "|" + prev2AttrValueWord;
    String trigramAttrValueWordSkip431 = prev4AttrValueWord + "|" + prev3AttrValueWord + "|" + prevAttrValueWord;
    String trigramAttrValueWordSkip421 = prev4AttrValueWord + "|" + prev2AttrValueWord + "|" + prevAttrValueWord;
    generalFeatures.put("feature_attrValueWord_trigramAttrValueWordSkip542_" + trigramAttrValueWordSkip542, 1.0);
    generalFeatures.put("feature_attrValueWord_trigramAttrValueWordSkip532_" + trigramAttrValueWordSkip532, 1.0);
    generalFeatures.put("feature_attrValueWord_trigramAttrValueWordSkip431_" + trigramAttrValueWordSkip431, 1.0);
    generalFeatures.put("feature_attrValueWord_trigramAttrValueWordSkip421_" + trigramAttrValueWordSkip421, 1.0);*/
    //Previous attrValue features
    int attributeSize = generatedAttributes.size();
    for (int j = 1; j <= 1; j++) {
        String previousAttrValue = "@@";
        if (attributeSize - j >= 0) {
            previousAttrValue = generatedAttributes.get(attributeSize - j).trim();
        generalFeatures.put("feature_attrValue_" + j + "_" + previousAttrValue, 1.0);
    String prevAttrValue = "@@";
    if (attributeSize - 1 >= 0) {
        prevAttrValue = generatedAttributes.get(attributeSize - 1).trim();
    String prev2AttrValue = "@@";
    if (attributeSize - 2 >= 0) {
        prev2AttrValue = generatedAttributes.get(attributeSize - 2).trim();
    String prev3AttrValue = "@@";
    if (attributeSize - 3 >= 0) {
        prev3AttrValue = generatedAttributes.get(attributeSize - 3).trim();
    String prev4AttrValue = "@@";
    if (attributeSize - 4 >= 0) {
        prev4AttrValue = generatedAttributes.get(attributeSize - 4).trim();
    String prev5AttrValue = "@@";
    if (attributeSize - 5 >= 0) {
        prev5AttrValue = generatedAttributes.get(attributeSize - 5).trim();

    String prevAttrBigramValue = prev2AttrValue + "|" + prevAttrValue;
    String prevAttrTrigramValue = prev3AttrValue + "|" + prev2AttrValue + "|" + prevAttrValue;
    String prevAttr4gramValue = prev4AttrValue + "|" + prev3AttrValue + "|" + prev2AttrValue + "|"
            + prevAttrValue;
    String prevAttr5gramValue = prev5AttrValue + "|" + prev4AttrValue + "|" + prev3AttrValue + "|"
            + prev2AttrValue + "|" + prevAttrValue;

    generalFeatures.put("feature_attrValue_bigram_" + prevAttrBigramValue.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrValue_trigram_" + prevAttrTrigramValue.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrValue_4gram_" + prevAttr4gramValue.toLowerCase(), 1.0);
    generalFeatures.put("feature_attrValue_5gram_" + prevAttr5gramValue.toLowerCase(), 1.0);

    /*String bigramAttrValue54 = prev5AttrValue + "|" + prev4AttrValue;
    String bigramAttrValue43 = prev4AttrValue + "|" + prev3AttrValue;
    String bigramAttrValue32 = prev3AttrValue + "|" + prev2AttrValue;
    generalFeatures.put("feature_attrValue_bigramAttrValue54_" + bigramAttrValue54, 1.0);
    generalFeatures.put("feature_attrValue_bigramAttrValue43_" + bigramAttrValue43, 1.0);
    generalFeatures.put("feature_attrValue_bigramAttrValue32_" + bigramAttrValue32, 1.0);
    String bigramAttrValueSkip53 = prev5AttrValue + "|" + prev3AttrValue;
    String bigramAttrValueSkip42 = prev4AttrValue + "|" + prev2AttrValue;
    String bigramAttrValueSkip31 = prev3AttrValue + "|" + prevAttrValue;
    generalFeatures.put("feature_attrValue_bigramAttrValueSkip53_" + bigramAttrValueSkip53, 1.0);
    generalFeatures.put("feature_attrValue_bigramAttrValueSkip42_" + bigramAttrValueSkip42, 1.0);
    generalFeatures.put("feature_attrValue_bigramAttrValueSkip31_" + bigramAttrValueSkip31, 1.0);
    String trigramAttrValue543 = prev5AttrValue + "|" + prev4AttrValue + "|" + prev3AttrValue;
    String trigramAttrValue432 = prev4AttrValue + "|" + prev3AttrValue + "|" + prev2AttrValue;
    generalFeatures.put("feature_attrValue_trigramAttrValue543_" + trigramAttrValue543, 1.0);
    generalFeatures.put("feature_attrValue_trigramAttrValue432_" + trigramAttrValue432, 1.0);
    String trigramAttrValueSkip542 = prev5AttrValue + "|" + prev4AttrValue + "|" + prev2AttrValue;
    String trigramAttrValueSkip532 = prev5AttrValue + "|" + prev3AttrValue + "|" + prev2AttrValue;
    String trigramAttrValueSkip431 = prev4AttrValue + "|" + prev3AttrValue + "|" + prevAttrValue;
    String trigramAttrValueSkip421 = prev4AttrValue + "|" + prev2AttrValue + "|" + prevAttrValue;
    generalFeatures.put("feature_attrValue_trigramAttrValueSkip542_" + trigramAttrValueSkip542, 1.0);
    generalFeatures.put("feature_attrValue_trigramAttrValueSkip532_" + trigramAttrValueSkip532, 1.0);
    generalFeatures.put("feature_attrValue_trigramAttrValueSkip431_" + trigramAttrValueSkip431, 1.0);
    generalFeatures.put("feature_attrValue_trigramAttrValueSkip421_" + trigramAttrValueSkip421, 1.0);*/
    //Previous attr features
    for (int j = 1; j <= 1; j++) {
        String previousAttr = "@@";
        if (attributeSize - j >= 0) {
            if (generatedAttributes.get(attributeSize - j).contains("=")) {
                previousAttr = generatedAttributes.get(attributeSize - j).trim().substring(0,
                        generatedAttributes.get(attributeSize - j).indexOf('='));
            } else {
                previousAttr = generatedAttributes.get(attributeSize - j).trim();
        generalFeatures.put("feature_attr_" + j + "_" + previousAttr, 1.0);
    String prevAttr = "@@";
    if (attributeSize - 1 >= 0) {
        if (generatedAttributes.get(attributeSize - 1).contains("=")) {
            prevAttr = generatedAttributes.get(attributeSize - 1).trim().substring(0,
                    generatedAttributes.get(attributeSize - 1).indexOf('='));
        } else {
            prevAttr = generatedAttributes.get(attributeSize - 1).trim();
    String prev2Attr = "@@";
    if (attributeSize - 2 >= 0) {
        if (generatedAttributes.get(attributeSize - 2).contains("=")) {
            prev2Attr = generatedAttributes.get(attributeSize - 2).trim().substring(0,
                    generatedAttributes.get(attributeSize - 2).indexOf('='));
        } else {
            prev2Attr = generatedAttributes.get(attributeSize - 2).trim();
    String prev3Attr = "@@";
    if (attributeSize - 3 >= 0) {
        if (generatedAttributes.get(attributeSize - 3).contains("=")) {
            prev3Attr = generatedAttributes.get(attributeSize - 3).trim().substring(0,
                    generatedAttributes.get(attributeSize - 3).indexOf('='));
        } else {
            prev3Attr = generatedAttributes.get(attributeSize - 3).trim();
    String prev4Attr = "@@";
    if (attributeSize - 4 >= 0) {
        if (generatedAttributes.get(attributeSize - 4).contains("=")) {
            prev4Attr = generatedAttributes.get(attributeSize - 4).trim().substring(0,
                    generatedAttributes.get(attributeSize - 4).indexOf('='));
        } else {
            prev4Attr = generatedAttributes.get(attributeSize - 4).trim();
    String prev5Attr = "@@";
    if (attributeSize - 5 >= 0) {
        if (generatedAttributes.get(attributeSize - 5).contains("=")) {
            prev5Attr = generatedAttributes.get(attributeSize - 5).trim().substring(0,
                    generatedAttributes.get(attributeSize - 5).indexOf('='));
        } else {
            prev5Attr = generatedAttributes.get(attributeSize - 5).trim();

    String prevAttrBigram = prev2Attr + "|" + prevAttr;
    String prevAttrTrigram = prev3Attr + "|" + prev2Attr + "|" + prevAttr;
    String prevAttr4gram = prev4Attr + "|" + prev3Attr + "|" + prev2Attr + "|" + prevAttr;
    String prevAttr5gram = prev5Attr + "|" + prev4Attr + "|" + prev3Attr + "|" + prev2Attr + "|" + prevAttr;

    generalFeatures.put("feature_attr_bigram_" + prevAttrBigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attr_trigram_" + prevAttrTrigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attr_4gram_" + prevAttr4gram.toLowerCase(), 1.0);
    generalFeatures.put("feature_attr_5gram_" + prevAttr5gram.toLowerCase(), 1.0);

    /*String bigramAttr54 = prev5Attr + "|" + prev4Attr;
    String bigramAttr43 = prev4Attr + "|" + prev3Attr;
    String bigramAttr32 = prev3Attr + "|" + prev2Attr;
    generalFeatures.put("feature_attr_bigramAttr54_" + bigramAttr54, 1.0);
    generalFeatures.put("feature_attr_bigramAttr43_" + bigramAttr43, 1.0);
    generalFeatures.put("feature_attr_bigramAttr32_" + bigramAttr32, 1.0);
    String bigramAttrSkip53 = prev5Attr + "|" + prev3Attr;
    String bigramAttrSkip42 = prev4Attr + "|" + prev2Attr;
    String bigramAttrSkip31 = prev3Attr + "|" + prevAttr;
    generalFeatures.put("feature_attr_bigramAttrSkip53_" + bigramAttrSkip53, 1.0);
    generalFeatures.put("feature_attr_bigramAttrSkip42_" + bigramAttrSkip42, 1.0);
    generalFeatures.put("feature_attr_bigramAttrSkip31_" + bigramAttrSkip31, 1.0);
    String trigramAttr543 = prev5Attr + "|" + prev4Attr + "|" + prev3Attr;
    String trigramAttr432 = prev4Attr + "|" + prev3Attr + "|" + prev2Attr;
    generalFeatures.put("feature_attr_trigramAttr543_" + trigramAttr543, 1.0);
    generalFeatures.put("feature_attr_trigramAttr432_" + trigramAttr432, 1.0);
    String trigramAttrSkip542 = prev5Attr + "|" + prev4Attr + "|" + prev2Attr;
    String trigramAttrSkip532 = prev5Attr + "|" + prev3Attr + "|" + prev2Attr;
    String trigramAttrSkip431 = prev4Attr + "|" + prev3Attr + "|" + prevAttr;
    String trigramAttrSkip421 = prev4Attr + "|" + prev2Attr + "|" + prevAttr;
    generalFeatures.put("feature_attr_trigramAttrSkip542_" + trigramAttrSkip542, 1.0);
    generalFeatures.put("feature_attr_trigramAttrSkip532_" + trigramAttrSkip532, 1.0);
    generalFeatures.put("feature_attr_trigramAttrSkip431_" + trigramAttrSkip431, 1.0);
    generalFeatures.put("feature_attr_trigramAttrSkip421_" + trigramAttrSkip421, 1.0);*/
    //Next attr features
    for (int j = 0; j < 1; j++) {
        String nextAttr = "@@";
        if (j < nextGeneratedAttributes.size()) {
            if (nextGeneratedAttributes.get(j).contains("=")) {
                nextAttr = nextGeneratedAttributes.get(j).trim().substring(0,
            } else {
                nextAttr = nextGeneratedAttributes.get(j).trim();
        generalFeatures.put("feature_nextAttr_" + j + "_" + nextAttr, 1.0);
    String nextAttr = "@@";
    if (0 < nextGeneratedAttributes.size()) {
        if (nextGeneratedAttributes.get(0).contains("=")) {
            nextAttr = nextGeneratedAttributes.get(0).trim().substring(0,
        } else {
            nextAttr = nextGeneratedAttributes.get(0).trim();
    String next2Attr = "@@";
    if (1 < nextGeneratedAttributes.size()) {
        if (nextGeneratedAttributes.get(1).contains("=")) {
            next2Attr = nextGeneratedAttributes.get(1).trim().substring(0,
        } else {
            next2Attr = nextGeneratedAttributes.get(1).trim();
    String next3Attr = "@@";
    if (2 < nextGeneratedAttributes.size()) {
        if (nextGeneratedAttributes.get(2).contains("=")) {
            next3Attr = nextGeneratedAttributes.get(2).trim().substring(0,
        } else {
            next3Attr = nextGeneratedAttributes.get(2).trim();
    String next4Attr = "@@";
    if (3 < nextGeneratedAttributes.size()) {
        if (nextGeneratedAttributes.get(3).contains("=")) {
            next4Attr = nextGeneratedAttributes.get(3).trim().substring(0,
        } else {
            next4Attr = nextGeneratedAttributes.get(3).trim();
    String next5Attr = "@@";
    if (4 < nextGeneratedAttributes.size()) {
        if (nextGeneratedAttributes.get(4).contains("=")) {
            next5Attr = nextGeneratedAttributes.get(4).trim().substring(0,
        } else {
            next5Attr = nextGeneratedAttributes.get(4).trim();

    String nextAttrBigram = nextAttr + "|" + next2Attr;
    String nextAttrTrigram = nextAttr + "|" + next2Attr + "|" + next3Attr;
    String nextAttr4gram = nextAttr + "|" + next2Attr + "|" + next3Attr + "|" + next4Attr;
    String nextAttr5gram = nextAttr + "|" + next2Attr + "|" + next3Attr + "|" + next4Attr + "|" + next5Attr;

    generalFeatures.put("feature_nextAttr_bigram_" + nextAttrBigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_nextAttr_trigram_" + nextAttrTrigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_nextAttr_4gram_" + nextAttr4gram.toLowerCase(), 1.0);
    generalFeatures.put("feature_nextAttr_5gram_" + nextAttr5gram.toLowerCase(), 1.0);

    //Next attrValue features
    for (int j = 0; j < 1; j++) {
        String nextAttrValue = "@@";
        if (j < nextGeneratedAttributes.size()) {
            nextAttrValue = nextGeneratedAttributes.get(j).trim();
        generalFeatures.put("feature_nextAttrValue_" + j + "_" + nextAttrValue, 1.0);
    String nextAttrValue = "@@";
    if (0 < nextGeneratedAttributes.size()) {
        nextAttrValue = nextGeneratedAttributes.get(0).trim();
    String next2AttrValue = "@@";
    if (1 < nextGeneratedAttributes.size()) {
        next2AttrValue = nextGeneratedAttributes.get(1).trim();
    String next3AttrValue = "@@";
    if (2 < nextGeneratedAttributes.size()) {
        next3AttrValue = nextGeneratedAttributes.get(2).trim();
    String next4AttrValue = "@@";
    if (3 < nextGeneratedAttributes.size()) {
        next4AttrValue = nextGeneratedAttributes.get(3).trim();
    String next5AttrValue = "@@";
    if (4 < nextGeneratedAttributes.size()) {
        next5AttrValue = nextGeneratedAttributes.get(4).trim();

    String nextAttrValueBigram = nextAttrValue + "|" + next2AttrValue;
    String nextAttrValueTrigram = nextAttrValue + "|" + next2AttrValue + "|" + next3AttrValue;
    String nextAttrValue4gram = nextAttrValue + "|" + next2AttrValue + "|" + next3AttrValue + "|"
            + next4AttrValue;
    String nextAttrValue5gram = nextAttrValue + "|" + next2AttrValue + "|" + next3AttrValue + "|"
            + next4AttrValue + "|" + next5AttrValue;

    generalFeatures.put("feature_nextAttrValue_bigram_" + nextAttrValueBigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_nextAttrValue_trigram_" + nextAttrValueTrigram.toLowerCase(), 1.0);
    generalFeatures.put("feature_nextAttrValue_4gram_" + nextAttrValue4gram.toLowerCase(), 1.0);
    generalFeatures.put("feature_nextAttrValue_5gram_" + nextAttrValue5gram.toLowerCase(), 1.0);

    //If values have already been generated or not
    generalFeatures.put("feature_valueToBeMentioned_" + currentValue.toLowerCase(), 1.0);
    if (wasValueMentioned) {
        generalFeatures.put("feature_wasValueMentioned_true", 1.0);
    } else {
        //generalFeatures.put("feature_wasValueMentioned_false", 1.0);
    HashSet<String> valuesThatFollow = new HashSet<>();
    attrValuesThatFollow.stream().map((attrValue) -> {
        generalFeatures.put("feature_attrValuesThatFollow_" + attrValue.toLowerCase(), 1.0);
        return attrValue;
    }).forEachOrdered((attrValue) -> {
        if (attrValue.contains("=")) {
            String v = attrValue.substring(attrValue.indexOf('=') + 1);
            if (v.matches("[xX][0-9]+")) {
                String attr = attrValue.substring(0, attrValue.indexOf('='));
                valuesThatFollow.add(Action.TOKEN_X + attr + "_" + v.substring(1));
            } else {
                    "feature_attrsThatFollow_" + attrValue.substring(0, attrValue.indexOf('=')).toLowerCase(),
        } else {
            generalFeatures.put("feature_attrsThatFollow_" + attrValue.toLowerCase(), 1.0);
    if (valuesThatFollow.isEmpty()) {
        generalFeatures.put("feature_noAttrsFollow", 1.0);
    } else {
        generalFeatures.put("feature_noAttrsFollow", 0.0);
    HashSet<String> mentionedValues = new HashSet<>();
    attrValuesAlreadyMentioned.stream().map((attrValue) -> {
        generalFeatures.put("feature_attrValuesAlreadyMentioned_" + attrValue.toLowerCase(), 1.0);
        return attrValue;
    }).forEachOrdered((attrValue) -> {
        if (attrValue.contains("=")) {
                    + attrValue.substring(0, attrValue.indexOf('=')).toLowerCase(), 1.0);
            String v = attrValue.substring(attrValue.indexOf('=') + 1);
            if (v.matches("[xX][0-9]+")) {
                String attr = attrValue.substring(0, attrValue.indexOf('='));
                mentionedValues.add(Action.TOKEN_X + attr + "_" + v.substring(1));
            } else {
        } else {
            generalFeatures.put("feature_attrsAlreadyMentioned_" + attrValue.toLowerCase(), 1.0);

    /*System.out.println("currentAttrValue: " + currentAttrValue);
    System.out.println("5W: " + prev5gram);
    System.out.println("5AW: " + prevAttrWord5gram);
    System.out.println("5A: " + prevAttr5gram);
    System.out.println("VM: " + wasValueMentioned);
    System.out.println("A_TF: " + attrValuesThatFollow);
    if (currentValue.equals("no") || currentValue.equals("yes") || currentValue.equals("yes or no")
            || currentValue.equals("none") || currentValue.equals("empty") //|| currentValue.equals("dont_care")
    ) {
        generalFeatures.put("feature_emptyValue", 1.0);

    //Word specific features (and also global features)
    for (Action action : availableWordActions.get(currentAttr)) {
        //Is word same as previous word
        if (prevWord.equals(action.getWord())) {
            //valueSpecificFeatures.get(action.getAction()).put("feature_specific_sameAsPreviousWord", 1.0);
        } else {
            //valueSpecificFeatures.get(action.getAction()).put("feature_specific_notSameAsPreviousWord", 1.0);
        //Has word appeared in the same attrValue before
        generatedWords.forEach((previousAction) -> {
            if (previousAction.getWord().equals(action.getWord())
                    && previousAction.getAttribute().equals(currentAttrValue)) {
                //valueSpecificFeatures.get(action.getAction()).put("feature_specific_appearedInSameAttrValue", 1.0);
                        .put("global_feature_specific_appearedInSameAttrValue", 1.0);
            } else {
                //valueSpecificFeatures.get(action.getAction()).put("feature_specific_notAppearedInSameAttrValue", 1.0);
                //valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_notAppearedInSameAttrValue", 1.0);
        //Has word appeared before
        generatedWords.forEach((previousAction) -> {
            if (previousAction.getWord().equals(action.getWord())) {
                //valueSpecificFeatures.get(action.getAction()).put("feature_specific_appeared", 1.0);
                valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_appeared", 1.0);
            } else {
                //valueSpecificFeatures.get(action.getAction()).put("feature_specific_notAppeared", 1.0);
                //valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_notAppeared", 1.0);
        if (currentValue.equals("no") || currentValue.equals("yes") || currentValue.equals("yes or no")
                || currentValue.equals("none") || currentValue.equals("empty") //|| currentValue.equals("dont_care")
        ) {
            //valueSpecificFeatures.get(action.getAction()).put("feature_specific_emptyValue", 1.0);
            valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_emptyValue", 1.0);
        } else {
            //valueSpecificFeatures.get(action.getAction()).put("feature_specific_notEmptyValue", 1.0);
            //valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_notEmptyValue", 1.0);

        HashSet<String> keys = new HashSet<>(valueSpecificFeatures.get(action.getAction()).keySet());
        keys.forEach((feature1) -> {
                    .filter((feature2) -> (valueSpecificFeatures.get(action.getAction()).get(feature1) == 1.0
                            && valueSpecificFeatures.get(action.getAction()).get(feature2) == 1.0
                            && feature1.compareTo(feature2) < 0))
                    .forEachOrdered((feature2) -> {
                        valueSpecificFeatures.get(action.getAction()).put(feature1 + "&&" + feature2, 1.0);

        if (!action.getWord().startsWith(Action.TOKEN_X) && !currentValue.equals("no")
                && !currentValue.equals("yes") && !currentValue.equals("yes or no")
                && !currentValue.equals("none") && !currentValue.equals("empty") //&& !currentValue.equals("dont_care")
        ) {
            for (String value : getValueAlignments().keySet()) {
                for (ArrayList<String> alignedStr : getValueAlignments().get(value).keySet()) {
                    if (alignedStr.get(0).equals(action.getWord())) {
                        if (mentionedValues.contains(value)) {
                            //valueSpecificFeatures.get(action.getAction()).put("feature_specific_beginsValue_alreadyMentioned", 1.0);
                                    .put("global_feature_specific_beginsValue_alreadyMentioned", 1.0);

                        } else if (currentValue.equals(value)) {
                            //valueSpecificFeatures.get(action.getAction()).put("feature_specific_beginsValue_current", 1.0);
                                    .put("global_feature_specific_beginsValue_current", 1.0);

                        } else if (valuesThatFollow.contains(value)) {
                            //valueSpecificFeatures.get(action.getAction()).put("feature_specific_beginsValue_thatFollows", 1.0);
                                    .put("global_feature_specific_beginsValue_thatFollows", 1.0);

                        } else {
                            //valueSpecificFeatures.get(action.getAction()).put("feature_specific_beginsValue_notInMR", 1.0);
                                    .put("global_feature_specific_beginsValue_notInMR", 1.0);

                    } else {
                        for (int i = 1; i < alignedStr.size(); i++) {
                            if (alignedStr.get(i).equals(action.getWord())) {
                                if (endsWith(generatedPhrase,
                                        new ArrayList<String>(alignedStr.subList(0, i + 1)))) {
                                    if (mentionedValues.contains(value)) {
                                        //valueSpecificFeatures.get(action.getAction()).put("feature_specific_inValue_alreadyMentioned", 1.0);
                                                .put("global_feature_specific_inValue_alreadyMentioned", 1.0);

                                    } else if (currentValue.equals(value)) {
                                        //valueSpecificFeatures.get(action.getAction()).put("feature_specific_inValue_current", 1.0);
                                                .put("global_feature_specific_inValue_current", 1.0);

                                    } else if (valuesThatFollow.contains(value)) {
                                        //valueSpecificFeatures.get(action.getAction()).put("feature_specific_inValue_thatFollows", 1.0);
                                                .put("global_feature_specific_inValue_thatFollows", 1.0);

                                    } else {
                                        //valueSpecificFeatures.get(action.getAction()).put("feature_specific_inValue_notInMR", 1.0);
                                                .put("global_feature_specific_inValue_notInMR", 1.0);

                                } else {
                                    /*if (mentionedValues.contains(value)) {
                                    valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue_alreadyMentioned", 1.0);
                                    } else if (currentValue.equals(value)) {
                                    valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue_current", 1.0);
                                    } else if (valuesThatFollow.contains(value)) {
                                    valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue_thatFollows", 1.0);
                                    } else {
                                    valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue_notInMR", 1.0);
                                    //valueSpecificFeatures.get(action.getAction()).put("feature_specific_outOfValue", 1.0);
                                            .put("global_feature_specific_outOfValue", 1.0);
            if (action.getWord().equals(Action.TOKEN_END)) {
                if (generatedWordsInSameAttrValue.isEmpty()) {
                    //valueSpecificFeatures.get(action.getAction()).put("feature_specific_closingEmptyAttr", 1.0);
                            .put("global_feature_specific_closingEmptyAttr", 1.0);
                if (!wasValueMentioned) {
                    //valueSpecificFeatures.get(action.getAction()).put("feature_specific_closingAttrWithValueNotMentioned", 1.0);
                            .put("global_feature_specific_closingAttrWithValueNotMentioned", 1.0);

                // if (!prevCurrentAttrValueWord.equals("@@")) {
                if (!prevWord.equals("@@")) {
                    boolean alignmentIsOpen = false;
                    for (String value : getValueAlignments().keySet()) {
                        for (ArrayList<String> alignedStr : getValueAlignments().get(value).keySet()) {
                            for (int i = 0; i < alignedStr.size() - 1; i++) {
                                if (alignedStr.get(i).equals(prevWord) && endsWith(generatedPhrase,
                                        new ArrayList<>(alignedStr.subList(0, i + 1)))) {
                                    alignmentIsOpen = true;
                    if (alignmentIsOpen) {
                        // valueSpecificFeatures.get(action.getAction()).put("feature_specific_closingAttrWhileValueIsNotConcluded", 1.0);
                                .put("global_feature_specific_closingAttrWhileValueIsNotConcluded", 1.0);
        } else if (currentValue.equals("no") || currentValue.equals("yes") || currentValue.equals("yes or no")
                || currentValue.equals("none") || currentValue.equals("empty") //|| currentValue.equals("dont_care")
        ) {
            valueSpecificFeatures.get(action.getAction()).put("global_feature_specific_XValue_notInMR", 1.0);
        } else {
            String currentValueVariant = "";
            if (currentValue.matches("[xX][0-9]+")) {
                currentValueVariant = Action.TOKEN_X + currentAttr + "_" + currentValue.substring(1);

            if (mentionedValues.contains(action.getWord())) {
                //valueSpecificFeatures.get(action.getAction()).put("feature_specific_XValue_alreadyMentioned", 1.0);
                        .put("global_feature_specific_XValue_alreadyMentioned", 1.0);
            } else if (currentValueVariant.equals(action.getWord()) && !currentValueVariant.isEmpty()) {
                //valueSpecificFeatures.get(action.getAction()).put("feature_specific_XValue_current", 1.0);

            } else if (valuesThatFollow.contains(action.getWord())) {
                //valueSpecificFeatures.get(action.getAction()).put("feature_specific_XValue_thatFollows", 1.0);
            } else {
                //valueSpecificFeatures.get(action.getAction()).put("feature_specific_XValue_notInMR", 1.0);
        /*for (int i : nGrams.keySet()) {
        for (String nGram : nGrams.get(i)) {
        if (i == 2) {
        if (nGram.startsWith(prevWord + "|")
        && nGram.endsWith("|" + action.getAction())) {
        valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPreviousWord", 1.0);
        } else if (i == 3) {
        if (nGram.startsWith(prevBigram + "|")
        && nGram.endsWith("|" + action.getAction())) {
        valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPreviousBigram", 1.0);
        } else if (i == 4) {
        if (nGram.startsWith(prevTrigram + "|")
        && nGram.endsWith("|" + action.getAction())) {
        valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPreviousTrigram", 1.0);
        } else if (i == 5) {
        if (nGram.startsWith(prev4gram + "|")
        && nGram.endsWith("|" + action.getAction())) {
        valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPrevious4gram", 1.0);
        } else if (i == 6) {
        if (nGram.startsWith(prev5gram + "|")
        && nGram.endsWith("|" + action.getAction())) {
        valueSpecificFeatures.get(action.getAction()).put("feature_specific_valuesFollowsPrevious5gram", 1.0);

        //valueSpecificFeatures.get(action.getAction()).put("global_feature_abstractMR_" + mr.getAbstractMR(), 1.0);
                .put("global_feature_currentValue_" + currentValue.toLowerCase(), 1.0);

        ArrayList<String> fullGramLM = new ArrayList<>();
        for (int i = 0; i < generatedWords.size(); i++) {

        ArrayList<String> prev5wordGramLM = new ArrayList<>();
        int j = 0;
        for (int i = generatedWords.size() - 1; (i >= 0 && j < 5); i--) {
            prev5wordGramLM.add(0, generatedWords.get(i).getWord());
        while (prev5wordGramLM.size() < 4) {
            prev5wordGramLM.add(0, "@@");

        double afterLMScorePerPred5Gram = getWordLMsPerPredicate().get(predicate)

        double afterLMScorePerPred = getWordLMsPerPredicate().get(predicate).getProbability(fullGramLM);

    /*HashSet<String> keys = new HashSet<>(generalFeatures.keySet());
    for (String feature1 : keys) {
    if (generalFeatures.get(feature1) == 1.0) {
        generalFeatures.put("global_feature_attr_" + currentValue.toLowerCase() + "&&" + feature1, 1.0);
    //generalFeatures.put("feature_abstractMR_" + mr.getAbstractMR(), 1.0);

    /*HashSet<String> keys = new HashSet<>(generalFeatures.keySet());
    for (String feature1 : keys) {
    for (String feature2 : keys) {
    if (generalFeatures.get(feature1) == 1.0
    && generalFeatures.get(feature2) == 1.0
    && feature1.compareTo(feature2) < 0) {
    generalFeatures.put(feature1 + "&&" + feature2, 1.0);
    return new Instance(generalFeatures, valueSpecificFeatures, costs);

From source file:structuredPredictionNLG.SFX.java

 * Populates the predicate, attribute, attribute/value pair, and value alignment collections
 * @param dataFile The dataset file.
public void createLists(File dataFile) {
    try {
        // Initialize the collections
        setPredicates(new ArrayList<>());
        setAttributes(new HashMap<>());
        setAttributeValuePairs(new HashMap<>());
        setValueAlignments(new HashMap<>());

        // Obtain the dataset portion of the file
        String dataPart = new String();
        boolean begin = false;
        try (BufferedReader br = new BufferedReader(new FileReader(dataFile))) {
            String s;
            while ((s = br.readLine()) != null) {
                if (s.startsWith("[")) {
                    begin = true;
                if (begin) {
                    dataPart += s;
        } catch (FileNotFoundException ex) {
            Logger.getLogger(Bagel.class.getName()).log(Level.SEVERE, null, ex);
        } catch (IOException ex) {
            Logger.getLogger(Bagel.class.getName()).log(Level.SEVERE, null, ex);

        // Parse the dataset with JSON
        JSONArray overArray = new JSONArray(dataPart);
        for (int o = 0; o < overArray.length(); o++) {
            // "dial" notes each seperate dialog
            JSONArray arr = overArray.getJSONObject(o).getJSONArray("dial");
            for (int i = 0; i < arr.length(); i++) {
                String MRstr;
                String ref;
                // "dact" notes every meaning representation
                MRstr = arr.getJSONObject(i).getJSONObject("S").getString("dact");
                // "ref" notes every corresponding reference
                ref = arr.getJSONObject(i).getJSONObject("S").getString("ref").replaceAll("-s", "s");

                //We split some composite words (based on Wen et al's (2016) code)
                ref = (" " + ref + " ").replaceAll(" it's ", " it is ").replaceAll(" don't ", " do not ")
                        .replaceAll(" doesn't ", " does not ").replaceAll(" didn't ", " did not ")
                        .replaceAll(" you'd ", " you would ").replaceAll(" you're ", " you are ")
                        .replaceAll(" you'll ", " you will ").replaceAll(" i'm ", " i am ")
                        .replaceAll(" they're ", " they are ").replaceAll(" that's ", " that is ")
                        .replaceAll(" what's ", " what is ").replaceAll(" couldn't ", " could not ")
                        .replaceAll(" i've ", " i have ").replaceAll(" we've ", " we have ")
                        .replaceAll(" can't ", " cannot ").replaceAll(" i'd ", " i would ")
                        .replaceAll(" i'd ", " i would ").replaceAll(" aren't ", " are not ")
                        .replaceAll(" isn't ", " is not ").replaceAll(" wasn't ", " was not ")
                        .replaceAll(" weren't ", " were not ").replaceAll(" won't ", " will not ")
                        .replaceAll(" there's ", " there is ").replaceAll(" there're ", " there are ")
                        .replaceAll(" \\. \\. ", " \\. ").replaceAll(" restaurants ", " restaurant -s ")
                        .replaceAll(" hotels ", " hotel -s ").replaceAll(" laptops ", " laptop -s ")
                        .replaceAll(" cheaper ", " cheap -er ").replaceAll(" dinners ", " dinner -s ")
                        .replaceAll(" lunches ", " lunch -s ").replaceAll(" breakfasts ", " breakfast -s ")
                        .replaceAll(" expensively ", " expensive -ly ")
                        .replaceAll(" moderately ", " moderate -ly ").replaceAll(" cheaply ", " cheap -ly ")
                        .replaceAll(" prices ", " price -s ").replaceAll(" places ", " place -s ")
                        .replaceAll(" venues ", " venue -s ").replaceAll(" ranges ", " range -s ")
                        .replaceAll(" meals ", " meal -s ").replaceAll(" locations ", " location -s ")
                        .replaceAll(" areas ", " area -s ").replaceAll(" policies ", " policy -s ")
                        .replaceAll(" children ", " child -s ").replaceAll(" kids ", " kid -s ")
                        .replaceAll(" kidfriendly ", " kid friendly ").replaceAll(" cards ", " card -s ")
                        .replaceAll(" st ", " street ").replaceAll(" ave ", " avenue ")
                        .replaceAll(" upmarket ", " expensive ").replaceAll(" inpricey ", " cheap ")
                        .replaceAll(" inches ", " inch -s ").replaceAll(" uses ", " use -s ")
                        .replaceAll(" dimensions ", " dimension -s ")
                        .replaceAll(" driverange ", " drive range ").replaceAll(" includes ", " include -s ")
                        .replaceAll(" computers ", " computer -s ").replaceAll(" machines ", " machine -s ")
                        .replaceAll(" ecorating ", " eco rating ").replaceAll(" families ", " family -s ")
                        .replaceAll(" ratings ", " rating -s ").replaceAll(" constraints ", " constraint -s ")
                        .replaceAll(" pricerange ", " price range ")
                        .replaceAll(" batteryrating ", " battery rating ")
                        .replaceAll(" requirements ", " requirement -s ").replaceAll(" drives ", " drive -s ")
                        .replaceAll(" specifications ", " specification -s ")
                        .replaceAll(" weightrange ", " weight range ").replaceAll(" harddrive ", " hard drive ")
                        .replaceAll(" batterylife ", " battery life ")
                        .replaceAll(" businesses ", " business -s ").replaceAll(" hours ", " hour -s ")
                        .replaceAll(" accessories ", " accessory -s ").replaceAll(" ports ", " port -s ")
                        .replaceAll(" televisions ", " television -s ")
                        .replaceAll(" restrictions ", " restriction -s ")
                        .replaceAll(" extremely ", " extreme -ly ").replaceAll(" actually ", " actual -ly ")
                        .replaceAll(" typically ", " typical -ly ").replaceAll(" drivers ", " driver -s ")
                        .replaceAll(" teh ", " the ").replaceAll(" definitely ", " definite -ly ")
                        .replaceAll(" factors ", " factor -s ").replaceAll(" truly ", " true -ly ")
                        .replaceAll(" mostly ", " most -ly ").replaceAll(" nicely ", " nice -ly ")
                        .replaceAll(" surely ", " sure -ly ").replaceAll(" certainly ", " certain -ly ")
                        .replaceAll(" totally ", " total -ly ").replaceAll(" \\# ", " number ")
                        .replaceAll(" \\& ", " and ").replaceAll(" avenue ", " ave ").replaceAll(" -s ", " s ")

                // If the MR concerns one of the following predicates, and a ref is available
                if ((MRstr.startsWith("inform(") || MRstr.startsWith("inform_only")
                        || MRstr.startsWith("inform_no_match(") || MRstr.startsWith("?confirm(")
                        || MRstr.startsWith("?select(") || MRstr.startsWith("?request(")
                        || MRstr.startsWith("?reqmore(") || MRstr.startsWith("goodbye(")) && !ref.isEmpty()) {
                    // Obtain the predicate
                    String predicate = MRstr.substring(0, MRstr.indexOf('('));
                    if (!getPredicates().contains(predicate) && predicate != null) {

                        if (!getAttributes().containsKey(predicate)) {
                            getAttributes().put(predicate, new HashSet<String>());
                        if (!getDatasetInstances().containsKey(predicate)) {
                            getDatasetInstances().put(predicate, new ArrayList<DatasetInstance>());

                    // Obtain the attributes
                    String attributesStr = MRstr.substring(MRstr.indexOf('(') + 1, MRstr.length() - 1);
                    HashMap<String, HashSet<String>> attributeValues = new HashMap<>();
                    // Track the indexes used for variables identifiers (seperately for each attribute)
                    HashMap<String, Integer> attrXIndeces = new HashMap<>();
                    if (!attributesStr.isEmpty()) {
                        // Parse the attributes and their values
                        String[] args = attributesStr.split(";");
                        for (String arg : args) {
                            String attr;
                            String value = "";
                            // If the attribute has corresponding values
                            if (arg.contains("=")) {
                                String[] subAttr = arg.split("=");
                                value = subAttr[1].toLowerCase();
                                attr = subAttr[0].toLowerCase().replaceAll("_", "");

                                if (value.startsWith("\'")) {
                                    value = value.substring(1, value.length() - 1);
                                // Normalize some closed set values
                                if (value.equals("true")) {
                                    value = "yes";
                                if (value.equals("false")) {
                                    value = "no";
                                if (value.equals("dontcare")) {
                                    value = "dont_care";
                                if ((" " + value + " ").contains(" avenue ")) {
                                    value = (" " + value + " ").replace(" avenue ", " ave ").trim();
                                // Treat these values as seperate attributes since they are expressed quite differently
                                if (value.equals("no") || value.equals("yes") || value.equals("yes or no")
                                        || value.equals("none") || value.equals("empty")) {
                                    attr += "_" + value.replaceAll(" ", "_");
                                    value = attr;
                                // Treat "dont_care" instances, as if "dont_care" is the attribute, and the original attribute is the value
                                // We do this because the phrasing is very similar between different "dont_care" realizations
                                if (value.equals("dont_care")) {
                                    String v = value;
                                    value = attr;
                                    attr = v;
                            } else {
                                attr = arg.replaceAll("_", "");
                            if (!getAttributes().get(predicate).contains(attr)) {
                            if (!attributeValues.containsKey(attr)) {
                                attributeValues.put(attr, new HashSet<String>());
                            // If the attribute has no corresponding value, we encode it by using the attibute identifier as the value
                            if (value.isEmpty()) {
                                value = attr;

                            // If the value is a variable, we name it as {@X@ + attribute identifier + variable index (for this attribute)}
                            // This occurs when values are already set as variables in the MR, before any delexicalization happens
                            if (value.toLowerCase().startsWith("x")) {
                                int index = 0;
                                if (!attrXIndeces.containsKey(attr)) {
                                    attrXIndeces.put(attr, 1);
                                } else {
                                    index = attrXIndeces.get(attr);
                                    attrXIndeces.put(attr, index + 1);
                                value = "x" + index;

                    // Delexicalizing the attribute/value pairs
                    HashMap<String, HashSet<String>> delexicalizedAttributeValues = new HashMap<>();
                    HashMap<String, HashMap<String, Integer>> attrValuePriorities = new HashMap<>();
                    int maximumPriority = 0;
                    /* Delixalization of values needs to happen incrementally with priority given to the values of greater lenth, to avoid overlap of values in the reference
                     * e.g. for the MR: inform{name="inn on castro", near="castro"}, with the reference "inn on castro is a nice restaurant",
                     *      we need to first align and delexicalize the "inn on castro" value, before the "castro" value 
                     *      (in this case because "castro" doesn't appear in the reference, but even if it appeared later the priorities would help align it with the correct one)
                    // We begin by determining which values may require delexicalization, and which not
                    for (String attr : attributeValues.keySet()) {
                        if (!attr.isEmpty()) {
                            delexicalizedAttributeValues.put(attr, new HashSet<String>());
                            attrValuePriorities.put(attr, new HashMap<String, Integer>());
                            for (String value : attributeValues.get(attr)) {
                                if (!value.equals("none") && !value.equals("empty") && !value.equals("yes")
                                        && !value.equals("yes or no") && !value.equals("no")
                                        && !value.equals(attr)) {
                                    // Initially priorities are given according to value order
                                    attrValuePriorities.get(attr).put(value, maximumPriority);
                                } else {
                                    // No delexicalization is needed here
                    // We shift the priorities of different values, according to their perspective lengths (i.e. longer values have higher priority)
                    boolean change = true;
                    while (change) {
                        change = false;
                        for (String attr1 : attrValuePriorities.keySet()) {
                            for (String value1 : attrValuePriorities.get(attr1).keySet()) {
                                for (String attr2 : attrValuePriorities.keySet()) {
                                    for (String value2 : attrValuePriorities.get(attr2).keySet()) {
                                        if (!value1.equals(value2) && value1.contains(value2)
                                                && attrValuePriorities.get(attr1).get(
                                                        value1) > attrValuePriorities.get(attr2).get(value2)) {
                                            int prio1 = attrValuePriorities.get(attr1).get(value1);
                                            int prio2 = attrValuePriorities.get(attr2).get(value2);
                                            attrValuePriorities.get(attr1).put(value1, prio2);
                                            attrValuePriorities.get(attr2).put(value2, prio1);
                                            change = true;
                    // Map between variables and their lexicalized values, required for relexicalization during postprocessing after the sentence generation
                    HashMap<String, String> delexicalizationMap = new HashMap<>();
                    ref = " " + ref + " ";
                    // Delexicalization occurs, in order of priority
                    for (int priority = 0; priority < maximumPriority; priority++) {
                        for (String attr : attrValuePriorities.keySet()) {
                            if (!attrXIndeces.containsKey(attr)) {
                                attrXIndeces.put(attr, 0);
                            for (String value : attrValuePriorities.get(attr).keySet()) {
                                if (attrValuePriorities.get(attr).get(value) == priority) {
                                    // If the value doesn't appear verbatim in the reference, and the value is not composed of multiple subvalues (i.e. doesn't contain connectives)
                                    if (!ref.contains(" " + value + " ") && !value.contains(" and ")
                                            && !value.contains(" or ")) {
                                        if (value.equals("restaurant") && ref.contains(" place ")) {
                                            ref = ref.replace(" place ", " " + Action.TOKEN_X + attr + "_"
                                                    + attrXIndeces.get(attr) + " ");
                                            ref = ref.replaceAll("  ", " ");
                                                    .add(Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr));
                                                    Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr),
                                            attrXIndeces.put(attr, attrXIndeces.get(attr) + 1);
                                        } else {
                                        // If the value doesn't appear verbatim in the reference, but the value is composed of multiple sub-values
                                    } else if (!ref.contains(" " + value + " ")
                                            && (value.contains(" and ") || value.contains(" or "))) {
                                        // We first check if the value appears verbatim when we switch "and" with "or" and vice versa
                                        // We do this due to some inconsistencies in the dataset on how conjuctions are treated
                                        String tempValue = value;
                                        if (value.contains(" and ")) {
                                            tempValue = value.replace(" and ", " or ");
                                        } else if (value.contains(" or ")) {
                                            tempValue = value.replace(" or ", " and ");

                                        if (ref.contains(" " + tempValue + " ")) {
                                            ref = ref.replace(" " + tempValue + " ", " " + Action.TOKEN_X + attr
                                                    + "_" + attrXIndeces.get(attr) + " ");
                                            ref = ref.replaceAll("  ", " ");
                                                    .add(Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr));
                                                    Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr),
                                            attrXIndeces.put(attr, attrXIndeces.get(attr) + 1);
                                        } else {
                                            // We split the conjunction into the seperate values; so far the code supports only 2 sub-values
                                            String[] values = new String[2];
                                            if (value.contains(" and ")) {
                                                values = value.split(" and ");
                                            } else if (value.contains(" or ")) {
                                                values = value.split(" or ");
                                            // And check if the conjunction appears verbatim when we switch the position of the sub-values
                                            String newValue1 = values[1] + " and " + values[0];
                                            String newValue2 = values[1] + " or " + values[0];
                                            if (ref.contains(" " + newValue1 + " ")) {
                                                ref = ref.replace(" " + newValue1 + " ", " " + Action.TOKEN_X
                                                        + attr + "_" + attrXIndeces.get(attr) + " ");
                                                ref = ref.replaceAll("  ", " ");
                                                        Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr));
                                                        Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr),
                                                attrXIndeces.put(attr, attrXIndeces.get(attr) + 1);
                                            } else if (ref.contains(" " + newValue2 + " ")) {
                                                ref = ref.replace(" " + newValue2 + " ", " " + Action.TOKEN_X
                                                        + attr + "_" + attrXIndeces.get(attr) + " ");
                                                ref = ref.replaceAll("  ", " ");
                                                        Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr));
                                                        Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr),
                                                attrXIndeces.put(attr, attrXIndeces.get(attr) + 1);
                                        // If the value appears verbatim in the reference, delexicalize it
                                    } else {
                                        ref = ref.replace(" " + value + " ", " " + Action.TOKEN_X + attr + "_"
                                                + attrXIndeces.get(attr) + " ");
                                        ref = ref.replaceAll("  ", " ");
                                                .add(Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr));
                                                Action.TOKEN_X + attr + "_" + attrXIndeces.get(attr), value);
                                        attrXIndeces.put(attr, attrXIndeces.get(attr) + 1);
                    ref = ref.trim();

                    // We construct the MeaningRepresentation
                    MeaningRepresentation MR = new MeaningRepresentation(predicate,
                            delexicalizedAttributeValues, MRstr, delexicalizationMap);

                    // Sequences of attribute/values pairs and words in the order we observe this in the reference
                    ArrayList<String> observedAttrValueSequence = new ArrayList<>();
                    ArrayList<String> observedWordSequence = new ArrayList<>();

                    // The observed word sequence does not include punctuation
                    String[] words = ref.replaceAll("([,.?!;:'])", " $1").split(" ");

                    // We construct the observed word sequence (and fix some orthographical errors along the way)
                    for (int w = 0; w < words.length; w++) {
                        if (!words[w].trim().isEmpty()) {
                            if (!words[w].trim().isEmpty()
                                    && (observedWordSequence.isEmpty() || !words[w].trim().equals(
                                            observedWordSequence.get(observedWordSequence.size() - 1)))) {
                                if (words[w].trim().equals("s") && (observedWordSequence
                                        .get(observedWordSequence.size() - 1).equals("child"))) {
                                    observedWordSequence.set(observedWordSequence.size() - 1, "children");
                                } else if (words[w].trim().equals("addres")
                                        || words[w].trim().equals("adress")) {
                                } else if (words[w].trim().equals("mathch")) {
                                } else if (words[w].trim().equals("prefered")) {
                                } else if (words[w].trim().equals("relevent")) {
                                } else if (words[w].trim().equals("alloed")) {
                                } else if (words[w].trim().equals("avalible")
                                        || words[w].trim().equals("avalable")) {
                                } else if (words[w].trim().equals("tha") || words[w].trim().equals("te")) {
                                } else if (words[w].trim().equals("internect")) {
                                } else if (words[w].trim().equals("wether")) {
                                } else if (words[w].trim().equals("aplogize")) {
                                } else if (words[w].trim().equals("accomodations")) {
                                } else if (words[w].trim().equals("whould")) {
                                } else if (words[w].trim().equals("aceepted")) {
                                } else if (words[w].trim().equals("postode")) {
                                } else if (words[w].trim().equals("ive")) {
                                } else if (words[w].trim().equals("waht")) {
                                } else if (words[w].trim().equals("neighborhood")) {
                                } else if (words[w].trim().equals("prefernce")) {
                                } else if (words[w].trim().equals("dont")) {
                                } else if (words[w].trim().equals("isnt")) {
                                } else if (words[w].trim().equals("intenet")
                                        || words[w].trim().equals("internetn")) {
                                } else if (words[w].trim().equals("cannote")) {
                                } else if (words[w].trim().equals("notels")) {
                                } else if (words[w].trim().equals("phne")) {
                                } else if (words[w].trim().equals("taht")) {
                                } else if (words[w].trim().equals("postdocde")) {
                                } else if (words[w].trim().equals("accpects")) {
                                } else if (words[w].trim().equals("doesn") || words[w].trim().equals("doesnt")
                                        || words[w].trim().equals("doesn")) {
                                } else if (words[w].trim().equals("restaurnats")) {
                                } else if (words[w].trim().equals("ther") || words[w].trim().equals("thers")) {

                                    // The dataset treats the suffixes "s" and "-ly" as separate words
                                    // We combine these suffixes with their preceding words but keep a cache with these changes to revert them before evaluation (we have to do this so that the token-based evaluation metrics are calculated in a consistent manner with Wen et al.'s)
                                } else if (words[w].trim().equals("s")) {
                                    if (observedWordSequence.isEmpty()) {
                                    } else if (observedWordSequence.get(observedWordSequence.size() - 1)
                                            .startsWith(Action.TOKEN_X)) {
                                    } else {
                                                observedWordSequence.get(observedWordSequence.size() - 1) + "s",
                                                observedWordSequence.get(observedWordSequence.size() - 1)
                                                        + " s");
                                        observedWordSequence.set(observedWordSequence.size() - 1,
                                                observedWordSequence.get(observedWordSequence.size() - 1)
                                                        + "s");
                                } else if (words[w].trim().equals("-ly")) {
                                    if (observedWordSequence.isEmpty()) {
                                    } else if (observedWordSequence.get(observedWordSequence.size() - 1)
                                            .startsWith(Action.TOKEN_X)) {
                                    } else {
                                                observedWordSequence.get(observedWordSequence.size() - 1)
                                                        + "ly",
                                                observedWordSequence.get(observedWordSequence.size() - 1)
                                                        + " -ly");
                                        observedWordSequence.set(observedWordSequence.size() - 1,
                                                observedWordSequence.get(observedWordSequence.size() - 1)
                                                        + "ly");
                                } else {

                    //Probably deprecated, need to do some more tests
                    MR.getAttributeValues().keySet().forEach((attr) -> {
                                .filter((value) -> (attr.equals("name") && value.equals("none")))
                                .forEachOrdered((value) -> {
                                            attr.toLowerCase() + "=" + value.toLowerCase());

                    // We store the maximum observed word sequence length, to use as a limit during generation
                    if (observedWordSequence.size() > getMaxWordSequenceLength()) {

                    // We initialize the alignments between words and attribute/value pairs
                    ArrayList<String> wordToAttrValueAlignment = new ArrayList<>();
                    // And populate them with "unaligned" tokens (i.e. "[]") and punctuation alignments; we do the latter so we know to filter out punctuation when estimating the alignments in later stages
                    observedWordSequence.forEach((word) -> {
                        if (word.trim().matches("[,.?!;:']")) {
                        } else {
                    // And using both word sequence and initial alignments, we construct a draft sequence of word actions corresponding to the reference
                    ArrayList<Action> directReferenceSequence = new ArrayList<>();
                    for (int r = 0; r < observedWordSequence.size(); r++) {
                                .add(new Action(observedWordSequence.get(r), wordToAttrValueAlignment.get(r)));
                    // Finally, we construct the DatasetInstance
                    DatasetInstance DI = new DatasetInstance(MR, directReferenceSequence,
                            postProcessRef(MR, directReferenceSequence));
                    // We add the evaluation references of all previously constructed DatasetInstances (that are identical to this one) as available evaluation references 
                            .filter((existingDI) -> (existingDI.getMeaningRepresentation().getAbstractMR()
                            .map((existingDI) -> {
                                return existingDI;
                            }).forEachOrdered((existingDI) -> {
                                // We add the direct reference of this DatasetInstance as an available evaluation reference to all previously constructed DatasetInstance that are identical to this one

                    // Calculate the possible alignments between (non-delexicalized) attribute values and reference subphrases
                    // We do this by comparing the values with n-gram subphrases of the reference, using character-level Levenshtein distance
                    // These are used during the estimation of naive alignments, but also for tracking which values have possibly been expressed during generation
                    HashMap<String, HashMap<String, Double>> observedValueAlignments = new HashMap<>();
                    MR.getAttributeValues().keySet().forEach((attr) -> {
                                .filter((value) -> (!value.equals("name=none")
                                        && !value.startsWith(Action.TOKEN_X)
                                        && !(value.matches("\"[xX][0-9]+\"") || value.matches("[xX][0-9]+"))))
                                .forEachOrdered((value) -> {
                                    String valueToCompare = value;
                                    if (value.equals("no") || value.equals("yes") || value.equals("yes or no")
                                            || value.equals("none") || value.equals("empty")) {
                                        // If the value is boolean or non-existent, we also compare using the attribute name
                                        valueToCompare = attr;
                                        observedValueAlignments.put(valueToCompare + ":" + value,
                                                new HashMap<String, Double>());
                                    } else {
                                                new HashMap<String, Double>());
                                    // For all n-grams in the reference
                                    for (int n = 1; n < observedWordSequence.size(); n++) {
                                        // Calculate the similarity between them and valueToCompare
                                        for (int r = 0; r <= observedWordSequence.size() - n; r++) {
                                            boolean compareAgainstNGram = true;
                                            for (int j = 0; j < n; j++) {
                                                if (observedWordSequence.get(r + j).startsWith(Action.TOKEN_X)
                                                        || wordToAttrValueAlignment.get(r + j)
                                                        || StringNLPUtilities
                                                                .isArticle(observedWordSequence.get(r + j))
                                                        || observedWordSequence.get(r + j)
                                                        || observedWordSequence.get(r + j)
                                                                .equalsIgnoreCase("or")) {
                                                    // We ignore n-grams that contain variables, punctuation, articles, or conjunctions
                                                    // In other words, we do not allow values to align with such n-grams
                                                    compareAgainstNGram = false;
                                            if (compareAgainstNGram) {
                                                String align = "";
                                                String compare = "";
                                                String backwardCompare = "";
                                                for (int j = 0; j < n; j++) {
                                                    // The coordinates of the alignment
                                                    align += (r + j) + " ";
                                                    compare += observedWordSequence.get(r + j);
                                                    backwardCompare = observedWordSequence.get(r + j)
                                                            + backwardCompare;
                                                align = align.trim();

                                                // Calculate the character-level distance between the value and the nGram (in its original and reversed order)
                                                Double distance = Levenshtein.getSimilarity(
                                                        valueToCompare.toLowerCase(), compare.toLowerCase(),
                                                Double backwardDistance = Levenshtein.getSimilarity(
                                                        backwardCompare.toLowerCase(), true);

                                                // We keep the best distance score; note that the Levenshtein distance is normalized so that greater is better 
                                                if (backwardDistance > distance) {
                                                    distance = backwardDistance;
                                                // We ignore all nGrams that are less similar than a threshold
                                                if (distance > 0.3) {
                                                    if (value.equals("no") || value.equals("yes")
                                                            || value.equals("yes or no") || value.equals("none")
                                                            || value.equals("empty")) {
                                                                .get(valueToCompare + ":" + value)
                                                                .put(align, distance);
                                                    } else {

                    // We filter out any values that haven't been aligned
                    HashSet<String> toRemove = new HashSet<>();
                    for (String value : observedValueAlignments.keySet()) {
                        if (observedValueAlignments.get(value).isEmpty()) {
                    for (String value : toRemove) {

                    // We keep the best aligned nGrams; since we do not want the aligned nGrams to be overlapping, we remove any overlapping alignments after we pick each one
                    while (!observedValueAlignments.keySet().isEmpty()) {
                        // Find the best aligned nGram
                        Double max = Double.NEGATIVE_INFINITY;
                        String[] bestAlignment = new String[2];
                        for (String value : observedValueAlignments.keySet()) {
                            for (String alignment : observedValueAlignments.get(value).keySet()) {
                                if (observedValueAlignments.get(value).get(alignment) > max) {
                                    max = observedValueAlignments.get(value).get(alignment);
                                    bestAlignment[0] = value;
                                    bestAlignment[1] = alignment;

                        // Find the subphrase that corresponds to the best aligned nGram, according to the coordinates
                        ArrayList<String> alignedStr = new ArrayList<>();
                        String[] coords = bestAlignment[1].split(" ");
                        if (coords.length == 1) {
                        } else {
                            for (int a = Integer.parseInt(coords[0].trim()); a <= Integer
                                    .parseInt(coords[coords.length - 1].trim()); a++) {

                        // Store the best aligned nGram
                        if (!getValueAlignments().containsKey(bestAlignment[0])) {
                                    new HashMap<ArrayList<String>, Double>());
                        getValueAlignments().get(bestAlignment[0]).put(alignedStr, max);

                        // And remove it from the observed ones for this instance
                        // And also remove any other aligned nGrams that are overlapping with the best aligned nGram
                        observedValueAlignments.keySet().forEach((value) -> {
                            HashSet<String> alignmentsToBeRemoved = new HashSet<>();
                            observedValueAlignments.get(value).keySet().forEach((alignment) -> {
                                String[] othCoords = alignment.split(" ");
                                if (Integer.parseInt(coords[0].trim()) <= Integer.parseInt(othCoords[0].trim())
                                        && (Integer.parseInt(coords[coords.length - 1].trim()) >= Integer
                                        || (Integer.parseInt(othCoords[0].trim()) <= Integer
                                                && Integer.parseInt(
                                                        othCoords[othCoords.length - 1].trim()) >= Integer
                                                                .parseInt(coords[0].trim()))) {
                            alignmentsToBeRemoved.forEach((alignment) -> {
                        // We filter out any values that are no longer aligned (due to overlapping conflicts)
                        toRemove = new HashSet<>();
                        for (String value : observedValueAlignments.keySet()) {
                            if (observedValueAlignments.get(value).isEmpty()) {
                        for (String value : toRemove) {
    } catch (JSONException ex) {

From source file:structuredPredictionNLG.SFX.java

 * @param trainingData
// Walks the training instances, rebuilds each reference realization with attribute labels,
// and counts, per predicate, which punctuation action occurs between which surrounding actions.
// The most frequent action of each pattern is finally stored via getPunctuationPatterns().
//
// trainingData: the DatasetInstances whose direct reference sequences are processed.
//
// NOTE(review): this excerpt appears to have lost lines (unbalanced braces, statements without
// their beginning). The comments below describe only the statements that are visible here;
// anything about the missing parts is marked as an assumption.
public void createRandomAlignments(ArrayList<DatasetInstance> trainingData) {
    // predicate -> (list of surrounding actions -> (action seen in that context -> count))
    HashMap<String, HashMap<ArrayList<Action>, HashMap<Action, Integer>>> punctPatterns = new HashMap<>();
    getPredicates().forEach((predicate) -> {
        punctPatterns.put(predicate, new HashMap<ArrayList<Action>, HashMap<Action, Integer>>());
    // Realization (including punctuation actions) recorded for each instance
    HashMap<DatasetInstance, ArrayList<Action>> punctRealizations = new HashMap<DatasetInstance, ArrayList<Action>>();

    // Maps an input realization to the END-token-terminated realization computed for it
    HashMap<ArrayList<Action>, ArrayList<Action>> calculatedRealizationsCache = new HashMap<>();
    trainingData.stream().map((di) -> {
        HashSet<ArrayList<Action>> initRealizations = new HashSet<>();
        // NOTE(review): the body of this check is not visible; presumably it adds the direct
        // reference sequence to initRealizations when it has not been cached yet - confirm
        if (!calculatedRealizationsCache.containsKey(di.getDirectReferenceSequence())) {
        initRealizations.stream().map((realization) -> {
            // Copy the attribute -> values map of the meaning representation into fresh sets
            HashMap<String, HashSet<String>> values = new HashMap<>();
            di.getMeaningRepresentation().getAttributeValues().keySet().forEach((attr) -> {
                values.put(attr, new HashSet<>(di.getMeaningRepresentation().getAttributeValues().get(attr)));
            // Copy the realization: punctuation actions keep their attribute, all others get an empty one
            ArrayList<Action> randomRealization = new ArrayList<Action>();
            realization.forEach((a) -> {
                if (a.getAttribute().equals(Action.TOKEN_PUNCT)) {
                    randomRealization.add(new Action(a.getWord(), a.getAttribute()));
                } else {
                    randomRealization.add(new Action(a.getWord(), ""));
            HashSet<String> unalignedAttrs = new HashSet<>();
            if (values.keySet().isEmpty()) {
                // No attribute values: visit every action whose attribute is empty or "[]"
                // NOTE(review): what happens for these actions is not visible in this excerpt
                for (int i = 0; i < randomRealization.size(); i++) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!getAttributes().get(di.getMeaningRepresentation().getPredicate())
                                .contains("empty")) {
            } else {
                values.keySet().forEach((attr) -> {
                    values.get(attr).forEach((value) -> {
                        // Non-empty values that are not variable placeholders (x<digits>, optionally quoted, or TOKEN_X-prefixed)
                        if ((!(value.matches("\"[xX][0-9]+\"") || value.matches("[xX][0-9]+")
                                || value.startsWith(Action.TOKEN_X))) && !value.isEmpty()) {
                            String valueToCheck = value;
                            // For these boolean-like/absent values the lookup key becomes "attr:value"
                            if (valueToCheck.equals("no") || valueToCheck.equals("yes")
                                    || valueToCheck.equals("yes or no") || valueToCheck.equals("none")
                            //|| valueToCheck.equals("dont_care")
                                    || valueToCheck.equals("empty")) {
                                valueToCheck = attr + ":" + value;
                                unalignedAttrs.add(attr + "=" + value);
                            if (valueToCheck.equals(attr)) {
                                unalignedAttrs.add(attr + "=" + value);
                            if (!valueToCheck.equals("empty:empty")
                                    && getValueAlignments().containsKey(valueToCheck)) {
                                unalignedAttrs.add(attr + "=" + valueToCheck);
                        } else {
                            unalignedAttrs.add(attr + "=" + value);
                unalignedAttrs.forEach((attrValue) -> {
                    // Draw random positions until one is found that is not a punctuation action
                    // NOTE(review): the statement that uses the chosen index is not visible; presumably
                    // it assigns attrValue to the action at that position - confirm
                    int index = getRandomGen().nextInt(randomRealization.size());
                    boolean change = false;
                    while (!change) {
                        if (!randomRealization.get(index).getAttribute().equals(Action.TOKEN_PUNCT)) {
                            change = true;
                        } else {
                            index = getRandomGen().nextInt(randomRealization.size());
                // Pass 1 (left to right): track the last non-punctuation attribute seen; it is reset at punctuation
                // NOTE(review): the action taken for empty/"[]" attributes when previousAttr is set is not visible
                String previousAttr = "";
                for (int i = 0; i < randomRealization.size(); i++) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!previousAttr.isEmpty()) {
                    } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                        previousAttr = randomRealization.get(i).getAttribute();
                    } else {
                        previousAttr = "";
                //System.out.println("1: " + randomRealization);
                // Pass 2 (right to left): same tracking, again reset at punctuation
                previousAttr = "";
                for (int i = randomRealization.size() - 1; i >= 0; i--) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!previousAttr.isEmpty()) {
                    } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                        previousAttr = randomRealization.get(i).getAttribute();
                    } else {
                        previousAttr = "";
                //System.out.println("2: " + randomRealization);
                // Pass 3 (left to right): no visible reset of previousAttr at punctuation this time
                previousAttr = "";
                for (int i = 0; i < randomRealization.size(); i++) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!previousAttr.isEmpty()) {
                    } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                        previousAttr = randomRealization.get(i).getAttribute();
                //System.out.println("3: " + randomRealization);
                // Pass 4 (right to left): likewise without a visible reset at punctuation
                previousAttr = "";
                for (int i = randomRealization.size() - 1; i >= 0; i--) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!previousAttr.isEmpty()) {
                    } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                        previousAttr = randomRealization.get(i).getAttribute();
                //System.out.println("4: " + randomRealization);
            //FIX WRONG @PUNCT@
            // Right-to-left scan for actions labelled as punctuation whose word is not one of , . ? ! ; : '
            // NOTE(review): the correction applied to them is not visible in this excerpt
            String previousAttr = "";
            for (int i = randomRealization.size() - 1; i >= 0; i--) {
                if (randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)
                        && !randomRealization.get(i).getWord().matches("[,.?!;:']")) {
                    if (!previousAttr.isEmpty()) {
                } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                    previousAttr = randomRealization.get(i).getAttribute();
            // Select the non-punctuation actions (the body of the forEachOrdered is not visible)
            ArrayList<Action> cleanRandomRealization = new ArrayList<>();
            randomRealization.stream().filter((a) -> (!a.getAttribute().equals(Action.TOKEN_PUNCT)))
                    .forEachOrdered((a) -> {
            //ADD END TOKENS
            // Whenever the attribute changes, an END action carrying the previous attribute is added
            ArrayList<Action> endRandomRealization = new ArrayList<>();
            previousAttr = "";
            for (int i = 0; i < cleanRandomRealization.size(); i++) {
                Action a = cleanRandomRealization.get(i);
                if (!previousAttr.isEmpty() && !a.getAttribute().equals(previousAttr)) {
                    endRandomRealization.add(new Action(Action.TOKEN_END, previousAttr));
                previousAttr = a.getAttribute();
            // Close the last attribute, then append a final END/END action, and cache the result
            endRandomRealization.add(new Action(Action.TOKEN_END, previousAttr));
            endRandomRealization.add(new Action(Action.TOKEN_END, Action.TOKEN_END));
            calculatedRealizationsCache.put(realization, endRandomRealization);
            //System.out.println(di.getMeaningRepresentation().getPredicate() + ": " + endRandomRealization);
            // NOTE(review): attrValues looks like the sequence of distinct consecutive attributes, compared
            // against getMaxContentSequenceLength(); the statements that fill and use it are not visible
            ArrayList<String> attrValues = new ArrayList<String>();
            endRandomRealization.forEach((a) -> {
                if (attrValues.isEmpty()) {
                } else if (!attrValues.get(attrValues.size() - 1).equals(a.getAttribute())) {
            if (attrValues.size() > getMaxContentSequenceLength()) {
            // NOTE(review): nothing visible populates punctRealization before the loop below - confirm against the full file
            ArrayList<Action> punctRealization = new ArrayList<>();
            previousAttr = "";
            // Insert an END action in front of each non-punctuation action whose attribute differs from the previous one
            for (int i = 0; i < punctRealization.size(); i++) {
                if (!punctRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                    if (!punctRealization.get(i).getAttribute().equals(previousAttr)
                            && !previousAttr.isEmpty()) {
                        punctRealization.add(i, new Action(Action.TOKEN_END, previousAttr));
                    previousAttr = punctRealization.get(i).getAttribute();
            // Make sure the sequence ends with an END action
            if (!punctRealization.get(punctRealization.size() - 1).getWord().equals(Action.TOKEN_END)) {
                punctRealization.add(new Action(Action.TOKEN_END, previousAttr));
            return punctRealization;
        }).map((punctRealization) -> {
            punctRealizations.put(di, punctRealization);
            return punctRealization;
        }).forEachOrdered((punctRealization) -> {
            // For every punctuation action, gather its neighbouring actions and count the occurrence
            for (int i = 0; i < punctRealization.size(); i++) {
                Action a = punctRealization.get(i);
                if (a.getAttribute().equals(Action.TOKEN_PUNCT)) {
                    boolean legal = true;
                    ArrayList<Action> surroundingActions = new ArrayList<>();
                    // NOTE(review): the block comment opened on the next line has no visible terminator
                    // nearby in this excerpt; check the full file for where it ends
                    /*if (i - 3 >= 0) {
                    surroundingActions.add(punctRealization.get(i - 3));
                    } else {
                    if (i - 2 >= 0) {
                        surroundingActions.add(punctRealization.get(i - 2));
                    } else {
                    if (i - 1 >= 0) {
                        surroundingActions.add(punctRealization.get(i - 1));
                    } else {
                        legal = false;
                    boolean oneMore = false;
                    if (i + 1 < punctRealization.size()) {
                        surroundingActions.add(punctRealization.get(i + 1));
                        if (!punctRealization.get(i + 1).getAttribute().equals(Action.TOKEN_END)) {
                            oneMore = true;
                    } else {
                        legal = false;
                    if (oneMore && i + 2 < punctRealization.size()) {
                        surroundingActions.add(punctRealization.get(i + 2));
                    } else {
                    if (legal) {
                        if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate())
                                .containsKey(surroundingActions)) {
                                    .put(surroundingActions, new HashMap<Action, Integer>());
                        if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate())
                                .get(surroundingActions).containsKey(a)) {
                                    .get(surroundingActions).put(a, 1);
                        } else {
                                    .put(a, punctPatterns.get(di.getMeaningRepresentation().getPredicate())
                                            .get(surroundingActions).get(a) + 1);
        return di;
    }).map((di) -> {
        return di;
    punctRealizations.keySet().forEach((di) -> {
        ArrayList<Action> punctRealization = punctRealizations.get(di);
        punctPatterns.get(di.getMeaningRepresentation().getPredicate()).keySet().forEach((surrounds) -> {
            int beforeNulls = 0;
            if (surrounds.get(0) == null) {
            if (surrounds.get(1) == null) {
            for (int i = 0 - beforeNulls; i < punctRealization.size(); i++) {
                boolean matches = true;
                int m = 0;
                for (int s = 0; s < surrounds.size(); s++) {
                    if (surrounds.get(s) != null) {
                        if (i + s < punctRealization.size()) {
                            if (!punctRealization.get(i + s).getWord().equals(surrounds.get(s)
                                    .getWord()) /*|| !cleanActionList.get(i).getAttribute().equals(surrounds.get(s).getAttribute())*/) {
                                matches = false;
                                // Setting s to the pattern size ends the inner loop early
                                s = surrounds.size();
                            } else {
                        } else {
                            matches = false;
                            s = surrounds.size();
                    // A null slot in the first two positions fails the match when i + s is a non-negative index
                    } else if (s < 2 && i + s >= 0) {
                        matches = false;
                        s = surrounds.size();
                    // A null slot in a later position fails the match when i + s is still inside the sequence
                    } else if (s >= 2 && i + s < punctRealization.size()) {
                        matches = false;
                        s = surrounds.size();
                // NOTE(review): m is never visibly incremented in this excerpt; presumably it counts matched slots
                if (matches && m > 0) {
                    // An action with empty word and attribute is the key looked up for this pattern
                    Action a = new Action("", "");
                    if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds)
                            .containsKey(a)) {
                    } else {
                                        .get(a) + 1);
    // For every predicate and pattern, keep the action with the highest count
    punctPatterns.keySet().forEach((predicate) -> {
        punctPatterns.get(predicate).keySet().forEach((punct) -> {
            Action bestAction = null;
            int bestCount = 0;
            for (Action a : punctPatterns.get(predicate).get(punct).keySet()) {
                if (punctPatterns.get(predicate).get(punct).get(a) > bestCount) {
                    bestAction = a;
                    bestCount = punctPatterns.get(predicate).get(punct).get(a);
                // On a tie, the candidate replaces the current best only if the current best has an empty word
                } else if (punctPatterns.get(predicate).get(punct).get(a) == bestCount
                        && bestAction.getWord().isEmpty()) {
                    bestAction = a;
            if (!getPunctuationPatterns().containsKey(predicate)) {
                getPunctuationPatterns().put(predicate, new HashMap<ArrayList<Action>, Action>());
            // Only a best action with a non-empty word is stored as the pattern's punctuation
            // NOTE(review): bestAction stays null if a pattern has no counted actions - confirm that cannot happen
            if (!bestAction.getWord().isEmpty()) {
                getPunctuationPatterns().get(predicate).put(punct, bestAction);

From source file:structuredPredictionNLG.SFX.java

 * @param predicate
 * @param costs
 * @param previousGeneratedAttrs
 * @param attrValuesAlreadyMentioned
 * @param attrValuesToBeMentioned
 * @param availableAttributeActions
 * @param MR
 * @return
// Builds the feature maps for one content-selection decision and wraps them, together with the
// supplied costs, in a new Instance (see the return statement at the end).
// Two maps are filled: generalFeatures (shared across actions) and valueSpecificFeatures
// (one feature map per available attribute action of the predicate).
// NOTE(review): this listing appears to have lost lines (closing braces, the heads of some stream
// pipelines and some statement bodies); the comments below describe only what is visible.
public Instance createContentInstanceWithCosts(String predicate, TObjectDoubleHashMap<String> costs,
        ArrayList<String> previousGeneratedAttrs, HashSet<String> attrValuesAlreadyMentioned,
        HashSet<String> attrValuesToBeMentioned, HashMap<String, HashSet<String>> availableAttributeActions,
        MeaningRepresentation MR) {
    // Features shared by every candidate action.
    TObjectDoubleHashMap<String> generalFeatures = new TObjectDoubleHashMap<>();
    // Features specific to each candidate action, keyed by the action string.
    HashMap<String, TObjectDoubleHashMap<String>> valueSpecificFeatures = new HashMap<>();
    if (availableAttributeActions.containsKey(predicate)) {
        // Start every available action of this predicate with an empty feature map.
        availableAttributeActions.get(predicate).forEach((action) -> {
            valueSpecificFeatures.put(action, new TObjectDoubleHashMap<String>());

    ArrayList<String> mentionedAttrValues = new ArrayList<>();
    // The filter below skips the START and END tokens.
    // NOTE(review): the stream source and the forEachOrdered body are not visible here;
    // presumably previousGeneratedAttrs is streamed and the survivors are added to
    // mentionedAttrValues - confirm against the full file.
            (attrValue) -> (!attrValue.equals(Action.TOKEN_START) && !attrValue.equals(Action.TOKEN_END)))
            .forEachOrdered((attrValue) -> {

    // Feature for the j-th most recently mentioned attr/value; the loop bounds make this run for j == 1 only.
    for (int j = 1; j <= 1; j++) {
        // "@@" is the placeholder used when there is no such previous attr/value.
        String previousAttrValue = "@@";
        if (mentionedAttrValues.size() - j >= 0) {
            previousAttrValue = mentionedAttrValues.get(mentionedAttrValues.size() - j).trim();
        generalFeatures.put("feature_attrValue_" + j + "_" + previousAttrValue, 1.0);
    //Word N-Grams
    // The last five mentioned attr/values, most recent first; each defaults to "@@" when absent.
    String prevAttrValue = "@@";
    if (mentionedAttrValues.size() - 1 >= 0) {
        prevAttrValue = mentionedAttrValues.get(mentionedAttrValues.size() - 1).trim();
    String prev2AttrValue = "@@";
    if (mentionedAttrValues.size() - 2 >= 0) {
        prev2AttrValue = mentionedAttrValues.get(mentionedAttrValues.size() - 2).trim();
    String prev3AttrValue = "@@";
    if (mentionedAttrValues.size() - 3 >= 0) {
        prev3AttrValue = mentionedAttrValues.get(mentionedAttrValues.size() - 3).trim();
    String prev4AttrValue = "@@";
    if (mentionedAttrValues.size() - 4 >= 0) {
        prev4AttrValue = mentionedAttrValues.get(mentionedAttrValues.size() - 4).trim();
    String prev5AttrValue = "@@";
    if (mentionedAttrValues.size() - 5 >= 0) {
        prev5AttrValue = mentionedAttrValues.get(mentionedAttrValues.size() - 5).trim();

    // N-gram keys are written oldest-to-newest, joined with "|".
    String prevBigramAttrValue = prev2AttrValue + "|" + prevAttrValue;
    String prevTrigramAttrValue = prev3AttrValue + "|" + prev2AttrValue + "|" + prevAttrValue;
    String prev4gramAttrValue = prev4AttrValue + "|" + prev3AttrValue + "|" + prev2AttrValue + "|"
            + prevAttrValue;
    String prev5gramAttrValue = prev5AttrValue + "|" + prev4AttrValue + "|" + prev3AttrValue + "|"
            + prev2AttrValue + "|" + prevAttrValue;
    generalFeatures.put("feature_attrValue_bigram_" + prevBigramAttrValue, 1.0);
    generalFeatures.put("feature_attrValue_trigram_" + prevTrigramAttrValue, 1.0);
    generalFeatures.put("feature_attrValue_4gram_" + prev4gramAttrValue, 1.0);
    generalFeatures.put("feature_attrValue_5gram_" + prev5gramAttrValue, 1.0);

    //If arguments have been generated or not
    for (int i = 0; i < mentionedAttrValues.size(); i++) {
        generalFeatures.put("feature_attrValue_allreadyMentioned_" + mentionedAttrValues.get(i), 1.0);
    //If arguments should still be generated or not
    attrValuesToBeMentioned.forEach((attrValue) -> {
        generalFeatures.put("feature_attrValue_toBeMentioned_" + attrValue, 1.0);
    }); //Which attrs are in the MR and which are not

    if (availableAttributeActions.containsKey(predicate)) {
        // One indicator per available attribute: present in the MR's attribute/value map, or not.
        availableAttributeActions.get(predicate).forEach((attribute) -> {
            if (MR.getAttributeValues().keySet().contains(attribute)) {
                generalFeatures.put("feature_attr_inMR_" + attribute, 1.0);
            } else {
                generalFeatures.put("feature_attr_notInMR_" + attribute, 1.0);

    ArrayList<String> mentionedAttrs = new ArrayList<>();
    for (int i = 0; i < mentionedAttrValues.size(); i++) {
        // Reduce "attr=value" to just the attribute name (the text before the first '=').
        // NOTE(review): the statement that stores attr is not visible; presumably it is added to mentionedAttrs.
        String attr = mentionedAttrValues.get(i);
        if (attr.contains("=")) {
            attr = mentionedAttrValues.get(i).substring(0, mentionedAttrValues.get(i).indexOf('='));
    HashSet<String> attrsToBeMentioned = new HashSet<>();
    // Same reduction to attribute names for the values still to be mentioned.
    // NOTE(review): the forEachOrdered body is not visible; presumably it adds attr to attrsToBeMentioned.
    attrValuesToBeMentioned.stream().map((attrValue) -> {
        String attr = attrValue;
        if (attr.contains("=")) {
            attr = attrValue.substring(0, attrValue.indexOf('='));
        return attr;
    }).forEachOrdered((attr) -> {

    // Feature for the j-th most recently mentioned attribute (again j == 1 only);
    // "@@" is emitted when there is none or when its trimmed name is empty.
    for (int j = 1; j <= 1; j++) {
        String previousAttr = "";
        if (mentionedAttrs.size() - j >= 0) {
            previousAttr = mentionedAttrs.get(mentionedAttrs.size() - j).trim();
        if (!previousAttr.isEmpty()) {
            generalFeatures.put("feature_attr_" + j + "_" + previousAttr, 1.0);
        } else {
            generalFeatures.put("feature_attr_" + j + "_@@", 1.0);
    //Word N-Grams
    // The last five mentioned attribute names, most recent first; each defaults to "@@" when absent.
    String prevAttr = "@@";
    if (mentionedAttrs.size() - 1 >= 0) {
        prevAttr = mentionedAttrs.get(mentionedAttrs.size() - 1).trim();
    String prev2Attr = "@@";
    if (mentionedAttrs.size() - 2 >= 0) {
        prev2Attr = mentionedAttrs.get(mentionedAttrs.size() - 2).trim();
    String prev3Attr = "@@";
    if (mentionedAttrs.size() - 3 >= 0) {
        prev3Attr = mentionedAttrs.get(mentionedAttrs.size() - 3).trim();
    String prev4Attr = "@@";
    if (mentionedAttrs.size() - 4 >= 0) {
        prev4Attr = mentionedAttrs.get(mentionedAttrs.size() - 4).trim();
    String prev5Attr = "@@";
    if (mentionedAttrs.size() - 5 >= 0) {
        prev5Attr = mentionedAttrs.get(mentionedAttrs.size() - 5).trim();

    // Attribute-name n-gram keys, oldest-to-newest, joined with "|".
    String prevBigramAttr = prev2Attr + "|" + prevAttr;
    String prevTrigramAttr = prev3Attr + "|" + prev2Attr + "|" + prevAttr;
    String prev4gramAttr = prev4Attr + "|" + prev3Attr + "|" + prev2Attr + "|" + prevAttr;
    String prev5gramAttr = prev5Attr + "|" + prev4Attr + "|" + prev3Attr + "|" + prev2Attr + "|" + prevAttr;

    generalFeatures.put("feature_attr_bigram_" + prevBigramAttr, 1.0);
    generalFeatures.put("feature_attr_trigram_" + prevTrigramAttr, 1.0);
    generalFeatures.put("feature_attr_4gram_" + prev4gramAttr, 1.0);
    generalFeatures.put("feature_attr_5gram_" + prev5gramAttr, 1.0);

    //If arguments have been generated or not
    // NOTE(review): this iterates attrValuesAlreadyMentioned, so the feature key carries whatever
    // strings that set holds (possibly full attr=value pairs), not the reduced names in mentionedAttrs.
    attrValuesAlreadyMentioned.forEach((attr) -> {
        generalFeatures.put("feature_attr_alreadyMentioned_" + attr, 1.0);
    //If arguments should still be generated or not
    attrsToBeMentioned.forEach((attr) -> {
        generalFeatures.put("feature_attr_toBeMentioned_" + attr, 1.0);

    //Attr specific features (and global features)
    if (availableAttributeActions.containsKey(predicate)) {
        for (String action : availableAttributeActions.get(predicate)) {
            if (action.equals(Action.TOKEN_END)) {
                // The END action branches on whether anything is still left to mention.
                // NOTE(review): both branch bodies are missing from this listing.
                if (attrsToBeMentioned.isEmpty()) {
                } else {
            } else {
                //Is attr in MR?
                if (MR.getAttributeValues().get(action) != null) {
                    valueSpecificFeatures.get(action).put("global_feature_specific_isInMR", 1.0);
                } else {
                    valueSpecificFeatures.get(action).put("global_feature_specific_isNotInMR", 1.0);
                //Is attr already mentioned right before
                if (prevAttr.equals(action)) {
                    valueSpecificFeatures.get(action).put("global_feature_specific_attrFollowingSameAttr", 1.0);
                } else {
                //Is attr already mentioned
                // NOTE(review): the body of the indexOf('=') == -1 check and the receiver of the
                // .put(...) below are not visible. As shown, the filter calls
                // substring(0, indexOf('=')), which throws for a value without '=' - confirm how
                // such values are handled in the full file.
                attrValuesAlreadyMentioned.stream().map((attrValue) -> {
                    if (attrValue.indexOf('=') == -1) {
                    return attrValue;
                }).filter((attrValue) -> (attrValue.substring(0, attrValue.indexOf('=')).equals(action)))
                        .forEachOrdered((_item) -> {
                                    .put("global_feature_specific_attrAlreadyMentioned", 1.0);
                //Is attr to be mentioned (has value to express)
                boolean toBeMentioned = false;
                // NOTE(review): assumes every entry contains '='; substring(0, -1) would throw otherwise - confirm.
                for (String attrValue : attrValuesToBeMentioned) {
                    if (attrValue.substring(0, attrValue.indexOf('=')).equals(action)) {
                        toBeMentioned = true;
                        valueSpecificFeatures.get(action).put("global_feature_specific_attrToBeMentioned", 1.0);
                if (!toBeMentioned) {
                    valueSpecificFeatures.get(action).put("global_feature_specific_attrNotToBeMentioned", 1.0);
            // Iterate over a copy of the key set: the loop below inserts new keys into the same map.
            HashSet<String> keys = new HashSet<>(valueSpecificFeatures.get(action).keySet());
            keys.forEach((feature1) -> {
                // Add a conjunction feature "feature1&&feature2" for each pair of features that both
                // have value 1.0, taking each unordered pair once (feature1 < feature2).
                // NOTE(review): the stream source feeding this filter is not visible; presumably keys.stream().
                        .filter((feature2) -> (valueSpecificFeatures.get(action).get(feature1) == 1.0
                                && valueSpecificFeatures.get(action).get(feature2) == 1.0
                                && feature1.compareTo(feature2) < 0))
                        .forEachOrdered((feature2) -> {
                            valueSpecificFeatures.get(action).put(feature1 + "&&" + feature2, 1.0);

            // Language-model score features for choosing this action next.
            String nextValue = chooseNextValue(action, attrValuesToBeMentioned);
            if (nextValue.isEmpty() && !action.equals(Action.TOKEN_END)) {
                // No value available for a non-END action: the LM score feature is set to 0.0.
                valueSpecificFeatures.get(action).put("global_feature_LMAttr_score", 0.0);
            } else {
                // NOTE(review): the loop body that fills fullGramLM is not visible in this listing.
                ArrayList<String> fullGramLM = new ArrayList<>();
                for (int i = 0; i < mentionedAttrValues.size(); i++) {
                ArrayList<String> prev5attrValueGramLM = new ArrayList<>();
                int j = 0;
                // Walk the mentioned values from newest to oldest, inserting at the front so the list
                // stays in original order. NOTE(review): the j < 5 bound suggests at most five entries,
                // but no increment of j is visible here - confirm.
                for (int i = mentionedAttrValues.size() - 1; (i >= 0 && j < 5); i--) {
                    prev5attrValueGramLM.add(0, mentionedAttrValues.get(i));
                // Append the candidate itself as "action=value" (the else branch for END is not visible).
                if (!action.equals(Action.TOKEN_END)) {
                    prev5attrValueGramLM.add(action + "=" + chooseNextValue(action, attrValuesToBeMentioned));
                } else {
                // Left-pad with "@@" until the context holds at least four entries.
                while (prev5attrValueGramLM.size() < 4) {
                    prev5attrValueGramLM.add(0, "@@");

                // NOTE(review): the call on the predicate's content LM is cut off in this listing;
                // presumably getProbability(prev5attrValueGramLM), by analogy with the call below.
                double afterLMScore = getContentLMsPerPredicate().get(predicate)
                valueSpecificFeatures.get(action).put("global_feature_LMAttr_score", afterLMScore);

                // Second score from the same predicate LM, computed over fullGramLM.
                afterLMScore = getContentLMsPerPredicate().get(predicate).getProbability(fullGramLM);
                valueSpecificFeatures.get(action).put("global_feature_LMAttrFull_score", afterLMScore);
    // Bundle both feature maps with the caller-supplied costs.
    return new Instance(generalFeatures, valueSpecificFeatures, costs);

From source file:structuredPredictionNLG.SFX.java

 * @param trainingData
public void createNaiveAlignments(ArrayList<DatasetInstance> trainingData) {
    HashMap<String, HashMap<ArrayList<Action>, HashMap<Action, Integer>>> punctPatterns = new HashMap<>();
    getPredicates().forEach((predicate) -> {
        punctPatterns.put(predicate, new HashMap<ArrayList<Action>, HashMap<Action, Integer>>());
    HashMap<DatasetInstance, ArrayList<Action>> punctRealizations = new HashMap<DatasetInstance, ArrayList<Action>>();

    trainingData.stream().map((di) -> {
        HashMap<ArrayList<Action>, ArrayList<Action>> calculatedRealizationsCache = new HashMap<>();
        HashSet<ArrayList<Action>> initRealizations = new HashSet<>();
        if (!calculatedRealizationsCache.containsKey(di.getDirectReferenceSequence())) {
        initRealizations.stream().map((realization) -> {
            HashMap<String, HashSet<String>> values = new HashMap<>();
            di.getMeaningRepresentation().getAttributeValues().keySet().forEach((attr) -> {
                values.put(attr, new HashSet<>(di.getMeaningRepresentation().getAttributeValues().get(attr)));
            ArrayList<Action> randomRealization = new ArrayList<>();
            for (int i = 0; i < realization.size(); i++) {
                Action a = realization.get(i);
                if (a.getAttribute().equals(Action.TOKEN_PUNCT)) {
                    randomRealization.add(new Action(a.getWord(), a.getAttribute()));
                } else {
                    randomRealization.add(new Action(a.getWord(), ""));
            if (values.keySet().isEmpty()) {
                for (int i = 0; i < randomRealization.size(); i++) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!getAttributes().get(di.getMeaningRepresentation().getPredicate())
                                .contains("empty")) {
            } else {
                HashMap<Double, HashMap<String, ArrayList<Integer>>> indexAlignments = new HashMap<>();
                HashSet<String> noValueAttrs = new HashSet<String>();
                values.keySet().forEach((attr) -> {
                            (value) -> ((!(value.matches("\"[xX][0-9]+\"") || value.matches("[xX][0-9]+")
                                    || value.startsWith(Action.TOKEN_X))) && !value.isEmpty()))
                            .map((value) -> {
                                String valueToCheck = value;
                                if (valueToCheck.equals("no") || valueToCheck.equals("yes")
                                        || valueToCheck.equals("yes or no") || valueToCheck.equals("none")
                                //|| attr.equals("dont_care")
                                        || valueToCheck.equals("empty")) {
                                    valueToCheck = attr + ":" + value;
                                    noValueAttrs.add(attr + "=" + value);
                                if (valueToCheck.equals(attr)) {
                                    noValueAttrs.add(attr + "=" + value);
                                return valueToCheck;
                            .filter((valueToCheck) -> (!valueToCheck.equals("empty:empty")
                                    && getValueAlignments().containsKey(valueToCheck)))
                            .forEachOrdered((valueToCheck) -> {
                                for (ArrayList<String> align : getValueAlignments().get(valueToCheck)
                                        .keySet()) {
                                    int n = align.size();
                                    for (int i = 0; i <= randomRealization.size() - n; i++) {
                                        ArrayList<String> compare = new ArrayList<String>();
                                        ArrayList<Integer> indexAlignment = new ArrayList<Integer>();
                                        for (int j = 0; j < n; j++) {
                                            compare.add(randomRealization.get(i + j).getWord());
                                            indexAlignment.add(i + j);
                                        if (compare.equals(align)) {
                                            if (!indexAlignments.containsKey(
                                                    getValueAlignments().get(valueToCheck).get(align))) {
                                                        new HashMap());
                                                    .put(attr + "=" + valueToCheck, indexAlignment);
                ArrayList<Double> similarities = new ArrayList<>(indexAlignments.keySet());
                HashSet<String> assignedAttrValues = new HashSet<String>();
                HashSet<Integer> assignedIntegers = new HashSet<Integer>();
                for (int i = similarities.size() - 1; i >= 0; i--) {
                    for (String attrValue : indexAlignments.get(similarities.get(i)).keySet()) {
                        if (!assignedAttrValues.contains(attrValue)) {
                            boolean isUnassigned = true;
                            for (Integer index : indexAlignments.get(similarities.get(i)).get(attrValue)) {
                                if (assignedIntegers.contains(index)) {
                                    isUnassigned = false;
                            if (isUnassigned) {
                                for (Integer index : indexAlignments.get(similarities.get(i)).get(attrValue)) {
                //System.out.println("-1: " + randomRealization);
                randomRealization.stream().filter((a) -> (a.getWord().startsWith(Action.TOKEN_X)))
                        .forEachOrdered((a) -> {
                            String attr = a.getWord().substring(3, a.getWord().lastIndexOf('_')).toLowerCase()
                            a.setAttribute(attr + "=" + a.getWord());
                HashSet<String> unalignedNoValueAttrs = new HashSet<>();
                noValueAttrs.forEach((noValueAttr) -> {
                    boolean assigned = false;
                    for (Action a : randomRealization) {
                        if (a.getAttribute().equals(noValueAttr)) {
                            assigned = true;
                    if (!assigned) {
                boolean isAllEmpty = true;
                boolean hasSpace = false;
                for (int i = 0; i < randomRealization.size(); i++) {
                    if (!randomRealization.get(i).getAttribute().isEmpty()
                            && !randomRealization.get(i).getAttribute().equals("[]")
                            && !randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                        isAllEmpty = false;
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        hasSpace = true;
                if (isAllEmpty && hasSpace && !unalignedNoValueAttrs.isEmpty()) {
                    unalignedNoValueAttrs.forEach((attrValue) -> {
                        int index = getRandomGen().nextInt(randomRealization.size());
                        boolean change = false;
                        while (!change) {
                            if (!randomRealization.get(index).getAttribute().equals(Action.TOKEN_PUNCT)) {
                                change = true;
                            } else {
                                index = getRandomGen().nextInt(randomRealization.size());
                //System.out.println(isAllEmpty + " " + hasSpace + " " + unalignedNoValueAttrs);
                //System.out.println(">> " + noValueAttrs);
                //System.out.println(">> " + values);
                //System.out.println("0: " + randomRealization);
                String previousAttr = "";
                int start = -1;
                for (int i = 0; i < randomRealization.size(); i++) {
                    if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)
                            && !randomRealization.get(i).getAttribute().isEmpty()
                            && !randomRealization.get(i).getAttribute().equals("[]")) {
                        if (start != -1) {
                            int middle = (start + i - 1) / 2 + 1;
                            for (int j = start; j < middle; j++) {
                                if (randomRealization.get(j).getAttribute().isEmpty()
                                        || randomRealization.get(j).getAttribute().equals("[]")) {
                            for (int j = middle; j < i; j++) {
                                if (randomRealization.get(j).getAttribute().isEmpty()
                                        || randomRealization.get(j).getAttribute().equals("[]")) {
                        start = i;
                        previousAttr = randomRealization.get(i).getAttribute();
                    } else {
                        previousAttr = "";
                //System.out.println("1: " + randomRealization);
                previousAttr = "";
                for (int i = randomRealization.size() - 1; i >= 0; i--) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!previousAttr.isEmpty()) {
                    } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                        previousAttr = randomRealization.get(i).getAttribute();
                    } else {
                        previousAttr = "";
                //System.out.println("2: " + randomRealization);
                previousAttr = "";
                for (int i = 0; i < randomRealization.size(); i++) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!previousAttr.isEmpty()) {
                    } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                        previousAttr = randomRealization.get(i).getAttribute();
                //System.out.println("3: " + randomRealization);
                previousAttr = "";
                for (int i = randomRealization.size() - 1; i >= 0; i--) {
                    if (randomRealization.get(i).getAttribute().isEmpty()
                            || randomRealization.get(i).getAttribute().equals("[]")) {
                        if (!previousAttr.isEmpty()) {
                    } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                        previousAttr = randomRealization.get(i).getAttribute();
                //System.out.println("4: " + randomRealization);
            //FIX WRONG @PUNCT@
            String previousAttr = "";
            for (int i = randomRealization.size() - 1; i >= 0; i--) {
                if (randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)
                        && !randomRealization.get(i).getWord().matches("[,.?!;:']")) {
                    if (!previousAttr.isEmpty()) {
                } else if (!randomRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                    previousAttr = randomRealization.get(i).getAttribute();
            ArrayList<Action> cleanRandomRealization = new ArrayList<>();
            randomRealization.stream().filter((a) -> (!a.getAttribute().equals(Action.TOKEN_PUNCT)))
                    .forEachOrdered((a) -> {
            //ADD END TOKENS
            ArrayList<Action> endRandomRealization = new ArrayList<>();
            previousAttr = "";
            for (int i = 0; i < cleanRandomRealization.size(); i++) {
                Action a = cleanRandomRealization.get(i);
                if (!previousAttr.isEmpty() && !a.getAttribute().equals(previousAttr)) {
                    endRandomRealization.add(new Action(Action.TOKEN_END, previousAttr));
                previousAttr = a.getAttribute();
            endRandomRealization.add(new Action(Action.TOKEN_END, previousAttr));
            endRandomRealization.add(new Action(Action.TOKEN_END, Action.TOKEN_END));
            calculatedRealizationsCache.put(realization, endRandomRealization);
            //System.out.println(di.getMeaningRepresentation().getPredicate() + ": " + endRandomRealization);
            ArrayList<String> attrValues = new ArrayList<String>();
            endRandomRealization.forEach((a) -> {
                if (attrValues.isEmpty()) {
                } else if (!attrValues.get(attrValues.size() - 1).equals(a.getAttribute())) {
            if (attrValues.size() > getMaxContentSequenceLength()) {
            ArrayList<Action> punctRealization = new ArrayList<>();
            previousAttr = "";
            for (int i = 0; i < punctRealization.size(); i++) {
                if (!punctRealization.get(i).getAttribute().equals(Action.TOKEN_PUNCT)) {
                    if (!punctRealization.get(i).getAttribute().equals(previousAttr)
                            && !previousAttr.isEmpty()) {
                        punctRealization.add(i, new Action(Action.TOKEN_END, previousAttr));
                    previousAttr = punctRealization.get(i).getAttribute();
            if (!punctRealization.get(punctRealization.size() - 1).getWord().equals(Action.TOKEN_END)) {
                punctRealization.add(new Action(Action.TOKEN_END, previousAttr));
            return punctRealization;
        }).map((punctRealization) -> {
            punctRealizations.put(di, punctRealization);
            return punctRealization;
        }).forEachOrdered((punctRealization) -> {
            for (int i = 0; i < punctRealization.size(); i++) {
                Action a = punctRealization.get(i);
                if (a.getAttribute().equals(Action.TOKEN_PUNCT)) {
                    boolean legal = true;
                    ArrayList<Action> surroundingActions = new ArrayList<>();
                    if (i - 2 >= 0) {
                        surroundingActions.add(punctRealization.get(i - 2));
                    } else {
                    if (i - 1 >= 0) {
                        surroundingActions.add(punctRealization.get(i - 1));
                    } else {
                        legal = false;
                    boolean oneMore = false;
                    if (i + 1 < punctRealization.size()) {
                        surroundingActions.add(punctRealization.get(i + 1));
                        if (!punctRealization.get(i + 1).getAttribute().equals(Action.TOKEN_END)) {
                            oneMore = true;
                    } else {
                        legal = false;
                    if (oneMore && i + 2 < punctRealization.size()) {
                        surroundingActions.add(punctRealization.get(i + 2));
                    } else {
                    if (legal) {
                        if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate())
                                .containsKey(surroundingActions)) {
                                    .put(surroundingActions, new HashMap<Action, Integer>());
                        if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate())
                                .get(surroundingActions).containsKey(a)) {
                                    .get(surroundingActions).put(a, 1);
                        } else {
                                    .put(a, punctPatterns.get(di.getMeaningRepresentation().getPredicate())
                                            .get(surroundingActions).get(a) + 1);
        return di;
    }).forEachOrdered((di) -> {
        HashSet<String> attrValuesToBeMentioned = new HashSet<>();
        di.getMeaningRepresentation().getAttributeValues().keySet().forEach((attribute) -> {
            int a = 0;
            for (String value : di.getMeaningRepresentation().getAttributeValues().get(attribute)) {
                if (value.startsWith("\"x")) {
                    value = "x" + a;
                } else if (value.startsWith("\"")) {
                    value = value.substring(1, value.length() - 1).replaceAll(" ", "_");
                attrValuesToBeMentioned.add(attribute + "=" + value);
        di.getDirectReferenceSequence().stream().map((key) -> {
            return key;
    punctRealizations.keySet().forEach((di) -> {
        ArrayList<Action> punctRealization = punctRealizations.get(di);
        punctPatterns.get(di.getMeaningRepresentation().getPredicate()).keySet().forEach((surrounds) -> {
            int beforeNulls = 0;
            if (surrounds.get(0) == null) {
            if (surrounds.get(1) == null) {
            for (int i = 0 - beforeNulls; i < punctRealization.size(); i++) {
                boolean matches = true;
                int m = 0;
                for (int s = 0; s < surrounds.size(); s++) {
                    if (surrounds.get(s) != null) {
                        if (i + s < punctRealization.size()) {
                            if (!punctRealization.get(i + s).getWord().equals(surrounds.get(s)
                                    .getWord()) /*|| !cleanActionList.get(i).getAttribute().equals(surrounds.get(s).getAttribute())*/) {
                                matches = false;
                                s = surrounds.size();
                            } else {
                        } else {
                            matches = false;
                            s = surrounds.size();
                    } else if (s < 2 && i + s >= 0) {
                        matches = false;
                        s = surrounds.size();
                    } else if (s >= 2 && i + s < punctRealization.size()) {
                        matches = false;
                        s = surrounds.size();
                if (matches && m > 0) {
                    Action a = new Action("", "");
                    if (!punctPatterns.get(di.getMeaningRepresentation().getPredicate()).get(surrounds)
                            .containsKey(a)) {
                    } else {
                                        .get(a) + 1);
    punctPatterns.keySet().forEach((predicate) -> {
        punctPatterns.get(predicate).keySet().forEach((punct) -> {
            Action bestAction = null;
            int bestCount = 0;
            for (Action a : punctPatterns.get(predicate).get(punct).keySet()) {
                if (punctPatterns.get(predicate).get(punct).get(a) > bestCount) {
                    bestAction = a;
                    bestCount = punctPatterns.get(predicate).get(punct).get(a);
                } else if (punctPatterns.get(predicate).get(punct).get(a) == bestCount
                        && bestAction.getWord().isEmpty()) {
                    bestAction = a;
            if (!getPunctuationPatterns().containsKey(predicate)) {
                getPunctuationPatterns().put(predicate, new HashMap<ArrayList<Action>, Action>());
            if (!bestAction.getWord().isEmpty()) {
                getPunctuationPatterns().get(predicate).put(punct, bestAction);