opennlp.tools.parse_thicket.pattern_structure.JSMLearnerOnLatticeTest.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.parse_thicket.pattern_structure.JSMLearnerOnLatticeTest.java:

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.parse_thicket.pattern_structure;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.collections.ListUtils;

import junit.framework.TestCase;
import opennlp.tools.fca.ConceptLattice;
import opennlp.tools.fca.FcaWriter;
import opennlp.tools.fca.FormalConcept;
import opennlp.tools.similarity.apps.BingWebQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.Pair;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

/**
 * Smoke test that exercises a JSM-style learning scenario on top of two
 * {@link LinguisticPatternStructure} lattices: one built from "positive"
 * paragraphs and one built from "negative" paragraphs.
 *
 * <p>An unseen paragraph is matched against the intents stored in both lattices,
 * chunks that show up on both sides are dropped, and the remaining evidence is
 * printed to standard output. The test performs no assertions on the printed
 * values; it fails only if one of the steps throws.
 *
 * <p>All chunk comparisons in this class call {@code equals} on references whose
 * static type is {@link ParseTreeChunk} instead of going through collection
 * lookups such as {@code List.contains}. NOTE(review): this mirrors the original
 * code; presumably hash/Object based lookup does not recognise equal chunks,
 * which would also explain the explicit {@link #removeDuplicates(List)} pass
 * after a set has been built - confirm against {@code ParseTreeChunk}.
 */
public class JSMLearnerOnLatticeTest extends TestCase {
    /** Chunks whose score is not strictly above this value are dropped by {@link #reduceList(List)}. */
    private static final float MIN_CHUNK_SCORE = 1.3f;

    ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance();
    LinguisticPatternStructure psPos = new LinguisticPatternStructure(0, 0),
            psNeg = new LinguisticPatternStructure(0, 0);
    ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();

    /**
     * Builds the positive and negative lattices from four paragraphs each, matches
     * an unknown paragraph against every non-empty intent of both lattices, removes
     * the chunks common to both sides and prints the two remaining chunk lists.
     */
    public void testJSMLearner() {

        String text1p = "I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income. ";
        String text2p = "To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue. ";
        String text3p = "To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent. ";
        String text4p = "I rent some space for my business. To calculate my net income, I subtract from revenue my rental business expense.";

        String text1n = "I rent out a first floor unit of my house to a travel business. I need to add the rental income to my profit. However, when I repair my house, I can deduct the repair expense from my rental income. ";
        String text2n = "I receive rental income from my office. I have to claim it as a profit in my tax forms. I need to add my rental income to my profits, but subtract rental expenses such as repair from it. ";
        String text3n = "I advertised my property as a business rental. Advertisement and repair expenses can be subtracted from the rental income. Remaining rental income needs to be added to my profit and be reported as taxable profit. ";
        String text4n = "I showed  my property to a business owner to rent. Expenses on my time spent on advertisement are subtracted from the rental income. My rental profits are added to my taxable income.  ";

        // Parse everything first, then populate the lattices (same order as before:
        // all positives, then all negatives).
        List<List<List<ParseTreeChunk>>> positives = parseParagraphs(text1p, text2p, text3p, text4p);
        List<List<List<ParseTreeChunk>>> negatives = parseParagraphs(text1n, text2n, text3n, text4n);
        addAsObjects(psPos, positives);
        addAsObjects(psNeg, negatives);

        String unknown = "I do not want to rent anything to anyone. I just want to rent a space for myself. I neither calculate deduction of individual or business tax. I subtract my tax from my income";
        List<List<ParseTreeChunk>> chunksUnknown = chunk_maker.formGroupedPhrasesFromChunksForPara(unknown);
        List<List<List<ParseTreeChunk>>> unknownOnly = new ArrayList<List<List<ParseTreeChunk>>>();
        unknownOnly.add(chunksUnknown);

        // Each lattice is walked over its OWN concept list. The previous version
        // indexed the negative lattice with the positive lattice's loop bound,
        // which breaks as soon as the two lattices differ in size.
        List<List<List<ParseTreeChunk>>> posIntersections = intersectConcepts(psPos, unknownOnly);
        List<List<List<ParseTreeChunk>>> negIntersections = intersectConcepts(psNeg, unknownOnly);

        Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> pair = removeInconsistenciesFromPosNegIntersections(
                posIntersections, negIntersections);
        posIntersections = pair.getFirst();
        negIntersections = pair.getSecond();

        // NOTE(review): variable names and printed labels are kept from the original.
        // The "...UnderNeg" list is derived from the NEGATIVE lattice and the
        // "...UnderPos" list from the POSITIVE one; confirm the labels are intended.
        List<List<List<ParseTreeChunk>>> posIntersectionsUnderNeg = intersectConcepts(psNeg, negIntersections);
        List<List<List<ParseTreeChunk>>> negIntersectionsUnderPos = intersectConcepts(psPos, posIntersections);

        List<ParseTreeChunk> underNegFlat = flattenParseTreeChunkLst(posIntersectionsUnderNeg);
        List<ParseTreeChunk> underPosFlat = flattenParseTreeChunkLst(negIntersectionsUnderPos);

        // Both differences are taken against the un-subtracted lists. Subtracting
        // the second list from an already reduced first list (as before) can no
        // longer remove the chunks the two lists had in common.
        List<ParseTreeChunk> posIntersectionsUnderNegLst = subtract(underNegFlat, underPosFlat);
        List<ParseTreeChunk> negIntersectionsUnderPosLst = subtract(underPosFlat, underNegFlat);

        System.out.println("Pos - neg inters = " + posIntersectionsUnderNegLst);
        System.out.println("Neg - pos inters = " + negIntersectionsUnderPosLst);

    }

    /** Runs the chunker over every paragraph and returns the grouped chunks in input order. */
    private List<List<List<ParseTreeChunk>>> parseParagraphs(String... paragraphs) {
        List<List<List<ParseTreeChunk>>> parsed = new ArrayList<List<List<ParseTreeChunk>>>();
        for (String paragraph : paragraphs) {
            parsed.add(chunk_maker.formGroupedPhrasesFromChunksForPara(paragraph));
        }
        return parsed;
    }

    /** Adds each paragraph to the lattice as its own object, numbered 0, 1, 2, ... in list order. */
    private void addAsObjects(LinguisticPatternStructure structure, List<List<List<ParseTreeChunk>>> paragraphs) {
        for (int objectId = 0; objectId < paragraphs.size(); objectId++) {
            LinkedHashSet<Integer> extent = new LinkedHashSet<Integer>();
            extent.add(objectId);
            structure.AddIntent(paragraphs.get(objectId), extent, 0);
        }
    }

    /**
     * Matches every non-empty concept intent of {@code structure} against every
     * entry of {@code targets} (concepts in the outer loop, targets in the inner
     * loop) and collects the reduced, non-empty match results in that order.
     * Concepts with a {@code null} or empty intent are skipped.
     */
    private List<List<List<ParseTreeChunk>>> intersectConcepts(LinguisticPatternStructure structure,
            List<List<List<ParseTreeChunk>>> targets) {
        List<List<List<ParseTreeChunk>>> collected = new ArrayList<List<List<ParseTreeChunk>>>();
        for (int iConcept = 0; iConcept < structure.conceptList.size(); iConcept++) {
            List<List<ParseTreeChunk>> intent = structure.conceptList.get(iConcept).intent;
            if (intent == null || intent.isEmpty()) {
                continue;
            }
            for (List<List<ParseTreeChunk>> target : targets) {
                // Reduce once and reuse the result for both the check and the add.
                List<List<ParseTreeChunk>> reduced = reduceList(
                        md.matchTwoSentencesGroupedChunksDeterministic(intent, target));
                if (!reduced.isEmpty()) {
                    collected.add(reduced);
                }
            }
        }
        return collected;
    }

    /**
     * Keeps only the chunks whose {@link ParseTreeChunkListScorer} score is
     * strictly greater than 1.3 and drops groups that end up empty.
     *
     * @param list grouped chunks to filter; not modified
     * @return a new list with the surviving groups, in the original order
     */
    public List<List<ParseTreeChunk>> reduceList(List<List<ParseTreeChunk>> list) {
        ParseTreeChunkListScorer scorer = new ParseTreeChunkListScorer();
        List<List<ParseTreeChunk>> kept = new ArrayList<List<ParseTreeChunk>>();

        for (List<ParseTreeChunk> group : list) {
            List<ParseTreeChunk> keptGroup = new ArrayList<ParseTreeChunk>();
            for (ParseTreeChunk chunk : group) {
                if (scorer.getScore(chunk) > MIN_CHUNK_SCORE) {
                    keptGroup.add(chunk);
                }
            }
            if (!keptGroup.isEmpty()) {
                kept.add(keptGroup);
            }
        }

        return kept;
    }

    /**
     * Collapses the groups of every member into a single chunk list per member.
     * Chunks are collected through a set, in first-seen order.
     *
     * @param listOfLists members, each consisting of groups of chunks
     * @return one flattened chunk list per member, in member order
     */
    public List<List<ParseTreeChunk>> flattenParseTreeChunkListList(List<List<List<ParseTreeChunk>>> listOfLists) {
        List<List<ParseTreeChunk>> flattened = new ArrayList<List<ParseTreeChunk>>();

        for (List<List<ParseTreeChunk>> member : listOfLists) {
            // LinkedHashSet keeps the resulting order stable between runs.
            Set<ParseTreeChunk> unique = new LinkedHashSet<ParseTreeChunk>();
            for (List<ParseTreeChunk> group : member) {
                unique.addAll(group);
            }
            flattened.add(new ArrayList<ParseTreeChunk>(unique));
        }

        return flattened;
    }

    /**
     * Collapses all groups of all members into one chunk list and then removes
     * chunks that {@link #removeDuplicates(List)} considers equal.
     *
     * @param listOfLists members, each consisting of groups of chunks
     * @return the de-duplicated chunks in first-seen order
     */
    public List<ParseTreeChunk> flattenParseTreeChunkLst(List<List<List<ParseTreeChunk>>> listOfLists) {
        // LinkedHashSet keeps the resulting order stable between runs.
        Set<ParseTreeChunk> unique = new LinkedHashSet<ParseTreeChunk>();

        for (List<List<ParseTreeChunk>> member : listOfLists) {
            for (List<ParseTreeChunk> group : member) {
                unique.addAll(group);
            }
        }

        return removeDuplicates(new ArrayList<ParseTreeChunk>(unique));
    }

    /**
     * Returns the chunks of {@code dupes} without later repetitions: an element is
     * dropped when any earlier element reports {@code equals} for it.
     *
     * @param dupes chunks that may contain equal entries; not modified
     * @return a new list with the first occurrence of every chunk, in order
     */
    public List<ParseTreeChunk> removeDuplicates(List<ParseTreeChunk> dupes) {
        boolean[] repeated = new boolean[dupes.size()];
        for (int i = 0; i < dupes.size(); i++) {
            for (int j = i + 1; j < dupes.size(); j++) {
                if (dupes.get(i).equals(dupes.get(j))) {
                    repeated[j] = true;
                }
            }
        }

        List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>();
        for (int i = 0; i < dupes.size(); i++) {
            if (!repeated[i]) {
                cleaned.add(dupes.get(i));
            }
        }
        return cleaned;
    }

    /**
     * Returns the chunks of {@code main} that are not equal to any chunk of
     * {@code toSubtract}.
     *
     * @param main       chunks to filter; not modified
     * @param toSubtract chunks to remove from {@code main}
     * @return a new list with the remaining chunks, in the order of {@code main}
     */
    public List<ParseTreeChunk> subtract(List<ParseTreeChunk> main, List<ParseTreeChunk> toSubtract) {
        List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>();
        for (ParseTreeChunk candidate : main) {
            if (!equalsAny(candidate, toSubtract)) {
                cleaned.add(candidate);
            }
        }
        return cleaned;
    }

    /**
     * Returns the chunks of {@code a} that are equal to at least one chunk of
     * {@code b}.
     *
     * @param a chunks to filter; not modified
     * @param b chunks to look for
     * @return a new list with the shared chunks, in the order of {@code a}
     */
    public List<ParseTreeChunk> intesectParseTreeChunkLists(List<ParseTreeChunk> a, List<ParseTreeChunk> b) {
        List<ParseTreeChunk> shared = new ArrayList<ParseTreeChunk>();
        for (ParseTreeChunk candidate : a) {
            if (equalsAny(candidate, b)) {
                shared.add(candidate);
            }
        }
        return shared;
    }

    /** Tells whether {@code chunk.equals(other)} holds for at least one element of {@code others}. */
    private boolean equalsAny(ParseTreeChunk chunk, List<ParseTreeChunk> others) {
        for (ParseTreeChunk other : others) {
            if (chunk.equals(other)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes from both inputs every chunk that occurs on the positive as well as
     * on the negative side. Groups and members that become empty are dropped.
     *
     * @param pos intersections obtained from the positive lattice; not modified
     * @param neg intersections obtained from the negative lattice; not modified
     * @return a pair holding the cleaned positive list first and the cleaned
     *         negative list second
     */
    public Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> removeInconsistenciesFromPosNegIntersections(
            List<List<List<ParseTreeChunk>>> pos, List<List<List<ParseTreeChunk>>> neg) {

        // Chunks seen on both sides carry no evidence for either class.
        List<ParseTreeChunk> ambiguous = intesectParseTreeChunkLists(flattenParseTreeChunkLst(pos),
                flattenParseTreeChunkLst(neg));

        List<List<List<ParseTreeChunk>>> cleanedFromInconsPos = withoutChunks(pos, ambiguous);
        List<List<List<ParseTreeChunk>>> cleanedFromInconsNeg = withoutChunks(neg, ambiguous);

        return new Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>>(cleanedFromInconsPos,
                cleanedFromInconsNeg);

    }

    /**
     * Copies {@code members} while leaving out every chunk that some element of
     * {@code excluded} reports as equal, together with groups and members that
     * are empty after the removal.
     */
    private List<List<List<ParseTreeChunk>>> withoutChunks(List<List<List<ParseTreeChunk>>> members,
            List<ParseTreeChunk> excluded) {
        List<List<List<ParseTreeChunk>>> cleaned = new ArrayList<List<List<ParseTreeChunk>>>();
        for (List<List<ParseTreeChunk>> member : members) {
            List<List<ParseTreeChunk>> cleanedMember = new ArrayList<List<ParseTreeChunk>>();
            for (List<ParseTreeChunk> group : member) {
                List<ParseTreeChunk> cleanedGroup = new ArrayList<ParseTreeChunk>();
                for (ParseTreeChunk chunk : group) {
                    boolean skip = false;
                    for (ParseTreeChunk check : excluded) {
                        // Receiver is the excluded chunk, as in the original comparison.
                        if (check.equals(chunk)) {
                            skip = true;
                            break;
                        }
                    }
                    if (!skip) {
                        cleanedGroup.add(chunk);
                    }
                }
                if (!cleanedGroup.isEmpty()) {
                    cleanedMember.add(cleanedGroup);
                }
            }
            if (!cleanedMember.isEmpty()) {
                cleaned.add(cleanedMember);
            }
        }
        return cleaned;
    }

}