Java tutorial: TreeKernelBasedClassifier

The class below, from the opennlp.tools.parse_thicket.kernel_interface package, trains an SVM tree-kernel model on parse thickets built from positive and negative example documents, then classifies unseen documents by averaging per-paragraph SVM scores.
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.kernel_interface;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;

import opennlp.tools.jsmlearning.ProfileReaderWriter;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.VerbNetProcessor;
import opennlp.tools.parse_thicket.matching.Matcher;

public class TreeKernelBasedClassifier {

    protected static final Logger LOG =
            Logger.getLogger("opennlp.tools.similarity.apps.TreeKernelBasedClassifier");

    protected List<File> queuePos = new ArrayList<File>();
    protected List<File> queueNeg = new ArrayList<File>();
    protected Matcher matcher = new Matcher();
    protected TreeKernelRunner tkRunner = new TreeKernelRunner();
    protected TreeExtenderByAnotherLinkedTree treeExtender = new TreeExtenderByAnotherLinkedTree();

    protected String path;

    protected static final String modelFileName = "model.txt";
    protected static final String trainingFileName = "training.txt";
    protected static final String unknownToBeClassified = "unknown.txt";
    protected static final String classifierOutput = "classifier_output.txt";
    protected static final float MIN_SVM_SCORE_TOBE_IN = 0.2f;

    public void setKernelPath(String path) {
        this.path = path;
    }
    /*
     * Main entry point to the SVM tree-kernel classifier: takes a file, reads it
     * outside of CI, extracts longer paragraphs, and builds parse thickets for
     * them. The parse thicket dump is then processed by svm_classify.
     */
    public Boolean classifyText(File f) {
        FileUtils.deleteQuietly(new File(path + unknownToBeClassified));
        if (!new File(path + modelFileName).exists()) {
            LOG.severe("Model file '" + modelFileName + "' is absent: skipping SVM classification");
            return null;
        }

        // extract longer paragraphs and dump a parse thicket forest for each
        List<String> texts = DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f);
        List<String> treeBankBuffer = formTreeKernelStructuresMultiplePara(texts, "0");

        // write the list of samples to a file
        try {
            FileUtils.writeLines(new File(path + unknownToBeClassified), null, treeBankBuffer);
        } catch (IOException e) {
            LOG.severe("Problem creating parse thicket file '" + path + unknownToBeClassified
                    + "' to be classified\n" + e.getMessage());
        }
        tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput);

        // read classification results and average the per-paragraph SVM scores
        List<String[]> classifResults = ProfileReaderWriter.readProfiles(path + classifierOutput, ' ');
        int itemCount = 0;
        float accum = 0;
        LOG.info("\nsvm scores per paragraph: ");
        for (String[] line : classifResults) {
            float val = Float.parseFloat(line[0]);
            System.out.print(val + " ");
            accum += val;
            itemCount++;
        }
        float averaged = accum / (float) itemCount;
        LOG.info("\n average = " + averaged);
        return averaged > MIN_SVM_SCORE_TOBE_IN;
    }

    protected void addFilesPos(File file) {
        if (!file.exists()) {
            System.out.println(file + " does not exist.");
        }
        if (file.isDirectory()) {
            for (File f : file.listFiles()) {
                // if (!(f.getName().endsWith(".txt") || f.getName().endsWith(".pdf")))
                //     continue;
                addFilesPos(f);
                System.out.println(f.getName());
            }
        } else {
            queuePos.add(file);
        }
    }

    protected void addFilesNeg(File file) {
        if (!file.exists()) {
            System.out.println(file + " does not exist.");
        }
        if (file.isDirectory()) {
            for (File f : file.listFiles()) {
                // if (!(f.getName().endsWith(".txt") || f.getName().endsWith(".pdf")))
                //     continue;
                addFilesNeg(f);
                System.out.println(f.getName());
            }
        } else {
            queueNeg.add(file);
        }
    }

    protected void trainClassifier(String posDirectory, String negDirectory) {
        queuePos.clear();
        queueNeg.clear();
        addFilesPos(new File(posDirectory));
        addFilesNeg(new File(negDirectory));
        List<File> filesPos = new ArrayList<File>(queuePos);
        List<File> filesNeg = new ArrayList<File>(queueNeg);

        List<String[]> treeBankBuffer = new ArrayList<String[]>();
        for (File f : filesPos) {
            // get the first paragraph of text and label it as a positive sample
            String text = DescriptiveParagraphFromDocExtractor.getFirstParagraphFromFile(f);
            treeBankBuffer.add(new String[] { formTreeKernelStructure(text, "1") });
        }
        for (File f : filesNeg) {
            // get the first paragraph of text and label it as a negative sample
            String text = DescriptiveParagraphFromDocExtractor.getFirstParagraphFromFile(f);
            treeBankBuffer.add(new String[] { formTreeKernelStructure(text, "-1") });
        }
        // write the list of samples to a file
        ProfileReaderWriter.writeReport(treeBankBuffer, path + trainingFileName, ' ');
        // build the model
        tkRunner.runLearner(path, trainingFileName, modelFileName);
    }
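    // For reference: each sample line produced by formTreeKernelStructure(..)
    // follows the SVM-light-TK input format, a class label followed by one or
    // more parenthesized parse trees between |BT| and |ET| markers. An
    // illustrative (hypothetical) positive training line could look like:
    //
    //   1 |BT| (S (NP (DT The)(NN system))(VP (VBZ works))) |ET|
    //
    // Negative samples carry the label -1; texts to be classified carry 0.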
    public List<String[]> classifyFilesInDirectory(String dirFilesToBeClassified) {
        List<String[]> treeBankBuffer = new ArrayList<String[]>();
        queuePos.clear();
        addFilesPos(new File(dirFilesToBeClassified));
        List<File> filesUnkn = new ArrayList<File>(queuePos);
        for (File f : filesUnkn) {
            String text = DescriptiveParagraphFromDocExtractor.getFirstParagraphFromFile(f);
            String line = formTreeKernelStructure(text, "0");
            treeBankBuffer.add(new String[] { line });
        }
        // form a file from the texts to be classified
        ProfileReaderWriter.writeReport(treeBankBuffer, path + unknownToBeClassified, ' ');
        tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput);

        // read classification results
        List<String[]> classifResults = ProfileReaderWriter.readProfiles(path + classifierOutput, ' ');
        // iterate through classification results and set them as scores for hits
        List<String[]> results = new ArrayList<String[]>();
        int count = 0;
        for (String[] line : classifResults) {
            float val = Float.parseFloat(line[0]);
            boolean in = val > MIN_SVM_SCORE_TOBE_IN;
            String[] rline = new String[] { filesUnkn.get(count).getName(), Boolean.toString(in),
                    line[0], filesUnkn.get(count).getAbsolutePath() };
            results.add(rline);
            count++;
        }
        return results;
    }

    protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
        List<String> extendedTreesDumpTotal = new ArrayList<String>();
        try {
            for (String text : texts) {
                // get the parses from the original documents, and form the training dataset
                LOG.info("About to build pt from " + text);
                ParseThicket pt = matcher.buildParseThicketFromTextWithRST(text);
                LOG.info("About to build extended forest ");
                List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt);
                for (String line : extendedTreesDump)
                    extendedTreesDumpTotal.add(flag + " |BT| " + line + " |ET| ");
                LOG.info("DONE");
            }
        } catch (Exception e) {
            LOG.severe("Problem forming parse thicket flat file to be classified\n" + e.getMessage());
        }
        return extendedTreesDumpTotal;
    }

    protected String formTreeKernelStructure(String text, String flag) {
        String treeBankBuffer = "";
        try {
            // get the parses from the original document, and form the training dataset
            LOG.info("About to build pt from " + text);
            ParseThicket pt = matcher.buildParseThicketFromTextWithRST(text);
            LOG.info("About to build extended forest ");
            List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt);
            LOG.info("DONE");
            treeBankBuffer += flag;
            // form the list of training samples, skipping trees with unbalanced braces
            for (String t : extendedTreesDump) {
                if (BracesProcessor.isBalanced(t))
                    treeBankBuffer += " |BT| " + t;
                else
                    System.err.println("Wrong tree: " + t);
            }
            if (extendedTreesDump.size() < 1)
                treeBankBuffer += " |BT| ";
        } catch (Exception e) {
            e.printStackTrace();
        }
        return treeBankBuffer + " |ET|";
    }

    public static void main(String[] args) {
        VerbNetProcessor p = VerbNetProcessor.getInstance(
                "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources");
        TreeKernelBasedClassifier proc = new TreeKernelBasedClassifier();
        proc.setKernelPath("/Users/borisgalitsky/Documents/tree_kernel/");
        proc.trainClassifier(args[0], args[1]);
        List<String[]> res = proc.classifyFilesInDirectory(args[2]);
        ProfileReaderWriter.writeReport(res, "svmDesignDocReport03minus.csv");
    }
}
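The TreeKernelRunner used by this class is not shown in the tutorial. Below is a minimal sketch of what such a runner could look like, assuming Moschitti's SVM-light-TK binaries (svm_learn, svm_classify) live in the kernel directory passed to setKernelPath, and that the directory path ends with a file separator, as in the example in main(). The class name, the binary names, and the "-t 5" option (selecting the tree kernel) are assumptions about the SVM-light-TK setup, not something this source confirms.

import java.io.File;
import java.io.IOException;

/*
 * A hypothetical sketch of a tree-kernel runner: it shells out to the
 * SVM-light-TK command-line tools from Java. Not the actual
 * opennlp.tools.parse_thicket.kernel_interface.TreeKernelRunner.
 */
public class TreeKernelRunnerSketch {

    // svm_learn -t 5 <training file> <model file>  ("-t 5" assumed to select the tree kernel)
    public void runLearner(String dir, String trainingFile, String modelFile) {
        exec(dir, dir + "svm_learn", "-t", "5", dir + trainingFile, dir + modelFile);
    }

    // svm_classify <unknown file> <model file> <output file>
    public void runClassifier(String dir, String unknownFile, String modelFile, String outputFile) {
        exec(dir, dir + "svm_classify", dir + unknownFile, dir + modelFile, dir + outputFile);
    }

    private void exec(String dir, String... cmd) {
        try {
            Process process = new ProcessBuilder(cmd)
                    .directory(new File(dir)) // run inside the kernel directory
                    .inheritIO()              // forward the solver's console output
                    .start();
            process.waitFor();
        } catch (IOException | InterruptedException e) {
            throw new RuntimeException("SVM-light-TK invocation failed", e);
        }
    }
}

inheritIO() keeps the solver's progress visible on the console while training; a production runner would also check the process exit code before reading classifier_output.txt.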