// Source: de.iisys.schub.processMining.similarity.AlgoController.java

package de.iisys.schub.processMining.similarity;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.ExecutionException;

import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;

import de.iisys.schub.processMining.activities.network.ArtefactController;
import de.iisys.schub.processMining.config.Config;
import de.iisys.schub.processMining.similarity.Tokenizing.UsePOSTagger;
import de.iisys.schub.processMining.similarity.Tokenizing.UseStemmer;
import de.iisys.schub.processMining.similarity.Tokenizing.UseStopwords;
import de.iisys.schub.processMining.similarity.Tokenizing.UseTokenizer;
import de.iisys.schub.processMining.similarity.model.MinedDocument;
import de.iisys.schub.processMining.similarity.model.MinedMainDocument;
import de.iisys.schub.processMining.similarity.model.Similarity;
import de.iisys.schub.processMining.similarity.parsing.DocxParser;
import de.iisys.schub.processMining.similarity.parsing.TextParser;

/**
 * Main controller to compare the similarity of documents.
 * You usually have a docx main document (with chapters) 
 * and some other documents/texts to compare with.
 * 
 * 
 * LICENSE:
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * @author Christian Ochsenkühn
 *
 */
public class AlgoController {

    // Configuration values, loaded from Config by loadProperties() in the
    // constructors; written once at construction time, hence the constant-style
    // names although the fields are not final.
    private int THREADS;                      // worker thread count for tokenizing / tf-idf
    private String OUTPUT_NAME;               // default output file name for results
    private double PERCENTILE;                // percentile (in percent) above which docs count as similar
    private UseTokenizer TOKENIZER;           // tokenizer strategy
    private UseStopwords STOPWORDS;           // stopword-removal strategy
    private UseStemmer STEMMER;               // stemmer strategy
    private UsePOSTagger POSTAGGER;           // POS-tagger strategy
    private boolean ALLOW_NUMBER_AS_TERM;     // whether pure numbers are kept as terms

    // Supported document languages (English, German).
    public enum Language {
        EN, DE
    }

    private MinedMainDocument mainDoc;        // main document, optionally with chapters
    private List<MinedDocument> compareDocs;  // documents compared against mainDoc
    private List<String> allTerms;            // global unique-term list over all documents

    // Timing bookkeeping: printTimestamp() measures each pipeline step's duration (seconds).
    private Timestamp lastTimestamp = null;
    private double time_parsing;
    private double time_tokenizing;
    private double time_tfidf;
    private double time_cosine;

    /**
     * Creates a controller without documents (supply them via a pipeline method)
     * and loads the configuration from {@link Config}.
     */
    public AlgoController() {
        loadProperties();
    }

    /**
     * Creates a controller for the given documents and loads the configuration
     * from {@link Config}.
     *
     * @param mainDoc the main document (with optional chapters)
     * @param compareDocs the documents to compare against the main document
     */
    public AlgoController(MinedMainDocument mainDoc, List<MinedDocument> compareDocs) {
        this.mainDoc = mainDoc;
        this.compareDocs = compareDocs;
        loadProperties();
    }

    /**
     * Runs the similarity pipeline (parsing, tokenizing, stopword removal,
     * stemming, [POS tagging], tf-idf vector creation and cosine similarity)
     * with the given percentile instead of the configured one.
     *
     * @param mainFilePath:
     *       path to the main docx document with chapters
     * @param compareFilesPath:
     *       path to the documents to compare with
     * @param percentile:
     *       the percentile (in percent) above which documents are supposed as similar
     */
    public void pipelineCosineSimilarity(String mainFilePath, String compareFilesPath, double percentile) {
        // override the configured percentile before delegating
        this.PERCENTILE = percentile;
        pipelineCosineSimilarity(mainFilePath, compareFilesPath);
    }

    /**
     * Runs the full similarity pipeline on the given documents:
     * parsing, tokenizing, stopword removal, stemming, [POS tagging],
     * tf-idf vector creation and cosine similarity calculation.
     *
     * Last step: writes all cosine similarities to the configured output file.
     *
     * @param mainFilePath:
     *       path to the main docx document with chapters
     * @param compareFilesPath:
     *       path to the documents to compare with
     */
    public void pipelineCosineSimilarity(String mainFilePath, String compareFilesPath) {
        // steps 1-3: parse, tokenize and build tf-idf vectors
        mainPipeline(mainFilePath, compareFilesPath);

        // step 4: cosine similarities between main doc/chapters and compare docs
        printTimestamp(false);
        System.out.println(": Calculating cosine similarity...");
        String similarityReport = calculateCosineSimilarities();
        time_cosine = printTimestamp(true);
        System.out.println(": Done!");

        // step 5: persist metadata header plus the similarity report
        saveToOutputFile(showMainMetaData() + similarityReport, OUTPUT_NAME);
    }

    /**
     * Runs the full similarity pipeline (parsing, tokenizing, stopword removal,
     * stemming, [POS tagging], tf-idf vectors, cosine similarity) and then
     * determines similar documents: every document whose cosine similarity lies
     * at or above the configured percentile is registered as similar, including
     * the most important terms appearing in both documents. The result is
     * written to the configured output file.
     *
     * @param mainFilePath
     * @param compareFilesPath
     * @return
     *       the main document, which contains the list of similar documents
     */
    public MinedMainDocument pipelineSimilarDocs(String mainFilePath, String compareFilesPath) {
        // steps 1-3: parse, tokenize and build tf-idf vectors
        mainPipeline(mainFilePath, compareFilesPath);

        // step 4: similar docs via cosine similarity + percentile threshold
        printTimestamp(false);
        System.out.println(": Calculating cosine similarity...");
        calculateSimilarDocsViaCosine();
        time_cosine = printTimestamp(true);
        System.out.println(": Done!");

        // steps 5+6: render the report and save it
        String report = showSimilarDocs(mainDoc);
        saveToOutputFile(showMainMetaData() + report, OUTPUT_NAME);

        return mainDoc;
    }

    /**
     * Pipeline for the Shindig activity mining, called by the ActivityController:
     * 1. Parses the main artifact and the artifacts to compare.
     * 2. Tokenizes (incl. stemming etc.) the artifacts' texts.
     * 3. Builds the tf-idf vectors of these texts.
     * 4. Determines similar texts via cosine similarity and percentile.
     * 5. Writes the similarity results to "cycle&lt;cycleNr&gt;.txt".
     * 6. Returns the MinedMainDocument containing the similarity results.
     *
     * @param artefactCon controller delivering the parsed artifacts
     * @param cycleNr number used for the output file name
     * @return the main document containing the similarity results
     */
    public MinedMainDocument pipelineNuxeoSimilarArtefacts(ArtefactController artefactCon, int cycleNr) {
        // step 1: parsing
        printTimestamp(false);
        System.out.println(": Parsing files...");
        mainDoc = artefactCon.parseMainArtifact();
        compareDocs = new ArrayList<MinedDocument>();
        compareDocs.addAll(artefactCon.parseCompareArtifacts());
        time_parsing = printTimestamp(true);
        System.out.println(": Done!");

        // step 2: tokenizing (incl. stemming etc.)
        printTimestamp(false);
        System.out.println(": Tokenizing " + (compareDocs.size() + 1) + " doc(s) and " + mainDoc.chapterCount()
                + " chapters...");
        tokenizeTexts();
        time_tokenizing = printTimestamp(true);
        System.out.println(": Done!");

        // step 3: tf-idf vectors
        printTimestamp(false);
        System.out.println(": Creating tf-idf vectors...");
        calculateTfIdfs();
        time_tfidf = printTimestamp(true);
        System.out.println(": Done!");

        // step 4: similar docs via cosine similarity + percentile threshold
        printTimestamp(false);
        System.out.println(": Calculating cosine similarity...");
        calculateSimilarDocsViaCosine();
        time_cosine = printTimestamp(true);
        System.out.println(": Done!");

        // steps 5+6: render the report and save it per mining cycle
        String report = showSimilarDocs(mainDoc);
        saveToOutputFile(showMainMetaData() + report, "cycle" + cycleNr + ".txt");

        return mainDoc;
    }

    /**
     * Shared first half of the file-based pipelines: parses the given files,
     * tokenizes all texts and builds their tf-idf vectors, logging a timestamp
     * and duration for each step.
     *
     * @param mainFilePath path to the main document
     * @param compareFilesPath path to the comparison documents
     */
    private void mainPipeline(String mainFilePath, String compareFilesPath) {
        // step 1: parsing
        printTimestamp(false);
        System.out.println(": Parsing files...");
        parseFilesFromFolders(mainFilePath, compareFilesPath);
        time_parsing = printTimestamp(true);
        System.out.println(": Done!");

        // step 2: tokenizing (incl. stemming etc.)
        printTimestamp(false);
        System.out.println(": Tokenizing " + (compareDocs.size() + 1) + " docs and " + mainDoc.chapterCount()
                + " chapters...");
        tokenizeTexts();
        time_tokenizing = printTimestamp(true);
        System.out.println(": Done!");

        // step 3: tf-idf vectors
        printTimestamp(false);
        System.out.println(": Creating tf-idf vectors...");
        calculateTfIdfs();
        time_tfidf = printTimestamp(true);
        System.out.println(": Done!");
    }

    /**
     * Parses the main document (docx with chapters, or any other format as plain
     * text) and all comparison documents from the given paths into
     * {@link #mainDoc} and {@link #compareDocs}.
     *
     * @param mainFilePath path to the main document; may be null to skip it
     * @param compareFilesPath path to the comparison documents
     */
    private void parseFilesFromFolders(String mainFilePath, String compareFilesPath) {
        // main document: docx gets chapter-aware parsing, everything else plain text
        if (mainFilePath != null) {
            if (mainFilePath.endsWith(".docx")) {
                DocxParser docxParser = new DocxParser(mainFilePath);
                docxParser.parseDocxAndChapters();
                mainDoc = new MinedMainDocument(docxParser.getTitle(), docxParser.getFullText());
                mainDoc.addChapters(docxParser.getChapterHeadlines(), docxParser.getChapterTexts());
            } else {
                TextParser mainParser = new TextParser(mainFilePath);
                mainDoc = new MinedMainDocument(mainParser.getFileNames().get(0), mainParser.getTexts().get(0));
            }
        }

        // comparison documents: one MinedDocument per parsed file
        compareDocs = new ArrayList<MinedDocument>();
        TextParser compareParser = new TextParser(compareFilesPath);
        List<String> fileTexts = compareParser.getTexts();
        List<String> fileNames = compareParser.getFileNames();
        for (int idx = 0; idx < fileTexts.size(); idx++) {
            compareDocs.add(new MinedDocument(fileNames.get(idx), fileTexts.get(idx)));
        }
    }

    /*
    private void parseFilesFromNuxeoAndFolders(String nuxeoDocId, String compareFilesPath) {
       TextParser textParser;      
        
       try {
     NuxeoConnector nuxeo = new NuxeoConnector();
     InputStream is = nuxeo.getDocumentInputStream(nuxeoDocId);
         
     String title = nuxeo.getLastDocTitle();
     if(title==null) title = "Main Document";
         
     DocxParser docx = new DocxParser(is);
     docx.parseDocxAndChapters();
     this.mainDoc = new MinedMainDocument(title, docx.getFullText());
     this.mainDoc.addChapters(docx.getChapterHeadlines(), docx.getChapterTexts());
     nuxeo.close();
     nuxeo = null; is = null;
       } catch (Exception e) {
     e.printStackTrace();
       }
        
           
       // compare docs:
       compareDocs = new ArrayList<MinedDocument>();
       textParser = new TextParser(compareFilesPath);
       List<String> texts = textParser.getTexts();
       List<String> names = textParser.getFileNames();
           
       for(int i=0; i<texts.size(); i++) {
     compareDocs.add(new MinedDocument(names.get(i), texts.get(i)));
       }
    } */

    /**
     * Tokenizes (incl. stopword removal, stemming and optional POS tagging) the
     * main document with its chapters and all comparison documents, collecting
     * the global unique-term list into {@link #allTerms}.
     */
    private void tokenizeTexts() {
        Tokenizing tokenizing = new Tokenizing(TOKENIZER, STOPWORDS, STEMMER, POSTAGGER, ALLOW_NUMBER_AS_TERM);
        // first pass handles the main document (terms are stored on each doc
        // object), the second pass extends the shared term list with the
        // comparison documents' terms
        allTerms = tokenizing.tokenizeTermsThreadable(mainDoc, THREADS);
        allTerms = tokenizing.tokenizeTermsThreadable(compareDocs, allTerms, THREADS);
    }

    /**
     * Builds the tf-idf vectors for the main document (incl. chapters) and all
     * comparison documents, based on the global term list.
     */
    private void calculateTfIdfs() {
        TfIdf tfidf = new TfIdf(this.allTerms);
        try {
            tfidf.calculateTfIdfsThreadable(mainDoc, compareDocs, THREADS);
        } catch (InterruptedException e) {
            // restore the interrupt flag so callers up the stack can react;
            // previously the interruption was silently swallowed
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }

    /**
     * Compares the main document (and each of its chapters) against every
     * comparison document via cosine similarity. Documents scoring at or above
     * the percentile threshold are registered as similar; all others are
     * registered with a null document so that list indices stay aligned with
     * {@link #compareDocs}.
     */
    private void calculateSimilarDocsViaCosine() {
        List<Double> cosineValues = new ArrayList<Double>();
        List<String[]> sharedTerms = new ArrayList<String[]>();

        // whole main document vs. every comparison document
        for (MinedDocument doc : this.compareDocs) {
            Similarity similarity = mainDoc.compareSimilarity(doc, allTerms);
            if (similarity == null) {
                // no comparable content: count as 0 similarity, no shared terms
                cosineValues.add(0.);
                sharedTerms.add(null);
            } else {
                cosineValues.add(similarity.getCosineSimilarity());
                sharedTerms.add(similarity.getSimilarTerms());
            }
        }
        double threshold = this.getDocsPercentile(cosineValues);
        for (int i = 0; i < cosineValues.size(); i++) {
            if (cosineValues.get(i) != null && cosineValues.get(i) >= threshold)
                mainDoc.addSimilarDoc(compareDocs.get(i), sharedTerms.get(i), cosineValues.get(i));
            else
                mainDoc.addSimilarDoc(null, null, cosineValues.get(i));
        }

        // each chapter vs. every comparison document, with a per-chapter threshold
        if (mainDoc.hasChapters()) {
            for (int c = 0; c < mainDoc.chapterCount(); c++) {
                cosineValues.clear();
                sharedTerms.clear();

                for (MinedDocument doc : compareDocs) {
                    Similarity similarity = mainDoc.getChapter(c).compareSimilarity(doc, allTerms);
                    if (similarity == null) {
                        cosineValues.add(0.);
                        sharedTerms.add(null);
                    } else {
                        cosineValues.add(similarity.getCosineSimilarity());
                        sharedTerms.add(similarity.getSimilarTerms());
                    }
                }
                threshold = this.getDocsPercentile(cosineValues);
                for (int k = 0; k < cosineValues.size(); k++) {
                    if (cosineValues.get(k) != null && cosineValues.get(k) >= threshold)
                        mainDoc.getChapter(c).addSimilarDoc(compareDocs.get(k), sharedTerms.get(k),
                                cosineValues.get(k));
                    else
                        mainDoc.getChapter(c).addSimilarDoc(null, null, cosineValues.get(k));
                }
            }
        }
    }

    /**
     * Calculates the cosine similarity of the main document (and each of its
     * chapters) against every comparison document and renders the results as a
     * report string with per-list statistics (min/max/mean/percentile).
     *
     * @return the textual similarity report
     */
    private String calculateCosineSimilarities() {
        DecimalFormat df = new DecimalFormat("#00.00");
        double cosSim;
        List<Double> cosSims = new ArrayList<Double>();

        StringBuffer mainBuf = new StringBuffer("\n\nDocument '" + mainDoc.getName() + "' is...\n");
        StringBuffer docBuf = new StringBuffer();
        for (MinedDocument doc : compareDocs) {
            cosSim = mainDoc.compareSimilarity(doc);
            cosSims.add(cosSim);
            // BUGFIX: the compared document's name was missing from this line
            // (the chapter loop below always included it)
            docBuf.append("\t" + df.format(cosSim * 100) + " % \t similar with \t" + doc.getName() + " ("
                    + doc.getLang() + ")\n");
        }
        mainBuf.append(this.showDocMetaData(cosSims) + "\n\n");
        mainBuf.append(docBuf);

        if (mainDoc.hasChapters()) {
            for (int i = 0; i < mainDoc.chapterCount(); i++) {
                docBuf = new StringBuffer();
                cosSims.clear();

                mainBuf.append("\n\nChapter " + mainDoc.getChapter(i).getName() + " is...\n");
                for (MinedDocument doc : compareDocs) {
                    cosSim = mainDoc.getChapter(i).compareSimilarity(doc);
                    cosSims.add(cosSim);
                    docBuf.append("\t" + df.format(cosSim * 100) + " % \t similar with \t" + doc.getName() + " ("
                            + doc.getLang() + ")\n");
                }
                mainBuf.append(this.showDocMetaData(cosSims) + "\n\n");
                mainBuf.append(docBuf);
            }
        }

        return mainBuf.toString();
    }

    /**
     * Renders the similar documents of the given main document (and of each of
     * its chapters) as a report string, listing the cosine similarity and the
     * shared terms of each similar pair. Null entries in the similar-docs list
     * (non-similar placeholders) are skipped.
     *
     * @param mainDoc the mined main document holding the similarity results
     * @return the textual report
     */
    private String showSimilarDocs(MinedMainDocument mainDoc) {
        DecimalFormat df = new DecimalFormat("#00.00");

        List<MinedDocument> similarDocs = mainDoc.getSimilarDocs();
        List<String[]> similarDocsTerms = mainDoc.getSimilarDocsTerms();
        StringBuffer mainBuf = new StringBuffer("\n\nDocument '" + mainDoc.getName() + "' is...\n");

        int j = 0;
        for (MinedDocument simDoc : similarDocs) {
            if (simDoc != null) {
                mainBuf.append("\t" + df.format(mainDoc.getSimilarDocsCosine(j) * 100) + " % \t similar with \t"
                        + simDoc.getName() + "\t\t(" + joinTerms(similarDocsTerms.get(j)) + ")\n");
            }
            j++;
        }

        if (mainDoc.hasChapters()) {
            for (int i = 0; i < mainDoc.chapterCount(); i++) {
                similarDocs = mainDoc.getChapter(i).getSimilarDocs();
                similarDocsTerms = mainDoc.getChapter(i).getSimilarDocsTerms();
                mainBuf.append("\n\nChapter " + mainDoc.getChapter(i).getName() + " is...\n");
                j = 0;
                for (MinedDocument simDoc : similarDocs) {
                    if (simDoc != null) {
                        mainBuf.append("\t" + df.format(mainDoc.getChapter(i).getSimilarDocsCosine(j) * 100)
                                + " % \t similar with \t" + simDoc.getName() + "\t\t("
                                + joinTerms(similarDocsTerms.get(j)) + ")\n");
                    }
                    j++;
                }
            }
        }

        return mainBuf.toString();
    }

    /**
     * Joins a term array into a comma-separated string; a null array yields "".
     * Extracted to remove the duplicated joining loops in showSimilarDocs.
     */
    private String joinTerms(String[] terms) {
        if (terms == null)
            return "";
        StringBuilder sb = new StringBuilder();
        for (int t = 0; t < terms.length; t++) {
            if (t > 0)
                sb.append(", ");
            sb.append(terms[t]);
        }
        return sb.toString();
    }

    /**
     * Builds the report header: document/term/thread counts, per-step timings
     * and the active configuration.
     *
     * @return the metadata header string
     */
    private String showMainMetaData() {
        StringBuilder meta = new StringBuilder();
        meta.append("*** Cosine TfIdf Similarity ***\n\n");
        meta.append("* 1 Main Document with ").append(mainDoc.chapterCount()).append(" chapters \n");
        meta.append("* ").append(this.compareDocs.size()).append(" Documents to compare \n");
        meta.append("* ").append(this.allTerms.size()).append(" unique terms \n");
        meta.append("* ").append(this.THREADS).append(" threads \n");
        meta.append("\n");
        meta.append("* Parsing time: ").append(time_parsing).append("s \n");
        meta.append("* Tokenizing time: ").append(time_tokenizing).append("s \n");
        meta.append("* TfIdf time: ").append(time_tfidf).append("s \n");
        meta.append("* Cosine time: ").append(time_cosine).append("s \n");
        meta.append("\n");
        meta.append("Configuration: \n");
        meta.append("* Percentile: ").append(this.PERCENTILE).append(" % \n");
        meta.append("* Numbers as terms: ").append(this.ALLOW_NUMBER_AS_TERM).append("\n");
        meta.append("* Tokenizer: ").append(this.TOKENIZER).append("\n");
        meta.append("* Stemmer: ").append(this.STEMMER).append("\n");
        meta.append("* Pos-Tagger: ").append(this.POSTAGGER).append("\n");
        meta.append("* Stopwords: ").append(this.STOPWORDS).append("\n");
        meta.append("\n");
        return meta.toString();
    }

    /**
     * Builds a one-line statistics summary (min, max, arithmetic mean and the
     * configured percentile) for one list of cosine similarity values.
     *
     * @param cosineSimValues the cosine similarities of one comparison run
     * @return the formatted statistics line
     */
    private String showDocMetaData(List<Double> cosineSimValues) {
        DescriptiveStatistics stats = new DescriptiveStatistics();
        for (Double value : cosineSimValues) {
            stats.addValue(value);
        }

        // round to three decimal places (mean: four) before percent formatting
        double min = Math.round(stats.getMin() * 1000) / 1000.0;
        double max = Math.round(stats.getMax() * 1000) / 1000.0;
        double mean = Math.round(stats.getMean() * 10000) / 10000.0;
        double percentile = Math.round(stats.getPercentile(PERCENTILE) * 1000) / 1000.0;

        DecimalFormat df = new DecimalFormat("#00.00");
        return "Min: " + df.format(min * 100) + " %" + ", Max: " + df.format(max * 100) + " %"
                + ", Arith. Mean: " + df.format(mean * 100) + " %" + ", Percentile (" + PERCENTILE + " %): "
                + df.format(percentile * 100) + " %";
    }

    /**
     * Computes the configured percentile of the given cosine similarity values,
     * rounded to three decimal places.
     *
     * @param cosineSimValues the cosine similarities of one comparison run
     * @return the rounded percentile value
     */
    private double getDocsPercentile(List<Double> cosineSimValues) {
        DescriptiveStatistics stats = new DescriptiveStatistics();
        for (Double value : cosineSimValues) {
            stats.addValue(value);
        }
        return Math.round(stats.getPercentile(PERCENTILE) * 1000) / 1000.0;
    }

    /**
     * Writes the given output string to a file, UTF-8 encoded. IO errors are
     * logged but not propagated (best-effort saving, as before).
     *
     * @param output the text to write
     * @param outputFileName path of the file to (over)write
     */
    public static void saveToOutputFile(String output, String outputFileName) {
        // try-with-resources closes the writer even when write() fails;
        // StandardCharsets.UTF_8 avoids the checked UnsupportedEncodingException
        try (Writer out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFileName), StandardCharsets.UTF_8))) {
            out.write(output);
            // typo fixed: "smiliarity" -> "similarity"
            System.out
                    .println("PROCESSMINING: Successfully saved similarity results to " + outputFileName + "!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the current timestamp to stdout (no trailing newline) and remembers
     * it for the next call; used to log the duration of each pipeline step.
     *
     * @param showDuration if true and a previous timestamp exists, also prints
     *       the elapsed seconds since that timestamp
     * @return the elapsed seconds rounded to milliseconds; 0 when no duration
     *       was shown
     */
    private double printTimestamp(boolean showDuration) {
        DecimalFormat df = new DecimalFormat("#0.000");
        double duration = 0;
        // System.currentTimeMillis() replaces the Calendar.getInstance() chain
        Timestamp time = new Timestamp(System.currentTimeMillis());
        if (showDuration && this.lastTimestamp != null) {
            // double division (was float: / 1000f) keeps full millisecond
            // precision for long-running steps
            duration = (time.getTime() - this.lastTimestamp.getTime()) / 1000.0;
            System.out.print(time.toString() + " (" + df.format((Math.round(duration * 1000) / 1000.0)) + " s)");
        } else
            System.out.print(time);

        this.lastTimestamp = time;
        return Math.round(duration * 1000) / 1000.0;
    }

    /**
     * Loads all tunable settings from {@link Config}, falling back to safe
     * defaults (4 threads, 90th percentile, SIMPLE tokenizer, stopword list,
     * no stemmer, no POS tagger) when a value is missing or unparsable.
     */
    private void loadProperties() {
        // threads:
        try {
            THREADS = Integer.parseInt(Config.THREADS.getValue());
        } catch (NumberFormatException nfe) {
            THREADS = 4;
        }
        // outputFile:
        OUTPUT_NAME = Config.OUTPUT_FILE.getValue();
        // percentile: parsed as double so fractional values (e.g. "90.5") are
        // accepted; integer values still parse exactly as before
        String percentileValue = Config.PERCENTILE.getValue();
        try {
            PERCENTILE = percentileValue == null ? 90 : Double.parseDouble(percentileValue);
        } catch (NumberFormatException e) {
            PERCENTILE = 90;
        }
        // allowNumberAsTerm:
        ALLOW_NUMBER_AS_TERM = Boolean.parseBoolean(Config.ALLOW_NUMBER_AS_TERM.getValue());
        // tokenizer:
        switch (Config.TOKENIZER.getValue().toUpperCase()) {
        case "WITH_MODEL":
            TOKENIZER = UseTokenizer.WITH_MODEL;
            break;
        case "SIMPLE":
        default:
            TOKENIZER = UseTokenizer.SIMPLE;
        }
        // stopwords:
        switch (Config.STOPWORDS.getValue().toUpperCase()) {
        case "NONE":
            STOPWORDS = UseStopwords.NONE;
            break;
        case "STOPWORDS_LIST":
        default:
            STOPWORDS = UseStopwords.STOPWORDS_LIST;
            break;
        }
        // stemmer:
        switch (Config.STEMMER.getValue().toUpperCase()) {
        case "SNOWBALL":
            STEMMER = UseStemmer.SNOWBALL;
            break;
        case "NONE":
        default:
            STEMMER = UseStemmer.NONE;
        }
        // posTagger:
        switch (Config.POS_TAGGER.getValue().toUpperCase()) {
        case "MAXENT":
            POSTAGGER = UsePOSTagger.MAXENT;
            break;
        case "PERCEPTRON":
            POSTAGGER = UsePOSTagger.PERCEPTRON;
            break;
        case "NONE":
        default:
            POSTAGGER = UsePOSTagger.NONE;
        }
    }

}