de.andreasschoknecht.LS3.DocumentCollection.java Source code

Introduction

Here is the source code for de.andreasschoknecht.LS3.DocumentCollection.java
Source

/**
 * Part of the LS3 Similarity-based process model search package.
 * 
 * Licensed under the GNU General Public License v3.
 *
 * Copyright 2012 by Andreas Schoknecht <andreas_schoknecht@web.de>
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 * @author Andreas Schoknecht
 */

package de.andreasschoknecht.LS3;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.stream.Stream;

import org.apache.commons.io.FileUtils;
import org.apache.commons.math3.linear.RealMatrix;
import org.jdom2.JDOMException;

import com.google.common.collect.Multiset;

/**
 * <p>
 * The DocumentCollection represents a collection of process models. When building a collection the files are filtered according to
 * the file extension .pnml. Process models in other formats are not supported.
 * <p>
 * After creation of a DocumentCollection the function createDocuments() allows the parsing of PNML files. With the function 
 * generateTDMatrix() the corresponding Term-Document Matrix can be created afterwards.
 */
public class DocumentCollection {

    /** File extension identifying the supported process model format. */
    private static final String PNML_EXTENSION = ".pnml";

    /** Line terminator used in the stored Term-Document Matrix text file. */
    private static final String LINE_BREAK = "\r\n";

    /** Marker line closing a section of the stored Term-Document Matrix text file. */
    private static final String SECTION_END = "----------------------------------------";

    /** Marker line opening the list of file names in the stored text file. */
    private static final String COLLECTION_HEADER = "--------------------COLLECTION--------------------";

    /** Marker line opening the list of terms in the stored text file. */
    private static final String TERM_LIST_HEADER = "--------------------TERM LIST--------------------";

    /** Marker line opening the matrix part of the stored text file. */
    private static final String TD_MATRIX_HEADER = "--------------------TD MATRIX--------------------";

    /** Marker line preceding the number of matrix rows in the stored text file. */
    private static final String ROW_NUMBER_HEADER = "--------------------ROW NUMBER--------------------";

    /** Marker line preceding the number of matrix columns in the stored text file. */
    private static final String COLUMN_NUMBER_HEADER = "--------------------COLUMN NUMBER--------------------";

    /** Marker line preceding the matrix rows in the stored text file. */
    private static final String MATRIX_DATA_HEADER = "--------------------MATRIX DATA--------------------";

    /** Separator line used for the console output of insertModel(). */
    private static final String CONSOLE_RULE = "------------------------";

    /** Closing line printed after a matrix dump on the console. */
    private static final String CONSOLE_MATRIX_END = "---------------------\r\n\r\n";

    /** The path to the directory containing PNML files. */
    private String pnmlPath;

    /** The file list of PNML files. */
    private String[] fileList;

    /** The amount of LS3Documents in this document collection. */
    private int documentNumber;

    /** The amount of distinct terms in the whole document collection. */
    private int amountTerms;

    /** The LS3Documents contained in this document collection. */
    private ArrayList<LS3Document> ls3Documents;

    /** The term collection of this document collection. */
    private LinkedHashSet<String> termCollection;

    /** The Term-LS3Document Matrix for this document collection. */
    private TDMatrix tdMatrix;

    /**
     * Creates a document collection for all PNML files found in a directory. The files are only listed here;
     * they are parsed by createDocuments().
     *
     * @param pnmlPath The path to the directory containing the PNML files
     */
    public DocumentCollection(String pnmlPath) {
        File dir = new File(pnmlPath);
        setPnmlPath(dir.getAbsolutePath());
        setFileList(filterPNMLFiles(dir));

        setDocumentNumber(fileList.length);

        this.ls3Documents = new ArrayList<LS3Document>();
        this.termCollection = new LinkedHashSet<String>();
    }

    /**
     * Filters the PNML files in a directory, which possibly contains other files as well. The JVM is terminated
     * with exit code -2 if the given path is not a directory.
     *
     * @param dir The directory containing PNML files and possibly other files
     * @return the PNML files in the directory, never null
     */
    private static String[] filterPNMLFiles(File dir) {
        // Check if directory can be accessed
        if (!dir.isDirectory()) {
            System.out.println("Error: could not open directory " + dir.getAbsolutePath());
            System.exit(-2);
        }
        // Filter PNML files
        String[] pnmlFiles = dir.list(new FilenameFilter() {
            @Override
            public boolean accept(File d, String name) {
                return name.endsWith(PNML_EXTENSION);
            }
        });
        // File.list() returns null when an I/O error occurs; treat that as an empty directory
        // instead of failing later with a NullPointerException.
        return pnmlFiles != null ? pnmlFiles : new String[0];
    }

    /**
     * Generate a Term-Document Matrix from this document collection's documents.
     */
    public void generateTDMatrix() {
        tdMatrix = new TDMatrix(amountTerms, documentNumber);
        tdMatrix.fillMatrix(termCollection, ls3Documents);
        tdMatrix.fillWeightedMatrix();
    }

    /**
     * Creates the LS3Documents of this document collection. Each document contains the relevant information of a PNML file for the LS3.
     * Parsing errors are reported on the standard error stream.
     */
    public void createDocuments() {
        for (String fileName : fileList) {
            ls3Documents.add(new LS3Document(pnmlPath + File.separatorChar + fileName));
        }
        PNMLReader pnmlReader = new PNMLReader();
        try {
            pnmlReader.processDocuments(this);
        } catch (JDOMException | IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Calculate query results for a whole document collection using a Latent Semantic Analysis-based Similarity Measure (LSSM) matrix.
     * All documents having a similarity value equal or higher than theta with respect to another document are included in the results.
     *
     * @param lssmMatrix The LSSM matrix containing similarity values between documents
     * @param theta The parameter theta as a threshold value
     * @return all the results for each document as query
     */
    QueryAllResult calculateResults(RealMatrix lssmMatrix, float theta) {
        QueryAllResult result = new QueryAllResult();
        ArrayList<LS3Document> documents = this.getDocuments();
        int docNumber = documents.size();
        double[][] matrixData = lssmMatrix.getData();

        for (int i = 0; i < docNumber; i++) {
            QueryResult queryResult = new QueryResult(documents.get(i));
            for (int j = 0; j < docNumber; j++) {
                if (matrixData[i][j] >= theta) {
                    queryResult.addResult(documents.get(j));
                }
            }
            result.addResult(queryResult);
        }

        return result;
    }

    /**
     * Calculate query results for a query and a document collection. All documents having a similarity value equal or higher
     * than theta with respect to the query are included in the results.
     *
     * @param query The query document
     * @param theta The parameter theta as a threshold value
     * @return the result container holding the single result of the query
     */
    QueryAllResult calculateResults(Query query, float theta) {
        QueryAllResult result = new QueryAllResult();
        ArrayList<LS3Document> documents = this.getDocuments();
        int docNumber = documents.size();
        double[] lssmValues = query.getLSSMValues();

        QueryResult queryResult = new QueryResult(query);

        for (int i = 0; i < docNumber; i++) {
            if (lssmValues[i] >= theta) {
                queryResult.addResult(documents.get(i));
            }
        }
        result.addResult(queryResult);

        return result;
    }

    /**
     * Stores the Term-Document Matrix of this document collection as a text file containing file paths, term list and TD Matrix.
     * An I/O error while writing is reported on the standard error stream; "File stored" is only printed on success.
     *
     * @param filePath The file path for the resulting text file
     * @throws IllegalStateException if no Term-Document Matrix has been generated or loaded yet
     */
    public void storeTDMatrix(String filePath) {
        TDMatrix matrixToStore = requireTDMatrix();
        File file = new File(filePath);

        // A StringBuilder avoids re-copying the whole text for every appended matrix cell.
        StringBuilder text = new StringBuilder();

        text.append(COLLECTION_HEADER).append(LINE_BREAK);
        for (String fileName : fileList) {
            text.append(fileName).append(LINE_BREAK);
        }
        text.append(SECTION_END).append(LINE_BREAK);

        text.append(TERM_LIST_HEADER).append(LINE_BREAK);
        for (String term : termCollection) {
            text.append(term).append(LINE_BREAK);
        }
        text.append(SECTION_END).append(LINE_BREAK);

        int rows = matrixToStore.getRowNumber();
        int columns = matrixToStore.getColumnNumber();
        text.append(TD_MATRIX_HEADER).append(LINE_BREAK);
        text.append(ROW_NUMBER_HEADER).append(LINE_BREAK);
        text.append(rows).append(LINE_BREAK);
        text.append(SECTION_END).append(LINE_BREAK);
        text.append(COLUMN_NUMBER_HEADER).append(LINE_BREAK);
        text.append(columns).append(LINE_BREAK);
        text.append(SECTION_END).append(LINE_BREAK);
        text.append(MATRIX_DATA_HEADER).append(LINE_BREAK);

        double[][] matrix = matrixToStore.getMatrix();
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < columns; j++) {
                if (j != 0) {
                    text.append(' ');
                }
                text.append(matrix[i][j]);
            }
            text.append(LINE_BREAK);
        }

        try {
            FileUtils.writeStringToFile(file, text.toString(), "UTF-8");
            System.out.println("File stored");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Load Term-Document Matrix data stored in a text file. The file list, the term collection and the
     * Term-Document Matrix of this collection are replaced by the sections found in the file. An I/O error
     * while reading is reported on the standard error stream; "File loaded" is only printed on success.
     *
     * @param filePath The file path to the file containing the Term-Document Matrix data.
     * @throws NumberFormatException if a row number, column number or matrix value cannot be parsed
     */
    public void loadTDMatrix(String filePath) {

        try (Stream<String> lines = Files.lines(Paths.get(filePath), StandardCharsets.UTF_8)) {
            ArrayList<String> files = new ArrayList<String>();
            int rowNumber = 0;
            int columnNumber = 0;

            Iterator<String> itr = lines.iterator();
            while (itr.hasNext()) {
                String line = itr.next();

                switch (line) {
                case COLLECTION_HEADER:
                    System.out.println(line);
                    files.addAll(readSection(itr));
                    break;
                case TERM_LIST_HEADER:
                    System.out.println(line);
                    termCollection.clear();
                    termCollection.addAll(readSection(itr));
                    break;
                case ROW_NUMBER_HEADER:
                    System.out.println(line);
                    if (itr.hasNext()) {
                        rowNumber = Integer.parseInt(itr.next().trim());
                    }
                    break;
                case COLUMN_NUMBER_HEADER:
                    System.out.println(line);
                    if (itr.hasNext()) {
                        columnNumber = Integer.parseInt(itr.next().trim());
                    }
                    break;
                case MATRIX_DATA_HEADER:
                    System.out.println(line);
                    // The matrix data is the last section and runs until the end of the file.
                    this.tdMatrix = readMatrix(itr, rowNumber, columnNumber, termCollection.toArray(new String[0]));
                    break;
                default:
                    // Lines outside of a known section (e.g. the TD MATRIX header or section ends) carry no data.
                    break;
                }
            }

            setFileList(files.toArray(new String[0]));
            // Keep the counters consistent with the data that was just loaded.
            documentNumber = fileList.length;
            amountTerms = termCollection.size();
            System.out.println("File loaded");
        } catch (IOException | java.io.UncheckedIOException e) {
            // The iterator of Files.lines() reports read errors as UncheckedIOException.
            e.printStackTrace();
        }
    }

    /**
     * Reads the lines of one section of a stored Term-Document Matrix file up to the section end marker.
     * A missing end marker (truncated file) ends the section at the end of the input.
     *
     * @param itr The line iterator positioned directly behind the section header
     * @return the lines of the section without the end marker
     */
    private static ArrayList<String> readSection(Iterator<String> itr) {
        ArrayList<String> entries = new ArrayList<String>();
        while (itr.hasNext()) {
            String entry = itr.next();
            if (SECTION_END.equals(entry)) {
                break;
            }
            entries.add(entry);
        }
        return entries;
    }

    /**
     * Reads the remaining lines of a stored Term-Document Matrix file as matrix rows.
     *
     * @param itr The line iterator positioned directly behind the matrix data header
     * @param rowNumber The number of rows of the stored matrix
     * @param columnNumber The number of columns of the stored matrix
     * @param terms The terms belonging to the rows of the matrix
     * @return the Term-Document Matrix filled with the stored values
     */
    private static TDMatrix readMatrix(Iterator<String> itr, int rowNumber, int columnNumber, String[] terms) {
        TDMatrix loadedMatrix = new TDMatrix(rowNumber, columnNumber);
        loadedMatrix.setRowNumber(rowNumber);
        loadedMatrix.setColumnNumber(columnNumber);
        loadedMatrix.setTermArray(terms);

        int rowCounter = 0;
        while (itr.hasNext()) {
            String row = itr.next().trim();
            // Blank lines (e.g. trailing line breaks added by an editor) are not matrix rows.
            if (row.isEmpty()) {
                continue;
            }
            String[] tokens = row.split("\\s+");
            double[] values = new double[tokens.length];
            for (int i = 0; i < tokens.length; i++) {
                values[i] = Double.parseDouble(tokens[i]);
            }

            loadedMatrix.fillRow(values, rowCounter);
            rowCounter++;
        }
        return loadedMatrix;
    }

    /**
     * Insert a model to a model collection. This means that the underlying Term-Document Matrix has to be updated:
     * a column is appended for the new model and a row is appended for every term that was not yet part of the
     * term collection. The file list, the document number and the term collection are updated accordingly.
     * The progress is traced on the standard output stream. If the model cannot be parsed, the error is reported
     * on the standard error stream and the collection is left unchanged.
     *
     * @param modelPath the path to the model to be inserted.
     * @throws IllegalStateException if no Term-Document Matrix has been generated or loaded yet
     */
    public void insertModel(String modelPath) {
        requireTDMatrix();

        // Make sure file name is correct
        if (!modelPath.endsWith(PNML_EXTENSION)) {
            modelPath = modelPath + PNML_EXTENSION;
        }
        // File.getName() handles every separator the platform accepts.
        String modelName = new File(modelPath).getName();

        printHeader("Model to insert:");
        System.out.println(modelName);
        printHeader("Models in list:");

        String[] updatedFileList = Arrays.copyOf(fileList, fileList.length + 1);
        updatedFileList[fileList.length] = modelName;
        for (String fileName : updatedFileList) {
            System.out.println(fileName);
        }

        // Parse the model before touching any state so that a failure cannot leave a half-updated collection.
        LS3Document newDocument = new LS3Document(modelPath);
        PNMLReader pnmlReader = new PNMLReader();
        try {
            pnmlReader.processDocument(newDocument);
        } catch (JDOMException | IOException e) {
            e.printStackTrace();
            return;
        }

        printHeader("New LS3Document data:");
        System.out.println(newDocument.getPNMLPath());
        System.out.println("Amount of terms = " + newDocument.getAmountTerms());
        Multiset<String> documentTerms = newDocument.getTermCollection();
        for (String term : documentTerms) {
            System.out.println(term);
        }

        // Add new column to the Term-Document Matrix
        int knownTermCount = tdMatrix.getRowNumber();
        String[] knownTerms = termCollection.toArray(new String[0]);
        double[] termFrequencies = new double[knownTermCount];
        for (int i = 0; i < knownTermCount; i++) {
            termFrequencies[i] = documentTerms.count(knownTerms[i]);
        }

        printHeader("Term frequencies:");
        System.out.println(Arrays.toString(termFrequencies));

        printTDMatrix("Old TD Matrix:");

        tdMatrix.addColumn(termFrequencies);

        // Keep the file list aligned with the matrix columns.
        // NOTE(review): the new document is not added to the list of LS3Documents - confirm whether callers expect that.
        setFileList(updatedFileList);
        documentNumber++;

        printTDMatrix("New TD Matrix:");

        // Add new terms of the new model to the term list of the document collection
        printHeader("Old term collection:");
        for (String term : termCollection) {
            System.out.println(term);
        }

        // A term is new only if the collection does not know it yet. The document's multiset is left untouched;
        // removing single occurrences from it would misclassify known terms that occur more than once.
        ArrayList<String> newTerms = new ArrayList<String>();
        ArrayList<String> remainingOccurrences = new ArrayList<String>();
        for (String term : documentTerms.elementSet()) {
            if (!termCollection.contains(term)) {
                newTerms.add(term);
                for (int k = 0, count = documentTerms.count(term); k < count; k++) {
                    remainingOccurrences.add(term);
                }
            }
        }

        printHeader("Terms remaining in insertion model:");
        System.out.println(Arrays.toString(remainingOccurrences.toArray(new String[0])));

        termCollection.addAll(newTerms);
        amountTerms = termCollection.size();

        printHeader("New term collection:");
        for (String term : termCollection) {
            System.out.println(term);
        }

        printHeader("New term collection TD Matrix:");
        for (String term : tdMatrix.getTermArray()) {
            System.out.println(term);
        }

        //  Add one row for each new term and add the corresponding Term-Document Matrix entries
        double[] newTermsFrequencies = new double[newTerms.size()];
        for (int i = 0; i < newTermsFrequencies.length; i++) {
            newTermsFrequencies[i] = documentTerms.count(newTerms.get(i));
        }

        printHeader("New term frequencies:");
        System.out.println(Arrays.toString(newTermsFrequencies));

        int columns = tdMatrix.getColumnNumber();
        for (double frequency : newTermsFrequencies) {
            // A new term occurs only in the inserted model (last column); all other entries stay 0.
            double[] newRow = new double[columns];
            newRow[columns - 1] = frequency;
            tdMatrix.addRow(newRow);
        }

        // Update term list of TDMatrix object
        tdMatrix.setTermArray(termCollection.toArray(new String[0]));

        printTDMatrix("Final TD Matrix:");
    }

    /**
     * Delete a model from the Term-Document Matrix search structure. The column of the model is removed from the
     * matrix, the model is removed from the file list and the document list, and terms that no longer occur in
     * any model are removed as well. If the model is not part of the file list, an error is printed and the
     * collection is left unchanged.
     *
     * @param modelName The model name of the model to be removed.
     * @throws IllegalStateException if no Term-Document Matrix has been generated or loaded yet
     */
    public void deleteModel(String modelName) {
        requireTDMatrix();

        // Make sure file name is correct
        if (!modelName.endsWith(PNML_EXTENSION)) {
            modelName = modelName + PNML_EXTENSION;
        }

        // Without this check an unknown model would silently drop the first entry of the file list.
        int deletionIndex = Arrays.asList(fileList).indexOf(modelName);
        if (deletionIndex < 0) {
            System.out.println("Error: model " + modelName + " is not part of the document collection");
            return;
        }

        // Delete column from TD Matrix
        tdMatrix.deleteColumn(deletionIndex);

        // Delete model name from fileList (update to new file list).
        String[] newFileList = new String[fileList.length - 1];
        System.arraycopy(fileList, 0, newFileList, 0, deletionIndex);
        System.arraycopy(fileList, deletionIndex + 1, newFileList, deletionIndex, fileList.length - deletionIndex - 1);
        setFileList(newFileList);
        if (documentNumber > 0) {
            documentNumber--;
        }

        // Delete LS3Document representation of file "modelName". The file names are compared as a whole,
        // because a suffix comparison would also match e.g. "ab.pnml" when "b.pnml" is deleted.
        for (Iterator<LS3Document> documents = ls3Documents.iterator(); documents.hasNext();) {
            String documentName = new File(documents.next().getPNMLPath()).getName();
            if (documentName.equals(modelName)) {
                documents.remove();
                break;
            }
        }

        // Delete term rows that only contain values 0.0. I.e. delete unnecessary terms.
        LinkedHashSet<Integer> termDeletionIndices = new LinkedHashSet<Integer>();
        double[][] matrix = tdMatrix.getMatrix();
        int columns = tdMatrix.getColumnNumber();
        for (int i = 0, rows = tdMatrix.getRowNumber(); i < rows; i++) {
            if (isZeroRow(matrix[i], columns)) {
                termDeletionIndices.add(i);
            }
        }

        // The indices are in ascending order; every deletion shifts the following rows up by one.
        int deletionCounter = 0;
        for (int index : termDeletionIndices) {
            tdMatrix.deleteRow(index - deletionCounter);
            deletionCounter++;
        }

        // Update term list of document collection.
        LinkedHashSet<String> newTermList = new LinkedHashSet<String>();
        int termIndex = 0;
        for (String term : termCollection) {
            if (!termDeletionIndices.contains(termIndex)) {
                newTermList.add(term);
            }
            termIndex++;
        }

        setTermCollection(newTermList);

        // Update term list of TDMatrix object
        tdMatrix.setTermArray(termCollection.toArray(new String[0]));
    }

    /**
     * Checks whether the first entries of a matrix row are all 0.0.
     *
     * @param row The matrix row
     * @param columns The number of entries to check
     * @return true if none of the checked entries differs from 0.0
     */
    private static boolean isZeroRow(double[] row, int columns) {
        for (int j = 0; j < columns; j++) {
            if (row[j] != 0.0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the Term-Document Matrix or fails with a descriptive message if it does not exist yet.
     *
     * @return the Term-Document Matrix of this collection
     * @throws IllegalStateException if no Term-Document Matrix has been generated or loaded yet
     */
    private TDMatrix requireTDMatrix() {
        if (tdMatrix == null) {
            throw new IllegalStateException(
                    "No Term-Document Matrix available: call generateTDMatrix() or loadTDMatrix() first");
        }
        return tdMatrix;
    }

    /**
     * Prints a title framed by separator lines on the standard output stream.
     *
     * @param title The title to print
     */
    private static void printHeader(String title) {
        System.out.println(CONSOLE_RULE);
        System.out.println(title);
        System.out.println(CONSOLE_RULE);
    }

    /**
     * Prints the current Term-Document Matrix with one term per row on the standard output stream.
     *
     * @param title The title printed above the matrix
     */
    private void printTDMatrix(String title) {
        printHeader(title);
        double[][] matrix = tdMatrix.getMatrix();
        String[] terms = tdMatrix.getTermArray();
        int columns = tdMatrix.getColumnNumber();
        for (int i = 0, rows = tdMatrix.getRowNumber(); i < rows; i++) {
            System.out.print(terms[i] + " ");
            for (int j = 0; j < columns; j++) {
                System.out.print(" " + matrix[i][j] + " ");
            }
            System.out.println("");
        }
        System.out.println(CONSOLE_MATRIX_END);
    }

    /**
     * @return the absolute path to the directory containing the PNML files
     */
    public String getPnmlPath() {
        return pnmlPath;
    }

    void setPnmlPath(String pnmlPath) {
        this.pnmlPath = pnmlPath;
    }

    /**
     * @return the names of the PNML files of this collection (the internal array, not a copy)
     */
    public String[] getFileList() {
        return fileList;
    }

    void setFileList(String[] fileList) {
        this.fileList = fileList;
    }

    /**
     * @return the amount of documents in this document collection
     */
    public int getDocumentNumber() {
        return documentNumber;
    }

    void setDocumentNumber(int documentNumber) {
        this.documentNumber = documentNumber;
    }

    /**
     * @return the LS3Documents of this collection (the internal list, not a copy)
     */
    public ArrayList<LS3Document> getDocuments() {
        return ls3Documents;
    }

    void addDocument(LS3Document ls3Document) {
        this.ls3Documents.add(ls3Document);
    }

    /**
     * @return the distinct terms of this collection in insertion order (the internal set, not a copy)
     */
    public LinkedHashSet<String> getTermCollection() {
        return termCollection;
    }

    /**
     * Replaces the term collection and updates the amount of terms accordingly.
     *
     * @param termCollection The new term collection
     */
    public void setTermCollection(LinkedHashSet<String> termCollection) {
        this.termCollection = termCollection;
        this.amountTerms = termCollection != null ? termCollection.size() : 0;
    }

    void addTerm(String term) {
        this.termCollection.add(term);
        amountTerms = termCollection.size();
    }

    /**
     * @return the amount of distinct terms in the whole document collection
     */
    public int getAmountTerms() {
        return amountTerms;
    }

    /**
     * @return the Term-Document Matrix, or null if it has neither been generated nor loaded yet
     */
    public TDMatrix getTDMatrix() {
        return tdMatrix;
    }

}