/*******************************************************************************
 * Copyright (c) 2013 American Institutes for Research
 *
 * This file is part of AIROSE.
 *
 * AIROSE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * AIROSE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with AIROSE. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.opentestsystem.airose.sspace;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.lang.NotImplementedException;
import org.apache.log4j.Logger;

import org.opentestsystem.airose.common.abstractdocument.AbstractDocument;
import org.opentestsystem.airose.common.abstractdocument.AbstractResource;
import org.opentestsystem.airose.common.abstractdocument.AbstractToken;
import org.opentestsystem.airose.common.config.ConfigurationFactory;
import org.opentestsystem.airose.common.config.ConfigurationFactory.ConfigurationType;
import org.opentestsystem.airose.common.config.TrainerConfiguration;
import org.opentestsystem.airose.common.config.UninitializedException;
import org.opentestsystem.airose.db.entities.DocumentQuality;
import org.opentestsystem.airose.docquality.DocumentQualityLoader;
import org.opentestsystem.airose.docquality.EnumDocumentQualityAttributes;
import org.opentestsystem.airose.docquality.processors.DocumentQualityProcessorNotImplementedException;
import org.opentestsystem.airose.document.DocumentFactory;
import org.opentestsystem.airose.languagetool.Mistake;
import org.opentestsystem.airose.linear.MatrixTypeEnum;
import org.opentestsystem.airose.utilities.LoggerUtil;

import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.common.GenericTermDocumentVectorSpace;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.lsa.LatentSemanticAnalysis;
import edu.ucla.sspace.matrix.LogEntropyTransform;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.MatrixBuilder;
import edu.ucla.sspace.matrix.MatrixFile;
import edu.ucla.sspace.matrix.SVD;
import edu.ucla.sspace.matrix.Transform;
import edu.ucla.sspace.util.ReflectionUtil;
import edu.ucla.sspace.util.SparseArray;
import edu.ucla.sspace.util.SparseDoubleArray;
import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.Vector;

/*
 * See edu.ucla.sspace.common.GenericTermDocumentVectorSpace in SemanticSpaces.
 * Also take a look at edu.ucla.sspace.lsa.LatentSemanticAnalysis.
 */
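/*
 * Illustrative usage (a sketch, not taken from the original source; the
 * TermDimensionMap, resources map, and DocumentQualityLoader instances are
 * assumed to be supplied by the caller):
 *
 *   TrainEssayScorerLSA lsa = new TrainEssayScorerLSA("essaySet-1", "model-1",
 *       "overall", termDimensionMap, resourcesMap, qualityLoader);
 *   lsa.setFileBeingCurrentlyProcessed("training.csv");
 *   lsa.processDocument(new BufferedReader(new FileReader("training.csv")));
 *   lsa.processSpace(); // transform, run the SVD, populate mUSV/mWordSpace
 *   org.opentestsystem.airose.linear.Matrix[] usv = lsa.getUSV();
 */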
public class TrainEssayScorerLSA implements SemanticSpace {

  /*
   * A map of resources used by the document processors.
   */
  private HashMap<String, AbstractResource> mResources = null;

  /**
   * The name prefix used with {@link #getSpaceName()}
   */
  private static final String LSA_SSPACE_NAME = "lsa-semantic-space";

  /*
   * Hack: the file currently being processed. Not thread-safe.
   */
  private String mFileBeingProcessed = null;

  /**
   * The document space of the term-document based word space, if the word
   * space is reduced. After reduction it is the right factor matrix of the
   * SVD of the word-document matrix. This matrix is only available after the
   * {@link #processSpace(Transform) processSpace} method has been called.
   */
  protected org.opentestsystem.airose.linear.Matrix mDocumentSpace;

  protected org.opentestsystem.airose.linear.Matrix[] mUSV = null;

  protected static Logger LOG = Logger.getLogger(TrainEssayScorerLSA.class.getName());

  protected TermDimensionMap mDimensionMapper = null;

  /**
   * The counter for recording the current number of documents observed.
   * Subclasses can use this for any reporting.
   */
  protected AtomicInteger mDocumentCounter;

  /**
   * The builder used to construct the term-document matrix as new documents
   * are processed.
   */
  protected MatrixBuilder mTermDocumentMatrixBuilder;

  /**
   * The word space of the term-document based word space model. If the word
   * space is reduced, it is the left factor matrix of the SVD of the
   * word-document matrix. This matrix is only available after the
   * {@link #processSpace(Transform) processSpace} method has been called.
   */
  protected org.opentestsystem.airose.linear.Matrix mWordSpace;

  protected String mEssaySet = null;
  protected String mModelId = null;
  protected String mScoreType = null;

  /*
   * The number of dimensions in the reduced semantic space.
   */
  protected int mDimensionInReducedSpace = -1;

  /*
   * We are going to be scanning the documents at this point, so we may as
   * well keep track of all the various mistakes we come across; we need them
   * for adjusting our document quality dimensions.
   */
  protected DocumentQualityLoader mDocumentQualityLoader = null;

  /**
   * Constructs the {@code TrainEssayScorerLSA}.
   *
   * @throws IOException
   *           if this instance encounters any errors when creating the
   *           backing array files required for processing
   */
  public TrainEssayScorerLSA(String essaySet, String modelId, String scoreType,
      TermDimensionMap termDimensionMap, HashMap<String, AbstractResource> resourcesMap,
      DocumentQualityLoader documentQualityAssessor) {
    this(essaySet, modelId, scoreType, termDimensionMap, Matrices.getMatrixBuilderForSVD());
    mResources = resourcesMap;
    mDocumentQualityLoader = documentQualityAssessor;
  }

  /*
   * Get the essay set id.
   */
  public String getEssaySetId() {
    return this.mEssaySet;
  }

  /*
   * Get the model name for this essay.
   */
  public String getModelId() {
    return this.mModelId;
  }

  /*
   * Get the score type. For running LSA, the score type may not matter;
   * however, various other document processors / document tokenizers may be
   * used depending on the score type.
   */
  public String getScoreType() {
    return this.mScoreType;
  }

  /**
   * {@inheritDoc}
   */
  public String getSpaceName() {
    return LSA_SSPACE_NAME;
  }

  /*
   * Get the SVD decomposition matrices.
   */
  public org.opentestsystem.airose.linear.Matrix[] getUSV() {
    return mUSV;
  }

  public String getFileBeingCurrentlyProcessed() {
    return mFileBeingProcessed;
  }

  public void setFileBeingCurrentlyProcessed(String fileName) {
    this.mFileBeingProcessed = fileName;
  }
  /*
   * Get the number of documents in the semantic space.
   */
  public int getDocuments() {
    if (mDocumentSpace == null)
      throw new IllegalArgumentException("The document space has not been retained or generated.");
    return mDocumentSpace.rows();
  }

  public int getNumberOfDimensionsInReducedSpace() throws LSANotInitializedException {
    if (mDimensionInReducedSpace == -1)
      throw new LSANotInitializedException();
    return mDimensionInReducedSpace;
  }

  /**
   * Returns the semantics of the document as represented by a numeric vector.
   * Note that document semantics may be represented in an entirely different
   * space, so the corresponding semantic dimensions in the word space will be
   * completely unrelated. However, document vectors may be compared to find
   * those documents with similar content.
   *
   * <p>
   * Similar to {@code getVector}, this method is only to be used after
   * {@code processSpace} has been called. By default, the document space is
   * not retained unless {@code retainDocumentSpace} is set to true.
   *
   * <p>
   * Implementation note: if a specific document ordering is needed, caution
   * should be used when using this class in a multi-threaded environment.
   * Because the document number is based on the order in which it was
   * <i>processed</i>, no guarantee is made that this will correspond with the
   * original document ordering as it exists in the corpus files. However, in
   * a single-threaded environment, the ordering will be preserved.
   *
   * @param documentNumber
   *          the number of the document according to when it was processed
   *
   * @return the semantics of the document in the document space.
   * @throws IllegalArgumentException
   *           if the document space was not retained or the document number
   *           is out of range.
   */
  public DoubleVector getDocumentVector(int documentNumber) {
    /*
     * TODO: Ask. The original implementation is retained here but disabled:
     *
     * if (mDocumentSpace == null)
     *   throw new IllegalArgumentException(
     *       "The document space has not been retained or generated.");
     *
     * if (documentNumber < 0 || documentNumber >= mDocumentSpace.rows()) {
     *   throw new IllegalArgumentException(
     *       "Document number is not within the bounds of the number of "
     *           + "documents: " + documentNumber);
     * }
     * return mDocumentSpace.getRowVector(documentNumber);
     */
    return null;
  }

  /*
   * (non-Javadoc)
   *
   * @see edu.ucla.sspace.common.SemanticSpace#processDocument(java.io.BufferedReader)
   *
   * This method expects a document where each line represents a human-assigned
   * score and a response separated by a comma; see the illustrative input
   * below. This is not necessarily the CSV format.
   *
   * At the end of this call it will close the file, so there is no need to
   * close it externally.
   */
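  /*
   * Illustrative input (an assumed example, not taken from the original
   * source): one "score,response" pair per line, e.g.
   *
   *   3,The water cycle begins when the sun heats water in the ocean...
   *   0,i dont no
   */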
  public void processDocument(BufferedReader document) throws IOException {
    String line = null;
    while ((line = document.readLine()) != null) {
      try {
        // Each line consists of a score and a response separated by a comma.
        // First process and tokenize the document.
        /*
         * Hack: the use of getFileBeingCurrentlyProcessed makes an instance
         * of this class non-thread-safe.
         */
        AbstractDocument doc = DocumentFactory.getDocument(getFileBeingCurrentlyProcessed(), line,
            getEssaySetId(), -1);

        // Preprocess.
        doc.invokeDocumentProcessors(mResources);
        // Tokenize.
        doc.invokeDocumentTokenizers();
        // Assign term weights.
        doc.assignTermWeights();
        // Record which document qualities we need to be cognizant of.
        addToDocumentQualityAttributes(doc);

        // Processing and tokenization succeeded; extract the tokens
        // one-by-one and insert them into the term-by-document matrix.
        Iterator<AbstractToken> tokens = doc.getListOfTokens().iterator();

        // Skip empty documents.
        if (!tokens.hasNext())
          continue;

        // We will have to walk through these tokens twice. Here is the
        // reason why: at the beginning we do not know how many unique words
        // we have seen so far across all the processing we have done.
        // During the first walk we add these tokens to the term-to-index
        // mapping so that we can keep track of how many unique tokens we
        // have seen so far.
        while (tokens.hasNext()) {
          AbstractToken token = tokens.next();
          // getDimension internally adds the token to the list of unique
          // words by assigning it a unique dimension.
          mDimensionMapper.getDimension(token.getToken());
        }

        // Now re-initialize the iterator.
        tokens = doc.getListOfTokens().iterator();

        // Increase the count of documents observed so far.
        int docCount = mDocumentCounter.getAndAdd(1);

        // Get the total number of terms encountered so far, including any
        // new unique terms found in the most recent document.
        int totalNumberOfUniqueWords = mDimensionMapper.getNumberOfDimensions();

        // Convert the map of counts to a SparseArray: the second walk fills
        // in this document's column of the term-document matrix.
        SparseArray<Double> documentColumn = new SparseDoubleArray(totalNumberOfUniqueWords);
        while (tokens.hasNext()) {
          AbstractToken token = tokens.next();
          documentColumn.set(mDimensionMapper.getDimension(token.getToken()), token.getWeight());
        }

        // Update the term-document matrix with the results of processing
        // the document.
        mTermDocumentMatrixBuilder.addColumn(documentColumn);

        if (docCount % 50 == 0)
          LoggerUtil.info(LOG, "Processed " + docCount + " documents.");
      } catch (Exception exp) {
        LoggerUtil.severe(LOG, "Exception invoking docProcessors/docTokenizers on \"%s\". Message: %s",
            line, exp.getMessage());
      }
    }
    // Close the document.
    document.close();
  }

  /**
   * {@inheritDoc}
   */
  public Set<String> getWords() {
    return mDimensionMapper.getWords();
  }

  /**
   * {@inheritDoc}
   */
  @SuppressWarnings("rawtypes")
  public Vector getVector(String word) {
    // Determine the index for the word.
    int index = mDimensionMapper.getDimension(word);
    return (index < 0) ? null : vectorConverter(mWordSpace.getRowVector(index));
  }

  /**
   * {@inheritDoc}
   */
  public int getVectorLength() {
    return mWordSpace.columns();
  }

  /*
   * (non-Javadoc)
   *
   * @see edu.ucla.sspace.common.SemanticSpace#processSpace(java.util.Properties)
   *
   * This method is not used. Call processSpace() instead.
   */
  public void processSpace(Properties properties) {
    throw new NotImplementedException();
  }

  /**
   * {@inheritDoc}
   *
   * See this class's {@link LatentSemanticAnalysis javadoc} for the full list
   * of supported properties.
   */
  public void processSpace() throws UninitializedException {
    // The default log-entropy transform damps frequent, low-information
    // terms before the SVD; see the background note near
    // processSpace(Transform) below.
    Transform transform = new LogEntropyTransform();

    // Get the number of dimensions we want to reduce the space to from the
    // configuration.
    int dimensions = 50; // default
    SVD.Algorithm alg = SVD.Algorithm.ANY;

    if (ConfigurationFactory.getConfigurationType() == ConfigurationType.TRAINER) {
      TrainerConfiguration configuration = (TrainerConfiguration) ConfigurationFactory.getConfiguration();
      dimensions = configuration.getNumberOfDimensions();
      String svdProp = configuration.getSVDAlgorithm();
      alg = (svdProp == null) ? SVD.Algorithm.ANY : SVD.Algorithm.valueOf(svdProp);
      String transformClass = configuration.getTransformClass();
      if (transformClass != null) {
        transform = ReflectionUtil.getObjectInstance(transformClass);
      }
    }

    try {
      MatrixFile processedSpace = processSpace(transform);
      LoggerUtil.info(LOG, "reducing to %d dimensions", dimensions);

      // Compute the SVD on the pre-processed matrix.
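      // Background (standard LSA, stated here for clarity rather than taken
      // from this source): the rank-k SVD factors the transformed
      // term-document matrix X (terms x documents) as X ~= U * S * V^T,
      // where U is terms x k, S is the k x k diagonal matrix of singular
      // values, and V is documents x k. Rows of U * S give term coordinates
      // in the reduced space; rows of V * S would give document coordinates.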
      @SuppressWarnings("deprecation")
      Matrix[] usv = SVD.svd(processedSpace.getFile(), alg, processedSpace.getFormat(), dimensions);

      mUSV = new org.opentestsystem.airose.linear.Matrix[] {
          convertTo(usv[0], MatrixTypeEnum.REAL2D),
          convertTo(usv[1], MatrixTypeEnum.DIAGONAL),
          convertTo(usv[2], MatrixTypeEnum.REAL2D) };

      // Load the left factor matrix, which is the word semantic space.
      mWordSpace = convertTo(usv[0], MatrixTypeEnum.REAL2D);

      // Weight the values in the word space by the singular values, i.e.
      // compute U * S.
      Matrix singularValues = usv[1];
      for (int r = 0; r < mWordSpace.rows(); ++r) {
        for (int c = 0; c < mWordSpace.columns(); ++c) {
          mWordSpace.set(r, c, mWordSpace.get(r, c) * singularValues.get(c, c));
        }
      }

      // Keep track of the number of dimensions in the reduced space.
      mDimensionInReducedSpace = dimensions;
    } catch (IOException ioe) {
      // Rethrow as an Error.
      throw new IOError(ioe);
    }
  }

  /*
   * Convert an S-Space matrix representation to our AIROSE matrix
   * representation.
   */
  protected org.opentestsystem.airose.linear.Matrix convertTo(Matrix m, MatrixTypeEnum type) {
    org.opentestsystem.airose.linear.Matrix output =
        org.opentestsystem.airose.linear.Matrices.create(m.rows(), m.columns(), type);
    for (int row = 0; row < m.rows(); ++row)
      for (int column = 0; column < m.columns(); ++column)
        output.set(row, column, m.get(row, column));
    return output;
  }

  /*
   * Convert a java.util.Vector into an edu.ucla.sspace.vector.Vector.
   */
  @SuppressWarnings("rawtypes")
  protected Vector vectorConverter(java.util.Vector v) {
    Vector outputVec = new DenseVector(v.size());
    for (int i = 0; i < v.size(); ++i)
      outputVec.set(i, (Number) v.get(i));
    return outputVec;
  }

  /**
   * Constructs the {@code TrainEssayScorerLSA} using the provided objects for
   * processing.
   *
   * @param essaySet
   *          The id of the essay set for which we are running the training.
   * @param termDimensionMap
   *          The {@link BasisMapping}-style mapper used to map strings to
   *          indices.
   * @param termDocumentMatrixBuilder
   *          The {@link MatrixBuilder} used to write document vectors to disk
   *          which later get processed in {@link #processSpace(Properties)
   *          processSpace}.
   *
   * @throws IOException
   *           if this instance encounters any errors when creating the
   *           backing array files required for processing
   */
  protected TrainEssayScorerLSA(String essaySet, String modelId, String scoreType,
      TermDimensionMap termDimensionMap, MatrixBuilder termDocumentMatrixBuilder) {
    mEssaySet = essaySet;
    mModelId = modelId;
    mScoreType = scoreType;
    mDimensionMapper = termDimensionMap;
    mDocumentCounter = new AtomicInteger(0);
    mTermDocumentMatrixBuilder = termDocumentMatrixBuilder;
    mWordSpace = null;
    mDocumentSpace = null;
  }
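  /*
   * Background on the default transform applied via processSpace(Transform)
   * below (standard log-entropy weighting, stated here for reference rather
   * than taken from this source): a raw count tf_ij of term i in document j
   * is rescaled to
   *
   *   w_ij = log(1 + tf_ij) * (1 + sum_j (p_ij * log(p_ij)) / log(n))
   *
   * where p_ij = tf_ij / gf_i and gf_i is the total count of term i over all
   * n documents. Terms spread evenly across documents (high entropy) are
   * damped; distinctive terms are emphasized before the SVD.
   */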
  /**
   * Processes the {@link GenericTermDocumentVectorSpace} with the provided
   * {@link Transform}, if it is not {@code null}, as a {@link MatrixFile}.
   * Otherwise, the raw term-document counts are returned. Subclasses must
   * call this in order to access the term-document counts before doing any
   * other processing.
   *
   * @param transform
   *          A matrix transform used to rescale the original raw document
   *          counts. If {@code null} no transform is done.
   */
  protected MatrixFile processSpace(Transform transform) throws IOException {
    // First ensure that we are no longer writing to the matrix.
    mTermDocumentMatrixBuilder.finish();

    // Get the finished matrix file from the builder.
    File termDocumentMatrix = mTermDocumentMatrixBuilder.getFile();

    // If a transform was specified, perform the matrix transform.
    if (transform != null) {
      LoggerUtil.info(LOG, "performing %s transform", transform);
      LoggerUtil.verbose(LOG, "stored term-document matrix in format %s at %s",
          mTermDocumentMatrixBuilder.getMatrixFormat(), termDocumentMatrix.getAbsolutePath());

      // Convert the raw term counts using the specified transform.
      termDocumentMatrix = transform.transform(termDocumentMatrix,
          mTermDocumentMatrixBuilder.getMatrixFormat());
      LoggerUtil.verbose(LOG, "transformed matrix to %s", termDocumentMatrix.getAbsolutePath());
    }

    return new MatrixFile(termDocumentMatrix, mTermDocumentMatrixBuilder.getMatrixFormat());
  }

  /**
   * Subclasses should override this method if they need to utilize a header
   * token for each document. Implementations of this method <b>must</b> be
   * thread-safe. The default action is a no-op.
   *
   * @param docIndex
   *          The document id assigned to the current document
   * @param header
   *          The name of the current document.
   */
  protected void handleDocumentHeader(int docIndex, String header) {
  }

  /*
   * Run through all the mistakes found in the document and record each one
   * as a document quality attribute.
   */
  private void addToDocumentQualityAttributes(AbstractDocument doc) {
    for (Mistake mistake : doc.getMistakes().getListOfMistakes()) {
      DocumentQuality quality = new DocumentQuality(-1,
          EnumDocumentQualityAttributes.mapMistakeTypeEnum(mistake.getMistakeType()), -1,
          mistake.getMistakeTypeSubCategory());
      try {
        mDocumentQualityLoader.addToDocumentQualityAttribute(quality);
      } catch (DocumentQualityProcessorNotImplementedException exp) {
        // This should never be thrown here, but log it anyway.
        LoggerUtil.warning(LOG, "Document quality type evaluator could not be found for type %s",
            mistake.getUniqueMistakeId());
      }
    }
  }
}