/*******************************************************************************
 * Copyright (c) 2013 American Institutes for Research
 *
 * This file is part of AIROSE.
 *
 * AIROSE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * AIROSE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with AIROSE. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.opentestsystem.airose.sspace;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.lang.NotImplementedException;
import org.apache.log4j.Logger;

import org.opentestsystem.airose.common.abstractdocument.AbstractDocument;
import org.opentestsystem.airose.common.abstractdocument.AbstractResource;
import org.opentestsystem.airose.common.abstractdocument.AbstractToken;
import org.opentestsystem.airose.common.config.ConfigurationFactory;
import org.opentestsystem.airose.common.config.ConfigurationFactory.ConfigurationType;
import org.opentestsystem.airose.common.config.TrainerConfiguration;
import org.opentestsystem.airose.common.config.UninitializedException;
import org.opentestsystem.airose.db.entities.DocumentQuality;
import org.opentestsystem.airose.docquality.DocumentQualityLoader;
import org.opentestsystem.airose.docquality.EnumDocumentQualityAttributes;
import org.opentestsystem.airose.docquality.processors.DocumentQualityProcessorNotImplementedException;
import org.opentestsystem.airose.document.DocumentFactory;
import org.opentestsystem.airose.languagetool.Mistake;
import org.opentestsystem.airose.linear.MatrixTypeEnum;
import org.opentestsystem.airose.utilities.LoggerUtil;

import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.common.GenericTermDocumentVectorSpace;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.lsa.LatentSemanticAnalysis;
import edu.ucla.sspace.matrix.LogEntropyTransform;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.MatrixBuilder;
import edu.ucla.sspace.matrix.MatrixFile;
import edu.ucla.sspace.matrix.SVD;
import edu.ucla.sspace.matrix.Transform;
import edu.ucla.sspace.util.ReflectionUtil;
import edu.ucla.sspace.util.SparseArray;
import edu.ucla.sspace.util.SparseDoubleArray;
import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.Vector;

/*
 * See edu.ucla.sspace.common.GenericTermDocumentVectorSpace in SemanticSpaces.
 * Also take a look at edu.ucla.sspace.lsa.LatentSemanticAnalysis.
 */
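/*
 * Illustrative usage (a sketch, not taken from the original source; the
 * TermDimensionMap, resources map, and DocumentQualityLoader instances are
 * assumed to be supplied by the caller):
 *
 *   TrainEssayScorerLSA lsa = new TrainEssayScorerLSA("essaySet-1", "model-1",
 *       "overall", termDimensionMap, resourcesMap, qualityLoader);
 *   lsa.setFileBeingCurrentlyProcessed("training.csv");
 *   lsa.processDocument(new BufferedReader(new FileReader("training.csv")));
 *   lsa.processSpace(); // transform, run the SVD, populate mUSV/mWordSpace
 *   org.opentestsystem.airose.linear.Matrix[] usv = lsa.getUSV();
 */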
public class TrainEssayScorerLSA implements SemanticSpace {

  /*
   * A map of resources used by the document processors.
   */
  private HashMap<String, AbstractResource> mResources = null;

  /**
   * The name prefix used with {@link #getSpaceName()}
   */
  private static final String LSA_SSPACE_NAME = "lsa-semantic-space";

  /*
   * Hack: the file currently being processed. Not thread-safe.
   */
  private String mFileBeingProcessed = null;

  /**
   * The document space of the term-document based word space, if the word
   * space is reduced. After reduction it is the right factor matrix of the
   * SVD of the word-document matrix. This matrix is only available after the
   * {@link #processSpace(Transform) processSpace} method has been called.
   */
  protected org.opentestsystem.airose.linear.Matrix mDocumentSpace;

  protected org.opentestsystem.airose.linear.Matrix[] mUSV = null;

  protected static Logger LOG = Logger.getLogger(TrainEssayScorerLSA.class.getName());

  protected TermDimensionMap mDimensionMapper = null;

  /**
   * The counter for recording the current number of documents observed.
   * Subclasses can use this for any reporting.
   */
  protected AtomicInteger mDocumentCounter;

  /**
   * The builder used to construct the term-document matrix as new documents
   * are processed.
   */
  protected MatrixBuilder mTermDocumentMatrixBuilder;

  /**
   * The word space of the term-document based word space model. If the word
   * space is reduced, it is the left factor matrix of the SVD of the
   * word-document matrix. This matrix is only available after the
   * {@link #processSpace(Transform) processSpace} method has been called.
   */
  protected org.opentestsystem.airose.linear.Matrix mWordSpace;

  protected String mEssaySet = null;
  protected String mModelId = null;
  protected String mScoreType = null;

  /*
   * The number of dimensions in the reduced semantic space.
   */
  protected int mDimensionInReducedSpace = -1;

  /*
   * We are going to be scanning the documents at this point, so we may as
   * well keep track of all the various mistakes we come across; we need them
   * for adjusting our document quality dimensions.
   */
  protected DocumentQualityLoader mDocumentQualityLoader = null;

  /**
   * Constructs the {@code TrainEssayScorerLSA}.
   *
   * @throws IOException
   *           if this instance encounters any errors when creating the
   *           backing array files required for processing
   */
  public TrainEssayScorerLSA(String essaySet, String modelId, String scoreType,
      TermDimensionMap termDimensionMap, HashMap<String, AbstractResource> resourcesMap,
      DocumentQualityLoader documentQualityAssessor) {
    this(essaySet, modelId, scoreType, termDimensionMap, Matrices.getMatrixBuilderForSVD());
    mResources = resourcesMap;
    mDocumentQualityLoader = documentQualityAssessor;
  }

  /*
   * Get the essay set id.
   */
  public String getEssaySetId() {
    return this.mEssaySet;
  }

  /*
   * Get the model name for this essay.
   */
  public String getModelId() {
    return this.mModelId;
  }

  /*
   * Get the score type. For running LSA, the score type may not matter;
   * however, various other document processors / document tokenizers may be
   * used depending on the score type.
   */
  public String getScoreType() {
    return this.mScoreType;
  }

  /**
   * {@inheritDoc}
   */
  public String getSpaceName() {
    return LSA_SSPACE_NAME;
  }

  /*
   * Get the SVD decomposition matrices.
   */
  public org.opentestsystem.airose.linear.Matrix[] getUSV() {
    return mUSV;
  }

  public String getFileBeingCurrentlyProcessed() {
    return mFileBeingProcessed;
  }

  public void setFileBeingCurrentlyProcessed(String fileName) {
    this.mFileBeingProcessed = fileName;
  }
  /*
   * Get the number of documents in the semantic space.
   */
  public int getDocuments() {
    if (mDocumentSpace == null)
      throw new IllegalArgumentException("The document space has not been retained or generated.");
    return mDocumentSpace.rows();
  }

  public int getNumberOfDimensionsInReducedSpace() throws LSANotInitializedException {
    if (mDimensionInReducedSpace == -1)
      throw new LSANotInitializedException();
    return mDimensionInReducedSpace;
  }

  /**
   * Returns the semantics of the document as represented by a numeric vector.
   * Note that document semantics may be represented in an entirely different
   * space, so the corresponding semantic dimensions in the word space will be
   * completely unrelated. However, document vectors may be compared to find
   * those documents with similar content.
   *
   * <p>
   * Similar to {@code getVector}, this method is only to be used after
   * {@code processSpace} has been called. By default, the document space is
   * not retained unless {@code retainDocumentSpace} is set to true.
   *
   * <p>
   * Implementation note: if a specific document ordering is needed, caution
   * should be used when using this class in a multi-threaded environment.
   * Because the document number is based on the order in which it was
   * <i>processed</i>, no guarantee is made that this will correspond with the
   * original document ordering as it exists in the corpus files. However, in
   * a single-threaded environment, the ordering will be preserved.
   *
   * @param documentNumber
   *          the number of the document according to when it was processed
   *
   * @return the semantics of the document in the document space.
   * @throws IllegalArgumentException
   *           if the document space was not retained or the document number
   *           is out of range.
   */
  public DoubleVector getDocumentVector(int documentNumber) {
    /*
     * TODO: Ask. The original implementation is retained here but disabled:
     *
     * if (mDocumentSpace == null)
     *   throw new IllegalArgumentException(
     *       "The document space has not been retained or generated.");
     *
     * if (documentNumber < 0 || documentNumber >= mDocumentSpace.rows()) {
     *   throw new IllegalArgumentException(
     *       "Document number is not within the bounds of the number of "
     *           + "documents: " + documentNumber);
     * }
     * return mDocumentSpace.getRowVector(documentNumber);
     */
    return null;
  }

  /*
   * (non-Javadoc)
   *
   * @see edu.ucla.sspace.common.SemanticSpace#processDocument(java.io.BufferedReader)
   *
   * This method expects a document where each line represents a human-assigned
   * score and a response separated by a comma; see the illustrative input
   * below. This is not necessarily the CSV format.
   *
   * At the end of this call it will close the file, so there is no need to
   * close it externally.
   */
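  /*
   * Illustrative input (an assumed example, not taken from the original
   * source): one "score,response" pair per line, e.g.
   *
   *   3,The water cycle begins when the sun heats water in the ocean...
   *   0,i dont no
   */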
  public void processDocument(BufferedReader document) throws IOException {
    String line = null;
    while ((line = document.readLine()) != null) {
      try {
        // Each line consists of a score and a response separated by a comma.
        // First process and tokenize the document.
        /*
         * Hack: the use of getFileBeingCurrentlyProcessed makes an instance
         * of this class non-thread-safe.
         */
        AbstractDocument doc = DocumentFactory.getDocument(getFileBeingCurrentlyProcessed(), line,
            getEssaySetId(), -1);

        // Preprocess.
        doc.invokeDocumentProcessors(mResources);
        // Tokenize.
        doc.invokeDocumentTokenizers();
        // Assign term weights.
        doc.assignTermWeights();
        // Record which document qualities we need to be cognizant of.
        addToDocumentQualityAttributes(doc);

        // Processing and tokenization succeeded; extract the tokens
        // one-by-one and insert them into the term-by-document matrix.
        Iterator<AbstractToken> tokens = doc.getListOfTokens().iterator();

        // Skip empty documents.
        if (!tokens.hasNext())
          continue;

        // We will have to walk through these tokens twice. Here is the
        // reason why: at the beginning we do not know how many unique words
        // we have seen so far across all the processing we have done.
        // During the first walk we add these tokens to the term-to-index
        // mapping so that we can keep track of how many unique tokens we
        // have seen so far.
        while (tokens.hasNext()) {
          AbstractToken token = tokens.next();
          // getDimension internally adds the token to the list of unique
          // words by assigning it a unique dimension.
          mDimensionMapper.getDimension(token.getToken());
        }

        // Now re-initialize the iterator.
        tokens = doc.getListOfTokens().iterator();

        // Increase the count of documents observed so far.
        int docCount = mDocumentCounter.getAndAdd(1);

        // Get the total number of terms encountered so far, including any
        // new unique terms found in the most recent document.
        int totalNumberOfUniqueWords = mDimensionMapper.getNumberOfDimensions();

        // Convert the map of counts to a SparseArray: the second walk fills
        // in this document's column of the term-document matrix.
        SparseArray<Double> documentColumn = new SparseDoubleArray(totalNumberOfUniqueWords);
        while (tokens.hasNext()) {
          AbstractToken token = tokens.next();
          documentColumn.set(mDimensionMapper.getDimension(token.getToken()), token.getWeight());
        }

        // Update the term-document matrix with the results of processing
        // the document.
        mTermDocumentMatrixBuilder.addColumn(documentColumn);

        if (docCount % 50 == 0)
          LoggerUtil.info(LOG, "Processed " + docCount + " documents.");
      } catch (Exception exp) {
        LoggerUtil.severe(LOG, "Exception invoking docProcessors/docTokenizers on \"%s\". Message: %s",
            line, exp.getMessage());
      }
    }
    // Close the document.
    document.close();
  }

  /**
   * {@inheritDoc}
   */
  public Set<String> getWords() {
    return mDimensionMapper.getWords();
  }

  /**
   * {@inheritDoc}
   */
  @SuppressWarnings("rawtypes")
  public Vector getVector(String word) {
    // Determine the index for the word.
    int index = mDimensionMapper.getDimension(word);
    return (index < 0) ? null : vectorConverter(mWordSpace.getRowVector(index));
  }

  /**
   * {@inheritDoc}
   */
  public int getVectorLength() {
    return mWordSpace.columns();
  }

  /*
   * (non-Javadoc)
   *
   * @see edu.ucla.sspace.common.SemanticSpace#processSpace(java.util.Properties)
   *
   * This method is not used. Call processSpace() instead.
   */
  public void processSpace(Properties properties) {
    throw new NotImplementedException();
  }

  /**
   * {@inheritDoc}
   *
   * See this class's {@link LatentSemanticAnalysis javadoc} for the full list
   * of supported properties.
   */
  public void processSpace() throws UninitializedException {
    // The default log-entropy transform damps frequent, low-information
    // terms before the SVD; see the background note near
    // processSpace(Transform) below.
    Transform transform = new LogEntropyTransform();

    // Get the number of dimensions we want to reduce the space to from the
    // configuration.
    int dimensions = 50; // default
    SVD.Algorithm alg = SVD.Algorithm.ANY;

    if (ConfigurationFactory.getConfigurationType() == ConfigurationType.TRAINER) {
      TrainerConfiguration configuration = (TrainerConfiguration) ConfigurationFactory.getConfiguration();
      dimensions = configuration.getNumberOfDimensions();
      String svdProp = configuration.getSVDAlgorithm();
      alg = (svdProp == null) ? SVD.Algorithm.ANY : SVD.Algorithm.valueOf(svdProp);
      String transformClass = configuration.getTransformClass();
      if (transformClass != null) {
        transform = ReflectionUtil.getObjectInstance(transformClass);
      }
    }

    try {
      MatrixFile processedSpace = processSpace(transform);
      LoggerUtil.info(LOG, "reducing to %d dimensions", dimensions);

      // Compute the SVD on the pre-processed matrix.
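      // Background (standard LSA, stated here for clarity rather than taken
      // from this source): the rank-k SVD factors the transformed
      // term-document matrix X (terms x documents) as X ~= U * S * V^T,
      // where U is terms x k, S is the k x k diagonal matrix of singular
      // values, and V is documents x k. Rows of U * S give term coordinates
      // in the reduced space; rows of V * S would give document coordinates.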
      @SuppressWarnings("deprecation")
      Matrix[] usv = SVD.svd(processedSpace.getFile(), alg, processedSpace.getFormat(), dimensions);

      mUSV = new org.opentestsystem.airose.linear.Matrix[] {
          convertTo(usv[0], MatrixTypeEnum.REAL2D),
          convertTo(usv[1], MatrixTypeEnum.DIAGONAL),
          convertTo(usv[2], MatrixTypeEnum.REAL2D) };

      // Load the left factor matrix, which is the word semantic space.
      mWordSpace = convertTo(usv[0], MatrixTypeEnum.REAL2D);

      // Weight the values in the word space by the singular values, i.e.
      // compute U * S.
      Matrix singularValues = usv[1];
      for (int r = 0; r < mWordSpace.rows(); ++r) {
        for (int c = 0; c < mWordSpace.columns(); ++c) {
          mWordSpace.set(r, c, mWordSpace.get(r, c) * singularValues.get(c, c));
        }
      }

      // Keep track of the number of dimensions in the reduced space.
      mDimensionInReducedSpace = dimensions;
    } catch (IOException ioe) {
      // Rethrow as an Error.
      throw new IOError(ioe);
    }
  }

  /*
   * Convert an S-Space matrix representation to our AIROSE matrix
   * representation.
   */
  protected org.opentestsystem.airose.linear.Matrix convertTo(Matrix m, MatrixTypeEnum type) {
    org.opentestsystem.airose.linear.Matrix output =
        org.opentestsystem.airose.linear.Matrices.create(m.rows(), m.columns(), type);
    for (int row = 0; row < m.rows(); ++row)
      for (int column = 0; column < m.columns(); ++column)
        output.set(row, column, m.get(row, column));
    return output;
  }

  /*
   * Convert a java.util.Vector into an edu.ucla.sspace.vector.Vector.
   */
  @SuppressWarnings("rawtypes")
  protected Vector vectorConverter(java.util.Vector v) {
    Vector outputVec = new DenseVector(v.size());
    for (int i = 0; i < v.size(); ++i)
      outputVec.set(i, (Number) v.get(i));
    return outputVec;
  }

  /**
   * Constructs the {@code TrainEssayScorerLSA} using the provided objects for
   * processing.
   *
   * @param essaySet
   *          The id of the essay set for which we are running the training.
   * @param termDimensionMap
   *          The {@link BasisMapping}-style mapper used to map strings to
   *          indices.
   * @param termDocumentMatrixBuilder
   *          The {@link MatrixBuilder} used to write document vectors to disk
   *          which later get processed in {@link #processSpace(Properties)
   *          processSpace}.
   *
   * @throws IOException
   *           if this instance encounters any errors when creating the
   *           backing array files required for processing
   */
  protected TrainEssayScorerLSA(String essaySet, String modelId, String scoreType,
      TermDimensionMap termDimensionMap, MatrixBuilder termDocumentMatrixBuilder) {
    mEssaySet = essaySet;
    mModelId = modelId;
    mScoreType = scoreType;
    mDimensionMapper = termDimensionMap;
    mDocumentCounter = new AtomicInteger(0);
    mTermDocumentMatrixBuilder = termDocumentMatrixBuilder;
    mWordSpace = null;
    mDocumentSpace = null;
  }
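  /*
   * Background on the default transform applied via processSpace(Transform)
   * below (standard log-entropy weighting, stated here for reference rather
   * than taken from this source): a raw count tf_ij of term i in document j
   * is rescaled to
   *
   *   w_ij = log(1 + tf_ij) * (1 + sum_j (p_ij * log(p_ij)) / log(n))
   *
   * where p_ij = tf_ij / gf_i and gf_i is the total count of term i over all
   * n documents. Terms spread evenly across documents (high entropy) are
   * damped; distinctive terms are emphasized before the SVD.
   */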
  /**
   * Processes the {@link GenericTermDocumentVectorSpace} with the provided
   * {@link Transform}, if it is not {@code null}, as a {@link MatrixFile}.
   * Otherwise, the raw term-document counts are returned. Subclasses must
   * call this in order to access the term-document counts before doing any
   * other processing.
   *
   * @param transform
   *          A matrix transform used to rescale the original raw document
   *          counts. If {@code null} no transform is done.
   */
  protected MatrixFile processSpace(Transform transform) throws IOException {
    // First ensure that we are no longer writing to the matrix.
    mTermDocumentMatrixBuilder.finish();

    // Get the finished matrix file from the builder.
    File termDocumentMatrix = mTermDocumentMatrixBuilder.getFile();

    // If a transform was specified, perform the matrix transform.
    if (transform != null) {
      LoggerUtil.info(LOG, "performing %s transform", transform);
      LoggerUtil.verbose(LOG, "stored term-document matrix in format %s at %s",
          mTermDocumentMatrixBuilder.getMatrixFormat(), termDocumentMatrix.getAbsolutePath());

      // Convert the raw term counts using the specified transform.
      termDocumentMatrix = transform.transform(termDocumentMatrix,
          mTermDocumentMatrixBuilder.getMatrixFormat());
      LoggerUtil.verbose(LOG, "transformed matrix to %s", termDocumentMatrix.getAbsolutePath());
    }

    return new MatrixFile(termDocumentMatrix, mTermDocumentMatrixBuilder.getMatrixFormat());
  }

  /**
   * Subclasses should override this method if they need to utilize a header
   * token for each document. Implementations of this method <b>must</b> be
   * thread-safe. The default action is a no-op.
   *
   * @param docIndex
   *          The document id assigned to the current document
   * @param header
   *          The name of the current document.
   */
  protected void handleDocumentHeader(int docIndex, String header) {
  }

  /*
   * Run through all the mistakes found in the document and record each one
   * as a document quality attribute.
   */
  private void addToDocumentQualityAttributes(AbstractDocument doc) {
    for (Mistake mistake : doc.getMistakes().getListOfMistakes()) {
      DocumentQuality quality = new DocumentQuality(-1,
          EnumDocumentQualityAttributes.mapMistakeTypeEnum(mistake.getMistakeType()), -1,
          mistake.getMistakeTypeSubCategory());
      try {
        mDocumentQualityLoader.addToDocumentQualityAttribute(quality);
      } catch (DocumentQualityProcessorNotImplementedException exp) {
        // This should never be thrown here, but log it anyway.
        LoggerUtil.warning(LOG, "Document quality type evaluator could not be found for type %s",
            mistake.getUniqueMistakeId());
      }
    }
  }
}