won.preprocessing.GateRESCALProcessing.java Source code

Introduction

Here is the source code for won.preprocessing.GateRESCALProcessing.java
Source

/*
 * Copyright 2012  Research Studios Austria Forschungsges.m.b.H.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

package won.preprocessing;

import gate.*;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.GateException;
import gate.util.persistence.PersistenceManager;
import org.apache.commons.io.FilenameUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * User: hfriedrich
 * Date: 01.07.2014
 *
 * Creates the data (a binary tensor) for the RESCAL algorithm from a corpus of text files that are processed by Gate.
 */
public class GateRESCALProcessing {
    private static final Logger logger = LoggerFactory.getLogger(GateRESCALProcessing.class);

    /**
     * Gate annotation types that are evaluated by this class. Every constant carries the name of the
     * annotation (token) it stands for and the name of the annotation feature that holds its value.
     */
    public enum AnnotationType {
        TOPIC("TopicToken", "string"), // use "stem" here as second argument for stemmed tokens
        DESCRIPTION("DescriptionToken", "string"), // use "stem" here as second argument for stemmed tokens
        CLASSIFICATION("NeedClassificationToken", "kind");

        // Enum constants are shared singletons, so their state must never change after construction.
        private final String tokenName;
        private final String featureName;

        AnnotationType(String token, String feature) {
            tokenName = token;
            featureName = feature;
        }

        /**
         * @return name of the annotation this type is matched against
         */
        public String getTokenName() {
            return tokenName;
        }

        /**
         * @return name of the annotation feature that holds the value of this type
         */
        public String getFeatureName() {
            return featureName;
        }

        /**
         * Looks up the annotation type that belongs to an annotation (token) name.
         *
         * @param token annotation name to look up, may be {@code null}
         * @return the matching type or {@code null} if no type uses that name
         */
        public static AnnotationType getTypeByToken(String token) {
            for (AnnotationType type : AnnotationType.values()) {
                if (type.getTokenName().equals(token)) {
                    return type;
                }
            }
            return null;
        }
    }

    // All fields are assigned exactly once in the constructor, hence final.

    // Base folder handed to the constructor; it is stored but not read anywhere in this class.
    private final String baseFolder;
    // If true, description tokens are added to the matching data as well.
    private final boolean createContentSlice;
    // If true, the "stem" feature of a token is used instead of the feature configured in AnnotationType.
    private final boolean useStemming;
    // Collects needs, attributes, need types and connections until the output is written.
    private final WonMatchingData matchingData;
    // Gate application loaded from file that annotates the corpus documents.
    private final CorpusController gateApplication;

    public GateRESCALProcessing(String gateAppPath, String baseFolder, boolean createContentSlice,
            boolean useStemming) throws GateException, IOException {
        matchingData = new WonMatchingData();
        this.baseFolder = baseFolder;
        this.createContentSlice = createContentSlice;
        this.useStemming = useStemming;

        // init Gate
        logger.info("Initialising Gate");
        Gate.init();

        // load Gate application
        logger.info("Loading Gate application: {}", gateAppPath);
        gateApplication = (CorpusController) PersistenceManager.loadObjectFromFile(new File(gateAppPath));
    }

    /**
     * Runs the Gate application on the files of a folder and adds the resulting annotations to the
     * matching data. The directory entries are processed in bulks of at most 1000 entries, each bulk in
     * its own transient corpus; directories and hidden files are skipped.
     *
     * @param corpusFolder corpus document input folder
     * @throws IOException                    if the folder cannot be listed or a document cannot be read
     * @throws ResourceInstantiationException if a Gate corpus or document cannot be created
     * @throws ExecutionException             if the Gate application fails
     */
    public void processFilesWithGate(String corpusFolder)
            throws IOException, ResourceInstantiationException, ExecutionException {

        final int maxFilesPerCorpus = 1000;
        int processedFiles = 0;

        logger.info("Gate processing and tensor creation of files from folder: {}", corpusFolder);

        // List the folder exactly once: File.listFiles() hits the file system on every call and gives no
        // ordering guarantee, so indexing into repeated listings could skip or duplicate files.
        File[] files = new File(corpusFolder).listFiles();
        if (files == null) {
            // listFiles() returns null if the path is not a directory or an I/O error occurs
            throw new IOException("Corpus folder is not a readable directory: " + corpusFolder);
        }

        for (int bulk = 0; bulk < files.length; bulk += maxFilesPerCorpus) {
            int bulkEnd = Math.min(bulk + maxFilesPerCorpus, files.length);
            Corpus corpus = Factory.newCorpus("Transient Gate Corpus");
            try {
                for (int i = bulk; i < bulkEnd; i++) {
                    File file = files[i];
                    if (!file.isDirectory() && !file.isHidden()) {
                        corpus.add(Factory.newDocument(file.toURI().toURL()));
                    }
                }
                gateApplication.setCorpus(corpus);
                gateApplication.execute();
                addDataFromProcessedCorpus(corpus);
                processedFiles += corpus.size();
            } finally {
                // all needed values have been copied into the matching data as strings by now
                releaseCorpus(corpus);
            }
            logger.info("{} files processed ...", processedFiles);
        }
    }

    /**
     * Releases a transient corpus and all of its documents. Resources created through the Gate Factory
     * stay registered with Gate until they are deleted explicitly, so without this call every processed
     * bulk would be kept in memory.
     */
    private static void releaseCorpus(Corpus corpus) {
        // iterate over a snapshot because deleting a document may remove it from the corpus
        for (Object document : corpus.toArray()) {
            Factory.deleteResource((Resource) document);
        }
        Factory.deleteResource(corpus);
    }

    /**
     * Writes the XML representation of every document of a corpus to its own file in the given folder.
     * The file name is the URL-decoded base name of the document source URL plus ".xml".
     *
     * @param corpus corpus whose documents are saved
     * @param folder output folder, created if it does not exist
     * @throws IOException if a file cannot be written
     */
    private static void saveXMLDocumentAnnotations(Corpus corpus, String folder) throws IOException {

        logger.info("Saving XML gate annotation files to folder: {}", folder);
        File outFolder = new File(folder);
        outFolder.mkdirs();
        for (Object element : corpus) {
            Document currDoc = (Document) element;
            String xmlDocument = currDoc.toXml();
            String fileName = java.net.URLDecoder
                    .decode(FilenameUtils.getBaseName(currDoc.getSourceUrl().getFile()), "UTF-8");
            String path = folder + "/" + fileName + ".xml";
            logger.debug("Saving XML gate annotation file: {}", path);
            // NOTE(review): FileWriter uses the platform default charset - confirm that this matches the
            // encoding declared in the XML produced by toXml().
            FileWriter writer = new FileWriter(path);
            try {
                writer.write(xmlDocument);
            } finally {
                // close the file even if writing fails, otherwise the file handle leaks
                writer.close();
            }
        }
    }

    /**
     * Add a Gate-processed corpus to the class to generate output of it later by {@link #createRescalData(String)}.
     * The documents in the corpus must have been annotated by Gate with the annotations defined in
     * {@link AnnotationType}; annotations of any other type are ignored.
     * <ul>
     * <li>topic tokens are added as topic attributes of the need</li>
     * <li>description tokens are added as description attributes, but only if the content slice is enabled</li>
     * <li>classification tokens set the need type; only the values "offer" and "want" (case-insensitive)
     * are accepted, other values are logged as error and skipped</li>
     * </ul>
     *
     * @param corpus corpus that has been processed by the Gate application
     * @throws UnsupportedEncodingException declared by the need id creation
     */
    private void addDataFromProcessedCorpus(Corpus corpus) throws UnsupportedEncodingException {
        for (Object element : corpus) {
            Document currDoc = (Document) element;
            String needId = createNeedId(currDoc);

            for (Annotation annotation : currDoc.getAnnotations()) {

                AnnotationType type = AnnotationType.getTypeByToken(annotation.getType());
                if (type == null) {
                    // annotation is not relevant for the matching data
                    continue;
                }

                switch (type) {

                case TOPIC:
                    matchingData.addNeedAttribute(needId, getTokenFeatureValue(annotation, type),
                            WonMatchingData.AttributeType.TOPIC);
                    break;

                case DESCRIPTION:
                    if (createContentSlice) {
                        matchingData.addNeedAttribute(needId, getTokenFeatureValue(annotation, type),
                                WonMatchingData.AttributeType.DESCRIPTION);
                    }
                    break;

                case CLASSIFICATION: {
                    String kind = getFeatureValueFromAnnotation(annotation, type.getFeatureName());
                    WonMatchingData.NeedType needType = null;
                    if (kind.equalsIgnoreCase(WonMatchingData.NeedType.OFFER.name())) {
                        needType = WonMatchingData.NeedType.OFFER;
                    } else if (kind.equalsIgnoreCase(WonMatchingData.NeedType.WANT.name())) {
                        needType = WonMatchingData.NeedType.WANT;
                    }

                    if (needType == null) {
                        logger.error("Unknown feature value '{}' found in annotation '{}'",
                                type.getFeatureName() + "=" + kind, annotation.getType());
                    } else {
                        matchingData.addNeedType(needId, needType);
                    }
                    break;
                }

                default:
                    break;
                }
            }
        }
    }

    /**
     * Reads the value of a token annotation. Only the feature that is actually used is looked up: "stem"
     * if stemming is enabled, otherwise the feature configured for the annotation type. Looking up the
     * unused feature as well would report and fail on tokens that only lack that unused feature.
     */
    private String getTokenFeatureValue(Annotation annotation, AnnotationType type) {
        String featureName = useStemming ? "stem" : type.getFeatureName();
        return getFeatureValueFromAnnotation(annotation, featureName);
    }

    /**
     * Reads a feature of an annotation and returns its lower-cased string value.
     *
     * @param annotation annotation to read the feature from
     * @param feature    name of the feature
     * @return lower-cased feature value, never {@code null}
     * @throws IllegalStateException if the annotation does not have the feature
     */
    private String getFeatureValueFromAnnotation(Annotation annotation, String feature) {
        Object attrValue = annotation.getFeatures().get(feature);
        if (attrValue == null) {
            logger.error("Feature value '{}' not found in annotation '{}'", feature, annotation.getType());
            // fail with a meaningful message instead of an incidental NullPointerException
            throw new IllegalStateException(
                    "Feature value '" + feature + "' not found in annotation '" + annotation.getType() + "'");
        }
        // Locale.ROOT keeps the result independent of the default locale of the machine
        // (e.g. "I" is not lower-cased to "i" in a Turkish locale).
        return attrValue.toString().toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Add data about Need connections. Use filenames as names for needs, one need per line. Create a need connection
     * between a Need and all following Needs until empty line in text file. Lines are trimmed before they are
     * evaluated, so a line that only contains whitespace counts as empty line.
     *
     * @param connectionFile      path of the text file that describes the connections
     * @param ignoreNeedsNotFound if false, a connection that refers to an unknown need raises an exception
     * @throws Exception if the file cannot be read or a connection refers to an unknown need while
     *                   {@code ignoreNeedsNotFound} is false
     */
    public void addConnectionData(String connectionFile, boolean ignoreNeedsNotFound) throws Exception {

        logger.info("Create Need connection from input file: {}", connectionFile);
        // NOTE(review): FileReader decodes with the platform default charset - confirm that this matches
        // the encoding of the connection file.
        BufferedReader reader = new BufferedReader(new FileReader(connectionFile));
        try {
            List<String> needs = new LinkedList<String>();
            String line;

            while ((line = reader.readLine()) != null) {
                // trim before testing, otherwise a whitespace-only line would become a need named ""
                String need = line.trim();
                if (need.length() == 0) {
                    // add a connection between the first need and all following needs until empty line
                    addConnection(needs, ignoreNeedsNotFound);
                    needs = new LinkedList<String>();
                } else {
                    needs.add(need);
                }
            }
            // the last group does not have to be terminated by an empty line
            addConnection(needs, ignoreNeedsNotFound);
        } finally {
            // release the file handle even if reading or adding a connection fails
            reader.close();
        }
    }

    /**
     * Derives the need id of a document: the base name of the file part of its source URL, URL-decoded
     * as UTF-8.
     */
    private String createNeedId(Document doc) throws UnsupportedEncodingException {
        final String sourcePath = doc.getSourceUrl().getFile();
        final String encodedBaseName = FilenameUtils.getBaseName(sourcePath);
        return java.net.URLDecoder.decode(encodedBaseName, "UTF-8");
    }

    /**
     * Connects the first need of the list with every need that follows it. If one of the two needs of a
     * connection is not part of the matching data a warning is logged and, unless
     * {@code ignoreNeedsNotFound} is set, an exception is thrown. Lists with fewer than two entries add
     * nothing.
     */
    private void addConnection(List<String> needs, boolean ignoreNeedsNotFound) throws Exception {
        final Iterator<String> remaining = needs.iterator();
        if (!remaining.hasNext()) {
            return;
        }

        final String need1 = remaining.next();
        while (remaining.hasNext()) {
            final String need2 = remaining.next();
            final boolean bothKnown = matchingData.getNeeds().contains(need1)
                    && matchingData.getNeeds().contains(need2);
            if (!bothKnown) {
                logger.warn("add connection between new needs: \n{} \n{}", need1, need2);
                if (!ignoreNeedsNotFound) {
                    throw new Exception(
                            "No need found in input directory for connection specified in connection file:  \n"
                                    + need1 + "\n" + need2);
                }
            }
            matchingData.addNeedConnection(need1, need2);
        }
    }

    /**
     * Writes out the data collected so far (see {@link #processFilesWithGate(String)} and
     * {@link #addConnectionData(String, boolean)}) by delegating to {@code WonMatchingData.writeOutputFiles}.
     *
     * @param outputFolder data folder handed to the matching data as output location
     * @throws IOException propagated from writing the output files
     */
    public void createRescalData(String outputFolder) throws IOException {
        this.matchingData.writeOutputFiles(outputFolder);
    }

}