ie.deri.unlp.javaservices.topicextraction.topicextractor.gate.TopicExtractorGate.java Source code

Java tutorial

Introduction

Here is the source code for ie.deri.unlp.javaservices.topicextraction.topicextractor.gate.TopicExtractorGate.java

Source

/*
 * TopicExtractorGate.java, provides topic extraction as a Java program/API
 * Copyright (C) 2008  Alexander Schutz
 * National University of Ireland, Galway
 * Digital Enterprise Research Institute
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

package ie.deri.unlp.javaservices.topicextraction.topicextractor.gate;

import gate.Gate;
import gate.util.GateException;
import ie.deri.unlp.javaservices.topicextraction.data.DomainModel;
import ie.deri.unlp.javaservices.topicextraction.topicextractor.TopicBearer;
import ie.deri.unlp.javaservices.topicextraction.topicextractor.TopicExtractor;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/**
 * {@link TopicExtractor} implementation backed by GATE.
 *
 * <p>The class is a lazily created singleton. On first use it locates the {@code /gate}
 * resource directory on the classpath (unpacking it to the temp directory when running
 * from a JAR), initialises GATE, registers all known plugins and obtains the shared
 * {@link GateProcessorPool}.
 *
 * <p>{@link #getInstance()} and {@link #initGate()} are synchronized so that concurrent
 * first callers cannot create two instances or initialise GATE twice.
 */
public class TopicExtractorGate implements TopicExtractor {

    private static final Logger logger = Logger.getLogger(TopicExtractorGate.class);

    /** Classpath location of the GATE home directory. */
    private static final String GATE_CLASSPATH_RESOURCE = "/gate";

    private static TopicExtractorGate instance = null;

    private GateProcessorPool gateProcessorPool = null;
    private boolean gateInitialised = false;

    /**
     * Returns the shared extractor, creating it (and initialising GATE) on first call.
     *
     * @return the singleton instance
     * @throws GateException if GATE cannot be initialised
     */
    public static synchronized TopicExtractorGate getInstance() throws GateException {
        if (instance == null) {
            instance = new TopicExtractorGate();
        }
        return instance;
    }

    private TopicExtractorGate() throws GateException {
        initGate();
    }

    /**
     * Wraps the given text in a GATE document and extracts its topics.
     *
     * @param documentText text to analyse
     * @param domainModel domain model handed to the GATE processor
     * @return the extracted topics
     * @throws GateException if GATE initialisation, document creation or processing fails
     */
    public TopicBearer extractTopics(String documentText, DomainModel domainModel) throws GateException {
        initGate();
        // NOTE(review): the created document is never released here (Factory.deleteResource);
        // confirm whether GateProcessorConsumer takes care of that.
        gate.Document gateDocument = gate.Factory.newDocument(documentText);

        return getTopics(gateDocument, domainModel);
    }

    /**
     * Runs the document through a processor taken from the pool.
     *
     * @return the processing result, or {@code null} when {@code gateDocument} is {@code null}
     * @throws GateException if processing fails or the calling thread is interrupted
     */
    private TopicBearer getTopics(gate.Document gateDocument, DomainModel domainModel) throws GateException {
        TopicBearer response = null;
        if (gateDocument != null) {
            try {
                response = new GateProcessorConsumer(gateProcessorPool).process(gateDocument, domainModel);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers further up the stack can still observe it.
                Thread.currentThread().interrupt();
                throw new GateException(e);
            }
        }

        return response;
    }

    /**
     * Initialises GATE once; subsequent calls are no-ops.
     *
     * @throws GateException if the {@code /gate} resources are missing or GATE fails to start
     */
    public synchronized void initGate() throws GateException {
        if (gateInitialised) {
            return;
        }

        logger.info("Initializing GATE");

        URL gateResource = getClass().getResource(GATE_CLASSPATH_RESOURCE);
        if (gateResource == null) {
            // Fail with a descriptive error instead of a NullPointerException on toURI().
            throw new GateException("GATE resource directory '" + GATE_CLASSPATH_RESOURCE
                    + "' not found on the classpath");
        }

        try {
            URI url = gateResource.toURI();
            logger.info("URL to GATE resources: " + url);

            File gateHome;
            if (url.isOpaque()) {
                /*
                 * GATE only has an interface for File objects, which means the files have to be on the filesystem.
                 * If we're running from a JAR we first have to extract the gate/ folder from the JAR.
                 */

                logger.info("Unpacking GATE resources from JAR");
                String tempDirectoryPath = FileUtils.getTempDirectoryPath();
                //Delete any existing directory
                String gateResourceName = "gate";
                FileUtils.deleteDirectory(new File(FilenameUtils.concat(tempDirectoryPath, gateResourceName)));
                String gateHomePath = extractDirectoryFromClasspathJAR(getClass(), gateResourceName,
                        tempDirectoryPath);
                gateHome = new File(gateHomePath);

            } else {
                gateHome = new File(url);
            }

            Gate.setGateHome(gateHome);
            Gate.setSiteConfigFile(new File(gateHome, "gate-site.xml"));
            Gate.setUserConfigFile(new File(gateHome, "gate-user.xml"));

            Gate.setPluginsHome(new File(gateHome, "plugins/"));

            Gate.init();

            Iterator<URL> pluginItr = Gate.getKnownPlugins().iterator();
            while (pluginItr.hasNext()) {
                URL pluginURL = pluginItr.next();
                Gate.getCreoleRegister().registerDirectories(pluginURL);
            }

            logger.log(Level.INFO, "configuring GATE processor pool..");

            gateProcessorPool = GateProcessorPool.getInstance();

            // Only flag success once everything above has completed.
            gateInitialised = true;

        } catch (IOException | URISyntaxException | GateException exn) {
            throw new GateException("Exception while initializing GATE resources", exn);
        }
        logger.info("GATE Initialized");
    }

    /**
     * Extract a directory in a JAR on the classpath to an output folder.
     *
     * Note: User's responsibility to ensure that the files are actually in a JAR.
     *
     * @param classInJar A class in the JAR file which is on the classpath
     * @param resourceDirectory Path to resource directory in JAR
     * @param outputDirectory Directory to write to
     * @return String containing the path to the extracted directory below outputDirectory
     * @throws IOException if the JAR cannot be read or a file cannot be written
     */
    private static String extractDirectoryFromClasspathJAR(Class<?> classInJar, String resourceDirectory,
            String outputDirectory) throws IOException {

        // JAR entry names always use '/' as separator, independent of the platform,
        // so the prefix must not be built with File.separator.
        String entryPrefix = StringUtils.strip(resourceDirectory, "\\/") + "/";

        URL jar = classInJar.getProtectionDomain().getCodeSource().getLocation();
        File jarLocation;
        try {
            // toURI() decodes percent-escapes (e.g. %20), which URL.getFile() leaves in place.
            jarLocation = new File(jar.toURI());
        } catch (URISyntaxException | IllegalArgumentException e) {
            // Not a plain file: URI; fall back to the raw path.
            jarLocation = new File(jar.getFile());
        }

        byte[] buf = new byte[1024];
        try (JarFile jarFile = new JarFile(jarLocation)) {
            Enumeration<JarEntry> jarEntries = jarFile.entries();
            while (jarEntries.hasMoreElements()) {
                JarEntry jarEntry = jarEntries.nextElement();
                if (jarEntry.isDirectory() || !jarEntry.getName().startsWith(entryPrefix)) {
                    continue;
                }

                String outputFileName = FilenameUtils.concat(outputDirectory, jarEntry.getName());
                //Create directories if they don't exist
                new File(FilenameUtils.getFullPath(outputFileName)).mkdirs();

                //Write file; both streams are closed even when copying fails
                try (InputStream is = jarFile.getInputStream(jarEntry);
                        FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
                    int n;
                    while ((n = is.read(buf)) > -1) {
                        fileOutputStream.write(buf, 0, n);
                    }
                }
            }
        }

        // concat() normalises separators, so the result uses the platform separator.
        return FilenameUtils.concat(outputDirectory, entryPrefix);
    }
}