gate.creole.kea.CorpusImporter.java Source code

Java tutorial

Introduction

Here is the source code for gate.creole.kea.CorpusImporter.java

Source

/*
 *    CorpusImporter.java
 *
 *    Copyright (c) 1998-2005, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 3/Feb/2005
 *
 *  $Id: CorpusImporter.java 7479 2006-06-29 19:21:29 +0000 (Thu, 29 Jun 2006) valyt $
 */

package gate.creole.kea;

import gate.AnnotationSet;
import gate.Corpus;
import gate.Document;
import gate.Executable;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.AbstractVisualResource;
import gate.creole.ExecutionException;
import gate.gui.MainFrame;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.ExtensionFileFilter;

import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.AbstractAction;
import javax.swing.BorderFactory;
import javax.swing.Box;
import javax.swing.JButton;
import javax.swing.JFileChooser;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JTextField;

import org.apache.commons.io.IOUtils;

/**
 * A simple utility to import KEA style corpora into GATE.
 * It is registered as a GATE viewer for the KEA Processing Resource and it is
 * available from the main pane.
 */
@SuppressWarnings("serial")
public class CorpusImporter extends AbstractVisualResource {

    private static final long serialVersionUID = 5318018411995211792L;

    public CorpusImporter() {
    }

    /**
     * Does nothing as this is not really a viewer.
     */
    public void setTarget(Object target) {
    }

    public Resource init() throws gate.creole.ResourceInstantiationException {
        super.init();
        initLocalData();
        initGUIComponents();
        initListeners();
        return this;
    }

    protected void initLocalData() {
    }

    protected void initGUIComponents() {
        setLayout(new GridBagLayout());
        GridBagConstraints constraints = new GridBagConstraints();
        constraints.gridy = 0;
        constraints.gridx = GridBagConstraints.RELATIVE;
        constraints.weightx = 0;
        constraints.weighty = 0;
        constraints.fill = GridBagConstraints.BOTH;

        JPanel inputPanel = new JPanel(new GridBagLayout());
        inputPanel.setBorder(BorderFactory.createTitledBorder("Input"));
        GridBagConstraints constraints2 = new GridBagConstraints();
        constraints2.gridy = 0;
        constraints2.gridx = GridBagConstraints.RELATIVE;
        constraints2.weighty = 0;
        constraints2.weightx = 0;
        constraints2.fill = GridBagConstraints.BOTH;
        constraints2.insets = new Insets(2, 2, 2, 2);

        JLabel label = new JLabel("Source directory:");
        inputPanel.add(label, constraints2);
        sourceDirTField = new JTextField(30);
        inputPanel.add(sourceDirTField, constraints2);
        JButton openButton = new JButton(new SelectDirectoryAction());
        inputPanel.add(openButton, constraints2);

        constraints2.gridy = 1;
        label = new JLabel("Extension for text files:");
        inputPanel.add(label, constraints2);
        constraints2.gridwidth = 2;
        textExtensionTField = new JTextField(".txt");
        inputPanel.add(textExtensionTField, constraints2);
        constraints2.gridwidth = 1;

        constraints2.gridy = 2;
        label = new JLabel("Extension for keyphrase files:");
        inputPanel.add(label, constraints2);
        constraints2.gridwidth = 2;
        keyExtensionTField = new JTextField(".key");
        inputPanel.add(keyExtensionTField, constraints2);
        constraints2.gridwidth = 1;

        constraints2.gridy = 3;
        label = new JLabel("Encoding for input files:");
        inputPanel.add(label, constraints2);
        constraints2.gridwidth = 2;
        encodingTField = new JTextField("");
        inputPanel.add(encodingTField, constraints2);
        constraints2.gridwidth = 1;

        add(inputPanel, constraints);
        constraints.weightx = 1;
        add(Box.createHorizontalGlue(), constraints);
        constraints.weightx = 0;

        JPanel outputPanel = new JPanel();
        outputPanel.setLayout(new GridBagLayout());
        outputPanel.setBorder(BorderFactory.createTitledBorder("Output"));

        constraints2.gridy = 0;
        label = new JLabel("Corpus name:");
        outputPanel.add(label, constraints2);
        constraints2.weightx = 1;
        corpusNameTField = new JTextField("KEA Corpus");
        constraints2.weightx = 0;
        outputPanel.add(corpusNameTField, constraints2);

        constraints2.gridy = 1;
        label = new JLabel("Output annotation set:");
        outputPanel.add(label, constraints2);
        constraints2.weightx = 1;
        annotationSetTField = new JTextField("Key");
        constraints2.weightx = 0;
        outputPanel.add(annotationSetTField, constraints2);

        constraints2.gridy = 2;
        label = new JLabel("Keyphrase annotation type:");
        outputPanel.add(label, constraints2);
        constraints2.weightx = 1;
        annotationTypeTField = new JTextField("Keyphrase");
        constraints2.weightx = 0;
        outputPanel.add(annotationTypeTField, constraints2);

        constraints.gridy = 1;
        add(outputPanel, constraints);

        constraints.gridy = 2;
        constraints.fill = GridBagConstraints.NONE;
        constraints.anchor = GridBagConstraints.CENTER;
        add(new JButton(new ImportCorpusAction()), constraints);

        constraints.gridy = 3;
        constraints.weighty = 1;
        add(Box.createVerticalGlue(), constraints);
    }

    protected void initListeners() {
    }

    protected class SelectDirectoryAction extends AbstractAction {
        public SelectDirectoryAction() {
            super("");
            putValue(SHORT_DESCRIPTION, "Opens a file chooser");
            putValue(SMALL_ICON, MainFrame.getIcon("open-file"));
        }

        public void actionPerformed(ActionEvent evt) {
            JFileChooser chooser = MainFrame.getFileChooser();
            chooser.setMultiSelectionEnabled(false);
            chooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY);
            if (chooser.showOpenDialog(CorpusImporter.this) == JFileChooser.APPROVE_OPTION) {
                try {
                    sourceDirTField.setText(chooser.getSelectedFile().getCanonicalPath());
                } catch (IOException ioe) {
                    JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + ioe.toString(), "Gate",
                            JOptionPane.ERROR_MESSAGE);
                    ioe.printStackTrace(Err.getPrintWriter());
                }
            }
        }
    }

    protected class ImportCorpusAction extends AbstractAction {
        public ImportCorpusAction() {
            super("Import!");
            putValue(SHORT_DESCRIPTION, "Creates a new GATE corpus");
        }

        public void actionPerformed(ActionEvent evt) {
            Executable executable = new Executable() {
                public void execute() {
                    interrupted = false;
                    try {
                        MainFrame.lockGUI("Importing...");

                        File sourceDir = new File(sourceDirTField.getText());
                        if (sourceDir.isDirectory()) {
                            //create a new corpus
                            Corpus corpus = Factory.newCorpus(corpusNameTField.getText());
                            String textExt = textExtensionTField.getText();
                            String keyExt = keyExtensionTField.getText();
                            String encoding = encodingTField.getText();

                            //get the text files
                            ExtensionFileFilter filter = new ExtensionFileFilter();
                            filter.addExtension(textExt);
                            File[] textFiles = sourceDir.listFiles(filter);
                            if (textFiles != null) {
                                for (int i = 0; i < textFiles.length; i++) {
                                    if (textFiles[i].isDirectory())
                                        continue;
                                    FeatureMap params = Factory.newFeatureMap();
                                    params.put("sourceUrl", textFiles[i].toURI().toURL());
                                    if (encoding != null && encoding.length() > 0)
                                        params.put("encoding", encoding);
                                    Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl",
                                            params, null, textFiles[i].getName());
                                    corpus.add(doc);
                                    //locate the key file
                                    String fileName = textFiles[i].getCanonicalPath();
                                    int pos = fileName.lastIndexOf(textExt);
                                    fileName = fileName.substring(0, pos) + keyExt;
                                    List<String> keys = new ArrayList<String>();
                                    Exception e = null;
                                    BufferedReader reader = null;
                                    try {

                                        if (encoding != null && encoding.length() > 0) {
                                            reader = new BomStrippingInputStreamReader(
                                                    new FileInputStream(fileName), encoding);
                                        } else {
                                            reader = new BufferedReader(new FileReader(fileName));
                                        }

                                        String line = reader.readLine();
                                        while (line != null) {
                                            keys.add(line);
                                            line = reader.readLine();
                                        }
                                    } catch (IOException ioe) {
                                        e = ioe;
                                    } finally {
                                        IOUtils.closeQuietly(reader);
                                    }
                                    if (keys.isEmpty() || e != null) {
                                        MainFrame.unlockGUI();
                                        int res = JOptionPane.showOptionDialog(CorpusImporter.this,
                                                "There were problems obtainig the keyphrases for "
                                                        + textFiles[i].getName() + "!",
                                                "Gate", JOptionPane.YES_NO_OPTION, JOptionPane.ERROR_MESSAGE, null,
                                                new String[] { "Continue", "Stop importing" }, "Continue");
                                        if (e != null)
                                            e.printStackTrace(Err.getPrintWriter());
                                        if (res == JOptionPane.NO_OPTION) {
                                            interrupt();
                                        } else {
                                            //the user wants to continue: lock the GUI
                                            MainFrame.lockGUI("Importing...");
                                        }
                                        Factory.deleteResource(doc);
                                    } else {
                                        //we found some keys
                                        if (!annotateKeyPhrases(doc, annotationSetTField.getText(),
                                                annotationTypeTField.getText(), keys)) {
                                            //none of the keys were present in the document
                                            MainFrame.unlockGUI();
                                            int res = JOptionPane.showOptionDialog(CorpusImporter.this,
                                                    "None of the keyphrases were present in "
                                                            + textFiles[i].getName(),
                                                    "Gate", JOptionPane.YES_NO_CANCEL_OPTION,
                                                    JOptionPane.ERROR_MESSAGE, null, new String[] { "Keep document",
                                                            "Discard document", "Stop importing" },
                                                    "Discard document");
                                            if (res == JOptionPane.CANCEL_OPTION) {
                                                //the user gave up
                                                interrupt();
                                            } else {
                                                MainFrame.lockGUI("Importing...");
                                                if (res == JOptionPane.NO_OPTION) {
                                                    Factory.deleteResource(doc);
                                                }
                                            }

                                        }
                                    }

                                    if (isInterrupted()) {
                                        MainFrame.unlockGUI();
                                        return;
                                    }
                                }
                            }

                        } else {
                            throw new Exception(sourceDir.getCanonicalPath() + " is not a directory!");
                        }
                    } catch (Exception e) {
                        MainFrame.unlockGUI();
                        JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + e.toString(), "Gate",
                                JOptionPane.ERROR_MESSAGE);
                        e.printStackTrace(Err.getPrintWriter());
                    } finally {
                        MainFrame.unlockGUI();
                        Gate.setExecutable(null);
                    }
                }

                /**
                 * Checks whether this executable has been interrupted since the last
                 * time its {@link execute()} method was called.
                 */
                public synchronized boolean isInterrupted() {
                    return interrupted;
                }

                /**
                 * Notifies this executable that it should stop its execution as soon
                 * as possible.
                 */
                public synchronized void interrupt() {
                    interrupted = true;
                }

                protected boolean interrupted = false;
            };

            class Executor implements Runnable {
                public Executor(Executable executable) {
                    this.executable = executable;
                }

                public void run() {
                    try {
                        executable.execute();
                    } catch (ExecutionException ee) {
                        MainFrame.unlockGUI();
                        JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + ee.toString(), "Gate",
                                JOptionPane.ERROR_MESSAGE);
                        ee.printStackTrace(Err.getPrintWriter());
                    }
                }

                Executable executable;
            }
            ;

            Thread thread = new Thread(new Executor(executable));
            thread.setPriority(Thread.MIN_PRIORITY);
            Gate.setExecutable(executable);
            thread.start();
        }
    }

    protected boolean annotateKeyPhrases(Document document, String annSetName, String keyphraseAnnotationType,
            List<String> phrases) throws Exception {
        if (phrases == null || phrases.isEmpty())
            return false;
        //create a pattern
        String patternStr = "";
        Iterator<String> phraseIter = phrases.iterator();
        while (phraseIter.hasNext()) {
            String phrase = phraseIter.next();
            patternStr += patternStr.length() == 0 ? "\\Q" + phrase + "\\E" : "|\\Q" + phrase + "\\E";
        }

        Pattern pattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

        Matcher matcher = pattern.matcher(document.getContent().toString());
        AnnotationSet outputSet = annSetName == null || annSetName.length() == 0 ? document.getAnnotations()
                : document.getAnnotations(annSetName);
        boolean result = false;
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            outputSet.add(new Long(start), new Long(end), keyphraseAnnotationType, Factory.newFeatureMap());
            result = true;
        }
        document.getFeatures().put("Author assigned keyphrases", phrases);
        return result;
    }//protected void annotateKeyPhrases(List phrases)

    JTextField sourceDirTField;
    JTextField textExtensionTField;
    JTextField keyExtensionTField;
    JTextField encodingTField;

    JTextField corpusNameTField;
    JTextField annotationTypeTField;
    JTextField annotationSetTField;
}