gate.corpora.CSVImporter.java Source code

Java tutorial

Introduction

Here is the source code for gate.corpora.CSVImporter.java

Source

/*
 * CSVImporter.java
 * 
 * Copyright (c) 2013, The University of Sheffield. See the file COPYRIGHT.txt
 * in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 * 
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * licenced under the GNU Library General Public License, Version 2, June 1991
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 * 
 * Mark A. Greenwood, 10/09/2013
 */

package gate.corpora;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.gui.MainFrame;
import gate.gui.NameBearerHandle;
import gate.gui.ResourceHelper;
import gate.util.ExtensionFileFilter;
import gate.util.Files;

import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import javax.swing.AbstractAction;
import javax.swing.Action;
import javax.swing.JButton;
import javax.swing.JCheckBox;
import javax.swing.JComponent;
import javax.swing.JFileChooser;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JSpinner;
import javax.swing.JTextField;
import javax.swing.SpinnerNumberModel;
import javax.swing.SwingUtilities;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;

import au.com.bytecode.opencsv.CSVReader;

@SuppressWarnings("serial")
@CreoleResource(name = "CSV Corpus Populater", tool = true, autoinstances = @AutoInstance, comment = "Populate a corpus from CSV files", helpURL = "http://gate.ac.uk/userguide/sec:creole:csv")
public class CSVImporter extends ResourceHelper {

    // The option panel shown when populating a corpus. A single shared
    // instance is used for every corpus, so none of these references is ever
    // reassigned and they are all declared final.
    private static final JComponent dialog = new JPanel();

    // zero-based index of the column holding the document text
    private static final SpinnerNumberModel textColModel = new SpinnerNumberModel(0, 0, Integer.MAX_VALUE, 1);

    // when ticked the first row is read as column labels rather than data
    private static final JCheckBox cboFeatures = new JCheckBox("1st Row Contains Column Labels", true);

    // when ticked every row becomes its own document, otherwise each file
    // becomes a single document
    private static final JCheckBox cboDocuments = new JCheckBox("Create One Document Per Row", false);

    // location of the CSV file (or directory of CSV files) to import
    private static final JTextField txtURL = new JTextField(30);

    // column separator, may be written as a Java escape sequence
    private static final JTextField txtSeparator = new JTextField(",", 3);

    // quote character, may be written as a Java escape sequence
    private static final JTextField txtQuoteChar = new JTextField("\"", 3);

    // name of the character encoding used to read the files
    private static final JTextField txtEncoding = new JTextField("UTF-8");

    // filter used both in the file chooser and when scanning directories
    private static final FileFilter CSV_FILE_FILTER = new ExtensionFileFilter("CSV Files (*.csv)", "csv");

    static {
        // The same dialog instance is reused regardless of the corpus being
        // populated, so it is laid out exactly once here.
        dialog.setLayout(new GridBagLayout());

        // row 0: label, URL field and the browse button
        dialog.add(new JLabel("CSV File URL:"),
                gridCell(0, 2, GridBagConstraints.WEST, GridBagConstraints.NONE, new Insets(0, 0, 0, 5)));
        dialog.add(txtURL,
                gridCell(0, 5, GridBagConstraints.CENTER, GridBagConstraints.HORIZONTAL, new Insets(0, 0, 0, 10)));
        JButton browseButton = new JButton(MainFrame.getIcon("open-file"));
        dialog.add(browseButton,
                gridCell(0, 1, GridBagConstraints.NORTHWEST, GridBagConstraints.NONE, new Insets(0, 0, 0, 0)));

        // row 1: encoding, column separator and quote character
        dialog.add(new JLabel("Encoding:"),
                gridCell(1, 1, GridBagConstraints.CENTER, GridBagConstraints.HORIZONTAL, new Insets(0, 0, 15, 5)));
        dialog.add(txtEncoding,
                gridCell(1, 1, GridBagConstraints.CENTER, GridBagConstraints.HORIZONTAL, new Insets(0, 15, 15, 10)));
        dialog.add(new JLabel("Column Separator:"),
                gridCell(1, 1, GridBagConstraints.CENTER, GridBagConstraints.HORIZONTAL, new Insets(0, 0, 15, 5)));
        dialog.add(txtSeparator,
                gridCell(1, 1, GridBagConstraints.CENTER, GridBagConstraints.HORIZONTAL, new Insets(0, 15, 15, 10)));
        dialog.add(new JLabel("Quote Character:"),
                gridCell(1, 1, GridBagConstraints.CENTER, GridBagConstraints.HORIZONTAL, new Insets(0, 0, 15, 5)));
        dialog.add(txtQuoteChar,
                gridCell(1, 1, GridBagConstraints.CENTER, GridBagConstraints.HORIZONTAL, new Insets(0, 0, 15, 10)));

        // row 2: which column holds the document content
        dialog.add(new JLabel("Document Content Is In Column"),
                gridCell(2, 3, GridBagConstraints.NORTHWEST, GridBagConstraints.NONE, new Insets(0, 0, 15, 5)));
        dialog.add(new JSpinner(textColModel),
                gridCell(2, 3, GridBagConstraints.NORTHWEST, GridBagConstraints.NONE, new Insets(0, 0, 0, 0)));

        // rows 3 and 4: the two check boxes
        dialog.add(cboFeatures, gridCell(3, GridBagConstraints.RELATIVE, GridBagConstraints.NORTHWEST,
                GridBagConstraints.NONE, new Insets(0, 0, 0, 0)));
        dialog.add(cboDocuments, gridCell(4, GridBagConstraints.RELATIVE, GridBagConstraints.NORTHWEST,
                GridBagConstraints.NONE, new Insets(0, 0, 0, 0)));

        // the browse button lets the user pick a CSV file (or a directory) and
        // copies its URL into the text field
        ActionListener browseListener = new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent event) {
                JFileChooser chooser = MainFrame.getFileChooser();

                chooser.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES);
                chooser.setDialogTitle("Select a CSV File");
                chooser.resetChoosableFileFilters();
                chooser.setAcceptAllFileFilterUsed(false);

                javax.swing.filechooser.FileFilter csvOnly = (javax.swing.filechooser.FileFilter) CSV_FILE_FILTER;
                chooser.addChoosableFileFilter(csvOnly);
                chooser.setFileFilter(csvOnly);

                if (chooser.showOpenDialog(dialog) == JFileChooser.APPROVE_OPTION) {
                    try {
                        File chosen = chooser.getSelectedFile();
                        txtURL.setText(chosen.toURI().toURL().toExternalForm());
                    } catch (IOException ignored) {
                        // the text field is simply left unchanged
                    }
                }
            }
        };
        btnAttach(browseButton, browseListener);
    }

    /**
     * Registers the listener on the button; kept separate so the static
     * initializer reads as a flat list of layout steps.
     */
    private static void btnAttach(JButton button, ActionListener listener) {
        button.addActionListener(listener);
    }

    /**
     * Builds the constraints for one dialog component. Components are placed
     * relative to the previous one in the given row; everything not passed in
     * is left at the GridBagConstraints default (height 1, no weight, no
     * internal padding).
     */
    private static GridBagConstraints gridCell(int row, int width, int anchor, int fill, Insets padding) {
        return new GridBagConstraints(GridBagConstraints.RELATIVE, row, width, 1, 0.0, 0.0, anchor, fill, padding,
                0, 0);
    }

    /**
     * Contributes the "Populate from CSV File" action to the popup menu of
     * corpora; any other kind of target gets no actions.
     *
     * @param handle
     *          the handle of the resource the menu is being built for
     * @return the (possibly empty) list of actions to offer for the target
     */
    @Override
    protected List<Action> buildActions(final NameBearerHandle handle) {
        List<Action> actions = new ArrayList<Action>();

        if (!(handle.getTarget() instanceof Corpus))
            return actions;

        actions.add(new AbstractAction("Populate from CSV File") {
            @Override
            public void actionPerformed(ActionEvent e) {

                // display the populater dialog and return if it is cancelled
                if (JOptionPane.showConfirmDialog(null, dialog, "Populate From CSV File",
                        JOptionPane.OK_CANCEL_OPTION, JOptionPane.PLAIN_MESSAGE) != JOptionPane.OK_OPTION)
                    return;

                // Swing components must only be touched from the event thread,
                // so take a snapshot of every option here before handing the
                // work over to the background thread
                final Corpus corpus = (Corpus) handle.getTarget();
                final String location = txtURL.getText();
                final String encoding = txtEncoding.getText();
                final String separatorText = txtSeparator.getText();
                final String quoteText = txtQuoteChar.getText();
                final int column = (Integer) textColModel.getValue();
                final boolean colLabels = cboFeatures.isSelected();
                final boolean docPerRow = cboDocuments.isSelected();

                // we want to run the population in a separate thread so we don't lock
                // up the GUI
                Thread thread = new Thread(Thread.currentThread().getThreadGroup(), "CSV Corpus Populater") {

                    @Override
                    public void run() {
                        try {
                            // unescape the strings that define the format of the file and
                            // get the actual chars
                            char separator = formatChar(separatorText, "column separator");
                            char quote = formatChar(quoteText, "quote character");

                            // see if we can convert the URL to a File instance
                            File file = null;
                            try {
                                file = Files.fileFromURL(new URL(location));
                            } catch (IllegalArgumentException iae) {
                                // this will happen if someone enters an actual URL, but we
                                // handle that below so we can just ignore the exception
                                // and keep going
                            }

                            if (file != null && file.isDirectory()) {
                                // import every CSV file in the directory structure,
                                // skipping directories as we don't want to handle those
                                for (File f : Files.listFilesRecursively(file, CSV_FILE_FILTER)) {
                                    if (f.isDirectory())
                                        continue;

                                    importCsv(corpus, f.toURI().toURL(), encoding, column, colLabels, separator,
                                            quote, docPerRow);
                                }
                            } else {
                                // we have a single URL to process
                                importCsv(corpus, new URL(location), encoding, column, colLabels, separator, quote,
                                        docPerRow);
                            }
                        } catch (Exception ex) {
                            // keep the full trace for diagnosis and also tell the user
                            ex.printStackTrace();
                            showImportError(location, ex);
                        }
                    }
                };

                // let's leave the GUI nice and responsive
                thread.setPriority(Thread.MIN_PRIORITY);

                // lets get to it and do some actual work!
                thread.start();

            }
        });

        return actions;
    }

    /**
     * Imports one CSV file, either as one document per row or as a single
     * document, depending on the flag.
     */
    private static void importCsv(Corpus corpus, URL csv, String encoding, int column, boolean colLabels,
            char separator, char quote, boolean docPerRow) {
        if (docPerRow) {
            populate(corpus, csv, encoding, column, colLabels, separator, quote);
        } else {
            createDoc(corpus, csv, encoding, column, colLabels, separator, quote);
        }
    }

    /**
     * Unescapes a format option entered in the dialog and returns its first
     * character, rejecting an empty value with a descriptive exception.
     */
    private static char formatChar(String escaped, String description) {
        String unescaped = StringEscapeUtils.unescapeJava(escaped);
        if (unescaped == null || unescaped.length() == 0)
            throw new IllegalArgumentException("The " + description + " must not be empty");
        return unescaped.charAt(0);
    }

    /**
     * Shows an error dialog for a failed import; safe to call from any thread
     * because the dialog is displayed via the event thread.
     */
    private static void showImportError(final String location, final Exception cause) {
        SwingUtilities.invokeLater(new Runnable() {
            @Override
            public void run() {
                String reason = (cause.getMessage() != null ? cause.getMessage() : cause.toString());
                JOptionPane.showMessageDialog(null,
                        "Unable to populate the corpus from \"" + location + "\":\n" + reason,
                        "Populate From CSV File", JOptionPane.ERROR_MESSAGE);
            }
        });
    }

    /**
     * Creates one document per row using the default CSV format: a comma as
     * the column separator and a double quote as the quote character.
     * 
     * @param corpus
     *          the Corpus to add documents to
     * @param csv
     *          the URL of the CSV file to process
     * @param encoding
     *          the name of the character encoding used to read the file
     * @param column
     *          the (zero index based) column which contains the text content
     * @param colLabels
     *          true if the first row contains column labels, false otherwise
     */
    public static void populate(Corpus corpus, URL csv, String encoding, int column, boolean colLabels) {
        final char defaultSeparator = ',';
        final char defaultQuote = '"';
        populate(corpus, csv, encoding, column, colLabels, defaultSeparator, defaultQuote);
    }

    /**
     * Create a new document from each row and push it into the specified corpus
     * 
     * @param corpus
     *          the Corpus to add documents to
     * @param csv
     *          the URL of the CSV file to process
     * @param encoding
     *          the name of the character encoding used to read the file
     * @param column
     *          the (zero index based) column which contains the text content
     * @param colLabels
     *          true if the first row contains column labels, false otherwise
     * @param separator
     *          the character that is used to separate columns (usually ,)
     * @param quote
     *          the character used to quote data that includes the column
     *          separator (usually ")
     * @throws RuntimeException
     *           if anything goes wrong while reading the file or creating the
     *           documents; the original exception is available as the cause
     */
    public static void populate(Corpus corpus, URL csv, String encoding, int column, boolean colLabels,
            char separator, char quote) {
        // the raw stream is tracked separately so that it still gets closed if
        // wrapping it in a reader fails (e.g. an unsupported encoding name)
        InputStream stream = null;
        CSVReader reader = null;
        try {
            // open a CSVReader over the URL
            stream = csv.openStream();
            reader = new CSVReader(new InputStreamReader(stream, encoding), separator, quote);

            // if we are adding features read the first line
            String[] features = (colLabels ? reader.readNext() : null);

            String[] nextLine;
            while ((nextLine = reader.readNext()) != null) {
                // for each line in the file...

                // skip the line if there are less columns than we need to get to the
                // content
                if (column >= nextLine.length)
                    continue;

                // skip the line if the column with the content is empty
                if (nextLine[column].trim().equals(""))
                    continue;

                FeatureMap fmap = Factory.newFeatureMap();
                if (colLabels) {
                    // copy all the features from the row into a FeatureMap using the
                    // labels from the first line
                    for (int i = 0; i < features.length; ++i) {
                        if (i != column && i < nextLine.length) {
                            fmap.put(features[i], nextLine[i]);
                        }
                    }
                }

                // setup the initialization params for the document
                FeatureMap params = Factory.newFeatureMap();
                params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, nextLine[column]);

                // create the document
                Document doc = (Document) Factory.createResource(gate.corpora.DocumentImpl.class.getName(), params,
                        fmap);

                // add the document to the corpus
                corpus.add(doc);

                if (corpus.getLRPersistenceId() != null) {
                    // persistent corpus -> unload the document
                    corpus.unloadDocument(doc);
                    Factory.deleteResource(doc);
                }

            }

            if (corpus.getDataStore() != null) {
                // if this corpus is in a datastore make sure we sync it back
                corpus.getDataStore().sync(corpus);
            }
        } catch (Exception e) {
            // not much we can do other than report the exception
            throw new RuntimeException("Unable to open CSV file: " + csv, e);
        } finally {
            // close whatever we managed to open so we don't leak file handles;
            // closeQuietly ignores null arguments and errors on close
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(stream);
        }
    }

    /**
     * Creates a single document from the CSV file using the default CSV
     * format: a comma as the column separator and a double quote as the quote
     * character.
     * 
     * @param corpus
     *          the Corpus to add the document to
     * @param csv
     *          the URL of the CSV file to process
     * @param encoding
     *          the name of the character encoding used to read the file
     * @param column
     *          the (zero index based) column which contains the text content
     * @param colLabels
     *          true if the first row contains column labels, false otherwise
     */
    public static void createDoc(Corpus corpus, URL csv, String encoding, int column, boolean colLabels) {
        final char defaultSeparator = ',';
        final char defaultQuote = '"';
        createDoc(corpus, csv, encoding, column, colLabels, defaultSeparator, defaultQuote);
    }

    /**
     * Creates a single document from the CSV file. The content column of every
     * non-empty row is appended to the document text (followed by a blank
     * line) and covered by an annotation in the "Original markups" set whose
     * features hold the values of the other columns.
     * 
     * @param corpus
     *          the Corpus to add documents to
     * @param csv
     *          the URL of the CSV file to process
     * @param encoding
     *          the name of the character encoding used to read the file
     * @param column
     *          the (zero index based) column which contains the text content
     * @param colLabels
     *          true if the first row contains column labels, false otherwise
     * @param separator
     *          the character that is used to separate columns (usually ,)
     * @param quote
     *          the character used to quote data that includes the column
     *          separator (usually ")
     * @throws RuntimeException
     *           if anything goes wrong while reading the file or building the
     *           document; the original exception is available as the cause
     */
    public static void createDoc(Corpus corpus, URL csv, String encoding, int column, boolean colLabels,
            char separator, char quote) {
        // the raw stream is tracked separately so that it still gets closed if
        // wrapping it in a reader fails (e.g. an unsupported encoding name)
        InputStream stream = null;
        CSVReader reader = null;
        Document doc = null;
        try {
            // open a CSVReader over the URL
            stream = csv.openStream();
            reader = new CSVReader(new InputStreamReader(stream, encoding), separator, quote);

            // if we are adding features read the first line
            String[] features = (colLabels ? reader.readNext() : null);

            // the annotation type is the label of the content column; we use the
            // type "Text" if the columns don't have labels or if the label row is
            // too short to include the content column
            String annotationType = "Text";
            if (features != null && column >= 0 && column < features.length) {
                annotationType = features[column];
            }

            // create an empty document to which we will add the content as we go
            doc = Factory.newDocument("");

            String[] nextLine;
            while ((nextLine = reader.readNext()) != null) {
                // for each line in the file...

                // skip the line if there are less columns than we need to get to the
                // content
                if (column >= nextLine.length)
                    continue;

                // skip the line if the column with the doc content is empty
                if (nextLine[column].trim().equals(""))
                    continue;

                FeatureMap fmap = Factory.newFeatureMap();
                if (colLabels) {
                    // put the data from the other columns into a FeatureMap using the
                    // labels from the first row
                    for (int i = 0; i < features.length; ++i) {
                        if (i != column && i < nextLine.length) {
                            fmap.put(features[i], nextLine[i]);
                        }
                    }
                }

                // find out how long the document currently is
                // TODO can we keep a running track of this to avoid this call?
                long length = doc.getContent().size();

                // add the new text to the document
                doc.edit(length, length, new DocumentContentImpl(nextLine[column] + "\n\n"));

                // add the spanning annotation to the Original markups set; it covers
                // the text just added but not the trailing blank line
                doc.getAnnotations("Original markups").add(length, length + nextLine[column].length(),
                        annotationType, fmap);
            }

            // store the original csv file URL as a document feature
            doc.getFeatures().put("csvURL", csv.toExternalForm());

            // so that the doc gets recreated properly put the XML for the doc we just
            // created into the init param that will be used if the document is
            // recreated
            doc.setParameterValue(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, doc.toXml());

            // add the document to the corpus
            corpus.add(doc);

            if (corpus.getLRPersistenceId() != null) {
                // persistent corpus -> unload the document
                corpus.unloadDocument(doc);
                Factory.deleteResource(doc);
            }

            if (corpus.getDataStore() != null) {
                // if this corpus is in a datastore make sure we sync it back
                corpus.getDataStore().sync(corpus);
            }
        } catch (Exception e) {
            // if we failed somewhere then delete the part built document
            if (doc != null)
                Factory.deleteResource(doc);

            // throw a "helpful" exception
            throw new RuntimeException("Unable to open CSV file: " + csv, e);
        } finally {
            // close whatever we managed to open so we don't leak file handles;
            // closeQuietly ignores null arguments and errors on close
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(stream);
        }
    }
}