org.norvelle.addressdiscoverer.gui.threading.ExtractIndividualsFromFileWorker.java Source code

Introduction

Here is the source code for org.norvelle.addressdiscoverer.gui.threading.ExtractIndividualsFromFileWorker.java
Source

/**
 * Part of the AddressDiscoverer project, licensed under the GPL v.3 license.
 * This project provides intelligence for discovering email addresses in
 * specified web pages, associating them with a given institution and department
 * and address type.
 *
 * This project is licensed under the GPL v.3. Your rights to copy and modify
 * are regulated by the conditions specified in that license, available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 */
package org.norvelle.addressdiscoverer.gui.threading;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.logging.Logger;
import javax.swing.JOptionPane;
import javax.swing.SwingUtilities;
import javax.swing.SwingWorker;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.norvelle.addressdiscoverer.AddressDiscoverer;
import org.norvelle.addressdiscoverer.gui.threading.ExtractIndividualsStatusReporter.ClassificationStages;
import org.norvelle.addressdiscoverer.classifier.IProgressConsumer;
import org.norvelle.addressdiscoverer.exceptions.CantParseIndividualException;
import org.norvelle.addressdiscoverer.exceptions.DoesNotContainContactLinkException;
import org.norvelle.addressdiscoverer.exceptions.MultipleContactLinksOfSameTypeFoundException;
import org.norvelle.addressdiscoverer.gui.EmailDiscoveryPanel;
import org.norvelle.addressdiscoverer.model.Department;
import org.norvelle.addressdiscoverer.model.Individual;
import org.norvelle.addressdiscoverer.model.UnamName;
import org.norvelle.addressdiscoverer.parse.INameElement;
import org.norvelle.addressdiscoverer.parse.ContactLink;
import org.norvelle.addressdiscoverer.parse.structured.StructuredPageContactLinkLocator;
import org.norvelle.addressdiscoverer.parse.INameElementFinder;
import org.norvelle.addressdiscoverer.parse.structured.StructuredNameElementFinder;
import org.norvelle.addressdiscoverer.parse.unstructured.UnstructuredNameElementFinder;
import org.norvelle.utils.Utils;

/**
 * A SwingWorker that, in the background, parses an HTML file from the
 * filesystem, locates the name elements it contains and stores one
 * {@link Individual} per parseable name for a given department.
 * <p>
 * Progress text is pushed to the parent panel. All Swing access performed by
 * this class is marshalled onto the event dispatch thread, because
 * {@link #doInBackground()} itself runs on a worker thread.
 * 
 * @author Erik Norvelle <erik.norvelle@cyberlogos.co>
 */
public class ExtractIndividualsFromFileWorker extends SwingWorker<String, String> implements IProgressConsumer {
    static final Logger logger = Logger.getLogger(Logger.GLOBAL_LOGGER_NAME);
    protected File fileToClassify;
    private final EmailDiscoveryPanel parent;
    private final Department department;
    private final boolean useSequentialParser;

    /**
     * Run the extraction process on the contents of a file in the filesystem.
     * 
     * @param parent the panel that receives status text, output lines and the
     *        "finished" / "refresh results" notifications
     * @param fileToClassify the HTML file to read and parse
     * @param department the department whose previously stored individuals are
     *        deleted and to which newly found individuals are attached
     * @param useSequentialParser if true, names are located with the
     *        {@link UnstructuredNameElementFinder}; otherwise with the
     *        {@link StructuredNameElementFinder}
     */
    public ExtractIndividualsFromFileWorker(EmailDiscoveryPanel parent, File fileToClassify, Department department,
            boolean useSequentialParser) {
        this.parent = parent;
        this.fileToClassify = fileToClassify;
        this.department = department;
        this.useSequentialParser = useSequentialParser;
    }

    /**
     * Parses the file, extracts the name elements and stores an
     * {@link Individual} for every name that can be parsed.
     * <p>
     * Exceptions raised during processing are caught and handed to
     * {@link AddressDiscoverer#reportException}; they are not rethrown.
     * 
     * @return always the empty string
     * @throws Exception declared by the SwingWorker contract
     */
    @Override
    @SuppressWarnings({ "UseSpecificCatch", "BroadCatchBlock", "TooBroadCatch" })
    protected String doInBackground() throws Exception {
        InputStream in = null;
        try {
            // Set the base URL if one is specified (in order to resolve URLs
            // that point to web links within downloaded pages). A missing
            // (null) base URL is treated the same as an empty one.
            String baseUrl = department.getBaseUrl();
            if (baseUrl != null && !baseUrl.isEmpty()) {
                StructuredPageContactLinkLocator.baseUrl = baseUrl;
            }

            // Delete any individuals present from last parse.
            Individual.deleteIndividualsForDepartment(department);

            // Detect the charset from the file's bytes, then read and parse
            // the file into a JSoup document.
            in = new FileInputStream(this.fileToClassify);
            String charset = Utils.getCharsetFromStream(in);
            String html = FileUtils.readFileToString(this.fileToClassify, Charset.forName(charset));
            // The second argument of Jsoup.parse(String, String) is a base
            // URI, not a charset; the text is already decoded, so parse it
            // without a base URI instead of passing the charset name there.
            Document soup = Jsoup.parse(html);

            // Pick the finder that will locate the name elements in the page
            setStageText("Finding names");
            ExtractIndividualsStatusReporter status = new ExtractIndividualsStatusReporter(
                    ClassificationStages.CREATING_ITERATOR, this);
            INameElementFinder nameElementFinder;
            if (this.useSequentialParser) {
                nameElementFinder = new UnstructuredNameElementFinder(soup, charset, status);
            } else {
                nameElementFinder = new StructuredNameElementFinder(soup, charset, status);
            }

            // Now, turn each name element we've extracted into an Individual
            List<INameElement> nameElements = nameElementFinder.getNameElements();
            final int total = nameElements.size();
            int count = 1;
            for (INameElement ne : nameElements) {
                setStageText(String.format("Processing name %d out of %d", count++, total));

                // First, see if we can parse the name; if not, we skip this name
                UnamName nm;
                try {
                    nm = ne.getUnamName();
                } catch (CantParseIndividualException e) {
                    this.reportException("Couldn't parse name for " + ne.toString());
                    continue;
                }

                // A missing or ambiguous contact link still yields an
                // Individual, with a placeholder in the email field; any other
                // failure is reported and the name is skipped.
                String email;
                try {
                    ContactLink cl = ne.getContactLink();
                    email = cl.getAddress();
                } catch (DoesNotContainContactLinkException ex) {
                    email = "Not found";
                } catch (MultipleContactLinksOfSameTypeFoundException ex2) {
                    email = ex2.getMessage();
                } catch (Exception ex3) {
                    this.reportException(
                            String.format("Exception '%s' while processing %s", ex3.getMessage(), ne.toString()));
                    continue;
                }
                Individual i = new Individual(nm, email, "", department);
                Individual.store(i);
            }
            // Compute the count here, on the worker thread, before handing the
            // finished text over to the event dispatch thread.
            setStageText(String.format("Found %d individuals", Individual.getCount()));

            // All done: let the panel update itself on the event dispatch
            // thread. Failures are reported the same way as before.
            SwingUtilities.invokeLater(new Runnable() {
                @Override
                public void run() {
                    try {
                        parent.notifyParsingFinished();
                        parent.refreshResultsTable();
                    } catch (Exception ex) {
                        AddressDiscoverer.reportException(ex);
                    }
                }
            });
        } catch (Exception ex) {
            AddressDiscoverer.reportException(ex);
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (IOException ex) {
                AddressDiscoverer.reportException(ex);
            }
        }

        setStageText("Done");
        return "";
    }

    /**
     * A single-access point for reporting exceptions to the user.
     * <p>
     * Shows a modal error dialog with the word-wrapped message and returns
     * once the dialog has been dismissed. When called from a thread other
     * than the event dispatch thread, the dialog is shown via
     * {@link SwingUtilities#invokeAndWait(Runnable)} so the caller still
     * blocks until the user has closed it.
     * 
     * @param message the text to display
     */
    public void reportException(String message) {
        final String wrapped = Utils.wordWrapString(message, 70);
        Runnable showDialog = new Runnable() {
            @Override
            public void run() {
                JOptionPane.showMessageDialog(null, wrapped, "Program error", JOptionPane.ERROR_MESSAGE);
            }
        };
        if (SwingUtilities.isEventDispatchThread()) {
            showDialog.run();
            return;
        }
        try {
            SwingUtilities.invokeAndWait(showDialog);
        } catch (InterruptedException ex) {
            // Keep the interrupt visible to the caller (e.g. a cancelled worker)
            Thread.currentThread().interrupt();
        } catch (InvocationTargetException ex) {
            AddressDiscoverer.reportException(ex);
        }
    }

    /**
     * Publishes the textual form of a progress report so that it is delivered
     * to {@link #process(List)}.
     * 
     * @param progress the progress report to publish
     */
    @Override
    public void reportProgressStage(ExtractIndividualsStatusReporter progress) {
        publish(progress.toString());
    }

    /**
     * Shows the given text in the parent panel's stage-name label.
     * 
     * @param text the text to display
     */
    @Override
    public void reportText(String text) {
        setStageText(text);
    }

    /**
     * This method receives the chunks that the doInBackground method publishes
     * and appends each of them to the parent panel's output.
     *
     * @param progressUpdates the published progress strings, in order
     */
    @Override
    protected void process(final List<String> progressUpdates) {
        for (final String progress : progressUpdates) {
            this.parent.addToOutput(progress);
        }
    }

    /**
     * Sets the text of the parent's stage-name label on the event dispatch
     * thread: immediately when already on that thread, otherwise queued.
     * 
     * @param text the text to display
     */
    private void setStageText(final String text) {
        if (SwingUtilities.isEventDispatchThread()) {
            this.parent.getjStageNameLabel().setText(text);
        } else {
            SwingUtilities.invokeLater(new Runnable() {
                @Override
                public void run() {
                    parent.getjStageNameLabel().setText(text);
                }
            });
        }
    }

}