org.norvelle.addressdiscoverer.parse.structured.StructuredPageEmailContactLink.java Source code

Java tutorial

Introduction

Here is the source code for org.norvelle.addressdiscoverer.parse.structured.StructuredPageEmailContactLink.java

Source

/**
 * Part of the AddressDiscoverer project, licensed under the GPL v.3 license.
 * This project provides intelligence for discovering email addresses in
 * specified web pages, associating them with a given institution and department
 * and address type.
 *
 * This project is licensed under the GPL v.3. Your rights to copy and modify
 * are regulated by the conditions specified in that license, available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 */
package org.norvelle.addressdiscoverer.parse.structured;

import java.util.ArrayList;
import java.util.regex.Matcher;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.norvelle.addressdiscoverer.exceptions.DoesNotContainContactLinkException;
import org.norvelle.addressdiscoverer.exceptions.MultipleContactLinksOfSameTypeFoundException;
import org.norvelle.addressdiscoverer.parse.ContactLink;

/**
 * Given a name-containing Jsoup element, search it and its children for an email
 * address. Throw an exception if either nothing is found, or more than one email
 * is found (which would indicate an unstructured page.
 * 
 * @author Erik Norvelle <erik.norvelle@cyberlogos.co>
 */
public class StructuredPageEmailContactLink extends ContactLink {

    /**
     * Try to find an email address in both the HTML (so that we can get attributes
     * of elements) as well as in the plain text (in case the HTML has been scrambled
     * to obfuscate the address).
     * 
     * @param element
     * @throws DoesNotContainContactLinkException
     * @throws MultipleContactLinksOfSameTypeFoundException 
     */
    public StructuredPageEmailContactLink(Element element)
            throws DoesNotContainContactLinkException, MultipleContactLinksOfSameTypeFoundException {
        super(element);
        String content = element.html();
        try {
            this.address = this.findLinkInString(content);
        } catch (DoesNotContainContactLinkException ex) {
            content = element.text();
            this.address = this.findLinkInString(content);
        }
    }

    /**
     * Given a chunk of text, see if we can't find a single email address in it.
     * If there are multiple instances of the same address, that's fine, but if there
     * are multiple different emails we raise an exception.
     * 
     * @param str
     * @return
     * @throws DoesNotContainContactLinkException
     * @throws MultipleContactLinksOfSameTypeFoundException 
     */
    private String findLinkInString(String str)
            throws DoesNotContainContactLinkException, MultipleContactLinksOfSameTypeFoundException {
        Matcher emailMatcher = emailPattern.matcher(str);
        String matchFound = "";
        int numMatches = 0;
        ArrayList<String> emails = new ArrayList();

        // Extract all email addresses found
        while (emailMatcher.find()) {
            matchFound = str.substring(emailMatcher.start(), emailMatcher.end());
            emails.add(matchFound);
            numMatches++;
        }

        // Throw exception if nothing found.
        if (numMatches == 0)
            throw new DoesNotContainContactLinkException();

        // If we have multiple emails that are distinct, we throw an exception
        if (numMatches > 1) {
            String firstEmail = emails.get(0);
            for (String email : emails)
                if (!email.equals(firstEmail)) {
                    String emailStr = StringUtils.join(emails, ", ");
                    if (emailStr.length() > 100)
                        emailStr = emailStr.substring(0, 99);
                    throw new MultipleContactLinksOfSameTypeFoundException(emailStr);
                }
        }

        // Otherwise, we can use the email found above.
        return matchFound;
    }

    @Override
    public String toString() {
        return String.format("mailto:%s", this.address);
    }
}