Java tutorial
/**
 * Part of the AddressDiscoverer project, licensed under the GPL v.3 license.
 * This project provides intelligence for discovering email addresses in
 * specified web pages, associating them with a given institution and department
 * and address type.
 *
 * This project is licensed under the GPL v.3. Your rights to copy and modify
 * are regulated by the conditions specified in that license, available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 */
package org.norvelle.addressdiscoverer.parse.structured;

import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.norvelle.addressdiscoverer.Constants;
import org.norvelle.addressdiscoverer.exceptions.DoesNotContainContactLinkException;
import org.norvelle.addressdiscoverer.exceptions.MultipleContactLinksOfSameTypeFoundException;
import org.norvelle.addressdiscoverer.parse.ContactLink;
import org.norvelle.addressdiscoverer.parse.structured.StructuredPageContactLinkLocator;

/**
 * A contact link that is an ordinary (non-mailto) hyperlink found inside an
 * element of a structured page. The link itself is not the contact address:
 * {@link #getAddress()} downloads the linked page and returns the email
 * address(es) found in its body.
 *
 * @author Erik Norvelle <erik.norvelle@cyberlogos.co>
 */
public class StructuredPageWebContactLink extends ContactLink {

    protected static final Pattern weblinkPattern = Pattern.compile(Constants.weblinkRegex);

    /** Charset used to decode a fetched page when the response does not name a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Prefix of the Content-Type parameter that carries the character set. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * Attempt to find a URL-type link associated with the given Jsoup Element,
     * by looking at the HREF attributes of the element and all of its
     * subelements. Links starting with {@code mailto:} are ignored. Exactly
     * one remaining link must be present; it is stored in the {@code address}
     * field.
     *
     * @param element the element whose subtree is searched for links
     * @throws DoesNotContainContactLinkException if no non-mailto link is found
     * @throws MultipleContactLinksOfSameTypeFoundException if more than one
     *         non-mailto link is found
     */
    public StructuredPageWebContactLink(Element element)
            throws DoesNotContainContactLinkException, MultipleContactLinksOfSameTypeFoundException {
        super(element);
        List<String> hrefs = new ArrayList<>();
        // getAllElements() includes the element itself as well as its descendants.
        Elements elements = element.getAllElements();
        for (Element child : elements) {
            if (child.hasAttr("href")) {
                String href = child.attr("href");
                if (!href.startsWith("mailto:")) {
                    hrefs.add(href);
                }
            }
        }

        if (hrefs.isEmpty()) {
            throw new DoesNotContainContactLinkException();
        } else if (hrefs.size() > 1) {
            throw new MultipleContactLinksOfSameTypeFoundException("Multiple web links");
        }
        this.address = hrefs.get(0);
    }

    /**
     * Fetches the web page specified by the contact weblink and extracts the
     * email addresses found in its body. The result is returned to the caller;
     * the {@code address} field keeps holding the link itself. When the page
     * contains several distinct addresses they are all returned, joined with
     * {@code ", "} in the order in which they first appear.
     *
     * @return the distinct email addresses found on the linked page, joined
     *         with {@code ", "}; never empty
     * @throws org.norvelle.addressdiscoverer.exceptions.DoesNotContainContactLinkException
     *         if the link is a {@code javascript:} link, the page cannot be
     *         fetched, it has no body element, or no email address is found
     */
    public String fetchEmailFromWeblink() throws DoesNotContainContactLinkException {
        if (this.address.startsWith("javascript:")) {
            throw new DoesNotContainContactLinkException();
        }

        // Try to fetch the webpage linked to
        String body;
        try {
            String addr = StructuredPageContactLinkLocator.resolveAddress(this.address);
            URL u = new URL(addr);
            u.toURI(); // result unused: called only so a malformed URI is rejected here
            URLConnection con = u.openConnection();

            String html;
            // Close the stream even when reading fails part-way through.
            try (InputStream in = con.getInputStream()) {
                // The character set lives in Content-Type. Content-Encoding
                // (gzip, identity, ...) is a transfer coding, not a charset.
                html = IOUtils.toString(in, charsetFromContentType(con.getContentType()));
            }

            Document soup = Jsoup.parse(html);
            Element bodyElement = soup.select("body").first();
            if (bodyElement == null) {
                // No body element was selected, so there is nothing to search.
                throw new DoesNotContainContactLinkException();
            }
            body = bodyElement.html();
        } catch (URISyntaxException | IOException ex) {
            // NOTE(review): the cause is dropped because only the no-arg
            // constructor of DoesNotContainContactLinkException is visible here.
            throw new DoesNotContainContactLinkException();
        }

        // Now, extract the email if we can.
        String matchFound = this.findEmail(body);
        if (matchFound.isEmpty()) {
            throw new DoesNotContainContactLinkException();
        }
        return matchFound;
    }

    /**
     * Picks the character set named by a Content-Type header value such as
     * {@code text/html; charset=ISO-8859-1}.
     *
     * @param contentType the header value, possibly {@code null}
     * @return the named charset if this JVM supports it, otherwise UTF-8
     */
    private static String charsetFromContentType(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        for (String parameter : contentType.split(";")) {
            String candidate = parameter.trim();
            if (!candidate.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                continue;
            }
            String name = candidate.substring(CHARSET_PARAM.length()).trim();
            // The value may be a quoted string.
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1);
            }
            try {
                if (Charset.isSupported(name)) {
                    return name;
                }
            } catch (IllegalCharsetNameException ex) {
                // Malformed charset name in the header: fall back to the default.
            }
            return DEFAULT_CHARSET;
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Collects every distinct match of {@code emailPattern} (not declared in
     * this class; presumably inherited from ContactLink) in the given text.
     *
     * @param text the text to search
     * @return the distinct matches in order of first appearance, joined with
     *         {@code ", "}; an empty string when nothing matches
     */
    private String findEmail(String text) {
        // Insertion-ordered set: de-duplicates while keeping document order.
        Set<String> matchesFound = new LinkedHashSet<>();
        Matcher emailMatcher = emailPattern.matcher(text);
        while (emailMatcher.find()) {
            matchesFound.add(emailMatcher.group());
        }
        return StringUtils.join(matchesFound, ", ");
    }

    /**
     * Since this is a web link, we don't return the URL directly; instead, we fetch
     * the referenced page and seek to get an email address from it.
     *
     * @return the result of {@link #fetchEmailFromWeblink()}
     * @throws org.norvelle.addressdiscoverer.exceptions.DoesNotContainContactLinkException
     *         if no email address can be obtained from the linked page
     */
    @Override
    public String getAddress() throws DoesNotContainContactLinkException {
        return this.fetchEmailFromWeblink();
    }

    /**
     * @return the stored link, prefixed with {@code "URL: "}
     */
    @Override
    public String toString() {
        return String.format("URL: %s", this.address);
    }

    /**
     * @return the stored link itself, without fetching the page it points to
     */
    @Override
    public String getUnderlyingUrl() {
        return address;
    }
}