com.jimplush.goose.ContentExtractor.java Source code

Introduction

Here is the source code for com.jimplush.goose.ContentExtractor.java
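
To give a sense of how this class is used, here is a minimal, hypothetical usage sketch. It is not part of the Goose source below; the setEnableImageFetching setter on Configuration and the getTitle getter on Article are assumptions based on the bean-style accessors the listing relies on.

import com.jimplush.goose.Article;
import com.jimplush.goose.Configuration;
import com.jimplush.goose.ContentExtractor;

public class GooseUsageExample {
    public static void main(String[] args) {
        Configuration config = new Configuration();
        config.setEnableImageFetching(false); // assumed setter; avoids needing a local storage path for images

        ContentExtractor extractor = new ContentExtractor(config);
        Article article = extractor.extractContent("http://example.com/some-article.html");

        if (article != null && article.getTopNode() != null) {
            System.out.println(article.getTitle());              // assumed getter matching setTitle(...)
            System.out.println(article.getCleanedArticleText());
        }
    }
}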

Source

/**
 * Licensed to Gravity.com under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Gravity.com licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.jimplush.goose;

import com.jimplush.goose.cleaners.DefaultDocumentCleaner;
import com.jimplush.goose.cleaners.DocumentCleaner;
import com.jimplush.goose.images.BestImageGuesser;
import com.jimplush.goose.images.ImageExtractor;
import com.jimplush.goose.network.*;
import com.jimplush.goose.outputformatters.DefaultOutputFormatter;
import com.jimplush.goose.outputformatters.OutputFormatter;
import com.jimplush.goose.texthelpers.*;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.http.client.HttpClient;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;
import org.jsoup.select.Selector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;

/**
 * User: jim plush
 * Date: 12/16/10
 * a lot of work in this class is based on Arc90's Readability code that does content extraction in JS.
 * I wasn't able to find a good server-side codebase to achieve the same, so I started with their base ideas and then
 * built additional metrics on top of it, such as looking for clusters of English stopwords.
 * Gravity was doing 30+ million links per day with this codebase across a series of crawling servers for a project
 * and it held up well. Our current port is slightly different from this one, but I'm working to align them so the goose
 * project gets the love as we continue to move forward.
 *
 * Cougar: God dammit, Mustang! This is Ghost Rider 117. This bogey is all over me. He's got missile lock on me. Do I have permission to fire?
 * Stinger: Do not fire until fired upon...
 */

public class ContentExtractor {

    // PRIVATE PROPERTIES BELOW

    private static final Logger logger = LoggerFactory.getLogger(ContentExtractor.class);

    private static final StringReplacement MOTLEY_REPLACEMENT = StringReplacement.compile("�", string.empty);

    private static final StringReplacement ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement.compile("#!",
            "?_escaped_fragment_=");

    private static final ReplaceSequence TITLE_REPLACEMENTS = ReplaceSequence.create("&raquo;").append("»");
    private static final StringSplitter PIPE_SPLITTER = new StringSplitter("\\|");
    private static final StringSplitter DASH_SPLITTER = new StringSplitter(" - ");
    private static final StringSplitter ARROWS_SPLITTER = new StringSplitter("»");
    private static final StringSplitter COLON_SPLITTER = new StringSplitter(":");
    private static final StringSplitter SPACE_SPLITTER = new StringSplitter(" ");

    private static final Set<String> NO_STRINGS = new HashSet<String>(0);
    private static final String A_REL_TAG_SELECTOR = "a[rel=tag], a[href*=/tag/]";

    /**
     * holds the configuration settings we want to use
     */
    private Configuration config;

    // sets the default cleaner class to prep the HTML for parsing
    private DocumentCleaner documentCleaner;
    // the MD5 of the URL we're currently parsing, used to reference the images we download against the url so we
    // can more easily clean up resources when we're done with the page.
    private String linkhash;
    // once we have our topNode then we want to format that guy for output to the user
    private OutputFormatter outputFormatter;
    private ImageExtractor imageExtractor;

    /**
     * default constructor; uses the settings that goose ships with. To override those defaults,
     * use the overloaded constructor below and pass in your own configuration object.
     */
    public ContentExtractor() {
        this.config = new Configuration();
    }

    /**
     * overloaded to accept a custom configuration object
     *
     * @param config
     */
    public ContentExtractor(Configuration config) {
        this.config = config;
    }

    /**
     * @param urlToCrawl - The url you want to extract the text from
     * @param html       - if you already have the raw html handy you can pass it here to avoid a network call
     * @return
     */
    public Article extractContent(String urlToCrawl, String html) {

        return performExtraction(urlToCrawl, html);

    }

    /**
     * @param urlToCrawl - The url you want to extract the text from; this overload makes a network call to fetch the html
     * @return
     */
    public Article extractContent(String urlToCrawl) {
        String html = null;
        return performExtraction(urlToCrawl, html);
    }

    public Article performExtraction(String urlToCrawl, String rawHtml) {

        urlToCrawl = getUrlToCrawl(urlToCrawl);
        try {
            new URL(urlToCrawl);

            this.linkhash = HashUtils.md5(urlToCrawl);
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Invalid URL Passed in: " + urlToCrawl, e);
        }

        ParseWrapper parseWrapper = new ParseWrapper();
        Article article = null;
        try {

            if (rawHtml == null) {
                rawHtml = HtmlFetcher.getHtml(urlToCrawl);
            }

            article = new Article();

            article.setRawHtml(rawHtml);

            Document doc = parseWrapper.parse(rawHtml, urlToCrawl);

            // before we cleanse, provide consumers with an opportunity to extract the publish date
            article.setPublishDate(config.getPublishDateExtractor().extract(doc));

            // now allow for any additional data to be extracted
            article.setAdditionalData(config.getAdditionalDataExtractor().extract(doc));

            // grab the text nodes of any <a ... rel="tag">Tag Name</a> elements
            article.setTags(extractTags(doc));

            // now perform a nice deep cleansing
            DocumentCleaner documentCleaner = getDocCleaner();
            doc = documentCleaner.clean(doc);

            article.setTitle(getTitle(doc));
            article.setMetaDescription(getMetaDescription(doc));
            article.setMetaKeywords(getMetaKeywords(doc));
            article.setCanonicalLink(getCanonicalLink(doc, urlToCrawl));
            article.setDomain(article.getCanonicalLink());

            // extract the content of the article
            article.setTopNode(calculateBestNodeBasedOnClustering(doc));

            if (article.getTopNode() != null) {

                // extract any movie embeds out from our main article content
                article.setMovies(extractVideos(article.getTopNode()));

                if (config.isEnableImageFetching()) {
                    HttpClient httpClient = HtmlFetcher.getHttpClient();
                    imageExtractor = getImageExtractor(httpClient, urlToCrawl);
                    article.setTopImage(imageExtractor.getBestImage(doc, article.getTopNode()));

                }

                // grab siblings and remove high link density elements
                cleanupNode(article.getTopNode());

                outputFormatter = getOutputFormatter();

                article.setCleanedArticleText(outputFormatter.getFormattedText(article.getTopNode()));

                if (logger.isDebugEnabled()) {
                    logger.debug("FINAL EXTRACTION TEXT: \n" + article.getCleanedArticleText());
                }

                if (config.isEnableImageFetching()) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("\n\nFINAL EXTRACTION IMAGE: \n" + article.getTopImage().getImageSrc());
                    }
                }

            }

            // cleans up all the temp images that we've downloaded
            releaseResources();

        } catch (MaxBytesException e) {
            logger.error(e.toString(), e);
        } catch (NotHtmlException e) {
            logger.error("URL: " + urlToCrawl + " did not contain valid HTML to parse, exiting. " + e.toString());
        } catch (Exception e) {
            logger.error("General Exception occured on url: " + urlToCrawl + " " + e.toString());
            //      throw new RuntimeException(e);
        }

        return article;
    }

    private Set<String> extractTags(Element node) {
        if (node.children().size() == 0)
            return NO_STRINGS;

        Elements elements = Selector.select(A_REL_TAG_SELECTOR, node);
        if (elements.size() == 0)
            return NO_STRINGS;

        Set<String> tags = new HashSet<String>(elements.size());
        for (Element el : elements) {
            String tag = el.text();
            if (!string.isNullOrEmpty(tag))
                tags.add(tag);
        }

        return tags;
    }

    // used for Gawker-type AJAX sites that use hashbang (#!) urls; rewrites them to the _escaped_fragment_ form
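    // A hypothetical example of the rewrite performed below (the url is made up, not from the original source):
    //   input:  http://www.example.com/#!/2011/01/some-story
    //   output: http://www.example.com/?_escaped_fragment_=/2011/01/some-story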
    private String getUrlToCrawl(String urlToCrawl) {
        String finalURL;
        if (urlToCrawl.contains("#!")) {
            finalURL = ESCAPED_FRAGMENT_REPLACEMENT.replaceAll(urlToCrawl);
        } else {
            finalURL = urlToCrawl;
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Goose Extraction: " + finalURL);
        }

        return finalURL;
    }

    // todo create a setter for this for people to override output formatter
    private OutputFormatter getOutputFormatter() {
        if (outputFormatter == null) {
            return new DefaultOutputFormatter();
        } else {
            return outputFormatter;
        }

    }

    private ImageExtractor getImageExtractor(HttpClient httpClient, String urlToCrawl) {

        if (imageExtractor == null) {
            BestImageGuesser bestImageGuesser = new BestImageGuesser(this.config, httpClient, urlToCrawl);
            return bestImageGuesser;
        } else {
            return imageExtractor;
        }

    }

    /**
     * todo allow for setter to override the default documentCleaner in case user wants more flexibility
     *
     * @return
     */
    private DocumentCleaner getDocCleaner() {
        if (this.documentCleaner == null) {
            this.documentCleaner = new DefaultDocumentCleaner();
        }
        return this.documentCleaner;
    }

    /**
     * attempts to grab titles from the html pages; lots of sites use different delimiters
     * for titles so we'll try to make our best guess.
     *
     *
     * @param doc
     * @return
     */
    private String getTitle(Document doc) {
        String title = string.empty;

        try {

            Elements titleElem = doc.getElementsByTag("title");
            if (titleElem == null || titleElem.isEmpty())
                return string.empty;

            String titleText = titleElem.first().text();

            if (string.isNullOrEmpty(titleText))
                return string.empty;

            boolean usedDelimeter = false;

            if (titleText.contains("|")) {
                titleText = doTitleSplits(titleText, PIPE_SPLITTER);
                usedDelimeter = true;
            }

            if (!usedDelimeter && titleText.contains("-")) {
                titleText = doTitleSplits(titleText, DASH_SPLITTER);
                usedDelimeter = true;
            }
            if (!usedDelimeter && titleText.contains("")) {
                titleText = doTitleSplits(titleText, ARROWS_SPLITTER);
                usedDelimeter = true;
            }

            if (!usedDelimeter && titleText.contains(":")) {
                titleText = doTitleSplits(titleText, COLON_SPLITTER);
            }

            // encode unicode charz
            title = StringEscapeUtils.escapeHtml(titleText);

            // todo this is a hack until I can fix this.. weird motley crue error with
            // http://money.cnn.com/2010/10/25/news/companies/motley_crue_bp.fortune/index.htm?section=money_latest
            title = MOTLEY_REPLACEMENT.replaceAll(title);

            if (logger.isDebugEnabled()) {
                logger.debug("Page title is: " + title);
            }

        } catch (NullPointerException e) {
            logger.error(e.toString());
        }
        return title;

    }
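
    // Illustrative example for doTitleSplits below (the title is made up, not from the original source):
    // "Some Breaking Story | Example News" is split on the pipe, the longer piece wins,
    // and after the replacements and trim the returned title is "Some Breaking Story".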

    /**
     * based on a delimiter in the title, take the longest piece or do some custom logic based on the site
     *
     * @param title
     * @param splitter
     * @return
     */
    private String doTitleSplits(String title, StringSplitter splitter) {
        int largestTextLen = 0;
        int largestTextIndex = 0;

        String[] titlePieces = splitter.split(title);

        // take the largest split
        for (int i = 0; i < titlePieces.length; i++) {
            String current = titlePieces[i];
            if (current.length() > largestTextLen) {
                largestTextLen = current.length();
                largestTextIndex = i;
            }
        }

        return TITLE_REPLACEMENTS.replaceAll(titlePieces[largestTextIndex]).trim();
    }

    private String getMetaContent(Document doc, String metaName) {
        Elements meta = doc.select(metaName);
        if (meta.size() > 0) {
            String content = meta.first().attr("content");
            return string.isNullOrEmpty(content) ? string.empty : content.trim();
        }
        return string.empty;
    }

    /**
     * if the article has meta description set in the source, use that
     */
    private String getMetaDescription(Document doc) {
        return getMetaContent(doc, "meta[name=description]");
    }

    /**
     * if the article has meta keywords set in the source, use that
     */
    private String getMetaKeywords(Document doc) {
        return getMetaContent(doc, "meta[name=keywords]");
    }

    /**
     * if the article has a canonical link set in the source, use that; otherwise fall back to the page url
     */
    private String getCanonicalLink(Document doc, String baseUrl) {
        Elements meta = doc.select("link[rel=canonical]");
        if (meta.size() > 0) {
            String href = meta.first().attr("href");
            return string.isNullOrEmpty(href) ? string.empty : href.trim();
        } else {
            return baseUrl;
        }

        /*    Not sure what this is for
            // set domain based on canonicalUrl
            URL url = null;
            try {
            
              if (canonicalUrl != null) {
                if (!canonicalUrl.startsWith("http://")) {
                  url = new URL(new URL(baseUrl), canonicalUrl);
                } else {
                  url = new URL(canonicalUrl);
                }
            
              } else {
                url = new URL(baseUrl);
              }
            
            } catch (MalformedURLException e) {
              logger.error(e.toString(), e);
            }*/
    }

    private String getDomain(String canonicalLink) {
        try {
            return new URL(canonicalLink).getHost();
        } catch (MalformedURLException e) {
            throw new RuntimeException(e);
        }
    }
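
    // Illustrative walk-through of the scoring in calculateBestNodeBasedOnClustering below (numbers are assumed,
    // not from the original source): with startingBoost beginning at 1.0, the first few boostable paragraphs
    // receive location boosts of (1.0 / 1.0) * 50 = 50, (1.0 / 2.0) * 50 = 25, (1.0 / 3.0) * 50 ~= 16.7, and so on.
    // When more than 15 text nodes exist, the bottom 25% are penalized quadratically, e.g. for 20 nodes the last
    // five receive 0, -1, -4, -9 and -16, before the stop-word count is added into the parent's gravityScore.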

    /**
     * we're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords
     * and the number of consecutive paragraphs together, which should form the cluster of text that this node is around.
     * We also factor in how high up the paragraphs are; comments are usually at the bottom and should get a lower score
     *
     * @return
     */
    private Element calculateBestNodeBasedOnClustering(Document doc) {
        Element topNode = null;

        // grab all the paragraph elements on the page to start to inspect the likelihood of them being good peeps
        ArrayList<Element> nodesToCheck = getNodesToCheck(doc);

        double startingBoost = 1.0;
        int cnt = 0;
        int i = 0;

        // holds all the parents of the nodes we're checking
        Set<Element> parentNodes = new HashSet<Element>();

        ArrayList<Element> nodesWithText = new ArrayList<Element>();

        for (Element node : nodesToCheck) {

            String nodeText = node.text();
            WordStats wordStats = StopWords.getStopWordCount(nodeText);
            boolean highLinkDensity = isHighLinkDensity(node);

            if (wordStats.getStopWordCount() > 2 && !highLinkDensity) {

                nodesWithText.add(node);
            }

        }

        int numberOfNodes = nodesWithText.size();
        int negativeScoring = 0; // we shouldn't give more negatives than positives
        // we want to give the last 25% of nodes negative scores in case they're comments
        double bottomNodesForNegativeScore = (float) numberOfNodes * 0.25;

        if (logger.isDebugEnabled()) {
            logger.debug("About to inspect num of nodes with text: " + numberOfNodes);
        }

        for (Element node : nodesWithText) {

            // add parents and grandparents to scoring
            // only add boost to the middle paragraphs, top and bottom is usually jankz city
            // so basically what we're doing is giving boost scores to paragraphs that appear higher up in the dom
            // and giving lower, even negative scores to those who appear lower which could be commenty stuff

            float boostScore = 0;

            if (isOkToBoost(node)) {
                if (cnt >= 0) {
                    boostScore = (float) ((1.0 / startingBoost) * 50);
                    startingBoost++;
                }
            }

            // check for negative node values
            if (numberOfNodes > 15) {
                if ((numberOfNodes - i) <= bottomNodesForNegativeScore) {
                    float booster = (float) bottomNodesForNegativeScore - (float) (numberOfNodes - i);
                    boostScore = -(float) Math.pow(booster, (float) 2);

                    // we don't want to score too highly on the negative side.
                    float negscore = Math.abs(boostScore) + negativeScoring;
                    if (negscore > 40) {
                        boostScore = 5;
                    }
                }
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Location Boost Score: " + boostScore + " on interation: " + i + "' id='"
                        + node.parent().id() + "' class='" + node.parent().attr("class"));
            }
            String nodeText = node.text();
            WordStats wordStats = StopWords.getStopWordCount(nodeText);
            int upscore = (int) (wordStats.getStopWordCount() + boostScore);
            updateScore(node.parent(), upscore);
            updateScore(node.parent().parent(), upscore / 2);
            updateNodeCount(node.parent(), 1);
            updateNodeCount(node.parent().parent(), 1);

            if (!parentNodes.contains(node.parent())) {
                parentNodes.add(node.parent());
            }

            if (!parentNodes.contains(node.parent().parent())) {
                parentNodes.add(node.parent().parent());
            }

            cnt++;
            i++;
        }

        // now let's find the parent node who scored the highest

        int topNodeScore = 0;
        for (Element e : parentNodes) {

            if (logger.isDebugEnabled()) {
                logger.debug("ParentNode: score='" + e.attr("gravityScore") + "' nodeCount='"
                        + e.attr("gravityNodes") + "' id='" + e.id() + "' class='" + e.attr("class") + "' ");
            }
            //int score = Integer.parseInt(e.attr("gravityScore")) * Integer.parseInt(e.attr("gravityNodes"));
            int score = getScore(e);
            if (score > topNodeScore) {
                topNode = e;
                topNodeScore = score;
            }

            if (topNode == null) {
                topNode = e;
            }
        }

        if (logger.isDebugEnabled()) {
            if (topNode == null) {
                logger.debug("ARTICLE NOT ABLE TO BE EXTRACTED!, WE HAZ FAILED YOU LORD VADAR");
            } else {
                String logText;
                String targetText = "";
                Element topPara = topNode.getElementsByTag("p").first();
                if (topPara == null) {
                    targetText = topNode.text();
                } else {
                    targetText = topPara.text();
                }

                if (targetText.length() >= 51) {
                    logText = targetText.substring(0, 50);
                } else {
                    logText = targetText;
                }
                logger.debug("TOPNODE TEXT: " + logText.trim());
                logger.debug("Our TOPNODE: score='" + topNode.attr("gravityScore") + "' nodeCount='"
                        + topNode.attr("gravityNodes") + "' id='" + topNode.id() + "' class='"
                        + topNode.attr("class") + "' ");
            }
        }

        return topNode;

    }

    /**
     * returns a list of nodes we want to search on like paragraphs and tables
     *
     * @return
     */
    private ArrayList<Element> getNodesToCheck(Document doc) {
        ArrayList<Element> nodesToCheck = new ArrayList<Element>();

        nodesToCheck.addAll(doc.getElementsByTag("p"));
        nodesToCheck.addAll(doc.getElementsByTag("pre"));
        nodesToCheck.addAll(doc.getElementsByTag("td"));
        return nodesToCheck;

    }
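
    // Illustrative example for isHighLinkDensity below (numbers are assumed, not from the original source):
    // a node with 20 words of text containing three links whose anchor text totals 15 words scores
    // (15 / 20) * 3 = 2.25, which is > 1, so the node is treated as link-heavy and rejected;
    // a 100-word paragraph with a single 3-word link scores (3 / 100) * 1 = 0.03 and is kept.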

    /**
     * checks the density of links within a node, is there not much text and most of it contains linky shit?
     * if so it's no good
     *
     * @param e
     * @return
     */
    private static boolean isHighLinkDensity(Element e) {

        Elements links = e.getElementsByTag("a");

        if (links.size() == 0) {
            return false;
        }

        String text = e.text().trim();
        String[] words = SPACE_SPLITTER.split(text);
        float numberOfWords = words.length;

        // let's loop through all the links and calculate the number of words that make up the links
        StringBuilder sb = new StringBuilder();
        for (Element link : links) {
            sb.append(link.text());
        }
        String linkText = sb.toString();
        String[] linkWords = SPACE_SPLITTER.split(linkText);
        float numberOfLinkWords = linkWords.length;

        float numberOfLinks = links.size();

        float linkDivisor = numberOfLinkWords / numberOfWords;
        float score = linkDivisor * numberOfLinks;

        if (logger.isDebugEnabled()) {
            String logText;
            if (e.text().length() >= 51) {
                logText = e.text().substring(0, 50);
            } else {
                logText = e.text();
            }
            logger.debug("Calulated link density score as: " + score + " for node: " + logText);
        }
        if (score > 1) {
            return true;
        }

        return false;
    }

    /**
     * a lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to
     * boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs
     * so we'll want to make sure that the next sibling is a paragraph and has at least some substantial weight to it
     *
     *
     * @param node
     * @return
     */
    private boolean isOkToBoost(Element node) {

        int stepsAway = 0;

        Element sibling = node.nextElementSibling();
        while (sibling != null) {

            if (sibling.tagName().equals("p")) {
                if (stepsAway >= 3) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Next paragraph is too far away, not boosting");
                    }
                    return false;
                }

                String paraText = sibling.text();
                WordStats wordStats = StopWords.getStopWordCount(paraText);
                if (wordStats.getStopWordCount() > 5) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("We're gonna boost this node, seems contenty");
                    }
                    return true;
                }

            }

            // increase how far away the next paragraph is from this node
            stepsAway++;

            sibling = sibling.nextElementSibling();
        }

        return false;
    }

    /**
     * adds a score to the gravityScore Attribute we put on divs
     * we'll get the current score then add the score we're passing in to the current
     *
     * @param node
     * @param addToScore - the score to add to the node
     */
    private void updateScore(Element node, int addToScore) {
        int currentScore;
        try {
            String scoreString = node.attr("gravityScore");
            currentScore = string.isNullOrEmpty(scoreString) ? 0 : Integer.parseInt(scoreString);
        } catch (NumberFormatException e) {
            currentScore = 0;
        }
        int newScore = currentScore + addToScore;
        node.attr("gravityScore", Integer.toString(newScore));

    }

    /**
     * stores how many decent nodes are under a parent node
     *
     * @param node
     * @param addToCount
     */
    private void updateNodeCount(Element node, int addToCount) {
        int currentScore;
        try {
            String countString = node.attr("gravityNodes");
            currentScore = string.isNullOrEmpty(countString) ? 0 : Integer.parseInt(countString);
        } catch (NumberFormatException e) {
            currentScore = 0;
        }
        int newScore = currentScore + addToCount;
        node.attr("gravityNodes", Integer.toString(newScore));

    }

    /**
     * returns the gravityScore as an integer from this node
     *
     * @param node
     * @return
     */
    private int getScore(Element node) {
        if (node == null)
            return 0;
        try {
            String grvScoreString = node.attr("gravityScore");
            if (string.isNullOrEmpty(grvScoreString))
                return 0;
            return Integer.parseInt(grvScoreString);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * pulls out videos we like
     *
     * @return
     */
    private ArrayList<Element> extractVideos(Element node) {
        ArrayList<Element> candidates = new ArrayList<Element>();
        ArrayList<Element> goodMovies = new ArrayList<Element>();
        try {

            Elements embeds = node.parent().getElementsByTag("embed");
            for (Element el : embeds) {
                candidates.add(el);
            }
            Elements objects = node.parent().getElementsByTag("object");
            for (Element el : objects) {
                candidates.add(el);
            }
            if (logger.isDebugEnabled()) {
                logger.debug("extractVideos: Starting to extract videos. Found: " + candidates.size());
            }

            for (Element el : candidates) {

                Attributes attrs = el.attributes();

                for (Attribute a : attrs) {
                    try {
                        if (logger.isDebugEnabled()) {
                            logger.debug(a.getKey() + " : " + a.getValue());
                        }
                        if ((a.getValue().contains("youtube") || a.getValue().contains("vimeo"))
                                && a.getKey().equals("src")) {
                            if (logger.isDebugEnabled()) {
                                logger.debug("Found video... setting");
                                logger.debug("This page has a video!: " + a.getValue());
                            }
                            goodMovies.add(el);

                        }
                    } catch (Exception e) {
                        logger.error(e.toString());
                        e.printStackTrace();
                    }
                }

            }
        } catch (NullPointerException e) {
            logger.error(e.toString(), e);
        } catch (Exception e) {
            logger.error(e.toString(), e);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("extractVideos:  done looking videos");
        }
        return goodMovies;
    }
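
    // Illustrative example for the threshold check in cleanupNode below (scores are assumed, not from the
    // original source): if the top node's gravityScore is 200, the threshold is 200 * 0.08 = 16, so a child
    // div scoring 5 is removed while a td element with the same score is kept.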

    /**
     * remove any divs that look like non-content, clusters of links, or paras with no gusto
     *
     * @param node
     * @return
     */
    private Element cleanupNode(Element node) {
        if (logger.isDebugEnabled()) {
            logger.debug("Starting cleanup Node");
        }

        node = addSiblings(node);

        Elements nodes = node.children();
        for (Element e : nodes) {
            if (e.tagName().equals("p")) {
                continue;
            }
            if (logger.isDebugEnabled()) {
                logger.debug("CLEANUP  NODE: " + e.id() + " class: " + e.attr("class"));
            }
            boolean highLinkDensity = isHighLinkDensity(e);
            if (highLinkDensity) {
                if (logger.isDebugEnabled()) {
                    logger.debug("REMOVING  NODE FOR LINK DENSITY: " + e.id() + " class: " + e.attr("class"));
                }
                e.remove();
                continue;
            }
            // now check for word density
            // grab all the paragraphs in the children and remove ones that are too small to matter
            Elements subParagraphs = e.getElementsByTag("p");

            for (Element p : subParagraphs) {
                if (p.text().length() < 25) {
                    p.remove();
                }
            }

            // now that we've removed shorty paragraphs let's make sure to exclude any first paragraphs that don't have paras as
            // their next siblings to avoid getting img bylines
            // first let's remove any element that now doesn't have any p tags at all
            Elements subParagraphs2 = e.getElementsByTag("p");
            if (subParagraphs2.size() == 0 && !e.tagName().equals("td")) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Removing node because it doesn't have any paragraphs");
                }
                e.remove();
                continue;
            }

            //if this node has a decent enough gravityScore we should keep it as well, might be content
            int topNodeScore = getScore(node);
            int currentNodeScore = getScore(e);
            float thresholdScore = (float) (topNodeScore * .08);
            if (logger.isDebugEnabled()) {
                logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore
                        + " threshold: " + thresholdScore);
            }
            if (currentNodeScore < thresholdScore) {
                if (!e.tagName().equals("td")) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Removing node due to low threshold score");
                    }
                    e.remove();
                } else {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Not removing TD node");
                    }
                }

                continue;
            }

        }

        return node;

    }

    /**
     * adds any siblings that may have a decent score to this node
     *
     * @param node
     * @return
     */
    private Element addSiblings(Element node) {
        if (logger.isDebugEnabled()) {
            logger.debug("Starting to add siblings");
        }
        int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node);

        Element currentSibling = node.previousElementSibling();
        while (currentSibling != null) {
            if (logger.isDebugEnabled()) {
                logger.debug("SIBLINGCHECK: " + debugNode(currentSibling));
            }

            if (currentSibling.tagName().equals("p")) {

                node.child(0).before(currentSibling.outerHtml());
                currentSibling = currentSibling.previousElementSibling();
                continue;
            }

            // check for a paragraph embedded in a containing element
            int insertedSiblings = 0;
            Elements potentialParagraphs = currentSibling.getElementsByTag("p");
            if (potentialParagraphs.first() == null) {
                currentSibling = currentSibling.previousElementSibling();
                continue;
            }
            for (Element firstParagraph : potentialParagraphs) {
                WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text());

                int paragraphScore = wordStats.getStopWordCount();

                if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("This node looks like a good sibling, adding it");
                    }
                    node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "</p>");
                    insertedSiblings++;
                }

            }

            currentSibling = currentSibling.previousElementSibling();
        }
        return node;

    }

    /**
     * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against
     * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring
     * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of
     * 100 then 100 should be our base.
     *
     * @param topNode
     * @return
     */
    private int getBaselineScoreForSiblings(Element topNode) {

        int base = 100000;

        int numberOfParagraphs = 0;
        int scoreOfParagraphs = 0;

        Elements nodesToCheck = topNode.getElementsByTag("p");

        for (Element node : nodesToCheck) {

            String nodeText = node.text();
            WordStats wordStats = StopWords.getStopWordCount(nodeText);
            boolean highLinkDensity = isHighLinkDensity(node);

            if (wordStats.getStopWordCount() > 2 && !highLinkDensity) {

                numberOfParagraphs++;
                scoreOfParagraphs += wordStats.getStopWordCount();
            }

        }

        if (numberOfParagraphs > 0) {
            base = scoreOfParagraphs / numberOfParagraphs;
            if (logger.isDebugEnabled()) {
                logger.debug("The base score for siblings to beat is: " + base + " NumOfParas: "
                        + numberOfParagraphs + " scoreOfAll: " + scoreOfParagraphs);
            }
        }

        return base;

    }

    private String debugNode(Element e) {

        StringBuilder sb = new StringBuilder();
        sb.append("GravityScore: '");
        sb.append(e.attr("gravityScore"));
        sb.append("' paraNodeCount: '");
        sb.append(e.attr("gravityNodes"));
        sb.append("' nodeId: '");
        sb.append(e.id());
        sb.append("' className: '");
        sb.append(e.attr("class"));
        return sb.toString();

    }

    /**
     * cleans up any temp shit we have lying around like temp images
     * removes any image in the temp dir that starts with the linkhash of the url we just parsed
     */
    public void releaseResources() {
        if (logger.isDebugEnabled()) {
            logger.debug("STARTING TO RELEASE ALL RESOURCES");
        }
        File dir = new File(config.getLocalStoragePath());
        String[] children = dir.list();

        if (children == null) {
            logger.debug("No Temp images found for linkhash: " + this.linkhash);
        } else {
            for (int i = 0; i < children.length; i++) {
                // Get filename of file or directory
                String filename = children[i];

                if (filename.startsWith(this.linkhash)) {

                    File f = new File(dir.getAbsolutePath() + "/" + filename);
                    if (!f.delete()) {
                        logger.error("Unable to remove temp file: " + filename);
                    }
                }
            }
        }

    }

}