Example usage for org.jsoup.nodes Element parent

List of usage examples for org.jsoup.nodes.Element.parent()

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.parent().

Prototype

@Override
    public final Element parent() 

Source Link

Usage

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * ??:/*from   w  ww .ja v  a  2  s.  com*/
 * 1. ???
 * 2. ???????
 * 3. ??
 * 4. ?? ??
 * 5. ?
 *
 * @return
 * @throws XpathSyntaxErrorException
 */
private String getAuthor() throws XpathSyntaxErrorException {
    String author = "";
    if (StringUtils.isBlank(srcTime)) {
        author = getAuthor(doc.body().html());
        return author;
    }
    Element cur = doc.body().select("*:containsOwn(" + srcTime + ")").first();
    if (cur == null) {
        LOG.warn("?srcTime=" + srcTime);
        author = getAuthor(doc.body().html());
        return author;
    }

    if (!noText(cur)) {
        String arr[] = cur.html().split(srcTime);
        for (String text : arr) {
            author = getShortText(text);
            if (!StringUtils.isBlank(author))
                return author;
        }
    }
    Element parent = cur.parent();
    while (parent != null && noText(parent)) {
        cur = parent;
        parent = parent.parent();
    }
    author = getAuthor(parent.html());
    if (!StringUtils.isBlank(author))
        return author;

    Element pre = cur.previousElementSibling();
    while (pre != null && noText(pre)) {
        pre = pre.previousElementSibling();
    }
    if (pre != null) {
        author = getShortText(pre.text());
    }
    if (!StringUtils.isBlank(author))
        return author;
    Element next = cur.nextElementSibling();
    while (next != null && noText(next)) {
        next = next.nextElementSibling();
    }
    if (next != null) {
        author = getShortText(next.text());
    }
    if (!StringUtils.isBlank(author))
        return author;

    author = getShortText(parent.html().replace(srcTime, " "));
    if (!StringUtils.isBlank(author))
        return author;

    author = getAuthor(doc.body().html());
    if (StringUtils.isBlank(author)) {
        return author_bak;
    }
    return author;
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * Pulls out the videos we like: {@code <embed>} and {@code <object>} elements
 * below the parent of {@code node} whose {@code src} attribute value contains
 * "youtube" or "vimeo".
 *
 * @param node the node whose parent element is searched for video tags
 * @return the matching elements; empty when {@code node} has no parent or
 *         nothing matched (never {@code null})
 */
private ArrayList<Element> extractVideos(Element node) {
    ArrayList<Element> candidates = new ArrayList<Element>();
    ArrayList<Element> goodMovies = new ArrayList<Element>();
    try {
        // Check for a missing parent explicitly instead of relying on a
        // caught NullPointerException.
        Element parent = (node == null) ? null : node.parent();
        if (parent == null) {
            logger.error("extractVideos: node has no parent element, nothing to search");
        } else {
            for (Element el : parent.getElementsByTag("embed")) {
                candidates.add(el);
            }
            for (Element el : parent.getElementsByTag("object")) {
                candidates.add(el);
            }
        }
        if (logger.isDebugEnabled()) {
            logger.debug("extractVideos: Starting to extract videos. Found: " + candidates.size());
        }

        for (Element el : candidates) {
            for (Attribute a : el.attributes()) {
                if (logger.isDebugEnabled()) {
                    logger.debug(a.getKey() + " : " + a.getValue());
                }
                // Only the src attribute identifies the video host.
                if ("src".equals(a.getKey())
                        && (a.getValue().contains("youtube") || a.getValue().contains("vimeo"))) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Found video... setting");
                        logger.debug("This page has a video!: " + a.getValue());
                    }
                    goodMovies.add(el);
                }
            }
        }
    } catch (Exception e) {
        // Best effort: video extraction must not abort the caller; keep the
        // stack trace in the log instead of printing it to stderr.
        logger.error(e.toString(), e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("extractVideos:  done looking videos");
    }
    return goodMovies;
}

From source file:de.geeksfactory.opacclient.apis.BiBer1992.java

/**
 * Looks up the visible label of an input whose markup is missing its end tag,
 * by searching the parent's HTML for {@code value="<val>" ... >label}.
 *
 * @param inputTag the input element whose label is wanted
 * @return the text following the matching tag with {@code &nbsp;} replaced by
 *         spaces and trimmed, or the input's own value when there is no
 *         parent element or no match
 */
private String parse_option_regex(Element inputTag) {
    String optStr = inputTag.val();
    Element parent = inputTag.parent();
    if (parent == null) {
        // Detached element: there is no surrounding markup to search.
        return optStr;
    }

    // The value is literal text; quote it so that characters such as
    // '(', '+', '[' or '.' cannot break or distort the pattern.
    Pattern pattern = Pattern.compile("value=\"" + Pattern.quote(optStr) + "\".*?>([^<]+)");
    Matcher matcher = pattern.matcher(parent.html());
    if (matcher.find()) {
        return matcher.group(1).replaceAll("&nbsp;", " ").trim();
    }
    return optStr;
}

From source file:me.vertretungsplan.parser.UntisCommonParser.java

/**
 * Parses an Untis substitution schedule table
 *
 * Two table layouts are handled: one where the class is given in separate
 * <code>td.inline_header</code> rows that precede the rows belonging to it, and one
 * where every row is self-contained and the class comes from a "class" column or
 * from <code>defaultClass</code>. In both layouts the meaning of each cell is taken
 * from the column type list stored under <code>PARAM_COLUMNS</code> in <code>data</code>.
 *
 * @param table        the <code>table</code> Element from the HTML document
 * @param data         {@link SubstitutionScheduleData#getData()}
 * @param day          the {@link SubstitutionScheduleDay} where the substitutions will be stored
 * @param defaultClass the class that should be set if there is no class column in the table
 * @throws JSONException              if reading the column configuration from <code>data</code> fails
 * @throws CredentialInvalidException declared for the helpers this method calls
 */
private void parseSubstitutionScheduleTable(Element table, JSONObject data, SubstitutionScheduleDay day,
        String defaultClass) throws JSONException, CredentialInvalidException {
    if (data.optBoolean(PARAM_CLASS_IN_EXTRA_LINE) || data.optBoolean("class_in_extra_line")) { // backwards compatibility
        // Layout 1: each inline header cell names a class; the rows after it (up to the
        // next inline header) are the substitutions for that class.
        for (Element element : table.select("td.inline_header")) {
            String className = getClassName(element.text(), data);
            if (isValidClass(className)) {
                Element zeile = null;
                try {
                    zeile = element.parent().nextElementSibling();
                    // NOTE(review): jsoup's select() appears to return an empty Elements rather
                    // than null, so this condition probably never holds; zeile may also be null
                    // here, in which case the NPE ends up in the catch below - confirm intent.
                    if (zeile.select("td") == null) {
                        zeile = zeile.nextElementSibling();
                    }
                    // Number of following rows already merged into the current one as continuations.
                    int skipLines = 0;
                    while (zeile != null && !zeile.select("td").attr("class").equals("list inline_header")) {
                        if (skipLines > 0) {
                            skipLines--;
                            zeile = zeile.nextElementSibling();
                            continue;
                        }

                        Substitution v = new Substitution();

                        // Index into the configured column types; advanced for every cell,
                        // including empty ones.
                        int i = 0;
                        for (Element spalte : zeile.select("td")) {
                            String text = spalte.text();
                            if (isEmpty(text)) {
                                i++;
                                continue;
                            }

                            // A following row with the same number of cells whose entire text
                            // equals the text of this column is treated as a continuation of
                            // this cell and appended to it.
                            int skipLinesForThisColumn = 0;
                            Element nextLine = zeile.nextElementSibling();
                            boolean continueSkippingLines = true;
                            while (continueSkippingLines) {
                                if (nextLine != null && nextLine.children().size() == zeile.children().size()) {
                                    Element columnInNextLine = nextLine.child(spalte.elementSiblingIndex());
                                    if (columnInNextLine.text().replaceAll("\u00A0", "").trim()
                                            .equals(nextLine.text().replaceAll("\u00A0", "").trim())) {
                                        // Continued in the next line
                                        text += " " + columnInNextLine.text();
                                        skipLinesForThisColumn++;
                                        nextLine = nextLine.nextElementSibling();
                                    } else {
                                        continueSkippingLines = false;
                                    }
                                } else {
                                    continueSkippingLines = false;
                                }
                            }
                            // Skip as many rows as the longest continuation found in any column.
                            if (skipLinesForThisColumn > skipLines)
                                skipLines = skipLinesForThisColumn;

                            String type = data.getJSONArray(PARAM_COLUMNS).getString(i);

                            // Store the cell in the substitution according to its configured type.
                            switch (type) {
                            case "lesson":
                                v.setLesson(text);
                                break;
                            case "subject":
                                handleSubject(v, spalte);
                                break;
                            case "previousSubject":
                                v.setPreviousSubject(text);
                                break;
                            case "type":
                                v.setType(text);
                                v.setColor(colorProvider.getColor(text));
                                break;
                            case "type-entfall":
                                if (text.equals("x")) {
                                    v.setType("Entfall");
                                    v.setColor(colorProvider.getColor("Entfall"));
                                } else {
                                    v.setType("Vertretung");
                                    v.setColor(colorProvider.getColor("Vertretung"));
                                }
                                break;
                            case "room":
                                handleRoom(v, spalte);
                                break;
                            case "teacher":
                                handleTeacher(v, spalte, data);
                                break;
                            case "previousTeacher":
                                v.setPreviousTeachers(splitTeachers(text, data));
                                break;
                            case "desc":
                                v.setDesc(text);
                                break;
                            case "desc-type":
                                v.setDesc(text);
                                String recognizedType = recognizeType(text);
                                v.setType(recognizedType);
                                v.setColor(colorProvider.getColor(recognizedType));
                                break;
                            case "previousRoom":
                                v.setPreviousRoom(text);
                                break;
                            case "substitutionFrom":
                                v.setSubstitutionFrom(text);
                                break;
                            case "teacherTo":
                                v.setTeacherTo(text);
                                break;
                            case "ignore":
                                break;
                            case "date": // used by UntisSubstitutionParser
                                break;
                            default:
                                throw new IllegalArgumentException("Unknown column type: " + type);
                            }
                            i++;
                        }

                        autoDetectType(data, zeile, v);

                        v.getClasses().add(className);

                        // Rows without a lesson are not stored.
                        if (v.getLesson() != null && !v.getLesson().equals("")) {
                            day.addSubstitution(v);
                        }

                        zeile = zeile.nextElementSibling();

                    }
                } catch (Throwable e) {
                    // NOTE(review): this swallows every failure for the current class, including
                    // the IllegalArgumentException thrown above for unknown column types and the
                    // declared JSONException; only the stack trace is printed.

                    e.printStackTrace();
                }
            }
        }
    } else {
        // Layout 2: every row is self-contained.
        // Remember whether a dedicated "type" column is configured; if so, the
        // "type-entfall" column below does not set the default type "Vertretung".
        boolean hasType = false;
        for (int i = 0; i < data.getJSONArray(PARAM_COLUMNS).length(); i++) {
            if (data.getJSONArray(PARAM_COLUMNS).getString(i).equals("type")) {
                hasType = true;
            }
        }
        // Number of upcoming rows already merged into a previous row as continuations.
        int skipLines = 0;
        for (Element zeile : table.select("tr.list.odd:not(:has(td.inline_header)), "
                + "tr.list.even:not(:has(td.inline_header)), " + "tr:has(td[align=center]):gt(0)")) {
            if (skipLines > 0) {
                skipLines--;
                continue;
            }

            Substitution v = new Substitution();
            String klassen = defaultClass != null ? defaultClass : "";
            int i = 0;
            for (Element spalte : zeile.select("td")) {
                String text = spalte.text();

                String type = data.getJSONArray(PARAM_COLUMNS).getString(i);
                // Empty cells are skipped, except for "type-entfall" where an empty
                // cell is still evaluated by the switch below.
                if (isEmpty(text) && !type.equals("type-entfall")) {
                    i++;
                    continue;
                }

                // Same continuation-row detection as in the first layout.
                int skipLinesForThisColumn = 0;
                Element nextLine = zeile.nextElementSibling();
                boolean continueSkippingLines = true;
                while (continueSkippingLines) {
                    if (nextLine != null && nextLine.children().size() == zeile.children().size()) {
                        Element columnInNextLine = nextLine.child(spalte.elementSiblingIndex());
                        if (columnInNextLine.text().replaceAll("\u00A0", "").trim()
                                .equals(nextLine.text().replaceAll("\u00A0", "").trim())) {
                            // Continued in the next line
                            text += " " + columnInNextLine.text();
                            skipLinesForThisColumn++;
                            nextLine = nextLine.nextElementSibling();
                        } else {
                            continueSkippingLines = false;
                        }
                    } else {
                        continueSkippingLines = false;
                    }
                }
                if (skipLinesForThisColumn > skipLines)
                    skipLines = skipLinesForThisColumn;

                switch (type) {
                case "lesson":
                    v.setLesson(text);
                    break;
                case "subject":
                    handleSubject(v, spalte);
                    break;
                case "previousSubject":
                    v.setPreviousSubject(text);
                    break;
                case "type":
                    v.setType(text);
                    v.setColor(colorProvider.getColor(text));
                    break;
                case "type-entfall":
                    if (text.equals("x")) {
                        v.setType("Entfall");
                        v.setColor(colorProvider.getColor("Entfall"));
                    } else if (!hasType) {
                        v.setType("Vertretung");
                        v.setColor(colorProvider.getColor("Vertretung"));
                    }
                    break;
                case "room":
                    handleRoom(v, spalte);
                    break;
                case "previousRoom":
                    v.setPreviousRoom(text);
                    break;
                case "desc":
                    v.setDesc(text);
                    break;
                case "desc-type":
                    v.setDesc(text);
                    String recognizedType = recognizeType(text);
                    v.setType(recognizedType);
                    v.setColor(colorProvider.getColor(recognizedType));
                    break;
                case "teacher":
                    handleTeacher(v, spalte, data);
                    break;
                case "previousTeacher":
                    v.setPreviousTeachers(splitTeachers(text, data));
                    break;
                case "substitutionFrom":
                    v.setSubstitutionFrom(text);
                    break;
                case "teacherTo":
                    v.setTeacherTo(text);
                    break;
                case "class":
                    klassen = getClassName(text, data);
                    break;
                case "ignore":
                    break;
                case "date": // used by UntisSubstitutionParser
                    break;
                default:
                    throw new IllegalArgumentException("Unknown column type: " + type);
                }
                i++;
            }

            // Rows without a lesson are dropped before any class handling.
            if (v.getLesson() == null || v.getLesson().equals("")) {
                continue;
            }

            autoDetectType(data, zeile, v);

            // Resolve the class text of this row to the list of affected classes.
            List<String> affectedClasses;

            // Detect things like "7"
            Pattern singlePattern = Pattern.compile("(\\d+)");
            Matcher singleMatcher = singlePattern.matcher(klassen);

            // Detect things like "5-12"
            Pattern rangePattern = Pattern.compile("(\\d+) ?- ?(\\d+)");
            Matcher rangeMatcher = rangePattern.matcher(klassen);

            // Extracts the leading number (the grade) of a class name.
            Pattern pattern2 = Pattern.compile("^(\\d+).*");

            if (rangeMatcher.matches()) {
                // Grade range: every known class whose leading number lies within [min, max].
                affectedClasses = new ArrayList<>();
                int min = Integer.parseInt(rangeMatcher.group(1));
                int max = Integer.parseInt(rangeMatcher.group(2));
                try {
                    for (String klasse : getAllClasses()) {
                        Matcher matcher2 = pattern2.matcher(klasse);
                        if (matcher2.matches()) {
                            int num = Integer.parseInt(matcher2.group(1));
                            if (min <= num && num <= max)
                                affectedClasses.add(klasse);
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else if (singleMatcher.matches()) {
                // Single grade: every known class whose leading number equals that grade.
                affectedClasses = new ArrayList<>();
                int grade = Integer.parseInt(singleMatcher.group(1));
                try {
                    for (String klasse : getAllClasses()) {
                        Matcher matcher2 = pattern2.matcher(klasse);
                        if (matcher2.matches() && grade == Integer.parseInt(matcher2.group(1))) {
                            affectedClasses.add(klasse);
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else {
                // Both the current and the legacy key default to true; the classes are only
                // treated as not separated when at least one of them is explicitly false.
                if (data.optBoolean(PARAM_CLASSES_SEPARATED, true)
                        && data.optBoolean("classes_separated", true)) { // backwards compatibility
                    affectedClasses = Arrays.asList(klassen.split(", "));
                } else {
                    affectedClasses = new ArrayList<>();
                    try {
                        for (String klasse : getAllClasses()) { // TODO: is there a better way?
                            // Builds a regex with ".*" after every character of the class name and
                            // matches the whole class text against it.
                            // NOTE(review): only '?' is escaped; other regex metacharacters in a
                            // class name would be interpreted as pattern syntax - confirm inputs.
                            StringBuilder regex = new StringBuilder();
                            for (char character : klasse.toCharArray()) {
                                if (character == '?') {
                                    regex.append("\\?");
                                } else {
                                    regex.append(character);
                                }
                                regex.append(".*");
                            }
                            if (klassen.matches(regex.toString())) {
                                affectedClasses.add(klasse);
                            }
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }

            for (String klasse : affectedClasses) {
                if (isValidClass(klasse)) {
                    v.getClasses().add(klasse);
                }
            }

            if (data.optBoolean(PARAM_MERGE_WITH_DIFFERENT_TYPE, false)) {
                // Do not add a second entry that differs from an existing one only in its
                // type; instead upgrade the existing entry to "Vertretung" if the new one is.
                boolean found = false;
                for (Substitution subst : day.getSubstitutions()) {
                    if (subst.equalsExcludingType(v)) {
                        found = true;

                        // NOTE(review): v.getType() is dereferenced without a null check; whether
                        // it can be null here depends on the configured columns and on
                        // autoDetectType - confirm.
                        if (v.getType().equals("Vertretung")) {
                            subst.setType("Vertretung");
                            subst.setColor(colorProvider.getColor("Vertretung"));
                        }

                        break;
                    }
                }
                if (!found) {
                    day.addSubstitution(v);
                }
            } else {
                day.addSubstitution(v);
            }
        }
    }
}

From source file:org.confab.PhpBB3Parser.java

/**
 * Parses the forum list of a phpBB3 board index page.
 *
 * Every <code>li.row</code> inside a <code>ul[class=topiclist forums]</code> becomes one
 * {@link Forum}; its first <code>a.forumtitle</code> link supplies url and title, any
 * further such links in the same row are added as sub-forums.
 *
 * @param root   the parsed board index document
 * @param parent the bulletin board the forums belong to
 * @return the forums found, in document order
 */
public List<Forum> parseForums(Document root, BulletinBoard parent) {
    Utilities.debug("parseForums");

    List<Forum> ret = new ArrayList<Forum>();

    // get table
    Elements forum_tables = root.select("ul[class=topiclist forums]");
    assert !forum_tables.isEmpty() : root.html();

    for (Element forum_table : forum_tables) {
        Elements els_li = forum_table.select("li.row");
        assert !els_li.isEmpty();
        for (Element el_li : els_li) {
            Forum new_forum = new Forum(parent);

            // Get the forum url
            Elements els_a = el_li.select("a.forumtitle");
            Element el_a = els_a.first();
            assert el_a != null;
            new_forum.url = el_a.attr("href");
            assert new_forum.url != null;
            Utilities.debug("new_forum.url : " + new_forum.url);

            // Get the title text
            new_forum.title = el_a.text();
            assert new_forum.title != null;
            Utilities.debug("new_forum.title : " + new_forum.title);

            // Check for any subforums in remaining a elements. Start at index 1
            // instead of removing the first entry, so the selection is not mutated,
            // and read url/title from the sub-forum link itself rather than from
            // the main forum link.
            for (int idx = 1; idx < els_a.size(); idx++) {
                Element sub_a = els_a.get(idx);
                Forum sub_forum = new Forum(parent);
                sub_forum.url = sub_a.attr("href");
                assert sub_forum.url != null;
                sub_forum.title = sub_a.text();
                assert sub_forum.title != null;
                new_forum.subForums.add(sub_forum);
                Utilities.debug("added subForum: " + sub_forum.title);
            }

            // Get the description/message of this topic
            String el_description = el_a.parent().text();
            if (el_description != null) {
                new_forum.description = el_description;
            } else {
                new_forum.description = "";
            }
            Utilities.debug("new_forum.description : " + new_forum.description);

            Utilities.debug("new_forum.parent.url : " + new_forum.parent.url);

            ret.add(new_forum);
            Utilities.debug("-----");
        }
    }
    Utilities.debug("end parseForums");
    return ret;
}

From source file:de.geeksfactory.opacclient.apis.Zones.java

/**
 * Loads the advanced search page and derives the available search fields from it:
 * one text field per option of <code>#formSelectTerm_1</code> and one dropdown per
 * filter box.
 *
 * @return the text fields followed by the filter dropdowns
 * @throws IOException if the page cannot be fetched
 */
@Override
public List<SearchField> getSearchFields() throws IOException {
    if (!initialised) {
        start();
    }
    List<SearchField> result = new ArrayList<>();

    String searchUrl = opac_url
            + "/APS_ZONES?fn=AdvancedSearch&Style=Portal3&SubStyle=&Lang=GER&ResponseEncoding=utf-8";
    Document page = Jsoup.parse(httpGet(searchUrl, getDefaultEncoding()));

    // Text fields: one per selectable search term.
    for (Element termOption : page.select("#formSelectTerm_1 option")) {
        TextSearchField textField = new TextSearchField();
        textField.setId(termOption.attr("value"));
        textField.setHint("");
        textField.setDisplayName(termOption.text());
        result.add(textField);
    }

    // Filters: the markup differs between the two supported page versions.
    Elements filterBoxes;
    if (version18) {
        filterBoxes = page.select(".inSearchLimits .floatingBox");
    } else {
        filterBoxes = page.select(".TabRechAv .limitBlock");
    }

    for (int index = 0; index < filterBoxes.size(); index++) {
        Element box = filterBoxes.get(index);
        DropdownSearchField filterField = new DropdownSearchField();
        filterField.addDropdownValue("", "Alle");
        // All dropdowns use "q.limits.limit" as URL param, so each one needs its own ID.
        filterField.setId("dropdown_" + index);

        if (version18) {
            // First row holds the caption, second row the choices.
            Elements rows = box.select("tr");
            filterField.setDisplayName(rows.get(0).text().trim());
            for (Element choice : rows.get(1).select("table td:has(input)")) {
                filterField.addDropdownValue(choice.select("input").attr("value"), choice.text().trim());
            }
        } else {
            // The caption is the element preceding the box's parent.
            String caption = box.parent().previousElementSibling().text().trim();
            filterField.setDisplayName(caption);
            for (Element choice : box.select(".limitChoice label")) {
                filterField.addDropdownValue(choice.attr("for"), choice.text().trim());
            }
        }
        result.add(filterField);
    }

    return result;
}

From source file:de.geeksfactory.opacclient.apis.TouchPoint.java

/**
 * Converts a <code>select</code> element into a {@link DropdownSearchField} and appends
 * it to <code>fields</code>, unless its name is one of the fields that are skipped.
 *
 * @param dropdownElement the <code>select</code> element to convert
 * @param fields          the list the resulting field is appended to
 */
private void parseDropdown(Element dropdownElement, List<SearchField> fields) {
    Elements optionElements = dropdownElement.select("option");
    DropdownSearchField field = new DropdownSearchField();
    field.setId(dropdownElement.attr("name"));

    // Some fields make no sense or are not supported in the app
    String id = field.getId();
    boolean unsupported = id.equals("numberOfHits") || id.equals("timeOut") || id.equals("rememberList");
    if (unsupported) {
        return;
    }

    for (int idx = 0; idx < optionElements.size(); idx++) {
        Element choice = optionElements.get(idx);
        field.addDropdownValue(choice.attr("value"), choice.text());
    }

    // The caption is the text of the label(s) below the same parent.
    Element container = dropdownElement.parent();
    field.setDisplayName(container.select("label").text());
    fields.add(field);
}

From source file:de.geeksfactory.opacclient.apis.SISIS.java

/**
 * Converts a <code>select</code> element into a {@link DropdownSearchField} and appends
 * it to <code>fields</code>. When the parent contains a hidden input, that input's value
 * becomes the field ID and the field is flagged as a restriction; otherwise the
 * select's own name is used.
 *
 * @param dropdownElement the <code>select</code> element to convert
 * @param fields          the list the resulting field is appended to
 * @throws JSONException if the field data cannot be created
 */
private void parseDropdown(Element dropdownElement, List<SearchField> fields) throws JSONException {
    Elements optionElements = dropdownElement.select("option");
    DropdownSearchField field = new DropdownSearchField();

    Elements hiddenInputs = dropdownElement.parent().select("input[type=hidden]");
    boolean isRestriction = hiddenInputs.size() > 0;
    field.setId(isRestriction ? hiddenInputs.attr("value") : dropdownElement.attr("name"));
    field.setData(new JSONObject(isRestriction ? "{\"restriction\": true}" : "{\"restriction\": false}"));

    for (int idx = 0; idx < optionElements.size(); idx++) {
        Element choice = optionElements.get(idx);
        field.addDropdownValue(choice.attr("value"), choice.text());
    }

    // The caption is the text of the label(s) below the same parent.
    String caption = dropdownElement.parent().select("label").text();
    field.setDisplayName(caption);
    fields.add(field);
}

From source file:de.geeksfactory.opacclient.apis.BiBer1992.java

/**
 * Fetches the search form and derives the available search fields from it: text
 * fields from the <code>REG1</code> select, a media type dropdown from the
 * <code>MT</code>/<code>MS</code> inputs and a branch dropdown from the <code>ZW</code> select.
 *
 * @return the search fields found on the form
 * @throws IOException if the form cannot be fetched; a {@link NotReachableException}
 *                     is thrown when the server answers with status 500
 */
@Override
public List<SearchField> getSearchFields() throws IOException {
    List<SearchField> result = new ArrayList<>();

    // The form lives at a different path on "opax" installations.
    String formPath = opacDir.contains("opax") ? "/de/qsel.html.S" : "/de/qsel_main.S";
    HttpGet request = new HttpGet(opacUrl + "/" + opacDir + formPath);

    HttpResponse response = http_client.execute(request);

    if (response.getStatusLine().getStatusCode() == 500) {
        throw new NotReachableException(response.getStatusLine().getReasonPhrase());
    }
    String html = convertStreamToString(response.getEntity().getContent());
    HttpUtils.consume(response.getEntity());

    Document page = Jsoup.parse(html);

    // Text fields
    for (Element termOption : page.select("form select[name=REG1] option")) {
        TextSearchField textField = new TextSearchField();
        textField.setId(termOption.attr("value"));
        textField.setDisplayName(termOption.text());
        textField.setHint("");
        result.add(textField);
    }

    // Media types
    Elements mediaInputs = page.select("form input[name~=(MT|MS)]");
    if (!mediaInputs.isEmpty()) {
        DropdownSearchField mediaField = new DropdownSearchField();
        mediaField.setId(mediaInputs.get(0).attr("name"));
        mediaField.setDisplayName("Medientyp");
        for (Element input : mediaInputs) {
            if (input.val().equals("")) {
                continue;
            }
            String label = input.text();

            if (label.isEmpty()) {
                // Layout: <input name="MT"><img title="mediatype"> (or alt="mediatype")
                Element sibling = input.nextElementSibling();
                if (sibling != null && sibling.tagName().equals("img")) {
                    label = sibling.attr("title");
                    if (label.equals("")) {
                        label = sibling.attr("alt");
                    }
                }
            }

            if (label.isEmpty()) {
                // Table layout: <td><input name="MT"></td> <td><img title="mediatype"></td>
                Element nextCell = input.parent().nextElementSibling();
                if (nextCell != null) {
                    Element titledImage = nextCell.select("img[title]").first();
                    if (titledImage != null) {
                        label = titledImage.attr("title");
                    }
                }
            }

            if (label.isEmpty()) {
                // Label layout: <input id="MTYP1" ...> <label for="MTYP1"><img title="..."></label>
                Element sibling = input.nextElementSibling();
                if (sibling != null) {
                    Element titledImage = sibling.select("img[title]").first();
                    if (titledImage != null) {
                        label = titledImage.attr("title");
                    }
                }
            }

            if (label.isEmpty()) {
                // Markup with a missing end tag: fall back to a regex search.
                label = parse_option_regex(input);
            }

            mediaField.addDropdownValue(input.val(), label);
        }
        result.add(mediaField);
    }

    // Branches
    Elements branchOptions = page.select("form select[name=ZW] option");
    if (!branchOptions.isEmpty()) {
        DropdownSearchField branchField = new DropdownSearchField();
        Element branchSelect = branchOptions.get(0).parent();
        branchField.setId(branchSelect.attr("name"));
        // The caption is the element preceding the select's parent.
        String caption = branchSelect.parent().previousElementSibling().text();
        branchField.setDisplayName(caption.replace("\u00a0", "").replace("?", "").trim());
        for (Element branchOption : branchOptions) {
            branchField.addDropdownValue(branchOption.val(), branchOption.text());
        }
        result.add(branchField);
    }

    return result;
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * we're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords
 * and the number of consecutive paragraphs together, which should form the cluster of text that this node is around
 * also store on how high up the paragraphs are, comments are usually at the bottom and should get a lower score
 *
 * @return/*from  ww w .j a  v  a  2s  . co  m*/
 */
private Element calculateBestNodeBasedOnClustering(Document doc) {
    Element topNode = null;

    // grab all the paragraph elements on the page to start to inspect the likely hood of them being good peeps
    ArrayList<Element> nodesToCheck = getNodesToCheck(doc);

    double startingBoost = 1.0;
    int cnt = 0;
    int i = 0;

    // holds all the parents of the nodes we're checking
    Set<Element> parentNodes = new HashSet<Element>();

    ArrayList<Element> nodesWithText = new ArrayList<Element>();

    for (Element node : nodesToCheck) {

        String nodeText = node.text();
        WordStats wordStats = StopWords.getStopWordCount(nodeText);
        boolean highLinkDensity = isHighLinkDensity(node);

        if (wordStats.getStopWordCount() > 2 && !highLinkDensity) {

            nodesWithText.add(node);
        }

    }

    int numberOfNodes = nodesWithText.size();
    int negativeScoring = 0; // we shouldn't give more negatives than positives
    // we want to give the last 20% of nodes negative scores in case they're comments
    double bottomNodesForNegativeScore = (float) numberOfNodes * 0.25;

    if (logger.isDebugEnabled()) {
        logger.debug("About to inspect num of nodes with text: " + numberOfNodes);
    }

    for (Element node : nodesWithText) {

        // add parents and grandparents to scoring
        // only add boost to the middle paragraphs, top and bottom is usually jankz city
        // so basically what we're doing is giving boost scores to paragraphs that appear higher up in the dom
        // and giving lower, even negative scores to those who appear lower which could be commenty stuff

        float boostScore = 0;

        if (isOkToBoost(node)) {
            if (cnt >= 0) {
                boostScore = (float) ((1.0 / startingBoost) * 50);
                startingBoost++;
            }
        }

        // check for negative node values
        if (numberOfNodes > 15) {
            if ((numberOfNodes - i) <= bottomNodesForNegativeScore) {
                float booster = (float) bottomNodesForNegativeScore - (float) (numberOfNodes - i);
                boostScore = -(float) Math.pow(booster, (float) 2);

                // we don't want to score too highly on the negative side.
                float negscore = Math.abs(boostScore) + negativeScoring;
                if (negscore > 40) {
                    boostScore = 5;
                }
            }
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Location Boost Score: " + boostScore + " on interation: " + i + "' id='"
                    + node.parent().id() + "' class='" + node.parent().attr("class"));
        }
        String nodeText = node.text();
        WordStats wordStats = StopWords.getStopWordCount(nodeText);
        int upscore = (int) (wordStats.getStopWordCount() + boostScore);
        updateScore(node.parent(), upscore);
        updateScore(node.parent().parent(), upscore / 2);
        updateNodeCount(node.parent(), 1);
        updateNodeCount(node.parent().parent(), 1);

        if (!parentNodes.contains(node.parent())) {
            parentNodes.add(node.parent());
        }

        if (!parentNodes.contains(node.parent().parent())) {
            parentNodes.add(node.parent().parent());
        }

        cnt++;
        i++;
    }

    // now let's find the parent node who scored the highest

    int topNodeScore = 0;
    for (Element e : parentNodes) {

        if (logger.isDebugEnabled()) {
            logger.debug("ParentNode: score='" + e.attr("gravityScore") + "' nodeCount='"
                    + e.attr("gravityNodes") + "' id='" + e.id() + "' class='" + e.attr("class") + "' ");
        }
        //int score = Integer.parseInt(e.attr("gravityScore")) * Integer.parseInt(e.attr("gravityNodes"));
        int score = getScore(e);
        if (score > topNodeScore) {
            topNode = e;
            topNodeScore = score;
        }

        if (topNode == null) {
            topNode = e;
        }
    }

    if (logger.isDebugEnabled()) {
        if (topNode == null) {
            logger.debug("ARTICLE NOT ABLE TO BE EXTRACTED!, WE HAZ FAILED YOU LORD VADAR");
        } else {
            String logText;
            String targetText = "";
            Element topPara = topNode.getElementsByTag("p").first();
            if (topPara == null) {
                topNode.text();
            } else {
                topPara.text();
            }

            if (targetText.length() >= 51) {
                logText = targetText.substring(0, 50);
            } else {
                logText = targetText;
            }
            logger.debug("TOPNODE TEXT: " + logText.trim());
            logger.debug("Our TOPNODE: score='" + topNode.attr("gravityScore") + "' nodeCount='"
                    + topNode.attr("gravityNodes") + "' id='" + topNode.id() + "' class='"
                    + topNode.attr("class") + "' ");
        }
    }

    return topNode;

}