Example usage for org.jsoup.nodes Element text

List of usage examples for org.jsoup.nodes Element text

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element text() method.

Prototype

public String text() 

Source Link

Documentation

Gets the combined text of this element and all its children.

Usage

From source file:de.geeksfactory.opacclient.apis.Zones22.java

/**
 * Reads the advanced-search form of the catalogue and turns it into search fields.
 *
 * <p>Every {@code option} below {@code #formSelectTerm_1} becomes a free-text field.
 * If the page contains branch labels ({@code .TabRechAv .limitChoice label}), one
 * additional dropdown is appended whose first entry is the catch-all "Alle".
 *
 * @return the text fields in page order, optionally followed by the branch dropdown
 * @throws ClientProtocolException declared for the HTTP request made via {@code httpGet}
 * @throws IOException declared for the HTTP request made via {@code httpGet}
 */
@Override
public List<SearchField> getSearchFields() throws ClientProtocolException, IOException {
    String formUrl = opac_url
            + "/APS_ZONES?fn=AdvancedSearch&Style=Portal3&SubStyle=&Lang=GER&ResponseEncoding=utf-8";
    Document page = Jsoup.parse(httpGet(formUrl, getDefaultEncoding()));

    List<SearchField> result = new ArrayList<SearchField>();

    // Free-text criteria: one field per selectable search term.
    for (Element termOption : page.select("#formSelectTerm_1 option")) {
        TextSearchField textField = new TextSearchField();
        textField.setId(termOption.attr("value"));
        textField.setHint("");
        textField.setDisplayName(termOption.text());
        result.add(textField);
    }

    // Branches: nothing more to add when the page offers no branch filter.
    Elements branchLabels = page.select(".TabRechAv .limitChoice label");
    if (branchLabels.isEmpty()) {
        return result;
    }

    DropdownSearchField branchField = new DropdownSearchField();
    // The form parameter name is taken from the input next to the first label.
    branchField.setId(branchLabels.get(0).parent().select("input").attr("name"));
    branchField.setDisplayName("Zweigstelle");

    List<Map<String, String>> choices = new ArrayList<Map<String, String>>();

    Map<String, String> anyBranch = new HashMap<String, String>();
    anyBranch.put("key", "");
    anyBranch.put("value", "Alle");
    choices.add(anyBranch);

    for (Element branchLabel : branchLabels) {
        Map<String, String> choice = new HashMap<String, String>();
        choice.put("key", branchLabel.attr("for"));
        choice.put("value", branchLabel.text().trim());
        choices.add(choice);
    }

    branchField.setDropdownValues(choices);
    result.add(branchField);
    return result;
}

From source file:net.kevxu.purdueassist.course.CatalogDetail.java

/**
 * Extracts the catalog details of one course from its detail page.
 *
 * <p>The page is located through the table whose {@code summary} attribute is
 * "This table lists the course detail for the selected term."; the individual
 * properties are then cut out of the flattened text of the {@code .ntdefault}
 * cell by searching for fixed English labels ("Levels:", "Schedule Types:",
 * "Offered By:", ...) and applying fixed character offsets.
 *
 * <p>Any {@link StringIndexOutOfBoundsException} raised while slicing the text is
 * swallowed: parsing stops at that point and the entry is returned with whatever
 * had been set so far.
 *
 * @param document the parsed course detail page
 * @return the entry for {@code subject}/{@code cnbr}, possibly only partially filled
 * @throws HtmlParseException if the text of a schedule-type link cannot be mapped
 *         to a {@code Type} constant
 * @throws CourseNotFoundException if the page contains no course detail table
 * @throws IOException declared by the signature
 */
private CatalogDetailEntry parseDocument(Document document)
        throws HtmlParseException, CourseNotFoundException, IOException {
    CatalogDetailEntry entry = new CatalogDetailEntry(subject, cnbr);
    Elements tableElements = document.getElementsByAttributeValue("summary",
            "This table lists the course detail for the selected term.");
    if (tableElements.isEmpty() != true) {
        // get name
        try {
            Element body = tableElements.first().select("tbody").first();
            String nameBlock = body.select("tr td.nttitle").first().text();
            // The title cell is split on "<SUBJECT> <number>"; note that String.split
            // interprets this separator as a regular expression.
            String[] temp = nameBlock.split(subject.name() + " " + String.valueOf(cnbr));
            // Skip the first 3 characters of the remainder
            // (presumably a " - " separator -- TODO confirm against a real page).
            String name = temp[temp.length - 1].substring(3);
            entry.setName(name);

            // get description
            // From here on "body" is the .ntdefault cell and "text" its flattened text;
            // every later section is located inside "text" by label.
            body = body.select(".ntdefault").first();
            String text = body.text();
            int split = text.indexOf("Levels:");
            // If "Levels:" is missing, split is -1 and substring throws; the catch
            // below then ends parsing.
            String description = text.substring(0, split);
            // Drop a fixed 20-character prefix
            // (presumably a credit-hours header -- TODO confirm).
            description = description.substring(20);
            entry.setDescription(description);

            // get levels
            int begin = split;
            int end = text.indexOf("Schedule Types:");
            // "Levels:" is 7 characters; +8 also skips the character following it.
            String levels = text.substring(begin + 8, end);
            // Levels are separated by spaces and/or commas; empty fragments are dropped.
            temp = levels.split("[ ,]");
            List<String> lvs = new ArrayList<String>();
            for (String s : temp)
                if (!s.equals("")) {
                    lvs.add(s);
                }
            entry.setLevels(lvs);

            // get type and prerequisites
            // Both are read from the links inside the cell and told apart by their href.
            List<Type> types = new ArrayList<Type>();
            List<String> preq = new ArrayList<String>();
            Elements parsing_A = body.select("a");
            for (Element e : parsing_A) {
                // Schedule-type links: href contains "schd_in" and no '%' character.
                if (e.attr("href").contains("schd_in") && !(e.attr("href").contains("%"))) {

                    try {
                        // Link text with spaces removed must match a Type constant name.
                        types.add(Type.valueOf(e.text().replace(" ", "")));
                    } catch (Exception exception) {
                        // The original exception is not attached as a cause.
                        throw new HtmlParseException();
                    }
                } else if (e.attr("href").contains("sel_attr=")) {
                    preq.add(e.text());
                }
            }
            // Setters are only called when something was found.
            if (types.size() > 0)
                entry.setType(types);
            if (preq.size() > 0)
                entry.setPrerequisites(preq);

            // get offered by
            // The section ends at "Department:" or, failing that, "Course Attributes:".
            begin = text.indexOf("Offered By:");
            end = text.indexOf("Department:");
            if (end < 0)
                end = text.indexOf("Course Attributes:");
            if (end > 0) {
                // "Offered By:" is 11 characters; +12 skips the following character,
                // end - 1 drops the character before the next label.
                // NOTE(review): begin is not checked for -1 here.
                entry.setOfferedBy(text.substring(begin + 12, end - 1));
            }

            // get department
            begin = text.indexOf("Department:");
            if (begin > 0) {
                // NOTE(review): if "Course Attributes:" is absent, end is -1 and the
                // substring call throws, which ends parsing via the catch below.
                end = text.indexOf("Course Attributes:");
                entry.setDepartment((text.substring(begin + 12, end - 1)));
            }

            // get campus
            // The campus list ends at the first of the following labels that exists,
            // tried in this order; if none exists it runs to the end of the text.
            begin = text.indexOf("May be offered at any of the following campuses:");
            String campuses;
            end = text.indexOf("Repeatable for Additional Credit:");
            if (end < 0)
                end = text.indexOf("Learning Objectives:");
            if (end < 0)
                end = text.indexOf("Restrictions:");
            if (end < 0)
                end = text.indexOf("Corequisites:");
            if (end < 0)
                end = text.indexOf("Prerequisites:");
            // In both branches 5 extra characters after the label are skipped.
            if (end < 0) {
                campuses = text
                        .substring(begin + "May be offered at any of the following campuses:".length() + 5);
            } else {
                campuses = text.substring(
                        begin + "May be offered at any of the following campuses:".length() + 5, end - 1);
            }
            // Campus names are separated by runs of four spaces; fragments of
            // length 0 or 1 are discarded.
            temp = campuses.replace("    ", "#").split("#");
            List<String> camps = new ArrayList<String>();
            for (String s : temp) {
                if (s.length() > 1) {
                    camps.add(s);
                }

            }
            entry.setCampuses(camps);

            // get restrictions
            // Runs from "Restrictions:" to "Corequisites:"/"Prerequisites:" or, when
            // neither exists, to the end of the text. Runs of six spaces become newlines.
            begin = text.indexOf("Restrictions:");
            end = text.indexOf("Corequisites:");
            if (end < 0)
                end = text.indexOf("Prerequisites:");
            if (begin > 0 && end < 0) {
                entry.setRestrictions(
                        text.substring(begin + "Restrictions:".length()).replace("      ", "\n"));
            } else if (begin > 0) {
                entry.setRestrictions(
                        text.substring(begin + "Restrictions:".length(), end).replace("      ", "\n"));
            }

        } catch (StringIndexOutOfBoundsException e) {
            // Deliberately ignored: a failed substring means an expected label was
            // missing; everything parsed before that point is kept.
            // no type, not available
            // System.out.println("-----------");
            // System.out.println("Error for cnbr = " + cnbr);
            // System.out.println("-----------");
        }
    } else {
        throw new CourseNotFoundException();
    }

    return entry;
}

From source file:mml.handler.post.MMLPostHTMLHandler.java

/**
 * Parse a span with a class or not/* w  ww  . ja va 2  s. c om*/
 * @param span the span in HTML
 */
private void parseSpan(Element span) throws JSONException {
    if (span.hasText()) {
        int offset = sb.length();
        String name = span.attr("class");
        Range r = new Range(name, offset, 0);
        if (name == null || name.length() == 0)
            name = "span";
        if (isMilestone(name)) {
            pages.add(r);
            sb.append(span.text());
            sb.append("\n");
            pages.updateLen(r, sb.length() - offset);
            prevWasMilestone = true;
        } else if (name.equals("soft-hyphen")) {
            stil.add(r);
            // get previous word
            int i = sb.length() - 1;
            while (i > 0 && !Character.isWhitespace(sb.charAt(i)))
                i--;
            if (i > 0)
                i++;
            String prev = clean(sb.substring(i), true);
            // get next word
            String next = clean(nextWord(span), false);
            if (this.speller.isHardHyphen(prev, next))
                r.name = "hard-hyphen";
            sb.append(span.text());
            stil.updateLen(r, sb.length() - offset);
        } else // span may contain other spans
        {
            stil.add(r);
            List<Node> children = span.childNodes();
            for (Node child : children) {
                if (child instanceof Element) {
                    String nName = child.nodeName().toLowerCase();
                    if (nName.equals("span"))
                        parseSpan((Element) child);
                    else
                        parseOtherElement((Element) child);
                } else if (child instanceof TextNode) {
                    TextNode tn = (TextNode) child;
                    sb.append(tn.text());
                }
            }
            if (isLineFormat(name))
                ensure(1, false);
            stil.updateLen(r, sb.length() - offset);
        }
    }
    // else strangely no text: ignore it
}

From source file:me.vertretungsplan.parser.IndiwareParser.java

/**
 * Builds one {@link SubstitutionScheduleDay} from an Indiware day document.
 *
 * <p>The document is accessed through an {@code HTMLDataSource} or
 * {@code XMLDataSource} depending on {@code html}. The method reads, in order:
 * the date from the title (German, pattern {@code EEEE, dd. MMMM yyyy}), the
 * last-change timestamp ({@code dd.MM.yyyy, HH:mm}), the header infos and footer
 * lines (added as day messages) and finally one {@code Substitution} per action row.
 *
 * <p>For HTML input the meaning of each cell is derived from the class name of the
 * header cell at the same position (with the "thplan"/"thlplan" prefix removed);
 * for XML input it is the cell's tag name. Cells containing "---" are skipped.
 *
 * @param doc  root element of the day's HTML or XML document
 * @param html {@code true} if {@code doc} is the HTML variant, {@code false} for XML
 * @return the parsed day
 * @throws IOException if the title does not contain a date matching {@code datePattern}
 */
SubstitutionScheduleDay parseIndiwareDay(Element doc, boolean html) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    DataSource ds;
    if (html) {
        ds = new HTMLDataSource(doc);
    } else {
        ds = new XMLDataSource(doc);
    }

    Matcher matcher = datePattern.matcher(ds.titel().text());
    if (!matcher.find())
        throw new IOException("malformed date: " + ds.titel().text());
    String date = matcher.group();
    day.setDate(
            DateTimeFormat.forPattern("EEEE, dd. MMMM yyyy").withLocale(Locale.GERMAN).parseLocalDate(date));

    String lastChange = ds.datum().text();
    day.setLastChange(DateTimeFormat.forPattern("dd.MM.yyyy, HH:mm").withLocale(Locale.GERMAN)
            .parseLocalDateTime(lastChange));

    // Header infos: each becomes one message, prefixed with its bold title if present.
    if (ds.kopfinfos().size() > 0) {
        for (Element kopfinfo : ds.kopfinfos()) {
            String title = html ? kopfinfo.select("th").text() : kopfinfoTitle(kopfinfo.tagName()) + ":";

            StringBuilder message = new StringBuilder();
            if (title != null && !title.isEmpty()) {
                message.append("<b>").append(title).append("</b>").append(" ");
            }
            message.append(html ? kopfinfo.select("td").text() : kopfinfo.text());

            day.addMessage(message.toString());
        }
    }

    // Footer: all footer lines are joined with newlines into a single message.
    if (ds.fuss() != null) {
        StringBuilder message = new StringBuilder();
        boolean first = true;
        for (Element fusszeile : ds.fusszeilen()) {
            if (first) {
                first = false;
            } else {
                message.append("\n");
            }
            message.append(fusszeile.text());
        }
        day.addMessage(message.toString());
    }

    // HTML only: column meaning comes from the header cells' class names.
    List<String> columnTypes = null;
    if (html) {
        columnTypes = new ArrayList<>();
        for (Element th : ((HTMLDataSource) ds).headers()) {
            columnTypes.add(th.className().replace("thplan", "").replace("thlplan", ""));
        }
    }

    for (Element aktion : ds.aktionen()) {
        Substitution substitution = new Substitution();
        String type = "Vertretung";
        String course = null;
        int i = 0; // column index, advanced for every cell including skipped ones
        for (Element info : aktion.children()) {
            String value = info.text().replace("\u00a0", "");
            if (value.equals("---")) {
                i++;
                continue;
            }
            final String columnType = html ? columnTypes.get(i) : info.tagName();
            switch (columnType) {
            case "klasse":
                Set<String> classes = new HashSet<>();
                for (String klasse : value.split(",")) {
                    Matcher courseMatcher = coursePattern.matcher(klasse);
                    if (courseMatcher.matches()) {
                        classes.add(courseMatcher.group(1));
                        course = courseMatcher.group(2);
                    } else {
                        classes.add(klasse);
                    }
                }
                substitution.setClasses(classes);
                break;
            case "stunde":
                substitution.setLesson(value);
                break;
            case "fach":
                // When a separate "vfach" column exists, "fach" is the previous subject.
                String subject = subjectAndCourse(course, value);
                if (columnTypes != null && columnTypes.contains("vfach")) {
                    substitution.setPreviousSubject(subject);
                } else {
                    substitution.setSubject(subject);
                }
                break;
            case "vfach":
                substitution.setSubject(subjectAndCourse(course, value));
                // Fix: without this break the subject text fell through into the
                // "lehrer" case and overwrote the teacher.
                break;
            case "lehrer":
                Matcher bracesMatcher = bracesPattern.matcher(value);
                if (bracesMatcher.matches())
                    value = bracesMatcher.group(1);
                substitution.setTeacher(value);
                break;
            case "raum":
                // When a separate "vraum" column exists, "raum" is the previous room.
                if (columnTypes != null && columnTypes.contains("vraum")) {
                    substitution.setPreviousRoom(value);
                } else {
                    substitution.setRoom(value);
                }
                break;
            case "vraum":
                substitution.setRoom(value);
                // Fix: without this break the room text fell through into the
                // "info" case and was parsed as type/description.
                break;
            case "info":
                Matcher substitutionMatcher = substitutionPattern.matcher(value);
                Matcher cancelMatcher = cancelPattern.matcher(value);
                Matcher delayMatcher = delayPattern.matcher(value);
                Matcher selfMatcher = selfPattern.matcher(value);
                if (substitutionMatcher.matches()) {
                    substitution.setPreviousSubject(substitutionMatcher.group(1));
                    substitution.setPreviousTeacher(substitutionMatcher.group(2));
                    if (!substitutionMatcher.group(3).isEmpty()) {
                        substitution.setDesc(substitutionMatcher.group(3));
                    }
                } else if (cancelMatcher.matches()) {
                    type = "Entfall";
                    substitution.setPreviousSubject(cancelMatcher.group(1));
                    substitution.setPreviousTeacher(cancelMatcher.group(2));
                } else if (delayMatcher.matches()) {
                    type = "Verlegung";
                    substitution.setPreviousSubject(delayMatcher.group(1));
                    substitution.setPreviousTeacher(delayMatcher.group(2));
                    substitution.setDesc(delayMatcher.group(3));
                } else if (selfMatcher.matches()) {
                    type = "selbst.";
                    if (!selfMatcher.group(1).isEmpty())
                        substitution.setDesc(selfMatcher.group(1));
                } else if (value.equals("fllt aus") || value.equals("f\u00e4llt aus")
                        || value.equals("Klausur") || value.equals("Aufg.")) {
                    // "fllt aus" appears to be "fällt aus" with its umlaut lost; the
                    // original literal is kept and the correctly spelled form is
                    // accepted as well.
                    type = value;
                } else {
                    substitution.setDesc(value);
                }
                break;
            }
            i++;
        }
        substitution.setType(type);
        substitution.setColor(colorProvider.getColor(substitution.getType()));
        // A course found in the class column doubles as subject if none was given.
        if (course != null && substitution.getSubject() == null) {
            substitution.setSubject(course);
        }
        day.addSubstitution(substitution);
    }

    return day;
}

From source file:de.geeksfactory.opacclient.apis.Zones22.java

/**
 * Parses the detail page of a single catalogue record.
 *
 * <p>The result is assembled from several independent parts of the page:
 * <ul>
 * <li>label/value rows of the detail table; the first row labelled "Titel" sets the
 *     title, all other rows with a non-empty, table-free value become details;</li>
 * <li>{@code a.SummaryActionLink} links whose text contains "Vormerken" mark the item
 *     as reservable and provide the reservation info;</li>
 * <li>each {@code div.record-item-new} becomes one unlabelled detail whose {@code br}
 *     elements are turned into newlines;</li>
 * <li>an optional {@code span.z3988} whose {@code title} attribute holds
 *     {@code name=value} pairs separated by {@code &}; it supplies a fallback title
 *     and authors;</li>
 * <li>{@code div[id^=stock_]} blocks describing the copies, where a block whose id
 *     starts with "stock_head" names the branch for the blocks that follow.</li>
 * </ul>
 *
 * @param id   identifier stored on the returned item
 * @param html raw HTML of the detail page
 * @return the parsed item
 * @throws IOException declared by the signature
 */
private DetailledItem parse_result(String id, String html) throws IOException {
    Document doc = Jsoup.parse(html);

    DetailledItem result = new DetailledItem();
    result.setTitle("");
    boolean title_is_set = false;

    result.setId(id);

    Elements detaildiv = doc.select("div.record-item-new");

    Elements detailtrs1 = doc.select(".DetailDataCell table table:not(.inRecordHeader) tr");
    for (Element tr : detailtrs1) {
        int s = tr.children().size();
        if (s == 0) {
            // Fix: a row without cells made tr.child(0) throw.
            continue;
        }
        String label = tr.child(0).text().trim();
        if (label.equals("Titel") && !title_is_set) {
            result.setTitle(tr.child(s - 1).text().trim());
            title_is_set = true;
        } else if (s > 1) {
            Element valchild = tr.child(s - 1);
            // Values that contain a nested table are skipped.
            if (valchild.select("table").isEmpty()) {
                String val = valchild.text().trim();
                if (val.length() > 0)
                    result.addDetail(new Detail(label, val));
            }
        }
    }

    for (Element a : doc.select("a.SummaryActionLink")) {
        if (a.text().contains("Vormerken")) {
            result.setReservable(true);
            result.setReservation_info(a.attr("href"));
        }
    }

    for (Element dd : detaildiv) {
        // Text nodes are taken verbatim, <br> becomes a newline, any other element
        // contributes its trimmed text.
        StringBuilder text = new StringBuilder();
        for (Node node : dd.childNodes()) {
            if (node instanceof TextNode) {
                text.append(((TextNode) node).text());
            } else if (node instanceof Element) {
                Element child = (Element) node;
                if (child.tagName().equals("br")) {
                    text.append("\n");
                } else {
                    text.append(child.text().trim());
                }
            }
        }
        result.addDetail(new Detail("", text.toString()));
    }

    Elements z3988 = doc.select("span.z3988");
    if (z3988.size() > 0) {
        // Sometimes there is a <span class="Z3988"> item which provides
        // data in a standardized format.
        String z3988data = z3988.first().attr("title").trim();
        for (String pair : z3988data.split("\\&")) {
            String[] nv = pair.split("=", 2);
            if (nv.length != 2 || nv[1].trim().equals("")) {
                continue;
            }
            boolean titleMissing = result.getTitle().length() == 0;
            if ((nv[0].equals("rft.btitle") || nv[0].equals("rft.atitle")) && titleMissing) {
                result.setTitle(nv[1]);
            } else if (nv[0].equals("rft.au")) {
                result.addDetail(new Detail("Author", nv[1]));
            }
        }
    }

    Elements copydivs = doc.select(".DetailDataCell div[id^=stock_]");
    String pop = ""; // branch name from the most recent "stock_head" block
    for (Element div : copydivs) {
        if (div.attr("id").startsWith("stock_head")) {
            pop = div.text().trim();
            continue;
        }

        Map<String, String> copy = new HashMap<String, String>();

        // This is getting very ugly - check if it is valid for libraries
        // which are not
        // Hamburg.
        // j counts the child nodes seen since the start of the current copy; the
        // meaning of a node is derived purely from that position.
        int j = 0;
        for (Node node : div.childNodes()) {
            try {
                if (node instanceof Element) {
                    String tagName = ((Element) node).tag().getName();
                    if (tagName.equals("br")) {
                        // A <br> terminates the current copy.
                        copy.put(DetailledItem.KEY_COPY_BRANCH, pop);
                        result.addCopy(copy);
                        // Fix: start a fresh map so the next copy in this block does
                        // not overwrite the one that was just added.
                        copy = new HashMap<String, String>();
                        j = -1;
                    } else if (tagName.equals("b") && j == 1) {
                        copy.put(DetailledItem.KEY_COPY_LOCATION, ((Element) node).text());
                    } else if (tagName.equals("b") && j > 1) {
                        copy.put(DetailledItem.KEY_COPY_STATUS, ((Element) node).text());
                    }
                    j++;
                } else if (node instanceof TextNode) {
                    TextNode textNode = (TextNode) node;
                    if (j == 0)
                        copy.put(DetailledItem.KEY_COPY_DEPARTMENT, textNode.text());
                    if (j == 2)
                        copy.put(DetailledItem.KEY_COPY_BARCODE,
                                textNode.getWholeText().trim().split("\n")[0].trim());
                    if (j == 6) {
                        // The return date is taken as the last 10 characters; shorter
                        // text throws and is handled by the catch below.
                        String text = textNode.text().trim();
                        copy.put(DetailledItem.KEY_COPY_RETURN, text.substring(text.length() - 10));
                    }
                    j++;
                }
            } catch (Exception e) {
                // Best effort: a malformed node must not abort the whole record.
                e.printStackTrace();
            }
        }
    }

    return result;
}

From source file:de.geeksfactory.opacclient.apis.Heidi.java

/**
 * Places a reservation for {@code item}, logging in and asking for a pickup
 * location on the way if the server requires it.
 *
 * <p>Flow:
 * <ol>
 * <li>Request the order page for the item.</li>
 * <li>If a password input is shown, post the account credentials; a message in
 *     {@code .loginbox .meld} is returned as an error.</li>
 * <li>If location inputs ({@code input[name=ort]}) are shown: with a
 *     {@code selection} the order is posted for that location, without one the
 *     available locations are returned as {@code SELECTION_NEEDED}.</li>
 * <li>A {@code .fehler} element is returned as an error; otherwise the
 *     {@code .meld2} text decides: it must contain "Das Medium wurde" to count as
 *     success.</li>
 * </ol>
 *
 * @param item       the item to reserve; only its id is used
 * @param account    credentials used if a login is required
 * @param useraction not used by this implementation
 * @param selection  value of the chosen location, or {@code null} if none was chosen yet
 * @return the outcome, including the server's message text where available
 * @throws IOException declared for the HTTP requests
 */
@Override
public ReservationResult reservation(DetailledItem item, Account account, int useraction, String selection)
        throws IOException {
    String html = httpGet(opac_url + "/bestellung.cgi?ks=" + item.getId() + "&sess=" + sessid, ENCODING, false,
            cookieStore);
    Document doc = Jsoup.parse(html);
    if (doc.select("input[name=pw]").size() > 0) {
        List<NameValuePair> nameValuePairs = new ArrayList<>();
        nameValuePairs.add(new BasicNameValuePair("id", account.getName()));
        nameValuePairs.add(new BasicNameValuePair("pw", account.getPassword()));
        nameValuePairs.add(new BasicNameValuePair("sess", sessid));
        nameValuePairs.add(new BasicNameValuePair("log", "login"));
        // Tells the server where to continue after the login.
        nameValuePairs.add(new BasicNameValuePair("weiter", "bestellung.cgi?ks=" + item.getId()));
        html = httpPost(opac_url + "/login.cgi", new UrlEncodedFormEntity(nameValuePairs), ENCODING);
        doc = Jsoup.parse(html);
        if (doc.select(".loginbox .meld").size() > 0) {
            return new ReservationResult(MultiStepResult.Status.ERROR, doc.select(".loginbox .meld").text());
        }
    }
    if (doc.select("input[name=ort]").size() > 0) {
        if (selection != null) {
            List<NameValuePair> nameValuePairs = new ArrayList<>();
            nameValuePairs.add(new BasicNameValuePair("ks", item.getId()));
            nameValuePairs.add(new BasicNameValuePair("ort", selection));
            nameValuePairs.add(new BasicNameValuePair("sess", sessid));
            nameValuePairs.add(new BasicNameValuePair("funktion", "Vormerkung"));
            html = httpPost(opac_url + "/bestellung.cgi", new UrlEncodedFormEntity(nameValuePairs), ENCODING);
            doc = Jsoup.parse(html);
        } else {
            List<Map<String, String>> options = new ArrayList<>();
            for (Element input : doc.select("input[name=ort]")) {
                String value = input.attr("value");
                // Fix: an input without id or without a matching <label> used to end
                // in an exception; fall back to the raw value as display text.
                Element label = input.id().isEmpty() ? null
                        : doc.select("label[for=" + input.id() + "]").first();
                Map<String, String> selopt = new HashMap<>();
                selopt.put("key", value);
                selopt.put("value", label != null ? label.text() : value);
                options.add(selopt);
            }
            ReservationResult res = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED);
            res.setSelection(options);
            return res;
        }
    }
    if (doc.select(".fehler").size() > 0) {
        String text = doc.select(".fehler").text();
        return new ReservationResult(MultiStepResult.Status.ERROR, text);
    }
    String text = doc.select(".meld2").text();
    if (text.contains("Das Medium wurde")) {
        return new ReservationResult(MultiStepResult.Status.OK, text);
    } else {
        return new ReservationResult(MultiStepResult.Status.ERROR, text);
    }
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * Collects the texts of all elements below {@code node} that match
 * {@code A_REL_TAG_SELECTOR}.
 *
 * @param node the element to search in
 * @return the non-empty texts of the matching elements, or {@code NO_STRINGS} when
 *         {@code node} has no children or nothing matches
 */
private Set<String> extractTags(Element node) {
    if (node.children().isEmpty()) {
        return NO_STRINGS;
    }

    Elements tagLinks = Selector.select(A_REL_TAG_SELECTOR, node);
    if (tagLinks.isEmpty()) {
        return NO_STRINGS;
    }

    Set<String> found = new HashSet<String>(tagLinks.size());
    for (Element link : tagLinks) {
        String label = link.text();
        if (string.isNullOrEmpty(label)) {
            continue; // nothing usable in this link
        }
        found.add(label);
    }
    return found;
}

From source file:dsll.pinterest.crawler.Reduce.java

/**
 * Downloads one pin page, extracts its data into a JSON object, stores that
 * object in {@code pinsCollection} and returns it as text.
 *
 * <p>Extracted keys: {@code ID}, {@code image}, {@code name}, {@code source},
 * {@code credit}, {@code comments}, {@code board} and {@code related_pins}. Parts
 * that are missing on the page yield empty strings, empty objects or empty arrays.
 * All values are written with {@code JSONObject.append}.
 *
 * <p>The id is the path segment following {@code "pin/"} in {@code url}; a URL
 * without {@code "pin/"} makes the id extraction throw
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param url            address of the pin page
 * @param pinsCollection collection the parsed pin is inserted into
 * @return the pin as JSON text, or the text "HTTP connection failed..." if the page
 *         could not be fetched (nothing is stored in that case)
 * @throws JSONException if building the JSON object fails
 */
private static Text getPinContent(String url, DBCollection pinsCollection) throws JSONException {
    Document html = null;
    JSONObject pin = new JSONObject();
    try {
        html = Jsoup.connect(url).get();
    } catch (Exception e) {
        // Any fetch problem is reported through the returned text.
        return new Text("HTTP connection failed...");
    }

    // Gather major pins data
    Element doc = html.select("body").first();
    // Pin ID
    String id = (url.split("pin/")[1].split("/")[0]);
    pin.append("ID", id);

    // Pin image: stays empty when the wrapper or the image itself is missing.
    String imageURL = "";
    Element imageWrapper = doc.select("div[class=pinImageSourceWrapper]").first();
    if (imageWrapper != null) {
        Element image = imageWrapper.select("div[class=imageContainer]").select("img").first();
        if (image != null) {
            imageURL = image.attr("src");
        }
    }
    pin.append("image", imageURL);

    // Pin name
    Element nameElement = doc.select("h2[itemprop=name]").first();
    String name = "";
    if (nameElement != null) {
        name = nameElement.text().trim();
    }
    pin.append("name", name);

    // Pin source
    Element sourceCont = doc.select("div[class=sourceFlagWrapper]").first();
    JSONObject source = new JSONObject();
    if (sourceCont != null) {
        String title = sourceCont.text().trim();
        // Fix: a source block without a link used to throw a NullPointerException.
        Element sourceLink = sourceCont.select("a").first();
        String src = sourceLink != null ? sourceLink.attr("href") : "";
        source.append("title", title);
        source.append("src", src);
    }
    pin.append("source", source);

    // Pin credit: all three values stay empty when the credits block is missing.
    JSONObject pinCredit = new JSONObject();
    Element credit = doc.select("div[class=pinCredits]").first();
    String creditName = "", creditTitle = "", creditSource = "";
    if (credit != null) {
        creditName = credit.select("div[class=creditName]").text().trim();
        creditTitle = credit.select("div[class=creditTitle]").text().trim();
        creditSource = credit.select("a").attr("href");
    }
    pinCredit.append("name", creditName);
    pinCredit.append("title", creditTitle);
    pinCredit.append("src", creditSource);
    pin.append("credit", pinCredit);

    // Comments
    JSONArray comments = new JSONArray();
    Elements commentsConts = doc.select("div[class=commenterNameCommentText]");
    for (Element commentCont : commentsConts) {
        Element commentContent = commentCont.select(".commentDescriptionContent").first();
        if (commentContent == null) {
            // A comment without content is dropped.
            continue;
        }
        // Fix: a comment without a commenter link used to throw a
        // NullPointerException; it is now kept with empty creator fields.
        Element creatorEle = commentCont.select("div[class=commenterWrapper] a").first();
        String creatorName = creatorEle != null ? creatorEle.text().trim() : "";
        String creatorSrc = creatorEle != null ? creatorEle.attr("href") : "";

        JSONObject comment = new JSONObject();
        comment.append("creator", creatorName);
        comment.append("creator_url", creatorSrc);
        comment.append("content", commentContent.text().trim());
        comment.append("content_raw", commentContent.html());
        comments.put(comment);
    }
    pin.append("comments", comments);

    // Pin board link
    Element sidebar = doc.select("div[class=Module CloseupSidebar]").first();
    JSONArray board = new JSONArray();
    if (sidebar != null) {
        Element boardEle = sidebar.select("div[class=boardHeader]").first();
        JSONObject b = new JSONObject();
        String boardName = "";
        String boardSrc = "";
        if (boardEle != null) {
            boardName = boardEle.select("h3[class=title]").text().trim();
            boardSrc = "https://www.pinterest.com" + boardEle.select("a").attr("href").trim();
        }
        b.append("name", boardName);
        b.append("src", boardSrc);
        board.put(b);
    }
    pin.append("board", board); //CAUTION: what if a pin shows up in different boards?

    // Related pins
    Element relatedWrapper = doc
            .select("div[class=closeupBottom] div[class=Module CloseupBottom] div[class=relatedPinsWrapper]")
            .first();

    JSONArray relatedPins = new JSONArray();
    if (relatedWrapper != null) {
        Elements relatedPinsConts = relatedWrapper.select("div[class=pinWrapper]");
        for (Element relatedPinsCont : relatedPinsConts) {
            JSONObject relatedPin = new JSONObject();
            relatedPin.append("src", "https://www.pinterest.com"
                    + relatedPinsCont.select("div[class=pinHolder] > a").attr("href"));
            relatedPins.put(relatedPin);
        }
    }
    pin.append("related_pins", relatedPins);

    // Optional: push data to database
    BasicDBObject dbObject = (BasicDBObject) JSON.parse(pin.toString());
    pinsCollection.insert(dbObject);
    return new Text(pin.toString());
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against
 * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring
 * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of
 * 100 then 100 should be our base./*from  w  w  w  . ja  v  a  2s .  com*/
 *
 * @param topNode
 * @return
 */
private int getBaselineScoreForSiblings(Element topNode) {

    int base = 100000;

    int numberOfParagraphs = 0;
    int scoreOfParagraphs = 0;

    Elements nodesToCheck = topNode.getElementsByTag("p");

    for (Element node : nodesToCheck) {

        String nodeText = node.text();
        WordStats wordStats = StopWords.getStopWordCount(nodeText);
        boolean highLinkDensity = isHighLinkDensity(node);

        if (wordStats.getStopWordCount() > 2 && !highLinkDensity) {

            numberOfParagraphs++;
            scoreOfParagraphs += wordStats.getStopWordCount();
        }

    }

    if (numberOfParagraphs > 0) {
        base = scoreOfParagraphs / numberOfParagraphs;
        if (logger.isDebugEnabled()) {
            logger.debug("The base score for siblings to beat is: " + base + " NumOfParas: "
                    + numberOfParagraphs + " scoreOfAll: " + scoreOfParagraphs);
        }
    }

    return base;

}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Creates a search field from one row of the search form.
 *
 * <p>The display name is the text of the {@code span}/{@code blockquote} elements in
 * the description cell with colons and non-breaking spaces removed. A cell holding
 * a {@code select} becomes a dropdown, unless the name is one of "Treffer/Seite",
 * "Medientypen", "Medientyp" or "Treffer pro Seite"; otherwise a cell holding an
 * {@code input} becomes a text field.
 *
 * @param descTd  cell containing the field's label
 * @param inputTd cell containing the form control
 * @return the created field, or {@code null} if the cell yields neither kind
 */
private SearchField createSearchField(Element descTd, Element inputTd) {
    String label = descTd.select("span, blockquote").text().replace(":", "").trim().replace("\u00a0", "");

    Elements selects = inputTd.select("select");
    boolean excludedName = label.equals("Treffer/Seite") || label.equals("Medientypen")
            || label.equals("Medientyp") || label.equals("Treffer pro Seite");

    if (!selects.isEmpty() && !excludedName) {
        Element select = selects.first();
        DropdownSearchField dropdown = new DropdownSearchField();
        dropdown.setDisplayName(label);
        dropdown.setId(select.attr("name"));
        for (Element choice : select.select("option")) {
            dropdown.addDropdownValue(choice.attr("value"), choice.text());
        }
        return dropdown;
    }

    Elements inputs = inputTd.select("input");
    if (inputs.isEmpty()) {
        return null;
    }

    TextSearchField textField = new TextSearchField();
    Element input = inputs.first();
    textField.setDisplayName(label);
    textField.setId(input.attr("name"));
    textField.setHint("");
    return textField;
}