Example usage for org.jsoup.nodes Element text

Introduction

This page collects example usages of the org.jsoup.nodes.Element.text() method.

Prototype

public String text()

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:com.digitalpebble.storm.crawler.bolt.JSoupParserBolt.java

/**
 * Parses the HTML content carried by the tuple with jsoup, extracts the page
 * text and the outgoing links, applies the parse filters and emits the
 * resulting documents.
 * <p>
 * The tuple fields {@code content}, {@code url} and {@code metadata} are read.
 * A content type that is set but does not mention "html", as well as any
 * failure while parsing or filtering, is handed to {@code handleException} and
 * the method returns without emitting anything itself.
 *
 * @param tuple the incoming tuple holding the fetched content
 */
@Override
public void execute(Tuple tuple) {

    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    // check that its content type is HTML
    // look at value found in HTTP headers
    boolean CT_OK = false;
    String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    if (StringUtils.isNotBlank(httpCT)) {
        // Locale.ROOT keeps the comparison independent of the default locale
        // (e.g. "HTML" does not lower-case to "html" under a Turkish locale)
        if (httpCT.toLowerCase(java.util.Locale.ROOT).contains("html")) {
            CT_OK = true;
        }
    }
    // simply ignore cases where the content type has not been set
    // TODO sniff content with Tika?
    else {
        CT_OK = true;
    }

    if (!CT_OK) {
        String errorMessage = "Exception content-type " + httpCT + " for " + url;
        RuntimeException e = new RuntimeException(errorMessage);
        handleException(url, e, metadata, tuple, "content-type checking", errorMessage);
        return;
    }

    LOG.info("Parsing : starting {}", url);

    long start = System.currentTimeMillis();

    String charset = getContentCharset(content, metadata);

    // get the robots tags from the fetch metadata
    RobotsTags robotsTags = new RobotsTags(metadata);

    // target URL -> anchor texts found for that target
    Map<String, List<String>> slinks;
    String text;
    DocumentFragment fragment;
    try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);

        fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

        // extracts the robots directives from the meta tags
        robotsTags.extractMetaTags(fragment);

        // store a normalised representation in metadata
        // so that the indexer is aware of it
        robotsTags.normaliseToMetadata(metadata);

        // do not extract the links if no follow has been set
        // and we are in strict mode
        if (robotsTags.isNoFollow() && robots_noFollow_strict) {
            slinks = new HashMap<String, List<String>>(0);
        } else {
            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<String, List<String>>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified domains
                // for
                // relative urls.
                // e.g.: /foo will resolve to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");

                // nofollow
                boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel"));
                // remove altogether
                if (noFollow && robots_noFollow_strict) {
                    continue;
                }

                // link not specifically marked as no follow
                // but whole page is
                if (!noFollow && robotsTags.isNoFollow()) {
                    noFollow = true;
                }

                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    // any existing anchors for the same target?
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<String>();
                        slinks.put(targetURL, anchors);
                    }
                    // track the anchors only if no follow is false
                    if (!noFollow && StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }
        }

        // a document without a body must not fail the whole parse:
        // fall back to an empty text instead of hitting a NullPointerException
        Element body = jsoupDoc.body();
        text = (body != null) ? body.text() : "";

    } catch (Throwable e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content parsing", errorMessage);
        return;
    }

    // store identified charset in md
    metadata.setValue("parse.Content-Encoding", charset);

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, fragment, parse);
    } catch (RuntimeException e) {

        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content filtering", errorMessage);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : outlinks) {
            collector.emit(StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL

    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();

        collector.emit(tuple,
                new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incr();
}

From source file:com.digitalpebble.stormcrawler.bolt.JSoupParserBolt.java

/**
 * Parses the HTML content carried by the tuple with jsoup, extracts the page
 * text and the outgoing links, applies the parse filters and emits the
 * resulting documents.
 * <p>
 * The tuple fields {@code content}, {@code url} and {@code metadata} are read.
 * When the mime type is known and does not mention "html", the tuple is either
 * reported through {@code handleException} (if {@code treat_non_html_as_error}
 * is set) or passed on unparsed with an empty text and acked. Failures while
 * parsing or filtering are handed to {@code handleException}.
 *
 * @param tuple the incoming tuple holding the fetched content
 */
@Override
public void execute(Tuple tuple) {

    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    LOG.info("Parsing : starting {}", url);

    // check that its content type is HTML
    // look at value found in HTTP headers
    boolean CT_OK = false;

    String mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);

    if (detectMimeType) {
        mimeType = guessMimeType(url, mimeType, content);
        // store identified type in md
        metadata.setValue("parse.Content-Type", mimeType);
    }

    if (StringUtils.isNotBlank(mimeType)) {
        // Locale.ROOT keeps the comparison independent of the default locale
        // (e.g. "HTML" does not lower-case to "html" under a Turkish locale)
        if (mimeType.toLowerCase(java.util.Locale.ROOT).contains("html")) {
            CT_OK = true;
        }
    }
    // go ahead even if no mimetype is available
    else {
        CT_OK = true;
    }

    if (!CT_OK) {
        if (this.treat_non_html_as_error) {
            String errorMessage = "Exception content-type " + mimeType + " for " + url;
            RuntimeException e = new RuntimeException(errorMessage);
            handleException(url, e, metadata, tuple, "content-type checking", errorMessage);
        } else {
            // forward the document untouched, with an empty text
            LOG.info("Incorrect mimetype - passing on : {}", url);
            collector.emit(tuple, new Values(url, content, metadata, ""));
            collector.ack(tuple);
        }
        return;
    }

    long start = System.currentTimeMillis();

    String charset = getContentCharset(content, metadata);

    // get the robots tags from the fetch metadata
    RobotsTags robotsTags = new RobotsTags(metadata);

    // target URL -> anchor texts found for that target
    Map<String, List<String>> slinks;
    String text = "";
    DocumentFragment fragment;
    try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);

        fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

        // extracts the robots directives from the meta tags
        robotsTags.extractMetaTags(fragment);

        // store a normalised representation in metadata
        // so that the indexer is aware of it
        robotsTags.normaliseToMetadata(metadata);

        // do not extract the links if no follow has been set
        // and we are in strict mode
        if (robotsTags.isNoFollow() && robots_noFollow_strict) {
            slinks = new HashMap<>(0);
        } else {
            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified domains
                // for
                // relative urls.
                // e.g.: /foo will resolve to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");

                // nofollow
                boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel"));
                // remove altogether
                if (noFollow && robots_noFollow_strict) {
                    continue;
                }

                // link not specifically marked as no follow
                // but whole page is
                if (!noFollow && robotsTags.isNoFollow()) {
                    noFollow = true;
                }

                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    // any existing anchors for the same target?
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<>();
                        slinks.put(targetURL, anchors);
                    }
                    // track the anchors only if no follow is false
                    if (!noFollow && StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }
        }

        // text stays empty when the document has no body
        Element body = jsoupDoc.body();
        if (body != null) {
            text = body.text();
        }

    } catch (Throwable e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content parsing", errorMessage);
        return;
    }

    // store identified charset in md
    metadata.setValue("parse.Content-Encoding", charset);

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, fragment, parse);
    } catch (RuntimeException e) {

        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content filtering", errorMessage);
        return;
    }

    if (emitOutlinks) {
        // read the outlinks back from the ParseResult, after the filters ran
        for (Outlink outlink : parse.getOutlinks()) {
            collector.emit(StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL

    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();

        collector.emit(tuple,
                new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incr();
}

From source file:de.geeksfactory.opacclient.apis.BiBer1992.java

/**
 * Downloads the OPAC search form and derives the available search fields from
 * it: one text field per option of the {@code REG1} select, an optional
 * media-type dropdown built from the {@code MT}/{@code MS} inputs and an
 * optional branch dropdown built from the {@code ZW} select.
 *
 * @return the search fields found in the form
 * @throws IOException if the request fails; a {@code NotReachableException}
 *                     is thrown when the server answers with HTTP 500
 */
@Override
public List<SearchField> getSearchFields() throws IOException {
    List<SearchField> fields = new ArrayList<>();

    HttpGet httpget;
    if (opacDir.contains("opax")) {
        httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel.html.S");
    } else {
        httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel_main.S");
    }

    HttpResponse response = http_client.execute(httpget);

    if (response.getStatusLine().getStatusCode() == 500) {
        String reason = response.getStatusLine().getReasonPhrase();
        // consume the entity before bailing out so that the underlying
        // connection is released instead of staying leased
        if (response.getEntity() != null) {
            HttpUtils.consume(response.getEntity());
        }
        throw new NotReachableException(reason);
    }
    String html = convertStreamToString(response.getEntity().getContent());
    HttpUtils.consume(response.getEntity());

    Document doc = Jsoup.parse(html);

    // get text fields
    Elements text_opts = doc.select("form select[name=REG1] option");
    for (Element opt : text_opts) {
        TextSearchField field = new TextSearchField();
        field.setId(opt.attr("value"));
        field.setDisplayName(opt.text());
        field.setHint("");
        fields.add(field);
    }

    // get media types
    Elements mt_opts = doc.select("form input[name~=(MT|MS)]");
    if (mt_opts.size() > 0) {
        DropdownSearchField mtDropdown = new DropdownSearchField();
        mtDropdown.setId(mt_opts.get(0).attr("name"));
        mtDropdown.setDisplayName("Medientyp");
        for (Element opt : mt_opts) {
            if (!opt.val().equals("")) {
                // the label of an input lives in different places depending
                // on the page layout; try each known layout until one yields
                // a non-empty text
                String text = opt.text();
                if (text.length() == 0) {
                    // text is empty, check layouts:
                    // Essen: <input name="MT"><img title="mediatype">
                    // Schaffenb: <input name="MT"><img alt="mediatype">
                    Element img = opt.nextElementSibling();
                    if (img != null && img.tagName().equals("img")) {
                        text = img.attr("title");
                        if (text.equals("")) {
                            text = img.attr("alt");
                        }
                    }
                }
                if (text.length() == 0) {
                    // text is still empty, check table layout, Example
                    // Friedrichshafen
                    // <td><input name="MT"></td> <td><img
                    // title="mediatype"></td>
                    Element td1 = opt.parent();
                    Element td2 = td1.nextElementSibling();
                    if (td2 != null) {
                        Elements td2Children = td2.select("img[title]");
                        if (td2Children.size() > 0) {
                            text = td2Children.get(0).attr("title");
                        }
                    }
                }
                if (text.length() == 0) {
                    // text is still empty, check images in label layout, Example
                    // Wiedenest
                    // <input type="radio" name="MT" id="MTYP1" value="MTYP1">
                    // <label for="MTYP1"><img src="http://www.wiedenest.de/bib/image/books
                    // .png" alt="Bücher" title="Bücher"></label>
                    Element label = opt.nextElementSibling();
                    if (label != null) {
                        Elements td2Children = label.select("img[title]");
                        if (td2Children.size() > 0) {
                            text = td2Children.get(0).attr("title");
                        }
                    }
                }
                if (text.length() == 0) {
                    // text is still empty: missing end tag like Offenburg
                    text = parse_option_regex(opt);
                }
                mtDropdown.addDropdownValue(opt.val(), text);
            }
        }
        fields.add(mtDropdown);
    }

    // get branches
    Elements br_opts = doc.select("form select[name=ZW] option");
    if (br_opts.size() > 0) {
        DropdownSearchField brDropdown = new DropdownSearchField();
        brDropdown.setId(br_opts.get(0).parent().attr("name"));
        // the display name is the text of the element preceding the select's
        // container, stripped of non-breaking spaces and question marks
        brDropdown.setDisplayName(br_opts.get(0).parent().parent().previousElementSibling().text()
                .replace("\u00a0", "").replace("?", "").trim());
        for (Element opt : br_opts) {
            brDropdown.addDropdownValue(opt.val(), opt.text());
        }
        fields.add(brDropdown);
    }

    return fields;
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * Decides whether a node may be boosted. A lot of times the first paragraph
 * is merely the caption under an image, so a node only qualifies when one of
 * its following siblings is a paragraph that sits close by and carries some
 * substantial weight, measured by its stop-word count.
 *
 * @param node the element whose following siblings are inspected
 * @return {@code true} when a paragraph sibling fewer than three steps away
 *         contains more than five stop words; {@code false} when a paragraph
 *         sibling is reached three or more steps away, or when no sibling
 *         qualifies
 */
private boolean isOkToBoost(Element node) {

    int distance = 0;

    // distance counts every sibling already visited, paragraph or not
    for (Element candidate = node.nextElementSibling(); candidate != null;
            candidate = candidate.nextElementSibling(), distance++) {

        if (!candidate.tagName().equals("p")) {
            continue;
        }

        if (distance >= 3) {
            if (logger.isDebugEnabled()) {
                logger.debug("Next paragraph is too far away, not boosting");
            }
            return false;
        }

        WordStats stats = StopWords.getStopWordCount(candidate.text());
        if (stats.getStopWordCount() > 5) {
            if (logger.isDebugEnabled()) {
                logger.debug("We're gonna boost this node, seems contenty");
            }
            return true;
        }
    }

    return false;
}

From source file:de.geeksfactory.opacclient.apis.Pica.java

/**
 * Builds a {@code DetailledItem} from the HTML of a result page: id (PPN),
 * cover, title and subtitle, the detail rows, the copies listed after the
 * {@code <hr>} separator, reservation information (stored as a JSON string)
 * and the volume search data.
 *
 * @param html the HTML source to parse
 * @return the item populated from the page
 */
protected DetailledItem parse_result(String html) {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);

    DetailledItem result = new DetailledItem();
    // the attribute selector must be closed with ']': an unbalanced selector
    // is rejected by stricter jsoup versions
    for (Element a : doc.select("a[href*=PPN]")) {
        Map<String, String> hrefq = getQueryParamsFirst(a.absUrl("href"));
        String ppn = hrefq.get("PPN");
        result.setId(ppn);
        break;
    }

    // GET COVER
    if (doc.select("td.preslabel:contains(ISBN) + td.presvalue").size() > 0) {
        Element isbnElement = doc.select("td.preslabel:contains(ISBN) + td.presvalue").first();
        String isbn = "";
        // only the first text node of the cell is used as ISBN
        for (Node child : isbnElement.childNodes()) {
            if (child instanceof TextNode) {
                isbn = ((TextNode) child).text().trim();
                break;
            }
        }
        result.setCover(ISBNTools.getAmazonCoverURL(isbn, true));
    }

    // GET TITLE AND SUBTITLE
    String titleAndSubtitle;
    Element titleAndSubtitleElem = null;
    String titleRegex = ".*(Titel|Aufsatz|Zeitschrift|Gesamttitel"
            + "|Title|Article|Periodical|Collective\\stitle" + "|Titre|Article|P.riodique|Titre\\sg.n.ral).*";
    String selector = "td.preslabel:matches(" + titleRegex + ") + td.presvalue";
    if (doc.select(selector).size() > 0) {
        titleAndSubtitleElem = doc.select(selector).first();
        titleAndSubtitle = titleAndSubtitleElem.text().trim();
        // NOTE(review): Math.min yields -1 as soon as either separator is
        // missing, so the title is only split when both "/" and ":" occur.
        // Confirm whether that is intended.
        int slashPosition = Math.min(titleAndSubtitle.indexOf("/"), titleAndSubtitle.indexOf(":"));
        String title;
        if (slashPosition > 0) {
            title = titleAndSubtitle.substring(0, slashPosition).trim();
            String subtitle = titleAndSubtitle.substring(slashPosition + 1).trim();
            result.addDetail(new Detail(stringProvider.getString(StringProvider.SUBTITLE), subtitle));
        } else {
            title = titleAndSubtitle;
        }
        result.setTitle(title);
    } else {
        result.setTitle("");
    }

    // Details
    // "line" tracks the index into "lines"; it is reused below to continue
    // with the copies after the separator row
    int line = 0;
    Elements lines = doc.select("td.preslabel + td.presvalue");
    if (titleAndSubtitleElem != null) {
        lines.remove(titleAndSubtitleElem);
    }
    for (Element element : lines) {
        Element titleElem = element.firstElementSibling();
        String detail = "";
        if (element.select("div").size() > 1 && element.select("div").text().equals(element.text())) {
            // the value consists only of several divs: join the non-empty
            // ones with line breaks
            boolean first = true;
            for (Element div : element.select("div")) {
                if (!div.text().replace("\u00a0", " ").trim().equals("")) {
                    if (!first) {
                        detail += "\n" + div.text().replace("\u00a0", " ").trim();
                    } else {
                        detail += div.text().replace("\u00a0", " ").trim();
                        first = false;
                    }
                }
            }
        } else {
            detail = element.text().replace("\u00a0", " ").trim();
        }
        String title = titleElem.text().replace("\u00a0", " ").trim();

        if (element.select("hr").size() > 0)
        // after the separator we get the copies
        {
            break;
        }

        if (detail.length() == 0 && title.length() == 0) {
            line++;
            continue;
        }
        if (title.contains(":")) {
            title = title.substring(0, title.indexOf(":")); // remove colon
        }
        result.addDetail(new Detail(title, detail));

        // a single non-empty link pointing outside the OPAC is added as a
        // separate detail
        if (element.select("a").size() == 1 && !element.select("a").get(0).text().trim().equals("")) {
            String url = element.select("a").first().absUrl("href");
            if (!url.startsWith(opac_url)) {
                result.addDetail(new Detail(stringProvider.getString(StringProvider.LINK), url));
            }
        }

        line++;
    }
    line++; // next line after separator

    // Copies
    Copy copy = new Copy();
    String location = "";

    // reservation info will be stored as JSON
    JSONArray reservationInfo = new JSONArray();

    while (line < lines.size()) {
        Element element = lines.get(line);
        if (element.select("hr").size() == 0) {
            Element titleElem = element.firstElementSibling();
            String detail = element.text().trim();
            String title = titleElem.text().replace("\u00a0", " ").trim();

            if (detail.length() == 0 && title.length() == 0) {
                line++;
                continue;
            }

            if (title.contains("Standort") || title.contains("Vorhanden in") || title.contains("Location")) {
                location += detail;
            } else if (title.contains("Sonderstandort")) {
                location += " - " + detail;
            } else if (title.contains("Systemstelle") || title.contains("Subject")) {
                copy.setDepartment(detail);
            } else if (title.contains("Fachnummer") || title.contains("locationnumber")) {
                copy.setLocation(detail);
            } else if (title.contains("Signatur") || title.contains("Shelf mark")) {
                copy.setShelfmark(detail);
            } else if (title.contains("Anmerkung")) {
                location += " (" + detail + ")";
            } else if (title.contains("Link")) {
                result.addDetail(new Detail(title.replace(":", "").trim(), detail));
            } else if (title.contains("Status") || title.contains("Ausleihinfo")
                    || title.contains("Ausleihstatus") || title.contains("Request info")) {
                // Find return date
                Pattern pattern = Pattern.compile("(till|bis) (\\d{2}-\\d{2}-\\d{4})");
                Matcher matcher = pattern.matcher(detail);
                if (matcher.find()) {
                    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd-MM-yyyy").withLocale(Locale.GERMAN);
                    try {
                        copy.setStatus(detail.substring(0, matcher.start() - 1).trim());
                        copy.setReturnDate(fmt.parseLocalDate(matcher.group(2)));
                    } catch (IllegalArgumentException e) {
                        // unparsable date: keep the raw text as status
                        e.printStackTrace();
                        copy.setStatus(detail);
                    }
                } else {
                    copy.setStatus(detail);
                }
                // Get reservation info
                if (element.select("a:has(img[src*=inline_arrow])").size() > 0) {
                    Element a = element.select("a:has(img[src*=inline_arrow])").first();
                    boolean multipleCopies = a.text().matches(".*(Exemplare|Volume list).*");
                    JSONObject reservation = new JSONObject();
                    try {
                        reservation.put("multi", multipleCopies);
                        reservation.put("link", _extract_url(a.absUrl("href")));
                        reservation.put("desc", location);
                        reservationInfo.put(reservation);
                    } catch (JSONException e1) {
                        e1.printStackTrace();
                    }
                    result.setReservable(true);
                }
            }
        } else {
            // a separator row closes the current copy
            copy.setBranch(location);
            result.addCopy(copy);
            location = "";
            copy = new Copy();
        }
        line++;
    }

    // add the last copy if it was not closed by a separator
    if (copy.notEmpty()) {
        copy.setBranch(location);
        result.addCopy(copy);
    }

    if (reservationInfo.length() == 0) {
        // No reservation info found yet, because we didn't find any copies.
        // If there is a reservation link somewhere in the rows we interpreted
        // as details, we still want to use it.
        if (doc.select("td a:has(img[src*=inline_arrow])").size() > 0) {
            Element a = doc.select("td a:has(img[src*=inline_arrow])").first();
            boolean multipleCopies = a.text().matches(".*(Exemplare|Volume list).*");
            JSONObject reservation = new JSONObject();
            try {
                reservation.put("multi", multipleCopies);
                reservation.put("link", _extract_url(a.attr("href")));
                reservation.put("desc", location);
                reservationInfo.put(reservation);
            } catch (JSONException e1) {
                e1.printStackTrace();
            }
            result.setReservable(true);
        }
    }
    result.setReservation_info(reservationInfo.toString());

    // Volumes
    if (doc.select("a[href^=FAM?PPN=]").size() > 0) {
        String href = doc.select("a[href^=FAM?PPN=]").attr("href");
        String ppn = getQueryParamsFirst(href).get("PPN");
        Map<String, String> data = new HashMap<>();
        data.put("ppn", ppn);
        result.setVolumesearch(data);
    }

    return result;
}

From source file:mml.handler.post.MMLPostHTMLHandler.java

/**
 * Parse a codeblock/*from  w  w  w  . ja v  a 2 s .c om*/
 * @param elem the element to parse
 * @throws a JSON exception
 */
private void parsePre(Element elem) throws JSONException {
    if (elem.hasText()) {
        int offset = sb.length();
        String name = elem.attr("class");
        if (name == null || name.length() == 0)
            name = "pre";
        Range r = new Range(name, offset, 0);
        stil.add(r);
        if (elem.hasAttr("class")) {
            List<Node> children = elem.childNodes();
            for (Node child : children) {
                if (child instanceof Element) {
                    if (child.nodeName().equals("span"))
                        parseSpan((Element) child);
                    else
                        parseOtherElement((Element) child);
                } else if (child instanceof TextNode)
                    sb.append(((TextNode) child).getWholeText());
            }
        } else
            sb.append(elem.text());
        this.stil.updateLen(r, sb.length() - offset);
    }
    prevWasMilestone = false;
    ensure(1, false);
}

From source file:de.geeksfactory.opacclient.apis.BiBer1992.java

/**
 * Performs a reservation in two steps.
 * <p>
 * Step 1 (when {@code selection} is {@code null} or "confirmed") loads the
 * reservation form and collects the selectable {@code ID1} options. With
 * several options a {@code SELECTION_NEEDED} result is returned; with exactly
 * one, step 2 is invoked directly with it; with none, an {@code ERROR} result
 * is returned.
 * <p>
 * Step 2 posts the reservation using the account credentials and the selected
 * option, whose key has the form {@code value:text}.
 *
 * @param item       the item to reserve; its reservation info is used
 * @param account    the account whose name and password are submitted
 * @param useraction passed through unchanged to the recursive call
 * @param selection  {@code null} or "confirmed" for step 1, otherwise the
 *                   key of the chosen option
 * @return the outcome of the reservation step
 * @throws IOException if one of the HTTP requests fails
 */
@Override
public ReservationResult reservation(DetailledItem item, Account account, int useraction, String selection)
        throws IOException {
    String resinfo = item.getReservation_info();
    if (selection == null || selection.equals("confirmed")) {
        // STEP 1: Check if reservable and select branch ("ID1")

        // Differences between opax and opac
        String func = opacDir.contains("opax") ? "sigl" : "resF";
        String id = opacDir.contains("opax") ? (resinfo.contains("resF") ? resinfo.substring(5) + "=" + resinfo
                : resinfo + "=resF_" + resinfo) : "ID=" + resinfo;

        String html = httpGet(
                opacUrl + "/" + opacDir + "/reserv" + opacSuffix + "?LANG=de&FUNC=" + func + "&" + id,
                getDefaultEncoding());
        Document doc = Jsoup.parse(html);
        newStyleReservations = doc.select("input[name=" + resinfo.replace("resF_", "") + "]").val()
                .length() > 4;

        List<Map<String, String>> options = new ArrayList<>();
        for (Element option : doc.select("select[name=ID1] option")) {
            // entries with value "0" are not selectable
            if ("0".equals(option.attr("value"))) {
                continue;
            }
            Map<String, String> selopt = new HashMap<>();
            selopt.put("key", option.attr("value") + ":" + option.text());
            selopt.put("value", option.text());
            options.add(selopt);
        }

        if (options.isEmpty()) {
            // covers both a missing select and a select that only offers
            // skipped entries (the latter used to fail on options.get(0))
            ReservationResult res = new ReservationResult(MultiStepResult.Status.ERROR);
            res.setMessage("Dieses Medium ist nicht reservierbar.");
            return res;
        } else if (options.size() > 1) {
            ReservationResult res = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED);
            res.setActionIdentifier(ReservationResult.ACTION_BRANCH);
            res.setSelection(options);
            return res;
        } else {
            return reservation(item, account, useraction, options.get(0).get("key"));
        }
    } else {
        // STEP 2: Reserve
        // the selection key is "value:text"; the text part may be missing
        String[] selectionParts = selection.split(":");
        String selectedValue = selectionParts.length > 0 ? selectionParts[0] : "";
        String selectedText = selectionParts.length > 1 ? selectionParts[1] : "";

        List<NameValuePair> nameValuePairs = new ArrayList<>();
        nameValuePairs.add(new BasicNameValuePair("LANG", "de"));
        nameValuePairs.add(new BasicNameValuePair("BENUTZER", account.getName()));
        nameValuePairs.add(new BasicNameValuePair("PASSWORD", account.getPassword()));
        nameValuePairs.add(new BasicNameValuePair("FUNC", "vors"));
        if (opacDir.contains("opax")) {
            nameValuePairs.add(new BasicNameValuePair(resinfo.replace("resF_", ""),
                    "vors" + (newStyleReservations ? resinfo.replace("resF_", "") : "")));
        }
        if (newStyleReservations) {
            nameValuePairs.add(new BasicNameValuePair("ID11", selectedText));
        }
        nameValuePairs.add(new BasicNameValuePair("ID1", selectedValue));

        String html = httpPost(opacUrl + "/" + opacDir + "/setreserv" + opacSuffix,
                new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding());

        Document doc = Jsoup.parse(html);
        // the response confirms success with a text containing "eingetragen"
        if (doc.select(".tab21 .p44b, .p2").text().contains("eingetragen")) {
            return new ReservationResult(MultiStepResult.Status.OK);
        } else {
            ReservationResult res = new ReservationResult(MultiStepResult.Status.ERROR);
            if (doc.select(".p1, .p22b").size() > 0) {
                res.setMessage(doc.select(".p1, .p22b").text());
            }
            return res;
        }
    }
}

From source file:org.shareok.data.sagedata.SageJournalIssueDateProcessor.java

@SuppressWarnings("empty-statement")
public void retrieveSageJournalVolIssueDates(Map<String, String> processedJournalsMap) {
    List<String> processedJournals = new ArrayList<>();
    //        JSONObject jsonObj = getSavedSageJournalVolIssueDateInformation();
    try {
        Map<String, Map<String, String>> journalMap = getSavedSageJournalVolIssueDateInformation();
        if (null == journalMap) {
            journalMap = new HashMap<>();
        }
        Document doc = null;
        try {
            doc = Jsoup.connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199")
                    .userAgent(
                            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                    .cookie("auth", "token").timeout(300000).get();
            Elements trs = doc.select("form#browsePublicationsForm").get(0).select("table").get(0)
                    .select("tbody").get(0).select("tr");
            for (Element tr : trs) {
                Element link = tr.select("td").get(1).select("a").get(0);
                String journalName = link.text();
                String journalLink = SageDataUtil.SAGE_HTTP_PREFIX + link.attr("href");
                String[] linkInfo = journalLink.split("/");
                String journalIssuesLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/"
                        + linkInfo[linkInfo.length - 1];
                if (null == journalMap.get(journalName)) {
                    Map<String, String> infoMap = new HashMap<>();
                    infoMap.put("homeLink", journalLink);
                    infoMap.put("issueLink", journalIssuesLink);
                    journalMap.put(journalName, infoMap);
                } else {
                    Map<String, String> infoMap = journalMap.get(journalName);
                    if (null == infoMap.get("homeLink")) {
                        infoMap.put("homeLink", journalLink);
                    }
                    if (null == infoMap.get("issueLink")) {
                        infoMap.put("issueLink", journalIssuesLink);
                    }
                }
            }
            int kk = 0;
            mainLoop: for (String journal : journalMap.keySet()) {
                System.out.println("Print out journal " + journal + " information :");
                if (null != processedJournalsMap && (journal == null ? processedJournalsMap.get(journal) == null
                        : journal.equals(processedJournalsMap.get(journal)))) {
                    System.out.println("Journal : has already been processed!");
                    continue;
                }
                //                    if(journal.contains("Christian Education")){
                //                        System.out.println("Journal name : International Journal of Health Services, cannot be processed!");
                ////                        continue;
                //                    }
                //                    if(journal.contains("Plastic Surgery")){
                //                        System.out.println("Journal name : International Journal of Health Services, cannot be processed!");
                //                        continue;
                //                    }
                Map<String, String> journalInfoMap = journalMap.get(journal);
                for (String key : journalInfoMap.keySet()) {
                    if (key.equals("issueLink")) {
                        Document loiDdoc = null;
                        try {
                            loiDdoc = Jsoup.connect(journalInfoMap.get(key)).userAgent(
                                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                                    .cookie("auth", "token").timeout(300000).get();
                        } catch (HttpStatusException ex) {
                            ex.printStackTrace();
                            break;
                        }
                        Thread.sleep(2200);
                        if (null != loiDdoc) {
                            Map<String, Map<String, String>> dataMap;
                            if (null != journalMap.get(journal).get("data")) {
                                dataMap = DataUtil.getMapFromJson(journalMap.get(journal).get("data"));
                            } else {
                                dataMap = new HashMap<>();
                            }
                            Elements decaseDivs = loiDdoc.select("div.decade");
                            if (null != decaseDivs && decaseDivs.size() > 0) {
                                for (Element decade : decaseDivs) {
                                    Elements yearsDiv = decade.select("div.years").get(0).children();
                                    if (null != yearsDiv && yearsDiv.size() > 0) {
                                        for (Element yearEle : yearsDiv) {
                                            Elements volumesDiv = yearEle.select("div.volumes").get(0)
                                                    .children();
                                            if (null != volumesDiv && volumesDiv.size() > 0) {
                                                for (Element volumeEle : volumesDiv) {
                                                    String volume = volumeEle.select("a").get(0).text().trim()
                                                            .split("Volume")[1].trim();
                                                    Elements issueInfoDivEles = volumeEle
                                                            .select("div.js_issue");
                                                    if (null != issueInfoDivEles
                                                            && issueInfoDivEles.size() > 0) {
                                                        for (Element issueInfoDiv : issueInfoDivEles) {
                                                            String issueText = issueInfoDiv.select("a").get(0)
                                                                    .text();
                                                            issueText = issueText.split(", ")[0]
                                                                    .split("Issue")[1].trim();
                                                            String oldIssueDate = "";
                                                            String issueDate = "";
                                                            if (NO_ARTICLE_PUB_DATE_JOURNALS_LIST
                                                                    .contains(journal)) {
                                                                issueDate = "01 " + issueInfoDiv
                                                                        .select("span.loiIssueCoverDateText")
                                                                        .get(0).text().trim();
                                                                oldIssueDate = issueDate;
                                                                //                                                            if(issueDate.contains("Winter")){
                                                                //                                                                issueDate = issueDate.replaceAll("Winter", "October");
                                                                //                                                            }
                                                                //                                                            if(issueDate.contains("Fall") || issueDate.contains("Autumn")){
                                                                //                                                                issueDate = issueDate.replaceAll("Fall", "September");
                                                                //                                                                issueDate = issueDate.replaceAll("Autumn", "September");
                                                                //                                                            }
                                                                //                                                            if(issueDate.contains("Summer")){
                                                                //                                                                issueDate = issueDate.replaceAll("Summer", "April");
                                                                //                                                            }
                                                                //                                                            if(issueDate.contains("Spring")){
                                                                //                                                                issueDate = issueDate.replaceAll("Spring", "January");
                                                                //                                                            }
                                                                //                                                            try{                                                            
                                                                //                                                                // for date string like "01 July-October 2016"
                                                                //                                                                if(issueDate.contains("-")){
                                                                //                                                                    String[] dateInfo = issueDate.split("-");
                                                                //                                                                    issueDate = dateInfo[0] + " " + dateInfo[1].split(" ")[1];
                                                                //                                                                }
                                                                //                                                                // for date string like "01 July/October 2016"
                                                                //                                                                if(issueDate.contains("/")){
                                                                //                                                                    String[] dataInfo = issueDate.split("/");
                                                                //                                                                    issueDate = dataInfo[0] + " " + dataInfo[1].split(" ")[1];
                                                                //                                                                }
                                                                //                                                            }
                                                                //                                                            catch(ArrayIndexOutOfBoundsException ex){
                                                                //                                                                System.out.println("Journal name: "+journal);
                                                                //                                                                System.out.println("Volume: "+volume+", issue: "+issueText);
                                                                //                                                                System.out.println("This date string cannot be parsed: "+oldIssueDate);
                                                                //                                                                ex.printStackTrace();
                                                                //                                                                continue;
                                                                //                                                            }
                                                                try {
                                                                    issueDate = "01 " + issueInfoDiv.select(
                                                                            "span.loiIssueCoverDateText").get(0)
                                                                            .text().trim();
                                                                    oldIssueDate = issueDate;
                                                                    issueDate = DataHandlersUtil
                                                                            .convertFullMonthDateStringFormat(
                                                                                    issueDate);
                                                                } catch (ParseException ex) {
                                                                    //                                                                if(!journal.contains("OMEGA - Journal of Death and Dying")){
                                                                    //                                                                    continue;
                                                                    //                                                                }
                                                                    System.out.println(
                                                                            "Journal name: " + journal);
                                                                    System.out.println("Volume: " + volume
                                                                            + ", issue: " + issueText);
                                                                    System.out.println(
                                                                            "This date string cannot be parsed: "
                                                                                    + oldIssueDate);
                                                                    ex.printStackTrace();
                                                                    continue;
                                                                }

                                                            } else {
                                                                try {
                                                                    Element issueLinkEle = issueInfoDiv
                                                                            .select("a").get(0);
                                                                    String issueLink = issueLinkEle
                                                                            .attr("href");
                                                                    Document issueDoc = null;
                                                                    try {
                                                                        issueDoc = Jsoup.connect(issueLink)
                                                                                .userAgent(
                                                                                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                                                                                .cookie("auth", "token")
                                                                                .timeout(300000).get();
                                                                    } catch (HttpStatusException ex) {
                                                                        ex.printStackTrace();
                                                                        break mainLoop;
                                                                    }
                                                                    Thread.sleep(2200);
                                                                    Elements articleDivs = issueDoc
                                                                            .select("div.art_title, .linkable");
                                                                    String articleLink = SageDataUtil.SAGE_HTTP_PREFIX
                                                                            + articleDivs.get(0)
                                                                                    .select("a.ref, .nowrap")
                                                                                    .get(0).attr("href");
                                                                    if (articleLink.contains("pdf/")) {
                                                                        System.out.println("journal: " + journal
                                                                                + " volume=" + volume
                                                                                + " issue=" + issueText
                                                                                + " has ONLY PDF links!");
                                                                        try {
                                                                            issueDate = issueInfoDiv.select(
                                                                                    "span.loiIssueCoverDateText")
                                                                                    .get(0).text().trim();
                                                                            oldIssueDate = issueDate;
                                                                            if (issueDate.contains("Winter")) {
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Winter",
                                                                                                "December");
                                                                            }
                                                                            if (issueDate.contains("Fall")
                                                                                    || issueDate.contains(
                                                                                            "Autumn")) {
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Fall",
                                                                                                "September");
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Autumn",
                                                                                                "September");
                                                                            }
                                                                            if (issueDate.contains("Summer")) {
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Summer",
                                                                                                "June");
                                                                            }
                                                                            if (issueDate.contains("Spring")) {
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Spring",
                                                                                                "March");
                                                                            }
                                                                            if (issueDate.contains("/")) {
                                                                                String[] dataInfo = issueDate
                                                                                        .split("/");
                                                                                String dateInfo1 = dataInfo[0]
                                                                                        .trim();
                                                                                String date;
                                                                                String month1;
                                                                                String[] dateInfo1Arr = dateInfo1
                                                                                        .split(" ");
                                                                                if (dateInfo1Arr.length == 2) {
                                                                                    date = dateInfo1Arr[0];
                                                                                    month1 = dateInfo1Arr[1];
                                                                                } else {
                                                                                    date = "01";
                                                                                    month1 = dataInfo[0].trim();
                                                                                }
                                                                                String month2 = dataInfo[1]
                                                                                        .split("\\s+")[0];
                                                                                String year = dataInfo[1]
                                                                                        .split("\\s+")[1];
                                                                                String date1 = DataHandlersUtil
                                                                                        .convertFullMonthDateStringFormat(
                                                                                                date + " "
                                                                                                        + month1
                                                                                                        + " "
                                                                                                        + year);
                                                                                String date2 = DataHandlersUtil
                                                                                        .convertFullMonthDateStringFormat(
                                                                                                date + " "
                                                                                                        + month2
                                                                                                        + " "
                                                                                                        + year);
                                                                                issueDate = date1 + "::"
                                                                                        + date2;
                                                                            }
                                                                            //  The Journal of Psychiatry & Law dd MMMM-MMMM yyyy pattern
                                                                            else if (issueDate.contains("-")) {
                                                                                if (journal.equals(
                                                                                        "OMEGA - Journal of Death and Dying")) {
                                                                                    Document articleDoc = null;
                                                                                    try {
                                                                                        articleDoc = Jsoup
                                                                                                .connect(
                                                                                                        articleLink)
                                                                                                .userAgent(
                                                                                                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                                                                                                .cookie("auth",
                                                                                                        "token")
                                                                                                .timeout(300000)
                                                                                                .get();
                                                                                    } catch (HttpStatusException ex) {
                                                                                        ex.printStackTrace();
                                                                                        break mainLoop;
                                                                                    }
                                                                                    Thread.sleep(2200);
                                                                                    Element pubDateDiv = articleDoc
                                                                                            .select("div.published-dates")
                                                                                            .get(0);
                                                                                    issueDate = pubDateDiv
                                                                                            .text()
                                                                                            .split("Issue published:")[1]
                                                                                                    .trim();
                                                                                    oldIssueDate = issueDate;
                                                                                    issueDate = DataHandlersUtil
                                                                                            .convertFullMonthDateStringFormat(
                                                                                                    issueDate);
                                                                                } else {
                                                                                    String[] dataInfo = issueDate
                                                                                            .split("-");
                                                                                    String dateInfo1 = dataInfo[0]
                                                                                            .trim();
                                                                                    String date;
                                                                                    String month1;
                                                                                    String[] dateInfo1Arr = dateInfo1
                                                                                            .split(" ");
                                                                                    if (dateInfo1Arr.length == 2) {
                                                                                        date = dateInfo1Arr[0]
                                                                                                .trim();
                                                                                        month1 = dateInfo1Arr[1]
                                                                                                .trim();
                                                                                    } else {
                                                                                        date = "01";
                                                                                        month1 = dataInfo[0]
                                                                                                .trim();
                                                                                    }
                                                                                    String month2 = dataInfo[1]
                                                                                            .split("\\s+")[0];
                                                                                    String year = dataInfo[1]
                                                                                            .split("\\s+")[1];
                                                                                    String date1 = DataHandlersUtil
                                                                                            .convertFullMonthDateStringFormat(
                                                                                                    date + " "
                                                                                                            + month1
                                                                                                            + " "
                                                                                                            + year);
                                                                                    String date2 = DataHandlersUtil
                                                                                            .convertFullMonthDateStringFormat(
                                                                                                    date + " "
                                                                                                            + month2
                                                                                                            + " "
                                                                                                            + year);
                                                                                    issueDate = date1 + "::"
                                                                                            + date2;
                                                                                }
                                                                            } else {
                                                                                issueDate = "01 " + issueDate;
                                                                                issueDate = DataHandlersUtil
                                                                                        .convertFullMonthDateStringFormat(
                                                                                                issueDate);
                                                                            }
                                                                        } catch (ParseException
                                                                                | ArrayIndexOutOfBoundsException ex) {
                                                                            System.out.println(
                                                                                    "Journal name: " + journal);
                                                                            System.out.println("Volume: "
                                                                                    + volume + ", issue: "
                                                                                    + issueText);
                                                                            System.out.println(
                                                                                    "This date string cannot be parsed: "
                                                                                            + issueDate);
                                                                            ex.printStackTrace();
                                                                            continue;
                                                                        }
                                                                    } else {
                                                                        Document articleDoc = null;
                                                                        try {
                                                                            articleDoc = Jsoup
                                                                                    .connect(articleLink)
                                                                                    .userAgent(
                                                                                            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                                                                                    .cookie("auth", "token")
                                                                                    .timeout(300000).get();
                                                                        } catch (HttpStatusException ex) {
                                                                            ex.printStackTrace();
                                                                            break mainLoop;
                                                                        }
                                                                        Thread.sleep(2200);
                                                                        Element pubDateDiv = articleDoc
                                                                                .select("div.published-dates")
                                                                                .get(0);
                                                                        issueDate = pubDateDiv.text()
                                                                                .split("Issue published:")[1]
                                                                                        .trim();
                                                                        oldIssueDate = issueDate;
                                                                        issueDate = DataHandlersUtil
                                                                                .convertFullMonthDateStringFormat(
                                                                                        issueDate);
                                                                    }

                                                                } catch (Exception ex) {
                                                                    logger.error(
                                                                            "Cannot get the issue date for journal ="
                                                                                    + journal + " volume="
                                                                                    + volume + " issue="
                                                                                    + issueText + " date="
                                                                                    + oldIssueDate,
                                                                            ex);
                                                                    continue;
                                                                }
                                                            }
                                                            if (DataHandlersUtil.datesCompare(issueDate,
                                                                    "2010-01-01") < 0) {
                                                                if (dataMap.size() > 0) {
                                                                    ObjectMapper mapper = new ObjectMapper();
                                                                    String json = mapper
                                                                            .writeValueAsString(dataMap);
                                                                    journalInfoMap.put("data", json);
                                                                }
                                                                processedJournals.add(journal);
                                                                continue mainLoop;
                                                            }
                                                            try {
                                                                if (null != dataMap && dataMap.size() > 0
                                                                        && null != dataMap.get(volume)
                                                                        && null != dataMap.get(volume)
                                                                                .get(issueText)) {
                                                                    continue;
                                                                } else {
                                                                    Map<String, String> issueMap = dataMap
                                                                            .get(volume);
                                                                    if (null == issueMap) {
                                                                        issueMap = new HashMap<>();
                                                                        issueMap.put(issueText, issueDate);
                                                                        dataMap.put(volume, issueMap);
                                                                    } else {
                                                                        issueMap.put(issueText, issueDate);
                                                                    }
                                                                    System.out.println("This is vol. " + volume
                                                                            + " and issue " + issueText
                                                                            + " and date " + issueDate);
                                                                }
                                                            } catch (Exception ex) {
                                                                System.out.println(
                                                                        "Cannot add the pub date info into data map for vol. "
                                                                                + volume + " and issue "
                                                                                + issueText + " and date "
                                                                                + issueDate);
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }

                                }
                            }
                            if (dataMap.size() > 0) {
                                ObjectMapper mapper = new ObjectMapper();
                                String json = mapper.writeValueAsString(dataMap);
                                journalInfoMap.put("data", json);
                            }
                        }

                    }
                }
                processedJournals.add(journal);
                if (kk > 100) {
                    break;
                }
                kk++;
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        ObjectMapper mapper = new ObjectMapper();
        String json = mapper.writeValueAsString(journalMap);
        String sageJournalIssueDateInfoFilePath = ShareokdataManager.getSageJournalIssueDateInfoFilePath();
        File sageFile = new File(sageJournalIssueDateInfoFilePath);
        if (sageFile.exists()) {
            String sageJournalIssueDateInfoFilePathOld = sageJournalIssueDateInfoFilePath.split("\\.")[0] + "_"
                    + DataHandlersUtil.getCurrentTimeString() + ".json";
            sageFile.renameTo(new File(sageJournalIssueDateInfoFilePathOld));
        }
        DocumentProcessorUtil.outputStringToFile(json,
                ShareokdataManager.getSageJournalIssueDateInfoFilePath());
        System.out.println("processed journals = " + mapper.writeValueAsString(processedJournals));
    } catch (Exception ex) {
        logger.error("Cannot process the issue dates.", ex);
    }
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Pulls the text of a Wikipedia page, without images, tags, etc.
 * <p>
 * The page title and the children of the main content element are read from
 * the retrieved document. Sections whose (lower-cased) title appears in
 * {@code IGNORED_SECTIONS} are skipped entirely. Two texts are built in
 * parallel: a raw one and a linked one.
 * <p>
 * If one of the exceptions caught below occurs, its stack trace is printed
 * and whatever {@code result} holds at that point is returned: {@code null}
 * when the failure happens before the article object is created, otherwise
 * a partially filled article.
 *
 * @param url
 *       Address of the targeted text.
 * @return
 *       An Article object representing the retrieved object (possibly
 *       {@code null}, see above).
 *
 * @throws ReaderException
 *       Problem while retrieving the text. NOTE(review): not thrown directly
 *       by this body; presumably propagated by a helper -- confirm.
 */
@Override
public Article read(URL url) throws ReaderException {
    Article result = null;
    String name = getName(url);

    try {
        // fetch the page
        logger.log("Retrieving page " + url.toString());
        long startTime = System.currentTimeMillis();
        Document page = retrieveSourceCode(name, url);

        // title of the article
        String title = page.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0).text();
        logger.log("Get title: " + title);

        // raw and linked texts are accumulated side by side
        logger.log("Get raw and linked texts.");
        StringBuilder rawBuffer = new StringBuilder();
        StringBuilder linkedBuffer = new StringBuilder();
        Element contentElt = page.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0);

        // skipSection: we are inside a section listed in IGNORED_SECTIONS
        // atStart: nothing has been kept yet (used to drop leading clutter)
        boolean skipSection = false;
        boolean atStart = true;
        for (Element child : contentElt.children()) {
            String tagName = child.tag().getName();
            String cssClass = child.attr(XmlNames.ATT_CLASS);

            // top-level section header: decides whether the section is kept
            if (tagName.equals(XmlNames.ELT_H2)) {
                atStart = false;
                // render the header into throw-away buffers to get its name
                StringBuilder headingRaw = new StringBuilder();
                StringBuilder headingLinked = new StringBuilder();
                processParagraphElement(child, headingRaw, headingLinked);
                String sectionName = headingRaw.toString().trim().toLowerCase(Locale.ENGLISH);

                skipSection = IGNORED_SECTIONS.contains(sectionName);
                if (!skipSection) {
                    rawBuffer.append("\n-----");
                    linkedBuffer.append("\n-----");
                    processParagraphElement(child, rawBuffer, linkedBuffer);
                }
                continue;
            }

            // everything below an ignored header is dropped
            if (skipSection) {
                continue;
            }

            boolean subHeading = tagName.equals(XmlNames.ELT_H3) || tagName.equals(XmlNames.ELT_H4)
                    || tagName.equals(XmlNames.ELT_H5) || tagName.equals(XmlNames.ELT_H6);

            if (subHeading) {
                // lower-level section headers
                atStart = false;
                processParagraphElement(child, rawBuffer, linkedBuffer);
            } else if (tagName.equals(XmlNames.ELT_P)) {
                // paragraph: skip a possible initial disambiguation link
                String paragraphText = child.text();
                boolean leadingDisambiguation = atStart && paragraphText.startsWith(PARAGRAPH_FORTHE);
                if (!leadingDisambiguation) {
                    atStart = false;
                    processParagraphElement(child, rawBuffer, linkedBuffer);
                }
            } else if (tagName.equals(XmlNames.ELT_UL)) {
                // unordered list
                atStart = false;
                processListElement(child, rawBuffer, linkedBuffer, false);
            } else if (tagName.equals(XmlNames.ELT_OL)) {
                // ordered list
                atStart = false;
                processListElement(child, rawBuffer, linkedBuffer, true);
            } else if (tagName.equals(XmlNames.ELT_DL)) {
                // description list
                atStart = false;
                processDescriptionListElement(child, rawBuffer, linkedBuffer);
            } else if (tagName.equals(XmlNames.ELT_TABLE)) {
                // tables
                atStart = !processTableElement(child, rawBuffer, linkedBuffer);
            } else if (tagName.equals(XmlNames.ELT_DIV)) {
                // divisions: skip a possible initial picture
                boolean leadingThumb = atStart && cssClass != null && cssClass.contains(CLASS_THUMB);
                if (!leadingThumb) {
                    atStart = !processDivisionElement(child, rawBuffer, linkedBuffer);
                }
            } else if (tagName.equals(XmlNames.ELT_SPAN)) {
                // certain spans are ignored (phonetic transcription, WP buttons...)
                atStart = !processSpanElement(child, rawBuffer, linkedBuffer);
            } else if (tagName.equals(XmlNames.ELT_A)) {
                // hyperlinks go to the linked string, provided they are not external
                atStart = !processHyperlinkElement(child, rawBuffer, linkedBuffer);
            } else if (tagName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                // quotes are just processed recursively
                atStart = !processQuoteElement(child, rawBuffer, linkedBuffer);
            }
            // any other tag is ignored
        }

        // build the article object
        result = new Article(name);
        result.setTitle(title);
        result.setUrl(url);
        result.initDate();

        // cleaned raw text
        String rawText = cleanText(rawBuffer.toString());
        result.setRawText(rawText);
        logger.log("Length of the raw text: " + rawText.length() + " chars.");

        // cleaned linked text
        String linkedText = cleanText(linkedBuffer.toString());
        result.setLinkedText(linkedText);
        logger.log("Length of the linked text: " + linkedText.length() + " chars.");

        // original html source code
        logger.log("Get original HTML source code.");
        String originalPage = page.toString();
        result.setOriginalPage(originalPage);
        logger.log("Length of the original page: " + originalPage.length() + " chars.");

        // categories of the article
        List<ArticleCategory> categories = getArticleCategories(result);
        result.setCategories(categories);

        long elapsed = System.currentTimeMillis() - startTime;
        logger.log("Total duration: " + elapsed + " ms.");
    } catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.json.simple.parser.ParseException e) {
        e.printStackTrace();
    }

    return result;
}

From source file:com.vaushell.shaarlijavaapi.ShaarliClient.java

/**
 * Extracts a piece of text from an element, as described by a named template.
 * <p>
 * The template may narrow the element with a CSS path (first match is used),
 * pick an attribute instead of the element text, and apply a regular
 * expression. When the expression finds a match, the trimmed match replaces
 * the content; when it does not, the content is left as it was.
 *
 * @param source element to read from; must not be {@code null}
 * @param templateName key of the template to apply
 * @return the trimmed extracted content, or {@code null} if the CSS path
 *         matches nothing or the resulting content is empty
 * @throws IllegalArgumentException if {@code source} is {@code null} or no
 *         template is registered under {@code templateName}
 */
private String extract(final Element source, final String templateName) {
    if (source == null) {
        throw new IllegalArgumentException();
    }

    final ShaarliTemplates.Template tpl = templates.get(templateName);
    if (tpl == null) {
        throw new IllegalArgumentException("template '" + templateName + "' not found");
    }

    // An empty CSS path means the source element itself is the target.
    Element target = source;
    if (!tpl.cssPath.isEmpty()) {
        final Elements matches = source.select(tpl.cssPath);
        if (matches.isEmpty()) {
            return null;
        }
        target = matches.first();
    }

    // An empty attribute name means the element text is wanted.
    final String rawValue = tpl.attribut.isEmpty() ? target.text() : target.attr(tpl.attribut);
    if (rawValue == null) {
        return null;
    }

    String value = rawValue.trim();
    if (!tpl.regex.isEmpty()) {
        final Matcher matcher = Pattern.compile(tpl.regex).matcher(value);
        if (matcher.find()) {
            value = matcher.group().trim();
        }
    }

    return value.isEmpty() ? null : value;
}