Example usage for org.jsoup.nodes Element children

List of usage examples for org.jsoup.nodes Element children

Introduction

This page collects example usages of the children() method of org.jsoup.nodes.Element.

Prototype

public Elements children() 

Source Link

Document

Get this element's child elements.

Usage

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Pulls a text from a Wikipedia URL without images, tags, etc.
 * /*  w  w  w .  ja v  a  2 s  .  com*/
 * @param url
 *       Address of the targetted text.
 * @return
 *       An Article object representing the retrieved object.
 * 
 * @throws ReaderException
 *       Problem while retrieving the text.
 */
@Override
public Article read(URL url) throws ReaderException {
    Article result = null;
    String name = getName(url);

    try { // get the page
        String address = url.toString();
        logger.log("Retrieving page " + address);
        long startTime = System.currentTimeMillis();
        Document document = retrieveSourceCode(name, url);

        // get its title
        Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0);
        String title = firstHeadingElt.text();
        logger.log("Get title: " + title);

        // get raw and linked texts
        logger.log("Get raw and linked texts.");
        StringBuilder rawStr = new StringBuilder();
        StringBuilder linkedStr = new StringBuilder();
        Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0);
        // processing each element in the content part
        boolean ignoringSection = false;
        boolean first = true;
        for (Element element : bodyContentElt.children()) {
            String eltName = element.tag().getName();
            String eltClass = element.attr(XmlNames.ATT_CLASS);

            // section headers
            if (eltName.equals(XmlNames.ELT_H2)) {
                first = false;
                // get section name
                StringBuilder fakeRaw = new StringBuilder();
                StringBuilder fakeLinked = new StringBuilder();
                processParagraphElement(element, fakeRaw, fakeLinked);
                String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH);
                // check section name
                if (IGNORED_SECTIONS.contains(str))
                    ignoringSection = true;
                else {
                    ignoringSection = false;
                    rawStr.append("\n-----");
                    linkedStr.append("\n-----");
                    processParagraphElement(element, rawStr, linkedStr);
                }
            }

            else if (!ignoringSection) { // lower sections
                if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4)
                        || eltName.equals(XmlNames.ELT_H5) || eltName.equals(XmlNames.ELT_H6)) {
                    first = false;
                    processParagraphElement(element, rawStr, linkedStr);
                }

                // paragraph
                else if (eltName.equals(XmlNames.ELT_P)) {
                    String str = element.text();
                    // ignore possible initial disambiguation link
                    if (!first || !str.startsWith(PARAGRAPH_FORTHE)) {
                        first = false;
                        processParagraphElement(element, rawStr, linkedStr);
                    }
                }

                // list
                else if (eltName.equals(XmlNames.ELT_UL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, false);
                } else if (eltName.equals(XmlNames.ELT_OL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, true);
                } else if (eltName.equals(XmlNames.ELT_DL)) {
                    first = false;
                    processDescriptionListElement(element, rawStr, linkedStr);
                }

                // tables
                else if (eltName.equals(XmlNames.ELT_TABLE)) {
                    first = !processTableElement(element, rawStr, linkedStr);
                }

                // divisions
                else if (eltName.equals(XmlNames.ELT_DIV)) { // ignore possible initial picture 
                    if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB))
                        first = !processDivisionElement(element, rawStr, linkedStr);
                }

                // we ignore certain types of span (phonetic trancription, WP buttons...) 
                else if (eltName.equals(XmlNames.ELT_SPAN)) {
                    first = !processSpanElement(element, rawStr, linkedStr);
                }

                // hyperlinks must be included in the linked string, provided they are not external
                else if (eltName.equals(XmlNames.ELT_A)) {
                    first = !processHyperlinkElement(element, rawStr, linkedStr);
                }

                // quotes are just processed recursively
                else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                    first = !processQuoteElement(element, rawStr, linkedStr);
                }

                // other tags are ignored
            }
        }

        // create article object
        result = new Article(name);
        result.setTitle(title);
        result.setUrl(url);
        result.initDate();

        // clean text
        String rawText = rawStr.toString();
        rawText = cleanText(rawText);
        //         rawText = ArticleCleaning.replaceChars(rawText);
        result.setRawText(rawText);
        logger.log("Length of the raw text: " + rawText.length() + " chars.");
        String linkedText = linkedStr.toString();
        linkedText = cleanText(linkedText);
        //         linkedText = ArticleCleaning.replaceChars(linkedText);
        result.setLinkedText(linkedText);
        logger.log("Length of the linked text: " + linkedText.length() + " chars.");

        // get original html source code
        logger.log("Get original HTML source code.");
        String originalPage = document.toString();
        result.setOriginalPage(originalPage);
        logger.log("Length of the original page: " + originalPage.length() + " chars.");

        // get the categories of the article 
        List<ArticleCategory> categories = getArticleCategories(result);
        result.setCategories(categories);

        long endTime = System.currentTimeMillis();
        logger.log("Total duration: " + (endTime - startTime) + " ms.");
    } catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.json.simple.parser.ParseException e) {
        e.printStackTrace();
    }

    return result;
}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Parses the table of lent media that follows the {@code a[name=AUS]} anchor
 * of an account page and appends one {@link LentItem} per data row to
 * {@code media}. The first table row is skipped (the loop starts at the
 * second row).
 * <p>
 * The column index of each field is read from the optional
 * {@code "accounttable"} object in {@code data}; a negative index disables
 * the corresponding field. Nothing is added when the anchor, the table or
 * its data rows are missing.
 *
 * @param media
 *       List receiving the parsed items.
 * @param doc
 *       Parsed account page; its base URI is set from {@code data}'s
 *       {@code "baseurl"} entry.
 * @param data
 *       Library configuration ({@code "baseurl"}, {@code "accounttable"},
 *       {@code "maxprolongcount"}).
 */
static void parseMediaList(List<LentItem> media, Document doc, JSONObject data) {
    if (doc.select("a[name=AUS]").size() == 0) {
        return;
    }

    Element table = doc.select("a[name=AUS] ~ table, a[name=AUS] ~ form table").first();
    doc.setBaseUri(data.optString("baseurl"));
    if (table == null) {
        // anchor present but no table after it: nothing to parse
        return;
    }
    Elements copytrs = table.select("tr");

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs < 2) {
        return;
    }

    JSONObject copymap = new JSONObject();
    try {
        if (data.has("accounttable")) {
            copymap = data.getJSONObject("accounttable");
        }
    } catch (JSONException e) {
        // "accounttable" is not a JSON object: keep the empty map, so that
        // the default column indices below are used
    }

    // column indices are the same for every row, so they are looked up once
    int titleCol = copymap.optInt("title", 0);
    int authorCol = copymap.optInt("author", 1);
    int formatCol = copymap.optInt("format", 2);
    int prolongCountCol = copymap.optInt("prolongcount", 3);
    int returnDateCol = copymap.optInt("returndate", 4);
    int prolongUrlCol = copymap.optInt("prolongurl", 5);
    int maxProlongCount = data.optInt("maxprolongcount", -1);

    Pattern datePattern = Pattern.compile("\\d{2}\\.\\d{2}\\.\\d{4}");
    Pattern mediaNrPattern = Pattern.compile("mednr=([^&]*)&");

    int initialSize = media.size();
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        LentItem item = new LentItem();

        if (titleCol >= 0) {
            item.setTitle(tr.child(titleCol).text().trim().replace("\u00a0", ""));
        }
        if (authorCol >= 0) {
            item.setAuthor(tr.child(authorCol).text().trim().replace("\u00a0", ""));
        }
        if (formatCol >= 0) {
            item.setFormat(tr.child(formatCol).text().trim().replace("\u00a0", ""));
        }
        int prolongCount = 0;
        if (prolongCountCol >= 0) {
            prolongCount = Integer.parseInt(tr.child(prolongCountCol).text().trim().replace("\u00a0", ""));
            item.setStatus(String.valueOf(prolongCount) + "x verl.");
        }
        if (maxProlongCount != -1) {
            item.setRenewable(prolongCount < maxProlongCount);
        }
        if (returnDateCol >= 0) {
            String value = tr.child(returnDateCol).text().trim().replace("\u00a0", "");
            Matcher matcher = datePattern.matcher(value);
            if (matcher.find()) {
                try {
                    item.setDeadline(fmt.parseLocalDate(matcher.group()));
                } catch (IllegalArgumentException e1) {
                    e1.printStackTrace();
                }
            }
        }
        // the prolong column is optional: only read it if the row is wide enough
        if (prolongUrlCol >= 0 && tr.children().size() > prolongUrlCol) {
            Element cell = tr.child(prolongUrlCol);
            Element input = cell.select("input[name=MedNrVerlAll]").first();
            if (input != null) {
                // new iOPAC Version 1.45 - checkboxes to prolong multiple items
                // internal convention: We add "NEW" to the media ID to show that we have
                // the new iOPAC version
                String value = input.val();
                item.setProlongData("NEW" + value);
                item.setId(value.split(";")[0]);
                if (input.hasAttr("disabled")) {
                    item.setRenewable(false);
                }
            } else {
                // previous versions - link for prolonging on every medium
                String link = cell.select("a").attr("href");
                item.setProlongData(link);
                // find media number with regex
                Matcher matcher = mediaNrPattern.matcher(link);
                if (matcher.find()) {
                    item.setId(matcher.group(1));
                }
            }
        }

        media.add(item);
    }
    // exactly one item per data row was appended, whatever the list held before
    assert media.size() == initialSize + trs - 1;
}

From source file:de.geeksfactory.opacclient.apis.SISIS.java

/**
 * Builds a {@link DetailledItem} from the detail page of a single hit.
 * <p>
 * Besides the given page, two further tabs of the same hit are fetched with
 * {@code httpGet} ({@code showTitleActive} and
 * {@code showAvailabilityActive}). From these three documents the method
 * extracts the item ID, the reservation links, the cover, the title, the
 * title/value detail pairs, the download links, the copies and the volume
 * search parameters.
 *
 * @param html
 *       HTML source of the detail page.
 * @return
 *       The parsed item.
 * @throws IOException
 *       Declared by this method; presumably raised by {@code httpGet} when
 *       one of the additional tabs cannot be loaded (to be confirmed).
 */
protected DetailledItem parse_result(String html) throws IOException {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);

    String html2 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showTitleActive", ENCODING);

    Document doc2 = Jsoup.parse(html2);
    doc2.setBaseUri(opac_url);

    String html3 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showAvailabilityActive",
            ENCODING);

    Document doc3 = Jsoup.parse(html3);
    doc3.setBaseUri(opac_url);

    DetailledItem result = new DetailledItem();

    try {
        result.setId(doc.select("#bibtip_id").text().trim());
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    List<String> reservationlinks = new ArrayList<>();
    for (Element link : doc3.select("#vormerkung a, #tab-content a")) {
        String href = link.absUrl("href");
        Map<String, String> hrefq = getQueryParamsFirst(href);
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                break;
            }
        }

        // Vormerken (reservation / order links)
        if (hrefq.get("methodToCall") != null) {
            if (hrefq.get("methodToCall").equals("doVormerkung")
                    || hrefq.get("methodToCall").equals("doBestellung")) {
                reservationlinks.add(href.split("\\?")[1]);
            }
        }
    }
    if (reservationlinks.size() == 1) {
        result.setReservable(true);
        result.setReservation_info(reservationlinks.get(0));
    } else if (reservationlinks.size() == 0) {
        result.setReservable(false);
    } else {
        // TODO: Multiple options - handle this case!
    }

    if (doc.select(".data td img").size() == 1) {
        result.setCover(doc.select(".data td img").first().attr("abs:src"));
        try {
            downloadCover(result);
        } catch (Exception e) {
            // best effort: the item is still usable without its cover image
        }
    }

    if (doc.select(".aw_teaser_title").size() == 1) {
        result.setTitle(doc.select(".aw_teaser_title").first().text().trim());
    } else if (doc.select(".data td strong").size() > 0) {
        result.setTitle(doc.select(".data td strong").first().text().trim());
    } else {
        result.setTitle("");
    }
    if (doc.select(".aw_teaser_title_zusatz").size() > 0) {
        result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim()));
    }

    // First pass over the box container: title/text are only carried over to
    // the second pass when a link ("hier klicken" or a "Link:" entry) is found.
    String title = "";
    String text = "";
    boolean takeover = false;
    Element detailtrs = doc2.select(".box-container .data td").first();
    if (detailtrs != null) { // the cell may be missing, like the one of the second pass
        for (Node node : detailtrs.childNodes()) {
            if (node instanceof Element) {
                if (((Element) node).tagName().equals("strong")) {
                    title = ((Element) node).text().trim();
                    text = "";
                } else {
                    if (((Element) node).tagName().equals("a")
                            && (((Element) node).text().trim().contains("hier klicken")
                                    || title.equals("Link:"))) {
                        text = text + node.attr("href");
                        takeover = true;
                        break;
                    }
                }
            } else if (node instanceof TextNode) {
                text = text + ((TextNode) node).text();
            }
        }
    }
    if (!takeover) {
        text = "";
        title = "";
    }

    // Second pass: each <strong> starts a new detail and flushes the previous one
    detailtrs = doc2.select("#tab-content .data td").first();
    if (detailtrs != null) {
        for (Node node : detailtrs.childNodes()) {
            if (node instanceof Element) {
                if (((Element) node).tagName().equals("strong")) {
                    if (!text.equals("") && !title.equals("")) {
                        result.addDetail(new Detail(title.trim(), text.trim()));
                        if (title.equals("Titel:")) {
                            result.setTitle(text.trim());
                        }
                        text = "";
                    }

                    title = ((Element) node).text().trim();
                } else {
                    if (((Element) node).tagName().equals("a")
                            && (((Element) node).text().trim().contains("hier klicken")
                                    || title.equals("Link:"))) {
                        text = text + node.attr("href");
                    } else {
                        text = text + ((Element) node).text();
                    }
                }
            } else if (node instanceof TextNode) {
                text = text + ((TextNode) node).text();
            }
        }
    } else {
        if (doc2.select("#tab-content .fulltitle tr").size() > 0) {
            // alternative layout: one two-cell row per detail
            Elements rows = doc2.select("#tab-content .fulltitle tr");
            for (Element tr : rows) {
                if (tr.children().size() == 2) {
                    Element valcell = tr.child(1);
                    String value = valcell.text().trim();
                    if (valcell.select("a").size() == 1) {
                        value = valcell.select("a").first().absUrl("href");
                    }
                    result.addDetail(new Detail(tr.child(0).text().trim(), value));
                }
            }
        } else {
            result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR),
                    stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL)));
        }
    }
    // flush the last pending detail
    if (!text.equals("") && !title.equals("")) {
        result.addDetail(new Detail(title.trim(), text.trim()));
        if (title.equals("Titel:")) {
            result.setTitle(text.trim());
        }
    }
    for (Element link : doc3.select("#tab-content a")) {
        Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href"));
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                break;
            }
        }
    }
    for (Element link : doc3.select(".box-container a")) {
        if (link.text().trim().equals("Download")) {
            result.addDetail(
                    new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href")));
        }
    }

    Map<String, Integer> copy_columnmap = new HashMap<>();
    // Default values
    copy_columnmap.put("barcode", 1);
    copy_columnmap.put("branch", 3);
    copy_columnmap.put("status", 4);
    // Overridden by the positions of the actual column headers, when present
    Elements copy_columns = doc.select("#tab-content .data tr#bg2 th");
    for (int i = 0; i < copy_columns.size(); i++) {
        Element th = copy_columns.get(i);
        String head = th.text().trim();
        if (head.contains("Status")) {
            copy_columnmap.put("status", i);
        }
        if (head.contains("Zweigstelle")) {
            copy_columnmap.put("branch", i);
        }
        if (head.contains("Mediennummer")) {
            copy_columnmap.put("barcode", i);
        }
        if (head.contains("Standort")) {
            copy_columnmap.put("location", i);
        }
        if (head.contains("Signatur")) {
            copy_columnmap.put("signature", i);
        }
    }

    // The date separators are literal dots: anything else could not be parsed
    // by the dd.MM.yyyy formatter below and would make the whole copy be dropped.
    Pattern status_lent = Pattern.compile(
            "^(entliehen) bis ([0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{2,4}) \\(gesamte Vormerkungen: ([0-9]+)\\)$");
    Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$");

    Elements exemplartrs = doc.select("#tab-content .data tr").not("#bg2");
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    for (Element tr : exemplartrs) {
        try {
            Copy copy = new Copy();
            Element status = tr.child(copy_columnmap.get("status"));
            Element barcode = tr.child(copy_columnmap.get("barcode"));
            String barcodetext = barcode.text().trim().replace(" Wegweiser", "");

            // STATUS
            String statustext;
            if (status.getElementsByTag("b").size() > 0) {
                statustext = status.getElementsByTag("b").text().trim();
            } else {
                statustext = status.text().trim();
            }
            // status and barcode share one column: split "<status> <barcode>"
            if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) {
                Matcher matcher1 = status_and_barcode.matcher(statustext);
                if (matcher1.matches()) {
                    statustext = matcher1.group(1);
                    barcodetext = matcher1.group(2);
                }
            }

            Matcher matcher = status_lent.matcher(statustext);
            if (matcher.matches()) {
                copy.setStatus(matcher.group(1));
                copy.setReservations(matcher.group(3));
                copy.setReturnDate(fmt.parseLocalDate(matcher.group(2)));
            } else {
                copy.setStatus(statustext);
            }
            copy.setBarcode(barcodetext);
            if (status.select("a[href*=doVormerkung]").size() == 1) {
                copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]);
            }

            String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", "");
            copy.setBranch(branchtext);

            if (copy_columnmap.containsKey("location")) {
                copy.setLocation(
                        tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", ""));
            }

            if (copy_columnmap.containsKey("signature")) {
                copy.setShelfmark(
                        tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", ""));
            }

            result.addCopy(copy);
        } catch (Exception ex) {
            // a row that cannot be parsed is skipped, the other copies are kept
            ex.printStackTrace();
        }
    }

    try {
        Element isvolume = null;
        Map<String, String> volume = new HashMap<>();
        Elements links = doc.select(".data td a");
        int elcount = links.size();
        for (int eli = 0; eli < elcount; eli++) {
            List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8");
            for (NameValuePair nv : anyurl) {
                if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) {
                    isvolume = links.get(eli);
                } else if (nv.getName().equals("catKey")) {
                    volume.put("catKey", nv.getValue());
                } else if (nv.getName().equals("dbIdentifier")) {
                    volume.put("dbIdentifier", nv.getValue());
                }
            }
            if (isvolume != null) {
                volume.put("volume", "true");
                result.setVolumesearch(volume);
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return result;
}

From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java

/**
 * Turns the HTML of a result page into a {@link SearchRequestResult}.
 * <p>
 * Every row matched by {@code .resulttab tr.result_trefferX} or
 * {@code .resulttab tr.result_treffer} yields one {@link SearchResult}
 * carrying its media type (when an icon is present), the inner HTML of its
 * content cell, its position in the page and, when it can be found, its ID.
 *
 * @param html
 *       HTML source of the result page.
 * @param page
 *       Page number, passed through to the returned object.
 * @return
 *       The parsed results, with the total number of hits or {@code -1}
 *       when that number cannot be read.
 */
protected SearchRequestResult parse_search(String html, int page) {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);

    Elements rows = doc.select(".resulttab tr.result_trefferX, .resulttab tr.result_treffer");
    List<SearchResult> results = new ArrayList<>();
    int position = 0;
    for (Element row : rows) {
        SearchResult sr = new SearchResult();
        int contentindex = 1;

        Elements icons = row.select("td a img");
        if (icons.size() > 0) {
            // the media type is derived from the file name of the icon
            String[] pathSegments = icons.get(0).attr("src").split("/");
            String fname = pathSegments[pathSegments.length - 1];
            boolean typeSet = false;
            if (data.has("mediatypes")) {
                try {
                    sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname)));
                    typeSet = true;
                } catch (JSONException | IllegalArgumentException e) {
                    // no usable configured mapping: use the default one below
                }
            }
            if (!typeSet) {
                sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "")
                        .replace(".gif", "").replace(".png", "")));
            }
        } else if (row.children().size() == 3) {
            contentindex = 2;
        }

        Element contentCell = row.child(contentindex);
        sr.setInnerhtml(contentCell.child(0).html());
        sr.setNr(position);

        // ID taken from the detail link, if its number does not look like a position
        Element link = contentCell.select("a").first();
        try {
            if (link != null && link.attr("href").contains("detmediennr")) {
                Map<String, String> params = getQueryParamsFirst(link.attr("abs:href"));
                String nr = params.get("detmediennr");
                if (Integer.parseInt(nr) > position + 1) {
                    // Seems to be an ID
                    String detDb = params.get("detDB");
                    String id = "&detmediennr=" + nr;
                    if (detDb != null) {
                        id = id + "&detDB=" + detDb;
                    }
                    sr.setId(id);
                }
            }
        } catch (Exception e) {
        }

        // ID taken from a leading HTML comment of the second cell ("...: <id>")
        try {
            if (row.child(1).childNode(0) instanceof Comment) {
                Comment c = (Comment) row.child(1).childNode(0);
                String[] commentParts = c.getData().trim().split(": ");
                sr.setId(commentParts[1]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        results.add(sr);
        position++;
    }

    int results_total = -1;
    Elements found = doc.select(".result_gefunden");
    if (found.size() > 0) {
        String digits = found.text().trim().replaceAll(".*[^0-9]+([0-9]+).*", "$1");
        try {
            results_total = Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            e.printStackTrace();
            results_total = -1;
        }
    }
    return new SearchRequestResult(results, results_total, page);
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Retrieve the text located in a table (TABLE) HTML element.
 * <br/>/*from www . j  av  a2 s .c  o  m*/
 * We process each cell in the table as a text element. 
 * Some tables are ignored: infoboxes, wikitables, navboxes,
 * metadata, persondata, etc. 
 * 
 * @param element
 *       Element to be processed.
 * @param rawStr
 *       Current raw text string.
 * @param linkedStr
 *       Current text with hyperlinks.
 * @return
 *       {@code true} iff the element was processed.
 */
private boolean processTableElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) {
    boolean result;
    String eltClass = element.attr(XmlNames.ATT_CLASS);

    if (eltClass == null ||
    // we ignore infoboxes
            (!eltClass.contains(CLASS_INFORMATIONBOX)
                    // and wikitables
                    && !eltClass.contains(CLASS_WIKITABLE)
                    // navigation boxes
                    && !eltClass.contains(CLASS_NAVIGATIONBOX)
                    // navigation boxes, WP warnings (incompleteness, etc.)
                    && !eltClass.contains(CLASS_METADATA)
                    // personal data box (?)
                    && !eltClass.contains(CLASS_PERSONDATA)))

    {
        result = true;
        Element tbodyElt = element.children().get(0);

        for (Element rowElt : tbodyElt.children()) {
            for (Element colElt : rowElt.children()) { // process cell content
                processTextElement(colElt, rawStr, linkedStr);

                // possibly add final dot and space. 
                if (rawStr.charAt(rawStr.length() - 1) != ' ') {
                    if (rawStr.charAt(rawStr.length() - 1) == '.') {
                        rawStr.append(" ");
                        linkedStr.append(" ");
                    } else {
                        rawStr.append(". ");
                        linkedStr.append(". ");
                    }
                }
            }
        }
    }

    else
        result = false;

    return result;
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Retrieve the text located in
 * a description list (DL) HTML element.
 * <br/>
 * The text preceding the list is closed with a colon, then each term
 * (DT) is followed by a colon and each definition by a semicolon. The
 * last semicolon is finally replaced by a dot and a new line.
 * 
 * @param element
 *       Element to be processed.
 * @param rawStr
 *       Current raw text string.
 * @param linkedStr
 *       Current text with hyperlinks.
 */
private void processDescriptionListElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) {
    // close the text located before the list, possibly adding a colon
    terminateSegment(rawStr, linkedStr, ":");

    // process each list element
    Iterator<Element> it = element.children().iterator();
    Element tempElt = it.hasNext() ? it.next() : null;
    while (tempElt != null) { // add leading space
        rawStr.append(" ");
        linkedStr.append(" ");

        // get term
        if (tempElt.tagName().equals(XmlNames.ELT_DT)) { // process term
            processTextElement(tempElt, rawStr, linkedStr);
            // possibly add a colon and space
            terminateSegment(rawStr, linkedStr, ": ");
            // go to next element
            tempElt = it.hasNext() ? it.next() : null;
        }

        // get definition: whatever follows the term (or a lone element)
        // is handled as a definition, without checking its tag
        if (tempElt != null) {
            processTextElement(tempElt, rawStr, linkedStr);
            // possibly add a semicolon
            terminateSegment(rawStr, linkedStr, ";");
            // go to next element
            tempElt = it.hasNext() ? it.next() : null;
        }
    }

    // possibly replace the last separator by a final dot and a new line
    if (lastCharIs(rawStr, ';')) {
        removeLastChar(rawStr);
        removeLastChar(linkedStr);
        if (!lastCharIs(rawStr, '.')) {
            rawStr.append(".");
            linkedStr.append(".");
        }
        rawStr.append("\n");
        linkedStr.append("\n");
    }
}

/**
 * Removes a possible trailing new line, then a possible trailing space,
 * and appends the specified separator to both texts unless the raw text
 * is empty or already ends with a dot, colon or semicolon.
 * 
 * @param rawStr
 *       Current raw text string.
 * @param linkedStr
 *       Current text with hyperlinks.
 * @param separator
 *       Text to append to both strings.
 */
private static void terminateSegment(StringBuilder rawStr, StringBuilder linkedStr, String separator) {
    // the raw text drives the decisions, the linked text is updated alike
    if (lastCharIs(rawStr, '\n')) {
        removeLastChar(rawStr);
        removeLastChar(linkedStr);
    }
    if (lastCharIs(rawStr, ' ')) {
        removeLastChar(rawStr);
        removeLastChar(linkedStr);
    }
    boolean alreadyTerminated = lastCharIs(rawStr, '.') || lastCharIs(rawStr, ':') || lastCharIs(rawStr, ';');
    if (rawStr.length() > 0 && !alreadyTerminated) {
        rawStr.append(separator);
        linkedStr.append(separator);
    }
}

/**
 * Checks if the specified text is not empty and ends with the specified character.
 * 
 * @param str
 *       Text to check.
 * @param c
 *       Expected last character.
 * @return
 *       {@code true} iff the last character of the text is {@code c}.
 */
private static boolean lastCharIs(StringBuilder str, char c) {
    return str.length() > 0 && str.charAt(str.length() - 1) == c;
}

/**
 * Removes the last character of the specified text, if there is one.
 * 
 * @param str
 *       Text to shorten.
 */
private static void removeLastChar(StringBuilder str) {
    if (str.length() > 0)
        str.deleteCharAt(str.length() - 1);
}

From source file:cn.wanghaomiao.xpath.core.XpathEvaluator.java

/**
 * ?xpath/*w w  w .  j a v a 2s .  co  m*/
 *
 * @param xpath
 * @param root
 * @return
 */
public List<JXNode> evaluate(String xpath, Elements root) throws NoSuchAxisException, NoSuchFunctionException {
    List<JXNode> res = new LinkedList<JXNode>();
    Elements context = root;
    List<Node> xpathNodes = getXpathNodeTree(xpath);
    for (int i = 0; i < xpathNodes.size(); i++) {
        Node n = xpathNodes.get(i);
        LinkedList<Element> contextTmp = new LinkedList<Element>();
        if (n.getScopeEm() == ScopeEm.RECURSIVE || n.getScopeEm() == ScopeEm.CURREC) {
            if (n.getTagName().startsWith("@")) {
                for (Element e : context) {
                    //?
                    String key = n.getTagName().substring(1);
                    if (key.equals("*")) {
                        res.add(JXNode.t(e.attributes().toString()));
                    } else {
                        String value = e.attr(key);
                        if (StringUtils.isNotBlank(value)) {
                            res.add(JXNode.t(value));
                        }
                    }
                    //??
                    for (Element dep : e.getAllElements()) {
                        if (key.equals("*")) {
                            res.add(JXNode.t(dep.attributes().toString()));
                        } else {
                            String value = dep.attr(key);
                            if (StringUtils.isNotBlank(value)) {
                                res.add(JXNode.t(value));
                            }
                        }
                    }
                }
            } else if (n.getTagName().endsWith("()")) {
                //??text()
                res.add(JXNode.t(context.text()));
            } else {
                Elements searchRes = context.select(n.getTagName());
                for (Element e : searchRes) {
                    Element filterR = filter(e, n);
                    if (filterR != null) {
                        contextTmp.add(filterR);
                    }
                }
                context = new Elements(contextTmp);
                if (i == xpathNodes.size() - 1) {
                    for (Element e : contextTmp) {
                        res.add(JXNode.e(e));
                    }
                }
            }

        } else {
            if (n.getTagName().startsWith("@")) {
                for (Element e : context) {
                    String key = n.getTagName().substring(1);
                    if (key.equals("*")) {
                        res.add(JXNode.t(e.attributes().toString()));
                    } else {
                        String value = e.attr(key);
                        if (StringUtils.isNotBlank(value)) {
                            res.add(JXNode.t(value));
                        }
                    }
                }
            } else if (n.getTagName().endsWith("()")) {
                res = (List<JXNode>) callFunc(n.getTagName().substring(0, n.getTagName().length() - 2),
                        context);
            } else {
                for (Element e : context) {
                    Elements filterScope = e.children();
                    if (StringUtils.isNotBlank(n.getAxis())) {
                        filterScope = getAxisScopeEls(n.getAxis(), e);
                    }
                    for (Element chi : filterScope) {
                        Element fchi = filter(chi, n);
                        if (fchi != null) {
                            contextTmp.add(fchi);
                        }
                    }
                }
                context = new Elements(contextTmp);
                if (i == xpathNodes.size() - 1) {
                    for (Element e : contextTmp) {
                        res.add(JXNode.e(e));
                    }
                }
            }
        }
    }
    return res;
}

From source file:de.geeksfactory.opacclient.apis.SISIS.java

/**
 * Parses a SISIS search result page into a {@link SearchRequestResult}.
 * <p>
 * Besides building the result list this method updates instance state:
 * {@code identifier} is reset and, if a "singleHit.do" link is present, taken
 * from that link's "identifier" query parameter; {@code reusehtml} keeps the
 * page when it turns out to be a single-hit page; {@code resultcount} receives
 * the number of parsed rows.
 *
 * @param html HTML of the result page
 * @param page number of the result page; each result is numbered
 *             {@code 10 * (page - 1) + rowIndex}
 * @return the parsed results together with the total hit count (-1 if the
 *         count could not be read from the page header)
 * @throws OpacErrorException if the page contains an ".error" or ".nohits"
 *                            element (message taken from the page), or with the
 *                            message "is_a_redirect" if the header shows "1/1"
 */
protected SearchRequestResult parse_search(String html, int page) throws OpacErrorException {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url + "/searchfoo");

    if (doc.select(".error").size() > 0) {
        throw new OpacErrorException(doc.select(".error").text().trim());
    } else if (doc.select(".nohits").size() > 0) {
        throw new OpacErrorException(doc.select(".nohits").text().trim());
    } else if (doc.select(".box-header h2, #nohits").text().contains("keine Treffer")) {
        return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1);
    }

    int results_total = -1;

    // first() returns null when the header is missing; treat that as "count unknown"
    // instead of failing with a NullPointerException
    Element resultHeader = doc.select(".box-header h2").first();
    String resultnumstr = resultHeader != null ? resultHeader.text() : "";
    if (resultnumstr.contains("(1/1)") || resultnumstr.contains(" 1/1")) {
        reusehtml = html;
        throw new OpacErrorException("is_a_redirect");
    } else if (resultnumstr.contains("(")) {
        results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1"));
    } else if (resultnumstr.contains(": ")) {
        results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1"));
    }

    Elements table = doc.select("table.data tbody tr");
    identifier = null;

    // Only the first "singleHit.do" link is inspected for the identifier parameter
    for (Element link : doc.select("table.data a")) {
        if (link.hasAttr("href") && link.attr("href").contains("singleHit.do")) {
            try {
                List<NameValuePair> anyurl = URLEncodedUtils
                        .parse(new URI(link.attr("href").replace(" ", "%20").replace("&amp;", "&")), ENCODING);
                for (NameValuePair nv : anyurl) {
                    if (nv.getName().equals("identifier")) {
                        identifier = nv.getValue();
                        break;
                    }
                }
            } catch (Exception e) {
                // best effort: a malformed link only means the identifier stays unset
                e.printStackTrace();
            }
            break;
        }
    }

    List<SearchResult> results = new ArrayList<>();
    for (int i = 0; i < table.size(); i++) {
        Element tr = table.get(i);
        SearchResult sr = new SearchResult();

        // media type from the icon: configured mapping first, then built-in defaults
        if (tr.select("td img[title]").size() > 0) {
            Element icon = tr.select("td img").get(0);
            String title = icon.attr("title");
            String[] fparts = icon.attr("src").split("/");
            String fname = fparts[fparts.length - 1];
            MediaType default_by_fname = defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "")
                    .replace(".gif", "").replace(".png", ""));
            MediaType default_by_title = defaulttypes.get(title);
            MediaType default_name = default_by_title != null ? default_by_title : default_by_fname;
            if (data.has("mediatypes")) {
                try {
                    sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname)));
                } catch (JSONException | IllegalArgumentException e) {
                    sr.setType(default_name);
                }
            } else {
                sr.setType(default_name);
            }
        }

        // keywords in the row text override the icon-based type
        String alltext = tr.text();
        if (alltext.contains("eAudio") || alltext.contains("eMusic")) {
            sr.setType(MediaType.MP3);
        } else if (alltext.contains("eVideo")) {
            sr.setType(MediaType.EVIDEO);
        } else if (alltext.contains("eBook")) {
            sr.setType(MediaType.EBOOK);
        } else if (alltext.contains("Munzinger")) {
            sr.setType(MediaType.EDOC);
        }

        if (tr.children().size() > 3 && tr.child(3).select("img[title*=cover]").size() == 1) {
            sr.setCover(tr.child(3).select("img[title*=cover]").attr("abs:src"));
            if (sr.getCover().contains("showCover.do")) {
                downloadCover(sr);
            }
        }

        // the cell holding the descriptive text: third cell if it contains a link, else the second
        Element middlething;
        if (tr.children().size() > 2 && tr.child(2).select("a").size() > 0) {
            middlething = tr.child(2);
        } else {
            middlething = tr.child(1);
        }

        // descend into a single wrapping <div> or <span class="titleData"> if present
        List<Node> children = middlething.childNodes();
        Elements wrapperDivs = middlething.select("div").not("#hlrightblock,.bestellfunktionen");
        if (wrapperDivs.size() == 1) {
            Element indiv = wrapperDivs.first();
            if (indiv.children().size() > 1) {
                children = indiv.childNodes();
            }
        } else if (middlething.select("span.titleData").size() == 1) {
            children = middlething.select("span.titleData").first().childNodes();
        }
        int childrennum = children.size();

        // Flatten the cell into text fragments. Each entry is either
        //   { "text", "", text }                                    for a bare text node, or
        //   { parentTag, childTag-or-"text", text, parentClass, parentStyle }
        // Fragments of three characters or fewer are dropped.
        List<String[]> strings = new ArrayList<>();
        for (int ch = 0; ch < childrennum; ch++) {
            Node node = children.get(ch);
            if (node instanceof TextNode) {
                String text = ((TextNode) node).text().trim();
                if (text.length() > 3) {
                    strings.add(new String[] { "text", "", text });
                }
            } else if (node instanceof Element) {
                Element parent = (Element) node;
                List<Node> subchildren = node.childNodes();
                for (int j = 0; j < subchildren.size(); j++) {
                    Node subnode = subchildren.get(j);
                    if (subnode instanceof TextNode) {
                        String text = ((TextNode) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { parent.tag().getName(), "text", text,
                                    parent.className(), node.attr("style") });
                        }
                    } else if (subnode instanceof Element) {
                        String text = ((Element) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { parent.tag().getName(),
                                    ((Element) subnode).tag().getName(), text, parent.className(),
                                    node.attr("style") });
                        }
                    }
                }
            }
        }

        StringBuilder description = null;
        if (tr.select("span.Z3988").size() == 1) {
            // Sometimes there is a <span class="Z3988"> item which provides
            // data in a standardized format.
            boolean hastitle = false;
            try {
                description = new StringBuilder();
                List<NameValuePair> z3988data = URLEncodedUtils
                        .parse(new URI("http://dummy/?" + tr.select("span.Z3988").attr("title")), "UTF-8");
                for (NameValuePair nv : z3988data) {
                    String value = nv.getValue();
                    if (value == null || value.trim().equals("")) {
                        continue;
                    }
                    String name = nv.getName();
                    if ((name.equals("rft.btitle") || name.equals("rft.atitle")) && !hastitle) {
                        description.append("<b>").append(value).append("</b>");
                        hastitle = true;
                    } else if (name.equals("rft.au") || name.equals("rft.date")) {
                        description.append("<br />").append(value);
                    }
                }
            } catch (URISyntaxException e) {
                // unusable metadata: fall back to the heuristic description below
                description = null;
            }
        }
        boolean described = false;
        if (description != null && description.length() > 0) {
            sr.setInnerhtml(description.toString());
            described = true;
        } else {
            description = new StringBuilder();
        }

        int k = 0;
        boolean yearfound = false;
        boolean titlefound = false;
        for (String[] part : strings) {
            if (!described) {
                if (part[0].equals("a") && (k == 0 || !titlefound)) {
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append("<b>").append(part[2]).append("</b>");
                    titlefound = true;
                } else if (part[2].matches("\\D*[0-9]{4}\\D*") && part[2].length() <= 10) {
                    yearfound = true;
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append(part[2]);
                } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) {
                    // k == 1 here, so the separator is always needed
                    description.append("<br />");
                    description.append(part[2]);
                } else if (k > 1 && k < 4 && part[0].equals("text")
                        && part[2].matches("^[A-Za-z0-9,\\- ]+$")) {
                    description.append("<br />");
                    description.append(part[2]);
                }
            }
            // NOTE(review): every entry built above has 3 or 5 elements, so the
            // length == 4 branch looks unreachable - confirm whether ">= 4" was intended.
            if (part.length == 4) {
                if (part[0].equals("span") && part[3].equals("textgruen")) {
                    sr.setStatus(SearchResult.Status.GREEN);
                } else if (part[0].equals("span") && part[3].equals("textrot")) {
                    sr.setStatus(SearchResult.Status.RED);
                }
            } else if (part.length == 5) {
                if (part[4].contains("purple")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                }
            }
            if (sr.getStatus() == null) {
                // German status phrases; umlauts are written as Unicode escapes so the
                // literals survive any source-encoding mishap
                if ((part[2].contains("entliehen")
                        && part[2].startsWith("Vormerkung ist leider nicht m\u00f6glich"))
                        || part[2].contains("nur in anderer Zweigstelle ausleihbar und nicht bestellbar")) {
                    sr.setStatus(SearchResult.Status.RED);
                } else if (part[2].startsWith("entliehen")
                        || part[2].contains("Ein Exemplar finden Sie in einer anderen Zweigstelle")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                } else if ((part[2].startsWith("bestellbar") && !part[2].contains("nicht bestellbar"))
                        || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar"))
                        || (part[2].startsWith("vormerkbar") && !part[2].contains("nicht vormerkbar"))
                        || (part[2].contains("heute zur\u00fcckgebucht"))
                        || (part[2].contains("ausleihbar") && !part[2].contains("nicht ausleihbar"))) {
                    sr.setStatus(SearchResult.Status.GREEN);
                }
                if (sr.getType() != null) {
                    if (sr.getType().equals(MediaType.EBOOK) || sr.getType().equals(MediaType.EVIDEO)
                            || sr.getType().equals(MediaType.MP3)) {
                        // Especially Onleihe.de ebooks are often marked
                        // green though they are not available.
                        sr.setStatus(SearchResult.Status.UNKNOWN);
                    }
                }
            }
            k++;
        }
        if (!described) {
            sr.setInnerhtml(description.toString());
        }

        sr.setNr(10 * (page - 1) + i);
        sr.setId(null);
        results.add(sr);
    }
    resultcount = results.size();
    return new SearchRequestResult(results, results_total, page);
}

From source file:com.lloydtorres.stately.issues.IssueDecisionActivity.java

/**
 * Process the received page into the Issue and its IssueOptions.
 * <p>
 * If the issue is no longer available, or the page lacks an element this
 * method relies on, a snackbar is shown, the refresh indicator is stopped
 * and nothing else is changed.
 *
 * @param v Activity view
 * @param d Document received from NationStates
 */
private void processIssueInfo(View v, Document d) {
    // First check if the issue is still available
    if (d.text().contains(NOT_AVAILABLE)) {
        mSwipeRefreshLayout.setRefreshing(false);
        SparkleHelper.makeSnackbar(v,
                String.format(Locale.US, getString(R.string.issue_unavailable), mNation.name));
        return;
    }

    Element issueInfoContainer = d.select("div#dilemma").first();

    if (issueInfoContainer == null) {
        // safety check
        mSwipeRefreshLayout.setRefreshing(false);
        SparkleHelper.makeSnackbar(v, getString(R.string.login_error_parsing));
        return;
    }

    Elements issueInfoRaw = issueInfoContainer.children();
    Elements paragraphs = issueInfoRaw.select("p");

    // A plain issue uses the first paragraph. An issue chain uses the second one,
    // followed by the third when the page contains the "story so far" marker.
    boolean isChain = d.select("div.dilemmachain").first() != null;
    boolean hasStorySoFar = isChain && d.text().contains(STORY_SO_FAR);
    int textIndex = isChain ? 1 : 0;
    int requiredParagraphs = hasStorySoFar ? 3 : textIndex + 1;

    if (paragraphs.size() < requiredParagraphs) {
        // safety check: the expected paragraphs are missing, report instead of crashing
        mSwipeRefreshLayout.setRefreshing(false);
        SparkleHelper.makeSnackbar(v, getString(R.string.login_error_parsing));
        return;
    }

    String issueText = paragraphs.get(textIndex).text();
    if (hasStorySoFar) {
        issueText = issueText + "<br><br>" + paragraphs.get(2).text();
    }
    issue.content = issueText;

    issue.options = new ArrayList<IssueOption>();

    Element optionHolderMain = issueInfoRaw.select("ol.diloptions").first();
    if (optionHolderMain != null) {
        Elements optionsHolder = optionHolderMain.select("li");

        int i = 0;
        for (Element option : optionsHolder) {
            IssueOption issueOption = new IssueOption();
            issueOption.index = i++;

            // An option without a button gets the SELECTED_HEADER marker instead
            Element button = option.select("button").first();
            if (button != null) {
                issueOption.header = button.attr("name");
            } else {
                issueOption.header = IssueOption.SELECTED_HEADER;
            }

            Element optionContentHolder = option.select("p").first();
            if (optionContentHolder == null) {
                // safety check
                mSwipeRefreshLayout.setRefreshing(false);
                SparkleHelper.makeSnackbar(v, getString(R.string.login_error_parsing));
                return;
            }

            issueOption.content = optionContentHolder.text();
            issue.options.add(issueOption);
        }
    }

    // The dismiss option is always appended last, with index -1
    IssueOption dismissOption = new IssueOption();
    dismissOption.index = -1;
    dismissOption.header = IssueOption.DISMISS_HEADER;
    dismissOption.content = "";
    issue.options.add(dismissOption);

    setRecyclerAdapter(issue);
    mSwipeRefreshLayout.setRefreshing(false);
    mSwipeRefreshLayout.setEnabled(false);
}

From source file:com.lloydtorres.stately.issues.IssuesFragment.java

/**
 * Turns the issues page into the list of cards shown by the recycler view.
 * <p>
 * The list receives one Issue per entry of the "ul.dilemmalist" element,
 * then the text of "p.dilemmanextupdate" if present, and, when the list has
 * no entries at all, a message about the next issue.
 *
 * @param v View used to show the error snackbar
 * @param d Document holding the issues page
 */
private void processIssues(View v, Document d) {
    issues = new ArrayList<Object>();

    Element listElement = d.select("ul.dilemmalist").first();
    if (listElement == null) {
        // safety check
        mSwipeRefreshLayout.setRefreshing(false);
        SparkleHelper.makeSnackbar(v, getString(R.string.login_error_parsing));
        return;
    }

    Elements entries = listElement.children();
    for (Element entry : entries) {
        Issue parsedIssue = new Issue();

        // The first link among the entry's children carries the id (href) and the title (text)
        Element link = entry.children().select("a").first();
        if (link == null) {
            continue;
        }

        String idText = link.attr("href").replace("page=show_dilemma/dilemma=", "");
        parsedIssue.id = Integer.valueOf(idText);

        // Chained issues match the chain pattern: group 1 is the chain, group 2 the title
        Matcher chain = CHAIN_ISSUE_REGEX.matcher(link.text());
        if (!chain.find()) {
            parsedIssue.title = link.text();
        } else {
            parsedIssue.chain = chain.group(1);
            parsedIssue.title = chain.group(2);
        }

        issues.add(parsedIssue);
    }

    Element nextUpdateElement = d.select("p.dilemmanextupdate").first();
    if (nextUpdateElement != null) {
        issues.add(nextUpdateElement.text());
    }

    if (entries.isEmpty()) {
        // No issues: show the time of the next one if the page reveals it, else a generic message
        String message = getString(R.string.no_issues);

        Matcher nextIssue = NEXT_ISSUE_REGEX.matcher(d.html());
        if (nextIssue.find()) {
            long nextIssueTime = Long.valueOf(nextIssue.group(1)) / 1000L;
            String readableDate = SparkleHelper.getReadableDateFromUTC(getContext(), nextIssueTime);
            message = String.format(Locale.US, getString(R.string.next_issue), readableDate);
        }

        issues.add(message);
    }

    if (mRecyclerAdapter != null) {
        ((IssuesRecyclerAdapter) mRecyclerAdapter).setIssueCards(issues);
    } else {
        mRecyclerAdapter = new IssuesRecyclerAdapter(getContext(), issues, mNation);
        mRecyclerView.setAdapter(mRecyclerAdapter);
    }
    mSwipeRefreshLayout.setRefreshing(false);
}