Example usage for org.jsoup.nodes Element attr

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element.attr method.

Prototype

public String attr(String attributeKey)

Source Link

Documentation

Get an attribute's value by its key.

Usage

From source file:de.geeksfactory.opacclient.apis.Open.java

@Override
public SearchRequestResult searchGetPage(int page) throws IOException, OpacErrorException, JSONException {
    /*/* w ww .ja  v  a2s. c o  m*/
    When there are many pages of results, there will only be links to the next 4 and
    previous 4 pages, so we will click links until it gets to the correct page.
     */

    if (searchResultDoc == null)
        throw new NotReachableException();

    Document doc = searchResultDoc;

    Elements pageLinks = doc.select("span[id$=DataPager1]").first().select("a[id*=LinkButtonPageN");
    int from = Integer.valueOf(pageLinks.first().text());
    int to = Integer.valueOf(pageLinks.last().text());
    Element linkToClick;
    boolean willBeCorrectPage;

    if (page < from) {
        linkToClick = pageLinks.first();
        willBeCorrectPage = false;
    } else if (page > to) {
        linkToClick = pageLinks.last();
        willBeCorrectPage = false;
    } else {
        linkToClick = pageLinks.get(page - from);
        willBeCorrectPage = true;
    }

    Pattern pattern = Pattern.compile("javascript:__doPostBack\\('([^,]*)','([^\\)]*)'\\)");
    Matcher matcher = pattern.matcher(linkToClick.attr("href"));
    if (!matcher.find())
        throw new OpacErrorException(StringProvider.INTERNAL_ERROR);

    FormElement form = (FormElement) doc.select("form").first();
    HttpEntity data = formData(form, null).addTextBody("__EVENTTARGET", matcher.group(1))
            .addTextBody("__EVENTARGUMENT", matcher.group(2)).build();

    ByteArrayOutputStream stream = new ByteArrayOutputStream();
    data.writeTo(stream);

    String postUrl = form.attr("abs:action");

    String html = httpPost(postUrl, data, "UTF-8");
    if (willBeCorrectPage) {
        // We clicked on the correct link
        Document doc2 = Jsoup.parse(html);
        doc2.setBaseUri(postUrl);
        return parse_search(doc2, page);
    } else {
        // There was no correct link, so try to find one again
        searchResultDoc = Jsoup.parse(html);
        searchResultDoc.setBaseUri(postUrl);
        return searchGetPage(page);
    }
}

From source file:com.ibuildapp.romanblack.WebPlugin.WebPlugin.java

/**
 * Prepares the HTML content and loads it into the WebView.
 * <p>
 * When online, a non-empty {@code url} (or {@code currentUrl}, if set and not
 * {@code about:blank}) replaces {@code html} with a minimal page linking to it.
 * The HTML is then parsed and handled in one of three ways:
 * <ul>
 * <li>an iframe pointing to a Google Calendar is loaded directly by its URL;</li>
 * <li>an iframe pointing to Google Forms is resized to the screen and the page
 * margins are removed;</li>
 * <li>otherwise, forms posting to PayPal get an additional hidden {@code bn}
 * field before the page is loaded.</li>
 * </ul>
 * When offline, the existing {@code html} is loaded as is, if not empty.
 * Finally a delayed {@code HIDE_PROGRESS} message is sent to the handler.
 * <p>
 * NOTE(review): every exception is swallowed without logging; this best-effort
 * behaviour is kept unchanged.
 */
private void showHtml() {
    try {
        if (isOnline) {
            if (currentUrl.length() > 0 && !currentUrl.equals("about:blank")) {
                url = currentUrl;
            }
            if (url.length() > 0) {
                html = "<html><body><a href=\"" + url + "\" id=\"link\" /></body></html>";
            }

            Document doc = Jsoup.parse(html);
            Element iframe = doc.select("iframe").first();

            // attr() yields an empty string when the attribute is missing,
            // in which case neither check below matches
            String iframeSrc = (iframe != null) ? iframe.attr("src") : "";
            boolean isGoogleCalendar = iframeSrc.contains("www.google.com/calendar")
                    || iframeSrc.contains("calendar.google.com/calendar");
            boolean isGoogleForms = iframeSrc.contains("google.com/forms");

            if (isGoogleCalendar) {
                webView.loadUrl(iframeSrc);
            } else if (isGoogleForms) {
                webView.getSettings().setBuiltInZoomControls(false);

                DisplayMetrics metrics = getResources().getDisplayMetrics();
                int width = metrics.widthPixels;
                int height = metrics.heightPixels;
                float density = metrics.density;

                // convert pixels to density-independent units; 75 units are
                // subtracted from the height
                iframe.attr("width", String.valueOf((int) (width / density)));
                iframe.attr("height", String.valueOf((int) (height / density - 75)));

                iframe.attr("style", "margin: 0; padding: 0");

                Element body = doc.select("body").first();
                body.attr("style", "margin: 0; padding: 0");

                html = doc.outerHtml();

                webView.loadDataWithBaseURL("http://", html, "text/html", "utf-8", "");
            } else {
                Elements forms = doc.select("form");
                for (Element form : forms) {
                    if (form.attr("action").contains("paypal.com")) {
                        form.append("<input type=\"hidden\" name=\"bn\" value=\"ibuildapp_SP\">");
                    }
                }
                // serialise once, after all forms were handled; html is left
                // untouched when the document contains no form at all
                if (!forms.isEmpty()) {
                    html = doc.html();
                }

                hideProgress = true;

                if (Build.VERSION.SDK_INT >= 20 && html.contains("ibuildapp") && html.contains("powr")) {
                    int height = getResources().getDisplayMetrics().heightPixels;
                    html = "<iframe width=\"" + 420 + "\" height=\"" + height + "\"  frameBorder=\"0\" src="
                            + url + "></iframe>";
                    webView.loadData(html, "text/html", "utf-8");
                } else {
                    webView.loadDataWithBaseURL("http://", html, "text/html", "utf-8", "");
                }
            }
        } else {
            if (html.length() > 0) {
                webView.loadDataWithBaseURL("http://", html, "text/html", "utf-8", "");
            }
        }

        handler.sendEmptyMessageDelayed(HIDE_PROGRESS, 10000);

    } catch (Exception ignored) {
        // best effort: a failure while preparing the page must not crash the caller
    }
}

From source file:de.geeksfactory.opacclient.apis.BiBer1992.java

/**
 * Builds the list of available search fields by scraping the catalogue's
 * search form.
 * <p>
 * Three kinds of fields are extracted: text fields (options of the
 * {@code REG1} select), media types (inputs named {@code MT} or {@code MS})
 * and branches (options of the {@code ZW} select). Labels for media types are
 * looked up in several places because the page layout differs between
 * libraries.
 *
 * @return the search fields found on the page
 * @throws IOException if the request fails; a {@code NotReachableException}
 *                     is thrown when the server answers with status 500
 */
@Override
public List<SearchField> getSearchFields() throws IOException {
    List<SearchField> fields = new ArrayList<>();

    HttpGet httpget;
    if (opacDir.contains("opax")) {
        httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel.html.S");
    } else {
        httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel_main.S");
    }

    HttpResponse response = http_client.execute(httpget);

    if (response.getStatusLine().getStatusCode() == 500) {
        String reason = response.getStatusLine().getReasonPhrase();
        // consume the error body as well, as done on the success path below,
        // so that the connection is not left occupied
        if (response.getEntity() != null) {
            HttpUtils.consume(response.getEntity());
        }
        throw new NotReachableException(reason);
    }
    String html = convertStreamToString(response.getEntity().getContent());
    HttpUtils.consume(response.getEntity());

    Document doc = Jsoup.parse(html);

    // get text fields
    Elements text_opts = doc.select("form select[name=REG1] option");
    for (Element opt : text_opts) {
        TextSearchField field = new TextSearchField();
        field.setId(opt.attr("value"));
        field.setDisplayName(opt.text());
        field.setHint("");
        fields.add(field);
    }

    // get media types
    Elements mt_opts = doc.select("form input[name~=(MT|MS)]");
    if (!mt_opts.isEmpty()) {
        DropdownSearchField mtDropdown = new DropdownSearchField();
        mtDropdown.setId(mt_opts.get(0).attr("name"));
        mtDropdown.setDisplayName("Medientyp");
        for (Element opt : mt_opts) {
            if (!opt.val().equals("")) {
                String text = opt.text();
                if (text.isEmpty()) {
                    // text is empty, check layouts:
                    // Essen: <input name="MT"><img title="mediatype">
                    // Schaffenb: <input name="MT"><img alt="mediatype">
                    Element img = opt.nextElementSibling();
                    if (img != null && img.tagName().equals("img")) {
                        text = img.attr("title");
                        if (text.equals("")) {
                            text = img.attr("alt");
                        }
                    }
                }
                if (text.isEmpty()) {
                    // text is still empty, check table layout, example
                    // Friedrichshafen:
                    // <td><input name="MT"></td> <td><img title="mediatype"></td>
                    Element td1 = opt.parent();
                    Element td2 = td1.nextElementSibling();
                    if (td2 != null) {
                        Elements td2Children = td2.select("img[title]");
                        if (!td2Children.isEmpty()) {
                            text = td2Children.get(0).attr("title");
                        }
                    }
                }
                if (text.isEmpty()) {
                    // text is still empty, check images in label layout, example
                    // Wiedenest:
                    // <input type="radio" name="MT" id="MTYP1" value="MTYP1">
                    // <label for="MTYP1"><img src="http://www.wiedenest.de/bib/image/books.png"
                    // alt="Buecher" title="Buecher"></label>
                    Element label = opt.nextElementSibling();
                    if (label != null) {
                        Elements labelImages = label.select("img[title]");
                        if (!labelImages.isEmpty()) {
                            text = labelImages.get(0).attr("title");
                        }
                    }
                }
                if (text.isEmpty()) {
                    // text is still empty: missing end tag like Offenburg
                    text = parse_option_regex(opt);
                }
                mtDropdown.addDropdownValue(opt.val(), text);
            }
        }
        fields.add(mtDropdown);
    }

    // get branches
    Elements br_opts = doc.select("form select[name=ZW] option");
    if (!br_opts.isEmpty()) {
        DropdownSearchField brDropdown = new DropdownSearchField();
        brDropdown.setId(br_opts.get(0).parent().attr("name"));
        // the label is read from the element preceding the select's parent;
        // non-breaking spaces and question marks are stripped from it
        brDropdown.setDisplayName(br_opts.get(0).parent().parent().previousElementSibling().text()
                .replace("\u00a0", "").replace("?", "").trim());
        for (Element opt : br_opts) {
            brDropdown.addDropdownValue(opt.val(), opt.text());
        }
        fields.add(brDropdown);
    }

    return fields;
}

From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java

/**
 * Collects the search fields offered by the catalogue's search page.
 * <p>
 * The page is requested with a form post, then every {@code .suchfeldinhalt}
 * element is turned into one or more search fields depending on the inputs it
 * contains. A fixed dropdown for the sort order is appended at the end.
 *
 * @return the search fields, followed by the sort order dropdown
 * @throws IOException   if the request fails
 * @throws JSONException if the additional parameters of a field cannot be built
 */
@Override
public List<SearchField> getSearchFields() throws IOException, JSONException {
    if (!initialised) {
        start();
    }

    List<SearchField> result = new ArrayList<>();

    // Read branches and media types
    List<NameValuePair> postParams = new ArrayList<>(2);
    postParams.add(new BasicNameValuePair("link_profis.x", "0"));
    postParams.add(new BasicNameValuePair("link_profis.y", "1"));
    String html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(postParams),
            getDefaultEncoding());
    Document doc = Jsoup.parse(html);

    for (Element fieldElem : doc.select(".suchfeldinhalt")) {
        String name = fieldElem.select(".suchfeld_inhalt_titel label").text();

        // the hint is the first non-empty text node of the input container
        String hint = "";
        Elements inputContainers = fieldElem.select(".suchfeld_inhalt_input");
        if (!inputContainers.isEmpty()) {
            for (TextNode node : inputContainers.first().textNodes()) {
                String text = node.getWholeText().replace("\n", "");
                if (!text.equals("")) {
                    hint = text;
                    break;
                }
            }
        }

        Elements inputs = fieldElem
                .select(".suchfeld_inhalt_input input[type=text], " + ".suchfeld_inhalt_input select");
        int inputCount = inputs.size();

        if (inputCount == 1) {
            result.add(createSearchField(name, hint, inputs.get(0)));
        } else if (inputCount == 2) {
            Element firstInput = inputs.get(0);
            Element secondInput = inputs.get(1);

            if (inputs.select("input[type=text]").size() == 2) {
                // Two text fields, e.g. year from/to or two keywords
                result.add(createSearchField(name, hint, firstInput));
                TextSearchField secondField = (TextSearchField) createSearchField(name, hint, secondInput);
                secondField.setHalfWidth(true);
                result.add(secondField);
            } else if (firstInput.tagName().equals("select") && secondInput.tagName().equals("input")
                    && firstInput.attr("name").equals("feld1")) {
                // A dropdown to select from different search field types.
                // Break it down into single text fields.
                String inputName = secondInput.attr("name");
                String selectName = firstInput.attr("name");
                for (Element option : firstInput.select("option")) {
                    String optionValue = option.attr("value");

                    TextSearchField field = new TextSearchField();
                    field.setHint(hint);
                    field.setDisplayName(option.text());
                    field.setId(inputName + "$" + optionValue);

                    JSONObject params = new JSONObject();
                    params.put(selectName, optionValue);
                    JSONObject data = new JSONObject();
                    data.put("additional_params", params);
                    field.setData(data);

                    result.add(field);
                }
            }
        }
    }

    DropdownSearchField orderField = new DropdownSearchField("orderselect",
            stringProvider.getString(StringProvider.ORDER), false, null);
    orderField.addDropdownValue("1", stringProvider.getString(StringProvider.ORDER_DEFAULT));
    orderField.addDropdownValue("2:desc", stringProvider.getString(StringProvider.ORDER_YEAR_DESC));
    orderField.addDropdownValue("2:asc", stringProvider.getString(StringProvider.ORDER_YEAR_ASC));
    orderField.addDropdownValue("3:desc", stringProvider.getString(StringProvider.ORDER_CATEGORY_DESC));
    orderField.addDropdownValue("3:asc", stringProvider.getString(StringProvider.ORDER_CATEGORY_ASC));
    orderField.setMeaning(Meaning.ORDER);
    result.add(orderField);

    return result;
}

From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java

/**
 * Performs one step of the multi-step reservation process for an item.
 * <p>
 * Which step is executed depends on {@code useraction} and {@code selection}:
 * <ul>
 * <li>{@code MultiStepResult.ACTION_CONFIRMATION}: the confirmation form is
 * posted and {@code Status.OK} is returned.</li>
 * <li>{@code selection == null} or {@code useraction == 0}: the reservation
 * page is loaded (logging in first if the page asks for it). If it offers a
 * branch select, {@code Status.SELECTION_NEEDED} is returned with the
 * available branches.</li>
 * <li>{@code ReservationResult.ACTION_BRANCH}: the selected branch is
 * posted.</li>
 * </ul>
 * Afterwards the resulting page is inspected: if its {@code target} input has
 * the value {@code makevorbest}, details are collected and
 * {@code Status.CONFIRMATION_NEEDED} is returned; in all other cases an error
 * result is returned.
 *
 * @param item       item to reserve; its reservation info is used as relative URL
 * @param acc        account whose name and password are sent if a login is required
 * @param useraction identifier of the step requested by the caller
 * @param selection  value chosen by the user in a previous step, or {@code null}
 * @return the result of this step
 * @throws IOException if a request fails
 */
@Override
public ReservationResult reservation(DetailledItem item, Account acc, int useraction, String selection)
        throws IOException {
    String reservation_info = item.getReservation_info();

    // stays null if none of the branches below loads a page
    Document doc = null;

    if (useraction == MultiStepResult.ACTION_CONFIRMATION) {
        // final step: confirm the reservation; the response is not inspected
        List<NameValuePair> nameValuePairs = new ArrayList<>(2);
        nameValuePairs.add(new BasicNameValuePair("make_allvl", "Bestaetigung"));
        nameValuePairs.add(new BasicNameValuePair("target", "makevorbest"));
        httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding());
        return new ReservationResult(MultiStepResult.Status.OK);
    } else if (selection == null || useraction == 0) {
        // first step: open the reservation page
        String html = httpGet(opac_url + "/" + reservation_info, getDefaultEncoding());
        doc = Jsoup.parse(html);

        if (doc.select("input[name=AUSWEIS]").size() > 0) {
            // Needs login
            List<NameValuePair> nameValuePairs = new ArrayList<>(2);
            nameValuePairs.add(new BasicNameValuePair("AUSWEIS", acc.getName()));
            nameValuePairs.add(new BasicNameValuePair("PWD", acc.getPassword()));
            if (data.has("db")) {
                try {
                    nameValuePairs.add(new BasicNameValuePair("vkontodb", data.getString("db")));
                } catch (JSONException e) {
                    // NOTE(review): the exception is only printed; the login
                    // request is then sent without the vkontodb parameter
                    e.printStackTrace();
                }
            }
            nameValuePairs.add(new BasicNameValuePair("B1", "weiter"));
            nameValuePairs.add(new BasicNameValuePair("target", doc.select("input[name=target]").val()));
            nameValuePairs.add(new BasicNameValuePair("type", "VT2"));
            html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs),
                    getDefaultEncoding());
            doc = Jsoup.parse(html);
        }
        // fall back to the select named VZST if the configured branch field is
        // not present; this changes the branch_inputfield field for later calls
        if (doc.select("select[name=" + branch_inputfield + "]").size() == 0) {
            if (doc.select("select[name=VZST]").size() > 0) {
                branch_inputfield = "VZST";
            }
        }
        if (doc.select("select[name=" + branch_inputfield + "]").size() > 0) {
            // the user has to choose a branch: offer all options of the select
            List<Map<String, String>> branches = new ArrayList<>();
            for (Element option : doc.select("select[name=" + branch_inputfield + "]").first().children()) {
                String value = option.text().trim();
                String key;
                // options without a value attribute use their text as key
                if (option.hasAttr("value")) {
                    key = option.attr("value");
                } else {
                    key = value;
                }
                Map<String, String> selopt = new HashMap<>();
                selopt.put("key", key);
                selopt.put("value", value);
                branches.add(selopt);
            }
            // remembered for the ACTION_BRANCH step of a later call
            _res_target = doc.select("input[name=target]").attr("value");
            ReservationResult result = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED);
            result.setActionIdentifier(ReservationResult.ACTION_BRANCH);
            result.setSelection(branches);
            return result;
        }
    } else if (useraction == ReservationResult.ACTION_BRANCH) {
        // second step: submit the branch chosen by the user
        List<NameValuePair> nameValuePairs = new ArrayList<>(2);
        nameValuePairs.add(new BasicNameValuePair(branch_inputfield, selection));
        nameValuePairs.add(new BasicNameValuePair("button2", "weiter"));
        nameValuePairs.add(new BasicNameValuePair("target", _res_target));
        String html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs),
                getDefaultEncoding());
        doc = Jsoup.parse(html);
    }

    if (doc == null) {
        // unknown combination of useraction and selection: no page was loaded
        return new ReservationResult(MultiStepResult.Status.ERROR);
    }

    if (doc.select("input[name=target]").size() > 0) {
        if (doc.select("input[name=target]").attr("value").equals("makevorbest")) {
            // the page asks for confirmation: collect the details to show
            List<String[]> details = new ArrayList<>();

            if (doc.getElementsByClass("kontomeldung").size() == 1) {
                details.add(new String[] { doc.getElementsByClass("kontomeldung").get(0).text().trim() });
            }
            // presumably matches "Gebuehr" (fee) independent of how the umlaut
            // is encoded - TODO confirm
            Pattern p = Pattern.compile("geb.hr", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
            for (Element div : doc.select(".kontozeile_center")) {
                // <br> tags are replaced by the marker "br2n" so that the text
                // can be split into its original lines after tag removal
                for (String text : Jsoup.parse(div.html().replaceAll("(?i)<br[^>]*>", "br2n")).text()
                        .split("br2n")) {
                    if (p.matcher(text).find() && !text.contains("usstehend")
                            && text.contains("orbestellung")) {
                        details.add(new String[] { text.trim() });
                    }
                }
            }

            if (doc.select("#vorbest").size() > 0 && doc.select("#vorbest").val().contains("(")) {
                // Erlangen uses "Kostenpflichtige Vorbestellung (1 Euro)"
                // as the label of its reservation button
                details.add(new String[] { doc.select("#vorbest").val().trim() });
            }

            // table rows with exactly one label and one content cell become
            // label/value pairs
            for (Element row : doc.select(".kontozeile_center table tr")) {
                if (row.select(".konto_feld").size() == 1 && row.select(".konto_feldinhalt").size() == 1) {
                    details.add(new String[] { row.select(".konto_feld").text().trim(),
                            row.select(".konto_feldinhalt").text().trim() });
                }
            }
            ReservationResult result = new ReservationResult(MultiStepResult.Status.CONFIRMATION_NEEDED);
            result.setDetails(details);
            return result;
        }
    }

    // no confirmation page: report the message shown on the page, if any
    if (doc.getElementsByClass("kontomeldung").size() == 1) {
        return new ReservationResult(MultiStepResult.Status.ERROR,
                doc.getElementsByClass("kontomeldung").get(0).text());
    }

    return new ReservationResult(MultiStepResult.Status.ERROR,
            stringProvider.getString(StringProvider.UNKNOWN_ERROR));
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Retrieve the text located in /*ww  w. j a v a2 s  . c  o  m*/
 * a span (SPAN) HTML element.
 * <br/>
 * We process everything but
 * the phonetic transcriptions.
 * 
 * @param element
 *       Element to be processed.
 * @param rawStr
 *       Current raw text string.
 * @param linkedStr
 *       Current text with hyperlinks.
 * @return
 *       {@code true} iff the element was processed.
 */
private boolean processSpanElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) {
    boolean result;
    String eltClass = element.attr(XmlNames.ATT_CLASS);

    if (eltClass == null ||
    // we don't need phonetic transcriptions, and they can mess up NER tools
            (!eltClass.contains(CLASS_IPA)
                    // we also ignore WP buttons such as the "edit" links placed in certain section headers
                    && !eltClass.contains(CLASS_EDIT)
                    // language indications
                    && !eltClass.contains(CLASS_LANGUAGEICON)))

    {
        result = true;
        // otherwise, we process what's inside the span tag
        processTextElement(element, rawStr, linkedStr);
    }

    else
        result = false;

    return result;
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Extracts the text contained in a division (DIV) HTML element.
 * <br/>
 * Divisions whose class marks them as table of content, reference list,
 * disambiguation link, related link, media clip, magnify button or
 * top icon are skipped.
 * 
 * @param element
 *       Element to be processed.
 * @param rawStr
 *       Current raw text string.
 * @param linkedStr
 *       Current text with hyperlinks.
 * @return
 *       {@code true} iff the element was processed.
 */
private boolean processDivisionElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) {
    String eltClass = element.attr(XmlNames.ATT_CLASS);

    boolean skipped = eltClass != null
            // table of content
            && (eltClass.contains(CLASS_TABLEOFCONTENT)
                    // list of bibliographic references located at the end of the page
                    || eltClass.contains(CLASS_REFERENCES)
                    // WP warning links (disambiguation and such)
                    || eltClass.contains(CLASS_DABLINK)
                    // related links
                    || eltClass.contains(CLASS_RELATEDLINK)
                    // audio or video clip
                    || eltClass.contains(CLASS_MEDIA)
                    // button used to magnify images
                    || eltClass.contains(CLASS_MAGNIFY)
                    // icons located at the top of the page
                    || eltClass.contains(CLASS_TOPICON));
    if (skipped) {
        return false;
    }

    processTextElement(element, rawStr, linkedStr);
    return true;
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Retrieve the text located in /*  w w w  .  ja v  a2s. c o  m*/
 * a hyperlink (A) HTML element.
 * <br/>
 * We ignore all external links,
 * as well as linked images.
 * 
 * @param element
 *       Element to be processed.
 * @param rawStr
 *       Current raw text string.
 * @param linkedStr
 *       Current text with hyperlinks.
 * @return
 *       {@code true} iff the element was processed.
 */
private boolean processHyperlinkElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) {
    boolean result;
    String eltClass = element.attr(XmlNames.ATT_CLASS);

    //      if(eltClass==null)
    {
        result = true;

        // simple text
        String str = element.text();
        if (!str.isEmpty()) {
            rawStr.append(str);

            //if(str.contains("Philadelphia, Pa."))   //debug stuff
            //   System.out.print("");

            // hyperlink
            String eltTitle = element.attr(XmlNames.ATT_TITLE);
            if ((eltClass == null || (!eltClass.contains(CLASS_IMAGE) && !eltClass.contains(CLASS_EXTERNAL)))
                    && (eltTitle == null || (!eltTitle.contains(TITLE_LISTEN)))) {
                String href = element.attr(XmlNames.ATT_HREF);
                String code = "<" + XmlNames.ELT_A + " " + XmlNames.ATT_HREF + "=\"" + href + "\">" + str + "</"
                        + XmlNames.ELT_A + ">";
                linkedStr.append(code);
            } else
                linkedStr.append(str);
        }
    }

    //      else
    //         result = false;

    return result;
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Retrieve the text located in a table (TABLE) HTML element.
 * <br/>
 * We process each cell in the table as a text element. 
 * Some tables are ignored: infoboxes, wikitables, navboxes,
 * metadata, persondata, etc. 
 * <br/>
 * Only the children of the table's first child element are scanned
 * (NOTE(review): this assumes that child is the table body). A table
 * without any child element is accepted but contributes no text.
 * 
 * @param element
 *       Element to be processed.
 * @param rawStr
 *       Current raw text string.
 * @param linkedStr
 *       Current text with hyperlinks.
 * @return
 *       {@code true} iff the element was processed.
 */
private boolean processTableElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) {
    String eltClass = element.attr(XmlNames.ATT_CLASS);

    boolean ignored = eltClass != null
            // we ignore infoboxes
            && (eltClass.contains(CLASS_INFORMATIONBOX)
                    // and wikitables
                    || eltClass.contains(CLASS_WIKITABLE)
                    // navigation boxes
                    || eltClass.contains(CLASS_NAVIGATIONBOX)
                    // navigation boxes, WP warnings (incompleteness, etc.)
                    || eltClass.contains(CLASS_METADATA)
                    // personal data box (?)
                    || eltClass.contains(CLASS_PERSONDATA));
    if (ignored) {
        return false;
    }

    // an empty table has no first child to look into
    if (element.children().isEmpty()) {
        return true;
    }

    Element tbodyElt = element.children().get(0);
    for (Element rowElt : tbodyElt.children()) {
        for (Element colElt : rowElt.children()) {
            // process cell content
            processTextElement(colElt, rawStr, linkedStr);

            // possibly add final dot and space; nothing to terminate
            // as long as no text has been collected at all
            int lastIndex = rawStr.length() - 1;
            if (lastIndex >= 0 && rawStr.charAt(lastIndex) != ' ') {
                if (rawStr.charAt(lastIndex) == '.') {
                    rawStr.append(" ");
                    linkedStr.append(" ");
                } else {
                    rawStr.append(". ");
                    linkedStr.append(". ");
                }
            }
        }
    }

    return true;
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Pulls a text from a Wikipedia URL without images, tags, etc.
 * /*from   ww  w.j a  v  a2s.  com*/
 * @param url
 *       Address of the targetted text.
 * @return
 *       An Article object representing the retrieved object.
 * 
 * @throws ReaderException
 *       Problem while retrieving the text.
 */
@Override
public Article read(URL url) throws ReaderException {
    Article result = null;
    String name = getName(url);

    try { // get the page
        String address = url.toString();
        logger.log("Retrieving page " + address);
        long startTime = System.currentTimeMillis();
        Document document = retrieveSourceCode(name, url);

        // get its title
        Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0);
        String title = firstHeadingElt.text();
        logger.log("Get title: " + title);

        // get raw and linked texts
        logger.log("Get raw and linked texts.");
        StringBuilder rawStr = new StringBuilder();
        StringBuilder linkedStr = new StringBuilder();
        Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0);
        // processing each element in the content part
        boolean ignoringSection = false;
        boolean first = true;
        for (Element element : bodyContentElt.children()) {
            String eltName = element.tag().getName();
            String eltClass = element.attr(XmlNames.ATT_CLASS);

            // section headers
            if (eltName.equals(XmlNames.ELT_H2)) {
                first = false;
                // get section name
                StringBuilder fakeRaw = new StringBuilder();
                StringBuilder fakeLinked = new StringBuilder();
                processParagraphElement(element, fakeRaw, fakeLinked);
                String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH);
                // check section name
                if (IGNORED_SECTIONS.contains(str))
                    ignoringSection = true;
                else {
                    ignoringSection = false;
                    rawStr.append("\n-----");
                    linkedStr.append("\n-----");
                    processParagraphElement(element, rawStr, linkedStr);
                }
            }

            else if (!ignoringSection) { // lower sections
                if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4)
                        || eltName.equals(XmlNames.ELT_H5) || eltName.equals(XmlNames.ELT_H6)) {
                    first = false;
                    processParagraphElement(element, rawStr, linkedStr);
                }

                // paragraph
                else if (eltName.equals(XmlNames.ELT_P)) {
                    String str = element.text();
                    // ignore possible initial disambiguation link
                    if (!first || !str.startsWith(PARAGRAPH_FORTHE)) {
                        first = false;
                        processParagraphElement(element, rawStr, linkedStr);
                    }
                }

                // list
                else if (eltName.equals(XmlNames.ELT_UL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, false);
                } else if (eltName.equals(XmlNames.ELT_OL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, true);
                } else if (eltName.equals(XmlNames.ELT_DL)) {
                    first = false;
                    processDescriptionListElement(element, rawStr, linkedStr);
                }

                // tables
                else if (eltName.equals(XmlNames.ELT_TABLE)) {
                    first = !processTableElement(element, rawStr, linkedStr);
                }

                // divisions
                else if (eltName.equals(XmlNames.ELT_DIV)) { // ignore possible initial picture 
                    if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB))
                        first = !processDivisionElement(element, rawStr, linkedStr);
                }

                // we ignore certain types of span (phonetic trancription, WP buttons...) 
                else if (eltName.equals(XmlNames.ELT_SPAN)) {
                    first = !processSpanElement(element, rawStr, linkedStr);
                }

                // hyperlinks must be included in the linked string, provided they are not external
                else if (eltName.equals(XmlNames.ELT_A)) {
                    first = !processHyperlinkElement(element, rawStr, linkedStr);
                }

                // quotes are just processed recursively
                else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                    first = !processQuoteElement(element, rawStr, linkedStr);
                }

                // other tags are ignored
            }
        }

        // create article object
        result = new Article(name);
        result.setTitle(title);
        result.setUrl(url);
        result.initDate();

        // clean text
        String rawText = rawStr.toString();
        rawText = cleanText(rawText);
        //         rawText = ArticleCleaning.replaceChars(rawText);
        result.setRawText(rawText);
        logger.log("Length of the raw text: " + rawText.length() + " chars.");
        String linkedText = linkedStr.toString();
        linkedText = cleanText(linkedText);
        //         linkedText = ArticleCleaning.replaceChars(linkedText);
        result.setLinkedText(linkedText);
        logger.log("Length of the linked text: " + linkedText.length() + " chars.");

        // get original html source code
        logger.log("Get original HTML source code.");
        String originalPage = document.toString();
        result.setOriginalPage(originalPage);
        logger.log("Length of the original page: " + originalPage.length() + " chars.");

        // get the categories of the article 
        List<ArticleCategory> categories = getArticleCategories(result);
        result.setCategories(categories);

        long endTime = System.currentTimeMillis();
        logger.log("Total duration: " + (endTime - startTime) + " ms.");
    } catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.json.simple.parser.ParseException e) {
        e.printStackTrace();
    }

    return result;
}