List of usage examples for the org.jsoup.nodes.Element method attr
public String attr(String attributeKey)
From source file:de.geeksfactory.opacclient.apis.Open.java
@Override public SearchRequestResult searchGetPage(int page) throws IOException, OpacErrorException, JSONException { /*/* w ww .ja v a2s. c o m*/ When there are many pages of results, there will only be links to the next 4 and previous 4 pages, so we will click links until it gets to the correct page. */ if (searchResultDoc == null) throw new NotReachableException(); Document doc = searchResultDoc; Elements pageLinks = doc.select("span[id$=DataPager1]").first().select("a[id*=LinkButtonPageN"); int from = Integer.valueOf(pageLinks.first().text()); int to = Integer.valueOf(pageLinks.last().text()); Element linkToClick; boolean willBeCorrectPage; if (page < from) { linkToClick = pageLinks.first(); willBeCorrectPage = false; } else if (page > to) { linkToClick = pageLinks.last(); willBeCorrectPage = false; } else { linkToClick = pageLinks.get(page - from); willBeCorrectPage = true; } Pattern pattern = Pattern.compile("javascript:__doPostBack\\('([^,]*)','([^\\)]*)'\\)"); Matcher matcher = pattern.matcher(linkToClick.attr("href")); if (!matcher.find()) throw new OpacErrorException(StringProvider.INTERNAL_ERROR); FormElement form = (FormElement) doc.select("form").first(); HttpEntity data = formData(form, null).addTextBody("__EVENTTARGET", matcher.group(1)) .addTextBody("__EVENTARGUMENT", matcher.group(2)).build(); ByteArrayOutputStream stream = new ByteArrayOutputStream(); data.writeTo(stream); String postUrl = form.attr("abs:action"); String html = httpPost(postUrl, data, "UTF-8"); if (willBeCorrectPage) { // We clicked on the correct link Document doc2 = Jsoup.parse(html); doc2.setBaseUri(postUrl); return parse_search(doc2, page); } else { // There was no correct link, so try to find one again searchResultDoc = Jsoup.parse(html); searchResultDoc.setBaseUri(postUrl); return searchGetPage(page); } }
From source file:com.ibuildapp.romanblack.WebPlugin.WebPlugin.java
/** * Prepare and load data to WebView.//from w ww .j a va 2 s . com */ private void showHtml() { try { if (isOnline) { if (currentUrl.length() > 0 && !currentUrl.equals("about:blank")) { url = currentUrl; } if (url.length() > 0) html = "<html><body><a href=\"" + url + "\" id=\"link\" /></body></html>"; Document doc = Jsoup.parse(html); Element iframe = doc.select("iframe").first(); boolean isGoogleCalendar = false; boolean isGoogleForms = false; String iframeSrc = ""; try { if (iframe != null) { iframeSrc = iframe.attr("src"); } } catch (Exception e) { } if (iframeSrc.length() > 0) { isGoogleCalendar = iframeSrc.contains("www.google.com/calendar") || iframeSrc.contains("calendar.google.com/calendar"); isGoogleForms = iframeSrc.contains("google.com/forms"); } if (isGoogleCalendar) { webView.loadUrl(iframeSrc); } else if (isGoogleForms) { webView.getSettings().setBuiltInZoomControls(false); DisplayMetrics metrix = getResources().getDisplayMetrics(); int width = metrix.widthPixels; int height = metrix.heightPixels; float density = metrix.density; iframe.attr("width", (int) (width / density) + ""); iframe.attr("height", (int) (height / density - (75 /*+ (hasAdView() ? 
50 : 0)*/)) + ""); iframe.attr("style", "margin: 0; padding: 0"); Element body = doc.select("body").first(); body.attr("style", "margin: 0; padding: 0"); html = doc.outerHtml(); webView.loadDataWithBaseURL("http://", html, "text/html", "utf-8", ""); } else { Elements forms = doc.select("form"); Iterator<Element> iterator = forms.iterator(); for (; iterator.hasNext();) { Element form = iterator.next(); String action = form.attr("action"); if (action.contains("paypal.com")) { form.append("<input type=\"hidden\" name=\"bn\" value=\"ibuildapp_SP\">"); } html = doc.html(); } hideProgress = true; if (Build.VERSION.SDK_INT >= 20 && html.contains("ibuildapp") && html.contains("powr")) { int height = getResources().getDisplayMetrics().heightPixels; html = "<iframe width=\"" + 420 + "\" height=\"" + height + "\" frameBorder=\"0\" src=" + url + "></iframe>"; webView.loadData(html, "text/html", "utf-8"); } else webView.loadDataWithBaseURL("http://", html, "text/html", "utf-8", ""); } } else { if (html.length() > 0) { webView.loadDataWithBaseURL("http://", html, "text/html", "utf-8", ""); } } handler.sendEmptyMessageDelayed(HIDE_PROGRESS, 10000); } catch (Exception ex) { // Error Logging } }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
@Override public List<SearchField> getSearchFields() throws IOException { List<SearchField> fields = new ArrayList<>(); HttpGet httpget;//from ww w . j av a 2 s . c o m if (opacDir.contains("opax")) { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel.html.S"); } else { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel_main.S"); } HttpResponse response = http_client.execute(httpget); if (response.getStatusLine().getStatusCode() == 500) { throw new NotReachableException(response.getStatusLine().getReasonPhrase()); } String html = convertStreamToString(response.getEntity().getContent()); HttpUtils.consume(response.getEntity()); Document doc = Jsoup.parse(html); // get text fields Elements text_opts = doc.select("form select[name=REG1] option"); for (Element opt : text_opts) { TextSearchField field = new TextSearchField(); field.setId(opt.attr("value")); field.setDisplayName(opt.text()); field.setHint(""); fields.add(field); } // get media types Elements mt_opts = doc.select("form input[name~=(MT|MS)]"); if (mt_opts.size() > 0) { DropdownSearchField mtDropdown = new DropdownSearchField(); mtDropdown.setId(mt_opts.get(0).attr("name")); mtDropdown.setDisplayName("Medientyp"); for (Element opt : mt_opts) { if (!opt.val().equals("")) { String text = opt.text(); if (text.length() == 0) { // text is empty, check layouts: // Essen: <input name="MT"><img title="mediatype"> // Schaffenb: <input name="MT"><img alt="mediatype"> Element img = opt.nextElementSibling(); if (img != null && img.tagName().equals("img")) { text = img.attr("title"); if (text.equals("")) { text = img.attr("alt"); } } } if (text.length() == 0) { // text is still empty, check table layout, Example // Friedrichshafen // <td><input name="MT"></td> <td><img // title="mediatype"></td> Element td1 = opt.parent(); Element td2 = td1.nextElementSibling(); if (td2 != null) { Elements td2Children = td2.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } 
if (text.length() == 0) { // text is still empty, check images in label layout, Example // Wiedenst // <input type="radio" name="MT" id="MTYP1" value="MTYP1"> // <label for="MTYP1"><img src="http://www.wiedenest.de/bib/image/books // .png" alt="Bcher" title="Bcher"></label> Element label = opt.nextElementSibling(); if (label != null) { Elements td2Children = label.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } if (text.length() == 0) { // text is still empty: missing end tag like Offenburg text = parse_option_regex(opt); } mtDropdown.addDropdownValue(opt.val(), text); } } fields.add(mtDropdown); } // get branches Elements br_opts = doc.select("form select[name=ZW] option"); if (br_opts.size() > 0) { DropdownSearchField brDropdown = new DropdownSearchField(); brDropdown.setId(br_opts.get(0).parent().attr("name")); brDropdown.setDisplayName(br_opts.get(0).parent().parent().previousElementSibling().text() .replace("\u00a0", "").replace("?", "").trim()); for (Element opt : br_opts) { brDropdown.addDropdownValue(opt.val(), opt.text()); } fields.add(brDropdown); } return fields; }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
@Override public List<SearchField> getSearchFields() throws IOException, JSONException { if (!initialised) { start();/* w w w .j a v a 2s . c o m*/ } List<SearchField> fields = new ArrayList<>(); // Read branches and media types List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair("link_profis.x", "0")); nameValuePairs.add(new BasicNameValuePair("link_profis.y", "1")); String html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding()); Document doc = Jsoup.parse(html); Elements fieldElems = doc.select(".suchfeldinhalt"); for (Element fieldElem : fieldElems) { String name = fieldElem.select(".suchfeld_inhalt_titel label").text(); String hint = ""; if (fieldElem.select(".suchfeld_inhalt_input").size() > 0) { List<TextNode> textNodes = fieldElem.select(".suchfeld_inhalt_input").first().textNodes(); if (textNodes.size() > 0) { for (TextNode node : textNodes) { String text = node.getWholeText().replace("\n", ""); if (!text.equals("")) { hint = node.getWholeText().replace("\n", ""); break; } } } } Elements inputs = fieldElem .select(".suchfeld_inhalt_input input[type=text], " + ".suchfeld_inhalt_input select"); if (inputs.size() == 1) { fields.add(createSearchField(name, hint, inputs.get(0))); } else if (inputs.size() == 2 && inputs.select("input[type=text]").size() == 2) { // Two text fields, e.g. year from/to or two keywords fields.add(createSearchField(name, hint, inputs.get(0))); TextSearchField secondField = (TextSearchField) createSearchField(name, hint, inputs.get(1)); secondField.setHalfWidth(true); fields.add(secondField); } else if (inputs.size() == 2 && inputs.get(0).tagName().equals("select") && inputs.get(1).tagName().equals("input") && inputs.get(0).attr("name").equals("feld1")) { // A dropdown to select from different search field types. // Break it down into single text fields. 
for (Element option : inputs.get(0).select("option")) { TextSearchField field = new TextSearchField(); field.setHint(hint); field.setDisplayName(option.text()); field.setId(inputs.get(1).attr("name") + "$" + option.attr("value")); JSONObject data = new JSONObject(); JSONObject params = new JSONObject(); params.put(inputs.get(0).attr("name"), option.attr("value")); data.put("additional_params", params); field.setData(data); fields.add(field); } } } DropdownSearchField orderField = new DropdownSearchField("orderselect", stringProvider.getString(StringProvider.ORDER), false, null); orderField.addDropdownValue("1", stringProvider.getString(StringProvider.ORDER_DEFAULT)); orderField.addDropdownValue("2:desc", stringProvider.getString(StringProvider.ORDER_YEAR_DESC)); orderField.addDropdownValue("2:asc", stringProvider.getString(StringProvider.ORDER_YEAR_ASC)); orderField.addDropdownValue("3:desc", stringProvider.getString(StringProvider.ORDER_CATEGORY_DESC)); orderField.addDropdownValue("3:asc", stringProvider.getString(StringProvider.ORDER_CATEGORY_ASC)); orderField.setMeaning(Meaning.ORDER); fields.add(orderField); return fields; }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
/**
 * Performs one step of the multi-step reservation dialogue for an item.
 *
 * <p>The step is selected by {@code useraction}:
 * <ul>
 * <li>{@code MultiStepResult.ACTION_CONFIRMATION}: posts the confirmation and returns
 * {@code Status.OK};</li>
 * <li>{@code selection == null} or {@code useraction == 0}: opens the reservation page (logging
 * in first if the page asks for it) and, if a branch select is present, returns
 * {@code Status.SELECTION_NEEDED} with the available branches;</li>
 * <li>{@code ReservationResult.ACTION_BRANCH}: posts the selected branch.</li>
 * </ul>
 * Unless a branch returned earlier, the last loaded document is then inspected: a
 * {@code target} input with value {@code makevorbest} yields {@code Status.CONFIRMATION_NEEDED}
 * with the collected details; otherwise {@code Status.ERROR} is returned, using the page's
 * {@code kontomeldung} text as message when exactly one such element exists.
 *
 * <p>Side effects: may overwrite the fields {@code branch_inputfield} and {@code _res_target}.
 *
 * @param item
 *            the item to reserve; its reservation info is appended to the OPAC URL
 * @param acc
 *            the account whose name and password are sent if the page requires a login
 * @param useraction
 *            identifier of the step to perform (see above)
 * @param selection
 *            the value chosen by the user in the previous step, or {@code null}
 * @return the outcome of this step
 * @throws IOException
 *             if an HTTP request fails
 */
@Override
public ReservationResult reservation(DetailledItem item, Account acc, int useraction, String selection)
        throws IOException {
    String reservation_info = item.getReservation_info();

    Document doc = null;

    if (useraction == MultiStepResult.ACTION_CONFIRMATION) {
        // Final step: the user confirmed, so submit the reservation.
        List<NameValuePair> nameValuePairs = new ArrayList<>(2);
        nameValuePairs.add(new BasicNameValuePair("make_allvl", "Bestaetigung"));
        nameValuePairs.add(new BasicNameValuePair("target", "makevorbest"));
        httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding());
        return new ReservationResult(MultiStepResult.Status.OK);
    } else if (selection == null || useraction == 0) {
        // First step: open the reservation page.
        String html = httpGet(opac_url + "/" + reservation_info, getDefaultEncoding());
        doc = Jsoup.parse(html);

        if (doc.select("input[name=AUSWEIS]").size() > 0) {
            // Needs login
            List<NameValuePair> nameValuePairs = new ArrayList<>(2);
            nameValuePairs.add(new BasicNameValuePair("AUSWEIS", acc.getName()));
            nameValuePairs.add(new BasicNameValuePair("PWD", acc.getPassword()));
            if (data.has("db")) {
                try {
                    nameValuePairs.add(new BasicNameValuePair("vkontodb", data.getString("db")));
                } catch (JSONException e) {
                    // The "db" parameter is simply left out of the request if it cannot be read.
                    e.printStackTrace();
                }
            }
            nameValuePairs.add(new BasicNameValuePair("B1", "weiter"));
            nameValuePairs.add(new BasicNameValuePair("target", doc.select("input[name=target]").val()));
            nameValuePairs.add(new BasicNameValuePair("type", "VT2"));
            html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs),
                    getDefaultEncoding());
            doc = Jsoup.parse(html);
        }
        // If the configured branch select is missing, switch to "VZST" when that one exists.
        if (doc.select("select[name=" + branch_inputfield + "]").size() == 0) {
            if (doc.select("select[name=VZST]").size() > 0) {
                branch_inputfield = "VZST";
            }
        }
        if (doc.select("select[name=" + branch_inputfield + "]").size() > 0) {
            // Offer the options of the branch select to the user.
            List<Map<String, String>> branches = new ArrayList<>();
            for (Element option : doc.select("select[name=" + branch_inputfield + "]").first().children()) {
                String value = option.text().trim();
                String key;
                // Options without a value attribute use their text as key.
                if (option.hasAttr("value")) {
                    key = option.attr("value");
                } else {
                    key = value;
                }
                Map<String, String> selopt = new HashMap<>();
                selopt.put("key", key);
                selopt.put("value", value);
                branches.add(selopt);
            }
            // Remember the form target; it is sent back in the ACTION_BRANCH step.
            _res_target = doc.select("input[name=target]").attr("value");
            ReservationResult result = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED);
            result.setActionIdentifier(ReservationResult.ACTION_BRANCH);
            result.setSelection(branches);
            return result;
        }
        // No branch select: fall through and inspect the loaded page below.
    } else if (useraction == ReservationResult.ACTION_BRANCH) {
        // Second step: submit the branch chosen by the user.
        List<NameValuePair> nameValuePairs = new ArrayList<>(2);
        nameValuePairs.add(new BasicNameValuePair(branch_inputfield, selection));
        nameValuePairs.add(new BasicNameValuePair("button2", "weiter"));
        nameValuePairs.add(new BasicNameValuePair("target", _res_target));

        String html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs),
                getDefaultEncoding());
        doc = Jsoup.parse(html);
    }

    // No branch above loaded a page (unknown useraction combined with a non-null selection).
    if (doc == null) {
        return new ReservationResult(MultiStepResult.Status.ERROR);
    }

    if (doc.select("input[name=target]").size() > 0) {
        if (doc.select("input[name=target]").attr("value").equals("makevorbest")) {
            // The page asks for confirmation: collect the details to show to the user.
            List<String[]> details = new ArrayList<>();

            if (doc.getElementsByClass("kontomeldung").size() == 1) {
                details.add(new String[] { doc.getElementsByClass("kontomeldung").get(0).text().trim() });
            }
            // NOTE(review): "geb.hr" presumably matches "Gebühr" (fee) regardless of how the
            // umlaut is encoded; confirm.
            Pattern p = Pattern.compile("geb.hr", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
            for (Element div : doc.select(".kontozeile_center")) {
                // <br> tags are replaced by the marker "br2n" so the text can be split per line.
                for (String text : Jsoup.parse(div.html().replaceAll("(?i)<br[^>]*>", "br2n")).text()
                        .split("br2n")) {
                    if (p.matcher(text).find() && !text.contains("usstehend")
                            && text.contains("orbestellung")) {
                        details.add(new String[] { text.trim() });
                    }
                }
            }

            if (doc.select("#vorbest").size() > 0 && doc.select("#vorbest").val().contains("(")) {
                // Erlangen uses "Kostenpflichtige Vorbestellung (1 Euro)"
                // as the label of its reservation button
                details.add(new String[] { doc.select("#vorbest").val().trim() });
            }

            // Rows with exactly one label and one content cell become label/value pairs.
            for (Element row : doc.select(".kontozeile_center table tr")) {
                if (row.select(".konto_feld").size() == 1 && row.select(".konto_feldinhalt").size() == 1) {
                    details.add(new String[] { row.select(".konto_feld").text().trim(),
                            row.select(".konto_feldinhalt").text().trim() });
                }
            }
            ReservationResult result = new ReservationResult(MultiStepResult.Status.CONFIRMATION_NEEDED);
            result.setDetails(details);
            return result;
        }
    }

    // Anything else is an error; use the message shown on the page if there is exactly one.
    if (doc.getElementsByClass("kontomeldung").size() == 1) {
        return new ReservationResult(MultiStepResult.Status.ERROR,
                doc.getElementsByClass("kontomeldung").get(0).text());
    }

    return new ReservationResult(MultiStepResult.Status.ERROR,
            stringProvider.getString(StringProvider.UNKNOWN_ERROR));
}
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Retrieve the text located in /*ww w. j a v a2 s . c o m*/ * a span (SPAN) HTML element. * <br/> * We process everything but * the phonetic transcriptions. * * @param element * Element to be processed. * @param rawStr * Current raw text string. * @param linkedStr * Current text with hyperlinks. * @return * {@code true} iff the element was processed. */ private boolean processSpanElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) { boolean result; String eltClass = element.attr(XmlNames.ATT_CLASS); if (eltClass == null || // we don't need phonetic transcriptions, and they can mess up NER tools (!eltClass.contains(CLASS_IPA) // we also ignore WP buttons such as the "edit" links placed in certain section headers && !eltClass.contains(CLASS_EDIT) // language indications && !eltClass.contains(CLASS_LANGUAGEICON))) { result = true; // otherwise, we process what's inside the span tag processTextElement(element, rawStr, linkedStr); } else result = false; return result; }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Retrieve the text located in //from ww w. ja va2s . c o m * a division (DIV) HTML element. * <br/> * We ignore some of them: table * of content, reference list, related links, etc. * * @param element * Element to be processed. * @param rawStr * Current raw text string. * @param linkedStr * Current text with hyperlinks. * @return * {@code true} iff the element was processed. */ private boolean processDivisionElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) { boolean result; String eltClass = element.attr(XmlNames.ATT_CLASS); //if(eltClass.contains("thumb")) // System.out.print(""); if (eltClass == null || // we ignore infoboxes (!eltClass.contains(CLASS_TABLEOFCONTENT) // list of bibiliographic references located at the end of the page && !eltClass.contains(CLASS_REFERENCES) // WP warning links (disambiguation and such) && !eltClass.contains(CLASS_DABLINK) // related links && !eltClass.contains(CLASS_RELATEDLINK) // audio or video clip && !eltClass.contains(CLASS_MEDIA) // button used to magnify images && !eltClass.contains(CLASS_MAGNIFY) // icons located at the top of the page && !eltClass.contains(CLASS_TOPICON))) { result = true; processTextElement(element, rawStr, linkedStr); } else result = false; return result; }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Retrieve the text located in /* w w w . ja v a2s. c o m*/ * a hyperlink (A) HTML element. * <br/> * We ignore all external links, * as well as linked images. * * @param element * Element to be processed. * @param rawStr * Current raw text string. * @param linkedStr * Current text with hyperlinks. * @return * {@code true} iff the element was processed. */ private boolean processHyperlinkElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) { boolean result; String eltClass = element.attr(XmlNames.ATT_CLASS); // if(eltClass==null) { result = true; // simple text String str = element.text(); if (!str.isEmpty()) { rawStr.append(str); //if(str.contains("Philadelphia, Pa.")) //debug stuff // System.out.print(""); // hyperlink String eltTitle = element.attr(XmlNames.ATT_TITLE); if ((eltClass == null || (!eltClass.contains(CLASS_IMAGE) && !eltClass.contains(CLASS_EXTERNAL))) && (eltTitle == null || (!eltTitle.contains(TITLE_LISTEN)))) { String href = element.attr(XmlNames.ATT_HREF); String code = "<" + XmlNames.ELT_A + " " + XmlNames.ATT_HREF + "=\"" + href + "\">" + str + "</" + XmlNames.ELT_A + ">"; linkedStr.append(code); } else linkedStr.append(str); } } // else // result = false; return result; }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Retrieve the text located in a table (TABLE) HTML element. * <br/>// w w w . ja va 2s . c o m * We process each cell in the table as a text element. * Some tables are ignored: infoboxes, wikitables, navboxes, * metadata, persondata, etc. * * @param element * Element to be processed. * @param rawStr * Current raw text string. * @param linkedStr * Current text with hyperlinks. * @return * {@code true} iff the element was processed. */ private boolean processTableElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) { boolean result; String eltClass = element.attr(XmlNames.ATT_CLASS); if (eltClass == null || // we ignore infoboxes (!eltClass.contains(CLASS_INFORMATIONBOX) // and wikitables && !eltClass.contains(CLASS_WIKITABLE) // navigation boxes && !eltClass.contains(CLASS_NAVIGATIONBOX) // navigation boxes, WP warnings (incompleteness, etc.) && !eltClass.contains(CLASS_METADATA) // personal data box (?) && !eltClass.contains(CLASS_PERSONDATA))) { result = true; Element tbodyElt = element.children().get(0); for (Element rowElt : tbodyElt.children()) { for (Element colElt : rowElt.children()) { // process cell content processTextElement(colElt, rawStr, linkedStr); // possibly add final dot and space. if (rawStr.charAt(rawStr.length() - 1) != ' ') { if (rawStr.charAt(rawStr.length() - 1) == '.') { rawStr.append(" "); linkedStr.append(" "); } else { rawStr.append(". "); linkedStr.append(". "); } } } } } else result = false; return result; }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/**
 * Pulls a text from a Wikipedia URL without images, tags, etc.
 * <br/>
 * The children of the content element are processed in order. Level-2 headers
 * open a new section, which is skipped entirely if its name belongs to
 * {@code IGNORED_SECTIONS}. The other elements are dispatched to the
 * method handling their tag; tags not listed here are ignored.
 * <br/>
 * The exceptions caught at the end of this method are only printed: the method
 * then returns {@code null}, or a partially initialized article if the problem
 * occurred after the article object was created.
 *
 * @param url
 * 		Address of the targetted text.
 * @return
 * 		An Article object representing the retrieved object.
 *
 * @throws ReaderException
 * 		Problem while retrieving the text.
 */
@Override
public Article read(URL url) throws ReaderException {
    Article result = null;
    String name = getName(url);

    try {
        // get the page
        String address = url.toString();
        logger.log("Retrieving page " + address);
        long startTime = System.currentTimeMillis();
        Document document = retrieveSourceCode(name, url);

        // get its title (first element with the title id)
        Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0);
        String title = firstHeadingElt.text();
        logger.log("Get title: " + title);

        // get raw and linked texts
        logger.log("Get raw and linked texts.");
        StringBuilder rawStr = new StringBuilder();
        StringBuilder linkedStr = new StringBuilder();
        Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0);

        // processing each element in the content part
        // ignoringSection: true while we are inside a section listed in IGNORED_SECTIONS
        boolean ignoringSection = false;
        // first: true as long as no content has been kept yet; it is used to skip an
        // initial disambiguation paragraph or thumbnail, and becomes true again
        // when a table/div/span/link/quote handler reports the element as not processed
        boolean first = true;
        for (Element element : bodyContentElt.children()) {
            String eltName = element.tag().getName();
            String eltClass = element.attr(XmlNames.ATT_CLASS);

            // section headers
            if (eltName.equals(XmlNames.ELT_H2)) {
                first = false;
                // get section name (extracted in throw-away buffers)
                StringBuilder fakeRaw = new StringBuilder();
                StringBuilder fakeLinked = new StringBuilder();
                processParagraphElement(element, fakeRaw, fakeLinked);
                String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH);
                // check section name
                if (IGNORED_SECTIONS.contains(str))
                    ignoringSection = true;
                else {
                    ignoringSection = false;
                    // a marker line is inserted before each kept section title
                    rawStr.append("\n-----");
                    linkedStr.append("\n-----");
                    processParagraphElement(element, rawStr, linkedStr);
                }
            }

            else if (!ignoringSection) {
                // lower sections
                if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4)
                        || eltName.equals(XmlNames.ELT_H5) || eltName.equals(XmlNames.ELT_H6)) {
                    first = false;
                    processParagraphElement(element, rawStr, linkedStr);
                }

                // paragraph
                else if (eltName.equals(XmlNames.ELT_P)) {
                    String str = element.text();
                    // ignore possible initial disambiguation link
                    if (!first || !str.startsWith(PARAGRAPH_FORTHE)) {
                        first = false;
                        processParagraphElement(element, rawStr, linkedStr);
                    }
                }

                // list (the last argument is false for UL and true for OL)
                else if (eltName.equals(XmlNames.ELT_UL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, false);
                } else if (eltName.equals(XmlNames.ELT_OL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, true);
                } else if (eltName.equals(XmlNames.ELT_DL)) {
                    first = false;
                    processDescriptionListElement(element, rawStr, linkedStr);
                }

                // tables
                else if (eltName.equals(XmlNames.ELT_TABLE)) {
                    first = !processTableElement(element, rawStr, linkedStr);
                }

                // divisions
                else if (eltName.equals(XmlNames.ELT_DIV)) {
                    // ignore possible initial picture
                    if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB))
                        first = !processDivisionElement(element, rawStr, linkedStr);
                }

                // we ignore certain types of span (phonetic transcription, WP buttons...)
                else if (eltName.equals(XmlNames.ELT_SPAN)) {
                    first = !processSpanElement(element, rawStr, linkedStr);
                }

                // hyperlinks must be included in the linked string, provided they are not external
                else if (eltName.equals(XmlNames.ELT_A)) {
                    first = !processHyperlinkElement(element, rawStr, linkedStr);
                }

                // quotes are just processed recursively
                else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                    first = !processQuoteElement(element, rawStr, linkedStr);
                }

                // other tags are ignored
            }
        }

        // create article object
        result = new Article(name);
        result.setTitle(title);
        result.setUrl(url);
        result.initDate();

        // clean text
        String rawText = rawStr.toString();
        rawText = cleanText(rawText);
        result.setRawText(rawText);
        logger.log("Length of the raw text: " + rawText.length() + " chars.");
        String linkedText = linkedStr.toString();
        linkedText = cleanText(linkedText);
        result.setLinkedText(linkedText);
        logger.log("Length of the linked text: " + linkedText.length() + " chars.");

        // get original html source code
        logger.log("Get original HTML source code.");
        String originalPage = document.toString();
        result.setOriginalPage(originalPage);
        logger.log("Length of the original page: " + originalPage.length() + " chars.");

        // get the categories of the article
        List<ArticleCategory> categories = getArticleCategories(result);
        result.setCategories(categories);

        long endTime = System.currentTimeMillis();
        logger.log("Total duration: " + (endTime - startTime) + " ms.");
    }
    // NOTE(review): these exceptions are printed and swallowed although the method
    // declares ReaderException; callers must be ready for a null or partial result.
    catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.json.simple.parser.ParseException e) {
        e.printStackTrace();
    }

    return result;
}