List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:com.digitalpebble.storm.crawler.bolt.JSoupParserBolt.java
@Override public void execute(Tuple tuple) { byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); Metadata metadata = (Metadata) tuple.getValueByField("metadata"); // check that its content type is HTML // look at value found in HTTP headers boolean CT_OK = false; String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); if (StringUtils.isNotBlank(httpCT)) { if (httpCT.toLowerCase().contains("html")) { CT_OK = true;// w w w. j a v a2 s . co m } } // simply ignore cases where the content type has not been set // TODO sniff content with Tika? else { CT_OK = true; } if (!CT_OK) { String errorMessage = "Exception content-type " + httpCT + " for " + url; RuntimeException e = new RuntimeException(errorMessage); handleException(url, e, metadata, tuple, "content-type checking", errorMessage); return; } LOG.info("Parsing : starting {}", url); long start = System.currentTimeMillis(); String charset = getContentCharset(content, metadata); // get the robots tags from the fetch metadata RobotsTags robotsTags = new RobotsTags(metadata); Map<String, List<String>> slinks; String text; DocumentFragment fragment; try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url); fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc); // extracts the robots directives from the meta tags robotsTags.extractMetaTags(fragment); // store a normalised representation in metadata // so that the indexer is aware of it robotsTags.normaliseToMetadata(metadata); // do not extract the links if no follow has been set // and we are in strict mode if (robotsTags.isNoFollow() && robots_noFollow_strict) { slinks = new HashMap<String, List<String>>(0); } else { Elements links = jsoupDoc.select("a[href]"); slinks = new HashMap<String, List<String>>(links.size()); for (Element link : links) { // abs:href tells jsoup to return fully qualified domains // for // relative urls. 
// e.g.: /foo will resolve to http://shopstyle.com/foo String targetURL = link.attr("abs:href"); // nofollow boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel")); // remove altogether if (noFollow && robots_noFollow_strict) { continue; } // link not specifically marked as no follow // but whole page is if (!noFollow && robotsTags.isNoFollow()) { noFollow = true; } String anchor = link.text(); if (StringUtils.isNotBlank(targetURL)) { // any existing anchors for the same target? List<String> anchors = slinks.get(targetURL); if (anchors == null) { anchors = new LinkedList<String>(); slinks.put(targetURL, anchors); } // track the anchors only if no follow is false if (!noFollow && StringUtils.isNotBlank(anchor)) { anchors.add(anchor); } } } } text = jsoupDoc.body().text(); } catch (Throwable e) { String errorMessage = "Exception while parsing " + url + ": " + e; handleException(url, e, metadata, tuple, "content parsing", errorMessage); return; } // store identified charset in md metadata.setValue("parse.Content-Encoding", charset); long duration = System.currentTimeMillis() - start; LOG.info("Parsed {} in {} msec", url, duration); List<Outlink> outlinks = toOutlinks(url, metadata, slinks); ParseResult parse = new ParseResult(); parse.setOutlinks(outlinks); // parse data of the parent URL ParseData parseData = parse.get(url); parseData.setMetadata(metadata); parseData.setText(text); parseData.setContent(content); // apply the parse filters if any try { parseFilters.filter(url, content, fragment, parse); } catch (RuntimeException e) { String errorMessage = "Exception while running parse filters on " + url + ": " + e; handleException(url, e, metadata, tuple, "content filtering", errorMessage); return; } if (emitOutlinks) { for (Outlink outlink : outlinks) { collector.emit(StatusStreamName, tuple, new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED)); } } // emit each document/subdocument in the ParseResult object // there should be at 
least one ParseData item for the "parent" URL for (Map.Entry<String, ParseData> doc : parse) { ParseData parseDoc = doc.getValue(); collector.emit(tuple, new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText())); } collector.ack(tuple); eventCounter.scope("tuple_success").incr(); }
From source file:com.digitalpebble.stormcrawler.bolt.JSoupParserBolt.java
@Override public void execute(Tuple tuple) { byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); Metadata metadata = (Metadata) tuple.getValueByField("metadata"); LOG.info("Parsing : starting {}", url); // check that its content type is HTML // look at value found in HTTP headers boolean CT_OK = false; String mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); if (detectMimeType) { mimeType = guessMimeType(url, mimeType, content); // store identified type in md metadata.setValue("parse.Content-Type", mimeType); }//w ww .j a v a 2 s. c o m if (StringUtils.isNotBlank(mimeType)) { if (mimeType.toLowerCase().contains("html")) { CT_OK = true; } } // go ahead even if no mimetype is available else { CT_OK = true; } if (!CT_OK) { if (this.treat_non_html_as_error) { String errorMessage = "Exception content-type " + mimeType + " for " + url; RuntimeException e = new RuntimeException(errorMessage); handleException(url, e, metadata, tuple, "content-type checking", errorMessage); } else { LOG.info("Incorrect mimetype - passing on : {}", url); collector.emit(tuple, new Values(url, content, metadata, "")); collector.ack(tuple); } return; } long start = System.currentTimeMillis(); String charset = getContentCharset(content, metadata); // get the robots tags from the fetch metadata RobotsTags robotsTags = new RobotsTags(metadata); Map<String, List<String>> slinks; String text = ""; DocumentFragment fragment; try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url); fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc); // extracts the robots directives from the meta tags robotsTags.extractMetaTags(fragment); // store a normalised representation in metadata // so that the indexer is aware of it robotsTags.normaliseToMetadata(metadata); // do not extract the links if no follow has been set // and we are in strict mode if (robotsTags.isNoFollow() && 
robots_noFollow_strict) { slinks = new HashMap<>(0); } else { Elements links = jsoupDoc.select("a[href]"); slinks = new HashMap<>(links.size()); for (Element link : links) { // abs:href tells jsoup to return fully qualified domains // for // relative urls. // e.g.: /foo will resolve to http://shopstyle.com/foo String targetURL = link.attr("abs:href"); // nofollow boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel")); // remove altogether if (noFollow && robots_noFollow_strict) { continue; } // link not specifically marked as no follow // but whole page is if (!noFollow && robotsTags.isNoFollow()) { noFollow = true; } String anchor = link.text(); if (StringUtils.isNotBlank(targetURL)) { // any existing anchors for the same target? List<String> anchors = slinks.get(targetURL); if (anchors == null) { anchors = new LinkedList<>(); slinks.put(targetURL, anchors); } // track the anchors only if no follow is false if (!noFollow && StringUtils.isNotBlank(anchor)) { anchors.add(anchor); } } } } Element body = jsoupDoc.body(); if (body != null) { text = body.text(); } } catch (Throwable e) { String errorMessage = "Exception while parsing " + url + ": " + e; handleException(url, e, metadata, tuple, "content parsing", errorMessage); return; } // store identified charset in md metadata.setValue("parse.Content-Encoding", charset); long duration = System.currentTimeMillis() - start; LOG.info("Parsed {} in {} msec", url, duration); List<Outlink> outlinks = toOutlinks(url, metadata, slinks); ParseResult parse = new ParseResult(); parse.setOutlinks(outlinks); // parse data of the parent URL ParseData parseData = parse.get(url); parseData.setMetadata(metadata); parseData.setText(text); parseData.setContent(content); // apply the parse filters if any try { parseFilters.filter(url, content, fragment, parse); } catch (RuntimeException e) { String errorMessage = "Exception while running parse filters on " + url + ": " + e; handleException(url, e, metadata, tuple, "content 
filtering", errorMessage); return; } if (emitOutlinks) { for (Outlink outlink : parse.getOutlinks()) { collector.emit(StatusStreamName, tuple, new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED)); } } // emit each document/subdocument in the ParseResult object // there should be at least one ParseData item for the "parent" URL for (Map.Entry<String, ParseData> doc : parse) { ParseData parseDoc = doc.getValue(); collector.emit(tuple, new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText())); } collector.ack(tuple); eventCounter.scope("tuple_success").incr(); }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
@Override public List<SearchField> getSearchFields() throws IOException { List<SearchField> fields = new ArrayList<>(); HttpGet httpget;/*from ww w .j a v a 2 s .c o m*/ if (opacDir.contains("opax")) { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel.html.S"); } else { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel_main.S"); } HttpResponse response = http_client.execute(httpget); if (response.getStatusLine().getStatusCode() == 500) { throw new NotReachableException(response.getStatusLine().getReasonPhrase()); } String html = convertStreamToString(response.getEntity().getContent()); HttpUtils.consume(response.getEntity()); Document doc = Jsoup.parse(html); // get text fields Elements text_opts = doc.select("form select[name=REG1] option"); for (Element opt : text_opts) { TextSearchField field = new TextSearchField(); field.setId(opt.attr("value")); field.setDisplayName(opt.text()); field.setHint(""); fields.add(field); } // get media types Elements mt_opts = doc.select("form input[name~=(MT|MS)]"); if (mt_opts.size() > 0) { DropdownSearchField mtDropdown = new DropdownSearchField(); mtDropdown.setId(mt_opts.get(0).attr("name")); mtDropdown.setDisplayName("Medientyp"); for (Element opt : mt_opts) { if (!opt.val().equals("")) { String text = opt.text(); if (text.length() == 0) { // text is empty, check layouts: // Essen: <input name="MT"><img title="mediatype"> // Schaffenb: <input name="MT"><img alt="mediatype"> Element img = opt.nextElementSibling(); if (img != null && img.tagName().equals("img")) { text = img.attr("title"); if (text.equals("")) { text = img.attr("alt"); } } } if (text.length() == 0) { // text is still empty, check table layout, Example // Friedrichshafen // <td><input name="MT"></td> <td><img // title="mediatype"></td> Element td1 = opt.parent(); Element td2 = td1.nextElementSibling(); if (td2 != null) { Elements td2Children = td2.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } 
if (text.length() == 0) { // text is still empty, check images in label layout, Example // Wiedenst // <input type="radio" name="MT" id="MTYP1" value="MTYP1"> // <label for="MTYP1"><img src="http://www.wiedenest.de/bib/image/books // .png" alt="Bcher" title="Bcher"></label> Element label = opt.nextElementSibling(); if (label != null) { Elements td2Children = label.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } if (text.length() == 0) { // text is still empty: missing end tag like Offenburg text = parse_option_regex(opt); } mtDropdown.addDropdownValue(opt.val(), text); } } fields.add(mtDropdown); } // get branches Elements br_opts = doc.select("form select[name=ZW] option"); if (br_opts.size() > 0) { DropdownSearchField brDropdown = new DropdownSearchField(); brDropdown.setId(br_opts.get(0).parent().attr("name")); brDropdown.setDisplayName(br_opts.get(0).parent().parent().previousElementSibling().text() .replace("\u00a0", "").replace("?", "").trim()); for (Element opt : br_opts) { brDropdown.addDropdownValue(opt.val(), opt.text()); } fields.add(brDropdown); } return fields; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * alot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to * boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs * so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it * * * @param node// w ww . j a v a2s . c o m * @return */ private boolean isOkToBoost(Element node) { int stepsAway = 0; Element sibling = node.nextElementSibling(); while (sibling != null) { if (sibling.tagName().equals("p")) { if (stepsAway >= 3) { if (logger.isDebugEnabled()) { logger.debug("Next paragraph is too far away, not boosting"); } return false; } String paraText = sibling.text(); WordStats wordStats = StopWords.getStopWordCount(paraText); if (wordStats.getStopWordCount() > 5) { if (logger.isDebugEnabled()) { logger.debug("We're gonna boost this node, seems contenty"); } return true; } } // increase how far away the next paragraph is from this node stepsAway++; sibling = sibling.nextElementSibling(); } return false; }
From source file:de.geeksfactory.opacclient.apis.Pica.java
protected DetailledItem parse_result(String html) { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);//from ww w . j a v a2 s.co m DetailledItem result = new DetailledItem(); for (Element a : doc.select("a[href*=PPN")) { Map<String, String> hrefq = getQueryParamsFirst(a.absUrl("href")); String ppn = hrefq.get("PPN"); result.setId(ppn); break; } // GET COVER if (doc.select("td.preslabel:contains(ISBN) + td.presvalue").size() > 0) { Element isbnElement = doc.select("td.preslabel:contains(ISBN) + td.presvalue").first(); String isbn = ""; for (Node child : isbnElement.childNodes()) { if (child instanceof TextNode) { isbn = ((TextNode) child).text().trim(); break; } } result.setCover(ISBNTools.getAmazonCoverURL(isbn, true)); } // GET TITLE AND SUBTITLE String titleAndSubtitle; Element titleAndSubtitleElem = null; String titleRegex = ".*(Titel|Aufsatz|Zeitschrift|Gesamttitel" + "|Title|Article|Periodical|Collective\\stitle" + "|Titre|Article|P.riodique|Titre\\sg.n.ral).*"; String selector = "td.preslabel:matches(" + titleRegex + ") + td.presvalue"; if (doc.select(selector).size() > 0) { titleAndSubtitleElem = doc.select(selector).first(); titleAndSubtitle = titleAndSubtitleElem.text().trim(); int slashPosition = Math.min(titleAndSubtitle.indexOf("/"), titleAndSubtitle.indexOf(":")); String title; if (slashPosition > 0) { title = titleAndSubtitle.substring(0, slashPosition).trim(); String subtitle = titleAndSubtitle.substring(slashPosition + 1).trim(); result.addDetail(new Detail(stringProvider.getString(StringProvider.SUBTITLE), subtitle)); } else { title = titleAndSubtitle; } result.setTitle(title); } else { result.setTitle(""); } // Details int line = 0; Elements lines = doc.select("td.preslabel + td.presvalue"); if (titleAndSubtitleElem != null) { lines.remove(titleAndSubtitleElem); } for (Element element : lines) { Element titleElem = element.firstElementSibling(); String detail = ""; if (element.select("div").size() > 1 && 
element.select("div").text().equals(element.text())) { boolean first = true; for (Element div : element.select("div")) { if (!div.text().replace("\u00a0", " ").trim().equals("")) { if (!first) { detail += "\n" + div.text().replace("\u00a0", " ").trim(); } else { detail += div.text().replace("\u00a0", " ").trim(); first = false; } } } } else { detail = element.text().replace("\u00a0", " ").trim(); } String title = titleElem.text().replace("\u00a0", " ").trim(); if (element.select("hr").size() > 0) // after the separator we get the copies { break; } if (detail.length() == 0 && title.length() == 0) { line++; continue; } if (title.contains(":")) { title = title.substring(0, title.indexOf(":")); // remove colon } result.addDetail(new Detail(title, detail)); if (element.select("a").size() == 1 && !element.select("a").get(0).text().trim().equals("")) { String url = element.select("a").first().absUrl("href"); if (!url.startsWith(opac_url)) { result.addDetail(new Detail(stringProvider.getString(StringProvider.LINK), url)); } } line++; } line++; // next line after separator // Copies Copy copy = new Copy(); String location = ""; // reservation info will be stored as JSON JSONArray reservationInfo = new JSONArray(); while (line < lines.size()) { Element element = lines.get(line); if (element.select("hr").size() == 0) { Element titleElem = element.firstElementSibling(); String detail = element.text().trim(); String title = titleElem.text().replace("\u00a0", " ").trim(); if (detail.length() == 0 && title.length() == 0) { line++; continue; } if (title.contains("Standort") || title.contains("Vorhanden in") || title.contains("Location")) { location += detail; } else if (title.contains("Sonderstandort")) { location += " - " + detail; } else if (title.contains("Systemstelle") || title.contains("Subject")) { copy.setDepartment(detail); } else if (title.contains("Fachnummer") || title.contains("locationnumber")) { copy.setLocation(detail); } else if (title.contains("Signatur") || 
title.contains("Shelf mark")) { copy.setShelfmark(detail); } else if (title.contains("Anmerkung")) { location += " (" + detail + ")"; } else if (title.contains("Link")) { result.addDetail(new Detail(title.replace(":", "").trim(), detail)); } else if (title.contains("Status") || title.contains("Ausleihinfo") || title.contains("Ausleihstatus") || title.contains("Request info")) { // Find return date Pattern pattern = Pattern.compile("(till|bis) (\\d{2}-\\d{2}-\\d{4})"); Matcher matcher = pattern.matcher(detail); if (matcher.find()) { DateTimeFormatter fmt = DateTimeFormat.forPattern("dd-MM-yyyy").withLocale(Locale.GERMAN); try { copy.setStatus(detail.substring(0, matcher.start() - 1).trim()); copy.setReturnDate(fmt.parseLocalDate(matcher.group(2))); } catch (IllegalArgumentException e) { e.printStackTrace(); copy.setStatus(detail); } } else { copy.setStatus(detail); } // Get reservation info if (element.select("a:has(img[src*=inline_arrow])").size() > 0) { Element a = element.select("a:has(img[src*=inline_arrow])").first(); boolean multipleCopies = a.text().matches(".*(Exemplare|Volume list).*"); JSONObject reservation = new JSONObject(); try { reservation.put("multi", multipleCopies); reservation.put("link", _extract_url(a.absUrl("href"))); reservation.put("desc", location); reservationInfo.put(reservation); } catch (JSONException e1) { e1.printStackTrace(); } result.setReservable(true); } } } else { copy.setBranch(location); result.addCopy(copy); location = ""; copy = new Copy(); } line++; } if (copy.notEmpty()) { copy.setBranch(location); result.addCopy(copy); } if (reservationInfo.length() == 0) { // No reservation info found yet, because we didn't find any copies. // If there is a reservation link somewhere in the rows we interpreted // as details, we still want to use it. 
if (doc.select("td a:has(img[src*=inline_arrow])").size() > 0) { Element a = doc.select("td a:has(img[src*=inline_arrow])").first(); boolean multipleCopies = a.text().matches(".*(Exemplare|Volume list).*"); JSONObject reservation = new JSONObject(); try { reservation.put("multi", multipleCopies); reservation.put("link", _extract_url(a.attr("href"))); reservation.put("desc", location); reservationInfo.put(reservation); } catch (JSONException e1) { e1.printStackTrace(); } result.setReservable(true); } } result.setReservation_info(reservationInfo.toString()); // Volumes if (doc.select("a[href^=FAM?PPN=]").size() > 0) { String href = doc.select("a[href^=FAM?PPN=]").attr("href"); String ppn = getQueryParamsFirst(href).get("PPN"); Map<String, String> data = new HashMap<>(); data.put("ppn", ppn); result.setVolumesearch(data); } return result; }
From source file:mml.handler.post.MMLPostHTMLHandler.java
/** * Parse a codeblock/*from w w w . ja v a 2 s .c om*/ * @param elem the element to parse * @throws a JSON exception */ private void parsePre(Element elem) throws JSONException { if (elem.hasText()) { int offset = sb.length(); String name = elem.attr("class"); if (name == null || name.length() == 0) name = "pre"; Range r = new Range(name, offset, 0); stil.add(r); if (elem.hasAttr("class")) { List<Node> children = elem.childNodes(); for (Node child : children) { if (child instanceof Element) { if (child.nodeName().equals("span")) parseSpan((Element) child); else parseOtherElement((Element) child); } else if (child instanceof TextNode) sb.append(((TextNode) child).getWholeText()); } } else sb.append(elem.text()); this.stil.updateLen(r, sb.length() - offset); } prevWasMilestone = false; ensure(1, false); }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
@Override public ReservationResult reservation(DetailledItem item, Account account, int useraction, String selection) throws IOException { String resinfo = item.getReservation_info(); if (selection == null || selection.equals("confirmed")) { // STEP 1: Check if reservable and select branch ("ID1") // Differences between opax and opac String func = opacDir.contains("opax") ? "sigl" : "resF"; String id = opacDir.contains("opax") ? (resinfo.contains("resF") ? resinfo.substring(5) + "=" + resinfo : resinfo + "=resF_" + resinfo) : "ID=" + resinfo; String html = httpGet(//from w w w. j ava 2s. c o m opacUrl + "/" + opacDir + "/reserv" + opacSuffix + "?LANG=de&FUNC=" + func + "&" + id, getDefaultEncoding()); Document doc = Jsoup.parse(html); newStyleReservations = doc.select("input[name=" + resinfo.replace("resF_", "") + "]").val() .length() > 4; Elements optionsElements = doc.select("select[name=ID1] option"); if (optionsElements.size() > 0) { List<Map<String, String>> options = new ArrayList<>(); for (Element option : optionsElements) { if ("0".equals(option.attr("value"))) { continue; } Map<String, String> selopt = new HashMap<>(); selopt.put("key", option.attr("value") + ":" + option.text()); selopt.put("value", option.text()); options.add(selopt); } if (options.size() > 1) { ReservationResult res = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED); res.setActionIdentifier(ReservationResult.ACTION_BRANCH); res.setSelection(options); return res; } else { return reservation(item, account, useraction, options.get(0).get("key")); } } else { ReservationResult res = new ReservationResult(MultiStepResult.Status.ERROR); res.setMessage("Dieses Medium ist nicht reservierbar."); return res; } } else { // STEP 2: Reserve List<NameValuePair> nameValuePairs = new ArrayList<>(); nameValuePairs.add(new BasicNameValuePair("LANG", "de")); nameValuePairs.add(new BasicNameValuePair("BENUTZER", account.getName())); nameValuePairs.add(new BasicNameValuePair("PASSWORD", 
account.getPassword())); nameValuePairs.add(new BasicNameValuePair("FUNC", "vors")); if (opacDir.contains("opax")) { nameValuePairs.add(new BasicNameValuePair(resinfo.replace("resF_", ""), "vors" + (newStyleReservations ? resinfo.replace("resF_", "") : ""))); } if (newStyleReservations) { nameValuePairs.add(new BasicNameValuePair("ID11", selection.split(":")[1])); } nameValuePairs.add(new BasicNameValuePair("ID1", selection.split(":")[0])); String html = httpPost(opacUrl + "/" + opacDir + "/setreserv" + opacSuffix, new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding()); Document doc = Jsoup.parse(html); if (doc.select(".tab21 .p44b, .p2").text().contains("eingetragen")) { return new ReservationResult(MultiStepResult.Status.OK); } else { ReservationResult res = new ReservationResult(MultiStepResult.Status.ERROR); if (doc.select(".p1, .p22b").size() > 0) { res.setMessage(doc.select(".p1, .p22b").text()); } return res; } } }
From source file:org.shareok.data.sagedata.SageJournalIssueDateProcessor.java
@SuppressWarnings("empty-statement") public void retrieveSageJournalVolIssueDates(Map<String, String> processedJournalsMap) { List<String> processedJournals = new ArrayList<>(); // JSONObject jsonObj = getSavedSageJournalVolIssueDateInformation(); try {// www. j a va 2 s . com Map<String, Map<String, String>> journalMap = getSavedSageJournalVolIssueDateInformation(); if (null == journalMap) { journalMap = new HashMap<>(); } Document doc = null; try { doc = Jsoup.connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199") .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); Elements trs = doc.select("form#browsePublicationsForm").get(0).select("table").get(0) .select("tbody").get(0).select("tr"); for (Element tr : trs) { Element link = tr.select("td").get(1).select("a").get(0); String journalName = link.text(); String journalLink = SageDataUtil.SAGE_HTTP_PREFIX + link.attr("href"); String[] linkInfo = journalLink.split("/"); String journalIssuesLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/" + linkInfo[linkInfo.length - 1]; if (null == journalMap.get(journalName)) { Map<String, String> infoMap = new HashMap<>(); infoMap.put("homeLink", journalLink); infoMap.put("issueLink", journalIssuesLink); journalMap.put(journalName, infoMap); } else { Map<String, String> infoMap = journalMap.get(journalName); if (null == infoMap.get("homeLink")) { infoMap.put("homeLink", journalLink); } if (null == infoMap.get("issueLink")) { infoMap.put("issueLink", journalIssuesLink); } } } int kk = 0; mainLoop: for (String journal : journalMap.keySet()) { System.out.println("Print out journal " + journal + " information :"); if (null != processedJournalsMap && (journal == null ? 
processedJournalsMap.get(journal) == null : journal.equals(processedJournalsMap.get(journal)))) { System.out.println("Journal : has already been processed!"); continue; } // if(journal.contains("Christian Education")){ // System.out.println("Journal name : International Journal of Health Services, cannot be processed!"); //// continue; // } // if(journal.contains("Plastic Surgery")){ // System.out.println("Journal name : International Journal of Health Services, cannot be processed!"); // continue; // } Map<String, String> journalInfoMap = journalMap.get(journal); for (String key : journalInfoMap.keySet()) { if (key.equals("issueLink")) { Document loiDdoc = null; try { loiDdoc = Jsoup.connect(journalInfoMap.get(key)).userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break; } Thread.sleep(2200); if (null != loiDdoc) { Map<String, Map<String, String>> dataMap; if (null != journalMap.get(journal).get("data")) { dataMap = DataUtil.getMapFromJson(journalMap.get(journal).get("data")); } else { dataMap = new HashMap<>(); } Elements decaseDivs = loiDdoc.select("div.decade"); if (null != decaseDivs && decaseDivs.size() > 0) { for (Element decade : decaseDivs) { Elements yearsDiv = decade.select("div.years").get(0).children(); if (null != yearsDiv && yearsDiv.size() > 0) { for (Element yearEle : yearsDiv) { Elements volumesDiv = yearEle.select("div.volumes").get(0) .children(); if (null != volumesDiv && volumesDiv.size() > 0) { for (Element volumeEle : volumesDiv) { String volume = volumeEle.select("a").get(0).text().trim() .split("Volume")[1].trim(); Elements issueInfoDivEles = volumeEle .select("div.js_issue"); if (null != issueInfoDivEles && issueInfoDivEles.size() > 0) { for (Element issueInfoDiv : issueInfoDivEles) { String issueText = issueInfoDiv.select("a").get(0) .text(); 
issueText = issueText.split(", ")[0] .split("Issue")[1].trim(); String oldIssueDate = ""; String issueDate = ""; if (NO_ARTICLE_PUB_DATE_JOURNALS_LIST .contains(journal)) { issueDate = "01 " + issueInfoDiv .select("span.loiIssueCoverDateText") .get(0).text().trim(); oldIssueDate = issueDate; // if(issueDate.contains("Winter")){ // issueDate = issueDate.replaceAll("Winter", "October"); // } // if(issueDate.contains("Fall") || issueDate.contains("Autumn")){ // issueDate = issueDate.replaceAll("Fall", "September"); // issueDate = issueDate.replaceAll("Autumn", "September"); // } // if(issueDate.contains("Summer")){ // issueDate = issueDate.replaceAll("Summer", "April"); // } // if(issueDate.contains("Spring")){ // issueDate = issueDate.replaceAll("Spring", "January"); // } // try{ // // for date string like "01 July-October 2016" // if(issueDate.contains("-")){ // String[] dateInfo = issueDate.split("-"); // issueDate = dateInfo[0] + " " + dateInfo[1].split(" ")[1]; // } // // for date string like "01 July/October 2016" // if(issueDate.contains("/")){ // String[] dataInfo = issueDate.split("/"); // issueDate = dataInfo[0] + " " + dataInfo[1].split(" ")[1]; // } // } // catch(ArrayIndexOutOfBoundsException ex){ // System.out.println("Journal name: "+journal); // System.out.println("Volume: "+volume+", issue: "+issueText); // System.out.println("This date string cannot be parsed: "+oldIssueDate); // ex.printStackTrace(); // continue; // } try { issueDate = "01 " + issueInfoDiv.select( "span.loiIssueCoverDateText").get(0) .text().trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } catch (ParseException ex) { // if(!journal.contains("OMEGA - Journal of Death and Dying")){ // continue; // } System.out.println( "Journal name: " + journal); System.out.println("Volume: " + volume + ", issue: " + issueText); System.out.println( "This date string cannot be parsed: " + oldIssueDate); ex.printStackTrace(); continue; } } 
else { try { Element issueLinkEle = issueInfoDiv .select("a").get(0); String issueLink = issueLinkEle .attr("href"); Document issueDoc = null; try { issueDoc = Jsoup.connect(issueLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Elements articleDivs = issueDoc .select("div.art_title, .linkable"); String articleLink = SageDataUtil.SAGE_HTTP_PREFIX + articleDivs.get(0) .select("a.ref, .nowrap") .get(0).attr("href"); if (articleLink.contains("pdf/")) { System.out.println("journal: " + journal + " volume=" + volume + " issue=" + issueText + " has ONLY PDF links!"); try { issueDate = issueInfoDiv.select( "span.loiIssueCoverDateText") .get(0).text().trim(); oldIssueDate = issueDate; if (issueDate.contains("Winter")) { issueDate = issueDate .replaceAll("Winter", "December"); } if (issueDate.contains("Fall") || issueDate.contains( "Autumn")) { issueDate = issueDate .replaceAll("Fall", "September"); issueDate = issueDate .replaceAll("Autumn", "September"); } if (issueDate.contains("Summer")) { issueDate = issueDate .replaceAll("Summer", "June"); } if (issueDate.contains("Spring")) { issueDate = issueDate .replaceAll("Spring", "March"); } if (issueDate.contains("/")) { String[] dataInfo = issueDate .split("/"); String dateInfo1 = dataInfo[0] .trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1 .split(" "); if (dateInfo1Arr.length == 2) { date = dateInfo1Arr[0]; month1 = dateInfo1Arr[1]; } else { date = "01"; month1 = dataInfo[0].trim(); } String month2 = dataInfo[1] .split("\\s+")[0]; String year = dataInfo[1] .split("\\s+")[1]; String date1 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month1 + " " + year); String date2 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month2 + " " + 
year); issueDate = date1 + "::" + date2; } // The Journal of Psychiatry & Law dd MMMM-MMMM yyyy pattern else if (issueDate.contains("-")) { if (journal.equals( "OMEGA - Journal of Death and Dying")) { Document articleDoc = null; try { articleDoc = Jsoup .connect( articleLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000) .get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Element pubDateDiv = articleDoc .select("div.published-dates") .get(0); issueDate = pubDateDiv .text() .split("Issue published:")[1] .trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } else { String[] dataInfo = issueDate .split("-"); String dateInfo1 = dataInfo[0] .trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1 .split(" "); if (dateInfo1Arr.length == 2) { date = dateInfo1Arr[0] .trim(); month1 = dateInfo1Arr[1] .trim(); } else { date = "01"; month1 = dataInfo[0] .trim(); } String month2 = dataInfo[1] .split("\\s+")[0]; String year = dataInfo[1] .split("\\s+")[1]; String date1 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month1 + " " + year); String date2 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month2 + " " + year); issueDate = date1 + "::" + date2; } } else { issueDate = "01 " + issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } } catch (ParseException | ArrayIndexOutOfBoundsException ex) { System.out.println( "Journal name: " + journal); System.out.println("Volume: " + volume + ", issue: " + issueText); System.out.println( "This date string cannot be parsed: " + issueDate); ex.printStackTrace(); continue; } } else { Document articleDoc = null; try { articleDoc = Jsoup .connect(articleLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Element pubDateDiv = articleDoc .select("div.published-dates") .get(0); issueDate = pubDateDiv.text() .split("Issue published:")[1] .trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } } catch (Exception ex) { logger.error( "Cannot get the issue date for journal =" + journal + " volume=" + volume + " issue=" + issueText + " date=" + oldIssueDate, ex); continue; } } if (DataHandlersUtil.datesCompare(issueDate, "2010-01-01") < 0) { if (dataMap.size() > 0) { ObjectMapper mapper = new ObjectMapper(); String json = mapper .writeValueAsString(dataMap); journalInfoMap.put("data", json); } processedJournals.add(journal); continue mainLoop; } try { if (null != dataMap && dataMap.size() > 0 && null != dataMap.get(volume) && null != dataMap.get(volume) .get(issueText)) { continue; } else { Map<String, String> issueMap = dataMap .get(volume); if (null == issueMap) { issueMap = new HashMap<>(); issueMap.put(issueText, issueDate); dataMap.put(volume, issueMap); } else { issueMap.put(issueText, issueDate); } System.out.println("This is vol. " + volume + " and issue " + issueText + " and date " + issueDate); } } catch (Exception ex) { System.out.println( "Cannot add the pub date info into data map for vol. 
" + volume + " and issue " + issueText + " and date " + issueDate); } } } } } } } } } if (dataMap.size() > 0) { ObjectMapper mapper = new ObjectMapper(); String json = mapper.writeValueAsString(dataMap); journalInfoMap.put("data", json); } } } } processedJournals.add(journal); if (kk > 100) { break; } kk++; } } catch (IOException ex) { ex.printStackTrace(); } ObjectMapper mapper = new ObjectMapper(); String json = mapper.writeValueAsString(journalMap); String sageJournalIssueDateInfoFilePath = ShareokdataManager.getSageJournalIssueDateInfoFilePath(); File sageFile = new File(sageJournalIssueDateInfoFilePath); if (sageFile.exists()) { String sageJournalIssueDateInfoFilePathOld = sageJournalIssueDateInfoFilePath.split("\\.")[0] + "_" + DataHandlersUtil.getCurrentTimeString() + ".json"; sageFile.renameTo(new File(sageJournalIssueDateInfoFilePathOld)); } DocumentProcessorUtil.outputStringToFile(json, ShareokdataManager.getSageJournalIssueDateInfoFilePath()); System.out.println("processed journals = " + mapper.writeValueAsString(processedJournals)); } catch (Exception ex) { logger.error("Cannot process the issue dates.", ex); } }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Pulls a text from a Wikipedia URL without images, tags, etc. * //www.j a va 2s .com * @param url * Address of the targetted text. * @return * An Article object representing the retrieved object. * * @throws ReaderException * Problem while retrieving the text. */ @Override public Article read(URL url) throws ReaderException { Article result = null; String name = getName(url); try { // get the page String address = url.toString(); logger.log("Retrieving page " + address); long startTime = System.currentTimeMillis(); Document document = retrieveSourceCode(name, url); // get its title Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0); String title = firstHeadingElt.text(); logger.log("Get title: " + title); // get raw and linked texts logger.log("Get raw and linked texts."); StringBuilder rawStr = new StringBuilder(); StringBuilder linkedStr = new StringBuilder(); Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0); // processing each element in the content part boolean ignoringSection = false; boolean first = true; for (Element element : bodyContentElt.children()) { String eltName = element.tag().getName(); String eltClass = element.attr(XmlNames.ATT_CLASS); // section headers if (eltName.equals(XmlNames.ELT_H2)) { first = false; // get section name StringBuilder fakeRaw = new StringBuilder(); StringBuilder fakeLinked = new StringBuilder(); processParagraphElement(element, fakeRaw, fakeLinked); String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH); // check section name if (IGNORED_SECTIONS.contains(str)) ignoringSection = true; else { ignoringSection = false; rawStr.append("\n-----"); linkedStr.append("\n-----"); processParagraphElement(element, rawStr, linkedStr); } } else if (!ignoringSection) { // lower sections if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4) || eltName.equals(XmlNames.ELT_H5) || eltName.equals(XmlNames.ELT_H6)) { 
first = false; processParagraphElement(element, rawStr, linkedStr); } // paragraph else if (eltName.equals(XmlNames.ELT_P)) { String str = element.text(); // ignore possible initial disambiguation link if (!first || !str.startsWith(PARAGRAPH_FORTHE)) { first = false; processParagraphElement(element, rawStr, linkedStr); } } // list else if (eltName.equals(XmlNames.ELT_UL)) { first = false; processListElement(element, rawStr, linkedStr, false); } else if (eltName.equals(XmlNames.ELT_OL)) { first = false; processListElement(element, rawStr, linkedStr, true); } else if (eltName.equals(XmlNames.ELT_DL)) { first = false; processDescriptionListElement(element, rawStr, linkedStr); } // tables else if (eltName.equals(XmlNames.ELT_TABLE)) { first = !processTableElement(element, rawStr, linkedStr); } // divisions else if (eltName.equals(XmlNames.ELT_DIV)) { // ignore possible initial picture if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB)) first = !processDivisionElement(element, rawStr, linkedStr); } // we ignore certain types of span (phonetic trancription, WP buttons...) 
else if (eltName.equals(XmlNames.ELT_SPAN)) { first = !processSpanElement(element, rawStr, linkedStr); } // hyperlinks must be included in the linked string, provided they are not external else if (eltName.equals(XmlNames.ELT_A)) { first = !processHyperlinkElement(element, rawStr, linkedStr); } // quotes are just processed recursively else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) { first = !processQuoteElement(element, rawStr, linkedStr); } // other tags are ignored } } // create article object result = new Article(name); result.setTitle(title); result.setUrl(url); result.initDate(); // clean text String rawText = rawStr.toString(); rawText = cleanText(rawText); // rawText = ArticleCleaning.replaceChars(rawText); result.setRawText(rawText); logger.log("Length of the raw text: " + rawText.length() + " chars."); String linkedText = linkedStr.toString(); linkedText = cleanText(linkedText); // linkedText = ArticleCleaning.replaceChars(linkedText); result.setLinkedText(linkedText); logger.log("Length of the linked text: " + linkedText.length() + " chars."); // get original html source code logger.log("Get original HTML source code."); String originalPage = document.toString(); result.setOriginalPage(originalPage); logger.log("Length of the original page: " + originalPage.length() + " chars."); // get the categories of the article List<ArticleCategory> categories = getArticleCategories(result); result.setCategories(categories); long endTime = System.currentTimeMillis(); logger.log("Total duration: " + (endTime - startTime) + " ms."); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (org.json.simple.parser.ParseException e) { e.printStackTrace(); } return result; }
From source file:com.vaushell.shaarlijavaapi.ShaarliClient.java
private String extract(final Element source, final String templateName) { if (source == null) { throw new IllegalArgumentException(); }// w ww. ja v a 2 s . c om final ShaarliTemplates.Template template = templates.get(templateName); if (template == null) { throw new IllegalArgumentException("template '" + templateName + "' not found"); } final Element elt; if (template.cssPath.isEmpty()) { elt = source; } else { final Elements elts = source.select(template.cssPath); if (elts.isEmpty()) { return null; } elt = elts.first(); } String content; if (template.attribut.isEmpty()) { content = elt.text(); } else { content = elt.attr(template.attribut); } if (content == null) { return null; } content = content.trim(); if (!template.regex.isEmpty()) { final Pattern p = Pattern.compile(template.regex); final Matcher m = p.matcher(content); if (m.find()) { content = m.group().trim(); } } if (content.isEmpty()) { return null; } return content; }