List of usage examples for org.jsoup.nodes.Element.html()
public String html()
From source file:net.pixomania.crawler.W3C.parser.rules.editors.version.VersionEditorRule1.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(version 1), dt:contains(version 1) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; String version = ""; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev != null) { if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("version 1") && !prev.text().trim().toLowerCase().startsWith("editors (version 1")) { skip = true;//ww w .ja va 2 s .c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("version 1") || next.text().trim().toLowerCase().startsWith("editors (version 1")) { skip = false; continue; } } continue; } } if (editor.tagName().equals("dt")) { version = editor.text(); continue; } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; result.setVersion(version); for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String 
split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; result.setVersion(version); for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt") && !next.text().trim().toLowerCase().startsWith("editors (version 1")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:be.ibridge.kettle.jsoup.JsoupInput.java
private Object[] buildRow() throws KettleException { // Create new row... Object[] outputRowData = buildEmptyRow(); if (data.readrow != null) outputRowData = data.readrow.clone(); // Read fields... for (int i = 0; i < data.nrInputFields; i++) { // Get field JsoupInputField field = meta.getInputFields()[i]; // get jsoup array for field Elements jsoupa = data.resultList.get(i); String nodevalue = null;//w ww . jav a 2 s . c om if (jsoupa != null) { Element jo = jsoupa.get(data.recordnr); if (jo != null) { // Do Element Type switch (field.getElementType()) { case JsoupInputField.ELEMENT_TYPE_NODE: // Do Result Type switch (field.getResultType()) { case JsoupInputField.RESULT_TYPE_TEXT: nodevalue = jo.text(); break; case JsoupInputField.RESULT_TYPE_TYPE_OUTER_HTML: nodevalue = jo.outerHtml(); break; case JsoupInputField.RESULT_TYPE_TYPE_INNER_HTML: nodevalue = jo.html(); break; default: nodevalue = jo.toString(); break; } break; case JsoupInputField.ELEMENT_TYPE_ATTRIBUT: nodevalue = jo.attr(field.getAttribute()); break; default: nodevalue = jo.toString(); break; } } } // Do trimming switch (field.getTrimType()) { case JsoupInputField.TYPE_TRIM_LEFT: nodevalue = Const.ltrim(nodevalue); break; case JsoupInputField.TYPE_TRIM_RIGHT: nodevalue = Const.rtrim(nodevalue); break; case JsoupInputField.TYPE_TRIM_BOTH: nodevalue = Const.trim(nodevalue); break; default: break; } if (meta.isInFields()) { // Add result field to input stream outputRowData = RowDataUtil.addValueData(outputRowData, data.totalpreviousfields + i, nodevalue); } // Do conversions // ValueMetaInterface targetValueMeta = data.outputRowMeta.getValueMeta(data.totalpreviousfields + i); ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(data.totalpreviousfields + i); outputRowData[data.totalpreviousfields + i] = targetValueMeta.convertData(sourceValueMeta, nodevalue); // Do we need to repeat this field if it is null? 
if (meta.getInputFields()[i].isRepeated()) { if (data.previousRow != null && Const.isEmpty(nodevalue)) { outputRowData[data.totalpreviousfields + i] = data.previousRow[data.totalpreviousfields + i]; } } } // End of loop over fields... int rowIndex = data.nrInputFields; // See if we need to add the filename to the row... if (meta.includeFilename() && !Const.isEmpty(meta.getFilenameField())) { outputRowData[rowIndex++] = data.filename; } // See if we need to add the row number to the row... if (meta.includeRowNumber() && !Const.isEmpty(meta.getRowNumberField())) { outputRowData[rowIndex++] = new Long(data.rownr); } // Possibly add short filename... if (meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0) { outputRowData[rowIndex++] = data.shortFilename; } // Add Extension if (meta.getExtensionField() != null && meta.getExtensionField().length() > 0) { outputRowData[rowIndex++] = data.extension; } // add path if (meta.getPathField() != null && meta.getPathField().length() > 0) { outputRowData[rowIndex++] = data.path; } // Add Size if (meta.getSizeField() != null && meta.getSizeField().length() > 0) { outputRowData[rowIndex++] = new Long(data.size); } // add Hidden if (meta.isHiddenField() != null && meta.isHiddenField().length() > 0) { outputRowData[rowIndex++] = new Boolean(data.path); } // Add modification date if (meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0) { outputRowData[rowIndex++] = data.lastModificationDateTime; } // Add Uri if (meta.getUriField() != null && meta.getUriField().length() > 0) { outputRowData[rowIndex++] = data.uriName; } // Add RootUri if (meta.getRootUriField() != null && meta.getRootUriField().length() > 0) { outputRowData[rowIndex++] = data.rootUriName; } data.recordnr++; RowMetaInterface irow = getInputRowMeta(); data.previousRow = irow == null ? 
outputRowData : (Object[]) irow.cloneRow(outputRowData); // copy it to make // surely the next step doesn't change it in between... return outputRowData; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule2.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Editor) ~ dd, dt:contains(Edition Editor) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if ((!prev.text().trim().toLowerCase().startsWith("editor") && !prev.text().trim().toLowerCase().startsWith("edition editor")) || prev.text().trim().toLowerCase().contains("version") || prev.text().trim().toLowerCase().endsWith("draft:")) { skip = true;//ww w .j a v a2 s . com } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("editor") || next.text().trim().toLowerCase().contains("edition editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", "This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { 
result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule8.java
/**
 * Collects the editors listed in the {@code blockquote} elements that follow an
 * "Editor" {@code h4} heading.
 *
 * <p>Entries under headings that do not start with "editor"/"edition editor", or that
 * end with "version:" or "draft:", are skipped. When an entry's text contains more than
 * two " - " separators the whole page is handed over to {@code EditorsRule5}. Otherwise
 * a {@code blockquote} is parsed as a single person or, when its inner HTML contains
 * {@code <br />} separators, as one person per separated fragment.
 *
 * @param url address of the specification; used in warning log messages and passed on
 *            to {@code EditorsRule5}
 * @param doc parsed specification page
 * @return the parsed editors (or the result of {@code EditorsRule5}), or {@code null}
 *         when the selector matches nothing or no person could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    // Case-fold with a fixed locale so matching does not depend on the JVM default
    // locale (e.g. under a Turkish locale "I" lower-cases to a dotless i).
    final java.util.Locale root = java.util.Locale.ROOT;

    ArrayList<Person> editorList = new ArrayList<>();
    Elements editors = doc.select("h4:contains(Editor) ~ blockquote");
    if (editors.isEmpty()) {
        return null;
    }

    boolean skip = false;

    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("h4")) {
            String heading = prev.text().trim().toLowerCase(root);
            if ((!heading.startsWith("editor") && !heading.startsWith("edition editor"))
                    || heading.endsWith("version:")
                    || heading.endsWith("draft:")) {
                // The preceding heading belongs to some other section: start skipping.
                skip = true;
            }
        }

        if (skip) {
            // Stop skipping once the element after this one looks like an editor
            // heading; the current element is skipped either way.
            Element following = editor.nextElementSibling();
            if (following != null) {
                String followingText = following.text().trim().toLowerCase(root);
                if (followingText.startsWith("editor") || followingText.contains("edition editor")) {
                    skip = false;
                }
            }
            continue;
        }

        String text = editor.text();
        if (StringUtils.countMatches(text, " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by - ");
            EditorsRule5 ed5 = new EditorsRule5();
            return ed5.run(url, doc);
        }

        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");
        if (splitted.length < 2) {
            // Single entry: work on the element itself.
            String lowered = text.toLowerCase(root);
            if (lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }

            Person result = NameParser.parse(text);
            if (result == null) {
                continue;
            }

            // Query the anchors once instead of on every loop condition and access.
            for (Element anchor : editor.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }
            editorList.add(result);
        } else {
            // Several entries separated by line breaks: parse each fragment on its own.
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                String loweredSplit = split.toLowerCase(root);
                if (loweredSplit.startsWith("(in alphabetic")
                        || loweredSplit.startsWith("see acknowl")
                        || loweredSplit.startsWith("the w3")
                        || loweredSplit.startsWith("(see ac")
                        || loweredSplit.startsWith("see participants")
                        || loweredSplit.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }

                Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                for (Element anchor : newdoc.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        result.setEmail(href.replace("mailto:", ""));
                    } else {
                        result.addWebsite(href);
                    }
                }
                editorList.add(result);
            }
        }

        // The next heading ends the editor section.
        Element next = editor.nextElementSibling();
        if (next != null && next.tag().getName().equals("h4")) {
            break;
        }
    }

    if (editorList.isEmpty()) {
        return null;
    }
    return editorList;
}
From source file:net.kevxu.purdueassist.course.ScheduleDetail.java
private ScheduleDetailEntry parseDocument(Document document) throws HtmlParseException, CourseNotFoundException, ResultNotMatchException { ScheduleDetailEntry entry = new ScheduleDetailEntry(term, crn); Elements tableElements = document.getElementsByAttributeValue("summary", "This table is used to present the detailed class information."); if (!tableElements.isEmpty()) { for (Element tableElement : tableElements) { // get basic info for selected course Element tableBasicInfoElement = tableElement.getElementsByClass("ddlabel").first(); if (tableBasicInfoElement != null) { setBasicInfo(entry, tableBasicInfoElement.text()); } else { throw new HtmlParseException("Basic info element empty."); }//from ww w . j a v a 2 s . c o m // get detailed course info Element tableDetailedInfoElement = tableElement.getElementsByClass("dddefault").first(); if (tableDetailedInfoElement != null) { // process seat info Elements tableSeatDetailElements = tableDetailedInfoElement.getElementsByAttributeValue( "summary", "This layout table is used to present the seating numbers."); if (tableSeatDetailElements.size() == 1) { Element tableSeatDetailElement = tableSeatDetailElements.first(); Elements tableSeatDetailEntryElements = tableSeatDetailElement.getElementsByTag("tbody") .first().children(); if (tableSeatDetailEntryElements.size() == 3 || tableSeatDetailEntryElements.size() == 4) { setSeats(entry, tableSeatDetailEntryElements.get(1).text()); setWaitlistSeats(entry, tableSeatDetailEntryElements.get(2).text()); if (tableSeatDetailEntryElements.size() == 4) { setCrosslistSeats(entry, tableSeatDetailEntryElements.get(3).text()); } } else { throw new HtmlParseException("Seat detail entry elements size not 3. We have " + tableSeatDetailEntryElements.size() + "."); } } else { throw new HtmlParseException( "Seat detail elements size not 1. 
We have " + tableSeatDetailElements.size() + "."); } // remove the seat info from detailed info tableSeatDetailElements.remove(); // remaining information setRemainingInfo(entry, tableDetailedInfoElement.html()); } else { throw new HtmlParseException("Detailed info element empty."); } } } else { // test empty Elements informationElements = document.getElementsByAttributeValue("summary", "This layout table holds message information"); if (!informationElements.isEmpty() && informationElements.text().contains("No detailed class information found")) { throw new CourseNotFoundException(informationElements.text()); } else { throw new HtmlParseException( "Course table not found, but page does not contain message stating no course found."); } } return entry; }
From source file:de.geeksfactory.opacclient.apis.Open.java
protected SearchRequestResult parse_search(Document doc, int page) throws OpacErrorException { searchResultDoc = doc;//from w ww . ja v a 2s . com if (doc.select("#Label1, span[id$=LblInfoMessage]").size() > 0) { String message = doc.select("#Label1, span[id$=LblInfoMessage]").text(); if (message.contains("keine Treffer")) { return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, page); } else { throw new OpacErrorException(message); } } int totalCount = Integer.parseInt(doc.select("span[id$=TotalItemsLabel]").first().text()); Elements elements = doc.select("div[id$=divMedium], div[id$=divComprehensiveItem]"); List<SearchResult> results = new ArrayList<>(); int i = 0; for (Element element : elements) { SearchResult result = new SearchResult(); // Cover if (element.select("input[id$=mediumImage]").size() > 0) { result.setCover(element.select("input[id$=mediumImage]").first().attr("src")); } else if (element.select("img[id$=CoverView_Image]").size() > 0) { result.setCover(getCoverUrl(element.select("img[id$=CoverView_Image]").first())); } Element catalogueContent = element.select(".catalogueContent").first(); // Media Type if (catalogueContent.select("#spanMediaGrpIcon").size() > 0) { String mediatype = catalogueContent.select("#spanMediaGrpIcon").attr("class"); if (mediatype.startsWith("itemtype ")) { mediatype = mediatype.substring("itemtype ".length()); } SearchResult.MediaType defaulttype = defaulttypes.get(mediatype); if (defaulttype == null) defaulttype = SearchResult.MediaType.UNKNOWN; if (data.has("mediatypes")) { try { result.setType(SearchResult.MediaType .valueOf(data.getJSONObject("mediatypes").getString(mediatype))); } catch (JSONException e) { result.setType(defaulttype); } } else { result.setType(defaulttype); } } else { result.setType(SearchResult.MediaType.UNKNOWN); } // Text String title = catalogueContent.select("a[id$=LbtnShortDescriptionValue], a[id$=LbtnTitleValue]") .text(); String subtitle = 
catalogueContent.select("span[id$=LblSubTitleValue]").text(); String author = catalogueContent.select("span[id$=LblAuthorValue]").text(); String year = catalogueContent.select("span[id$=LblProductionYearValue]").text(); String publisher = catalogueContent .select("span[id$=LblManufacturerValue], span[id$=LblPublisherValue]").text(); String series = catalogueContent.select("span[id$=LblSeriesValue]").text(); StringBuilder text = new StringBuilder(); text.append("<b>").append(title).append("</b>"); if (!subtitle.equals("")) text.append("<br/>").append(subtitle); if (!author.equals("")) text.append("<br/>").append(author); if (!year.equals("")) text.append("<br/>").append(year); if (!publisher.equals("")) text.append("<br/>").append(publisher); if (!series.equals("")) text.append("<br/>").append(series); result.setInnerhtml(text.toString()); // ID Pattern idPattern = Pattern.compile("\\$mdv(\\d+)\\$"); Matcher matcher = idPattern.matcher(catalogueContent.html()); if (matcher.find()) { result.setId(matcher.group(1)); } // Availability if (result.getId() != null) { String url = opac_url + "/DesktopModules/OCLC.OPEN.PL.DNN.SearchModule/SearchService" + ".asmx/GetAvailability"; String culture = element.select("input[name$=culture]").val(); JSONObject data = new JSONObject(); try { // Determine portalID value int portalId = 1; for (Element scripttag : doc.select("script")) { String scr = scripttag.html(); if (scr.contains("LoadSharedCatalogueViewAvailabilityAsync")) { Pattern portalIdPattern = Pattern .compile(".*LoadSharedCatalogueViewAvailabilityAsync\\([^,]*,[^,]*," + "[^0-9,]*([0-9]+)[^0-9,]*,.*\\).*"); Matcher portalIdMatcher = portalIdPattern.matcher(scr); if (portalIdMatcher.find()) { portalId = Integer.parseInt(portalIdMatcher.group(1)); } } } data.put("portalId", portalId).put("mednr", result.getId()).put("culture", culture) .put("requestCopyData", false).put("branchFilter", ""); StringEntity entity = new StringEntity(data.toString()); 
entity.setContentType(ContentType.APPLICATION_JSON.getMimeType()); String json = httpPost(url, entity, getDefaultEncoding()); JSONObject availabilityData = new JSONObject(json); String isAvail = availabilityData.getJSONObject("d").getString("IsAvail"); switch (isAvail) { case "true": result.setStatus(SearchResult.Status.GREEN); break; case "false": result.setStatus(SearchResult.Status.RED); break; case "digital": result.setStatus(SearchResult.Status.UNKNOWN); break; } } catch (JSONException | IOException e) { e.printStackTrace(); } } result.setNr(i); results.add(result); } return new SearchRequestResult(results, totalCount, page); }
From source file:com.storm.function.GsxtFunction.java
/**
 * Gathers the raw markup of every detail section that the Liaoning registry site
 * ({@code gsxt.lngs.gov.cn}) serves for the search hit whose link text contains
 * {@code keyword}, and returns it keyed by section name.
 *
 * <p>On a normal return the map always holds a {@code "statusCodeDef"} entry:
 * {@code FAILURE} when {@code firstInfoPage} is {@code null}; {@code IMAGECODE_ERROR} or
 * {@code NO_DATA_FOUND} when the result list is empty; {@code NO_DATA_FOUND} when no link
 * contains {@code keyword}; {@code SCCCESS} after every section has been fetched.
 *
 * <p>The section keys are pinyin-style abbreviations taken over unchanged from the
 * original code; their exact business meaning is not documented in this method.
 *
 * @param area          label that only appears in the opening log line
 * @param firstInfoPage search-result page to start from; may be {@code null}
 * @param keyword       text the link of the wanted search hit must contain
 * @param LOGGER        logger; {@code returnRedisResource()} is invoked on it before a normal return
 * @return insertion-ordered map from section key to markup (or to a list of nested maps)
 * @throws Exception whatever page navigation, URL parsing or the fixed sleep throws
 */
private Map<String, Object> getHtmlInfoMapOfLiaoning(String area, HtmlPage firstInfoPage, String keyword,
        ChannelLogger LOGGER) throws Exception {
    LOGGER.info("=========" + area + "=========" + keyword + "=========");
    Map<String, Object> resultHtmlMap = new LinkedHashMap<String, Object>();
    if (null == firstInfoPage) {
        resultHtmlMap.put("statusCodeDef", StatusCodeDef.FAILURE);
    } else {
        WebClient client = firstInfoPage.getWebClient();
        // Captured before any click so later windowed requests reuse the window that was current on entry.
        WebWindow webWindow = client.getCurrentWindow();
        final String HOST_OF_LIAONING = "http://gsxt.lngs.gov.cn";

        @SuppressWarnings("unchecked")
        List<HtmlAnchor> anchors = (List<HtmlAnchor>) firstInfoPage
                .getByXPath("//div[@id='listContent']/div/ul/li/a");
        HtmlElement emptyListBox = firstInfoPage.getFirstByXPath("//div[@class='list-a']");

        HtmlAnchor htmlAnchor = null;
        if (null == anchors || anchors.isEmpty()) {
            // NOTE(review): the "??" literal looks like encoding-damaged text (probably a
            // non-ASCII "no results" marker); kept byte-for-byte - confirm against the real page.
            boolean noData = null != emptyListBox && emptyListBox.asXml().contains("??");
            resultHtmlMap.put("statusCodeDef",
                    noData ? StatusCodeDef.NO_DATA_FOUND : StatusCodeDef.IMAGECODE_ERROR);
        } else {
            for (HtmlAnchor anchor : anchors) {
                if (anchor.getTextContent().trim().contains(keyword)) {
                    htmlAnchor = anchor;
                    break;
                }
            }
            if (null == htmlAnchor) {
                resultHtmlMap.put("statusCodeDef", StatusCodeDef.NO_DATA_FOUND);
                // NOTE(review): message text appears encoding-damaged; kept unchanged.
                LOGGER.info("????");
            }
        }

        if (null != htmlAnchor) {
            String pripid = "";
            String type = "";

            // The list item that wraps the matching link (two levels above the anchor).
            HtmlElement targetItemInfo = (HtmlElement) htmlAnchor.getParentNode().getParentNode();
            resultHtmlMap.put("target_item_info", targetItemInfo.asXml());

            // The onclick handler carries the identifiers as comma-separated, single-quoted
            // arguments: index 1 is used as "type", index 2 as "pripid". A handler with fewer
            // arguments still fails with ArrayIndexOutOfBoundsException, as before.
            String onclick = htmlAnchor.getAttribute("onclick");
            if (!StringUtils.isEmpty(onclick)) {
                String[] onclickArgs = onclick.split(",");
                pripid = onclickArgs[2].replace("'", "");
                type = onclickArgs[1].replace("'", "");
            }

            // Open the detail page and give it a fixed three seconds before reading it.
            HtmlPage detailPage = htmlAnchor.click();
            Thread.sleep(3000);
            Document detailDocument = Jsoup.parseBodyFragment(detailPage.asXml());
            Element basicInfo = detailDocument.getElementById("s_gs_dj_1");
            resultHtmlMap.put("gsgsxx_djxx_jbxx", null != basicInfo ? basicInfo.html() : null);

            // Follow every link in the tzr_itemContainer table and keep each resulting page.
            @SuppressWarnings("unchecked")
            List<HtmlAnchor> investorAnchors = (List<HtmlAnchor>) detailPage
                    .getByXPath("//tbody[@id='tzr_itemContainer']/tr/td/a");
            if (null != investorAnchors && !investorAnchors.isEmpty()) {
                List<Map<String, Object>> investorDetails = new ArrayList<Map<String, Object>>();
                for (HtmlAnchor investorAnchor : investorAnchors) {
                    Map<String, Object> investorDetail = new LinkedHashMap<String, Object>();
                    HtmlPage investorPage = investorAnchor.click();
                    investorDetail.put("gsgsxx_djxx_tzrxx_xq", investorPage.asXml());
                    investorDetails.add(investorDetail);
                }
                resultHtmlMap.put("gsgsxx_djxx_tzrxx_xqs", investorDetails);
            }

            // These four sections are fetched with JavaScript switched off. The finally block
            // guarantees the shared client gets JavaScript back even if one request throws.
            client.getOptions().setJavaScriptEnabled(false);
            try {
                resultHtmlMap.put("gsgsxx_djxx_tzrxx",
                        liaoningActionXml(client, null, "getTzrxxAction", pripid, type));
                resultHtmlMap.put("gsgsxx_djxx_bgxx",
                        liaoningActionXml(client, webWindow, "getBgxxAction", pripid, type));
                resultHtmlMap.put("gsgsxx_baxx_zyryxx",
                        liaoningActionXml(client, webWindow, "getZyryxxAction", pripid, type));
                // NOTE(review): this key is filled from getTzrxxAction, the same action as
                // "gsgsxx_djxx_tzrxx" above - looks like a copy-paste slip; the intended action
                // name cannot be determined from this code, so the request is left unchanged.
                resultHtmlMap.put("gsgsxx_baxx_zgbmxx",
                        liaoningActionXml(client, webWindow, "getTzrxxAction", pripid, type));
            } finally {
                client.getOptions().setJavaScriptEnabled(true);
            }

            resultHtmlMap.put("gsgsxx_baxx_fzjgxx",
                    liaoningActionXml(client, null, "getFgsxxAction", pripid, type));
            resultHtmlMap.put("gsgsxx_baxx_qsxx",
                    liaoningActionXml(client, null, "getQsxxAction", pripid, type));
            resultHtmlMap.put("gsgsxx_dcdydjxx_dcdydjxx",
                    liaoningActionXml(client, null, "getDcdydjAction", pripid, type));
            resultHtmlMap.put("gsgsxx_gqczdjxx_gqczdjxx",
                    liaoningActionXml(client, null, "getGsgsGqczxxAction", pripid, type));
            resultHtmlMap.put("gsgsxx_xzcfxx_xzcfxx",
                    liaoningActionXml(client, null, "getXzcfxxAction", pripid, type));
            resultHtmlMap.put("gsgsxx_jyycxx_jyycxx",
                    liaoningActionXml(client, null, "getJyycxxAction", pripid, type));
            resultHtmlMap.put("gsgsxx_yzwfxx_yzwfxx",
                    liaoningActionXml(client, null, "getYzwfxxAction", pripid, type));
            resultHtmlMap.put("gsgsxx_ccjcxx_ccjcxx",
                    liaoningActionXml(client, null, "getCcjcxxAction", pripid, type));

            // Annual-report list, then one detail page per listed report.
            HtmlPage reportListPage = liaoningActionPage(client, null, "getQygsQynbxxAction", pripid, type);
            resultHtmlMap.put("qygsxx_qynb_list_page", asXmlOrNull(reportListPage));
            List<Map<String, Object>> reportInfos = new ArrayList<Map<String, Object>>();
            if (null != reportListPage) {
                @SuppressWarnings("unchecked")
                List<HtmlElement> reportLinks = (List<HtmlElement>) reportListPage
                        .getByXPath("//tbody[@id='qynbItemContainer']/tr/td[2]/a");
                if (null != reportLinks) {
                    for (HtmlElement reportLink : reportLinks) {
                        Map<String, Object> reportInfo = new LinkedHashMap<String, Object>();
                        // The href is appended to the host as-is, so it is expected to be site-relative.
                        String reportHref = HOST_OF_LIAONING + reportLink.getAttribute("href");
                        reportInfo.put("qygsxx_qynb_list_a_href", reportHref);
                        reportInfo.put("qygsxx_qynb_list_a_text", reportLink.getTextContent());
                        HtmlPage reportPage = client.getPage(reportHref);
                        reportInfo.put("qygsxx_qynb_info_page", asXmlOrNull(reportPage));
                        reportInfos.add(reportInfo);
                    }
                }
            }
            resultHtmlMap.put("qygsxx_qynb_infos", reportInfos);

            resultHtmlMap.put("qygsxx_gdjczxx",
                    liaoningActionXml(client, null, "getQygsJsGdjczxxAction", pripid, type));
            // Previously the null guard for this page tested the *preceding* page variable;
            // each page is now guarded on its own inside the helper.
            resultHtmlMap.put("qygsxx_gdjczxx_bgxx",
                    liaoningActionXml(client, null, "getQygsJsGdjczbgxxAction", pripid, type));
            resultHtmlMap.put("qygsxx_gqbgxx",
                    liaoningActionXml(client, null, "getQygsJsGqbgxxAction", pripid, type));
            resultHtmlMap.put("qygsxx_xzxkxx",
                    liaoningActionXml(client, null, "getQygsJsXzxkxxAction", pripid, type));
            resultHtmlMap.put("qygsxx_zscqczdjxx",
                    liaoningActionXml(client, null, "getQygsJsZscqczxxAction", pripid, type));
            resultHtmlMap.put("qygsxx_xzcfxx",
                    liaoningActionXml(client, null, "getQygsJsXzcfxxAction", pripid, type));
            resultHtmlMap.put("sfxzgsxx_gqdjxx",
                    liaoningActionXml(client, null, "getSfgsGqdjxxAction", pripid, type));
            resultHtmlMap.put("sfxzgsxx_gdbgxx",
                    liaoningActionXml(client, null, "getSfgsGdbgxxAction", pripid, type));

            resultHtmlMap.put("statusCodeDef", StatusCodeDef.SCCCESS);
        }
    }
    // NOTE(review): not reached when an exception propagates; whether the caller releases the
    // resource in that case is not visible from here.
    LOGGER.returnRedisResource();
    return resultHtmlMap;
}

/**
 * Requests one {@code entPublicityDC} action of the Liaoning site for the given company.
 *
 * @param window window to load into, or {@code null} to let the client pick its default target
 */
private HtmlPage liaoningActionPage(WebClient client, WebWindow window, String action, String pripid,
        String type) throws Exception {
    String url = "http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/" + action
            + ".action?pripid=" + pripid + "&type=" + type;
    HtmlPage page;
    if (null == window) {
        page = client.getPage(url);
    } else {
        page = client.getPage(window, new WebRequest(new URL(url)));
    }
    return page;
}

/** Fetches one action page and returns its XML serialisation, or {@code null} if no page came back. */
private String liaoningActionXml(WebClient client, WebWindow window, String action, String pripid,
        String type) throws Exception {
    return asXmlOrNull(liaoningActionPage(client, window, action, pripid, type));
}

/** Null-safe wrapper around {@code HtmlPage.asXml()}. */
private String asXmlOrNull(HtmlPage page) {
    return null == page ? null : page.asXml();
}
From source file:com.dalthed.tucan.scraper.SingleEventScraper.java
/**
 * Consumes the iterator and, for every element whose {@code td} selection carries the
 * {@code tbdata} class, converts its {@code <p>} children into title/value pairs.
 *
 * <p>Each paragraph's inner HTML is passed to {@code crop}; a pair is kept only when its
 * second part is non-empty. If a page adapter is present, it receives a new
 * {@code TwoLinesAdapter} built from the collected pairs.
 *
 * @param informationIterator elements to inspect; fully drained by this call
 */
private void scrapeInformations(Iterator<Element> informationIterator) {
    while (informationIterator.hasNext()) {
        Element candidate = informationIterator.next();
        Elements cells = candidate.select("td");
        if (cells == null || !cells.hasClass("tbdata")) {
            continue;
        }

        ArrayList<String> titles = new ArrayList<String>();
        ArrayList<String> values = new ArrayList<String>();
        for (Element paragraph : candidate.select("p")) {
            String[] pair = crop(paragraph.html());
            // Skip entries that have a title but no value.
            if (pair[1].length() > 0) {
                titles.add(pair[0]);
                values.add(pair[1]);
            }
        }

        Log.i(LOG_TAG, "Informationscraper working");
        if (mPageAdapter == null) {
            continue;
        }
        Log.i(LOG_TAG, "InformationAdapter set");
        mPageAdapter.setAdapter(new TwoLinesAdapter(context, titles, values));
    }
}
From source file:com.github.irshulx.Components.InputExtensions.java
@Override public Node buildNodeFromHTML(Element element) { String text;//from w ww .j a v a2 s. co m int count; TextView tv; HtmlTag tag = HtmlTag.valueOf(element.tagName().toLowerCase()); switch (tag) { case h1: case h2: case h3: RenderHeader(tag, element); break; case p: case div: text = element.html(); count = editorCore.getParentView().getChildCount(); tv = insertEditText(count, null, text); applyStyles(tv, element); break; case blockquote: text = element.html(); count = editorCore.getParentView().getChildCount(); tv = insertEditText(count, null, text); UpdateTextStyle(EditorTextStyle.BLOCKQUOTE, tv); applyStyles(tv, element); } return null; }
From source file:com.near.chimerarevo.fragments.PostFragment.java
private void parseParagraphs(Elements ps) { for (Element p : ps) { if (!p.html().startsWith("&") && !p.html().startsWith("<iframe") && !p.html().startsWith("<!") && !p.html().contains("<h") && !p.html().contains("<ol") && !p.html().contains("<ul") && !p.html().contains("<pre") && !p.html().contains("<tr")) { parseNormalImages(p.select("img")); p.select("img").remove(); Elements lnks = p.getElementsByTag("a"); for (Element lnk : lnks) { if (lnk.attr("href").startsWith("#")) lnk.removeAttr("href"); }//from w ww.j av a2 s . c o m String txt = p.html().replace("<br />", "").replace("\n", "").trim(); if (txt.length() > 0) addText(txt, true, Typeface.DEFAULT); } } }