List of usage examples for org.jsoup.nodes Element text
public String text()
From source file:de.geeksfactory.opacclient.apis.Zones22.java
/**
 * Loads the account overview: lent media, reservations, pending fees and the
 * validity date of the library card.
 *
 * <p>After logging in, the summary cells are scanned for the counters and detail
 * links. The loans page is then fetched and parsed; the reservations page is only
 * fetched when the summary actually contained a link for it.
 *
 * @param acc the account to query
 * @return the collected account data, or {@code null} when {@code login(acc)} returns
 *         {@code null} or the summary has no "Entliehene Medien" entry
 * @throws IOException if fetching one of the pages fails
 */
@Override
public AccountData account(Account acc)
        throws IOException, NotReachableException, JSONException, SocketException, OpacErrorException {
    Document login = login(acc);
    if (login == null) {
        return null;
    }

    AccountData res = new AccountData(acc.getId());

    String lent_link = null;
    String res_link = null;
    int lent_cnt = -1;
    int res_cnt = -1;

    // Each summary row is a "name" cell followed by a sibling cell holding the value.
    for (Element td : login.select(
            ".AccountSummaryCounterNameCell, .AccountSummaryCounterNameCellStripe, .CAccountDetailFieldNameCellStripe, .CAccountDetailFieldNameCell")) {
        String section = td.text().trim();
        if (section.contains("Entliehene Medien")) {
            lent_link = td.select("a").attr("href");
            lent_cnt = Integer.parseInt(td.nextElementSibling().text().trim());
        } else if (section.contains("Vormerkungen")) {
            res_link = td.select("a").attr("href");
            res_cnt = Integer.parseInt(td.nextElementSibling().text().trim());
        } else if (section.contains("Kontostand")) {
            res.setPendingFees(td.nextElementSibling().text().trim());
        } else if (section.matches("Ausweis g.ltig bis")) {
            res.setValidUntil(td.nextElementSibling().text().trim());
        }
    }
    assert (lent_cnt >= 0);
    assert (res_cnt >= 0);

    if (lent_link == null) {
        return null;
    }

    // --- Lent media ---
    String lent_html = httpGet(opac_url + "/" + lent_link.replace("utf-8?Method", "utf-8&Method"),
            getDefaultEncoding());
    Document lent_doc = Jsoup.parse(lent_html);
    List<Map<String, String>> lent = new ArrayList<Map<String, String>>();

    SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy", Locale.GERMAN);
    Pattern id_pat = Pattern.compile("javascript:renewItem\\('[0-9]+','(.*)'\\)");

    for (Element table : lent_doc
            .select(".LoansBrowseItemDetailsCellStripe table, .LoansBrowseItemDetailsCell table")) {
        Map<String, String> item = new HashMap<String, String>();

        for (Element tr : table.select("tr")) {
            String desc = tr.select(".LoanBrowseFieldNameCell").text().trim();
            String value = tr.select(".LoanBrowseFieldDataCell").text().trim();
            if (desc.equals("Titel")) {
                item.put(AccountData.KEY_LENT_TITLE, value);
            }
            if (desc.equals("Verfasser")) {
                item.put(AccountData.KEY_LENT_AUTHOR, value);
            }
            if (desc.equals("Mediennummer")) {
                item.put(AccountData.KEY_LENT_BARCODE, value);
            }
            if (desc.equals("ausgeliehen in")) {
                item.put(AccountData.KEY_LENT_BRANCH, value);
            }
            if (desc.matches("F.+lligkeits.*datum")) {
                // Only the first whitespace-separated token is the date itself.
                value = value.split(" ")[0];
                item.put(AccountData.KEY_LENT_DEADLINE, value);
                try {
                    item.put(AccountData.KEY_LENT_DEADLINE_TIMESTAMP,
                            String.valueOf(sdf.parse(value).getTime()));
                } catch (ParseException e) {
                    // Best effort: the textual deadline is kept, only the timestamp is missing.
                    e.printStackTrace();
                }
            }
        }

        // The renew link is only taken when exactly one matching button exists.
        Elements renewButtons = table.select(".button[Title~=Zum]");
        if (renewButtons.size() == 1) {
            Matcher matcher1 = id_pat.matcher(renewButtons.attr("href"));
            if (matcher1.matches()) {
                item.put(AccountData.KEY_LENT_LINK, matcher1.group(1));
            }
        }

        lent.add(item);
    }
    res.setLent(lent);
    assert (lent_cnt <= lent.size());

    // --- Reservations ---
    List<Map<String, String>> reservations = new ArrayList<Map<String, String>>();
    // Without a "Vormerkungen" entry there is no page to load; previously this
    // requested opac_url + "/null".
    if (res_link != null) {
        String res_html = httpGet(opac_url + "/" + res_link, getDefaultEncoding());
        Document res_doc = Jsoup.parse(res_html);

        for (Element table : res_doc
                .select(".MessageBrowseItemDetailsCell table, .MessageBrowseItemDetailsCellStripe table")) {
            Map<String, String> item = new HashMap<String, String>();

            for (Element tr : table.select("tr")) {
                String desc = tr.select(".MessageBrowseFieldNameCell").text().trim();
                String value = tr.select(".MessageBrowseFieldDataCell").text().trim();
                if (desc.equals("Titel")) {
                    item.put(AccountData.KEY_RESERVATION_TITLE, value);
                }
                if (desc.equals("Publikationsform")) {
                    item.put(AccountData.KEY_RESERVATION_FORMAT, value);
                }
                if (desc.equals("Liefern an")) {
                    item.put(AccountData.KEY_RESERVATION_BRANCH, value);
                }
                if (desc.equals("Status")) {
                    item.put(AccountData.KEY_RESERVATION_READY, value);
                }
            }

            // Skip deleted reservations. The status word is "Gelöscht"; it is written
            // with a Unicode escape so it survives any source-file encoding.
            if ("Gel\u00f6scht".equals(item.get(AccountData.KEY_RESERVATION_READY))) {
                continue;
            }
            reservations.add(item);
        }
    }
    res.setReservations(reservations);
    assert (reservations.size() >= res_cnt);

    return res;
}
From source file:com.fluidops.iwb.provider.HTMLProvider.java
@Override public void gather(List<Statement> res) throws Exception { String url = config.url;/* w w w . ja v a 2 s .com*/ Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); Elements imports = doc.select("link[href]"); // Elements article = // doc.select("div.wrapper").select("div.box-shadow").select("div#content.cols").select("div.cl").select("div.crm").select("article").select("section.article").select("div.textblock").select("table"); Elements article = doc.getElementsByTag("tbody").select("tr"); Elements tableElem; URI nameURI = null; URI roadsURI = null; URI sideURI = null; URI totalURI = null; File file = new File("HTMLdata.txt"); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file))); out.println("Media"); print("\nMedia: (%d)", media.size()); for (Element el : media) { if (el.tagName().equals("img")) { print(" * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"), el.attr("height"), trim(el.attr("alt"), 20)); out.printf(" \n * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"), el.attr("height"), trim(el.attr("alt"), 20)); out.println(); } else { print(" * %s: <%s>", el.tagName(), el.attr("abs:src")); out.printf(" \n * %s: <%s>", el.tagName(), el.attr("abs:src")); out.println(); } } out.println("Imports"); print("\nImports: (%d)", imports.size()); for (Element link : imports) { print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); out.printf(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); out.println(); } out.println("Links"); print("\nLinks: (%d)", links.size()); for (Element link : links) { print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); out.printf(" * a: <%s> (%s)", link.attr("abs:href"), link.text()); out.println(); } /* * out.println("Custom text"); print("\nCustom: (%d)",customArt.size()); * for (Element custom:customArt){ * out.printf(" * a 
(%s): (%s)",custom.tagName(),custom.text()); * out.println(); } */ out.println("Article"); print("\nArticle: (%d)", article.size()); for (int i = 3; i < article.size() - 2; i++) { tableElem = article.get(i).select("td"); out.println(); if (i == 3) { nameURI = ProviderUtils.objectToUri(tableElem.get(0).text()); roadsURI = ProviderUtils.objectToUri(tableElem.get(1).text()); sideURI = ProviderUtils.objectToUri(tableElem.get(2).text()); totalURI = ProviderUtils.objectToUri(tableElem.get(3).text()); } else { res.add(ProviderUtils.createStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), RDF.TYPE, nameURI)); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), RDFS.LABEL, tableElem.get(0).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), roadsURI, tableElem.get(1).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), sideURI, tableElem.get(2).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), totalURI, tableElem.get(3).text())); for (Element el : tableElem) { out.printf("\n * (%s): (%s)", el.tagName(), el.text()); out.println(); } } out.println(); out.printf("\n * a (%s) (%d): (%s)", article.get(i).tagName(), tableElem.size(), article.get(i).text()); out.println(); } out.close(); }
From source file:be.ibridge.kettle.jsoup.JsoupInput.java
private Object[] buildRow() throws KettleException { // Create new row... Object[] outputRowData = buildEmptyRow(); if (data.readrow != null) outputRowData = data.readrow.clone(); // Read fields... for (int i = 0; i < data.nrInputFields; i++) { // Get field JsoupInputField field = meta.getInputFields()[i]; // get jsoup array for field Elements jsoupa = data.resultList.get(i); String nodevalue = null;// w w w . j a va 2 s .c o m if (jsoupa != null) { Element jo = jsoupa.get(data.recordnr); if (jo != null) { // Do Element Type switch (field.getElementType()) { case JsoupInputField.ELEMENT_TYPE_NODE: // Do Result Type switch (field.getResultType()) { case JsoupInputField.RESULT_TYPE_TEXT: nodevalue = jo.text(); break; case JsoupInputField.RESULT_TYPE_TYPE_OUTER_HTML: nodevalue = jo.outerHtml(); break; case JsoupInputField.RESULT_TYPE_TYPE_INNER_HTML: nodevalue = jo.html(); break; default: nodevalue = jo.toString(); break; } break; case JsoupInputField.ELEMENT_TYPE_ATTRIBUT: nodevalue = jo.attr(field.getAttribute()); break; default: nodevalue = jo.toString(); break; } } } // Do trimming switch (field.getTrimType()) { case JsoupInputField.TYPE_TRIM_LEFT: nodevalue = Const.ltrim(nodevalue); break; case JsoupInputField.TYPE_TRIM_RIGHT: nodevalue = Const.rtrim(nodevalue); break; case JsoupInputField.TYPE_TRIM_BOTH: nodevalue = Const.trim(nodevalue); break; default: break; } if (meta.isInFields()) { // Add result field to input stream outputRowData = RowDataUtil.addValueData(outputRowData, data.totalpreviousfields + i, nodevalue); } // Do conversions // ValueMetaInterface targetValueMeta = data.outputRowMeta.getValueMeta(data.totalpreviousfields + i); ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(data.totalpreviousfields + i); outputRowData[data.totalpreviousfields + i] = targetValueMeta.convertData(sourceValueMeta, nodevalue); // Do we need to repeat this field if it is null? 
if (meta.getInputFields()[i].isRepeated()) { if (data.previousRow != null && Const.isEmpty(nodevalue)) { outputRowData[data.totalpreviousfields + i] = data.previousRow[data.totalpreviousfields + i]; } } } // End of loop over fields... int rowIndex = data.nrInputFields; // See if we need to add the filename to the row... if (meta.includeFilename() && !Const.isEmpty(meta.getFilenameField())) { outputRowData[rowIndex++] = data.filename; } // See if we need to add the row number to the row... if (meta.includeRowNumber() && !Const.isEmpty(meta.getRowNumberField())) { outputRowData[rowIndex++] = new Long(data.rownr); } // Possibly add short filename... if (meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0) { outputRowData[rowIndex++] = data.shortFilename; } // Add Extension if (meta.getExtensionField() != null && meta.getExtensionField().length() > 0) { outputRowData[rowIndex++] = data.extension; } // add path if (meta.getPathField() != null && meta.getPathField().length() > 0) { outputRowData[rowIndex++] = data.path; } // Add Size if (meta.getSizeField() != null && meta.getSizeField().length() > 0) { outputRowData[rowIndex++] = new Long(data.size); } // add Hidden if (meta.isHiddenField() != null && meta.isHiddenField().length() > 0) { outputRowData[rowIndex++] = new Boolean(data.path); } // Add modification date if (meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0) { outputRowData[rowIndex++] = data.lastModificationDateTime; } // Add Uri if (meta.getUriField() != null && meta.getUriField().length() > 0) { outputRowData[rowIndex++] = data.uriName; } // Add RootUri if (meta.getRootUriField() != null && meta.getRootUriField().length() > 0) { outputRowData[rowIndex++] = data.rootUriName; } data.recordnr++; RowMetaInterface irow = getInputRowMeta(); data.previousRow = irow == null ? 
outputRowData : (Object[]) irow.cloneRow(outputRowData); // copy it to make // surely the next step doesn't change it in between... return outputRowData; }
From source file:org.shareok.data.sagedata.SageSourceDataHandlerImpl.java
private String[] getArticleKeyWordsFromFullTextDoc(Document doc) throws NoHtmlComponentsFoundException { String[] keys = null;// w w w .jav a2s.co m Elements keyElements = doc.select("div.hlFld-KeywordText"); if (null == keyElements || keyElements.isEmpty()) { return null; } Elements keyLinkElements = keyElements.get(0).select("a"); if (null == keyLinkElements || keyLinkElements.isEmpty()) { return null; } List<String> keyList = new ArrayList<>(); for (Element link : keyLinkElements) { keyList.add(link.text()); } if (keyList.size() > 0) { keys = keyList.toArray(new String[keyList.size()]); } return keys; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * remove any divs that looks like non-content, clusters of links, or paras with no gusto * * @param node/*from w w w.j av a2s . c om*/ * @return */ private Element cleanupNode(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting cleanup Node"); } node = addSiblings(node); Elements nodes = node.children(); for (Element e : nodes) { if (e.tagName().equals("p")) { continue; } if (logger.isDebugEnabled()) { logger.debug("CLEANUP NODE: " + e.id() + " class: " + e.attr("class")); } boolean highLinkDensity = isHighLinkDensity(e); if (highLinkDensity) { if (logger.isDebugEnabled()) { logger.debug("REMOVING NODE FOR LINK DENSITY: " + e.id() + " class: " + e.attr("class")); } e.remove(); continue; } // now check for word density // grab all the paragraphs in the children and remove ones that are too small to matter Elements subParagraphs = e.getElementsByTag("p"); for (Element p : subParagraphs) { if (p.text().length() < 25) { p.remove(); } } // now that we've removed shorty paragraphs let's make sure to exclude any first paragraphs that don't have paras as // their next siblings to avoid getting img bylines // first let's remove any element that now doesn't have any p tags at all Elements subParagraphs2 = e.getElementsByTag("p"); if (subParagraphs2.size() == 0 && !e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node because it doesn't have any paragraphs"); } e.remove(); continue; } //if this node has a decent enough gravityScore we should keep it as well, might be content int topNodeScore = getScore(node); int currentNodeScore = getScore(e); float thresholdScore = (float) (topNodeScore * .08); if (logger.isDebugEnabled()) { logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore + " threshold: " + thresholdScore); } if (currentNodeScore < thresholdScore) { if (!e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node due to low threshold score"); } 
e.remove(); } else { if (logger.isDebugEnabled()) { logger.debug("Not removing TD node"); } } continue; } } return node; }
From source file:org.shareok.data.sagedata.SageJournalIssueDateProcessor.java
public Map<String, Map<String, String>> updateSageJournalLinks(Map<String, Map<String, String>> journalMap) { Document doc = null;//from w w w . j a v a 2 s .c om try { doc = Jsoup.connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199") .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); Elements trs = doc.select("form#browsePublicationsForm").get(0).select("table").get(0).select("tbody") .get(0).select("tr"); for (Element tr : trs) { Element link = tr.select("td").get(1).select("a").get(0); String journalName = link.text(); String journalLink = SageDataUtil.SAGE_HTTP_PREFIX + link.attr("href"); String[] linkInfo = journalLink.split("/"); String journalIssuesLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/" + linkInfo[linkInfo.length - 1]; if (null == journalMap.get(journalName)) { Map<String, String> infoMap = new HashMap<>(); infoMap.put("homeLink", journalLink); infoMap.put("issueLink", journalIssuesLink); journalMap.put(journalName, infoMap); } else { Map<String, String> infoMap = journalMap.get(journalName); if (null == infoMap.get("homeLink")) { infoMap.put("homeLink", journalLink); } if (null == infoMap.get("issueLink")) { infoMap.put("issueLink", journalIssuesLink); } } } } catch (Exception ex) { ex.printStackTrace(); } return journalMap; }
From source file:crawler.HackerEarthCrawler.java
@Override public void crawl() { int flag = 0; //set of urls which should be crawled TreeSet<String> linksset = new TreeSet<String>(); TreeSet<String> tempset = new TreeSet<String>(); TreeSet<String> tutorialset = new TreeSet<String>(); //final set of problem urls TreeSet<String> problemset = new TreeSet<String>(); //visited for maintaing status of if url is already crawled or not TreeMap<String, Integer> visited = new TreeMap<String, Integer>(); //add base url linksset.add(baseUrl);/* w w w .jav a 2 s.co m*/ //mark base url as not crawled visited.put(baseUrl, 0); try { while (true) { flag = 0; tempset.clear(); for (String str : linksset) { //check if url is already crawled or not and it has valid domain name if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) { System.out.println("crawling " + str); //retriving response of current url as document Document doc = Jsoup.connect(str).timeout(0).userAgent( "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0") .referrer("http://www.google.com").ignoreHttpErrors(true).get(); //retriving all urls from current page Elements links = doc.select("a[href]"); //mark url as crawled visited.put(str, 1); //mark flag as url is crawled flag = 1; //retrive all urls for (Element link : links) { if (link.absUrl("href").endsWith("/tutorial/")) { tutorialset.add(link.absUrl("href")); } //check if url is problem url then add it in problemurlset if (link.absUrl("href").startsWith("https://www.hackerearth.com/") && isProblemUrl(link.absUrl("href"))) { problemset.add(link.absUrl("href")); } //check if url has valid domain and it has problem urls or not if (link.absUrl("href").contains(("https://www.hackerearth.com/")) && isCrawlable(link.absUrl("href"))) { //if link is not visited then mark it as uncrawled if (!visited.containsKey(link.absUrl("href"))) { visited.put(link.absUrl("href"), 0); } //add it in tempsetorary set tempset.add(link.absUrl("href")); //System.out.println("\n base: 
"+str+" ::: link : " + link.absUrl("href")); } } } } //if nothing is left to crawl break the loop if (flag == 0) { break; } //add all retrieved links to linksset linksset.addAll(tempset); } System.out.println("\n\ntotal problem urls " + problemset.size()); int i = 0; for (String str : problemset) { System.out.println("link " + i + " : " + str); i++; } } catch (IOException ex) { Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex); } //scrap and store into database //for every problem url scrap problem page for (String problemUrl : problemset) { System.out.println("problemUrl :" + problemUrl); try { //create problem class to store in database Problem problem = new Problem(); String problemSIOC = "", problemIOC = ""; String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "", problemConstraints = ""; String sampleInput = "", sampleOutput = ""; String problemExplanation = ""; //set default timelimit to 1 second double problemTimeLimit = 1.0; ArrayList<String> tags = new ArrayList<String>(); //get response for given problem url Response response = Jsoup.connect(problemUrl).execute(); Document doc = response.parse(); //retrieve problem title from page Element elementTitle = doc.getElementsByTag("title").first(); StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|"); problemTitle = stTitle.nextToken().trim(); Element content = doc.getElementsByClass("starwars-lab").first(); problemSIOC = content.text(); Elements e = content.children(); //to find problem statement String breakloop[] = { "input", "input:", "input :", "input format:", "input format :", "input format", "Input and output", "constraints :", "constraints:", "constraints", "$$Input :$$" }; flag = 0; for (Element p : e) { String tempStatement = ""; for (Element pp : p.getAllElements()) { for (String strbreak : breakloop) { if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) { //System.out.println("strbreak :"+strbreak); tempStatement 
= p.text().substring(0, p.text().toLowerCase().indexOf(strbreak.toLowerCase())); // System.out.println("temp "+tempStatement); flag = 1; break; } } } if (flag == 1) { problemStatement += tempStatement; //remove extra space at end if (tempStatement.length() == 0) { problemStatement = problemStatement.substring(0, problemStatement.length() - 1); } break; } problemStatement += p.text() + " "; } System.out.println("problemSIOC :" + problemSIOC); System.out.println("problemStatement :" + problemStatement); if (problemStatement.length() <= problemSIOC.length()) { //remove problem statement from whole text and remove extra spaces at the beginning and the end problemIOC = problemSIOC.substring(problemStatement.length()).trim(); } else { problemIOC = ""; } System.out.println("problemIOC :" + problemIOC); //keywords for identifying input String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:", "inputformat :", "inputformat", "input and output", "input :", "input:", "input" }; //keywords for identifying output String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:", "outputformat :", "outputformat", "output :", "output:", "output" }; //keywords for identifying constraint String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :", "constraint:", "constraint :", "constraint", "Contraints :" }; int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0, flagcon = 0, inlen = 0, outlen = 0, conlen = 0; //find inputformat position,length of keyword for (idxin = 0; idxin < decideInput.length; idxin++) { if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) { posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase()); flaginput = 1; inlen = decideInput[idxin].length(); //decide it is keyowrd for actucal input or it is "sample input" if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) { if (posin > 
problemIOC.toLowerCase().indexOf("sample input")) { flaginput = 0; inlen = 0; } else { break; } } else { break; } } } //find outputformat position,length of keyword for (idxout = 0; idxout < decideOutput.length; idxout++) { if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) { posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase()); flagoutput = 1; outlen = decideOutput[idxout].length(); break; } } //find constraint position,length of keyword for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) { if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) { poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase()); flagcon = 1; conlen = decideConstraint[idxcon].length(); break; } } System.out.println("input " + flaginput + " " + inlen + " " + posin); System.out.println("output " + flagoutput + " " + outlen + " " + posoutput); System.out.println("constraint " + flagcon + " " + conlen + " " + poscon); //retrieve problem input and output if present in problem page //if input format is present if (flaginput == 1) { //if input keyword is "input and output" and contraint is present in problem page if (idxin == 6 && flagcon == 1) { problemInput = problemIOC.substring(inlen, poscon); } //if input keyword is "input and output" and contraint is not present in problem page else if (idxin == 6 && flagcon == 0) { problemInput = problemIOC.substring(inlen); } //if output format and constraint is present else if (flagoutput == 1 && flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } //if constraint is present before sample else if (poscon < posoutput) { problemInput = problemIOC.substring(inlen, poscon); problemOutput = problemIOC.substring(posoutput + outlen); } else { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = 
problemIOC.substring(posoutput + outlen, poscon); } } //if constraint is not present else if (flagoutput == 1 && flagcon == 0) { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } else if (flagoutput == 0 && flagcon == 1) { if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen); } else { problemInput = problemIOC.substring(poscon + conlen, posin); } problemOutput = ""; } else { problemInput = problemIOC.substring(inlen); problemOutput = ""; } } //if input format and output format is not present else { problemInput = ""; problemOutput = ""; } //if constraint is present if (flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemConstraints = problemIOC.substring(0, posin); } //if constraint is present before output format else if (poscon < posoutput) { problemConstraints = problemIOC.substring(poscon + conlen, posoutput); } else { problemConstraints = problemIOC.substring(poscon + conlen); } } System.out.println("problemInput :" + problemInput); System.out.println("problemOutput :" + problemOutput); System.out.println("problemConstraints :" + problemConstraints); //retrieve problem tags from problem page Element elementtag = doc.getElementsByClass("problem-tags").first().child(1); StringTokenizer st = new StringTokenizer(elementtag.text(), ","); while (st.hasMoreTokens()) { tags.add(st.nextToken().trim()); } //retrieve sample input sample output if present Element elementSIO = doc.getElementsByClass("input-output-container").first(); //if sample input output is present if (elementSIO != null) { //find position of sample output int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT"); sampleInput = elementSIO.text().substring(12, soutpos); sampleOutput = elementSIO.text().substring(soutpos + 13); System.out.println("Sample input :\n" + sampleInput + "\n\n\n"); System.out.println("Sample Output :\n" + sampleOutput); } else { sampleInput = ""; 
sampleOutput = ""; } //retrieve problem explanation from problem page if present Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0); if (elementExplanation.text().toLowerCase().contains("explanation")) { problemExplanation = elementExplanation.nextElementSibling().text(); } System.out.println("Explanation :" + problemExplanation); //retrieve timelimit Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1); StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " "); problemTimeLimit = Double.parseDouble(stTL.nextToken()); //System.out.println("problemTimeLimit :"+problemTimeLimit); //set all retrieved information to problem class problem.setProblemUrl(problemUrl); if (problemTitle.length() == 0) { problemTitle = null; } if (problemStatement.length() == 0) { problemStatement = null; } if (problemInput.length() == 0) { problemInput = null; } if (problemOutput.length() == 0) { problemOutput = null; } if (problemExplanation.length() == 0) { problemExplanation = null; } if (problemConstraints.length() == 0) { problemConstraints = null; } problem.setTitle(problemTitle); problem.setProblemUrl(problemUrl); problem.setProblemStatement(problemStatement); problem.setInputFormat(problemInput); problem.setOutputFormat(problemOutput); problem.setTimeLimit(problemTimeLimit); problem.setExplanation(problemExplanation); problem.setConstraints(problemConstraints); //set sample input output to problem class SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput); problem.getSampleInputOutputs().add(sampleInputOutput); //set platform as hackerearth problem.setPlatform(Platform.HackerEarth); for (String strtag : tags) { problem.getTags().add(strtag); } //store in database Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem 
is already stored in database String hql = "FROM Problem p where p.problemUrl = :problem_url"; Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem problem.setId(oldProblem.getId()); session.delete(oldProblem); session.flush(); session.save(problem); } else { task = "saved"; session.save(problem); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, problem.getProblemUrl() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + problemUrl, e); } finally { //close the session if (session != null) { session.close(); } } } catch (Exception ee) { System.out.println(ee.toString()); } } System.out.println("\n\n\n\ntutorial urls\n\n"); try { for (String tutorialurl : tutorialset) { //System.out.println(tutorialurl+"\n\n"); Response tutorialres = Jsoup.connect(tutorialurl).execute(); Document doc = tutorialres.parse(); Tutorial tutorial = new Tutorial(); tutorial.setContent(doc.getElementsByClass("tutorial").first().text()); tutorial.setName(baseUrl); tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10); StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/"); String tempstr = ""; while (tutorialtok.hasMoreTokens()) { tempstr = tutorialtok.nextToken(); } Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem is already stored in database String hql = "FROM Tutorial p where p.name = :name"; Tutorial oldProblem = (Tutorial) 
session.createQuery(hql).setString("name", tempstr) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem tutorial.setName(oldProblem.getName()); session.delete(oldProblem); session.flush(); session.save(tutorial); } else { task = "saved"; tutorial.setName(tempstr); session.save(tutorial); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, tutorial.getName() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + tempstr, ee); } finally { //close the session if (session != null) { session.close(); } } } } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:org.shareok.data.sagedata.SageSourceDataHandlerImpl.java
private String getPublisherFromFullTextDoc(Document doc) throws NoHtmlComponentsFoundException { String publisher = null;//from w w w . j a va 2s .c om Elements headerTitleContainerElements = doc.select("div#headerTitleContainer"); if (null == headerTitleContainerElements || headerTitleContainerElements.isEmpty()) { throw new NoHtmlComponentsFoundException("Cannot find headerTitleContainer"); } Element headerTitleContainer = headerTitleContainerElements.get(0); String pub = headerTitleContainer.text(); if (null != pub) { publisher = pub; } return publisher; }
From source file:de.geeksfactory.opacclient.apis.Heidi.java
@Override public DetailledItem getResultById(String id, final String homebranch) throws IOException { if (sessid == null) { start();/*www . j a va 2s . c om*/ } // Homebranch if (homebranch != null && !"".equals(homebranch)) { cookieStore.addCookie(new BasicClientCookie("zweig", homebranch)); } String html = httpGet(opac_url + "/titel.cgi?katkey=" + id + "&sess=" + sessid, ENCODING, false, cookieStore); Document doc = Jsoup.parse(html); DetailledItem item = new DetailledItem(); item.setId(id); Elements table = doc.select(".titelsatz tr"); for (Element tr : table) { if (tr.select("th").size() == 0 || tr.select("td").size() == 0) { continue; } String d = tr.select("th").first().text(); String c = tr.select("td").first().text(); if (d.equals("Titel:")) { item.setTitle(c); } else if ((d.contains("URL") || d.contains("Link")) && tr.select("td a").size() > 0) { item.addDetail(new Detail(d, tr.select("td a").first().attr("href"))); } else { item.addDetail(new Detail(d, c)); } } if (doc.select(".ex table tr").size() > 0) { table = doc.select(".ex table tr"); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); for (Element tr : table) { if (tr.hasClass("exueber") || tr.select(".exsig").size() == 0 || tr.select(".exso").size() == 0 || tr.select(".exstatus").size() == 0) { continue; } Copy copy = new Copy(); copy.setShelfmark(tr.select(".exsig").first().text()); copy.setBranch(tr.select(".exso").first().text()); String status = tr.select(".exstatus").first().text(); if (status.contains("entliehen bis")) { copy.setReturnDate(fmt.parseLocalDate(status.replaceAll("entliehen bis ([0-9.]+) .*", "$1"))); copy.setReservations(status.replaceAll(".*\\(.*Vormerkungen: ([0-9]+)\\)", "$1")); copy.setStatus("entliehen"); } else { copy.setStatus(status); } item.addCopy(copy); } } for (Element a : doc.select(".status1 a")) { if (a.attr("href").contains("bestellung.cgi")) { item.setReservable(true); item.setReservation_info(id); break; } } for (Element 
a : doc.select(".titelsatz a")) { if (a.text().trim().matches("B.+nde")) { Map<String, String> volumesearch = new HashMap<>(); volumesearch.put("query", getQueryParamsFirst(a.attr("href")).get("query")); item.setVolumesearch(volumesearch); } } return item; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * adds any siblings that may have a decent score to this node * * @param node//ww w . j a v a 2s . c o m * @return */ private Element addSiblings(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting to add siblings"); } int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node); Element currentSibling = node.previousElementSibling(); while (currentSibling != null) { if (logger.isDebugEnabled()) { logger.debug("SIBLINGCHECK: " + debugNode(currentSibling)); } if (currentSibling.tagName().equals("p")) { node.child(0).before(currentSibling.outerHtml()); currentSibling = currentSibling.previousElementSibling(); continue; } // check for a paraph embedded in a containing element int insertedSiblings = 0; Elements potentialParagraphs = currentSibling.getElementsByTag("p"); if (potentialParagraphs.first() == null) { currentSibling = currentSibling.previousElementSibling(); continue; } for (Element firstParagraph : potentialParagraphs) { WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text()); int paragraphScore = wordStats.getStopWordCount(); if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) { if (logger.isDebugEnabled()) { logger.debug("This node looks like a good sibling, adding it"); } node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "<p>"); insertedSiblings++; } } currentSibling = currentSibling.previousElementSibling(); } return node; }