List of usage examples for org.jsoup.nodes.Element#text()
public String text()
From source file:org.jtotus.database.NetworkOP.java
public BigDecimal fetchData(String stockName, DateTime date, int col) { BigDecimal result = null;/* ww w . j a v a2 s . c o m*/ URL url; System.out.printf("NetworkOP fetchData(%s,hex:%s, date:%s col:%d)\n", stockName, new StockType(stockName).getHexName(), date.toString(), col); try { url = new URL(this.buildRequest(date, stockName)); Document doc = Jsoup.parse(url, 2 * 1000); Elements elems = doc.select("td"); Iterator<Element> iter = elems.iterator(); while (iter.hasNext()) { Element elem = iter.next(); String data = elem.html(); String datePattern = dateFormatter.print(date); //String formatHttp = "<div class=\"Ensimmainen\">\n" + datePattern + "\n</div>"; if (data.indexOf(datePattern) != -1) { for (int i = 0; i < col; i++) { elem = iter.next(); } data = elem.text(); String fdata = data.replace(',', '.'); if (debug) { System.out.printf("Fetched value from OP bank ->:%s for date:%s\n", fdata, datePattern); } return BigDecimal.valueOf(Double.valueOf(fdata).doubleValue()); } } } catch (IOException ex) { System.out.printf("Failed in :%s\n", "NetworkOP"); //Logger.getLogger(NetworkGoogle.class.getName()).log(Level.SEVERE, null, ex); } return result; }
From source file:org.jtotus.database.NetworkOP.java
private double[] fetchDataPeriod(String stockName, DateTime fromDate, DateTime toDate, int col) { List<Double> values = new ArrayList<Double>(); URL url;/*from w w w .jav a 2s. c o m*/ System.out.printf("NetworkOP fetchData(%s,hex:%s, date:%s-%s col:%d)\n", stockName, new StockType(stockName).getHexName(), fromDate.toString(), toDate.toString(), col); try { url = new URL(this.buildRequest(fromDate, toDate, stockName)); Document doc = Jsoup.parse(url, 2 * 1000); Elements elems = doc.select("td"); DateIterator dateIter = new DateIterator(fromDate, toDate); while (dateIter.hasNext()) { Iterator<Element> iter = elems.iterator(); String datePattern = dateFormatter.print(dateIter.nextInCalendar()); while (iter.hasNext()) { Element elem = iter.next(); String data = elem.html(); //System.out.printf("Fetching.. :%s\n", dateFormatter.print(dateIter.getCurrentAsCalendar())); //String formatHttp = "<div class=\"Ensimmainen\">\n" + datePattern + "\n</div>"; if (data.indexOf(datePattern) != -1) { for (int i = 0; i < col; i++) { elem = iter.next(); } data = elem.text(); String fdata = data.replace(',', '.'); if (debug) { System.out.printf("Fetched value from OP bank ->:%s for date:%s\n", fdata, datePattern); } values.add(Double.valueOf(fdata)); break; } } } } catch (IOException ex) { System.out.printf("Failed in :%s\n", "NetworkOP"); //Logger.getLogger(NetworkGoogle.class.getName()).log(Level.SEVERE, null, ex); } return ArrayUtils.toPrimitive(values.toArray(new Double[0])); }
From source file:org.jtotus.network.NordnetConnect.java
private StockTick parseAuthenticatedStream(String infoPage, String stockName) { StockTick tick = null;/*from w w w . j a v a2s . c o m*/ Document doc = Jsoup.parse(infoPage); Elements elements = doc.select("tr[class=first]"); doc = Jsoup.parse(elements.html()); elements = doc.select("td"); if (elements.size() != 15) { //not authenticated 13 return tick; } tick = new StockTick(); tick.setStockName(stockName); Iterator<Element> iter = elements.iterator(); for (int count = 0; iter.hasNext(); count++) { Element elem = iter.next(); log.info("Element value (" + count + "):" + elem.text()); switch (count) { case 3: if (!elem.text().equalsIgnoreCase("OMX Helsinki")) { System.err.printf("Data corruption in broker site? :%s for: %s\n", elem.text(), stockName); return null; } break; case 4://latest price tick.setLatestPrice(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 5://latest buy tick.setLatestBuy(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 6://latest sell tick.setLatestSell(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 7://latest Highest tick.setLatestHighest(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 8://latest Lowest tick.setLatestLowest(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 11://latest Lowest tick.setVolume(Double.parseDouble(elem.text().replace(" ", "").trim())); break; case 12://latest Lowest tick.setTradesSum(Double.parseDouble(elem.text().replace(" ", "").trim())); break; case 14://Time tick.setTime(elem.text().trim()); break; //TODO:currency and time default: log.info("Not matched(" + count + ") = " + elem.text()); break; } } log.info("StockTick:" + tick.toString()); return tick; }
From source file:org.jtotus.network.NordnetConnect.java
private StockTick parseNonAuthenticatedStream(String infoPage, String stockName) { StockTick tick = null;/*from w w w .j a v a2 s . c om*/ Document doc = Jsoup.parse(infoPage); Elements elements = doc.select("tr[class=first]"); doc = Jsoup.parse(elements.html()); elements = doc.select("td"); if (elements.size() != 13) { //not authenticated 13 return tick; } tick = new StockTick(); tick.setStockName(stockName); Iterator<Element> iter = elements.iterator(); for (int count = 0; iter.hasNext(); count++) { Element elem = iter.next(); System.out.printf("Non-Auth Element value (%d):%s for:%s\n", count, elem.text(), stockName); switch (count) { case 1: if (!elem.text().equalsIgnoreCase("OMX Helsinki")) { System.err.printf("Data corruption in broker site? :%s for: %s\n", elem.text(), stockName); return null; } break; case 2://latest price tick.setLatestPrice(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 3://latest buy tick.setLatestBuy(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 4://latest sell tick.setLatestSell(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 5://latest Highest tick.setLatestHighest(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 6://latest Lowest tick.setLatestLowest(Double.parseDouble(elem.text().replace(",", ".").trim())); break; case 9://Volume tick.setVolume(Double.parseDouble(elem.text().replace(" ", "").trim())); break; case 10://Trade Sum tick.setTradesSum(Double.parseDouble(elem.text().replace(" ", "").trim())); break; case 12://Time tick.setTime(elem.text().trim()); break; //TODO:currency and time default: System.out.printf("Not matched(%d) = %s \n", count, elem.text()); break; } } System.out.printf("StockTick:%s\n", tick.toString()); return tick; }
From source file:org.lockss.extractor.JsoupTagExtractor.java
/** * extract the values for the as defined by the selectors and store them in * article. These can be selectors or they can be css/jquery selection strings * metadata/*from w w w . ja v a 2 s.c o m*/ * @param doc the jsoup parsed doc * @param articleMeta the ArticleMetadata in which to store the selector/value(s) */ void extractSelectors(Document doc, ArticleMetadata articleMeta) { // if we don't have any selectors there is nothing to do, so we return if (m_selectors == null || m_selectors.isEmpty()) return; for (String selector : m_selectors) { String val; Elements elements = doc.select(selector); for (Element element : elements) { if (element.hasText()) { if (m_isHtml) { val = processHtml(selector, element.text()); } else { val = processXml(selector, element.text()); } if (theLog.isDebug3()) theLog.debug3("Add: " + selector + " = " + val); articleMeta.putRaw(selector, val); } } } }
From source file:org.lockss.extractor.JsoupXmlTagExtractor.java
/** * extract the values for the desired tags and store them in article metadata * @param doc the jsoup parsed doc//from www. j a v a 2 s .c o m * @param articleMeta the ArticleMetadata in which to store the tag/value(s) */ void extractTags(Document doc, ArticleMetadata articleMeta) { // if we don't have any tags, there is nothing to do so we return if (m_tags == null || m_tags.isEmpty()) return; for (String tag : m_tags) { String value; Elements tag_elements = doc.select(tag); for (Element tag_el : tag_elements) { if (tag_el.hasText()) { value = processXml(tag, tag_el.text()); articleMeta.putRaw(tag, value); } } } }
From source file:org.loklak.api.search.MeetupsCrawlerService.java
/**
 * Crawls a Meetup group page and returns what was found as a
 * {@link SusiThought} whose data array holds a single JSON object.
 *
 * <p>The object carries the Open Graph {@code content} values (title, type,
 * locality, country, latitude, longitude, image), the text of the
 * {@code groupDesc} element, the link texts inside {@code topic-box-2012} as
 * {@code group_topics}, and the {@code p} elements inside
 * {@code recentMeetups} grouped three at a time (date/time, attendance,
 * information) as {@code recent_meetups}.
 *
 * <p>Page sections that are absent are skipped instead of failing, and there
 * is no fixed upper bound on the number of topics or meetups.
 *
 * @param url address of the Meetup group page
 * @return the crawl result; its data array is empty when the page could not be
 *         fetched
 */
public static SusiThought crawlMeetups(String url) {
    JSONArray meetupsCrawlerResultArray = new JSONArray();
    SusiThought json = new SusiThought();

    Document meetupHTML;
    try {
        meetupHTML = Jsoup.connect(url).userAgent("Mozilla)").get();
    } catch (Exception e) {
        // Without a document there is nothing to extract; report an empty result
        // rather than dereferencing a null document below.
        e.printStackTrace();
        json.setData(meetupsCrawlerResultArray);
        return json;
    }

    JSONObject result = new JSONObject();

    result.put("group_name",
            meetupHTML.getElementsByAttributeValue("property", "og:title").attr("content"));
    result.put("meetup_type",
            meetupHTML.getElementsByAttributeValue("property", "og:type").attr("content"));

    // getElementById returns null when the id is not on the page.
    Element groupDesc = meetupHTML.getElementById("groupDesc");
    if (groupDesc != null) {
        result.put("group_description", groupDesc.text());
    }

    result.put("group_locality",
            meetupHTML.getElementsByAttributeValue("property", "og:locality").attr("content"));
    result.put("group_country_code",
            meetupHTML.getElementsByAttributeValue("property", "og:country-name").attr("content"));
    result.put("group_latitude",
            meetupHTML.getElementsByAttributeValue("property", "og:latitude").attr("content"));
    result.put("group_longitude",
            meetupHTML.getElementsByAttributeValue("property", "og:longitude").attr("content"));
    result.put("group_imageLink",
            meetupHTML.getElementsByAttributeValue("property", "og:image").attr("content"));

    JSONArray groupTopics = new JSONArray();
    Element topicBox = meetupHTML.getElementById("topic-box-2012");
    if (topicBox != null) {
        for (Element topicLink : topicBox.getElementsByTag("a")) {
            groupTopics.put(topicLink.text());
        }
    }
    result.put("group_topics", groupTopics);

    JSONArray recentMeetups = new JSONArray();
    Element recentSection = meetupHTML.getElementById("recentMeetups");
    if (recentSection != null) {
        Elements paragraphs = recentSection.getElementsByTag("p");
        // Paragraphs come in groups of three; a trailing group may be partial.
        int groupCount = (paragraphs.size() + 2) / 3;
        // NOTE(review): the last group is never emitted (k stops before
        // groupCount). This keeps the existing output unchanged; confirm
        // whether dropping the final meetup is intended.
        for (int k = 1; k < groupCount; k++) {
            int base = (k - 1) * 3;
            JSONObject obj = new JSONObject();
            obj.put("recent_meetup_number", k);
            obj.put("date_time", paragraphs.get(base).text());
            obj.put("attendance", paragraphs.get(base + 1).text());
            obj.put("information", paragraphs.get(base + 2).text());
            recentMeetups.put(obj);
        }
    }
    result.put("recent_meetups", recentMeetups);

    meetupsCrawlerResultArray.put(result);
    json.setData(meetupsCrawlerResultArray);
    return json;
}
From source file:org.loklak.api.search.WordpressCrawlerService.java
public static SusiThought crawlWordpress(String blogURL) { Document blogHTML = null;//from w ww . j a v a 2 s . c om Elements articles = null; Elements articleList_title = null; Elements articleList_content = null; Elements articleList_dateTime = null; Elements articleList_author = null; String[][] blogPosts = new String[100][4]; // blogPosts[][0] = Blog Title // blogPosts[][1] = Posted On // blogPosts[][2] = Author // blogPosts[][3] = Blog Content Integer numberOfBlogs = 0; Integer iterator = 0; try { blogHTML = Jsoup.connect(blogURL).get(); } catch (IOException e) { e.printStackTrace(); } articles = blogHTML.getElementsByTag("article"); iterator = 0; for (Element article : articles) { articleList_title = article.getElementsByClass("entry-title"); for (Element blogs : articleList_title) { blogPosts[iterator][0] = blogs.text().toString(); } articleList_dateTime = article.getElementsByClass("posted-on"); for (Element blogs : articleList_dateTime) { blogPosts[iterator][1] = blogs.text().toString(); } articleList_author = article.getElementsByClass("byline"); for (Element blogs : articleList_author) { blogPosts[iterator][2] = blogs.text().toString(); } articleList_content = article.getElementsByClass("entry-content"); for (Element blogs : articleList_content) { blogPosts[iterator][3] = blogs.text().toString(); } iterator++; } numberOfBlogs = iterator; JSONArray blog = new JSONArray(); for (int k = 0; k < numberOfBlogs; k++) { JSONObject blogpost = new JSONObject(); blogpost.put("blog_url", blogURL); blogpost.put("title", blogPosts[k][0]); blogpost.put("posted_on", blogPosts[k][1]); blogpost.put("author", blogPosts[k][2]); blogpost.put("content", blogPosts[k][3]); blog.put(blogpost); } SusiThought json = new SusiThought(); json.setData(blog); return json; }
From source file:org.neo4j.browser.CannedCypherExecutionTest.java
@Test public void shouldBeAbleToExecuteAllTheCannedCypherQueriesContainedInStaticHtmlFiles() throws Exception { URL resourceLoc = getClass().getClassLoader().getResource("browser"); assertNotNull(resourceLoc);//from ww w. j a va 2s. c o m final AtomicInteger explainCount = new AtomicInteger(0); final AtomicInteger executionCount = new AtomicInteger(0); Files.walkFileTree(Paths.get(resourceLoc.toURI()), new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attributes) throws IOException { final GraphDatabaseService database = new TestGraphDatabaseFactory().newImpermanentDatabase(); String fileName = file.getFileName().toString(); if (fileName.endsWith(".html")) { String content = FileUtils.readTextFile(file.toFile(), Charsets.UTF_8); Elements cypherElements = Jsoup.parse(content).select("pre.runnable") .not(".standalone-example"); for (Element cypherElement : cypherElements) { String statement = replaceAngularExpressions(cypherElement.text()); if (!statement.startsWith(":")) { if (shouldExplain(statement)) { try (Transaction transaction = database.beginTx()) { Iterable<Notification> actual = database.execute(prependExplain(statement)) .getNotifications(); boolean skipKnownInefficientCypher = !cypherElement.parent().select(".warn") .isEmpty(); if (skipKnownInefficientCypher) { List<Notification> targetCollection = new ArrayList<Notification>(); CollectionUtils.addAll(targetCollection, actual); CollectionUtils.filter(targetCollection, new org.apache.commons.collections4.Predicate<Notification>() { @Override public boolean evaluate(Notification notification) { return notification.getDescription() .contains(NotificationCode.CARTESIAN_PRODUCT .values().toString()); } }); assertThat( format("Query [%s] should only produce cartesian product " + "notifications. 
[%s]", statement, fileName), targetCollection, empty()); explainCount.incrementAndGet(); transaction.success(); } else { assertThat(format("Query [%s] should produce no notifications. [%s]", statement, fileName), actual, is(emptyIterable())); explainCount.incrementAndGet(); transaction.success(); } } catch (QueryExecutionException e) { throw new AssertionError( format("Failed to explain query [%s] in file [%s]", statement, file), e); } } try (Transaction transaction = database.beginTx()) { database.execute(statement); executionCount.incrementAndGet(); transaction.success(); } catch (QueryExecutionException e) { throw new AssertionError( format("Failed to execute query [%s] in file [%s]", statement, file), e); } } } } return FileVisitResult.CONTINUE; } }); assertTrue("Static files should contain at least one valid cypher statement", executionCount.intValue() >= 1); System.out.printf("Explained %s cypher statements extracted from HTML files, with no notifications.%n", explainCount); System.out.printf("Executed %s cypher statements extracted from HTML files, with no errors.%n", executionCount); }
From source file:org.norvelle.addressdiscoverer.parse.structured.StructuredPageEmailContactLink.java
/**
 * Looks for an email address first in the element's HTML (so that element
 * attributes are searched too) and, if none is found there, in its plain
 * text (in case the HTML has been scrambled to obfuscate the address).
 *
 * @param element the element to search for a contact link
 * @throws DoesNotContainContactLinkException if neither the HTML nor the text
 *         contains a contact link
 * @throws MultipleContactLinksOfSameTypeFoundException
 */
public StructuredPageEmailContactLink(Element element)
        throws DoesNotContainContactLinkException, MultipleContactLinksOfSameTypeFoundException {
    super(element);
    String markup = element.html();
    String found;
    try {
        found = this.findLinkInString(markup);
    } catch (DoesNotContainContactLinkException notInMarkup) {
        // Fall back to the rendered text when the markup yields nothing.
        found = this.findLinkInString(element.text());
    }
    this.address = found;
}