List of usage examples for org.jsoup.nodes.Document.html()
public String html()
From source file:org.b3log.symphony.service.LinkForgeMgmtService.java
/** * Forges the specified URL./* ww w. ja v a2 s . c o m*/ * * @param url the specified URL * @param userId the specified user id */ public void forge(final String url, final String userId) { String html; String baseURL; try { final Document doc = Jsoup.connect(url).timeout(5000).userAgent(Symphonys.USER_AGENT_BOT).get(); doc.select("body").prepend("<a href=\"" + url + "\">" + url + "</a>"); // Add the specified URL itfself html = doc.html(); baseURL = doc.baseUri(); } catch (final Exception e) { LOGGER.log(Level.ERROR, "Parses link [" + url + "] failed", e); return; } final List<JSONObject> links = Links.getLinks(baseURL, html); final List<JSONObject> cachedTags = tagCache.getTags(); final Transaction transaction = linkRepository.beginTransaction(); try { for (final JSONObject lnk : links) { final String addr = lnk.optString(Link.LINK_ADDR); JSONObject link = linkRepository.getLink(addr); if (null == link) { link = new JSONObject(); link.put(Link.LINK_ADDR, lnk.optString(Link.LINK_ADDR)); link.put(Link.LINK_BAD_CNT, 0); link.put(Link.LINK_BAIDU_REF_CNT, 0); link.put(Link.LINK_CLICK_CNT, 0); link.put(Link.LINK_GOOD_CNT, 0); link.put(Link.LINK_SCORE, 0); link.put(Link.LINK_SUBMIT_CNT, 0); link.put(Link.LINK_TITLE, lnk.optString(Link.LINK_TITLE)); link.put(Link.LINK_TYPE, Link.LINK_TYPE_C_FORGE); LOGGER.info(link.optString(Link.LINK_ADDR) + "____" + link.optString(Link.LINK_TITLE)); linkRepository.add(link); final JSONObject linkCntOption = optionRepository.get(Option.ID_C_STATISTIC_LINK_COUNT); final int linkCnt = linkCntOption.optInt(Option.OPTION_VALUE); linkCntOption.put(Option.OPTION_VALUE, linkCnt + 1); optionRepository.update(Option.ID_C_STATISTIC_LINK_COUNT, linkCntOption); } else { link.put(Link.LINK_BAIDU_REF_CNT, lnk.optInt(Link.LINK_BAIDU_REF_CNT)); link.put(Link.LINK_TITLE, lnk.optString(Link.LINK_TITLE)); link.put(Link.LINK_SCORE, lnk.optInt(Link.LINK_BAIDU_REF_CNT)); // XXX: Need a score algorithm linkRepository.update(link.optString(Keys.OBJECT_ID), 
link); } final String linkId = link.optString(Keys.OBJECT_ID); final double linkScore = link.optDouble(Link.LINK_SCORE, 0D); String title = link.optString(Link.LINK_TITLE) + " " + link.optString(Link.LINK_T_KEYWORDS); title = Pangu.spacingText(title); String[] titles = title.split(" "); titles = Strings.trimAll(titles); for (final JSONObject cachedTag : cachedTags) { final String tagId = cachedTag.optString(Keys.OBJECT_ID); final String tagTitle = cachedTag.optString(Tag.TAG_TITLE); if (!Strings.containsIgnoreCase(tagTitle, titles)) { continue; } final JSONObject tag = tagRepository.get(tagId); // clean tagUserLinkRepository.removeByTagIdUserIdAndLinkId(tagId, userId, linkId); // re-add final JSONObject tagLinkRel = new JSONObject(); tagLinkRel.put(Tag.TAG_T_ID, tagId); tagLinkRel.put(UserExt.USER_T_ID, userId); tagLinkRel.put(Link.LINK_T_ID, linkId); tagLinkRel.put(Link.LINK_SCORE, linkScore); tagUserLinkRepository.add(tagLinkRel); // refresh link score tagUserLinkRepository.updateTagLinkScore(tagId, linkId, linkScore); // re-calc tag link count final int tagLinkCnt = tagUserLinkRepository.countTagLink(tagId); tag.put(Tag.TAG_LINK_CNT, tagLinkCnt); tagRepository.update(tagId, tag); } } transaction.commit(); LOGGER.info("Forged link [" + url + "]"); } catch (final Exception e) { if (transaction.isActive()) { transaction.rollback(); } LOGGER.log(Level.ERROR, "Saves links failed", e); } }
From source file:org.b3log.symphony.util.MarkdownsTestCase.java
@Test public void jsoupParse() { final Document parse = Jsoup.parse("<p><strong><br>??????</strong></p>\n" + "<hr>\n" + "<p> <br> -? java <br> -? javascript<br> -???????<br> -?blog?<br> - <br> - </p>\n" + "<p> ???</p>\n" + "<p>1java ???a1?????<br> A ) a1.java B) a1.class C) a1 D) </p>\n" + "<p>2 Java<br> A) ????<br> B) ?????<br> C) ????<br> D)?</p>\n" + "<p>3 mainJava??main?<br> A)public static void main<br> B)public static void main String[] args <br> C)public static int mainString [] arg <br> D)public void mainString arg[] </p>\n" + "<p>4 Java??????????????????? <br>A)?? B) C)? D)Java??</p>\n" + "<p>5 A?BB?CJava?? </p>\n" + "<pre><code>1. A a0=new A(); \n" + "</code></pre><ol>\n" + "<li>A a1 =new B(); </li>\n" + "<li>A a2=new C();<br> <br>A)?1<br>B)1?23<br>C)1?2?32?3?<br>D)1?23 </li>\n" + "</ol>\n" + "<p>6 ? <br> 1 String s1=a?+b?;<br> 2 String s2=new Strings1<br>3 ifs1= =s2<br>4 System.out.println(= = is succeeded?);<br>5 if (s1.equals(s2))<br>6 System.out.println(.equals() is succeeded?);<br>A)46<br>B)46?<br>C)64?<br> D)4?6? </p>\n" + "<p>7 ??18??</p>\n" + "<p>A)int B) char C) varchar D)text </p>\n" + "<p>8?<br> A.)? B) C) D) </p>\n" + "<p>9 HTML?Javascript?<br> A)<javascript></javascript><br> B)<script></script><br> C) <head></head><br> D) <body </body></p>\n" + "<p>10 ?</p>\n" + "<p><input id=\"btnGo\" type=\"button\" value=\"?\" class=\"btn\"/><br>A) $("#btnGo")<br>B) $(".btnGo")<br>C) $(".btn")<br> D) $("input[type='button']")</p>\n" + "<p></p>\n" + "<p> <br>-Java?</p>\n" + "<p>-java?listMap,Set, Queue</p>\n" + "<p>-??<br>Spring:<br>springmvcstruts2):<br>Hibernatemybatis):<br>Jquery:<br>Bootstrap</p>\n" + "<p>-Javascript??</p>\n" + "<p>-sql<br>Select * from Table:<br>Where :<br>Having:<br>Group by:<br>Order by:</p>\n" + "<p> ?<br><br>-</p>\n" + "<p>-java(SSH)</p>\n" + "<p>-?</p>\n" + "<p>-?:</p>\n" + "<p>-?</p>\n" + "<hr>\n"); final String html = parse.html(); System.out.println(html);// ww w . 
j a va 2 s.c o m Assert.assertTrue(html.contains("<body < body>")); // Jsoup bug }
From source file:org.jasig.portlet.proxy.mvc.portlet.proxy.ProxyPortletController.java
@RenderMapping public void showContent(final RenderRequest request, final RenderResponse response) { final PortletPreferences preferences = request.getPreferences(); // locate the content service to use to retrieve our HTML content final String contentServiceKey = preferences.getValue(CONTENT_SERVICE_KEY, null); final IContentService contentService = applicationContext.getBean(contentServiceKey, IContentService.class); final IContentRequest proxyRequest; try {/*from w w w . j a v a 2 s . co m*/ proxyRequest = contentService.getRequest(request); } catch (RuntimeException e) { log.error("URL was not in the proxy list"); // TODO: how should we handle these errors? return; } // retrieve the HTML content final IContentResponse proxyResponse; try { proxyResponse = contentService.getContent(proxyRequest, request); } catch (Exception e) { log.error("Failed to proxy content", e); // TODO: error handling return; } // locate all filters configured for this portlet final List<IDocumentFilter> filters = new ArrayList<IDocumentFilter>(); final String[] filterKeys = preferences.getValues(FILTER_LIST_KEY, new String[] {}); for (final String filterKey : filterKeys) { final IDocumentFilter filter = applicationContext.getBean(filterKey, IDocumentFilter.class); filters.add(filter); } try { String sourceEncodingFormat = preferences.getValue(PREF_CHARACTER_ENCODING, CHARACTER_ENCODING_DEFAULT); final Document document = Jsoup.parse(proxyResponse.getContent(), sourceEncodingFormat, proxyResponse.getProxiedLocation()); // apply each of the document filters in order for (final IDocumentFilter filter : filters) { filter.filter(document, proxyResponse, request, response); } // write out the final content OutputStream out = null; try { out = response.getPortletOutputStream(); IOUtils.write(document.html(), out); out.flush(); } catch (IOException e) { log.error("Exception writing proxied content", e); } finally { IOUtils.closeQuietly(out); } } catch (IOException e) { log.error("Error parsing 
HTML content", e); } finally { if (proxyResponse != null) { proxyResponse.close(); } } }
From source file:org.structr.web.function.HttpGetFunction.java
@Override public Object apply(ActionContext ctx, final GraphObject entity, final Object[] sources) { if (sources != null && sources.length >= 1 && sources.length <= 4 && sources[0] != null) { try {// w w w.j a va 2 s . c o m String address = sources[0].toString(); String contentType = null; String username = null; String password = null; switch (sources.length) { case 4: password = sources[3].toString(); case 3: username = sources[2].toString(); case 2: contentType = sources[1].toString(); break; } //long t0 = System.currentTimeMillis(); if ("text/html".equals(contentType)) { HttpClient client = getHttpClient(); GetMethod get = new GetMethod(address); get.addRequestHeader("User-Agent", "curl/7.35.0"); get.addRequestHeader("Connection", "close"); get.getParams().setParameter("http.protocol.single-cookie-header", true); get.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); get.setFollowRedirects(true); client.executeMethod(get); final InputStream response = get.getResponseBodyAsStream(); // Skip BOM to workaround this Jsoup bug: https://github.com/jhy/jsoup/issues/348 String code = IOUtils.toString(response, "UTF-8"); if (code.charAt(0) == 65279) { code = code.substring(1); } final Document doc = Jsoup.parse(code); if (sources.length > 2) { return doc.select(sources[2].toString()).html(); } else { return doc.html(); } } else { return getFromUrl(ctx, address, username, password); } } catch (Throwable t) { logException(entity, t, sources); } return ""; } else { logParameterError(entity, sources, ctx.isJavaScriptContext()); } return usage(ctx.isJavaScriptContext()); }
From source file:org.uberfire.server.locale.GWTLocaleHeaderFilter.java
@Override public void doFilter(final ServletRequest request, final ServletResponse response, final FilterChain chain) throws IOException, ServletException { final CharResponseWrapper wrappedResponse = getWrapper((HttpServletResponse) response); chain.doFilter(request, wrappedResponse); final String output; final Locale locale = getLocale(request); final String injectedScript = "<meta name=\"gwt:property\" content=\"locale=" + locale.toString() + "\">"; final Document document = Jsoup.parse(wrappedResponse.toString()); document.head().append(injectedScript); output = document.html(); final byte[] outputBytes = output.getBytes("UTF-8"); response.setContentLength(outputBytes.length); response.getWriter().print(output);//from w w w .j a va 2 s. c o m }
From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java
/**
 * The main method for querying a {@link SearchEngine}. This method will be called by the metadict core on incoming
 * search queries. The core will always try to parallelize the query as much as possible according to the specified
 * supported dictionaries of this engine.
 * <p>
 * Upon calling, the core will make sure that the language parameters of this method correspond exactly to a
 * supported {@link Dictionary} as described in the engine's {@link FeatureSet}. However, an engine may also return
 * results from a different language. In this case, the core component will decide if the supplied results are
 * useful.
 * <p>
 * Example:
 * If the engine says it supports a one-way german-english dictionary, this method will be called with the language
 * parameters inputLanguage=GERMAN, outputLanguage=ENGLISH and allowBothWay=false.
 * However, if the engine supports a bidirectional german-english dictionary, this method will be called with the
 * language parameters inputLanguage=GERMAN, outputLanguage=ENGLISH and allowBothWay=true.
 *
 * @param queryInput
 *         The query string i.e. word that should be looked up.
 * @param inputLanguage
 *         The input language of the query. This language must be specified as a dictionary's input language of
 *         this engine.
 * @param outputLanguage
 *         The expected output language of the query. This language must be specified as the output language of the
 *         same dictionary to which the given inputLanguage belongs.
 * @param allowBothWay
 *         True, if the engine may search in both directions. I.e. the queryInput can also be seen as the
 *         outputLanguage. The core will set this flag only if the engine declared a dictionary with matching input
 *         and output language. Otherwise the method will be called for each direction separately.
 * @return The results from the search query. You can use an instance of {@link EngineQueryResultBuilder}
 * to build this result list.
 */
@Override
public EngineQueryResult executeSearchQuery(String queryInput, Language inputLanguage, Language outputLanguage,
        boolean allowBothWay) throws Exception {
    Connection targetConnection = buildTargetConnection(queryInput, inputLanguage, outputLanguage);
    Document doc = targetConnection.get();

    // Serializing the whole document is only worth doing when the output is actually logged.
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug(doc.html());
    }

    EngineQueryResultBuilder builder = processDocument(doc);
    return builder.build();
}
From source file:perflab.LoadrunnerWrapper.java
/** * @param htmlSummaryFile - load runner analysis html report file to parse * @param summaryFile - location of summary file to be generated out of loadrunner html analysis *//*from w ww. j a v a2 s. c o m*/ protected void parseSummaryFile(String htmlSummaryFile, String summaryFile) { try { File input = new File(htmlSummaryFile); Document document = Jsoup.parse(input, "UTF-8"); Document parse = Jsoup.parse(document.html()); Elements table = parse.select("table").select("[summary=Transactions statistics summary table]"); Elements rows = table.select("tr"); getLog().info("number of rows in summary file=" + rows.size()); for (Element row : rows) { //getLog().info("table element = " + row.toString()); String name = row.select("td[headers=LraTransaction Name]").select("span").text(); if (!name.isEmpty()) { float avgRT = Float.valueOf(row.select("td[headers=LraAverage]").select("span").text()); float minRT = Float.valueOf(row.select("td[headers=LraMinimum]").select("span").text()); float maxRT = Float.valueOf(row.select("td[headers=LraMaximum]").select("span").text()); int passed = Integer.valueOf(row.select("td[headers=LraPass]").select("span").text() .replace(".", "").replace(",", "")); int failed = Integer.valueOf(row.select("td[headers=LraFail]").select("span").text() .replace(".", "").replace(",", "")); int failedPrecentage = failed / (failed + passed) * 100; getLog().info("Saving Transaction [" + name + "]"); this.transactions.add( new LoadRunnerTransaction(name, minRT, avgRT, maxRT, passed, failed, failedPrecentage)); } } } catch (IOException e) { getLog().error("Can't read LoadRunner Analysis html report " + e.getMessage()); } }
From source file:perflab.loadrunnerwrapperjenkins.LoadRunnerWrapper.java
/** * @param htmlSummaryFile - load runner analysis html report file to parse * @param summaryFile - location of summary file to be generated out of loadrunner * html analysis//from www . j a v a2 s .com */ protected void parseSummaryFile(String htmlSummaryFile, String summaryFile) { try { File input = new File(htmlSummaryFile); Document document = Jsoup.parse(input, "UTF-8"); Document parse = Jsoup.parse(document.html()); Elements table = parse.select("table").select("[summary=Transactions statistics summary table]"); Elements rows = table.select("tr"); logger.println("number of rows in summary file=" + rows.size()); for (Element row : rows) { // logger.println("table element = " + row.toString()); String name = row.select("td[headers=LraTransaction Name]").select("span").text(); if (!name.isEmpty()) { float avgRT = Float.valueOf(row.select("td[headers=LraAverage]").select("span").text()); float minRT = Float.valueOf(row.select("td[headers=LraMinimum]").select("span").text()); float maxRT = Float.valueOf(row.select("td[headers=LraMaximum]").select("span").text()); int passed = Integer.valueOf(row.select("td[headers=LraPass]").select("span").text() .replace(".", "").replace(",", "")); int failed = Integer.valueOf(row.select("td[headers=LraFail]").select("span").text() .replace(".", "").replace(",", "")); // logger.println("Saving Transaction [" + name + "]"); this.transactions.add(new LoadRunnerTransaction(name, minRT, avgRT, maxRT, passed, failed)); } } } catch (IOException e) { logger.println("Can't read LoadRunner Analysis html report " + e.getMessage()); } }
From source file:psef.handler.HTMLFilter.java
/** * Filter the entire document/*w w w . j a v a 2s .c om*/ * @return the filtered document * @throws PsefException */ public String filter() throws PsefException { Document doc = Jsoup.parse(src); System.out.println("Filtering scripts"); filterScripts(doc); System.out.println("Filtering styles"); filterStyles(doc); System.out.println("Filtering links"); filterLinks(doc); System.out.println("Filtering anchors"); filterAnchors(doc); // write converted dom back to a string StringWriter sw = new StringWriter(src.length()); PrintWriter writer = new PrintWriter(sw); writer.write(doc.html()); writer.flush(); writer.close(); return sw.toString(); }
From source file:webcralwerproject1.Webcrawler.java
public String writeContent(Document htmlDocument) {// throws IOException { FileWriter fWriter = null;/* ww w . j a v a2 s . com*/ BufferedWriter writer = null; String path = null; try { File file = new File(DirectoryName + "/" + crawlcount); if (!file.exists()) { if (file.mkdir()) { System.out.println("Repository Directory is created!"); } else { System.out.println("Failed to create directory!"); } } File f = new File(file.getAbsolutePath() + "/" + MaxPage + "file.html"); path = f.getAbsolutePath(); Elements img = htmlDocument.getElementsByTag("img"); Elements srcc = htmlDocument.getElementsByAttribute("src"); for (Element el : img) { imagecount++; el.attr("src", "a"); } // System.out.println("Imagecount : " + imagecount ); FileUtils.writeStringToFile(f, htmlDocument.html(), "UTF-8"); } catch (Exception e) { System.out.println("Inside writeContent Exception " + e); } System.out.println("Inside writeContent "); return path; }