List of usage examples for org.jsoup.select Elements html
public String html()
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Extracts the shipping weight from the parsed product page.
 *
 * <p>Selects the nodes matching {@code PathMapping.WEIGHT}, removes the {@code <b>} and
 * {@code <a>} elements found in that selection, strips parentheses from the remaining
 * markup and parses the first space-separated token as a number.
 *
 * @return the parsed weight, or {@code null} when the first token is blank or is not a
 *         valid number
 */
private Double findShippingWeight() {
    Elements weight = doc.select(PathMapping.WEIGHT);
    weight.select("b").remove();
    weight.select("a").remove();

    String str = weight.html().replace("(", "").replace(")", "").split(" ")[0];
    try {
        if (StringUtils.isNotBlank(str)) {
            return Double.valueOf(str);
        }
    } catch (NumberFormatException ignored) {
        // Best effort: a token that is not numeric simply means no weight is available.
        // Only the parse failure is tolerated; any other runtime problem now surfaces.
    }
    return null;
}
From source file:fusion.Fusion.java
private static boolean isSynonym(Value val1, Value val2) throws IOException { boolean isSyn = false; String thesaurusUrl = "http://words.bighugelabs.com/api/2/92eae7f933f0f63404b3438ca46861e5/" + val1.getValue() + "/xml"; Document doc = Jsoup.connect(thesaurusUrl).get(); Elements synonyms = doc.select("w"); String syn = synonyms.html(); String[] synonymsArray = syn.split("\n"); ArrayList<String> synonymsList = new ArrayList<String>(Arrays.asList(synonymsArray)); if (synonymsList.contains(val2.getValue())) { val1.addToSynonyms(val2); val2.addToSynonyms(val1); isSyn = true;//w w w . j ava 2 s . c o m } return isSyn; }
From source file:com.gote.downloader.kgs.KGSDownloader.java
/** * Check if a game is public, if yes, then the URL of that game will be sent back. * /* w ww. j av a 2 s. com*/ * @param pCell Element which represents the first KGS archives column * @return link of the SGF or null */ public String isPublicGame(Element pCell) { Elements a = pCell.getElementsByTag("a"); if (a != null && a.size() > 0) { // Check if it is a visible game if (a.html().equals(KGSUtils.KGS_TAG_FR_YES)) { return a.attr("href"); } } return null; }
From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java
/** * ?page??/*w w w .j a v a 2s . co m*/ */ @Override public void visit(Page page) { try { String url = page.getWebURL().getURL(); page.setContentType("text/html; charset=" + gather.getEncoding()); Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get(); String title = doc.title(); if (gather.getTitleExternal() && gather.getTitleRegex() != null && gather.getTitleRegex().length() > 0) { Elements titleEles = doc.select(gather.getTitleRegex()); if (!titleEles.isEmpty()) { String tempTitle = titleEles.text(); if (tempTitle != null && tempTitle.length() > 0) { title = tempTitle; } } } if (title != null && title.trim().length() > 0) { Elements elements = doc.select(matchRegex); if (filterRegex != null && filterRegex.trim().length() > 0) { elements = elements.not(filterRegex); } if (!elements.isEmpty()) { String subHtml = elements.html(); Document blockDoc = Jsoup.parse(subHtml); String contentText = blockDoc.html(); if (gather.getRemoveHref()) { Document moveDoc = Jsoup.parse(contentText); Elements moveEles = moveDoc.select("*").not("a"); contentText = moveEles.html(); } if (gather.getRemoveHtmlTag()) contentText = doc.text(); if (isLocal) { contentText = doc.text(); Boolean isMatcher = true; for (int i = 0; i < keys.length; i++) { Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find(); if (!result) { isMatcher = false; break; } } if (isMatcher) { Storage storage = new Storage(); storage.setGatherId(gather.getId()); storage.setGatherName(gather.getName()); storage.setTitle(title); storage.setUrl(url); try { gatherService.addStorage(storage); } catch (Exception e) { logger.error("save storage error : {}", e.getLocalizedMessage()); } finally { storage = null; } } } else { Content content = new Content(); content.setDetail(contentText); content.setPage(1); List<Content> contents = new ArrayList<Content>(); contents.add(content); Article article = new Article(); article.setTitle(title); article.setContents(contents); 
articleMainService.addArticleMainByCrawler(article, gather.getChannelId(), CrawlerUtil.USER_NAME); } } } } catch (IOException e) { logger.warn(e.getLocalizedMessage()); } }
From source file:com.dmrr.asistenciasx.Horarios.java
private void jButton1ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton1ActionPerformed try {/*from w w w. ja v a 2 s .com*/ int x = jTableHorarios.getSelectedRow(); if (x == -1) { JOptionPane.showMessageDialog(this, "Seleccione un profesor primero", "Datos incompletos", JOptionPane.WARNING_MESSAGE); return; } Integer idProfesor = Integer .parseInt((String) jTableHorarios.getValueAt(jTableHorarios.getSelectedRow(), 1)); JPasswordField pf = new JPasswordField(); String nip = ""; int okCxl = JOptionPane.showConfirmDialog(null, pf, "Introduzca el NIP del jefe del departamento", JOptionPane.OK_CANCEL_OPTION, JOptionPane.PLAIN_MESSAGE); if (okCxl == JOptionPane.OK_OPTION) { nip = new String(pf.getPassword()); } else { return; } org.jsoup.Connection.Response respuesta = Jsoup .connect("http://siiauescolar.siiau.udg.mx/wus/gupprincipal.valida_inicio") .data("p_codigo_c", "2225255", "p_clave_c", nip).method(org.jsoup.Connection.Method.POST) .timeout(0).execute(); Document login = respuesta.parse(); String sessionId = respuesta.cookie(getFecha() + "SIIAUSESION"); String sessionId2 = respuesta.cookie(getFecha() + "SIIAUUDG"); Document listaHorarios = Jsoup.connect("http://siiauescolar.siiau.udg.mx/wse/sspsecc.consulta_oferta") .data("ciclop", "201510", "cup", "J", "deptop", "", "codprofp", "" + idProfesor, "ordenp", "0", "mostrarp", "1000", "tipop", "T", "secp", "A", "regp", "T") .userAgent("Mozilla").cookie(getFecha() + "SIIAUSESION", sessionId) .cookie(getFecha() + "SIIAUUDG", sessionId2).timeout(0).post(); Elements tabla = listaHorarios.select("body"); tabla.select("style").remove(); Elements font = tabla.select("font"); font.removeAttr("size"); System.out.println(tabla.html()); JEditorPane jEditorPane = new JEditorPane(); jEditorPane.setEditable(false); HTMLEditorKit kit = new HTMLEditorKit(); jEditorPane.setEditorKit(kit); javax.swing.text.Document doc = kit.createDefaultDocument(); jEditorPane.setDocument(doc); jEditorPane.setText(tabla.html()); 
JOptionPane.showMessageDialog(null, jEditorPane); } catch (IOException ex) { Logger.getLogger(Horarios.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:org.jresponder.message.MessageRefImpl.java
/** * Render a message in the context of a particular subscriber * and subscription./*from w ww. ja v a 2s . com*/ */ @Override public boolean populateMessage(MimeMessage aMimeMessage, SendConfig aSendConfig, Subscriber aSubscriber, Subscription aSubscription) { try { // prepare context Map<String, Object> myRenderContext = new HashMap<String, Object>(); myRenderContext.put("subscriber", aSubscriber); myRenderContext.put("subscription", aSubscription); myRenderContext.put("config", aSendConfig); myRenderContext.put("message", this); // render the whole file String myRenderedFileContents = TextRenderUtil.getInstance().render(fileContents, myRenderContext); // now parse again with Jsoup Document myDocument = Jsoup.parse(myRenderedFileContents); String myHtmlBody = ""; String myTextBody = ""; // html body Elements myBodyElements = myDocument.select("#htmlbody"); if (!myBodyElements.isEmpty()) { myHtmlBody = myBodyElements.html(); } // text body Elements myJrTextBodyElements = myDocument.select("#textbody"); if (!myJrTextBodyElements.isEmpty()) { myTextBody = TextUtil.getInstance().getWholeText(myJrTextBodyElements.first()); } // now build the actual message MimeMessage myMimeMessage = aMimeMessage; // wrap it in a MimeMessageHelper - since some things are easier with that MimeMessageHelper myMimeMessageHelper = new MimeMessageHelper(myMimeMessage); // set headers // subject myMimeMessageHelper.setSubject(TextRenderUtil.getInstance() .render((String) propMap.get(MessageRefProp.JR_SUBJECT.toString()), myRenderContext)); // TODO: implement DKIM, figure out subetha String mySenderEmailPattern = aSendConfig.getSenderEmailPattern(); String mySenderEmail = TextRenderUtil.getInstance().render(mySenderEmailPattern, myRenderContext); myMimeMessage.setSender(new InternetAddress(mySenderEmail)); myMimeMessageHelper.setTo(aSubscriber.getEmail()); // from myMimeMessageHelper.setFrom( TextRenderUtil.getInstance() .render((String) propMap.get(MessageRefProp.JR_FROM_EMAIL.toString()), 
myRenderContext), TextRenderUtil.getInstance() .render((String) propMap.get(MessageRefProp.JR_FROM_NAME.toString()), myRenderContext)); // see how to set body // if we have both text and html, then do multipart if (myTextBody.trim().length() > 0 && myHtmlBody.trim().length() > 0) { // create wrapper multipart/alternative part MimeMultipart ma = new MimeMultipart("alternative"); myMimeMessage.setContent(ma); // create the plain text BodyPart plainText = new MimeBodyPart(); plainText.setText(myTextBody); ma.addBodyPart(plainText); // create the html part BodyPart html = new MimeBodyPart(); html.setContent(myHtmlBody, "text/html"); ma.addBodyPart(html); } // if only HTML, then just use that else if (myHtmlBody.trim().length() > 0) { myMimeMessageHelper.setText(myHtmlBody, true); } // if only text, then just use that else if (myTextBody.trim().length() > 0) { myMimeMessageHelper.setText(myTextBody, false); } // if neither text nor HTML, then the message is being skipped, // so we just return null else { return false; } return true; } catch (MessagingException e) { throw new RuntimeException(e); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } }
From source file:org.brunocvcunha.taskerbox.impl.jobs.MonsterJobSeeker.java
private boolean handleJob(String jobTitle, String jobEmployer, String location, String jobUrl) throws JSONException, ClientProtocolException, IOException, URISyntaxException { if (alreadyPerformedAction(jobUrl)) { return true; }// w w w.j a v a 2 s.c o m String headline = jobUrl + " - " + location + " - " + jobTitle + " - " + jobEmployer; if (!considerTitle(jobTitle)) { logInfo(log, "-- Ignored [title] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerEmployer(jobEmployer)) { logInfo(log, "-- Ignored [employer] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerLocation(location)) { logInfo(log, "-- Ignored [location] " + headline); addAlreadyPerformedAction(jobUrl); return true; } try { Thread.sleep(1000L); } catch (InterruptedException e) { e.printStackTrace(); } HttpEntity jobEntity = TaskerboxHttpBox.getInstance().getEntityForURL(jobUrl); String jobResult = TaskerboxHttpBox.getInstance().readResponseFromEntity(jobEntity); Document jobDocument = Jsoup.parse(jobResult); Elements elDescription = jobDocument.select("div#jobBodyContent"); if (!jobDocument.html().contains("ApplyOnlineUrl: ''") && !jobDocument.html().contains("ApplyOnlineUrl: 'http://my.monster.com") && !this.externalApply) { logInfo(log, "-- Ignored [externalApply] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerVisaDescription(elDescription.html())) { logInfo(log, "-- Ignored [visa] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerExperienceDescription(elDescription.html())) { logInfo(log, "-- Ignored [exp] " + headline); addAlreadyPerformedAction(jobUrl); return true; } ScorerResult result = LinkedInJobDBComparer.getScore(elDescription.html()); if (result.getScore() < this.requiredScore) { logInfo(log, "-- Ignored [scorer] " + result.getScore() + " - " + result.getMatches() + " - " + headline); addAlreadyPerformedAction(jobUrl); return true; } headline = headline + " - " + 
result.getMatches(); logInfo(log, "Open --> " + headline); // logInfo(log, elDescription.html()); performUnique(jobUrl); try { Thread.sleep(5000L); } catch (InterruptedException e) { e.printStackTrace(); } return true; }
From source file:org.brunocvcunha.taskerbox.impl.jobs.DiceJobSeeker.java
private boolean handleJob(String jobTitle, String jobEmployer, String location, String jobUrl) throws JSONException, ClientProtocolException, IOException, URISyntaxException { if (alreadyPerformedAction(jobUrl)) { return true; }/* w ww. j ava2 s . c om*/ String headline = jobUrl + " - " + location + " - " + jobTitle + " - " + jobEmployer; System.out.println(headline); if (!considerTitle(jobTitle)) { logInfo(log, "-- Ignored [title] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerEmployer(jobEmployer)) { logInfo(log, "-- Ignored [employer] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerLocation(location)) { logInfo(log, "-- Ignored [location] " + headline); addAlreadyPerformedAction(jobUrl); return true; } HttpEntity jobEntity = TaskerboxHttpBox.getInstance().getEntityForURL(jobUrl); String jobResult = TaskerboxHttpBox.getInstance().readResponseFromEntity(jobEntity); Document jobDocument = Jsoup.parse(jobResult); Elements elDescription = jobDocument.select("div.job_description"); if (elDescription.isEmpty()) { elDescription = jobDocument.select("div#detailDescription"); } /* * if (!jobDocument.html().contains("ApplyOnlineUrl: ''") && * !jobDocument.html().contains("ApplyOnlineUrl: 'http://my.monster.com") && !externalApply) { * logInfo(log, "-- Ignored [externalApply] " + headline); addAlreadyPerformedAction(jobUrl); * return true; } */ if (!considerVisaDescription(elDescription.html())) { logInfo(log, "-- Ignored [visa] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerExperienceDescription(elDescription.html())) { log.info("-- Ignored [exp] " + headline); addAlreadyPerformedAction(jobUrl); return true; } ScorerResult result = LinkedInJobDBComparer.getScore(elDescription.html()); if (result.getScore() < this.requiredScore) { logInfo(log, "-- Ignored [scorer] " + result.getScore() + " - " + result.getMatches() + " - " + headline); addAlreadyPerformedAction(jobUrl); return true; 
} headline = headline + " - " + result.getMatches(); logInfo(log, "Open --> " + headline); // logInfo(log, elDescription.html()); performUnique(jobUrl); try { Thread.sleep(5000L); } catch (InterruptedException e) { e.printStackTrace(); } return true; }
From source file:org.confab.PhpBB3Parser.java
/** * Parses each topic for a particular forum. * @param forum Document of html containing topics * @param parent Forum the threads belong to * @return List of ForumThread objects *//*from w w w . j a va 2s .c o m*/ public List<ForumThread> parseForumThreads(Document forum, Forum parent) { Utilities.debug("parseForumThreads"); List<ForumThread> ret = new ArrayList<ForumThread>(); // Get topic table Elements thread_table_tds = forum.select("tbody[id*=threadbits_forum_] td"); if (thread_table_tds.isEmpty()) { Utilities.debug("It seems " + parent.url + " has no topics."); return ret; } // Get any stickies Elements stickies = thread_table_tds.select("td:contains(Sticky:) a[id*=thread_title_]"); // Get all topics Elements els_a = thread_table_tds.select("a[id*=thread_title_]"); assert !els_a.isEmpty(); // Loop topics and grab info about each for (Element el_a : els_a) { ForumThread new_topic = new ForumThread(parent); // Get topic new_topic.title = el_a.text(); assert new_topic.title != null; Utilities.debug("new_topic.title: " + new_topic.title); // Check if sticky if (stickies.html().contains(new_topic.title)) { new_topic.isSticky = true; Utilities.debug("new_topic.isSticky: " + new_topic.isSticky); } // Get URL new_topic.url = el_a.attr("href"); assert new_topic.url != null; Utilities.debug("new_topic.url:" + new_topic.url); ret.add(new_topic); } Utilities.debug("end printForumThreads"); return ret; }
From source file:org.confab.VBulletinParser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table/*from w w w. j ava 2s .c om*/ Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr"); assert !forum_table.isEmpty(); for (Element el_tr : forum_table) { Forum new_forum = new Forum(parent); // Get the table data for this row Elements el_tds = el_tr.select("td"); assert !el_tds.isEmpty() : el_tr.html(); // xbox360achievements has a lot of subforums and puts these in their own table // The <a>'s are picked up as children of the parent <td> so don't parse this sub- // tables row's seperatly if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) { //Utilities.debug("tr doesn't seem to have anything we want, skipping."); continue; } // Get the title URL Elements els_a = el_tds.get(1).select("a"); assert !els_a.isEmpty() : el_tds.html(); new_forum.url = els_a.first().attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text assert els_a.first() != null; new_forum.title = els_a.first().text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get num viewing the current forum Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first(); if (el_viewing != null) { new_forum.numViewing = el_viewing.text(); } else { new_forum.numViewing = "0"; } Utilities.debug("new_forum.numViewing : " + new_forum.numViewing); // Get the description/message of this topic Element el_description = 
el_tds.get(1).select("div.smallfont").first(); if (el_description != null) { new_forum.description = el_description.text(); } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } Utilities.debug("end parseForums"); return ret; }