List of usage examples for the org.jsoup.nodes.Element method html()
public String html()
From source file:me.vertretungsplan.parser.UntisCommonParser.java
static String findLastChange(Element doc, SubstitutionScheduleData scheduleData) { String lastChange = null;/*w w w. j a va 2 s. co m*/ boolean lastChangeLeft = false; if (scheduleData != null) { if (scheduleData.getData().has("stand_links")) { // backwards compatibility lastChangeLeft = scheduleData.getData().optBoolean("stand_links", false); } else { lastChangeLeft = scheduleData.getData().optBoolean(PARAM_LAST_CHANGE_LEFT, false); } } if (doc.select("table.mon_head").size() > 0) { Element monHead = doc.select("table.mon_head").first(); lastChange = findLastChangeFromMonHeadTable(monHead); } else if (lastChangeLeft) { final String bodyHtml = doc.select("body").size() > 0 ? doc.select("body").html() : doc.html(); lastChange = bodyHtml.substring(0, bodyHtml.indexOf("<p>") - 1); } else { List<Node> childNodes; if (doc instanceof Document) { childNodes = ((Document) doc).body().childNodes(); } else { childNodes = doc.childNodes(); } for (Node node : childNodes) { if (node instanceof Comment) { Comment comment = (Comment) node; if (comment.getData().contains("<table class=\"mon_head\">")) { Document commentedDoc = Jsoup.parse(comment.getData()); Element monHead = commentedDoc.select("table.mon_head").first(); lastChange = findLastChangeFromMonHeadTable(monHead); break; } } } } return lastChange; }
From source file:me.rkfg.xmpp.bot.plugins.CoolStoryPlugin.java
private String fetchStory(Website website) throws IOException { int roll = 0; String result;/*w w w . ja v a2 s .co m*/ int resultLength; int resultLines; //noinspection ConstantConditions do { roll++; final Document doc = Jsoup.connect(website.getUrlString()).userAgent(DEFAULT_UA).get(); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); logger.info("Fetched a story from {}", doc.location()); final Element story = doc.select(website.getCssQuery()).first(); if (story == null) { return ERROR_COULD_NOT_PARSE; } story.select("div").remove(); story.select("img").forEach(img -> img.replaceWith(new TextNode(img.attr("src"), ""))); story.select("br").after("\\n"); story.select("p").before("\\n\\n"); final String storyHtml = story.html().replaceAll("\\\\n", "\n"); result = Jsoup.clean(storyHtml, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)) .trim(); resultLength = result.length(); resultLines = countLines(result); } while (CONFIG_REROLL_LONG_STORIES && (resultLength > CONFIG_MAX_STORY_LENGTH || resultLines > CONFIG_MAX_STORY_LINES) && roll <= CONFIG_MAX_ROLLS); return result; }
From source file:mx.itdurango.rober.siitdocentes.ActivityAlumnos.java
/** * Permite descomponer el cdigo html que se enva con una estructura especifica para llenar los datos de la vista * * @param html cdigo html que se recibi de una peticin HttpGet, debe tener una estructura similar a la siguiente para que el proceso funcione correctamente * <p/>//from w ww .ja va 2 s . co m * <input name="periodo" type="hidden" value="20141" /> * <input name="materia" type="hidden" value="SD2424" /> * <input name="grupo" type="hidden" value="5VR" /> * <input name="docente" type="hidden" value="LOQR841213822" /> * <input name="fecha_captura" type="hidden" value="2014/06/12" /> * <table> * <tr> * <td>No</td> * <td>Noctrl</td> * <td>Nombre</td> * <td>Unidad 1</td> * <td>Unidad 1</td> * <td>Unidad 3</td> * <td>...</td> * <td>Unidad N</td> * </tr> * <tr> * <td>1</td> * <td>9999999</td> * <td>XXXXXXXXXXXXXXXXXXXXX</td> * <td><input type="text" name="calif[1][1]" value="999"/></td> * <td><input type="text" name="calif[1][2]" value="999"/></td> * <td><input type="text" name="calif[1][3]" value="999"/></td> * <td>...</td> * <td><input type="text" name="calif[1][N]" value="999"/></td> * </tr> * <tr> * <td>2</td> * <td>888888888</td> * <td>YYYYYYYYYYYYYYYYYYYYY</td> * <td><input type="text" name="calif[2][1]" value="999"/></td> * <td><input type="text" name="calif[2][2]" value="999"/></td> * <td><input type="text" name="calif[2][3]" value="999"/></td> * <td>...</td> * <td><input type="text" name="calif[2][N]" value="999"/></td> * </tr> * <tr> * <td>M</td> * <td>000000000</td> * <td>ZZZZZZZZZZZZZZZZZZZZZZ</td> * <td><input type="text" name="calif[M][1]" value="999"/></td> * <td><input type="text" name="calif[M][2]" value="999"/></td> * <td><input type="text" name="calif[M][3]" value="999"/></td> * <td>...</td> * <td><input type="text" name="calif[M][N]" value="999"/></td> * </tr> * </table> */ void llenaAlumnos(String html) { //Generar un archivo de documento para almacenar los datos del html de forma que se pueda //manipular facilmente usando la librera Jsoup 
Document doc = Jsoup.parse(html); try { //extraer los valores de los elementos del formulario y almacenarlos en los atributos correspondientes de la clase Elements e = doc.getElementsByAttributeValue("name", "periodo"); periodo = e.get(0).attr("value"); e = doc.getElementsByAttributeValue("name", "materia"); materia = e.get(0).attr("value"); e = doc.getElementsByAttributeValue("name", "grupo"); grupo = e.get(0).attr("value"); e = doc.getElementsByAttributeValue("name", "docente"); docente = e.get(0).attr("value"); e = doc.getElementsByAttributeValue("name", "fecha_captura"); fecha_captura = e.get(0).attr("value"); //extraer la tabla correspondiente al listado de alumnos en el caso del siit.itdurango.edu.mx, // corresponde a la tabla numero 2 y ya que la numeracin comienza en 0, la tabla que necesitamos est en el indice 1 Element tabla = doc.getElementsByTag("table").get(1); //Extraer todos los elementos de tipo tr que pertenecen a la tabla y almacenarlos en una coleccion de tipo Elements. Elements renglones = tabla.getElementsByTag("tr"); //Recorrer la coleccin de renglones y almacenar cada uno en un objeto for (Element tr : renglones) { //para cada objeto tr, extraer sus elementos td y almacenarlos en una coleccion Elements tds = tr.getElementsByTag("td"); //permite llevar el control de la columna que se est leyendo, ya que las columnas no tienen un id o clase, se realiza el proceso a mano. 
int col = 1; //contenedor de tipo AlumosParciales para almacenar la informacin de cada alumno (tr) AlumnosParciales c = new AlumnosParciales(); for (Element td : tds) { if (col == 1) {// la columna 1 corresponde al nmero consecutivo de la tabla c.setNum(td.html()); } else if (col == 2) {// la columna 2 corresponde al nmero de control del alumno c.setControl(td.html()); } else if (col == 3) {// la columna 3 corresponde al nombre del alumno c.setNombre(Estaticos.sanitize(td.html())); } else { //el resto de las columnas pertenecen a las calificaciones parciales //se extrae el elemento <input> de la columna y se obtiene el atributo valor para recuperar la calificacin en caso de que ya hubiera sido asignada String cal = td.getElementsByTag("input").get(0).attr("value"); ArrayList<String> calif = c.getCalificaciones(); calif.add(cal); //se agrega la nueva calificacin al conjunto de calificaciones del alumno c.setCalificaciones(calif); } col++; //incrementa el numero de columa } if (c.getCalificaciones().size() > 0) { //para evitar agregar al listado de alumnos el encabezado de la tabla, validamos que existan calificaciones. 
gcs.add(c); } } //Llenamos el spinner de unidades a partir del numero de calificaciones que existen en el arreglo List<String> spinnerArray = new ArrayList<String>(); for (int i = 1; i <= gcs.get(1).getCalificaciones().size() - 1; i++) { spinnerArray.add("Unidad " + i); } ArrayAdapter<String> adapter = new ArrayAdapter<String>(this, android.R.layout.simple_spinner_item, spinnerArray); adapter.setDropDownViewResource(android.R.layout.simple_spinner_dropdown_item); spn_unidad.setAdapter(adapter); //llenamos el listado de alumnos con la informacin que se obtuvo del proceso anterior alumnosParcialesAdapter = new AlumnosParcialesAdapter(this, gcs, unidad); lvAlumnos.setAdapter(alumnosParcialesAdapter); } catch (Exception e) { e.printStackTrace(); Toast.makeText(this, getString(R.string.error_parser), Toast.LENGTH_SHORT).show(); finish(); //finaliza el intent actual para desplegar el anterior } }
From source file:mergedoc.core.APIDocument.java
/**
 * Parses the method detail entries of a Javadoc HTML page and registers one {@code Comment}
 * per method signature in {@code contextTable}.
 *
 * <p>NOTE(review): the original Javadoc text and several string literals in this method appear
 * to have lost their non-ASCII characters. All three {@code dtText.contains(":")} tests below
 * are now identical, so only the first branch can ever run -- restore the original labels
 * before relying on this method.
 *
 * @param className name of the class the page documents; passed to the signature and link helpers
 * @param doc       parsed Javadoc HTML page
 */
private void parseMethodComment(String className, Document doc) {
    // One <li> per documented member inside the "details" section of the page.
    Elements elements = doc.select("body > div.contentContainer > div.details > ul > li > ul > li > ul > li");
    for (Element element : elements) {
        // The <pre> element carries the member signature; entries without one are skipped.
        Element sigElm = element.select("pre").first();
        if (sigElm == null) {
            continue;
        }
        String sigStr = sigElm.html();
        Signature sig = createSignature(className, sigStr);
        Comment comment = new Comment(sig);

        // With exactly two <div>s the first one is kept as the deprecation text;
        // the last <div> is always used as the description body.
        String depre = "";
        Elements divs = element.select("div");
        if (divs.size() == 2) {
            depre = divs.get(0).html();
        }
        if (divs.size() > 0) {
            String body = divs.last().html();
            body = formatLinkTag(className, body);
            comment.setDocumentBody(body);
        }

        Elements dtTags = element.select("dl dt");
        for (Element dtTag : dtTags) {
            String dtText = dtTag.text();
            // Branch 1: walks the <dd> siblings that follow this <dt> and records each as a parameter.
            if (dtText.contains(":")) {
                Element dd = dtTag;
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    // NOTE(review): first() returns null when the <dd> has no <code> element,
                    // which would make text() throw a NullPointerException.
                    String name = dd.select("code").first().text();
                    // NOTE(review): as written this test is always true inside this branch, so
                    // every name is wrapped in angle brackets; presumably it was meant to match
                    // a distinct "type parameter" label -- confirm.
                    if (dtText.contains(":")) {
                        name = "<" + name + ">";
                    }
                    String items = dd.html();
                    // Captures the text after "<code>name</code> -" up to the next tag boundary.
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        comment.addParam(name, desc);
                    }
                }
                continue;
            }
            // Branch 2: records the following <dd> as the return description.
            // NOTE(review): unreachable as written, because branch 1 uses the same test.
            if (dtText.contains(":")) {
                Element dd = dtTag.nextElementSibling();
                String str = dd.html();
                str = formatLinkTag(className, str);
                comment.addReturn(str);
                continue;
            }
            // Branch 3: records each following <dd> as a thrown exception ("name description").
            // NOTE(review): unreachable as written, because branch 1 uses the same test.
            if (dtText.contains(":")) {
                Element dd = dtTag;
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    String name = dd.select("code").first().text();
                    String items = dd.html();
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        String param = name + " " + desc;
                        comment.addThrows(param);
                    }
                }
                continue;
            }
        }

        // Deprecation text collected above.
        parseDeprecatedTag(className, depre, comment);

        // Remaining tags of the entry.
        parseCommonTag(className, element, comment);

        contextTable.put(sig, comment);
    }
}
From source file:com.serphacker.serposcope.scraper.google.scraper.GoogleScraper.java
protected long parseResultsNumberOnFirstPage() { if (lastSerpHtml == null) { return 0; }//from w w w.j a v a 2 s .co m Element resultstStatsDiv = lastSerpHtml.getElementById("resultStats"); if (resultstStatsDiv == null) { return 0; } return extractResultsNumber(resultstStatsDiv.html()); }
From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java
/** * ???/*from ww w .j av a2s . c o m*/ * * @param eleTrs * @param rowNo * @return */ private String parseDetailTr(Element eleTr) throws Exception { Element eleTd = eleTr.select("td").get(1); // td if (eleTd.children().size() > 0) { return eleTd.child(0).html(); } else { return eleTd.html().trim(); } }
From source file:org.confab.VBulletinParser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table/*w ww.j a v a2s . c om*/ Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr"); assert !forum_table.isEmpty(); for (Element el_tr : forum_table) { Forum new_forum = new Forum(parent); // Get the table data for this row Elements el_tds = el_tr.select("td"); assert !el_tds.isEmpty() : el_tr.html(); // xbox360achievements has a lot of subforums and puts these in their own table // The <a>'s are picked up as children of the parent <td> so don't parse this sub- // tables row's seperatly if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) { //Utilities.debug("tr doesn't seem to have anything we want, skipping."); continue; } // Get the title URL Elements els_a = el_tds.get(1).select("a"); assert !els_a.isEmpty() : el_tds.html(); new_forum.url = els_a.first().attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text assert els_a.first() != null; new_forum.title = els_a.first().text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get num viewing the current forum Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first(); if (el_viewing != null) { new_forum.numViewing = el_viewing.text(); } else { new_forum.numViewing = "0"; } Utilities.debug("new_forum.numViewing : " + new_forum.numViewing); // Get the description/message of this topic Element el_description = el_tds.get(1).select("div.smallfont").first(); 
if (el_description != null) { new_forum.description = el_description.text(); } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } Utilities.debug("end parseForums"); return ret; }
From source file:info.smartkit.hairy_batman.query.SogouSearchQuery.java
public void parseWxOpenId() { Document doc;//from w ww . j ava2 s .co m try { // need http protocol // doc = Jsoup.connect(GlobalConsts.SOGOU_SEARCH_URL_BASE+ wxFoo.getSubscribeId()).get(); doc = Jsoup.connect("http://weixin.sogou.com/weixin?type=1&query=" + wxFoo.getSubscribeId() + "&fr=sgsearch&ie=utf8&_ast=1423915648&_asf=null&w=01019900&cid=null&sut=19381").get(); LOG.debug("openID html INFO:" + doc.html()); // get page title String title = doc.title(); LOG.debug("title : " + title); // get all "?:" value of html <span> //Elements openIdLink = doc.select(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_HTML_ELEMENTS).select(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_HTML_ELE_IDENTITY); Elements openIdLink = doc.getElementsByClass("wx-rb"); Element a = null; String openIdLinkHref = ""; if (openIdLink != null && openIdLink.size() > 0) { Iterator<Element> itea = openIdLink.iterator(); while (itea.hasNext()) { a = itea.next(); LOG.debug("openID html INFO:" + a.html()); if (a.getElementsByTag("em").html().indexOf(wxFoo.getSubscribeId()) != -1) { break; } } } if (a != null) { openIdLinkHref = a.attr("href"); } LOG.debug("openIdLinkHref:" + openIdLinkHref); // FIXME:???? if (this.wxFoo.getOpenId() == null && openIdLinkHref.length() > 0) { this.wxFoo.setOpenId(openIdLinkHref.split(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_KEYWORDS)[1]); LOG.info("saved wxOpenId value: " + this.wxFoo.getOpenId()); GlobalVariables.wxFooListWithOpenId.add(this.wxFoo); // File reporting new FileReporter(GlobalConsts.REPORT_FILE_OUTPUT_OPENID, GlobalVariables.wxFooListWithOpenId, FileReporter.REPORTER_TYPE.R_T_OPENID, FileReporter.REPORTER_FILE_TYPE.EXCEL).write(); // Then,OpenID JSON site parse if (this.wxFoo.getOpenId() != null) { // Save openId to DB. 
try { GlobalVariables.jdbcTempate.update("insert into " + GlobalConsts.QUERY_TABLE_NAME_BASIC + "(id,store,agency,unit,subscribeId,onSubscribe,code,openId) values(?,?,?,?,?,?,?,?)", new Object[] { this.wxFoo.getId(), this.wxFoo.getStore(), this.wxFoo.getAgency(), this.wxFoo.getUnit(), this.wxFoo.getSubscribeId(), this.wxFoo.getOnSubscribe(), this.wxFoo.getCode(), this.wxFoo.getOpenId() }, new int[] { java.sql.Types.INTEGER, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR }); this.parseSogouJsonSite(this.wxFoo.getOpenId()); } catch (DataAccessException e) { e.printStackTrace(); } } else { LOG.warn("SogouSearchQuery getOpenId Failure! site info:" + wxFoo.getCode()); // TODO write those info to File or DB for collect which // agency not open weixin service // Save openId to DB. try { GlobalVariables.jdbcTempate.update("insert into " + GlobalConsts.QUERY_TABLE_NAME_BASIC + "(id,store,agency,unit,subscribeId,onSubscribe,code,openId) values(?,?,?,?,?,?,?,?)", new Object[] { this.wxFoo.getId(), this.wxFoo.getStore(), this.wxFoo.getAgency(), this.wxFoo.getUnit(), this.wxFoo.getSubscribeId(), this.wxFoo.getOnSubscribe(), this.wxFoo.getCode(), "" }, new int[] { java.sql.Types.INTEGER, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR }); LOG.warn("Can not get subsriber info: " + this.wxFoo.getCode()); this.parseSogouJsonSite(this.wxFoo.getOpenId()); } catch (DataAccessException e) { e.printStackTrace(); } } } } catch (IOException e) { // e.printStackTrace(); LOG.error(e.toString()); } }
From source file:edu.usu.sdl.openstorefront.service.io.HelpImporter.java
/** * Accept a stream pointed to markdown/*ww w . j av a 2 s . c om*/ * * @param in * @return */ public List<HelpSection> processHelp(InputStream in) { List<HelpSection> helpSections = new ArrayList<>(); String data = ""; try (BufferedReader bin = new BufferedReader(new InputStreamReader(in))) { data = bin.lines().collect(Collectors.joining("\n")); } catch (IOException e) { } PegDownProcessor pegDownProcessor = new PegDownProcessor(PROCESSING_TIMEOUT); String html = pegDownProcessor.markdownToHtml(data); Document doc = Jsoup.parse(html); Elements elements = doc.getAllElements(); Set<String> headerTags = new HashSet<>(); headerTags.add("h1"); headerTags.add("h2"); headerTags.add("h3"); headerTags.add("h4"); headerTags.add("h5"); headerTags.add("h6"); boolean capture = false; HelpSection helpSection = null; for (Element element : elements) { if (headerTags.contains(element.tagName().toLowerCase()) == false && capture) { if (helpSection != null) { if (helpSection.getContent().contains(element.outerHtml()) == false) { helpSection.setContent(helpSection.getContent() + element.outerHtml()); } } } if (headerTags.contains(element.tagName().toLowerCase())) { String title = element.html(); if (helpSection != null) { //save old section addHelpSection(helpSections, helpSection); } String titleSplit[] = title.split(" "); helpSection = new HelpSection(); helpSection.setTitle(title); helpSection.setHeaderLevel(Convert.toInteger(element.tagName().toLowerCase().replace("h", ""))); helpSection.setSectionNumber(titleSplit[0]); helpSection.setContent(""); if (title.contains("*")) { helpSection.setAdminSection(true); } else { helpSection.setAdminSection(false); } capture = true; } } //Add last section if (helpSection != null) { addHelpSection(helpSections, helpSection); } return helpSections; }
From source file:info.mikaelsvensson.devtools.sitesearch.SiteSearchPlugin.java
private IndexEntry createIndexEntry(final File file) { try {//from w ww. j a va 2 s.co m Document document = Jsoup.parse(file, "UTF-8", "http://invalid.host"); Element contentEl = document.getElementById("contentBox"); if (contentEl == null) { contentEl = document.body(); } if (contentEl != null) { String text = Jsoup.clean(contentEl.html(), Whitelist.simpleText()); Collection<WordCount> wordCount = getWordCount(text); Collection<WordCount> filteredWordCount = filterWordCount(wordCount); return new IndexEntry(document.title(), getRelativePath(getSiteOutputFolder(), file), filteredWordCount); } } catch (IOException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } return null; }