Example usage for org.jsoup.nodes Element html

List of usage examples for org.jsoup.nodes Element html

Introduction

This page collects usage examples for the html() method of org.jsoup.nodes.Element.

Prototype

public String html() 

Source Link

Document

Retrieves the element's inner HTML.

Usage

From source file:me.vertretungsplan.parser.UntisCommonParser.java

static String findLastChange(Element doc, SubstitutionScheduleData scheduleData) {
    String lastChange = null;/*w  w  w. j  a  va  2  s.  co m*/

    boolean lastChangeLeft = false;
    if (scheduleData != null) {
        if (scheduleData.getData().has("stand_links")) {
            // backwards compatibility
            lastChangeLeft = scheduleData.getData().optBoolean("stand_links", false);
        } else {
            lastChangeLeft = scheduleData.getData().optBoolean(PARAM_LAST_CHANGE_LEFT, false);
        }
    }

    if (doc.select("table.mon_head").size() > 0) {
        Element monHead = doc.select("table.mon_head").first();
        lastChange = findLastChangeFromMonHeadTable(monHead);
    } else if (lastChangeLeft) {
        final String bodyHtml = doc.select("body").size() > 0 ? doc.select("body").html() : doc.html();
        lastChange = bodyHtml.substring(0, bodyHtml.indexOf("<p>") - 1);
    } else {
        List<Node> childNodes;
        if (doc instanceof Document) {
            childNodes = ((Document) doc).body().childNodes();
        } else {
            childNodes = doc.childNodes();
        }
        for (Node node : childNodes) {
            if (node instanceof Comment) {
                Comment comment = (Comment) node;
                if (comment.getData().contains("<table class=\"mon_head\">")) {
                    Document commentedDoc = Jsoup.parse(comment.getData());
                    Element monHead = commentedDoc.select("table.mon_head").first();
                    lastChange = findLastChangeFromMonHeadTable(monHead);
                    break;
                }
            }
        }
    }
    return lastChange;
}

From source file:me.rkfg.xmpp.bot.plugins.CoolStoryPlugin.java

private String fetchStory(Website website) throws IOException {
    int roll = 0;
    String result;/*w w w  .  ja v a2  s .co m*/
    int resultLength;
    int resultLines;

    //noinspection ConstantConditions
    do {
        roll++;

        final Document doc = Jsoup.connect(website.getUrlString()).userAgent(DEFAULT_UA).get();
        doc.outputSettings(new Document.OutputSettings().prettyPrint(false));
        logger.info("Fetched a story from {}", doc.location());

        final Element story = doc.select(website.getCssQuery()).first();
        if (story == null) {
            return ERROR_COULD_NOT_PARSE;
        }

        story.select("div").remove();
        story.select("img").forEach(img -> img.replaceWith(new TextNode(img.attr("src"), "")));
        story.select("br").after("\\n");
        story.select("p").before("\\n\\n");
        final String storyHtml = story.html().replaceAll("\\\\n", "\n");

        result = Jsoup.clean(storyHtml, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false))
                .trim();
        resultLength = result.length();
        resultLines = countLines(result);

    } while (CONFIG_REROLL_LONG_STORIES
            && (resultLength > CONFIG_MAX_STORY_LENGTH || resultLines > CONFIG_MAX_STORY_LINES)
            && roll <= CONFIG_MAX_ROLLS);

    return result;
}

From source file:mx.itdurango.rober.siitdocentes.ActivityAlumnos.java

/**
 * Permite descomponer el cdigo html que se enva con una estructura especifica para llenar los datos de la vista
 *
 * @param html cdigo html que se recibi de una peticin HttpGet, debe tener una estructura similar a la siguiente para que el proceso funcione correctamente
 *             <p/>//from w ww .ja va 2 s . co m
 *             <input name="periodo" type="hidden" value="20141" />
 *             <input name="materia" type="hidden" value="SD2424" />
 *             <input name="grupo" type="hidden" value="5VR" />
 *             <input name="docente" type="hidden" value="LOQR841213822" />
 *             <input name="fecha_captura" type="hidden" value="2014/06/12" />
 *             <table>
 *             <tr>
 *             <td>No</td>
 *             <td>Noctrl</td>
 *             <td>Nombre</td>
 *             <td>Unidad 1</td>
 *             <td>Unidad 1</td>
 *             <td>Unidad 3</td>
 *             <td>...</td>
 *             <td>Unidad N</td>
 *             </tr>
 *             <tr>
 *             <td>1</td>
 *             <td>9999999</td>
 *             <td>XXXXXXXXXXXXXXXXXXXXX</td>
 *             <td><input type="text" name="calif[1][1]" value="999"/></td>
 *             <td><input type="text" name="calif[1][2]" value="999"/></td>
 *             <td><input type="text" name="calif[1][3]" value="999"/></td>
 *             <td>...</td>
 *             <td><input type="text" name="calif[1][N]" value="999"/></td>
 *             </tr>
 *             <tr>
 *             <td>2</td>
 *             <td>888888888</td>
 *             <td>YYYYYYYYYYYYYYYYYYYYY</td>
 *             <td><input type="text" name="calif[2][1]" value="999"/></td>
 *             <td><input type="text" name="calif[2][2]" value="999"/></td>
 *             <td><input type="text" name="calif[2][3]" value="999"/></td>
 *             <td>...</td>
 *             <td><input type="text" name="calif[2][N]" value="999"/></td>
 *             </tr>
 *             <tr>
 *             <td>M</td>
 *             <td>000000000</td>
 *             <td>ZZZZZZZZZZZZZZZZZZZZZZ</td>
 *             <td><input type="text" name="calif[M][1]" value="999"/></td>
 *             <td><input type="text" name="calif[M][2]" value="999"/></td>
 *             <td><input type="text" name="calif[M][3]" value="999"/></td>
 *             <td>...</td>
 *             <td><input type="text" name="calif[M][N]" value="999"/></td>
 *             </tr>
 *             </table>
 */
void llenaAlumnos(String html) {
    //Generar un archivo de documento para almacenar los datos del html de forma que se pueda
    //manipular facilmente usando la librera Jsoup
    Document doc = Jsoup.parse(html);

    try {
        //extraer los valores de los elementos del formulario y almacenarlos en los atributos correspondientes de la clase
        Elements e = doc.getElementsByAttributeValue("name", "periodo");
        periodo = e.get(0).attr("value");
        e = doc.getElementsByAttributeValue("name", "materia");
        materia = e.get(0).attr("value");
        e = doc.getElementsByAttributeValue("name", "grupo");
        grupo = e.get(0).attr("value");
        e = doc.getElementsByAttributeValue("name", "docente");
        docente = e.get(0).attr("value");
        e = doc.getElementsByAttributeValue("name", "fecha_captura");
        fecha_captura = e.get(0).attr("value");

        //extraer la tabla correspondiente al listado de alumnos en el caso del siit.itdurango.edu.mx,
        // corresponde a la tabla numero 2 y ya que la numeracin comienza en 0, la tabla que necesitamos est en el indice 1
        Element tabla = doc.getElementsByTag("table").get(1);
        //Extraer todos los elementos de tipo tr que pertenecen a la tabla y almacenarlos en una coleccion de tipo Elements.
        Elements renglones = tabla.getElementsByTag("tr");
        //Recorrer la coleccin de renglones y almacenar cada uno en un objeto
        for (Element tr : renglones) {
            //para cada objeto tr, extraer sus elementos td y almacenarlos en una coleccion
            Elements tds = tr.getElementsByTag("td");
            //permite llevar el control de la columna que se est leyendo, ya que las columnas no tienen un id o clase, se realiza el proceso a mano.
            int col = 1;
            //contenedor de tipo AlumosParciales para almacenar la informacin de cada alumno (tr)
            AlumnosParciales c = new AlumnosParciales();
            for (Element td : tds) {
                if (col == 1) {// la columna 1 corresponde al nmero consecutivo de la tabla
                    c.setNum(td.html());
                } else if (col == 2) {// la columna 2 corresponde al nmero de control del alumno
                    c.setControl(td.html());
                } else if (col == 3) {// la columna 3 corresponde al nombre del alumno
                    c.setNombre(Estaticos.sanitize(td.html()));
                } else { //el resto de las columnas pertenecen a las calificaciones parciales
                    //se extrae el elemento <input> de la columna y se obtiene el atributo valor para recuperar la calificacin en caso de que ya hubiera sido asignada
                    String cal = td.getElementsByTag("input").get(0).attr("value");

                    ArrayList<String> calif = c.getCalificaciones();
                    calif.add(cal);
                    //se agrega la nueva calificacin al conjunto de calificaciones del alumno
                    c.setCalificaciones(calif);
                }
                col++; //incrementa el numero de columa
            }
            if (c.getCalificaciones().size() > 0) { //para evitar agregar al listado de alumnos el encabezado de la tabla, validamos que existan calificaciones.
                gcs.add(c);
            }
        }

        //Llenamos el spinner de unidades a partir del numero de calificaciones que existen en el arreglo
        List<String> spinnerArray = new ArrayList<String>();
        for (int i = 1; i <= gcs.get(1).getCalificaciones().size() - 1; i++) {
            spinnerArray.add("Unidad " + i);
        }
        ArrayAdapter<String> adapter = new ArrayAdapter<String>(this, android.R.layout.simple_spinner_item,
                spinnerArray);
        adapter.setDropDownViewResource(android.R.layout.simple_spinner_dropdown_item);
        spn_unidad.setAdapter(adapter);

        //llenamos el listado de alumnos con la informacin que se obtuvo del proceso anterior
        alumnosParcialesAdapter = new AlumnosParcialesAdapter(this, gcs, unidad);
        lvAlumnos.setAdapter(alumnosParcialesAdapter);

    } catch (Exception e) {
        e.printStackTrace();
        Toast.makeText(this, getString(R.string.error_parser), Toast.LENGTH_SHORT).show();
        finish(); //finaliza el intent actual para desplegar el anterior
    }
}

From source file:mergedoc.core.APIDocument.java

/**
 * ? Javadoc ????/*from w  w w  . j a va  2  s  .  c  om*/
 * @param className ??
 * @param docHtml API 
 */
private void parseMethodComment(String className, Document doc) {
    Elements elements = doc.select("body > div.contentContainer > div.details > ul > li > ul > li > ul > li");
    for (Element element : elements) {
        Element sigElm = element.select("pre").first();
        if (sigElm == null) {
            continue;
        }
        String sigStr = sigElm.html();
        Signature sig = createSignature(className, sigStr);
        Comment comment = new Comment(sig);

        // deprecated 
        String depre = "";
        Elements divs = element.select("div");
        if (divs.size() == 2) {
            depre = divs.get(0).html();
        }
        if (divs.size() > 0) {
            String body = divs.last().html();
            body = formatLinkTag(className, body);
            comment.setDocumentBody(body);
        }

        Elements dtTags = element.select("dl dt");
        for (Element dtTag : dtTags) {
            String dtText = dtTag.text();
            if (dtText.contains(":")) {
                Element dd = dtTag;
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    String name = dd.select("code").first().text();
                    if (dtText.contains(":")) {
                        name = "<" + name + ">";
                    }
                    String items = dd.html();
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        comment.addParam(name, desc);
                    }
                }
                continue;
            }

            if (dtText.contains(":")) {
                Element dd = dtTag.nextElementSibling();
                String str = dd.html();
                str = formatLinkTag(className, str);
                comment.addReturn(str);
                continue;
            }

            if (dtText.contains(":")) {
                Element dd = dtTag;
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    String name = dd.select("code").first().text();
                    String items = dd.html();
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        String param = name + " " + desc;
                        comment.addThrows(param);
                    }
                }
                continue;
            }

        }
        // deprecated 
        parseDeprecatedTag(className, depre, comment);

        // 
        parseCommonTag(className, element, comment);

        contextTable.put(sig, comment);
    }
}

From source file:com.serphacker.serposcope.scraper.google.scraper.GoogleScraper.java

protected long parseResultsNumberOnFirstPage() {
    if (lastSerpHtml == null) {
        return 0;
    }//from w  w  w.j  a  v  a 2  s .co  m

    Element resultstStatsDiv = lastSerpHtml.getElementById("resultStats");
    if (resultstStatsDiv == null) {
        return 0;
    }

    return extractResultsNumber(resultstStatsDiv.html());
}

From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java

/**
 * ???/*from  ww w  .j  av  a2s . c  o m*/
 * 
 * @param eleTrs
 * @param rowNo
 * @return
 */
private String parseDetailTr(Element eleTr) throws Exception {
    Element eleTd = eleTr.select("td").get(1);

    // td
    if (eleTd.children().size() > 0) {
        return eleTd.child(0).html();
    } else {
        return eleTd.html().trim();
    }
}

From source file:org.confab.VBulletinParser.java

public List<Forum> parseForums(Document root, BulletinBoard parent) {
    Utilities.debug("parseForums");

    List<Forum> ret = new ArrayList<Forum>();

    // get table/*w  ww.j  a  v a2s  . c om*/
    Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr");
    assert !forum_table.isEmpty();

    for (Element el_tr : forum_table) {
        Forum new_forum = new Forum(parent);

        // Get the table data for this row
        Elements el_tds = el_tr.select("td");
        assert !el_tds.isEmpty() : el_tr.html();

        // xbox360achievements has a lot of subforums and puts these in their own table
        // The <a>'s are picked up as children of the parent <td> so don't parse this sub-
        // tables row's seperatly
        if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) {
            //Utilities.debug("tr doesn't seem to have anything we want, skipping.");
            continue;
        }

        // Get the title URL
        Elements els_a = el_tds.get(1).select("a");
        assert !els_a.isEmpty() : el_tds.html();
        new_forum.url = els_a.first().attr("href");
        assert new_forum.url != null;
        Utilities.debug("new_forum.url : " + new_forum.url);

        // Get the title text
        assert els_a.first() != null;
        new_forum.title = els_a.first().text();
        assert new_forum.title != null;
        Utilities.debug("new_forum.title : " + new_forum.title);

        // Check for any subforums in remaining a elements
        els_a.remove(els_a.first());
        for (Element el_a : els_a) {
            Forum sub_forum = new Forum(parent);
            sub_forum.url = el_a.attr("href");
            assert sub_forum.url != null;
            sub_forum.title = el_a.text();
            assert sub_forum.title != null;
            new_forum.subForums.add(sub_forum);
            Utilities.debug("added subForum: " + sub_forum.title);
        }

        // Get num viewing the current forum
        Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first();
        if (el_viewing != null) {
            new_forum.numViewing = el_viewing.text();
        } else {
            new_forum.numViewing = "0";
        }
        Utilities.debug("new_forum.numViewing : " + new_forum.numViewing);

        // Get the description/message of this topic
        Element el_description = el_tds.get(1).select("div.smallfont").first();
        if (el_description != null) {
            new_forum.description = el_description.text();
        } else {
            new_forum.description = "";
        }
        Utilities.debug("new_forum.description : " + new_forum.description);

        Utilities.debug("new_forum.parent.url : " + new_forum.parent.url);

        ret.add(new_forum);
        Utilities.debug("-----");
    }
    Utilities.debug("end parseForums");
    return ret;
}

From source file:info.smartkit.hairy_batman.query.SogouSearchQuery.java

public void parseWxOpenId() {
    Document doc;//from w ww .  j ava2  s  .co m
    try {

        // need http protocol
        // doc = Jsoup.connect(GlobalConsts.SOGOU_SEARCH_URL_BASE+ wxFoo.getSubscribeId()).get();
        doc = Jsoup.connect("http://weixin.sogou.com/weixin?type=1&query=" + wxFoo.getSubscribeId()
                + "&fr=sgsearch&ie=utf8&_ast=1423915648&_asf=null&w=01019900&cid=null&sut=19381").get();

        LOG.debug("openID html INFO:" + doc.html());

        // get page title
        String title = doc.title();
        LOG.debug("title : " + title);
        // get all "?:" value of html <span>
        //Elements openIdLink = doc.select(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_HTML_ELEMENTS).select(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_HTML_ELE_IDENTITY);

        Elements openIdLink = doc.getElementsByClass("wx-rb");
        Element a = null;
        String openIdLinkHref = "";
        if (openIdLink != null && openIdLink.size() > 0) {
            Iterator<Element> itea = openIdLink.iterator();
            while (itea.hasNext()) {
                a = itea.next();
                LOG.debug("openID html INFO:" + a.html());
                if (a.getElementsByTag("em").html().indexOf(wxFoo.getSubscribeId()) != -1) {
                    break;
                }
            }
        }
        if (a != null) {
            openIdLinkHref = a.attr("href");
        }
        LOG.debug("openIdLinkHref:" + openIdLinkHref);
        // FIXME:????
        if (this.wxFoo.getOpenId() == null && openIdLinkHref.length() > 0) {

            this.wxFoo.setOpenId(openIdLinkHref.split(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_KEYWORDS)[1]);
            LOG.info("saved wxOpenId value: " + this.wxFoo.getOpenId());
            GlobalVariables.wxFooListWithOpenId.add(this.wxFoo);
            // File reporting
            new FileReporter(GlobalConsts.REPORT_FILE_OUTPUT_OPENID, GlobalVariables.wxFooListWithOpenId,
                    FileReporter.REPORTER_TYPE.R_T_OPENID, FileReporter.REPORTER_FILE_TYPE.EXCEL).write();
            // Then,OpenID JSON site parse
            if (this.wxFoo.getOpenId() != null) {
                // Save openId to DB.
                try {
                    GlobalVariables.jdbcTempate.update("insert into " + GlobalConsts.QUERY_TABLE_NAME_BASIC
                            + "(id,store,agency,unit,subscribeId,onSubscribe,code,openId) values(?,?,?,?,?,?,?,?)",
                            new Object[] { this.wxFoo.getId(), this.wxFoo.getStore(), this.wxFoo.getAgency(),
                                    this.wxFoo.getUnit(), this.wxFoo.getSubscribeId(),
                                    this.wxFoo.getOnSubscribe(), this.wxFoo.getCode(), this.wxFoo.getOpenId() },
                            new int[] { java.sql.Types.INTEGER, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR,
                                    java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR,
                                    java.sql.Types.VARCHAR, java.sql.Types.VARCHAR });
                    this.parseSogouJsonSite(this.wxFoo.getOpenId());
                } catch (DataAccessException e) {
                    e.printStackTrace();
                }
            } else {
                LOG.warn("SogouSearchQuery getOpenId Failure! site info:" + wxFoo.getCode());
                // TODO write those info to File or DB for collect which
                // agency not open weixin service
                // Save openId to DB.
                try {
                    GlobalVariables.jdbcTempate.update("insert into " + GlobalConsts.QUERY_TABLE_NAME_BASIC
                            + "(id,store,agency,unit,subscribeId,onSubscribe,code,openId) values(?,?,?,?,?,?,?,?)",
                            new Object[] { this.wxFoo.getId(), this.wxFoo.getStore(), this.wxFoo.getAgency(),
                                    this.wxFoo.getUnit(), this.wxFoo.getSubscribeId(),
                                    this.wxFoo.getOnSubscribe(), this.wxFoo.getCode(), "" },
                            new int[] { java.sql.Types.INTEGER, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR,
                                    java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR,
                                    java.sql.Types.VARCHAR, java.sql.Types.VARCHAR });
                    LOG.warn("Can not get subsriber info: " + this.wxFoo.getCode());

                    this.parseSogouJsonSite(this.wxFoo.getOpenId());
                } catch (DataAccessException e) {
                    e.printStackTrace();
                }
            }
        }

    } catch (IOException e) {
        // e.printStackTrace();
        LOG.error(e.toString());
    }
}

From source file:edu.usu.sdl.openstorefront.service.io.HelpImporter.java

/**
 * Accept a stream pointed to markdown/*ww  w .  j av a 2  s  .  c om*/
 *
 * @param in
 * @return
 */
public List<HelpSection> processHelp(InputStream in) {
    List<HelpSection> helpSections = new ArrayList<>();

    String data = "";
    try (BufferedReader bin = new BufferedReader(new InputStreamReader(in))) {
        data = bin.lines().collect(Collectors.joining("\n"));
    } catch (IOException e) {

    }

    PegDownProcessor pegDownProcessor = new PegDownProcessor(PROCESSING_TIMEOUT);
    String html = pegDownProcessor.markdownToHtml(data);
    Document doc = Jsoup.parse(html);
    Elements elements = doc.getAllElements();

    Set<String> headerTags = new HashSet<>();
    headerTags.add("h1");
    headerTags.add("h2");
    headerTags.add("h3");
    headerTags.add("h4");
    headerTags.add("h5");
    headerTags.add("h6");

    boolean capture = false;
    HelpSection helpSection = null;
    for (Element element : elements) {
        if (headerTags.contains(element.tagName().toLowerCase()) == false && capture) {
            if (helpSection != null) {
                if (helpSection.getContent().contains(element.outerHtml()) == false) {
                    helpSection.setContent(helpSection.getContent() + element.outerHtml());
                }
            }
        }

        if (headerTags.contains(element.tagName().toLowerCase())) {
            String title = element.html();

            if (helpSection != null) {
                //save old section
                addHelpSection(helpSections, helpSection);
            }

            String titleSplit[] = title.split(" ");

            helpSection = new HelpSection();
            helpSection.setTitle(title);
            helpSection.setHeaderLevel(Convert.toInteger(element.tagName().toLowerCase().replace("h", "")));
            helpSection.setSectionNumber(titleSplit[0]);
            helpSection.setContent("");

            if (title.contains("*")) {
                helpSection.setAdminSection(true);
            } else {
                helpSection.setAdminSection(false);
            }

            capture = true;
        }
    }
    //Add last section
    if (helpSection != null) {
        addHelpSection(helpSections, helpSection);
    }

    return helpSections;
}

From source file:info.mikaelsvensson.devtools.sitesearch.SiteSearchPlugin.java

private IndexEntry createIndexEntry(final File file) {
    try {//from  w ww.  j a  va 2 s.co m
        Document document = Jsoup.parse(file, "UTF-8", "http://invalid.host");
        Element contentEl = document.getElementById("contentBox");
        if (contentEl == null) {
            contentEl = document.body();
        }
        if (contentEl != null) {
            String text = Jsoup.clean(contentEl.html(), Whitelist.simpleText());
            Collection<WordCount> wordCount = getWordCount(text);
            Collection<WordCount> filteredWordCount = filterWordCount(wordCount);
            return new IndexEntry(document.title(), getRelativePath(getSiteOutputFolder(), file),
                    filteredWordCount);
        }
    } catch (IOException e) {
        e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
    }
    return null;
}