Example usage for org.jsoup.nodes Document body

Introduction

On this page you can find example usages of the org.jsoup.nodes.Document.body() method.

Prototype

public Element body()

Source Link

Document

Accessor to the document's body element.

Usage

From source file:edu.stanford.muse.index.NER.java

/**
 * Downloads the page at {@code url}, reduces it to the plain text of its HTML body and
 * passes that text to {@code namesFromText}.
 *
 * @param url               address to fetch; must yield an {@link HttpURLConnection}
 * @param removeCommonNames forwarded unchanged to {@code namesFromText}
 * @return whatever {@code namesFromText} returns for the page's body text
 * @throws ClassCastException     if the URL does not open an HTTP(S) connection
 * @throws IOException            if connecting to or reading from the URL fails
 * @throws ClassNotFoundException declared by the original signature; presumably propagated
 *                                from {@code namesFromText}
 */
public static List<Pair<String, Float>> namesFromArchive(String url, boolean removeCommonNames)
        throws ClassCastException, IOException, ClassNotFoundException {
    // only http conns allowed currently
    HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();

    conn.setInstanceFollowRedirects(true);
    conn.setRequestProperty("User-agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");

    byte[] b;
    java.io.InputStream in = null;
    try {
        conn.connect();
        in = conn.getInputStream();
        b = Util.getBytesFromStream(in);
    } finally {
        // Always release the stream and the connection, including when reading fails.
        // Closing twice is harmless should the Util helper already have closed the stream.
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // the payload (if any) has already been read; a failing close must not mask it
            }
        }
        conn.disconnect();
    }

    String text = new String(b, "UTF-8");
    // NOTE(review): entities are unescaped before the HTML is parsed, so an escaped "&lt;"
    // may be seen as markup by the parser - confirm this is intended.
    text = Util.unescapeHTML(text);
    org.jsoup.nodes.Document doc = Jsoup.parse(text);
    text = doc.body().text();
    return namesFromText(text, removeCommonNames);
}

From source file:edu.stanford.muse.index.NER.java

/**
 * Downloads the page at {@code url}, reduces it to the plain text of its HTML body and
 * returns the names found in it as a list of {@code <name, #occurrences>} pairs.
 *
 * @param url               address to fetch; must yield an {@link HttpURLConnection}
 * @param removeCommonNames forwarded unchanged to {@code namesFromText}
 * @return whatever {@code namesFromText} returns for the page's body text
 * @throws ClassCastException     if the URL does not open an HTTP(S) connection
 * @throws IOException            if connecting to or reading from the URL fails
 * @throws ClassNotFoundException declared by the original signature; presumably propagated
 *                                from {@code namesFromText}
 */
public static List<Pair<String, Float>> namesFromURL(String url, boolean removeCommonNames)
        throws ClassCastException, IOException, ClassNotFoundException {
    // only http conns allowed currently
    Indexer.log.info(url);
    HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
    conn.setInstanceFollowRedirects(true);
    conn.setRequestProperty("User-agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");

    byte[] b;
    java.io.InputStream in = null;
    try {
        conn.connect();
        Indexer.log.info("url for extracting names:" + conn.getURL());
        in = conn.getInputStream();
        b = Util.getBytesFromStream(in);
    } finally {
        // Always release the stream and the connection, including when reading fails.
        // Closing twice is harmless should the Util helper already have closed the stream.
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // the payload (if any) has already been read; a failing close must not mask it
            }
        }
        conn.disconnect();
    }

    String text = new String(b, "UTF-8");
    // NOTE(review): entities are unescaped before the HTML is parsed, so an escaped "&lt;"
    // may be seen as markup by the parser - confirm this is intended.
    text = Util.unescapeHTML(text);
    org.jsoup.nodes.Document doc = Jsoup.parse(text);
    text = doc.body().text();
    return namesFromText(text, removeCommonNames);
}

From source file:io.andyc.papercut.api.PrintApi.java

/**
 * Parses the final form submission HTML and extracts the submit job URL
 * which can be used to check the file upload status.
 *
 * @param prevElement the document returned by the final file upload form submission;
 *                    the data content of its body is searched with
 *                    {@code PrintApi.statusPathPattern}
 * @param baseDomain  an absolute URL of the server; only its scheme and authority
 *                    (host and optional port) are used, any path is dropped
 *
 * @return the full URL used to check the status of the file upload
 *
 * @throws MalformedURLException if {@code baseDomain} is not a valid URL
 * @throws IllegalStateException if the status path or the upload id cannot be found
 */
static String getStatusCheckURL(Document prevElement, String baseDomain) throws MalformedURLException {

    Matcher match = PrintApi.statusPathPattern.matcher(prevElement.body().data());
    if (!match.find()) {
        throw new IllegalStateException("No job status path found in the upload form response");
    }
    String printPathVariable = match.group();

    match = PrintApi.urlPattern.matcher(printPathVariable);
    if (!match.find()) {
        throw new IllegalStateException("No upload id found in status path: " + printPathVariable);
    }
    String uploadId = match.group().replace("'", "");

    // Rebuild the origin from the parsed URL instead of regex-replacing the path inside the
    // raw string: a path of "/" used to strip every slash ("http://host/" -> "http:host"),
    // and regex metacharacters in the path were interpreted as a pattern.
    URL u = new URL(baseDomain);
    String url = u.getProtocol() + "://" + u.getAuthority();
    return url + "/rpc/web-print/job-status/" + uploadId + ".json";
}

From source file:com.gsr.myschool.server.service.impl.InboxServiceImpl.java

/**
 * Loads the inbox messages found by {@code findByParentUser_id} for the given user and
 * fills each message's raw content with the plain text of its HTML content.
 *
 * @param userId id passed to the repository lookup
 * @return the messages returned by the repository, each with its raw content set
 */
@Override
public List<InboxMessage> findAllInboxMessage(Long userId) {
    final List<InboxMessage> inbox = inboxMessageRepos.findByParentUser_id(userId);
    for (final InboxMessage entry : inbox) {
        // strip the markup: parse the stored HTML and keep only the body's text
        final String plainText = Jsoup.parse(entry.getContent()).body().text();
        entry.setRawContent(plainText);
    }
    return inbox;
}

From source file:com.example.muzei.muzeiapod.ApodNasaArtSource.java

/**
 * Fetches the APOD front page, scrapes the current picture's image URI, title and byline
 * from it, publishes them as the new artwork and schedules the next update.
 *
 * @param reason update reason supplied by the framework; not used by this method
 * @throws RetryException if the page could not be fetched or does not have the expected
 *                        structure (two leading body children, an image inside the last
 *                        anchor of the first one, and a title element in the second one)
 */
@Override
protected void onTryUpdate(int reason) throws RetryException {
    // The literal is a well-formed URI, so URI.create cannot fail here.
    URI topUri = URI.create("http://apod.nasa.gov/");

    URI mainUri = topUri.resolve("/apod/astropix.html");
    String bodyStr = getURLContent(mainUri.toString());
    if (bodyStr == null) {
        // nothing to parse; let the framework try again later instead of crashing
        throw new RetryException();
    }

    /* TODO code below should go to a separate method/class */

    /* start parsing page */
    Document doc = Jsoup.parse(bodyStr);
    Element body = doc.body();
    if (body == null || body.children().size() < 2) {
        throw new RetryException();
    }

    /* get image URI */
    Element firstCenterTag = body.child(0);
    Element imgAnchor = firstCenterTag.getElementsByTag("a").last();
    Element img = (imgAnchor == null) ? null : imgAnchor.getElementsByTag("img").first();
    if (img == null) {
        // e.g. no anchor, or an anchor without an <img> inside it
        throw new RetryException();
    }
    URI bigImageUri = topUri.resolve("/apod/" + img.attr("src"));
    String uri = bigImageUri.toString();

    /* get title */
    Element secondCenterTag = body.child(1);
    if (secondCenterTag.children().isEmpty()) {
        throw new RetryException();
    }
    Element titleElem = secondCenterTag.child(0);
    String title = titleElem.text();

    /* get byline */
    String secondCenterText = secondCenterTag.text();
    /* byline: everything after 'title' above */
    int idx = secondCenterText.lastIndexOf(title) + title.length();
    String byline = secondCenterText.substring(idx).trim();

    /* TODO figure out the permanent link */
    String link = "http://apod.nasa.gov/apod/astropix.html";

    publishArtwork(new Artwork.Builder().title(title).byline(byline).imageUri(Uri.parse(uri)).token(title)
            .viewIntent(new Intent(Intent.ACTION_VIEW, Uri.parse(link))).build());
    scheduleUpdate(System.currentTimeMillis() + ROTATE_TIME_MILLIS);
}

From source file:net.orzo.data.Web.java

/**
 * Convenience overload: runs the element-based {@code queryPage} on the body of the
 * given document.
 *
 * @param document document whose body element is queried
 * @param select   forwarded unchanged to the element-based overload
 * @param fn       forwarded unchanged to the element-based overload
 * @return the result of the element-based {@code queryPage}
 */
public List<Element> queryPage(Document document, String select, ScriptFunction fn) {
    final Element root = document.body();
    return queryPage(root, select, fn);
}

From source file:net.sf.jabref.logic.fetcher.DoiResolution.java

/**
 * Tries to locate a full-text PDF by resolving the entry's DOI and scanning the landing
 * page for links to PDF documents.
 *
 * <p>A link is only returned when exactly one candidate is found on the page; zero or
 * several candidates yield an empty result.
 *
 * @param entry the entry whose "doi" field is resolved; must not be null
 * @return the single PDF link found on the resolved page, otherwise an empty Optional
 * @throws IOException declared by the signature; I/O failures while fetching or scanning
 *                     the page are logged and result in an empty Optional instead
 */
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
    Objects.requireNonNull(entry);

    Optional<DOI> doi = DOI.build(entry.getField("doi"));
    if (!doi.isPresent()) {
        return Optional.empty();
    }

    String sciLink = doi.get().getURLAsASCIIString();
    if (sciLink.isEmpty()) {
        return Optional.empty();
    }

    // follow all redirects and scan for a single pdf link
    try {
        Connection connection = Jsoup.connect(sciLink);
        connection.followRedirects(true);
        connection.ignoreHttpErrors(true);
        // some publishers are quite slow, so allow them 5 seconds
        connection.timeout(5000);

        Document html = connection.get();
        // scan for PDF
        Elements elements = html.body().select("[href]");
        // plain URLs: wrapping every list element in an Optional carried no information
        List<URL> links = new ArrayList<>();

        for (Element element : elements) {
            String href = element.attr("abs:href");
            // Only check if pdf is included in the link
            // See https://github.com/lehner/LocalCopy for scrape ideas
            if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) {
                links.add(new URL(href));
            }
        }
        // return if only one link was found (high accuracy)
        if (links.size() == 1) {
            LOGGER.info("Fulltext PDF found @ " + sciLink);
            return Optional.of(links.get(0));
        }
    } catch (IOException e) {
        LOGGER.warn("DoiResolution fetcher failed: ", e);
    }
    return Optional.empty();
}

From source file:net.sf.jabref.logic.fulltext.DoiResolution.java

/**
 * Tries to locate a full-text PDF by resolving the entry's DOI and scanning the landing
 * page for links to PDF documents.
 *
 * <p>A link is only returned when exactly one candidate is found on the page; zero or
 * several candidates yield an empty result.
 *
 * @param entry the entry whose DOI field is resolved; must not be null
 * @return the single PDF link found on the resolved page, otherwise an empty Optional
 * @throws IOException declared by the signature; I/O failures while fetching or scanning
 *                     the page are logged and result in an empty Optional instead
 */
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
    Objects.requireNonNull(entry);

    Optional<DOI> doi = entry.getFieldOptional(FieldName.DOI).flatMap(DOI::build);
    if (!doi.isPresent()) {
        return Optional.empty();
    }

    String sciLink = doi.get().getURIAsASCIIString();
    if (sciLink.isEmpty()) {
        return Optional.empty();
    }

    // follow all redirects and scan for a single pdf link
    try {
        Connection connection = Jsoup.connect(sciLink);
        connection.followRedirects(true);
        connection.ignoreHttpErrors(true);
        // some publishers are quite slow, so allow them 5 seconds
        connection.timeout(5000);

        Document html = connection.get();
        // scan for PDF
        Elements elements = html.body().select("[href]");
        // plain URLs: wrapping every list element in an Optional carried no information
        List<URL> links = new ArrayList<>();

        for (Element element : elements) {
            String href = element.attr("abs:href");
            // Only check if pdf is included in the link
            // See https://github.com/lehner/LocalCopy for scrape ideas
            if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) {
                links.add(new URL(href));
            }
        }
        // return if only one link was found (high accuracy)
        if (links.size() == 1) {
            LOGGER.info("Fulltext PDF found @ " + sciLink);
            return Optional.of(links.get(0));
        }
    } catch (IOException e) {
        LOGGER.warn("DoiResolution fetcher failed: ", e);
    }
    return Optional.empty();
}

From source file:mml.handler.post.MMLPostVersionHandler.java

/**
 * Handle a POST request.
 *
 * <p>Reads the {@code data} request parameter, parses it as JSON and, for every entry of
 * its {@code "layers"} array, parses the layer's HTML body fragment and adds one layer to
 * each of three scratch versions (plain text, default corcode, pages corcode). The three
 * versions are then saved and {@code "OK"} is written as {@code text/plain}.
 *
 * <p>If the {@code data} parameter is absent, nothing is saved and nothing is written to
 * the response.
 *
 * @param request the raw request
 * @param response the response we will write to
 * @param urn the rest of the URL after stripping off the context (not read by this method)
 * @throws MMLException wrapping any exception raised while handling the request
 */
public void handle(HttpServletRequest request, HttpServletResponse response, String urn) throws MMLException {
    try {
        String value = request.getParameter("data");
        if (value != null) {
            JSONObject jObj = (JSONObject) JSONValue.parse(value);
            // version defaults to "/base" when absent; otherwise it arrives URL-encoded
            this.version1 = (String) jObj.get(JSONKeys.VERSION1);
            if (version1 == null)
                version1 = "/base";
            else
                version1 = URLDecoder.decode(version1, "UTF-8");
            this.longName = (String) jObj.get(JSONKeys.LONGNAME);
            this.docid = (String) jObj.get(JSONKeys.DOCID);
            // must run after docid is set - presumably derives the dialect from it
            this.dialect = getDialectFromDocid();
            // NOTE(review): a payload without "layers" leaves this null and fails at
            // layers.size() below (reported as an MMLException via the catch) - confirm
            JSONArray layers = (JSONArray) jObj.get("layers");
            // one scratch version per output: default markup, page markup, plain text
            ScratchVersion corcodeDefault = new ScratchVersion(version1, longName, docid + "/default",
                    Database.CORCODE, null, true);
            ScratchVersion corcodePages = new ScratchVersion(version1, longName, docid + "/pages",
                    Database.CORCODE, null, true);
            ScratchVersion text = new ScratchVersion(version1, longName, docid, Database.CORTEX, null, true);
            this.style = ScratchVersionSet.getDefaultStyleName(docid);
            for (int i = 0; i < layers.size(); i++) {
                JSONObject layer = (JSONObject) layers.get(i);
                String name = (String) layer.get(JSONKeys.NAME);
                String html = (String) layer.get(JSONKeys.BODY);
                // fresh markup documents for every layer
                stil = new STILDocument(style);
                pages = new STILDocument(style);
                // reduce html to text, corcode-default and corcode-pages
                Document doc = Jsoup.parseBodyFragment(html);
                Element body = doc.body();
                // presumably fills the sb, stil and pages fields read below - verify
                parseBody(body);
                int num = ScratchVersion.layerNumber(name);
                // NOTE(review): unlike stil/pages, sb is not reset in this loop; confirm
                // that parseBody clears it, otherwise text accumulates across layers
                text.addLayer(sb.toString().toCharArray(), num);
                corcodeDefault.addLayer(stil.toString().toCharArray(), num);
                corcodePages.addLayer(pages.toString().toCharArray(), num);
            }
            Scratch.save(text);
            Scratch.save(corcodeDefault);
            Scratch.save(corcodePages);
            response.setContentType("text/plain");
            response.getWriter().write("OK");
        }
    } catch (Exception e) {
        // the message goes to stdout; the exception itself is rethrown wrapped
        System.out.println(e.getMessage());
        throw new MMLException(e);
    }
}

From source file:me.vertretungsplan.parser.TurboVertretungParser.java

/**
 * Logs in, loads every configured URL and builds the substitution schedule from the
 * loaded documents.
 *
 * <p>Each document's body HTML is cut at every {@code <p class="Titel">} marker; every
 * piece following a marker is re-parsed with the marker put back and handed to
 * {@code parseTurboVertretungDay}.
 *
 * @return the schedule created from {@code scheduleData}, filled by the day parser and
 *         with classes and teachers set
 * @throws IOException                declared by the signature; presumably raised while
 *                                    logging in or loading a URL
 * @throws JSONException              if the configuration lacks the expected URL entries
 * @throws CredentialInvalidException declared by the signature; presumably raised by the
 *                                    login step
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    LoginHandler loginHandler = new LoginHandler(scheduleData, credential, cookieProvider);
    loginHandler.handleLogin(executor, cookieStore);

    final JSONArray urlEntries = data.getJSONArray(PARAM_URLS);
    final String encoding = data.optString(PARAM_ENCODING, null);
    final List<Document> pages = new ArrayList<>();

    final SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    for (int index = 0; index < urlEntries.length(); index++) {
        // backwards compatibility: an entry may be an object carrying the address under "url"
        final String url = urlEntries.get(index) instanceof JSONObject
                ? urlEntries.getJSONObject(index).getString("url")
                : urlEntries.getString(index);
        loadUrl(url, encoding, pages);
    }

    final String dayMarker = "<p class=\"Titel\">";
    for (Document page : pages) {
        final String[] chunks = page.body().html().split(dayMarker);
        // chunks[0] is whatever precedes the first marker and is skipped
        for (int k = 1; k < chunks.length; k++) {
            parseTurboVertretungDay(schedule, Jsoup.parse(dayMarker + chunks[k]));
        }
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());

    return schedule;
}