List of usage examples for org.jsoup.nodes.Document.body()
public Element body()
From source file:edu.stanford.muse.index.NER.java
public static List<Pair<String, Float>> namesFromArchive(String url, boolean removeCommonNames) throws ClassCastException, IOException, ClassNotFoundException { // only http conns allowed currently HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection(); conn.setInstanceFollowRedirects(true); conn.setRequestProperty("User-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:6.0.2) Gecko/20100101 Firefox/6.0.2"); conn.connect();//w w w . ja v a 2s . c o m byte[] b = Util.getBytesFromStream(conn.getInputStream()); String text = new String(b, "UTF-8"); text = Util.unescapeHTML(text); org.jsoup.nodes.Document doc = Jsoup.parse(text); text = doc.body().text(); return namesFromText(text, removeCommonNames); }
From source file:edu.stanford.muse.index.NER.java
/** returns a list of <name, #occurrences> */ public static List<Pair<String, Float>> namesFromURL(String url, boolean removeCommonNames) throws ClassCastException, IOException, ClassNotFoundException { // only http conns allowed currently Indexer.log.info(url);/*from w w w.j av a 2s . c o m*/ HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection(); conn.setInstanceFollowRedirects(true); conn.setRequestProperty("User-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:6.0.2) Gecko/20100101 Firefox/6.0.2"); conn.connect(); Indexer.log.info("url for extracting names:" + conn.getURL()); byte[] b = Util.getBytesFromStream(conn.getInputStream()); String text = new String(b, "UTF-8"); text = Util.unescapeHTML(text); org.jsoup.nodes.Document doc = Jsoup.parse(text); text = doc.body().text(); return namesFromText(text, removeCommonNames); }
From source file:io.andyc.papercut.api.PrintApi.java
/**
 * Parses the final form submission HTML and extracts the submit job URL
 * which can be used to check the file upload status.
 *
 * @param prevElement the final file upload form HTML result
 * @param baseDomain  a URL on the target server; only its scheme and authority
 *                    (host and optional port) are used for the result
 *
 * @return the full URL used to check the status of the file upload
 *
 * @throws MalformedURLException if {@code baseDomain} is not a valid URL
 * @throws IllegalStateException if the page data does not contain the expected
 *                               status path or upload id
 */
static String getStatusCheckURL(Document prevElement, String baseDomain) throws MalformedURLException {
    Matcher match = PrintApi.statusPathPattern.matcher(prevElement.body().data());
    if (!match.find()) {
        throw new IllegalStateException("No job status path found in the upload result page");
    }
    String printPathVariable = match.group();

    match = PrintApi.urlPattern.matcher(printPathVariable);
    if (!match.find()) {
        throw new IllegalStateException("No upload id found in: " + printPathVariable);
    }
    String uploadId = match.group().replace("'", "");

    // Rebuild the server root from the parsed URL instead of regex-stripping the path
    // from the raw string: a path of "/" would otherwise also remove the "//" after
    // the scheme, and paths containing regex metacharacters would be misinterpreted.
    URL u = new URL(baseDomain);
    String url = u.getProtocol() + "://" + u.getAuthority();
    return url + "/rpc/web-print/job-status/" + uploadId + ".json";
}
From source file:com.gsr.myschool.server.service.impl.InboxServiceImpl.java
@Override public List<InboxMessage> findAllInboxMessage(Long userId) { List<InboxMessage> messages = inboxMessageRepos.findByParentUser_id(userId); for (InboxMessage message : messages) { Document doc = Jsoup.parse(message.getContent()); message.setRawContent(doc.body().text()); }//w w w . j av a2 s . c o m return messages; }
From source file:com.example.muzei.muzeiapod.ApodNasaArtSource.java
@Override protected void onTryUpdate(int reason) throws RetryException { URI topUri;//from www . jav a 2s.c o m try { topUri = new URI("http://apod.nasa.gov/"); } catch (URISyntaxException e) { return; } URI mainUri = topUri.resolve("/apod/astropix.html"); String bodyStr = getURLContent(mainUri.toString()); /* TODO code below should go to a separate method/class */ /* start parsing page */ Document doc = Jsoup.parse(bodyStr); Element body = doc.body(); /* get image URI */ Element firstCenterTag = body.child(0); Element imgAnchor = firstCenterTag.getElementsByTag("a").last(); Element img = imgAnchor.getElementsByTag("img").first(); URI bigImageUri = topUri.resolve("/apod/" + img.attr("src")); String uri = bigImageUri.toString(); /* get title */ Element secondCenterTag = body.child(1); Element titleElem = secondCenterTag.child(0); String title = titleElem.text(); /* get byline */ String secondCenterText = secondCenterTag.text(); /* byline: everything after 'title' above */ int idx = secondCenterText.lastIndexOf(title) + title.length(); String byline = secondCenterText.substring(idx).trim(); /* TODO figure out the permanent link */ String link = "http://apod.nasa.gov/apod/astropix.html"; publishArtwork(new Artwork.Builder().title(title).byline(byline).imageUri(Uri.parse(uri)).token(title) .viewIntent(new Intent(Intent.ACTION_VIEW, Uri.parse(link))).build()); scheduleUpdate(System.currentTimeMillis() + ROTATE_TIME_MILLIS); }
From source file:net.orzo.data.Web.java
/**
 * Convenience overload that runs the query against the body element of a whole document.
 *
 * @param document document whose body is used as the query root
 * @param select   passed through unchanged to the element-based overload
 * @param fn       passed through unchanged to the element-based overload
 * @return whatever the element-based {@code queryPage} overload returns
 */
public List<Element> queryPage(Document document, String select, ScriptFunction fn) {
    Element root = document.body();
    return queryPage(root, select, fn);
}
From source file:net.sf.jabref.logic.fetcher.DoiResolution.java
@Override public Optional<URL> findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); Optional<URL> pdfLink = Optional.empty(); Optional<DOI> doi = DOI.build(entry.getField("doi")); if (doi.isPresent()) { String sciLink = doi.get().getURLAsASCIIString(); // follow all redirects and scan for a single pdf link if (!sciLink.isEmpty()) { try { Connection connection = Jsoup.connect(sciLink); connection.followRedirects(true); connection.ignoreHttpErrors(true); // some publishers are quite slow (default is 3s) connection.timeout(5000); Document html = connection.get(); // scan for PDF Elements elements = html.body().select("[href]"); List<Optional<URL>> links = new ArrayList<>(); for (Element element : elements) { String href = element.attr("abs:href"); // Only check if pdf is included in the link // See https://github.com/lehner/LocalCopy for scrape ideas if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) { links.add(Optional.of(new URL(href))); }/*from www .j ava 2 s .c o m*/ } // return if only one link was found (high accuracy) if (links.size() == 1) { LOGGER.info("Fulltext PDF found @ " + sciLink); pdfLink = links.get(0); } } catch (IOException e) { LOGGER.warn("DoiResolution fetcher failed: ", e); } } } return pdfLink; }
From source file:net.sf.jabref.logic.fulltext.DoiResolution.java
@Override public Optional<URL> findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); Optional<URL> pdfLink = Optional.empty(); Optional<DOI> doi = entry.getFieldOptional(FieldName.DOI).flatMap(DOI::build); if (doi.isPresent()) { String sciLink = doi.get().getURIAsASCIIString(); // follow all redirects and scan for a single pdf link if (!sciLink.isEmpty()) { try { Connection connection = Jsoup.connect(sciLink); connection.followRedirects(true); connection.ignoreHttpErrors(true); // some publishers are quite slow (default is 3s) connection.timeout(5000); Document html = connection.get(); // scan for PDF Elements elements = html.body().select("[href]"); List<Optional<URL>> links = new ArrayList<>(); for (Element element : elements) { String href = element.attr("abs:href"); // Only check if pdf is included in the link // See https://github.com/lehner/LocalCopy for scrape ideas if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) { links.add(Optional.of(new URL(href))); }// w w w . j ava2 s . c o m } // return if only one link was found (high accuracy) if (links.size() == 1) { LOGGER.info("Fulltext PDF found @ " + sciLink); pdfLink = links.get(0); } } catch (IOException e) { LOGGER.warn("DoiResolution fetcher failed: ", e); } } } return pdfLink; }
From source file:mml.handler.post.MMLPostVersionHandler.java
/** * Handle a POST request//from ww w.ja v a 2 s. co m * @param request the raw request * @param response the response we will write to * @param urn the rest of the URL after stripping off the context * @throws MMLException */ public void handle(HttpServletRequest request, HttpServletResponse response, String urn) throws MMLException { try { String value = request.getParameter("data"); if (value != null) { JSONObject jObj = (JSONObject) JSONValue.parse(value); this.version1 = (String) jObj.get(JSONKeys.VERSION1); if (version1 == null) version1 = "/base"; else version1 = URLDecoder.decode(version1, "UTF-8"); this.longName = (String) jObj.get(JSONKeys.LONGNAME); this.docid = (String) jObj.get(JSONKeys.DOCID); this.dialect = getDialectFromDocid(); JSONArray layers = (JSONArray) jObj.get("layers"); ScratchVersion corcodeDefault = new ScratchVersion(version1, longName, docid + "/default", Database.CORCODE, null, true); ScratchVersion corcodePages = new ScratchVersion(version1, longName, docid + "/pages", Database.CORCODE, null, true); ScratchVersion text = new ScratchVersion(version1, longName, docid, Database.CORTEX, null, true); this.style = ScratchVersionSet.getDefaultStyleName(docid); for (int i = 0; i < layers.size(); i++) { JSONObject layer = (JSONObject) layers.get(i); String name = (String) layer.get(JSONKeys.NAME); String html = (String) layer.get(JSONKeys.BODY); stil = new STILDocument(style); pages = new STILDocument(style); // reduce html to text, corcode-default and corcode-pages Document doc = Jsoup.parseBodyFragment(html); Element body = doc.body(); parseBody(body); int num = ScratchVersion.layerNumber(name); text.addLayer(sb.toString().toCharArray(), num); corcodeDefault.addLayer(stil.toString().toCharArray(), num); corcodePages.addLayer(pages.toString().toCharArray(), num); } Scratch.save(text); Scratch.save(corcodeDefault); Scratch.save(corcodePages); response.setContentType("text/plain"); response.getWriter().write("OK"); } } catch (Exception e) { 
System.out.println(e.getMessage()); throw new MMLException(e); } }
From source file:me.vertretungsplan.parser.TurboVertretungParser.java
@Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); // JSONArray urls = data.getJSONArray(PARAM_URLS); String encoding = data.optString(PARAM_ENCODING, null); List<Document> docs = new ArrayList<>(); SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); for (int i = 0; i < urls.length(); i++) { String url;/*w ww. j a v a2 s . co m*/ if (urls.get(i) instanceof JSONObject) { // backwards compatibility url = urls.getJSONObject(i).getString("url"); } else { url = urls.getString(i); } loadUrl(url, encoding, docs); } for (Document doc : docs) { String html = doc.body().html(); String[] parts = html.split("<p class=\"Titel\">"); for (int i = 1; i < parts.length; i++) { Document partDoc = Jsoup.parse("<p class=\"Titel\">" + parts[i]); parseTurboVertretungDay(v, partDoc); } } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }