List of usage examples for org.jsoup.nodes Document title
public String title()
From source file:com.techcavern.wavetact.ircCommands.utils.Title.java
/**
 * Fetches the page whose URL is the first command argument and sends the page's
 * HTML title through {@code IRCUtils.sendMessage}.
 *
 * @param args command arguments; {@code args[0]} is used as the URL to fetch
 * @throws Exception anything raised while fetching the page or sending the message
 */
@Override
public void onCommand(User user, PircBotX network, String prefix, Channel channel, boolean isPrivate,
        int userPermLevel, String... args) throws Exception {
    // The request is sent with a desktop Chrome user-agent string.
    final String browserUserAgent =
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17";
    final String pageTitle = Jsoup.connect(args[0]).userAgent(browserUserAgent).get().title();
    IRCUtils.sendMessage(user, network, channel, pageTitle, prefix);
}
From source file:me.vertretungsplan.parser.DSBLightParser.java
/**
 * Downloads the page at {@code url} and, if it looks like an Untis monitor page, parses it
 * into {@code schedule} and follows its {@code meta http-equiv=refresh} redirect recursively.
 *
 * <p>A page counts as an Untis page when its title or HTML contains "untis"
 * (case-insensitively) or it contains an element with class {@code mon_list}.
 *
 * @param url      page to download
 * @param referer  passed through to {@code httpGet}
 * @param schedule schedule that parsed days are added to
 * @param startUrl URL the chain started at; recursion stops when a redirect points back to it
 */
private void parseDay(String url, Map<String, String> referer, SubstitutionSchedule schedule, String startUrl)
        throws IOException, JSONException, CredentialInvalidException {
    String html = httpGet(url, data.optString(PARAM_ENCODING, null), referer);
    Document doc = Jsoup.parse(html);
    if (doc.title().toLowerCase().contains("untis") || doc.html().toLowerCase().contains("untis")
            || doc.select(".mon_list").size() > 0) {
        parseMultipleMonitorDays(schedule, doc, data);

        Element meta = doc.select("meta[http-equiv=refresh]").first();
        if (meta != null) {
            String attr = meta.attr("content").toLowerCase();
            int targetStart = attr.indexOf("url=");
            // A refresh tag without "url=" (e.g. content="30") only reloads the same page.
            // Without this guard indexOf() returns -1 and substring(3) either throws
            // StringIndexOutOfBoundsException or produces a bogus redirect target.
            if (targetStart >= 0) {
                // The redirect target is resolved relative to the directory of the current URL.
                String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1)
                        + attr.substring(targetStart + 4);
                if (!redirectUrl.equals(startUrl)) {
                    parseDay(redirectUrl, referer, schedule, startUrl);
                }
            }
        }
    }
}
From source file:com.illustrationfinder.process.post.HtmlPostProcessor.java
@Override public List<String> generateKeywords() { // TODO If two words are always close to each other, they should be considered as an expression and managed like one word if (this.url == null) return null; try {/*from w w w.jav a 2 s. c om*/ // Retrieve the document and store it temporary try (final InputStream stream = this.url.openStream()) { final String rawText = IOUtils.toString(stream); // Retrieve useful HTML data final Document document = Jsoup.parse(rawText); String htmlTitle = document.title(); String htmlKeywords = document.select("meta[name=keywords]").text(); String htmlDescription = document.select("meta[name=description]").text(); // Extract the content of the raw text String content = ArticleExtractor.getInstance().getText(rawText); // Now we apply a simple algorithm to get keywords // 1) We remove all punctuation marks from the title // 2) We remove all words with less than 4 characters // 3) We remove excessive spacing and tabulations htmlTitle = htmlTitle.toLowerCase(); htmlTitle = htmlTitle.replaceAll(PUNCTUATION_REGEX, ""); htmlTitle = htmlTitle.replaceAll(WORD_WITH_LESS_THAN_4_CHARACTERS_REGEX, ""); htmlTitle = htmlTitle.replaceAll(EXCESSIVE_SPACING_REGEX, " "); final List<String> keywords = new ArrayList<>(); final List<String> keywordsList = Arrays.asList(htmlTitle.split(" ")); for (String tmp : keywordsList) { if (tmp.length() >= MINIMUM_WORD_LENGTH) { keywords.add(tmp); } } // If there is enough keywords, we return if (keywords.size() >= MINIMUM_KEYWORDS_COUNT) { return keywords; } else { // Otherwise, we look for more keywords from the text by taking the more frequent words content = content.toLowerCase(); content = content.replaceAll(PUNCTUATION_REGEX, ""); content = content.replaceAll(WORD_WITH_LESS_THAN_4_CHARACTERS_REGEX, ""); content = content.replaceAll(EXCESSIVE_SPACING_REGEX, " "); final Map<String, Integer> frequencies = new HashMap<>(); final String[] words = content.split(" "); // Count word frequencies for (final String word 
: words) { if (frequencies.containsKey(word)) { frequencies.put(word, frequencies.get(word) + 1); } else { frequencies.put(word, 1); } } // Sort the words per frequency final SortedMap<Integer, HashSet<String>> sortedWords = new TreeMap<>(); for (Map.Entry<String, Integer> entry : frequencies.entrySet()) { if (sortedWords.containsKey(entry.getValue())) { sortedWords.get(entry.getValue()).add(entry.getKey()); } else { final HashSet<String> set = new HashSet<>(); set.add(entry.getKey()); sortedWords.put(entry.getValue(), set); } } // Add the most frequent words until we reach the minimu keywords count while (keywords.size() < MINIMUM_KEYWORDS_COUNT) { final HashSet<String> set = sortedWords.get(sortedWords.lastKey()); final String keyword = set.iterator().next(); set.remove(keyword); if (set.size() == 0) { sortedWords.remove(sortedWords.lastKey()); } if (keyword.length() > MINIMUM_WORD_LENGTH) { keywords.add(keyword); } } return keywords; } } } catch (BoilerpipeProcessingException e) { // TODO e.printStackTrace(); } catch (IOException e) { // TODO e.printStackTrace(); } return null; }
From source file:me.vertretungsplan.parser.DSBMobileParser.java
/**
 * Downloads the page at {@code url}, detects which schedule software produced it, parses it
 * into {@code v}, and then follows its {@code meta http-equiv=refresh} redirect (if any).
 *
 * <p>Detection order: Untis, DaVinci, Indiware, then the "no content provided" placeholder
 * page (which is silently skipped). Anything else raises {@link IncompatibleScheduleException}.
 *
 * @param v        schedule that parsed days are added to
 * @param url      page to download
 * @param usedUrls URLs already visited; {@code url} is appended, and redirects to a URL in
 *                 this list are not followed
 * @throws IOException                   if a DaVinci page has a different number of headings
 *                                       and tables (other sources are not visible here)
 * @throws IncompatibleScheduleException if the page matches none of the known formats
 */
private void loadScheduleFromUrl(SubstitutionSchedule v, String url, List<String> usedUrls)
        throws IOException, JSONException, CredentialInvalidException, IncompatibleScheduleException {
    usedUrls.add(url);
    String html = httpGet(url, data.has(PARAM_ENCODING) ? data.optString(PARAM_ENCODING, null) : "UTF-8");
    Document doc = Jsoup.parse(html);

    if (doc.title().toLowerCase().contains("untis") || doc.html().toLowerCase().contains("untis")
            || data.optString(PARAM_TYPE, "").equals("untis")) {
        parseMultipleMonitorDays(v, doc, data);
    } else if (doc.html().toLowerCase().contains("created by davinci")
            || data.optString(PARAM_TYPE, "").equals("davinci")) {
        Elements titles = doc.select("h2");
        Elements tables = doc.select("h2 + p + table");
        if (titles.size() != tables.size()) {
            // Message restored: the U-umlaut of "Ueberschriften" had been lost.
            throw new IOException("Anzahl \u00dcberschriften != Anzahl Tabellen");
        }
        for (int i = 0; i < titles.size(); i++) {
            SubstitutionScheduleDay day = new SubstitutionScheduleDay();
            String date = titles.get(i).text();
            day.setDateString(date);
            day.setDate(ParserUtils.parseDate(date));
            DaVinciParser.parseDaVinciTable(tables.get(i), v, day, colorProvider);
            v.addDay(day);
        }
    } else if (doc.select(".tdaktionen").size() > 0 || data.optString(PARAM_TYPE, "").equals("indiware")) {
        new IndiwareParser(scheduleData, cookieProvider).parseIndiwarePage(v, doc.html());
    } else if (doc.text().matches(".*F\u00fcr diesen Bereich.*wurde kein Inhalt bereitgestellt\\.")) {
        // Placeholder page. The pattern previously read "Fr diesen Bereich" (umlaut lost),
        // which can never match the German text, so such pages were rejected as incompatible.
        return;
    } else {
        throw new IncompatibleScheduleException();
    }

    Element meta = doc.select("meta[http-equiv=refresh]").first();
    if (meta != null) {
        String attr = meta.attr("content").toLowerCase();
        int targetStart = attr.indexOf("url=");
        // A refresh tag without "url=" only reloads the same page. Without this guard
        // indexOf() returns -1 and substring(3) throws or yields a bogus redirect target.
        if (targetStart >= 0) {
            // The redirect target is resolved relative to the directory of the current URL.
            String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(targetStart + 4);
            if (!usedUrls.contains(redirectUrl)) {
                loadScheduleFromUrl(v, redirectUrl, usedUrls);
            }
        }
    }
}
From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java
/** * ?page??/* www . jav a 2 s.c o m*/ */ @Override public void visit(Page page) { try { String url = page.getWebURL().getURL(); page.setContentType("text/html; charset=" + gather.getEncoding()); Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get(); String title = doc.title(); if (gather.getTitleExternal() && gather.getTitleRegex() != null && gather.getTitleRegex().length() > 0) { Elements titleEles = doc.select(gather.getTitleRegex()); if (!titleEles.isEmpty()) { String tempTitle = titleEles.text(); if (tempTitle != null && tempTitle.length() > 0) { title = tempTitle; } } } if (title != null && title.trim().length() > 0) { Elements elements = doc.select(matchRegex); if (filterRegex != null && filterRegex.trim().length() > 0) { elements = elements.not(filterRegex); } if (!elements.isEmpty()) { String subHtml = elements.html(); Document blockDoc = Jsoup.parse(subHtml); String contentText = blockDoc.html(); if (gather.getRemoveHref()) { Document moveDoc = Jsoup.parse(contentText); Elements moveEles = moveDoc.select("*").not("a"); contentText = moveEles.html(); } if (gather.getRemoveHtmlTag()) contentText = doc.text(); if (isLocal) { contentText = doc.text(); Boolean isMatcher = true; for (int i = 0; i < keys.length; i++) { Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find(); if (!result) { isMatcher = false; break; } } if (isMatcher) { Storage storage = new Storage(); storage.setGatherId(gather.getId()); storage.setGatherName(gather.getName()); storage.setTitle(title); storage.setUrl(url); try { gatherService.addStorage(storage); } catch (Exception e) { logger.error("save storage error : {}", e.getLocalizedMessage()); } finally { storage = null; } } } else { Content content = new Content(); content.setDetail(contentText); content.setPage(1); List<Content> contents = new ArrayList<Content>(); contents.add(content); Article article = new Article(); article.setTitle(title); article.setContents(contents); 
articleMainService.addArticleMainByCrawler(article, gather.getChannelId(), CrawlerUtil.USER_NAME); } } } } catch (IOException e) { logger.warn(e.getLocalizedMessage()); } }
From source file:com.josue.lottery.eap.service.core.LotoImporter.java
private void parseHtml(File file) { // String html = "<html><head><title>First parse</title></head>" // + "<body><p>Parsed HTML into a doc.</p>" // +/* www . j a va2s .c o m*/ // " <table><tr><td>satu</td><td>satu-1</td></tr><tr><td>dua</td><td>dua-1</td></tr><tr><td>tiga</td><td>tiga-1</td></tr></table> " // + "</body></html>"; StringBuilder sb = new StringBuilder(); BufferedReader br = null; try { br = new BufferedReader(new FileReader(file)); } catch (FileNotFoundException ex) { java.util.logging.Logger.getLogger(LotoImporter.class.getName()).log(Level.SEVERE, null, ex); } String line; try { while ((line = br.readLine()) != null) { sb.append(line); } } catch (IOException ex) { java.util.logging.Logger.getLogger(LotoImporter.class.getName()).log(Level.SEVERE, null, ex); } Document doc = Jsoup.parse(sb.toString()); Element table = doc.select("table").first(); Iterator<Element> iterator = table.select("td").iterator(); while (iterator.hasNext()) { logger.info("text : " + iterator.next().text()); } String title = doc.title(); System.out.println("Document title : " + title); }
From source file:me.vertretungsplan.parser.SVPlanParser.java
/**
 * Parses one day of an SVPlan schedule from {@code svp} and adds it to {@code v}.
 *
 * <p>The element is accepted when it contains a date/title element or when the document
 * title starts with "Vertretungsplan f&uuml;r ". Table rows become {@code Substitution}s, with
 * each cell mapped by its CSS class; an empty lesson or class cell inherits the value of
 * the previous row. Additional sections are added to the day as messages.
 *
 * @param v   schedule the parsed day is added to
 * @param svp element containing the day's plan
 * @param doc the whole document, used for the title and for header detection
 * @throws IOException if neither a date/title element nor a matching document title is found
 */
private void parseSvPlanDay(SubstitutionSchedule v, Element svp, Document doc) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();
    // The title prefix previously read "Vertretungsplan fr " (umlaut lost), which can
    // never match a real German title.
    if (svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() > 0
            || doc.title().startsWith("Vertretungsplan f\u00fcr ")) {
        setDate(svp, doc, day);

        if (svp.select(".svp-tabelle, table:has(.Klasse)").size() > 0) {
            Elements rows = svp.select(".svp-tabelle tr, table:has(.Klasse) tr");
            String lastLesson = "";
            String lastClass = "";
            for (Element row : rows) {
                // Skip header rows and rows without text.
                if ((doc.select(".svp-header").size() > 0 && row.hasClass("svp-header"))
                        || row.select("th").size() > 0 || row.text().trim().equals("")) {
                    continue;
                }

                Substitution substitution = new Substitution();

                for (Element column : row.select("td")) {
                    String type = column.className();
                    if (!hasData(column.text())) {
                        // Empty lesson/class cells repeat the value from the previous row.
                        if ((type.startsWith("svp-stunde") || type.startsWith("Stunde")) && hasData(lastLesson)) {
                            substitution.setLesson(lastLesson);
                        } else if ((type.startsWith("svp-klasse") || type.startsWith("Klasse"))
                                && hasData(lastClass)) {
                            substitution.getClasses().addAll(Arrays
                                    .asList(lastClass.split(data.optString(PARAM_CLASS_SEPARATOR, ", "))));
                        }
                        continue;
                    }
                    if (type.startsWith("svp-stunde") || type.startsWith("Stunde")) {
                        substitution.setLesson(column.text());
                        lastLesson = column.text();
                    } else if (type.startsWith("svp-klasse") || type.startsWith("Klasse")) {
                        substitution.getClasses().addAll(Arrays
                                .asList(column.text().split(data.optString(PARAM_CLASS_SEPARATOR, ", "))));
                        lastClass = column.text();
                    } else if (type.startsWith("svp-esfehlt") || type.startsWith("Lehrer")) {
                        if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                            substitution.setPreviousTeacher(column.text());
                        }
                    } else if (type.startsWith("svp-esvertritt") || type.startsWith("Vertretung")) {
                        if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                            // Drop a trailing " +" from the teacher cell.
                            substitution.setTeacher(column.text().replaceAll(" \\+$", ""));
                        }
                    } else if (type.startsWith("svp-fach") || type.startsWith("Fach")) {
                        substitution.setSubject(column.text());
                    } else if (type.startsWith("svp-bemerkung") || type.startsWith("Anmerkung")) {
                        substitution.setDesc(column.text());
                        String recognizedType = recognizeType(column.text());
                        substitution.setType(recognizedType);
                        substitution.setColor(colorProvider.getColor(recognizedType));
                    } else if (type.startsWith("svp-raum") || type.startsWith("Raum")) {
                        substitution.setRoom(column.text());
                    }
                }

                // Fall back to a generic type when no remark column set one.
                if (substitution.getType() == null) {
                    substitution.setType("Vertretung");
                    substitution.setColor(colorProvider.getColor("Vertretung"));
                }

                day.addSubstitution(substitution);
            }
        }

        if (svp.select(".LehrerVerplant").size() > 0) {
            day.addMessage("<b>Verplante Lehrer:</b> " + svp.select(".LehrerVerplant").text());
        }
        if (svp.select(".Abwesenheiten").size() > 0) {
            day.addMessage("<b>Abwesenheiten:</b> " + svp.select(".Abwesenheiten").text());
        }

        if (svp.select("h2:contains(Mitteilungen)").size() > 0) {
            // Messages are the run of <p> siblings directly after the heading; each
            // paragraph is split into separate messages at double line breaks.
            Element h2 = svp.select("h2:contains(Mitteilungen)").first();
            Element sibling = h2.nextElementSibling();
            while (sibling != null && sibling.tagName().equals("p")) {
                for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText()
                        .split("<br />\\s*<br />")) {
                    if (hasData(nachricht)) {
                        day.addMessage(nachricht);
                    }
                }
                sibling = sibling.nextElementSibling();
            }
        } else if (svp.select(".Mitteilungen").size() > 0) {
            for (Element p : svp.select(".Mitteilungen")) {
                for (String nachricht : TextNode.createFromEncoded(p.html(), null).getWholeText()
                        .split("<br />\\s*<br />")) {
                    if (hasData(nachricht)) {
                        day.addMessage(nachricht);
                    }
                }
            }
        }

        v.addDay(day);
    } else {
        throw new IOException("keine SVPlan-Tabelle gefunden");
    }
}
From source file:com.mythesis.userbehaviouranalysis.WebParser.java
/** * Parse the url and get all the content * @param link the url to parse/*from w ww.j av a 2 s . co m*/ * @return The content parsed */ private String cleanhtml(String link) { try { Document doc = Jsoup.connect(link).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) { link = link.substring(0, link.length() - 1); } if (link.substring(0, 5).equalsIgnoreCase("https")) { link = link.substring(8); } else if (link.substring(0, 4).equalsIgnoreCase("http")) { link = link.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element el : links) { String str_check = el.attr("abs:href"); if (el.attr("abs:href").contains(link) && el.text().length() > 1) { anchortext = anchortext + el.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").contains(link)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } if (medi.getElementsByTag("img").attr("src").startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:me.vertretungsplan.parser.SVPlanParser.java
/**
 * Determines the date of a schedule day and, when present, its last-change timestamp, and
 * stores both on {@code day}.
 *
 * <p>The date text comes from the date/title element inside {@code svp} if there is one,
 * otherwise from a document title of the form "Vertretungsplan f&uuml;r &lt;date&gt;". If neither
 * is available the date string is "Unbekanntes Datum".
 *
 * @param svp element containing the day's plan
 * @param doc the whole document, used for its title
 * @param day day object to fill in
 */
private void setDate(Element svp, Document doc, SubstitutionScheduleDay day) {
    // Previously spelled "Vertretungsplan fr " (umlaut lost): the title branch could never
    // match, and the regex below left a stray "f\u00fcr" in the date string.
    final String titlePrefix = "Vertretungsplan f\u00fcr ";

    String date = "Unbekanntes Datum";
    if (svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() > 0) {
        date = svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").text()
                .replaceAll("Vertretungsplan (f\u00fcr )?", "").trim();
    } else if (doc.title().startsWith(titlePrefix)) {
        date = doc.title().substring(titlePrefix.length());
    }
    // Collapse runs of whitespace before parsing.
    date = date.replaceAll("\\s+", " ");
    day.setDateString(date);
    day.setDate(ParserUtils.parseDate(date));

    if (svp.select(".svp-uploaddatum, .Stand").size() > 0) {
        String lastChange = svp.select(".svp-uploaddatum, .Stand").text().replace("Aktualisierung: ", "")
                .replace("Stand: ", "");
        day.setLastChangeString(lastChange);
        day.setLastChange(ParserUtils.parseDateTime(lastChange));
    }
}
From source file:org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java
/** * Fetch the web page from the URL, parse the HTML to populate the metadata * required by WebPageSnapshotModel, and return the constructed * WebPageSnapshotModel./*from w ww . ja va 2 s . co m*/ * * @param url The URL to fetch the web page from * @return The WebPageSnapshotModel * @throws IOException Thrown if there's an issue fetching the web page. */ private WebPageSnapshotModel fetchWebPage(String url, String contentKey) throws IOException { long fetchTime = System.currentTimeMillis(); Connection connection = Jsoup.connect(url); Response response = connection.execute(); long postFetchTime = System.currentTimeMillis(); int timeToFetch = (int) (postFetchTime - fetchTime); Document doc = response.parse(); String destinationUrl = response.url().toString(); String title = doc.title(); String description = getDescriptionFromDocument(doc); List<String> keywords = getKeywordsFromDocument(doc); List<String> outlinks = getOutlinksFromDocument(doc); return WebPageSnapshotModel.newBuilder().setUrl(destinationUrl) .setFetchedAtRevTs(Long.MAX_VALUE - fetchTime).setSize(doc.html().length()).setFetchedAt(fetchTime) .setFetchTimeMs(timeToFetch).setTitle(title).setDescription(description).setKeywords(keywords) .setOutlinks(outlinks).setContentKey(contentKey).setContent(ImmutableMap.of(contentKey, doc.html())) .build(); }