Example usage for org.jsoup.nodes.Document.title()

Introduction

This page collects example usages of the org.jsoup.nodes.Document.title() method.

Prototype

public String title()

Source Link

Document

Get the string contents of the document's title element.

Usage

From source file:com.techcavern.wavetact.ircCommands.utils.Title.java

/**
 * Fetches the page whose URL is given as the first command argument and sends its
 * HTML title back through {@code IRCUtils.sendMessage}.
 *
 * @param args command arguments; {@code args[0]} is used as the URL to fetch
 * @throws Exception if the page cannot be fetched or the message cannot be sent
 */
@Override
public void onCommand(User user, PircBotX network, String prefix, Channel channel, boolean isPrivate,
        int userPermLevel, String... args) throws Exception {
    // The request is sent with a desktop-browser User-Agent string.
    final String browserUserAgent =
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17";
    final String pageTitle = Jsoup.connect(args[0]).userAgent(browserUserAgent).get().title();
    IRCUtils.sendMessage(user, network, channel, pageTitle, prefix);
}

From source file:me.vertretungsplan.parser.DSBLightParser.java

/**
 * Downloads one schedule page, hands it to {@code parseMultipleMonitorDays} when it looks
 * like an Untis page, and then follows a {@code meta[http-equiv=refresh]} redirect
 * recursively until the redirect points back to {@code startUrl}.
 *
 * @param url      page to download
 * @param referer  passed through to {@code httpGet}
 * @param schedule schedule passed to {@code parseMultipleMonitorDays}
 * @param startUrl URL at which the redirect chain is considered closed
 */
private void parseDay(String url, Map<String, String> referer, SubstitutionSchedule schedule, String startUrl)
        throws IOException, JSONException, CredentialInvalidException {
    final Document page = Jsoup.parse(httpGet(url, data.optString(PARAM_ENCODING, null), referer));

    final boolean untisPage = page.title().toLowerCase().contains("untis")
            || page.html().toLowerCase().contains("untis")
            || page.select(".mon_list").size() > 0;
    if (!untisPage) {
        return;
    }

    parseMultipleMonitorDays(schedule, page, data);

    // first() yields null when the selection is empty, i.e. when there is no refresh tag.
    final Element refreshTag = page.select("meta[http-equiv=refresh]").first();
    if (refreshTag == null) {
        return;
    }

    // The redirect target is resolved relative to the directory of the current URL.
    final String refreshContent = refreshTag.attr("content").toLowerCase();
    final String baseDirectory = url.substring(0, url.lastIndexOf("/") + 1);
    final String redirectUrl = baseDirectory + refreshContent.substring(refreshContent.indexOf("url=") + 4);

    if (!redirectUrl.equals(startUrl)) {
        parseDay(redirectUrl, referer, schedule, startUrl);
    }
}

From source file:com.illustrationfinder.process.post.HtmlPostProcessor.java

/**
 * Derives keywords for the page at {@code this.url}.
 *
 * <p>The page title is normalized (lower-cased, punctuation and short words removed,
 * spacing collapsed) and split into words. If that yields at least
 * {@code MINIMUM_KEYWORDS_COUNT} keywords they are returned. Otherwise the article text
 * extracted by boilerpipe is normalized the same way and its most frequent words are
 * appended until the minimum is reached or no candidate words remain.
 *
 * @return the keywords, or {@code null} when {@code this.url} is {@code null} or the page
 *         could not be read or processed
 */
@Override
public List<String> generateKeywords() {
    // TODO If two words are always close to each other, they should be considered as an expression and managed like one word
    if (this.url == null) {
        return null;
    }

    try (final InputStream stream = this.url.openStream()) {
        // NOTE(review): this IOUtils.toString overload uses the platform default charset.
        final String rawText = IOUtils.toString(stream);
        final Document document = Jsoup.parse(rawText);

        // First attempt: keywords taken from the title only.
        final List<String> keywords = new ArrayList<>();
        for (final String candidate : normalizeForKeywords(document.title()).split(" ")) {
            if (candidate.length() >= MINIMUM_WORD_LENGTH) {
                keywords.add(candidate);
            }
        }
        if (keywords.size() >= MINIMUM_KEYWORDS_COUNT) {
            return keywords;
        }

        // Not enough keywords: fall back to the most frequent words of the article text.
        // The extraction is only run here, so an extraction failure can no longer discard
        // a title that already provided enough keywords.
        final String content = normalizeForKeywords(ArticleExtractor.getInstance().getText(rawText));

        // Count word frequencies
        final Map<String, Integer> frequencies = new HashMap<>();
        for (final String word : content.split(" ")) {
            final Integer seen = frequencies.get(word);
            frequencies.put(word, seen == null ? 1 : seen + 1);
        }

        // Group the words by frequency; the highest frequency is the last key
        final SortedMap<Integer, HashSet<String>> wordsByFrequency = new TreeMap<>();
        for (final Map.Entry<String, Integer> entry : frequencies.entrySet()) {
            HashSet<String> bucket = wordsByFrequency.get(entry.getValue());
            if (bucket == null) {
                bucket = new HashSet<>();
                wordsByFrequency.put(entry.getValue(), bucket);
            }
            bucket.add(entry.getKey());
        }

        // Add the most frequent words until we reach the minimum keywords count.
        // Stop when the candidates are exhausted: lastKey() throws NoSuchElementException
        // on an empty map, which used to happen for pages with too few distinct words.
        while (keywords.size() < MINIMUM_KEYWORDS_COUNT && !wordsByFrequency.isEmpty()) {
            final Integer highestFrequency = wordsByFrequency.lastKey();
            final HashSet<String> bucket = wordsByFrequency.get(highestFrequency);
            final String keyword = bucket.iterator().next();

            bucket.remove(keyword);
            if (bucket.isEmpty()) {
                wordsByFrequency.remove(highestFrequency);
            }

            // NOTE(review): the title path accepts length >= MINIMUM_WORD_LENGTH while this
            // path requires a strictly greater length; kept as-is, confirm which is intended.
            if (keyword.length() > MINIMUM_WORD_LENGTH) {
                keywords.add(keyword);
            }
        }

        return keywords;
    } catch (BoilerpipeProcessingException | IOException e) {
        // TODO
        e.printStackTrace();
    }

    return null;
}

/**
 * Lower-cases the text, then removes punctuation and short words and collapses
 * excessive spacing using the class's regex constants.
 */
private String normalizeForKeywords(String text) {
    String normalized = text.toLowerCase();
    normalized = normalized.replaceAll(PUNCTUATION_REGEX, "");
    normalized = normalized.replaceAll(WORD_WITH_LESS_THAN_4_CHARACTERS_REGEX, "");
    normalized = normalized.replaceAll(EXCESSIVE_SPACING_REGEX, " ");
    return normalized;
}

From source file:me.vertretungsplan.parser.DSBMobileParser.java

/**
 * Downloads the page at {@code url}, dispatches it to the parser matching its detected
 * (or configured) format, and then follows a {@code meta[http-equiv=refresh]} redirect
 * to any URL that has not been visited yet.
 *
 * @param v        schedule that receives the parsed data
 * @param url      page to download
 * @param usedUrls URLs already visited; {@code url} is added to it before downloading
 * @throws IOException                   e.g. when the DaVinci heading and table counts differ
 * @throws IncompatibleScheduleException when no known format is recognized
 */
private void loadScheduleFromUrl(SubstitutionSchedule v, String url, List<String> usedUrls)
        throws IOException, JSONException, CredentialInvalidException, IncompatibleScheduleException {
    usedUrls.add(url);

    final String encoding = data.has(PARAM_ENCODING) ? data.optString(PARAM_ENCODING, null) : "UTF-8";
    final Document doc = Jsoup.parse(httpGet(url, encoding));

    final String configuredType = data.optString(PARAM_TYPE, "");
    final String lowerCaseHtml = doc.html().toLowerCase();

    if (doc.title().toLowerCase().contains("untis") || lowerCaseHtml.contains("untis")
            || configuredType.equals("untis")) {
        parseMultipleMonitorDays(v, doc, data);
    } else if (lowerCaseHtml.contains("created by davinci") || configuredType.equals("davinci")) {
        // Each h2 heading is paired with the table at the same index.
        final Elements headings = doc.select("h2");
        final Elements dayTables = doc.select("h2 + p + table");
        if (headings.size() != dayTables.size()) {
            throw new IOException("Anzahl berschriften != Anzahl Tabellen");
        }
        for (int index = 0; index < headings.size(); index++) {
            final String dateText = headings.get(index).text();
            final SubstitutionScheduleDay day = new SubstitutionScheduleDay();
            day.setDateString(dateText);
            day.setDate(ParserUtils.parseDate(dateText));
            DaVinciParser.parseDaVinciTable(dayTables.get(index), v, day, colorProvider);
            v.addDay(day);
        }
    } else if (doc.select(".tdaktionen").size() > 0 || configuredType.equals("indiware")) {
        new IndiwareParser(scheduleData, cookieProvider).parseIndiwarePage(v, doc.html());
    } else if (doc.text().matches(".*Fr diesen Bereich.*wurde kein Inhalt bereitgestellt\\.")) {
        // Page matches the "no content" pattern: nothing to parse, and no redirect is followed.
        return;
    } else {
        throw new IncompatibleScheduleException();
    }

    // first() yields null when the page carries no refresh tag.
    final Element refreshTag = doc.select("meta[http-equiv=refresh]").first();
    if (refreshTag != null) {
        final String refreshContent = refreshTag.attr("content").toLowerCase();
        final String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1)
                + refreshContent.substring(refreshContent.indexOf("url=") + 4);
        if (!usedUrls.contains(redirectUrl)) {
            loadScheduleFromUrl(v, redirectUrl, usedUrls);
        }
    }
}

From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java

/**
 * ?page??/*  www  . jav  a 2 s.c o m*/
 */
@Override
public void visit(Page page) {
    try {
        String url = page.getWebURL().getURL();

        page.setContentType("text/html; charset=" + gather.getEncoding());
        Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get();

        String title = doc.title();
        if (gather.getTitleExternal() && gather.getTitleRegex() != null
                && gather.getTitleRegex().length() > 0) {
            Elements titleEles = doc.select(gather.getTitleRegex());
            if (!titleEles.isEmpty()) {
                String tempTitle = titleEles.text();
                if (tempTitle != null && tempTitle.length() > 0) {
                    title = tempTitle;
                }
            }
        }

        if (title != null && title.trim().length() > 0) {
            Elements elements = doc.select(matchRegex);
            if (filterRegex != null && filterRegex.trim().length() > 0) {
                elements = elements.not(filterRegex);
            }
            if (!elements.isEmpty()) {
                String subHtml = elements.html();
                Document blockDoc = Jsoup.parse(subHtml);
                String contentText = blockDoc.html();

                if (gather.getRemoveHref()) {
                    Document moveDoc = Jsoup.parse(contentText);
                    Elements moveEles = moveDoc.select("*").not("a");
                    contentText = moveEles.html();
                }
                if (gather.getRemoveHtmlTag())
                    contentText = doc.text();

                if (isLocal) {
                    contentText = doc.text();

                    Boolean isMatcher = true;
                    for (int i = 0; i < keys.length; i++) {
                        Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find();
                        if (!result) {
                            isMatcher = false;
                            break;
                        }
                    }

                    if (isMatcher) {
                        Storage storage = new Storage();
                        storage.setGatherId(gather.getId());
                        storage.setGatherName(gather.getName());
                        storage.setTitle(title);
                        storage.setUrl(url);
                        try {
                            gatherService.addStorage(storage);
                        } catch (Exception e) {
                            logger.error("save storage error : {}", e.getLocalizedMessage());
                        } finally {
                            storage = null;
                        }
                    }
                } else {
                    Content content = new Content();
                    content.setDetail(contentText);
                    content.setPage(1);
                    List<Content> contents = new ArrayList<Content>();
                    contents.add(content);

                    Article article = new Article();
                    article.setTitle(title);
                    article.setContents(contents);

                    articleMainService.addArticleMainByCrawler(article, gather.getChannelId(),
                            CrawlerUtil.USER_NAME);
                }
            }
        }
    } catch (IOException e) {
        logger.warn(e.getLocalizedMessage());
    }
}

From source file:com.josue.lottery.eap.service.core.LotoImporter.java

/**
 * Reads {@code file} as HTML, logs the text of every {@code td} cell of the first table
 * and prints the document title to standard output.
 *
 * <p>If the file cannot be opened or read, the error is logged and nothing is parsed.
 *
 * @param file the HTML file to read
 */
private void parseHtml(File file) {
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes the reader on every path; the original never closed it
    // and dereferenced a null reader after a FileNotFoundException.
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = br.readLine()) != null) {
            // readLine() strips the terminator; re-add it so text on adjacent lines is not fused.
            sb.append(line).append('\n');
        }
    } catch (IOException ex) {
        // FileNotFoundException is an IOException, so both failure modes are handled here.
        java.util.logging.Logger.getLogger(LotoImporter.class.getName()).log(Level.SEVERE, null, ex);
        return;
    }

    Document doc = Jsoup.parse(sb.toString());

    // first() returns null when the document has no table.
    Element table = doc.select("table").first();
    if (table != null) {
        for (Element cell : table.select("td")) {
            logger.info("text : " + cell.text());
        }
    }

    String title = doc.title();
    System.out.println("Document title : " + title);
}

From source file:me.vertretungsplan.parser.SVPlanParser.java

/**
 * Parses one day of an SVPlan page from {@code svp} and adds it to {@code v}.
 *
 * <p>The element is accepted when it contains one of the date/title marker classes or when
 * the document title starts with the expected prefix. Substitution rows are then read from
 * the plan table, followed by the "LehrerVerplant", "Abwesenheiten" and "Mitteilungen"
 * sections, which are added as day messages.
 *
 * @param v   schedule that receives the parsed day
 * @param svp element containing the plan for one day
 * @param doc the whole document; used for its title and for the header-class check
 * @throws IOException if neither the marker classes nor the title prefix are found
 */
private void parseSvPlanDay(SubstitutionSchedule v, Element svp, Document doc) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();
    // NOTE(review): "Vertretungsplan fr " looks like it lost an umlaut; setDate uses the
    // same literal, so the two must be changed together if this is corrected.
    if ((svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() > 0
            || doc.title().startsWith("Vertretungsplan fr "))) {
        setDate(svp, doc, day);
        if (svp.select(".svp-tabelle, table:has(.Klasse)").size() > 0) {

            Elements rows = svp.select(".svp-tabelle tr, table:has(.Klasse) tr");
            // Remembered from the previous row so that empty lesson/class cells can inherit them.
            String lastLesson = "";
            String lastClass = "";
            for (Element row : rows) {
                // Skip header rows and rows without any text.
                if ((doc.select(".svp-header").size() > 0 && row.hasClass("svp-header"))
                        || row.select("th").size() > 0 || row.text().trim().equals("")) {
                    continue;
                }

                Substitution substitution = new Substitution();

                // The CSS class name of each cell decides which substitution field it fills.
                for (Element column : row.select("td")) {
                    String type = column.className();
                    if (!hasData(column.text())) {
                        // Empty lesson/class cell: reuse the value from the previous row, if any.
                        if ((type.startsWith("svp-stunde") || type.startsWith("Stunde"))
                                && hasData(lastLesson)) {
                            substitution.setLesson(lastLesson);
                        } else if ((type.startsWith("svp-klasse") || type.startsWith("Klasse"))
                                && hasData(lastClass)) {
                            substitution.getClasses().addAll(Arrays
                                    .asList(lastClass.split(data.optString(PARAM_CLASS_SEPARATOR, ", "))));
                        }
                        continue;
                    }
                    if (type.startsWith("svp-stunde") || type.startsWith("Stunde")) {
                        substitution.setLesson(column.text());
                        lastLesson = column.text();
                    } else if (type.startsWith("svp-klasse") || type.startsWith("Klasse")) {
                        substitution.getClasses().addAll(Arrays
                                .asList(column.text().split(data.optString(PARAM_CLASS_SEPARATOR, ", "))));
                        lastClass = column.text();
                    } else if (type.startsWith("svp-esfehlt") || type.startsWith("Lehrer")) {
                        if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                            substitution.setPreviousTeacher(column.text());
                        }
                    } else if (type.startsWith("svp-esvertritt") || type.startsWith("Vertretung")) {
                        if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                            // Strip a trailing " +" from the teacher text.
                            substitution.setTeacher(column.text().replaceAll(" \\+$", ""));
                        }
                    } else if (type.startsWith("svp-fach") || type.startsWith("Fach")) {
                        substitution.setSubject(column.text());
                    } else if (type.startsWith("svp-bemerkung") || type.startsWith("Anmerkung")) {
                        // The remark is both the description and the input for type/color detection.
                        substitution.setDesc(column.text());
                        String recognizedType = recognizeType(column.text());
                        substitution.setType(recognizedType);
                        substitution.setColor(colorProvider.getColor(recognizedType));
                    } else if (type.startsWith("svp-raum") || type.startsWith("Raum")) {
                        substitution.setRoom(column.text());
                    }
                }

                // Default type when the row had no remark column that set one.
                if (substitution.getType() == null) {
                    substitution.setType("Vertretung");
                    substitution.setColor(colorProvider.getColor("Vertretung"));
                }

                day.addSubstitution(substitution);
            }
        }
        if (svp.select(".LehrerVerplant").size() > 0) {
            day.addMessage("<b>Verplante Lehrer:</b> " + svp.select(".LehrerVerplant").text());
        }
        if (svp.select(".Abwesenheiten").size() > 0) {
            day.addMessage("<b>Abwesenheiten:</b> " + svp.select(".Abwesenheiten").text());
        }

        // Messages: either the <p> siblings following an h2 containing "Mitteilungen", or
        // elements with class "Mitteilungen". Each is split on a double "<br />".
        if (svp.select("h2:contains(Mitteilungen)").size() > 0) {
            Element h2 = svp.select("h2:contains(Mitteilungen)").first();
            Element sibling = h2.nextElementSibling();
            while (sibling != null && sibling.tagName().equals("p")) {
                for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText()
                        .split("<br />\\s*<br />")) {
                    if (hasData(nachricht))
                        day.addMessage(nachricht);
                }
                sibling = sibling.nextElementSibling();
            }
        } else if (svp.select(".Mitteilungen").size() > 0) {
            for (Element p : svp.select(".Mitteilungen")) {
                for (String nachricht : TextNode.createFromEncoded(p.html(), null).getWholeText()
                        .split("<br />\\s*<br />")) {
                    if (hasData(nachricht))
                        day.addMessage(nachricht);
                }
            }
        }
        v.addDay(day);
    } else {
        throw new IOException("keine SVPlan-Tabelle gefunden");
    }
}

From source file:com.mythesis.userbehaviouranalysis.WebParser.java

/**
 * Parse the url and get all the content
 * @param link the url to parse/*from  w ww.j av a 2  s . co  m*/
 * @return The content parsed
 */
private String cleanhtml(String link) {
    try {
        Document doc = Jsoup.connect(link).timeout(10 * 1000).get();
        String title = doc.title();
        String mainbody = doc.body().text();
        Elements links = doc.select("a[href]");
        Elements media = doc.select("[src]");
        //fix link html to remove https:// or http:// and simple /
        if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) {
            link = link.substring(0, link.length() - 1);
        }
        if (link.substring(0, 5).equalsIgnoreCase("https")) {
            link = link.substring(8);
        } else if (link.substring(0, 4).equalsIgnoreCase("http")) {
            link = link.substring(7);
        }
        String anchortext = "";
        String alttext = "";
        //-----get the anchor text of internal links
        for (Element el : links) {
            String str_check = el.attr("abs:href");
            if (el.attr("abs:href").contains(link) && el.text().length() > 1) {
                anchortext = anchortext + el.text() + " ";
            }
        }
        //-------get alt text to internal images links
        for (Element medi : media) {
            if (medi.getElementsByTag("img").attr("src").contains(link)) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt");
            }
            if (medi.getElementsByTag("img").attr("src").startsWith("/")) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt");
            }
        }
        String content = mainbody + title + anchortext + alttext;

        return content;

    } catch (IOException ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (NullPointerException ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (Exception ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    }

}

From source file:me.vertretungsplan.parser.SVPlanParser.java

/**
 * Fills in the date and, when available, the last-change timestamp of {@code day}.
 *
 * <p>The date text comes from the date/title marker elements inside {@code svp}; failing
 * that, from the document title after its fixed prefix; failing that, a placeholder text
 * is used.
 *
 * @param svp element containing the plan for one day
 * @param doc document whose title is the fallback source of the date
 * @param day day object to populate
 */
private void setDate(Element svp, Document doc, SubstitutionScheduleDay day) {
    final String dateSelector = ".svp-plandatum-heute, .svp-plandatum-morgen, .Titel";
    final String lastChangeSelector = ".svp-uploaddatum, .Stand";
    // NOTE(review): this prefix looks like it lost an umlaut; parseSvPlanDay checks the same
    // literal, so the two must be changed together if this is corrected.
    final String titlePrefix = "Vertretungsplan fr ";

    final String rawDate;
    if (!svp.select(dateSelector).isEmpty()) {
        rawDate = svp.select(dateSelector).text().replaceAll("Vertretungsplan (fr )?", "").trim();
    } else if (doc.title().startsWith(titlePrefix)) {
        rawDate = doc.title().substring(titlePrefix.length());
    } else {
        rawDate = "Unbekanntes Datum";
    }

    // Collapse any run of whitespace into a single space before parsing.
    final String date = rawDate.replaceAll("\\s+", " ");
    day.setDateString(date);
    day.setDate(ParserUtils.parseDate(date));

    if (svp.select(lastChangeSelector).isEmpty()) {
        return;
    }
    final String lastChange = svp.select(lastChangeSelector).text()
            .replace("Aktualisierung: ", "")
            .replace("Stand: ", "");
    day.setLastChangeString(lastChange);
    day.setLastChange(ParserUtils.parseDateTime(lastChange));
}

From source file:org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java

/**
 * Fetch the web page from the URL, parse the HTML to populate the metadata
 * required by WebPageSnapshotModel, and return the constructed
 * WebPageSnapshotModel./*from   w  ww . ja va  2  s . co  m*/
 *
 * @param url The URL to fetch the web page from
 * @return The WebPageSnapshotModel
 * @throws IOException Thrown if there's an issue fetching the web page.
 */
private WebPageSnapshotModel fetchWebPage(String url, String contentKey) throws IOException {
    long fetchTime = System.currentTimeMillis();
    Connection connection = Jsoup.connect(url);
    Response response = connection.execute();
    long postFetchTime = System.currentTimeMillis();
    int timeToFetch = (int) (postFetchTime - fetchTime);

    Document doc = response.parse();
    String destinationUrl = response.url().toString();
    String title = doc.title();
    String description = getDescriptionFromDocument(doc);
    List<String> keywords = getKeywordsFromDocument(doc);
    List<String> outlinks = getOutlinksFromDocument(doc);

    return WebPageSnapshotModel.newBuilder().setUrl(destinationUrl)
            .setFetchedAtRevTs(Long.MAX_VALUE - fetchTime).setSize(doc.html().length()).setFetchedAt(fetchTime)
            .setFetchTimeMs(timeToFetch).setTitle(title).setDescription(description).setKeywords(keywords)
            .setOutlinks(outlinks).setContentKey(contentKey).setContent(ImmutableMap.of(contentKey, doc.html()))
            .build();
}