Example usage for org.jsoup.nodes Element tagName

List of usage examples for org.jsoup.nodes Element tagName

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element method tagName().

Prototype

public String tagName() 

Source Link

Document

Get the name of the tag for this element.

Usage

From source file:com.kingfong.webcrawler.util.DOMContentUtils.java

/**
 * This method finds all anchors below the supplied DOM
 * <code>node</code>, and creates appropriate {@link Outlink}
 * records for each (relative to the supplied <code>base</code>
 * URL), and adds them to the <code>outlinks</code> {@link
 * ArrayList}./*from w w  w.  j  a  v  a 2  s .  co m*/
 *
 * <p>
 *
 * Links without inner structure (tags, text, etc) are discarded, as
 * are links which contain only single nested links and empty text
 * nodes (this is a common DOM-fixup artifact, at least with
 * nekohtml).
 */
public void getOutlinks(String html, URL url, HashSet<String> outlinks) {

    Document document = Jsoup.parse(html);
    Elements elements = document.getAllElements();
    for (Element currentNode : elements) {
        String nodeName = currentNode.tagName();
        // short nodeType = currentNode.;
        Elements children = currentNode.children();
        nodeName = nodeName.toLowerCase();
        LinkParams params = linkParams.get(nodeName);
        if (params != null) {
            // if (!shouldThrowAwayLink(currentNode, children, childLen,
            // params)) {

            // StringBuilder linkText = new StringBuilder();
            // getText(linkText, currentNode, true);

            Attributes attrs = currentNode.attributes();
            String target = null;
            boolean noFollow = false;
            boolean post = false;
            Iterator<Attribute> iterator = attrs.iterator();
            while (iterator.hasNext()) {
                Attribute attr = iterator.next();
                String attrName = attr.getKey();
                if (params.attrName.equalsIgnoreCase(attrName)) {
                    target = attr.getValue();
                } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getValue())) {
                    noFollow = true;
                } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getValue())) {
                    post = true;
                }
            }
            if (StringUtils.startsWith(target, "/")) {
                target = url.getProtocol() + "://" + url.getHost() + target;
            }
            if (target != null && URLFilter.filt(target)) {
                outlinks.add(target);
            }
            // }
            // this should not have any children, skip them
            if (params.childLen == 0)
                continue;
        }
    }
}

From source file:by.heap.remark.convert.TextCleaner.java

/**
 * Walks the element tree below <code>el</code> and, for every <code>br</code>
 * element found, inserts a newline before it and removes the element.
 * Children that are not <code>br</code> elements are processed recursively.
 *
 * @param el the element whose children are processed
 */
private void fixLineBreaks(Element el) {
    for (final Element child : el.children()) {
        if (!child.tagName().equals("br")) {
            fixLineBreaks(child);
            continue;
        }
        child.before("\n");
        child.remove();
    }
}

From source file:by.heap.remark.convert.TextCleaner.java

/**
 * Tells whether a node is treated as a block: it must be an {@link Element}
 * that either reports itself as a block or is a <code>br</code> tag.
 *
 * @param n the node to test; may be <code>null</code>
 * @return <code>true</code> for block elements and <code>br</code>,
 *         <code>false</code> for everything else, including <code>null</code>
 */
private boolean isBlock(Node n) {
    // instanceof is false for null, so no separate null check is needed.
    if (!(n instanceof Element)) {
        return false;
    }
    Element element = (Element) n;
    return element.isBlock() || element.tagName().equals("br");
}

From source file:com.aquest.emailmarketing.web.controllers.BroadcastTemplateController.java

/**
 * Applies the requested tracking to a broadcast template's HTML, stores the
 * result, records the tracking configuration and prepares the model for the
 * embedded-image view.
 *
 * <p>
 * When <code>trackingFlg</code> is set, open tracking (GA and/or pixel) is
 * added as requested and click tracking is chosen by <code>trackingType</code>:
 * <code>"ga"</code>, <code>"intTrack"</code>, or both kinds for any other
 * value, including a missing one. The tracking configuration is saved only
 * when the value returned by saving the template equals the template's name.
 *
 * @param model the model that receives <code>imgList</code>,
 *              <code>embeddedImage</code> and <code>broadcastTemplate</code>
 * @param urls the UTM values passed to the tracking helpers and copied into
 *             the tracking configuration
 * @param principal the principal (not used by this method)
 * @param id the id of the broadcast template to load
 * @param trackingFlg whether any tracking is applied
 * @param openGAflg whether GA open tracking is added
 * @param openPixelFlg whether pixel open tracking is added
 * @param trackingType the click-tracking type; optional, may be <code>null</code>
 * @return the view name <code>bcasttempembeddedimage</code>
 */
@RequestMapping(value = "/bcastTempGenerateUrls", method = RequestMethod.POST)
public String addTracking(Model model, Urls urls, Principal principal, @RequestParam(value = "id") int id,
        @RequestParam(value = "trackingFlg", required = false) boolean trackingFlg,
        @RequestParam(value = "openGAflg", required = false) boolean openGAflg,
        @RequestParam(value = "openPixelFlg", required = false) boolean openPixelFlg,
        @RequestParam(value = "trackingType", required = false) String trackingType) {
    TrackingConfig trackingConfig = new TrackingConfig();
    BroadcastTemplate broadcastTemplate = broadcastTemplateService.getBroadcastTemplateById(id);
    String workingHtml = broadcastTemplate.getHtmlbody();
    if (trackingFlg) {
        if (openGAflg) {
            workingHtml = emailTracking.addGaOpenEmailTracking(workingHtml, urls);
            System.out.println("GA Open: " + workingHtml);
        }
        if (openPixelFlg) {
            workingHtml = emailTracking.addPixelOpenEmailTracking(workingHtml);
            System.out.println("Pixel Open: " + workingHtml);
        }
        // Constant-first comparison: trackingType is an optional request
        // parameter, so it may be null. A missing value is handled like any
        // other unrecognised value and gets both kinds of tracking.
        if ("ga".equals(trackingType)) {
            workingHtml = emailTracking.addGaTrackingToUrl(workingHtml, urls);
            System.out.println("GA Click added: " + workingHtml);
        } else if ("intTrack".equals(trackingType)) {
            workingHtml = emailTracking.addIntTrackingToUrl(workingHtml, urls);
            System.out.println("Internal Tracking: " + workingHtml);
        } else {
            workingHtml = emailTracking.addBothTrackingToUrl(workingHtml, urls);
        }
    }

    broadcastTemplate.setHtmlbody_tracking(workingHtml);
    System.out.println(broadcastTemplate.getHtmlbody_tracking());
    String confirm = broadcastTemplateService.SaveOrUpdate(broadcastTemplate);
    System.out.println(confirm);
    System.out.println(trackingFlg);
    System.out.println(openGAflg);
    System.out.println(openPixelFlg);
    System.out.println(trackingType);

    // Compare by value: reference equality on Strings only succeeds when both
    // sides happen to be the same object.
    String templateName = broadcastTemplate.getB_template_name();
    boolean savedUnderTemplateName = confirm == null ? templateName == null : confirm.equals(templateName);
    if (savedUnderTemplateName) {
        trackingConfig.setBcast_template_id(broadcastTemplate.getId());
        // The configuration stores the flags as 0/1 integers.
        trackingConfig.setTracking_flg(trackingFlg ? 1 : 0);
        trackingConfig.setOpen_ga_flg(openGAflg ? 1 : 0);
        trackingConfig.setOpen_pixel_flg(openPixelFlg ? 1 : 0);
        trackingConfig.setTracking_type(trackingType);
        // UTM values
        trackingConfig.setUtm_campaign(urls.getUtmCampaign());
        trackingConfig.setUtm_content(urls.getUtmContent());
        trackingConfig.setUtm_medium(urls.getUtmMedium());
        trackingConfig.setUtm_source(urls.getUtmSource());
        trackingConfigService.SaveOrUpdate(trackingConfig);
    }

    // Find images in the original (untracked) HTML so they can be embedded in
    // the e-mail as in-line attachments.
    EmbeddedImage embeddedImage = new EmbeddedImage();
    List<String> imgList = new ArrayList<String>();
    String html = broadcastTemplate.getHtmlbody();
    Document doc = Jsoup.parse(html);
    Elements media = doc.select("[src]");
    for (Element src : media) {
        if (src.tagName().equals("img")) {
            imgList.add(src.attr("abs:src"));
        }
    }
    model.addAttribute("imgList", imgList);
    model.addAttribute("embeddedImage", embeddedImage);
    model.addAttribute("broadcastTemplate", broadcastTemplate);
    return "bcasttempembeddedimage";
}

From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.DBpediaOntologyOld.java

/**
 * Reads one level of a nested HTML list into <code>category</code> and recurses
 * into its sub-lists.
 *
 * <p>
 * For each child of <code>e</code>: an <code>a</code> element with a non-empty
 * <code>href</code> supplies the category's label and URI and registers the
 * category in <code>map</code> under its label; a <code>ul</code> element
 * yields one new sub-category per <code>li</code> child, which is filled
 * recursively and then linked to <code>category</code> as its parent.
 *
 * @param e        the element whose children are examined
 * @param category the category being filled in
 * @param map      receives every labelled category, keyed by label
 */
private void traverseHierarchy(Element e, DBpediaCategory category, HashMap<String, DBpediaCategory> map) {
    for (Element child : e.children()) {
        String childTag = child.tag().getName();
        if (childTag.equals("ul")) {
            for (Element item : child.children()) {
                if (!item.tagName().equals("li")) {
                    continue;
                }
                DBpediaCategory subCategory = new DBpediaCategory();
                traverseHierarchy(item, subCategory, map);
                subCategory.parents = new HashSet<>();
                subCategory.parents.add(category);
                category.getSubClasses().add(subCategory);
            }
        } else if (childTag.equals("a")) {
            String href = child.attr("href");
            if (href == null || href.length() == 0) {
                continue;
            }
            String text = child.text();
            category.setLabel(text);
            category.setUri(CLASSES_BASE_URI + text);
            map.put(category.getLabel(), category);
            System.out.println(text + "\t" + CLASSES_BASE_URI + text);
        }
    }
}

From source file:mergedoc.core.APIDocument.java

/**
 * Extracts the documentation of each detail entry in a parsed API page and
 * stores it in <code>contextTable</code>, keyed by the entry's signature.
 *
 * <p>
 * For every matching list item the signature is read from the first
 * <code>pre</code> element, the description from the last <code>div</code>,
 * and the <code>dl</code>/<code>dt</code>/<code>dd</code> sections are turned
 * into parameter, return and throws entries of the {@link Comment}.
 *
 * @param className the class name passed on to the signature and link-formatting helpers
 * @param doc the parsed API HTML document
 */
private void parseMethodComment(String className, Document doc) {
    // Each innermost <li> of the details section is treated as one documented member.
    Elements elements = doc.select("body > div.contentContainer > div.details > ul > li > ul > li > ul > li");
    for (Element element : elements) {
        // Entries without a <pre> signature are skipped.
        Element sigElm = element.select("pre").first();
        if (sigElm == null) {
            continue;
        }
        String sigStr = sigElm.html();
        Signature sig = createSignature(className, sigStr);
        Comment comment = new Comment(sig);

        // Deprecation text: when there are exactly two divs, the first one is
        // taken as the deprecation notice.
        String depre = "";
        Elements divs = element.select("div");
        if (divs.size() == 2) {
            depre = divs.get(0).html();
        }
        // The last div is always used as the description body.
        if (divs.size() > 0) {
            String body = divs.last().html();
            body = formatLinkTag(className, body);
            comment.setDocumentBody(body);
        }

        Elements dtTags = element.select("dl dt");
        for (Element dtTag : dtTags) {
            String dtText = dtTag.text();
            // NOTE(review): this condition and the two below are identical
            // (contains(":")), so the return and throws branches can never
            // run. Presumably each literal originally held a distinct section
            // label in front of the colon that was lost to an encoding
            // problem - confirm against the upstream source.
            // Parameter section: consume the run of <dd> siblings following this <dt>.
            if (dtText.contains(":")) {
                Element dd = dtTag;
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    // NOTE(review): first() returns null when the <dd> has no
                    // <code> child, which would throw here - confirm the input
                    // always has one.
                    String name = dd.select("code").first().text();
                    // NOTE(review): same condition as the enclosing if, so the
                    // name is always wrapped in angle brackets; presumably
                    // meant only for a type-parameter section - confirm.
                    if (dtText.contains(":")) {
                        name = "<" + name + ">";
                    }
                    String items = dd.html();
                    // Captures "<code>name</code> - description" up to the next list tag or end of input.
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        comment.addParam(name, desc);
                    }
                }
                continue;
            }

            // Return section: the <dd> following the <dt> holds the return
            // description. Unreachable as written (see the note above).
            if (dtText.contains(":")) {
                Element dd = dtTag.nextElementSibling();
                String str = dd.html();
                str = formatLinkTag(className, str);
                comment.addReturn(str);
                continue;
            }

            // Throws section: one "<name> <description>" entry per following
            // <dd>. Unreachable as written (see the note above).
            if (dtText.contains(":")) {
                Element dd = dtTag;
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    String name = dd.select("code").first().text();
                    String items = dd.html();
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        String param = name + " " + desc;
                        comment.addThrows(param);
                    }
                }
                continue;
            }

        }
        // Process the deprecation notice collected above.
        parseDeprecatedTag(className, depre, comment);

        // Process the tags handled by the shared helper.
        parseCommonTag(className, element, comment);

        contextTable.put(sig, comment);
    }
}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * Picks the element with the highest score as the page's main content.
 *
 * <p>
 * The document is cleaned and per-element information is computed first.
 * Every element recorded in <code>infoMap</code> is then scored, except
 * <code>a</code> and <code>p</code> elements and the document body itself.
 * Only scores strictly greater than zero can win.
 *
 * @return the best-scoring element
 * @throws Exception if no element scores above zero
 */
private Element getContentElement() throws Exception {
    clean();
    computeInfo(doc.body());

    Element best = null;
    double bestScore = 0;
    for (Element candidate : infoMap.keySet()) {
        String name = candidate.tagName();
        boolean excluded = name.equals("a") || name.equals("p") || candidate == doc.body();
        if (!excluded) {
            double score = computeScore(candidate);
            if (score > bestScore) {
                bestScore = score;
                best = candidate;
            }
        }
    }

    if (best != null) {
        return best;
    }
    throw new Exception("extraction failed");
}

From source file:me.vertretungsplan.parser.IndiwareParser.java

/**
 * Parses one day of an Indiware substitution schedule from its HTML or XML form.
 *
 * <p>
 * The day's date and last-change timestamp are read first, then the header
 * infos and footer lines are added as messages, and finally each action row is
 * converted into a {@link Substitution}. In HTML mode the meaning of a cell is
 * taken from the class name of the header at the same column index; in XML
 * mode it is the cell's tag name.
 *
 * @param doc  the root element of the day to parse
 * @param html <code>true</code> to read <code>doc</code> as HTML,
 *             <code>false</code> to read it as XML
 * @return the parsed day
 * @throws IOException if the title does not contain a date matching <code>datePattern</code>
 */
SubstitutionScheduleDay parseIndiwareDay(Element doc, boolean html) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    DataSource ds;
    if (html) {
        ds = new HTMLDataSource(doc);
    } else {
        ds = new XMLDataSource(doc);
    }

    Matcher matcher = datePattern.matcher(ds.titel().text());
    if (!matcher.find())
        throw new IOException("malformed date: " + ds.titel().text());
    String date = matcher.group();
    day.setDate(
            DateTimeFormat.forPattern("EEEE, dd. MMMM yyyy").withLocale(Locale.GERMAN).parseLocalDate(date));

    String lastChange = ds.datum().text();
    day.setLastChange(DateTimeFormat.forPattern("dd.MM.yyyy, HH:mm").withLocale(Locale.GERMAN)
            .parseLocalDateTime(lastChange));

    // Header infos: one message per entry, with the title in bold.
    if (ds.kopfinfos().size() > 0) {
        for (Element kopfinfo : ds.kopfinfos()) {
            String title = html ? kopfinfo.select("th").text() : kopfinfoTitle(kopfinfo.tagName()) + ":";

            StringBuilder message = new StringBuilder();
            if (title != null && !title.isEmpty()) {
                message.append("<b>").append(title).append("</b>").append(" ");
            }
            message.append(html ? kopfinfo.select("td").text() : kopfinfo.text());

            day.addMessage(message.toString());
        }
    }

    // Footer: all lines joined with newlines into a single message.
    if (ds.fuss() != null) {
        StringBuilder message = new StringBuilder();
        boolean first = true;
        for (Element fusszeile : ds.fusszeilen()) {
            if (first) {
                first = false;
            } else {
                message.append("\n");
            }
            message.append(fusszeile.text());
        }
        day.addMessage(message.toString());
    }

    // In HTML mode the column types come from the header cells' class names.
    List<String> columnTypes = null;
    if (html) {
        columnTypes = new ArrayList<>();
        for (Element th : ((HTMLDataSource) ds).headers()) {
            columnTypes.add(th.className().replace("thplan", "").replace("thlplan", ""));
        }
    }

    for (Element aktion : ds.aktionen()) {
        Substitution substitution = new Substitution();
        String type = "Vertretung";
        String course = null;
        int i = 0;
        for (Element info : aktion.children()) {
            String value = info.text().replace("\u00a0", "");
            // "---" marks an empty cell; it still occupies a column.
            if (value.equals("---")) {
                i++;
                continue;
            }
            final String columnType = html ? columnTypes.get(i) : info.tagName();
            switch (columnType) {
            case "klasse":
                Set<String> classes = new HashSet<>();
                for (String klasse : value.split(",")) {
                    Matcher courseMatcher = coursePattern.matcher(klasse);
                    if (courseMatcher.matches()) {
                        classes.add(courseMatcher.group(1));
                        course = courseMatcher.group(2);
                    } else {
                        classes.add(klasse);
                    }
                }
                substitution.setClasses(classes);
                break;
            case "stunde":
                substitution.setLesson(value);
                break;
            case "fach":
                // When a separate "vfach" column exists, "fach" holds the
                // previous subject and "vfach" the current one.
                String subject = subjectAndCourse(course, value);
                if (columnTypes != null && columnTypes.contains("vfach")) {
                    substitution.setPreviousSubject(subject);
                } else {
                    substitution.setSubject(subject);
                }
                break;
            case "vfach":
                substitution.setSubject(subjectAndCourse(course, value));
                // Without this break the subject text fell through into
                // "lehrer" and overwrote the teacher.
                break;
            case "lehrer":
                Matcher bracesMatcher = bracesPattern.matcher(value);
                if (bracesMatcher.matches())
                    value = bracesMatcher.group(1);
                substitution.setTeacher(value);
                break;
            case "raum":
                // Same scheme as "fach"/"vfach", applied to rooms.
                if (columnTypes != null && columnTypes.contains("vraum")) {
                    substitution.setPreviousRoom(value);
                } else {
                    substitution.setRoom(value);
                }
                break;
            case "vraum":
                substitution.setRoom(value);
                // Without this break the room text fell through into "info"
                // and was parsed as an info string.
                break;
            case "info":
                Matcher substitutionMatcher = substitutionPattern.matcher(value);
                Matcher cancelMatcher = cancelPattern.matcher(value);
                Matcher delayMatcher = delayPattern.matcher(value);
                Matcher selfMatcher = selfPattern.matcher(value);
                if (substitutionMatcher.matches()) {
                    substitution.setPreviousSubject(substitutionMatcher.group(1));
                    substitution.setPreviousTeacher(substitutionMatcher.group(2));
                    if (!substitutionMatcher.group(3).isEmpty()) {
                        substitution.setDesc(substitutionMatcher.group(3));
                    }
                } else if (cancelMatcher.matches()) {
                    type = "Entfall";
                    substitution.setPreviousSubject(cancelMatcher.group(1));
                    substitution.setPreviousTeacher(cancelMatcher.group(2));
                } else if (delayMatcher.matches()) {
                    type = "Verlegung";
                    substitution.setPreviousSubject(delayMatcher.group(1));
                    substitution.setPreviousTeacher(delayMatcher.group(2));
                    substitution.setDesc(delayMatcher.group(3));
                } else if (selfMatcher.matches()) {
                    type = "selbst.";
                    if (!selfMatcher.group(1).isEmpty())
                        substitution.setDesc(selfMatcher.group(1));
                // The umlaut is written as an escape so the literal survives
                // any source-encoding mix-up ("f\u00e4llt aus" = cancelled).
                } else if (value.equals("f\u00e4llt aus") || value.equals("Klausur") || value.equals("Aufg.")) {
                    type = value;
                } else {
                    substitution.setDesc(value);
                }
                break;
            }
            i++;
        }
        substitution.setType(type);
        substitution.setColor(colorProvider.getColor(substitution.getType()));
        // A course parsed from the class column stands in for a missing subject.
        if (course != null && substitution.getSubject() == null) {
            substitution.setSubject(course);
        }
        day.addSubstitution(substitution);
    }

    return day;
}

From source file:com.fluidops.iwb.provider.HTMLProvider.java

/**
 * Downloads the page at <code>config.url</code>, writes a plain-text dump of
 * its media, imports, links and table rows to <code>HTMLdata.txt</code>, and
 * adds statements derived from the table rows to <code>res</code>.
 *
 * <p>
 * Table rows are all <code>tr</code> elements below any <code>tbody</code>.
 * The first three and the last two rows are skipped. The row at index 3
 * supplies, from its first four cells, the type URI and the three predicate
 * URIs; every later row yields a type statement, a label and three literal
 * statements built from its first four cells.
 *
 * @param res the list that receives the generated statements
 * @throws Exception if the page cannot be fetched, the file cannot be
 *                   written, or a processed row has fewer than four cells
 */
@Override
public void gather(List<Statement> res) throws Exception {

    String url = config.url;
    Document doc = Jsoup.connect(url).get();
    Elements links = doc.select("a[href]");
    Elements media = doc.select("[src]");
    Elements imports = doc.select("link[href]");
    Elements article = doc.getElementsByTag("tbody").select("tr");
    Elements tableElem;
    URI nameURI = null;
    URI roadsURI = null;
    URI sideURI = null;
    URI totalURI = null;

    File file = new File("HTMLdata.txt");
    // try-with-resources: the writer is closed even when a later step throws.
    try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file)))) {

        out.println("Media");
        print("\nMedia: (%d)", media.size());
        for (Element el : media) {
            if (el.tagName().equals("img")) {
                print(" * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"),
                        el.attr("height"), trim(el.attr("alt"), 20));
                out.printf(" \n * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"),
                        el.attr("height"), trim(el.attr("alt"), 20));
                out.println();
            } else {
                print(" * %s: <%s>", el.tagName(), el.attr("abs:src"));
                out.printf(" \n * %s: <%s>", el.tagName(), el.attr("abs:src"));
                out.println();
            }

        }

        out.println("Imports");
        print("\nImports: (%d)", imports.size());
        for (Element link : imports) {
            print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
            out.printf(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
            out.println();
        }

        out.println("Links");
        print("\nLinks: (%d)", links.size());
        for (Element link : links) {
            print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
            out.printf(" * a: <%s> (%s)", link.attr("abs:href"), link.text());
            out.println();
        }

        out.println("Article");
        print("\nArticle: (%d)", article.size());

        // Rows 0-2 and the last two rows are skipped; row 3 is the header row.
        for (int i = 3; i < article.size() - 2; i++) {
            tableElem = article.get(i).select("td");
            out.println();

            if (i == 3) {
                // Header row: its cells become the type and predicate URIs.
                nameURI = ProviderUtils.objectToUri(tableElem.get(0).text());
                roadsURI = ProviderUtils.objectToUri(tableElem.get(1).text());
                sideURI = ProviderUtils.objectToUri(tableElem.get(2).text());
                totalURI = ProviderUtils.objectToUri(tableElem.get(3).text());

            } else {
                // Data row: the first cell identifies the subject.
                res.add(ProviderUtils.createStatement(ProviderUtils.objectToUri(tableElem.get(0).text()),
                        RDF.TYPE, nameURI));
                res.add(ProviderUtils.createLiteralStatement(
                        ProviderUtils.objectToUri(tableElem.get(0).text()), RDFS.LABEL,
                        tableElem.get(0).text()));
                res.add(ProviderUtils.createLiteralStatement(
                        ProviderUtils.objectToUri(tableElem.get(0).text()), roadsURI,
                        tableElem.get(1).text()));
                res.add(ProviderUtils.createLiteralStatement(
                        ProviderUtils.objectToUri(tableElem.get(0).text()), sideURI,
                        tableElem.get(2).text()));
                res.add(ProviderUtils.createLiteralStatement(
                        ProviderUtils.objectToUri(tableElem.get(0).text()), totalURI,
                        tableElem.get(3).text()));

                for (Element el : tableElem) {
                    out.printf("\n * (%s): (%s)", el.tagName(), el.text());
                    out.println();

                }
            }
            out.println();
            out.printf("\n * a (%s) (%d): (%s)", article.get(i).tagName(), tableElem.size(),
                    article.get(i).text());
            out.println();
        }
    }
}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * Determines the title of the page.
 *
 * <p>
 * Strategies are tried in this order:
 * <ol>
 * <li>If the document's meta title is non-empty, every <code>h1</code>-<code>h6</code>
 * heading is compared with it. The heading with the highest position-weighted
 * similarity is returned when its similarity is at least 0.3.</li>
 * <li>Otherwise the meta title itself may be returned, or else the heading
 * ranked first by Han-character length (7 to 26 preferred) and heading
 * level.</li>
 * <li>The first element whose id or class starts or ends with
 * <code>title</code>, if its text is 6 to 39 characters long.</li>
 * <li>The edit-distance based lookup on <code>contentElement</code>.</li>
 * </ol>
 *
 * @param contentElement the extracted content element, used by the last strategy
 * @return the title text
 * @throws Exception if no strategy yields a title
 */
protected String getTitle(final Element contentElement) throws Exception {
    final ArrayList<Element> titleList = new ArrayList<Element>();
    final ArrayList<Double> titleSim = new ArrayList<Double>();
    final String metaTitle = getText(doc.title().trim());
    if (!metaTitle.isEmpty()) {
        // Collect every heading together with its similarity to the meta title.
        doc.body().traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int i) {
                if (node instanceof Element) {
                    Element tag = (Element) node;
                    String tagName = tag.tagName();
                    if (Pattern.matches("h[1-6]", tagName)) {
                        String title = tag.text().trim();
                        double sim = strSim(title, metaTitle);
                        titleSim.add(sim);
                        titleList.add(tag);
                    }
                }
            }

            @Override
            public void tail(Node node, int i) {
            }
        });

        // Later headings are weighted higher: score = (position + 1) * similarity.
        double maxScore = 0;
        int maxIndex = -1;
        for (int i = 0; i < titleSim.size(); i++) {
            double score = (i + 1) * titleSim.get(i);
            if (score > maxScore) {
                maxScore = score;
                maxIndex = i;
            }
        }

        if (maxIndex == -1 || titleSim.get(maxIndex) < 0.3) {
            String title = getText(metaTitle);
            // NOTE(review): endsWith("") is always true, so this branch can
            // never return. The suffix literal looks like it was lost to an
            // encoding problem - confirm against the upstream source.
            if (!title.endsWith("") && title.length() > 7) {
                return title;
            }
            // Only rank headings when there are any; with none, fall through
            // to the remaining strategies instead of failing on get(0).
            if (!titleList.isEmpty()) {
                Collections.sort(titleList, new Comparator<Element>() {
                    @Override
                    public int compare(Element o1, Element o2) {
                        // A heading is "well sized" when it has 7 to 26 Han characters.
                        int hanLength1 = o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length();
                        int hanLength2 = o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length();
                        int len1 = (hanLength1 > 26 || hanLength1 < 7) ? 0 : 1;
                        int len2 = (hanLength2 > 26 || hanLength2 < 7) ? 0 : 1;
                        if (len1 == len2) {
                            // Same size class: the lower heading level (h1 before h2) wins.
                            return o1.tagName().charAt(1) - o2.tagName().charAt(1);
                        }
                        // Well-sized headings sort first.
                        return len2 - len1;
                    }
                });
                return getText(titleList.get(0).text());
            }
        } else {
            return titleList.get(maxIndex).text();
        }
    }

    // Fall back to elements whose id or class starts or ends with "title".
    Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
    if (titles.size() > 0) {
        String title = titles.first().text();
        if (title.length() > 5 && title.length() < 40) {
            return titles.first().text();
        }
    }
    try {
        return getTitleByEditDistance(contentElement);
    } catch (Exception ex) {
        // Keep the original failure as the cause so it is not lost.
        throw new Exception("title not found", ex);
    }

}