Example usage for org.jsoup.nodes Element absUrl

List of usage examples for org.jsoup.nodes Element absUrl

Introduction

In this page you can find the example usage for org.jsoup.nodes Element absUrl.

Prototype

public String absUrl(String attributeKey) 

Source Link

Document

Get an absolute URL from a URL attribute that may be relative (i.e. an `<a href>` or `<img src>`).

Usage

From source file:org.sbs.goodcrawler.plugin.extract.ExtractYouku.java

/**
 * ??/*  w ww  .  j  a va  2 s  .  c o m*/
 * @param url
 * @return
 */
public HashMap<String, Object> getInformation(Page page) {
    HashMap<String, Object> map = Maps.newHashMap();
    String url = page.getWebURL().getURL();
    try {
        ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>();
        epage.setUrl(page.getWebURL());
        Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                urlUtils.getBaseUrl(page.getWebURL().getURL()));
        // ???Url?Url
        Elements links = doc.getElementsByTag("a");
        if (!links.isEmpty()) {
            for (Element link : links) {
                String linkHref = link.absUrl("href");
                if (filterUrls(linkHref)) {
                    WebURL weburl = new WebURL();
                    weburl.setURL(linkHref);
                    weburl.setJobName(conf.jobName);
                    Submitor.submitUrl(weburl);
                }
            }
        }
        if (url.contains("/show_page/")) {

            String title = doc.select(".title .name").text();
            if (StringUtils.isBlank(title))
                return null;
            map.put("title", title);
            String category = doc.select(".title .type a").text();
            if (StringUtils.isBlank(category))
                return null;
            map.put("category", category);

            String _year = CharMatcher.DIGIT.retainFrom(doc.select(".title .pub").text());
            if (StringUtils.isNotBlank(_year)) {
                int year = Integer.parseInt(_year);
                map.put("year", year);
            }

            String score = CharMatcher.DIGIT.retainFrom(doc.select(".ratingstar .num").text());
            map.put("score", score);
            String alias = doc.select(".alias").text();

            if (alias.contains(":")) {
                map.put("translation", alias.split(":")[1]);
            }
            String img = doc.select(".thumb img").attr("src");
            if (StringUtils.isBlank(img))
                return null;
            map.put("thumbnail", img);
            String area = doc.select(".row2 .area a").text();
            if (StringUtils.isBlank(area))
                return null;
            map.put("area", area);
            String[] type = doc.select(".row2 .type a").text().split(" ");
            if (null == type || type.length == 0)
                return null;
            map.put("type", Sets.newHashSet(type));
            String director = doc.select(".row2 .director a").text();
            map.put("director", director);

            String _duration = CharMatcher.DIGIT.retainFrom(doc.select(".row2 .duration").text());
            if (StringUtils.isNotBlank(_duration)) {
                int duration = Integer.parseInt(_duration);
                map.put("duration", duration);
            }
            String _hot = CharMatcher.anyOf(",").removeFrom(doc.select(".row2 .vr .num").text());
            _hot = CharMatcher.DIGIT.retainFrom(_hot);
            if (StringUtils.isNotBlank(_hot)) {
                int hot = Integer.parseInt(_hot);
                map.put("hot", hot);
            }

            String sumary = doc.select(".detail .long").text();
            map.put("summary", sumary);
            // 
            Elements elements = doc.select(".baseaction a");
            HashMap<String, String> playList = Maps.newHashMap();
            for (Element element : elements) {
                String n = element.text();
                String urlString = element.attr("href");
                if (StringUtils.isBlank(urlString))
                    return null;
                Document d2 = Jsoup.parse(new URL(urlString), 10000);
                if (null != d2) {
                    String x = d2.select("#link2").attr("value");
                    if (StringUtils.isBlank(x))
                        return null;
                    playList.put(n, x);
                }
            }
            map.put("online", playList);
        } else if (url.contains("/v_show/")) {
            Document d3 = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                    urlUtils.getBaseUrl(page.getWebURL().getURL()));
            // ???Url?Url
            Elements links2 = d3.getElementsByTag("a");
            if (!links2.isEmpty()) {
                for (Element link : links2) {
                    String linkHref = link.absUrl("href");
                    if (filterUrls(linkHref)) {
                        WebURL weburl = new WebURL();
                        weburl.setURL(linkHref);
                        weburl.setJobName(conf.jobName);
                        try {
                            pendingUrls.addUrl(weburl);
                        } catch (QueueException e) {
                            log.error(e.getMessage());
                        }
                    }
                }
            }
            String p = d3.select("h1.title a").attr("href");
            if (StringUtils.isBlank(p))
                return null;
            return getInformation(p);
        }
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    if (map != null && map.size() > 4) {
        if (null == map.get("year")) {
            map.put("year", 1800);
        }
    }
    return map;
}

From source file:org.sbs.goodcrawler.plugin.extract.ExtractYouku.java

/**
 * Downloads a Youku {@code /show_page/} URL, queues the links found on it and scrapes
 * the show details (title, category, year, score, translation, thumbnail, area, type,
 * director, duration, hot, summary and the online play list).
 *
 * @param p the URL of the page to download
 * @return the extracted fields; {@code null} when {@code p} is not a {@code /show_page/}
 *         URL or a required field (title, category, thumbnail, area, type, a play link)
 *         is missing; a possibly partially filled map when any exception interrupts the
 *         extraction
 */
private HashMap<String, Object> getInformation(String p) {
    HashMap<String, Object> map = Maps.newHashMap();
    try {
        if (!p.contains("/show_page/")) {
            return null;
        }
        Document doc = Jsoup.parse(new URL(p), 15000);

        // Queue every outgoing link that passes the URL filter.
        for (Element link : doc.getElementsByTag("a")) {
            String linkHref = link.absUrl("href");
            if (filterUrls(linkHref)) {
                WebURL weburl = new WebURL();
                weburl.setURL(linkHref);
                weburl.setJobName(conf.jobName);
                try {
                    pendingUrls.addUrl(weburl);
                } catch (QueueException e) {
                    log.error(e.getMessage());
                }
            }
        }

        String title = doc.select(".title .name").text();
        if (StringUtils.isBlank(title)) {
            return null;
        }
        map.put("title", title);

        String category = doc.select(".title .type a").text();
        if (StringUtils.isBlank(category)) {
            return null;
        }
        map.put("category", category);

        String yearDigits = CharMatcher.DIGIT.retainFrom(doc.select(".title .pub").text());
        if (StringUtils.isNotBlank(yearDigits)) {
            map.put("year", Integer.parseInt(yearDigits));
        }

        // NOTE(review): keeping digits only also drops a decimal point, if the rating has one.
        String score = CharMatcher.DIGIT.retainFrom(doc.select(".ratingstar .num").text());
        map.put("score", score);

        // The translation is whatever follows the first colon; skip it when nothing does.
        String[] aliasParts = doc.select(".alias").text().split(":");
        if (aliasParts.length > 1) {
            map.put("translation", aliasParts[1]);
        }

        String img = doc.select(".thumb img").attr("src");
        if (StringUtils.isBlank(img)) {
            return null;
        }
        // NOTE(review): getInformation(Page) stores the thumbnail as a plain String and the
        // type as a Set; here both are Lists — confirm which shape consumers expect.
        map.put("thumbnail", Lists.newArrayList(img));

        String area = doc.select(".row2 .area a").text();
        if (StringUtils.isBlank(area)) {
            return null;
        }
        map.put("area", area);

        // Check the text before splitting: "".split(" ") yields one empty element,
        // so a length check on the array would never detect a missing type.
        String typeText = doc.select(".row2 .type a").text();
        if (StringUtils.isBlank(typeText)) {
            return null;
        }
        map.put("type", Lists.newArrayList(typeText.split(" ")));

        String director = doc.select(".row2 .director a").text();
        map.put("director", director);

        String durationDigits = CharMatcher.DIGIT.retainFrom(doc.select(".row2 .duration").text());
        if (StringUtils.isNotBlank(durationDigits)) {
            map.put("duration", Integer.parseInt(durationDigits));
        }

        // Keep digits only, so thousands separators and any surrounding label text cannot
        // make Integer.parseInt fail.
        String hotDigits = CharMatcher.DIGIT.retainFrom(doc.select(".row2 .vr .num").text());
        if (StringUtils.isNotBlank(hotDigits)) {
            map.put("hot", Integer.parseInt(hotDigits));
        }

        String summary = doc.select(".detail .long").text();
        map.put("summary", summary);

        // Follow each play link and read the value of its #link2 element.
        HashMap<String, String> playList = Maps.newHashMap();
        for (Element element : doc.select(".baseaction a")) {
            String label = element.text();
            String urlString = element.attr("href");
            if (StringUtils.isBlank(urlString)) {
                return null;
            }
            Document playPage = Jsoup.parse(new URL(urlString), 10000);
            String playUrl = playPage.select("#link2").attr("value");
            if (StringUtils.isBlank(playUrl)) {
                return null;
            }
            playList.put(label, playUrl);
        }
        map.put("online", playList);
    } catch (Exception e) {
        // Best effort: report the failure and return whatever was collected so far.
        log.error("Failed to extract show information from " + p, e);
        return map;
    }
    return map;
}

From source file:webcralwerproject1.Webcrawler.java

/**
 * Downloads {@code url}, collects the absolute URLs of its links into {@code this.links}
 * and, when at least one link was collected, writes the page content and a report entry.
 *
 * @param url  the page to visit
 * @param word the search word a link's {@code href} must contain to be collected;
 *             {@code null} or empty collects every link
 * @return {@code 0} when the response is not HTML or the search word was not found in
 *         any link, {@code -1} when the HTTP request failed, {@code 1} otherwise
 */
public int spider(String url, String word) {
    try {
        Connection connection = Jsoup.connect(url);
        // A timeout of 0 lets the request wait indefinitely. HTTP error responses are not
        // ignored by default, so they surface as an IOException handled below.
        Document htmlDocument = connection.timeout(0).get();
        this.htmlDocument = htmlDocument;
        int httpStatuscode = connection.response().statusCode();

        if (httpStatuscode == 200) {
            System.out.println("\n**Visiting** Received web page at " + url);
        } else {
            System.out.println("\nHttpStstaus code" + httpStatuscode);
        }

        // The Content-Type header may be absent, in which case contentType() is null.
        String contentType = connection.response().contentType();
        if (contentType == null || !contentType.contains("text/html")) {
            System.out.println("**Failure** Retrieved something other than HTML");
            return 0;
        }

        // Compare by content, not by reference: a missing or empty word matches every link.
        boolean collectAll = word == null || word.isEmpty();

        Elements linksOnPage = htmlDocument.select("a[href]");
        System.out.println("Found (" + linksOnPage.size() + ") links");
        // NOTE(review): word_found / word_notfound are fields that are not reset here —
        // confirm whether a value left over from an earlier call is intended to carry over.
        for (Element link : linksOnPage) {
            if (collectAll || link.attr("href").contains(word)) {
                this.links.add(link.absUrl("href"));
                word_found = 1;
            } else {
                word_notfound = 1;
            }
        }

        if (word_found == 1) {
            // Links were collected, so persist the downloaded page and report it.
            if (htmlDocument != null) {
                String path = writeContent(htmlDocument);
                writeReportHtml(url, path, httpStatuscode);
            } else {
                System.out.println("Inside Spider - HTMLDOCUMENt null");
            }
        }
        if (word_notfound == 1 && word_found == 0) {
            return 0;
        }
        return 1;
    } catch (IOException ioe) {
        // The HTTP request was not successful.
        System.out.println("Inside Spider - excpetion occured: " + ioe);
        return -1;
    }
}

From source file:webscrap.WebScrap.java

/**
 * Walks the Met Museum online collection page by page, following each page's
 * {@code a.next} link, and writes up to 500 records classified as paintings to
 * {@code Records.json} as a JSON array.
 *
 * <p>The crawl stops when 500 paintings were collected, when a page has no usable next
 * link, or when a page cannot be fetched; whatever was collected up to that point is
 * still written to the file.
 *
 * @param args the command line arguments (unused)
 */
public static void main(String[] args) {
    // JSON keys of an exported record.
    final String nameTag = "Name";
    final String artistTag = "Artist";
    final String imgURLTag = "imgURL";
    final String dateTag = "Date";
    final String mediumTag = "Medium";
    final String dimTag = "Dimension";
    final String classTag = "Classification";
    final String credit_line_tag = "Credit_Line";
    final String accessNumTag = "Accession_Number";

    String next = "http://www.metmuseum.org/collection/the-collection-online/search/11432?pos=1&rpp=30&pg=1&rndkey=20150123&ft=*&deptids=2";
    int remaining = 500;
    JSONArray store = new JSONArray();

    // try-with-resources closes the writer even when writing fails.
    try (FileWriter output = new FileWriter(new File("Records.json"))) {
        try {
            while (remaining != 0 && !next.isEmpty()) {
                Document doc = Jsoup.connect(next).get();

                // The object name is the heading text in front of the "Email" label;
                // the split may produce no parts at all, so guard the index.
                String[] titleParts = doc.getElementsByTag("h2").text().split("Email");
                String name = titleParts.length > 0 ? titleParts[0] : "";

                // When several images match, the last one wins.
                String imgsrc = "";
                for (Element img : doc.select("div#inner-image-container img")) {
                    imgsrc = img.absUrl("src");
                }

                String artistName = "";
                String date = "";
                String medium = "";
                String dim = "";
                String classification = "";
                String credit_line = "";
                String accessNum = "";

                // Each tombstone entry reads "Label: value". The first matched div is skipped.
                boolean firstDiv = true;
                for (Element div : doc.select("div.tombstone").select("div")) {
                    if (firstDiv) {
                        firstDiv = false;
                        continue;
                    }
                    String[] parts = div.text().split(":");
                    if (parts.length < 2) {
                        // No "Label: value" pair in this div.
                        continue;
                    }
                    String value = parts[1];
                    // Labels other than these (e.g. Geography, Culture) are not exported.
                    switch (parts[0]) {
                    case "Artist":
                        artistName = value;
                        break;
                    case "Date":
                        date = value;
                        break;
                    case "Medium":
                        medium = value;
                        break;
                    case "Dimensions":
                        dim = value;
                        break;
                    case "Classification":
                        classification = value;
                        break;
                    case "Credit Line":
                        credit_line = value;
                        break;
                    case "Accession Number":
                        accessNum = value;
                        break;
                    default:
                        break;
                    }
                }

                // The value keeps the space that follows the colon, hence " Paintings".
                if (classification.equals(" Paintings")) {
                    JSONObject jsonObj = new JSONObject();
                    jsonObj.put(nameTag, name);
                    jsonObj.put(artistTag, artistName);
                    jsonObj.put(imgURLTag, imgsrc);
                    jsonObj.put(dateTag, date);
                    jsonObj.put(mediumTag, medium);
                    jsonObj.put(dimTag, dim);
                    jsonObj.put(classTag, classification);
                    jsonObj.put(credit_line_tag, credit_line);
                    jsonObj.put(accessNumTag, accessNum);

                    store.add(jsonObj);
                    remaining--;
                }

                // Move on to the next page; a missing or unresolvable link ends the crawl.
                Element link = doc.select("a.next").first();
                next = (link == null) ? "" : link.attr("abs:href");
            }
        } catch (IOException e) {
            System.err.println("Crawl stopped early at " + next + ": " + e);
        }

        output.write(store.toJSONString());
        output.write("\n");
        output.flush();
    } catch (IOException e) {
        System.err.println("Could not write Records.json: " + e);
    }
}