sachin.spider.Page.java Source code

Java tutorial

Introduction

Here is the source code for sachin.spider.Page.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package sachin.spider;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.http.Header;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Holds the data recorded for a single web page: its address, the parsed
 * jsoup {@link Document}, the response headers, status code, status message
 * and response time, plus helpers that extract links and meta tags from the
 * document.
 *
 * <p>
 * The constructor and all setters are package-private; only the read
 * accessors are public. The class performs no synchronization.
 *
 * @author JARVIS
 */
public class Page {

    /** Response headers; {@code null} until {@link #setHeaders(Header[])} is called. */
    private Header[] headers;
    /** Parsed document of the page; may be {@code null} if none was supplied. */
    private final Document document;
    /** Processed flag (spelling kept because it is part of the accessor names). */
    private boolean proccessed;
    /** Identifier derived from {@code address.hashCode()}. */
    private final int docid;
    /** Response time as supplied by the caller of the setter (unit not defined here). */
    private int resposneTime;
    /** Address (URL string) of the page. */
    private final String address;
    private int statusCode;
    private String statusMessage;
    /** Result of the most recent {@link #getOutgoingLinks()} call, or {@code null}. */
    private List<String> outgoingLinks;
    // NOTE(review): initialised in the constructor but never read or exposed
    // by this class - confirm whether it is still needed.
    private List<WebURL> parents;

    /**
     * Constructor of the Page class.
     *
     * @param address URL of the web page; must not be {@code null} because
     * the document id is computed from its hash code
     * @param document parsed document of the web page
     */
    Page(String address, Document document) {
        this.address = address;
        this.document = document;
        this.docid = address.hashCode();
        parents = new ArrayList<>();
    }

    /**
     * Returns all the headers of the page.
     *
     * @return array of all the headers, exactly as passed to
     * {@link #setHeaders(Header[])}; {@code null} if never set
     */
    public Header[] getHeaders() {
        return headers;
    }

    /**
     * Stores the headers of the page.
     *
     * @param headers array of headers
     */
    void setHeaders(Header[] headers) {
        this.headers = headers;
    }

    /**
     * Returns the jsoup document of the page.
     *
     * @return document object of the page, as given to the constructor
     */
    public Document getDocument() {
        return document;
    }

    /**
     * Tells whether the page has been flagged as processed.
     *
     * @return the value last passed to {@link #setProccessed(boolean)},
     * {@code false} if it was never called
     */
    boolean isProccessed() {
        return proccessed;
    }

    /**
     * Sets the processed flag of the page.
     *
     * @param proccessed new value of the flag
     */
    void setProccessed(boolean proccessed) {
        this.proccessed = proccessed;
    }

    /**
     * Returns the page id.
     *
     * <p>
     * The id is the hash code of the address string, so two different
     * addresses can share the same id.
     *
     * @return an Integer value for the page
     */
    public int getDocid() {
        return docid;
    }

    /**
     * Returns the page response time.
     *
     * @return the value last passed to {@link #setResposneTime(int)}
     */
    public int getResposneTime() {
        return resposneTime;
    }

    /**
     * Stores the page response time.
     *
     * @param resposneTime time taken to get the response
     */
    void setResposneTime(int resposneTime) {
        this.resposneTime = resposneTime;
    }

    /**
     * Returns the page url.
     *
     * @return url of the page as String
     */
    public String getAddress() {
        return address;
    }

    /**
     * Returns the status code of the page.
     *
     * @return an Integer value for status code
     */
    public int getStatusCode() {
        return statusCode;
    }

    /**
     * Stores the status code of the page.
     *
     * @param statusCode status code of the response
     */
    void setStatusCode(int statusCode) {
        this.statusCode = statusCode;
    }

    /**
     * Returns the status message of the page.
     *
     * @return string value for status message; {@code null} if never set
     */
    public String getStatusMessage() {
        return statusMessage;
    }

    /**
     * Stores the status message of the page.
     *
     * @param statusMessage status message of the response
     */
    void setStatusMessage(String statusMessage) {
        this.statusMessage = statusMessage;
    }

    /**
     * Returns the outgoing links of the page: the absolute URL of every
     * {@code href} attribute followed by the absolute URL of every
     * {@code src} attribute, without duplicates and without entries that
     * cannot be made absolute.
     *
     * <p>
     * The list is rebuilt from the document on every call and remembered for
     * {@link #getExternalLinks()}. The returned list is a copy, so modifying
     * it does not affect this page.
     *
     * @return List of all outgoing links; empty if there is no document
     */
    public List<String> getOutgoingLinks() {
        // A LinkedHashSet keeps first-seen order while de-duplicating in O(1).
        Set<String> unique = new LinkedHashSet<>(collectAbsoluteUrls("[href]", "href"));
        unique.addAll(collectAbsoluteUrls("[src]", "src"));
        outgoingLinks = new ArrayList<>(unique);
        return new ArrayList<>(outgoingLinks);
    }

    /**
     * Returns the outgoing links that do not belong to the site of this page.
     *
     * <p>
     * A link is internal when it starts (ignoring case) with the part of the
     * page address up to and including the host, and that prefix is followed
     * by the end of the link or by one of {@code / : ? #}. Everything else is
     * external. The outgoing links are computed first if
     * {@link #getOutgoingLinks()} has not been called yet.
     *
     * @return List of all external outgoing links; empty if the page address
     * is not a valid URL
     */
    public List<String> getExternalLinks() {
        List<String> externalLinks = new ArrayList<>();
        if (outgoingLinks == null) {
            outgoingLinks = getOutgoingLinks();
        }
        try {
            String baseHref = baseHrefOf(new URL(address));
            for (String link : outgoingLinks) {
                if (!isUnderBase(link, baseHref)) {
                    externalLinks.add(link);
                }
            }
        } catch (MalformedURLException ex) {
            Logger.getLogger(Page.class.getName())
                    .log(Level.SEVERE, "Malformed page address: " + address, ex);
        }
        return externalLinks;
    }

    /**
     * Returns the sources of the images on the page.
     *
     * @return List of the distinct absolute {@code src} URLs of all
     * {@code img} elements; empty if there is no document
     */
    public List<String> getImageLinks() {
        return collectAbsoluteUrls("img[src]", "src");
    }

    /**
     * Returns the links of all the anchor tags on the page.
     *
     * <p>
     * Anchors whose {@code href} is just {@code "#"} are skipped. The check
     * is made on the raw attribute because the absolute form of such a link
     * is never the bare string {@code "#"}.
     *
     * @return List of the distinct absolute {@code href} URLs of all anchor
     * tags; empty if there is no document
     */
    public List<String> getHyperLinks() {
        Set<String> unique = new LinkedHashSet<>();
        if (document != null) {
            Elements anchors = document.select("a[href]");
            for (Element anchor : anchors) {
                if ("#".equals(anchor.attr("href").trim())) {
                    continue;
                }
                String link = anchor.absUrl("href");
                if (!link.isEmpty()) {
                    unique.add(link);
                }
            }
        }
        return new ArrayList<>(unique);
    }

    /**
     * Returns the meta tags of the page.
     *
     * <p>
     * Every {@code meta} element is visited. Its key is the {@code name}
     * attribute, falling back to {@code property} and then
     * {@code http-equiv}; elements with none of these are skipped. The value
     * is the {@code content} attribute (empty string if absent). When a key
     * occurs more than once the first occurrence wins.
     *
     * @return Map of all meta tags on the page; empty if there is no document
     */
    public Map<String, String> getAllMetaTags() {
        Map<String, String> metaTags = new HashMap<>();
        if (document == null) {
            return metaTags;
        }
        Elements tags = document.select("meta");
        for (Element tag : tags) {
            String key = tag.attr("name");
            if (key.isEmpty()) {
                key = tag.attr("property");
            }
            if (key.isEmpty()) {
                key = tag.attr("http-equiv");
            }
            if (!key.isEmpty() && !metaTags.containsKey(key)) {
                metaTags.put(key, tag.attr("content"));
            }
        }
        return metaTags;
    }

    /**
     * Collects, in document order and without duplicates, the non-empty
     * absolute URLs of the given attribute on all elements matching the query.
     *
     * @param cssQuery jsoup selector choosing the elements
     * @param attributeKey attribute holding the URL
     * @return new mutable list of URLs; empty if there is no document
     */
    private List<String> collectAbsoluteUrls(String cssQuery, String attributeKey) {
        Set<String> unique = new LinkedHashSet<>();
        if (document != null) {
            Elements matches = document.select(cssQuery);
            for (Element match : matches) {
                String link = match.absUrl(attributeKey);
                if (!link.isEmpty()) {
                    unique.add(link);
                }
            }
        }
        return new ArrayList<>(unique);
    }

    /**
     * Returns the leading part of the page address up to and including the
     * host (scheme, {@code //}, optional user info, host).
     *
     * @param url the parsed page address
     * @return the base prefix used to recognise internal links
     */
    private String baseHrefOf(URL url) {
        String host = url.getHost();
        // Look for the host only after "//": a short host name such as "t"
        // or "p" would otherwise be found inside the scheme ("http").
        int authorityMarker = address.indexOf("//");
        int searchFrom = authorityMarker < 0 ? 0 : authorityMarker + 2;
        int hostStart = address.indexOf(host, searchFrom);
        if (hostStart < 0) {
            // Not expected for a successfully parsed address; rebuild instead.
            return url.getProtocol() + "://" + host;
        }
        return address.substring(0, hostStart + host.length());
    }

    /**
     * Tells whether a link lies under the given base prefix.
     *
     * @param link absolute link to test
     * @param baseHref prefix produced by {@link #baseHrefOf(URL)}
     * @return {@code true} if the link starts with the prefix (ignoring case)
     * and the prefix ends on a URL component boundary
     */
    private static boolean isUnderBase(String link, String baseHref) {
        int baseLength = baseHref.length();
        if (!link.regionMatches(true, 0, baseHref, 0, baseLength)) {
            return false;
        }
        if (link.length() == baseLength) {
            return true;
        }
        // Require a delimiter so that "http://example.com" does not also
        // match "http://example.com.evil.org".
        char next = link.charAt(baseLength);
        return next == '/' || next == ':' || next == '?' || next == '#';
    }
}