sachin.spider.Demoss.java Source code

Java tutorial

Introduction

Here is the source code for sachin.spider.Demoss.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package sachin.spider;

/**
 *
 * @author sku202
 */
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
import java.net.MalformedURLException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.http.HttpResponse;

public class Demoss extends WebSpider {

    public final static Pattern FILTERS = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g" + "|ico|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf|txt"
                    + "|doc|xls|java|cs|cpp|xml|ppt" + "|rm|smil|wmv|swf|wma|zip|rar|gz))",
            Pattern.CASE_INSENSITIVE);
    final String site;
    String host;
    SpiderConfig config;

    public Demoss(String site) {
        this.site = site;
        try {
            this.host = new URL(site).getHost().replaceAll("www.", "");
        } catch (MalformedURLException ex) {
            Logger.getLogger(Demoss.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    @Override
    public boolean shouldVisit(String url) {
        //        return false;
        return (url != null && !url.contains("?") && url.startsWith(config.getModifiedSiteName())
                && (!FILTERS.matcher(url).find()) && !url.contains("/recipes/detail"));
    }

    @Override
    public void handleLink(WebURL webUrl, HttpResponse response, int statusCode, String statusDescription) {
        if (statusCode > 400) {
            System.out.println(statusCode + " : " + webUrl.getUrl());
            System.out.println("Parent : " + webUrl.getParents());
            System.out.println(
                    "--------------------------------------------------------------------------------------------------------");
        }
    }
    //    /**
    //     *
    //     * @param page
    //     */
    //    @Override
    //    public void viewPage(Page page) {
    //        if (page.getAddress().equals(site)) {
    //            System.out.println(page.getHyperLinks());
    //        }
    //    }

    //    @Override
    //    protected void visitPage(Document document, WebURL webUrl) {
    //        super.visitPage(document, webUrl); //To change body of generated methods, choose Tools | Templates.
    //    }
    void go() {
        config = new SpiderConfig(site.trim());
        config.setConnectionRequestTimeout(120000);
        config.setConnectionTimeout(120000);
        config.setSocketTimeout(120000);
        config.setTotalSpiders(30);
        config.setAuthenticate(true);
        config.setUserAgentString(
                "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1");
        config.setUsername("wlnonproduser");
        config.setPassword("Pass@word11");
        try {
            config.start(this, config);
        } catch (Exception ex) {
            Logger.getLogger(Demoss.class.getName()).log(Level.SEVERE, null, ex);
        }
        //        System.out.println(config.getModifiedSiteName());
    }

    public static void main(String... str) {

        new Demoss("http://m-flora-ca-uat.unileversolutions.com/fr-CA").go();
    }
}