com.quangphuong.crawler.util.Crawler.java Source code

Java tutorial

Introduction

Here is the source code for com.quangphuong.crawler.util.Crawler.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.quangphuong.crawler.util;

import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;

import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.DomText;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.quangphuong.crawler.dto.Calendar;
import com.quangphuong.crawler.dto.Event;
import com.quangphuong.crawler.dto.Events;
import java.io.IOException;
import java.math.BigDecimal;
import org.springframework.beans.factory.annotation.Autowired;

/**
 *
 * @author quangphuong
 */
@Configuration
@EnableScheduling
public class Crawler {
    @Autowired
    XMLUtil xMLUtil;
    private static final WebClient webClient = new WebClient(BrowserVersion.CHROME);

    public Crawler() {
        webClient.getOptions().setJavaScriptEnabled(false);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        //        ProxyConfig proxyConfig = new ProxyConfig("10.88.16.183", 8080);
        //        webClient.getOptions().setProxyConfig(proxyConfig); 
    }

    //    public static void main(String[] args) {
    //        try {
    //////            List<Event> events = comingupCrawler();
    //////            Events events1 = new Events(events);
    //////            XMLUtil.marshallUtil(AppConstant.comingUpData, events1);
    ////
    ////            Events events = XMLUtil.unmarshallUtil(AppConstant.comingUpData, Events.class);
    ////
    ////            List<Event> listEvent = events.getEvent();
    ////            for (Event event : listEvent) {
    ////                System.out.println("Match: " + event.getMatch());
    ////            }
    //            getEventDetail(AppConstant.eventDemo);
    //        } catch (Exception e) {
    //            e.printStackTrace();
    //        }
    //    }
    @Scheduled(fixedDelay = 60000)
    public void schedule1() throws FileNotFoundException, IOException {
        List<Event> events = comingupCrawler();
        Events events1 = new Events(events);
        xMLUtil.marshallUtil(AppConstant.comingUpData, events1);
    }

    public static List<Event> comingupCrawler() throws IOException {
        List<Event> events = new ArrayList();
        WebClient client = webClient;
        //        try {
        HtmlPage page = client.getPage(AppConstant.comingUpPage);
        List<HtmlElement> tds = new ArrayList();
        tds = (List<HtmlElement>) page.getByXPath(AppConstant.comingUpEventColumn);
        //            listLinks = new ArrayList<String>();
        for (int i = 0; i < tds.size(); i++) {
            List<HtmlElement> tables = (List<HtmlElement>) tds.get(i).getByXPath("table");
            for (int j = 0; j < tables.size(); j = j + 2) {
                HtmlElement kindTable = tables.get(j);
                HtmlElement detailTable = tables.get(j + 1);
                HtmlElement kindtext = (HtmlElement) kindTable.getFirstByXPath(AppConstant.comingUpEventKind);
                String kind = kindtext.asText();

                List<HtmlElement> matches = (List<HtmlElement>) detailTable
                        .getByXPath(AppConstant.comingUpSameKindEvents);
                for (int k = 0; k < matches.size(); k++) {
                    HtmlElement matchtext = (HtmlElement) matches.get(k)
                            .getFirstByXPath(AppConstant.comingUpEventMatch);
                    HtmlElement liveText = (HtmlElement) matches.get(k)
                            .getFirstByXPath(AppConstant.comingUpEventLive);
                    DomText timeText = (DomText) matches.get(k).getFirstByXPath(AppConstant.comingUpEventTime);
                    HtmlElement tournamentText = (HtmlElement) matches.get(k)
                            .getFirstByXPath(AppConstant.comingUpEventTournament);
                    HtmlElement img = (HtmlElement) matches.get(k).getFirstByXPath(AppConstant.comingUpEventImage);
                    try {
                        if (tournamentText != null) {
                            String tournament = tournamentText.asText().replace(timeText.asText(), "")
                                    .replaceAll("\\(", "").replaceAll("\\)", "").trim();
                            String match = matchtext.asText();
                            String time = timeText.asText();
                            String link = AppConstant.prefix + matchtext.getAttribute("href");
                            String image = img.getAttribute("src");
                            String live = "";
                            if (liveText != null) {
                                live = liveText.getAttribute("src");
                            }
                            //                                listLinks.add(link);
                            Event event = new Event(kind, live, tournament.replace("\n", ""), match, time, link,
                                    image);
                            events.add(event);
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                        break;
                    }
                }
            }
        }
        System.out.println("-------------------");
        System.out.println(" web Client Memory size: " + Agent.sizeOf(webClient));
        //        } catch (Exception e) {
        //            e.printStackTrace();
        //        } finally {
        client.close();
        //        }
        return events;
    }

    @Scheduled(fixedDelay = 604800000)
    public void schedule2() {
        WebClient client = webClient;
        int[] calType = { 1, 10, 11, 22, 36, 37, 93 };
        for (int k : calType) {
            try {
                HtmlPage page = client.getPage(AppConstant.calendarPage + k);
                List<DomElement> els = (List<DomElement>) page.getByXPath(AppConstant.rounds);

                List<Calendar.Round> rounds = new ArrayList<Calendar.Round>();
                for (int i = 1; i < els.size() - 3; i++) {

                    DomElement tmp = els.get(i);
                    DomNode parent = tmp.getParentNode().getParentNode().getParentNode().getParentNode();

                    List<DomElement> trs = (List<DomElement>) parent.getByXPath("tbody/tr");
                    String date = "";
                    List<Calendar.Round.Match> matches = new ArrayList<Calendar.Round.Match>();
                    for (int j = 1; j < trs.size() - 1; j++) {
                        DomElement tr = trs.get(j);
                        HtmlElement tmp2 = tr.getFirstByXPath("td");
                        if (tmp2.hasAttribute("bgcolor")) {
                            date = tr.getTextContent().trim();
                        } else {
                            try {
                                DomElement td = tr.getFirstByXPath("td[1]/a");
                                if (td == null) {
                                    td = tr.getFirstByXPath("td[1]");
                                }
                                String teams = td.getTextContent().trim();
                                String link = td.getAttribute("href");
                                td = tr.getFirstByXPath("td[2]/a/img");
                                String logoTeam1 = td.getAttribute("src");
                                td = tr.getFirstByXPath("td[3]");
                                String score = td.getTextContent().trim();
                                td = tr.getFirstByXPath("td[4]/a/img");
                                String logoTeam2 = td.getAttribute("src");

                                Calendar.Round.Match match = new Calendar.Round.Match(date, link, teams, logoTeam1,
                                        logoTeam2, score);
                                match.toString();
                                matches.add(match);
                            } catch (Exception e) {
                                e.printStackTrace();
                            }
                        }
                    }
                    rounds.add(new Calendar.Round(matches, i));
                }
                Calendar calendar = new Calendar(rounds, k);
                xMLUtil.marshallUtil(intToCalendar(k), calendar);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                client.close();
            }
        }
    }

    public static String intToCalendar(int k) {
        switch (k) {
        case 1:
            return AppConstant.EnglandCalendar;
        case 10:
            return AppConstant.SpainCalendar;
        case 11:
            return AppConstant.ItalyCalendar;
        case 22:
            return AppConstant.DutchCalendar;
        case 36:
            return AppConstant.GermanCalendar;
        case 37:
            return AppConstant.FranceCalendar;
        case 93:
            return AppConstant.BrazilCalendar;
        }
        return "";
    }
}