net.trustie.model.SFProject_Model.java Source code

Java tutorial

Introduction

Here is the source code for net.trustie.model.SFProject_Model.java

Source

package net.trustie.model;

import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.Date;

import net.trustie.utils.DateHandler;
import net.trustie.utils.StringHandler;
import net.trustie.utils.VanishTime;

import org.apache.commons.codec.digest.DigestUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import core.AfterExtractor;
import core.Page;
import us.codecraft.webmagic.model.annotation.ExtractBy;

@ExtractBy("//body[@id='pg_project']/div[@id='page-body']/article[@id='project']")
// ??
public class SFProject_Model implements AfterExtractor {
    // @ExtractBy("//div[@id='project-header']/section[@id='project-title']/h1/allText() | //div[@id='project-header']/div[@class='content-group']/h1[@class='project-name']/text()")
    private String name = "";
    // @ExtractBy("//div[@id='project-header']/section[@id='project-title']/p[@itemprop='author']/a/allText()")
    private String maintainers = "";
    // @ExtractBy("//section[@id='main-content']/section[@id='call-to-action']/section[@id='counts-sharing']/section[@id='project-info']/section[@class='content']/a[@title='Browse reviews']/text()")
    private float stars = 0;
    private String downloadCount = "";
    // @ExtractBy("//section[@id='main-content']/section[@id='call-to-action']/section[@id='counts-sharing']/section[@id='last-updated']/section[@class='content']/time[@class='dateUpdated']/@datetime | //div[@id='project-header']/div[@class='content-group']/div[@class='project-rating']/time[@class='dateUpdated']/@datetime")
    private String lastUpdate = "";
    // @ExtractBy("//article[@id='project']/section[@id='main-content']/section[@id='call-to-action']/section[@id='download_button']/section[@class='project-info']/allText() | //div[@id='project-header']/div[@class='content-group']/h1[@class='download-os']/allText()")
    private String platform = "";
    // @ExtractBy("//article[@id='project']/section[@id='main-content//']/section[@id='project-description']/p/allText() | //section[@id='overview']/div[@class='content-group']/section[@class='primary-content']/p[@id='project-description']/allText()")
    private String desc = "";
    // @ExtractBy("//article[@id='project']/section[@id='main-content']/section[@id='project-categories-and-license']/div/section[1]/a/allText()")
    private String categories = "";
    // @ExtractBy("//article[@id='project']/section[@id='main-content']/section[@id='project-categories-and-license']/div/section[2]/section[@class='project-info']/section[@class='content']/allText()")
    private String license = "";
    // @ExtractBy("//article[@id='project']/section[@id='main-content']/section[@id='project-features']/div[@class='content editable']/allText() | //section[@id='overview']/div[@class='content-group']/section[@class='primary-content']/p[@id='project-features']/div[@class='content editable']/allText()")
    private String feature = "";
    // additional project details
    private String language = "";
    private String intendedAudience = "";
    private String userInterface = "";
    private String programmingLanguage = "";
    private String registeredTime = "";
    private String collectTime;
    // @ExtractByUrl()
    private String url;
    private String urlMd5;
    private String pageMd5;
    private int history = 0;

    // private String html;

    public void afterProcess(Page page) {
        // long start = System.currentTimeMillis();
        this.url = page.getPageUrl();
        // justify it's enterprise or bluesteel user
        // this.html = page.getHtml().toString();
        this.urlMd5 = DigestUtils.md5Hex(page.getPageUrl());
        SimpleDateFormat bartDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        this.collectTime = bartDateFormat.format(new Date());
        this.pageMd5 = DigestUtils.md5Hex(urlMd5 + lastUpdate + feature + downloadCount + stars);
        Document doc = page.getHtml().getDocument();
        Elements bodyEles = doc.select("body");

        if (bodyEles.size() > 0) {
            Element body = bodyEles.get(0);
            String bodyType = body.attr("id");
            if ("pg_project".equals(bodyType)) {
                String type = body.attr("class");

                if (type.equals("bluesteel user")) {

                    // bluesteel user
                    extractPageBluesteelUser(doc);

                } else if (type.equals("enterprise user")) {
                    // enterprise user
                    extractPageEnterpriseUser(doc);

                } else {
                    // others
                }

                if (lastUpdate.contains("ago")) {
                    this.lastUpdate = getTime(lastUpdate);
                }
                if (registeredTime.contains("ago")) {
                    this.registeredTime = getTime((registeredTime));
                }

                if (lastUpdate.equals("")) {
                    this.lastUpdate = "0000-00-00 00:00:00";
                }
                if (registeredTime.equals("")) {
                    this.registeredTime = "0000-00-00 00:00:00";
                }
            } else {
                // name
                Elements nameEles = body.select("div#proj_header div.proj-title h2");
                this.name = nameEles.text();

                // desc
                Elements descEles = body.select("div#top_left div#home_intro div#proj-overview p");
                this.desc = descEles.text();

                // features
                Elements featuresEles = body.select("div#top_left div#home_intro div#proj-overview ul");
                this.feature = featuresEles.text();
            }

            this.lastUpdate = DateHandler.formatAllTypeDate(lastUpdate, page.getTime());
            this.registeredTime = DateHandler.formatAllTypeDate(registeredTime, page.getTime());

        }

        // long end = System.currentTimeMillis();
        // System.out.println(end-start);

        // System.out.println(this.toString());
        // System.out.println(types.get(0).attr("class"));
        // Document doc=page.getHtml().getDocument();

    }

    private void extractPageEnterpriseUser(Document doc) {
        // name
        Elements nameElements = doc.select("div#project-header div.content-group h1.project-name a");
        name = nameElements.text();

        // maintainers

        // stars
        Elements starsElements = doc
                .select("div#project-header div.content-group div.project-rating span:not(.rating-count)");
        for (int i = 0; i < starsElements.size(); i++) {
            String attr = starsElements.get(i).attr("class");
            if (attr.equals("rating star")) {
                stars += 1;
            } else if (attr.equals("rating star_half")) {
                stars += 0.5;
            } else {
                stars += 0;
            }
        }

        // download count
        Elements downloadElements = doc
                .select("div#project-header div.content-group div.group a.download-stats span.data");
        if (downloadElements.size() > 0) {
            String strDownloadCount = downloadElements.get(0).text();
            strDownloadCount = strDownloadCount.replaceAll("[^\\d]", "");
            this.downloadCount = strDownloadCount;
        }

        // last update
        Elements lastUpdateElements = doc
                .select("div#project-header div.content-group div.project-rating time.dateUpdated");
        if (lastUpdateElements.size() > 0) {
            lastUpdate = lastUpdateElements.get(0).attr("datetime");
        }

        // platform
        Elements downloadOSElements = doc.select("div#project-header div.content-group div.download-os");
        platform = downloadOSElements.text();

        // desc
        Elements descElements = doc
                .select("section#overview div.content-group section.primary-content p#project-description");
        desc = descElements.text();

        // categories
        // license

        // feature
        Elements featureElements = doc.select(
                "section#overview div.content-group section.primary-content section#project-features div[class=content editable]");
        feature = featureElements.text();
        // language
        // intended audience
        // user interface
        // program language
        // registered time
        // additional detail
        Elements enterpriseAddtionalElements = doc.select(
                "section#overview div.content-group section.primary-content aside#additional-details section.content section.project-info");
        for (int i = 0; i < enterpriseAddtionalElements.size(); i++) {
            Element element = enterpriseAddtionalElements.get(i);
            // System.out.println(element.html());
            // System.out.println("*************************************");
            Elements tags = element.select("header");
            if (tags.size() > 0) {
                String tag = tags.text();
                if (tag.equals("Languages")) {
                    language = element.select("section.content").text();
                } else if (tag.equals("Intended Audience")) {
                    intendedAudience = element.select("section.content").text();
                } else if (tag.equals("User Interface")) {
                    userInterface = element.select("section.content").text();
                } else if (tag.equals("Programming Language")) {
                    programmingLanguage = element.select("section.content").text();
                } else if (tag.equals("Registered")) {
                    registeredTime = element.select("section.content").text();
                } else if (tag.equals("Last Updated")) {
                    lastUpdate = element.select("section.content").text();
                } else if (tag.equals("Maintainers")) {
                    maintainers = element.select("a").text();
                } else if (tag.equals("License")) {
                    license = element.select("section.content").text();
                } else if (tag.equals("Categories")) {
                    categories = element.select("a").text();
                } else {

                }
            }

        }
    }

    private void extractPageBluesteelUser(Document doc) {
        // if the project type is
        // bluesteel
        // name
        Elements nameElements = doc.select("div#project-header section#project-title h1[itemprop=name]");
        if (nameElements.size() > 0) {
            name = nameElements.get(0).text();
        }

        // maintainers
        Elements maintainersElements = doc.select("div#project-header section#project-title p#maintainers a");
        maintainers = maintainersElements.text();

        // stars
        Elements starElements = doc.select(
                "article#project section#main-content section#call-to-action section#counts-sharing section.project-info section.content a[title=Browse reviews]");
        if (starElements.size() > 0) {
            String strStar = starElements.get(0).text();
            strStar = strStar.replaceAll("[^\\d\\.]", "");
            stars = Float.parseFloat(strStar);
        }

        // downloadCount
        Elements downloadElements = doc.select(
                "article#project section#main-content section#call-to-action section#counts-sharing section#download-stats section.content a[title=Downloads This Week]");
        if (downloadElements.size() > 0) {
            String strDownloadCount = downloadElements.get(0).text();
            strDownloadCount = strDownloadCount.replaceAll("[^\\d]", "");
            downloadCount = strDownloadCount;
        }

        // last update
        Elements lastUpdateElements = doc.select(
                "article#project section#main-content section#call-to-action section#counts-sharing section#last-updated section.content time.dateUpdated");
        if (lastUpdateElements.size() > 0) {
            lastUpdate = lastUpdateElements.get(0).attr("datetime");
        }

        // platform
        Elements platformElements = doc.select(
                "article#project section#main-content section#call-to-action section#download_button section.project-info");
        if (platformElements.size() > 0) {
            platform = platformElements.text();
        }

        // desc
        Elements descElements = doc.select("article#project section#main-content section#project-description p");
        desc = descElements.text();

        // categories
        Elements categoriesElements = doc.select(
                "article#project section#main-content section#project-categories-and-license div.project-container section:has(header:contains(Categories)) a");
        categories = categoriesElements.text();

        // license
        Elements licenseElements = doc.select(
                "article#project section#main-content section#project-categories-and-license div.project-container section:has(section.project-info header:contains(License)");
        if (licenseElements.size() > 0) {
            license = licenseElements.get(0).select("section.project-info section.content").text();
        }
        // license = licenseElements.html();

        // feature
        Elements featureElements = doc.select(
                "article#project section#main-content section#project-features div[class=content editable]");
        feature = featureElements.text();

        // language+intended audience+user interface+programming
        // language+registered time
        Elements addtionalElements = doc.select(
                "article#project section#main-content section#project-additional-trove div.project-container section.project-info");
        // System.out.println(addtionalElements.html());
        for (int i = 0; i < addtionalElements.size(); i++) {
            Element element = addtionalElements.get(i);
            // System.out.println(element.html());
            // System.out.println("*************************************");
            Elements tags = element.select("header");
            if (tags.size() > 0) {
                String tag = tags.text();
                if (tag.equals("Languages")) {
                    language = element.select("section.content").text();
                } else if (tag.equals("Intended Audience")) {
                    intendedAudience = element.select("section.content").text();
                } else if (tag.equals("User Interface")) {
                    userInterface = element.select("section.content").text();
                } else if (tag.equals("Programming Language")) {
                    programmingLanguage = element.select("section.content").text();
                } else if (tag.equals("Registered")) {
                    registeredTime = element.select("section.content").text();
                } else {

                }
            }

        }
    }

    public void validate(Page page) {
        if (StringHandler.isAtLeastOneBlank(this.name, this.platform, this.categories, this.url)) {
            page.setResultSkip(this, true);
            return;
        }

        if (!StringHandler.canFormatterInteger(this.downloadCount)) {
            page.setResultSkip(this, true);
            return;
        }
        ;
        if (!DateHandler.canFormatToDate(this.lastUpdate, this.registeredTime, this.collectTime)) {
            page.setResultSkip(this, true);
        }

    }

    private String getTime(String strTime) {
        String time = null;
        int vanishTime = Integer.parseInt(strTime.replaceAll("[^\\d]", ""));
        String[] ways = strTime.split("\\d+");
        String unit = ways[1];
        if (unit.contains("seconds")) {
            time = (new Timestamp(System.currentTimeMillis() - vanishTime * VanishTime.SECOND)).toString();
        } else if (unit.contains("minutes")) {
            time = (new Timestamp(System.currentTimeMillis() - vanishTime * VanishTime.MINUTE)).toString();
        } else if (unit.contains("hours")) {
            time = (new Timestamp(System.currentTimeMillis() - vanishTime * VanishTime.HOUR)).toString();
        } else if (unit.contains("days")) {
            time = (new Timestamp(System.currentTimeMillis() - vanishTime * VanishTime.DAY)).toString();
        } else if (unit.contains("months")) {
            time = (new Timestamp(System.currentTimeMillis() - vanishTime * VanishTime.MONTH)).toString();
        } else if (unit.contains("years")) {
            time = (new Timestamp(System.currentTimeMillis() - vanishTime * VanishTime.YEAR)).toString();
        } else {
            SimpleDateFormat bartDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            return bartDateFormat.format(new Date());
        }
        return time;
    }

    public String getName() {
        return name;
    }

    public String getMaintainers() {
        return maintainers;
    }

    public float getStars() {
        return stars;
    }

    public String getDownloadCount() {
        return downloadCount;
    }

    public String getLastUpdate() {
        return lastUpdate;
    }

    public String getPlatform() {
        return platform;
    }

    public String getDesc() {
        return desc;
    }

    public String getCategories() {
        return categories;
    }

    public String getLicense() {
        return license;
    }

    public String getFeature() {
        return feature;
    }

    public String getLanguage() {
        return language;
    }

    public String getIntendedAudience() {
        return intendedAudience;
    }

    public String getUserInterface() {
        return userInterface;
    }

    public String getProgrammingLanguage() {
        return programmingLanguage;
    }

    public String getRegisteredTime() {
        return registeredTime;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setMaintainers(String maintainers) {
        this.maintainers = maintainers;
    }

    public void setStars(float stars) {
        this.stars = stars;
    }

    public void setDownloadCount(String downloadCount) {
        this.downloadCount = downloadCount;
    }
    /*
       public void setLastUpdate(String lastUpdate) {
          this.lastUpdate = DateHandler.formatAllTypeDate(lastUpdate,page.getTime());
       }*/

    public void setPlatform(String platform) {
        this.platform = platform;
    }

    public void setDesc(String desc) {
        this.desc = desc;
    }

    public void setCategories(String categories) {
        this.categories = categories;
    }

    public void setLicense(String license) {
        this.license = license;
    }

    public void setFeature(String feature) {
        this.feature = feature;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public void setIntendedAudience(String intendedAudience) {
        this.intendedAudience = intendedAudience;
    }

    public void setUserInterface(String userInterface) {
        this.userInterface = userInterface;
    }

    public void setProgrammingLanguage(String programmingLanguage) {
        this.programmingLanguage = programmingLanguage;
    }

    public String getCollectTime() {
        return collectTime;
    }

    public void setCollectTime(String collectTime) {
        this.collectTime = collectTime;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
        this.urlMd5 = DigestUtils.md5Hex(url);
    }

    public String getUrlMd5() {
        return urlMd5;
    }

    public void setUrlMd5(String urlMD5) {
        this.urlMd5 = urlMD5;
    }

    public String getPageMd5() {
        return pageMd5;
    }

    public void setPageMd5(String pageMD5) {
        this.pageMd5 = pageMD5;
    }

    public int getHistory() {
        return history;
    }

    public void setHistory(int history) {
        this.history = history;
    }

    // public String getHtml() {
    // return html;
    // }
    //
    // public void setHtml(String html) {
    // this.html = html;
    // }

    /*
     * (non-Javadoc)
     * 
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return "SFProject [name=" + name + ", maintainers=" + maintainers + ", stars=" + stars + ", downloadCount="
                + downloadCount + ", lastUpdate=" + lastUpdate + ", platform=" + platform + ", desc=" + desc
                + ", categories=" + categories + ", license=" + license + ", feature=" + feature + ", language="
                + language + ", intendedAudience=" + intendedAudience + ", userInterface=" + userInterface
                + ", programmingLanguage=" + programmingLanguage + ", registeredTime=" + registeredTime
                + ", collectTime=" + collectTime + ", url=" + url + ", urlMD5=" + urlMd5 + ", pageMD5=" + pageMd5
                + ", history=" + history + "]";
    }

}