smops.PageAnalyzer.java Source code

Java tutorial

Introduction

Here is the source code for smops.PageAnalyzer.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package smops;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.hibernate.Criteria;
import org.hibernate.HibernateException;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.hibernate.criterion.Restrictions;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import smops.dao.BusinessManager;
import smops.dao.Seal;
import smops.hibernate.Business;
import smops.hibernate.Field;
import smops.hibernate.Form;
import smops.hibernate.JsBusiness;
import smops.hibernate.JsLib;
import smops.hibernate.SealBusiness;
import smops.yelp.Yelp;

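/**
 * Analyses crawled HTML pages of businesses and stores the extracted forms, input fields,
 * JavaScript libraries, important-page links and seals through Hibernate.
 *
 * @author Aale
 */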
public class PageAnalyzer {

    /**
     * Parses a crawled HTML file and persists every accepted form (and its inputs) for the
     * given business.
     *
     * <p>A Hibernate session is opened for the duration of the call and is always closed
     * afterwards, also when parsing or persisting fails.
     *
     * @param url the URL of the page the file was crawled from
     * @param filepath path of the locally stored HTML file; it is read as UTF-8
     * @param business the business that owns the page
     * @param current_form_bodies texts of the forms already accepted for this business; the
     *        texts of newly accepted forms are added to this set
     * @return {@code current_form_bodies}; the same set is returned when the file could not be
     *         read, so callers can keep using it for the next page
     */
    public static Set<String> saveFormsAndFields(String url, String filepath, Business business,
            Set<String> current_form_bodies) {
        Session session = HibernateUtil.getSessionFactory().openSession();
        try {
            File input = new File(filepath);
            Document doc = Jsoup.parse(input, "UTF-8");
            analyseForms(doc, current_form_bodies, business, input, url, session);
        } catch (IOException ex) {
            // Best effort: an unreadable page must not abort the analysis of the remaining pages.
            Logger.getLogger(PageAnalyzer.class.getName()).log(Level.SEVERE,
                    "Could not parse " + filepath, ex);
        } finally {
            session.close();
        }
        return current_form_bodies;
    }

    /**
     * Parses a crawled HTML file and records the JavaScript libraries referenced by its
     * {@code <script src=...>} tags for the given business.
     *
     * <p>A Hibernate session is opened for the duration of the call and is always closed
     * afterwards, also when parsing or persisting fails.
     *
     * @param url the URL of the page the file was crawled from
     * @param filepath path of the locally stored HTML file; it is read as UTF-8
     * @param business the business that owns the page
     * @param current_js_libs names of the libraries already recorded for this business; newly
     *        recorded names are added to this set
     * @return {@code current_js_libs}, also when the file could not be read
     */
    public static Set<String> saveJSLibs(String url, String filepath, Business business,
            Set<String> current_js_libs) {
        Session session = HibernateUtil.getSessionFactory().openSession();
        try {
            File input = new File(filepath);
            Document doc = Jsoup.parse(input, "UTF-8");
            analyseJS(doc, current_js_libs, business, input, url, session);
        } catch (IOException ex) {
            // Best effort: an unreadable page must not abort the analysis of the remaining pages.
            Logger.getLogger(PageAnalyzer.class.getName()).log(Level.SEVERE,
                    "Could not parse " + filepath, ex);
        } finally {
            session.close();
        }
        return current_js_libs;
    }

    /**
     * Persists every accepted {@code <form>} of the parsed page together with its accepted
     * {@code <input>} elements.
     *
     * <p>A form is skipped when {@link #getNumberOfValidInputs} reports zero or when
     * {@code isAcceptedForm} rejects it. Each saved form and each saved field is committed in
     * its own transaction on the supplied session.
     *
     * @param doc the parsed page
     * @param current_form_bodies form texts already seen; passed on to {@code isAcceptedForm},
     *        which adds the text of every form it accepts
     * @param business the business stored on each saved form
     * @param input the file the page was parsed from; only its name is stored
     * @param url the page URL stored on each saved form
     * @param session the open Hibernate session used for all saves
     * @throws HibernateException if saving or committing fails
     */
    private static void analyseForms(Document doc, Set<String> current_form_bodies, Business business, File input,
            String url, Session session) throws HibernateException {
        Elements form_elements = doc.select("form");
        System.out.println("#forms in page: " + form_elements.size());
        //            System.out.println(biz_website_a.toString());
        // NOTE(review): form_elements was already dereferenced by size() above, so this null
        // check cannot be what guards against a null result.
        if (form_elements != null) {
            for (Element f : form_elements) {
                // Only forms with at least one "valid" input (see getNumberOfValidInputs) are stored.
                int numberOfValidInputs = getNumberOfValidInputs(f);
                if (numberOfValidInputs == 0) {
                    System.out.println("number of valid inputs is zero");
                    continue;
                }
                //                    session.beginTransaction();
                Form form_obj = new Form();
                String action = f.attr("action");
                String form_html = f.html();
                String form_text = f.text();
                // Side effect: an accepted form's text is added to current_form_bodies.
                boolean isAccepted = isAcceptedForm(form_obj, action, form_html, form_text, current_form_bodies);
                if (!isAccepted) {
                    System.out.println("form is not accepted!");
                    continue;
                }
                String title = getBestTitle(f);
                String purpose = getBestPurpose(f);
                form_obj.setAction(action);
                form_obj.setBusiness(business);
                form_obj.setFileName(input.getName());
                form_obj.setPageUrl(url);
                // Title and HTML are truncated to 2048 and 65535 respectively via Utils.truncate.
                form_obj.setTitle(Utils.truncate(title, 2048));
                form_obj.setHtml(Utils.truncate(form_html, 65535));
                form_obj.setPurpose(purpose);
                // The form is committed on its own before any of its fields are saved.
                session.beginTransaction();
                session.save(form_obj);
                System.out.println("form saved for " + url);
                session.getTransaction().commit();
                session.flush();
                System.out.println("form_obj=" + form_obj.getId());
                session.refresh(form_obj);
                System.out.println("form=" + form_obj);
                //                    session.getTransaction().commit();

                final Elements inputs = f.select("input");
                int input_count = 0;
                //                    session.beginTransaction();
                for (Element i : inputs) {
                    String type = i.attr("type");
                    //                    if (!(type.equals("text") || type.equals("password") || type.equals("radio") || type.equals("checkbox"))) {
                    //                        continue;
                    //                    }
                    // NOTE(review): this filter (isAcceptedInputType) is broader than the
                    // type="text" test used by getNumberOfValidInputs above.
                    if (!isAcceptedInputType(type)) {
                        continue;
                    }
                    input_count++;
                    String name = i.attr("name");
                    String value = i.attr("value");
                    // NOTE(review): html() is the element's inner HTML in jsoup; for an <input>
                    // this is presumably empty - confirm whether outerHtml() was intended.
                    String html = i.html();
                    String label = getBestLabel(i);
                    String infoType = getBestInfoType(i);
                    Field field = new Field();
                    field.setType(type);
                    field.setName(name);
                    field.setValue(Utils.truncate(value, 512));
                    field.setLabel(Utils.truncate(label, 1024));
                    field.setHtml(html);
                    field.setInfoType(infoType);
                    field.setForm(form_obj);
                    form_obj.getFields().add(field);
                    //                        System.out.println("field=" + field);
                    // One transaction per field: the form is re-attached and the field saved.
                    session.beginTransaction();
                    session.saveOrUpdate(form_obj);
                    session.save(field);
                    System.out.println("field " + name + " saved for " + form_obj.getId());
                    session.getTransaction().commit();

                }
                // NOTE(review): this save runs after the last commit, outside any transaction
                // begun in this method.
                session.save(form_obj);
                //                    session.getTransaction().commit();
                System.out.println("Input#:" + input_count);
            }
        }
    }

    /**
     * Command-line entry point: reads the crawl list, groups the crawled pages by business,
     * stores the page count on each business and saves the forms found on every page.
     *
     * <p>Each crawl-list line is expected to be {@code website<TAB>pageUrl<TAB>filePath}. Lines
     * that do not have this shape, or whose website has no matching business, are logged and
     * skipped instead of aborting the run. The seal, JavaScript-library, important-page and
     * website-type analyses that exist in this class are not invoked from here.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final Logger logger = Logger.getLogger(PageAnalyzer.class.getName());
        SessionFactory sessionFactory = HibernateUtil.getSessionFactory();
        Session session = sessionFactory.openSession();
        try {
            String crawl_list_file = "1000-websites/crawled-list-1000.txt";
            final List<String> cw_list = Utils.readFileLineByLine(crawl_list_file);
            Map<Integer, List<String>> crawlInfoMap = new HashMap<>();
            for (String cw : cw_list) {
                String[] splits = cw.split("\t", 2);
                if (splits.length < 2) {
                    logger.log(Level.WARNING, "Skipping malformed crawl list line: {0}", cw);
                    continue;
                }
                Business biz = BusinessManager.getByWebsite(splits[0], session);
                if (biz == null) {
                    // Defensive: a website without a business row would otherwise end the run with an NPE.
                    logger.log(Level.WARNING, "No business found for website: {0}", splits[0]);
                    continue;
                }
                int biz_id = biz.getId();
                System.out.println("biz_id=" + biz_id);
                String pageAndPath = splits[1];
                System.out.println("pageAndPath=" + pageAndPath);
                List<String> list = crawlInfoMap.get(biz_id);
                if (list == null) {
                    list = new ArrayList<String>();
                    crawlInfoMap.put(biz_id, list);
                }
                list.add(pageAndPath);
            }
            int biz_counter = 0;
            for (Map.Entry<Integer, List<String>> entry : crawlInfoMap.entrySet()) {
                final Integer biz_id = entry.getKey();
                final List<String> pages = entry.getValue();
                System.out.println("start analyzing " + biz_counter + " biz (id=" + biz_id + ")");
                biz_counter++;
                final Business biz = (Business) session.get(Business.class, biz_id);
                Set<String> current_form_bodies = new HashSet<>();
                session.beginTransaction();
                biz.setNumPages(pages.size());
                session.save(biz);
                session.getTransaction().commit();
                for (String others : pages) {
                    String[] splits = others.split("\t");
                    if (splits.length < 2) {
                        logger.log(Level.WARNING, "Skipping page entry without a file path: {0}", others);
                        continue;
                    }
                    String url = splits[0];
                    String file_path = splits[1];
                    // Maps the crawl server's directory onto the local checkout.
                    file_path = file_path.replace("/home/commercenet/smops/SMOPs/crawl-1000-only-valid/",
                            "C:\\Data\\github\\SMOPs\\1000-websites\\crawl-1000\\");
                    Set<String> updated = saveFormsAndFields(url, file_path, biz, current_form_bodies);
                    if (updated != null) {
                        // Keep the accumulated set if the callee reports failure with null.
                        current_form_bodies = updated;
                    }
                }
            }
        } finally {
            session.close();
        }
    }

    /**
     * Tells whether a candidate label text is usable.
     *
     * @param text the candidate text, may be {@code null}
     * @return {@code true} when the text is non-null, non-empty and
     *         {@code Constants.getInfoType} returns a non-null value for it
     */
    private static boolean checkForBestLabel(String text) {
        final boolean hasContent = text != null && text.length() > 0;
        return hasContent && Constants.getInfoType(text) != null;
    }

    /**
     * Looks for a label text for an element, trying in order the element's own text, the text
     * of its previous element sibling and the text of its parent.
     *
     * @param elem the element to find a label for
     * @return the first candidate accepted by {@code checkForBestLabel}, or an empty string
     *         when none is accepted
     */
    private static String getBestLabel(Element elem) {
        final String ownText = elem.text();
        if (checkForBestLabel(ownText)) {
            return ownText;
        }

        final Element before = elem.previousElementSibling();
        if (before != null) {
            final String siblingText = before.text();
            if (checkForBestLabel(siblingText)) {
                return siblingText;
            }
        }

        final Element container = elem.parent();
        if (container != null) {
            final String containerText = container.text();
            if (checkForBestLabel(containerText)) {
                return containerText;
            }
        }

        return "";
    }

    /**
     * Derives the title stored for a form.
     *
     * @param f the form element
     * @return the text of the form element
     */
    private static String getBestTitle(Element f) {
        final String formText = f.text();
        return formText;
    }

    /**
     * Counts the {@code <input>} descendants of a form that pass
     * {@code isAcceptedInputElement}.
     *
     * @param form the form element to inspect
     * @return the number of accepted inputs
     */
    private static int getNumberOfValidInputs(Element form) {
        final Elements candidates = form.select("input");
        int accepted = 0;
        for (int idx = 0; idx < candidates.size(); idx++) {
            if (isAcceptedInputElement(candidates.get(idx))) {
                accepted++;
            }
        }
        return accepted;
    }

    /**
     * Tells whether an input element is a text input that can be mapped to an info type.
     *
     * <p>The {@code type} attribute is compared case-insensitively because HTML treats its
     * values as case-insensitive; {@code type="TEXT"} therefore counts as a text input. The
     * name, the value and the best label are tried in that order.
     *
     * @param input the input element to inspect
     * @return {@code true} when the input has type {@code text} and
     *         {@code Constants.getInfoType} returns a non-null value for its name, its value
     *         or its best label
     */
    private static boolean isAcceptedInputElement(Element input) {
        String type = input.attr("type");
        if (!"text".equalsIgnoreCase(type)) {
            return false;
        }
        String name = input.attr("name");
        if (Constants.getInfoType(name) != null) {
            return true;
        }
        String value = input.attr("value");
        if (Constants.getInfoType(value) != null) {
            return true;
        }
        // The label lookup walks sibling/parent text, so it is tried last.
        String label = getBestLabel(input);
        return Constants.getInfoType(label) != null;
    }

    /**
     * Determines the purpose of a form by asking {@code Constants.getPurpose} about, in order:
     * the text of the form's first child element, the text of the element preceding the form,
     * and the form's own text (only when that text is shorter than 300 characters).
     *
     * @param form the form element to classify
     * @return the first non-null purpose found, or {@code "UNKNOWN"} when none is found
     */
    private static String getBestPurpose(Element form) {
        // children().first() yields null for a form without child elements, whereas child(0)
        // would throw IndexOutOfBoundsException in that case.
        final Element firstChild = form.children().first();
        String text, purpose;
        if (firstChild != null) {
            text = firstChild.text();
            purpose = Constants.getPurpose(text);
            if (purpose != null) {
                return purpose;
            }
        }
        final Element previousElementSibling = form.previousElementSibling();
        if (previousElementSibling != null) {
            text = previousElementSibling.text();
            purpose = Constants.getPurpose(text);
            if (purpose != null) {
                return purpose;
            }
        }
        String formText = form.text();
        // Long form texts are not consulted at all.
        if (formText != null && formText.length() < 300) {
            purpose = Constants.getPurpose(formText);
            if (purpose != null) {
                return purpose;
            }
        }
        return "UNKNOWN";
    }

    /**
     * Determines the info type of an input element from its name, its value and its best
     * label, tried in that order.
     *
     * @param input the input element to classify
     * @return the first non-null result of {@code Constants.getInfoType}, or {@code "UNKNOWN"}
     *         when the input type is not accepted or no candidate yields an info type
     */
    private static String getBestInfoType(Element input) {
        final String fallback = "UNKNOWN";
        if (!isAcceptedInputType(input.attr("type"))) {
            return fallback;
        }

        final String fromName = Constants.getInfoType(input.attr("name"));
        if (fromName != null) {
            return fromName;
        }

        final String fromValue = Constants.getInfoType(input.attr("value"));
        if (fromValue != null) {
            return fromValue;
        }

        final String fromLabel = Constants.getInfoType(getBestLabel(input));
        return fromLabel != null ? fromLabel : fallback;
    }

    /**
     * Decides whether a form should be stored and remembers the text of accepted forms.
     *
     * @param form_obj unused
     * @param action unused
     * @param form_html unused
     * @param form_text the text of the form
     * @param current_form_bodies texts of the forms accepted so far; the text of a newly
     *        accepted form is added
     * @return {@code false} when the text is longer than 1000 characters or is already in
     *         {@code current_form_bodies}; {@code true} otherwise
     */
    private static boolean isAcceptedForm(Form form_obj, String action, String form_html, String form_text,
            Set<String> current_form_bodies) {
        final boolean tooLong = form_text.length() > 1000;
        if (tooLong) {
            return false;
        }
        // Set.add reports whether the text was new, which is exactly the acceptance criterion.
        return current_form_bodies.add(form_text);
    }

    /**
     * Records, for the given business, each JavaScript library referenced by a
     * {@code <script>} tag with a non-empty {@code src}, once per library name.
     *
     * @param doc the parsed page
     * @param current_js_libs library names already recorded; newly recorded names are added
     * @param business the business the libraries are linked to
     * @param input unused
     * @param url unused
     * @param session the open Hibernate session used for lookups and saves
     */
    private static void analyseJS(Document doc, Set<String> current_js_libs, Business business, File input,
            String url, Session session) {
        final Elements scriptTags = doc.select("script");
        int position = 0;
        for (Element scriptTag : scriptTags) {
            final String src = scriptTag.attr("src");
            position++;
            if (src == null || src.length() == 0) {
                continue;
            }

            final String jslib = Utils.matchJsLib(src);
            if (!current_js_libs.contains(jslib)) {
                final JsLib jslibObj = getJsLibObj(jslib, session);
                if (jslibObj != null) {
                    current_js_libs.add(jslib);
                    final JsBusiness usage = new JsBusiness();
                    usage.setBusiness(business);
                    usage.setJsLib(jslibObj);
                    usage.setUrl(src);
                    session.beginTransaction();
                    session.save(usage);
                    session.getTransaction().commit();
                }
            }

            // Printed for every script with a src, whether or not it was newly recorded.
            System.out.println(position + " " + jslib + " - " + src);
        }
        System.out.println();
    }

    /**
     * Fetches the {@code JsLib} row with the given name, creating and committing it first when
     * no such row exists.
     *
     * @param jslib the library name to look up
     * @param session the open Hibernate session used for the query and the save
     * @return the existing or newly saved {@code JsLib}
     */
    private static JsLib getJsLibObj(String jslib, Session session) {
        final Object match = session.createCriteria(JsLib.class)
                .add(Restrictions.eq("name", jslib))
                .uniqueResult();
        final JsLib existing = (JsLib) match;
        if (existing != null) {
            return existing;
        }

        final JsLib created = new JsLib();
        created.setName(jslib);
        session.beginTransaction();
        session.save(created);
        session.getTransaction().commit();
        return created;
    }

    /**
     * Parses a crawled HTML file and updates the business with the important pages
     * (privacy policy, contact, login) detected on it.
     *
     * @param url the URL of the page the file was crawled from
     * @param filepath path of the locally stored HTML file; it is read as UTF-8
     * @param business the business to update
     * @param session the open Hibernate session used for saves
     */
    private static void saveImpPagesTypes(String url, String filepath, Business business, Session session) {
        try {
            File input = new File(filepath);
            Document doc = Jsoup.parse(input, "UTF-8");
            analyseLinksForImpPages(doc, business, input, url, session);
        } catch (IOException ex) {
            // Log under this class and name the file so the failing page can be identified.
            Logger.getLogger(PageAnalyzer.class.getName()).log(Level.SEVERE,
                    "Could not parse " + filepath, ex);
        }
    }

    /**
     * Detects important pages on a parsed page and records them on the business.
     *
     * <p>Every {@code <a>} element is classified with {@code Constants.getImpPage} using its
     * text, then its {@code href}, then its {@code alt}. For {@code PRIVACY_POLICY},
     * {@code CONTACT_US} and {@code LOGIN} the anchor text and link target are stored and the
     * matching flag is set. Afterwards the page's submit inputs are scanned once for a login
     * button. The scan is independent of the links, so it runs exactly once per page, also
     * when the page has no links.
     *
     * @param doc the parsed page
     * @param business the business to update
     * @param input unused
     * @param url unused
     * @param session the open Hibernate session used for saves
     */
    private static void analyseLinksForImpPages(Document doc, Business business, File input, String url,
            Session session) {
        Elements link_elements = doc.select("a");
        System.out.println("#links in page: " + link_elements.size());
        for (Element link : link_elements) {
            String href = link.attr("href");
            String text = link.text();
            String alt = link.attr("alt");
            String pageType = firstImpPageType(text, href, alt);
            if (pageType == null) {
                continue;
            }
            if (pageType.equals("PRIVACY_POLICY")) {
                if (text != null) {
                    business.setPrivacyPolicyAnchorText(text);
                }
                // The stored value is href, so href is the value that has to be non-null.
                if (href != null) {
                    business.setPrivacyPolicyUrl(href);
                }
                business.setHasPrivacyPolicy(Boolean.TRUE);
                System.out.println("privacy policy for biz " + business.getId() + " updated");
            } else if (pageType.equals("CONTACT_US")) {
                if (text != null) {
                    business.setContactUsAnchorText(text);
                }
                if (href != null) {
                    business.setContactUsUrl(href);
                }
                business.setHasContactUs(Boolean.TRUE);
                System.out.println("contact us for biz " + business.getId() + " updated");
            } else if (pageType.equals("LOGIN")) {
                if (text != null) {
                    business.setLoginAnchorText(text);
                }
                if (href != null) {
                    business.setLoginUrl(href);
                }
                business.setSupportLogin(Boolean.TRUE);
                System.out.println("login for biz " + business.getId() + " updated");
            }
            session.beginTransaction();
            session.save(business);
            session.getTransaction().commit();
        }
        markLoginSubmitButtons(doc, business, session);
    }

    /** Returns the first non-null {@code Constants.getImpPage} result among the non-null candidates. */
    private static String firstImpPageType(String... candidates) {
        for (String candidate : candidates) {
            if (candidate == null) {
                continue;
            }
            String type = Constants.getImpPage(candidate);
            if (type != null) {
                return type;
            }
        }
        return null;
    }

    /** Flags the business as supporting login for every submit input classified as LOGIN. */
    private static void markLoginSubmitButtons(Document doc, Business business, Session session) {
        Elements input_elements = doc.select("input");
        for (Element inp : input_elements) {
            // equalsIgnoreCase is null-safe here and does not depend on the default locale.
            if (!"submit".equalsIgnoreCase(inp.attr("type"))) {
                continue;
            }
            String name = inp.attr("name");
            String value = inp.attr("value");
            String title = inp.attr("title");
            String buttonType = firstImpPageType(value, title, name);
            if (buttonType == null) {
                continue;
            }
            if (buttonType.equals("LOGIN")) {
                String anchor = "SUBMIT INPUT: ";
                if (value != null) {
                    anchor += "value= " + value + " , ";
                }
                if (title != null) {
                    anchor += "title= " + title + " , ";
                }
                if (name != null) {
                    anchor += "name= " + name + " , ";
                }
                business.setLoginAnchorText(anchor);
                business.setSupportLogin(Boolean.TRUE);
                session.beginTransaction();
                session.save(business);
                session.getTransaction().commit();
            } else {
                System.out
                        .println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>The Button Type is not login. it is "
                                + buttonType);
            }
        }
    }

    /**
     * Parses a crawled HTML file and records the seals found among its links for the given
     * business.
     *
     * @param url the URL of the page the file was crawled from
     * @param filepath path of the locally stored HTML file; it is read as UTF-8
     * @param business the business the seals are linked to
     * @param current_seals seal names already recorded for this business; newly recorded names
     *        are added to this set
     * @param session the open Hibernate session used for saves
     * @return {@code current_seals}, also when the file could not be read
     */
    private static Set<String> saveSeals(String url, String filepath, Business business, Set<String> current_seals,
            Session session) {
        try {
            File input = new File(filepath);
            Document doc = Jsoup.parse(input, "UTF-8");
            analyseSeals(doc, current_seals, business, input, url, session);
        } catch (IOException ex) {
            // Log under this class and name the file so the failing page can be identified.
            Logger.getLogger(PageAnalyzer.class.getName()).log(Level.SEVERE,
                    "Could not parse " + filepath, ex);
        }
        return current_seals;
    }

    /**
     * Checks every {@code <a>} element of the page for a seal, looking it up by {@code href},
     * by anchor text and by {@code alt}, and hands every hit to {@code addSealForBusiness}.
     *
     * @param doc the parsed page
     * @param current_seals seal names already recorded for the business
     * @param business the business the seals are linked to
     * @param input unused
     * @param url unused
     * @param session the open Hibernate session used for saves
     */
    private static void analyseSeals(Document doc, Set<String> current_seals, Business business, File input,
            String url, Session session) {
        final Elements anchors = doc.select("a");
        System.out.println("#links in page: " + anchors.size());
        for (Element anchor : anchors) {
            final String href = anchor.attr("href");
            final String text = anchor.text();
            final String alt = anchor.attr("alt");

            final String sealFromUrl = (href == null) ? null : Seal.getSealByURL(href);
            if (sealFromUrl != null) {
                addSealForBusiness(sealFromUrl, href, text, current_seals, business, session);
            }

            final String sealFromText = (text == null) ? null : Seal.getSealByAnchorText(text);
            if (sealFromText != null) {
                addSealForBusiness(sealFromText, href, text, current_seals, business, session);
            }

            // A seal matched through alt is still stored with the anchor's text.
            final String sealFromAlt = (alt == null) ? null : Seal.getSealByAnchorText(alt);
            if (sealFromAlt != null) {
                addSealForBusiness(sealFromAlt, href, text, current_seals, business, session);
            }
        }
    }

    /**
     * Saves a seal for the business unless a seal with the same name was already recorded.
     *
     * @param seal the seal name
     * @param url the link target stored with the seal; skipped when {@code null}
     * @param anchorText the anchor text stored with the seal; skipped when {@code null}
     * @param current_seals seal names already recorded; {@code seal} is added after saving
     * @param business the business the seal is linked to
     * @param session the open Hibernate session used for the save
     */
    private static void addSealForBusiness(String seal, String url, String anchorText, Set<String> current_seals,
            Business business, Session session) {
        final boolean alreadyRecorded = current_seals.contains(seal);
        if (alreadyRecorded) {
            System.out.println("duplicate seal for the business, ignore adding seal");
            return;
        }

        System.out.println("Seal saved. name=" + seal + " business=" + business.getId());
        session.beginTransaction();

        final SealBusiness record = new SealBusiness();
        record.setBusiness(business);
        record.setSeal(seal);
        if (anchorText != null) {
            record.setSealAnchorText(anchorText);
        }
        if (url != null) {
            record.setSealUrl(url);
        }

        session.save(record);
        current_seals.add(seal);
        session.getTransaction().commit();
    }

    /**
     * Counts the {@code <form>} elements in a crawled HTML file.
     *
     * @param url unused
     * @param filepath path of the locally stored HTML file; it is read as UTF-8
     * @param business unused
     * @param session unused
     * @return the number of form elements, or 0 when the file could not be read
     */
    private static int getHtmlFormsCount(String url, String filepath, Business business, Session session) {
        try {
            File input = new File(filepath);
            Document doc = Jsoup.parse(input, "UTF-8");
            return doc.select("form").size();
        } catch (IOException ex) {
            // Log under this class and name the file so the failing page can be identified.
            Logger.getLogger(PageAnalyzer.class.getName()).log(Level.SEVERE,
                    "Could not parse " + filepath, ex);
            return 0;
        }
    }

    /**
     * Counts the {@code <script>} elements of a crawled HTML file that reference an external
     * source, i.e. carry a non-empty {@code src} attribute.
     *
     * <p>jsoup's {@code attr} returns an empty string rather than {@code null} for a missing
     * attribute, so emptiness is what is tested; a null test alone would count inline scripts
     * as well.
     *
     * @param url unused
     * @param filepath path of the locally stored HTML file; it is read as UTF-8
     * @param business unused
     * @param session unused
     * @return the number of script elements with a non-empty {@code src}, or 0 when the file
     *         could not be read
     */
    private static int getJsLibsCount(String url, String filepath, Business business, Session session) {
        try {
            File input = new File(filepath);
            Document doc = Jsoup.parse(input, "UTF-8");
            int count = 0;
            for (Element js_tag : doc.select("script")) {
                String src = js_tag.attr("src");
                if (src != null && src.length() > 0) {
                    count++;
                }
            }
            return count;
        } catch (IOException ex) {
            // Log under this class and name the file so the failing page can be identified.
            Logger.getLogger(PageAnalyzer.class.getName()).log(Level.SEVERE,
                    "Could not parse " + filepath, ex);
            return 0;
        }
    }

    /**
     * Stores the website type of a business: {@code "POSTER"} when the site has neither HTML
     * forms nor JavaScript libraries, {@code "OTHER"} otherwise.
     *
     * @param biz the business to update
     * @param html_forms_count number of HTML forms counted for the site
     * @param js_libs_count number of JavaScript libraries counted for the site
     * @param session the open Hibernate session used for the save
     */
    private static void updateWebsiteType(Business biz, int html_forms_count, int js_libs_count, Session session) {
        final boolean hasNoDynamicContent = html_forms_count == 0 && js_libs_count == 0;
        final String websiteType = hasNoDynamicContent ? "POSTER" : "OTHER";

        session.beginTransaction();
        biz.setWebsiteType(websiteType);
        session.save(biz);
        session.getTransaction().commit();
    }

    /** Input {@code type} values that are never stored as form fields. */
    static List<String> notAcceptedInputType = Arrays.asList("submit", "hidden", "reset", "button", "image");

    /**
     * Tells whether an input {@code type} value denotes an input that should be stored.
     *
     * <p>The comparison uses {@code equalsIgnoreCase}, which does not depend on the default
     * locale. Lower-casing with the default locale would, for instance, fail to recognise
     * {@code "SUBMIT"} under a Turkish locale.
     *
     * @param type the value of the input's {@code type} attribute, may be {@code null}
     * @return {@code false} when {@code type} is {@code null}, empty or matches an entry of
     *         {@code notAcceptedInputType} ignoring case; {@code true} otherwise
     */
    private static boolean isAcceptedInputType(String type) {
        if (type == null || type.length() == 0) {
            return false;
        }
        for (String rejected : notAcceptedInputType) {
            if (rejected.equalsIgnoreCase(type)) {
                return false;
            }
        }
        return true;
    }
}