jp.igapyon.selecrawler.SeleCrawlerWebContentGetter.java Source code

Introduction

Here is the source code for jp.igapyon.selecrawler.SeleCrawlerWebContentGetter.java
Source

/*
 *  selecrawler
 *  Copyright (C) 2017  Toshiki Iga
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
/*
 *  Copyright 2017 Toshiki Iga
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package jp.igapyon.selecrawler;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.net.URLCodec;
import org.apache.commons.io.FileUtils;

import jp.igapyon.selecrawler.util.SimpleChromeWrapper;

/**
 * Fetches the pages named in the URL list file with Chrome and stores, for
 * every URL, the page source, the browser log entries and a small meta file
 * below the configured target directory.
 *
 * <p>
 * The meta file is written last for each URL and doubles as the "already
 * fetched" marker: a URL whose meta file exists is skipped on later runs.
 * </p>
 */
public class SeleCrawlerWebContentGetter {
    /** Number of fetches after which the Chrome instance is closed and reopened. */
    private static final int FETCHES_PER_CHROME_INSTANCE = 10;

    /** Pause in milliseconds applied when the current URL matches a wait regex. */
    private static final long WAIT_REGEX_SLEEP_MILLIS = 10000L;

    /** Pause in milliseconds after every fetched URL. */
    private static final long FETCH_INTERVAL_MILLIS = 1000L;

    /** Settings of the current run; assigned by {@link #process(SeleCrawlerSettings)}. */
    protected SeleCrawlerSettings settings = null;

    /**
     * Entry point: remembers the settings and fetches the URL list once per
     * enabled device ("mac" and/or "iphone").
     *
     * @param settings
     *            crawler settings to use for this run.
     * @throws IOException
     *             if the URL list cannot be read, an output file cannot be
     *             written, or the crawl is interrupted.
     */
    public void process(final SeleCrawlerSettings settings) throws IOException {
        this.settings = settings;
        System.err.println("[jp.igapyon.selecrawler] Fetching web contents using Chrome.");

        // process for each device.
        if (settings.isProcessMac()) {
            processDevice("mac");
        }
        if (settings.isProcessIphone()) {
            processDevice("iphone");
        }
    }

    /**
     * Fetches every URL of the URL list file with a Chrome instance started
     * for the given device name and writes the results to disk.
     *
     * <p>
     * Reads {@link #settings}, so {@link #process(SeleCrawlerSettings)} (or a
     * subclass) must have assigned it before this method is called.
     * </p>
     *
     * @param deviceName
     *            device name handed to the Chrome wrapper and used as the
     *            first directory level of the output files.
     * @throws IOException
     *             if the URL list cannot be read, a URL is malformed, an
     *             output file cannot be written, or the thread is interrupted
     *             while pausing (reported as
     *             {@link java.io.InterruptedIOException}).
     */
    public void processDevice(final String deviceName) throws IOException {
        System.err.println("[selecrawler] Launch Chrome. UA:" + deviceName);
        final SimpleChromeWrapper chrome = new SimpleChromeWrapper(settings.getPathChromeDriver(), deviceName,
                settings.getPathUserDataDir());

        // Tracks whether close() is still owed, so that the finally block
        // neither leaks the browser nor closes it twice after a failed refresh.
        boolean chromeOpened = false;
        try {
            chrome.open();
            chromeOpened = true;

            int getcounter = 0;

            System.err.println(
                    "[selecrawler] Load url list file: " + new File(settings.getPathUrllisttTxt()).getCanonicalPath());
            System.err.println("[selecrawler] Target dir: " + new File(settings.getPathTargetDir()).getCanonicalPath());

            final List<String> urls = FileUtils.readLines(new File(settings.getPathUrllisttTxt()), "UTF-8");

            // Compile the wait patterns once instead of once per fetched URL.
            final List<Pattern> waitPatterns = new ArrayList<Pattern>();
            for (String regex : settings.getUrllistWaitRegex()) {
                waitPatterns.add(Pattern.compile(regex));
            }

            for (String urlLookup : urls) {
                if (urlLookup.trim().isEmpty()) {
                    // A blank line is not a URL; skipping it avoids aborting
                    // the whole crawl with a MalformedURLException.
                    continue;
                }

                if (getcounter >= FETCHES_PER_CHROME_INSTANCE) {
                    // refresh chrome instance
                    getcounter = 0;
                    chromeOpened = false;
                    chrome.close();
                    chrome.open();
                    chromeOpened = true;
                }

                final File outputFile = getFileHtml(deviceName, urlLookup);
                if (outputFile.getParentFile().exists() == false) {
                    outputFile.getParentFile().mkdirs();
                }

                final File outputMetaFile = new File(outputFile.getParentFile(),
                        outputFile.getName() + SeleCrawlerConstants.EXT_SC_URL);

                final File outputLogFile = new File(outputFile.getParentFile(),
                        outputFile.getName() + SeleCrawlerConstants.EXT_SC_LOG);

                if (outputMetaFile.exists()) {
                    if (settings.isDebug()) {
                        System.err.println("[selecrawler] skip(cache): " + urlLookup);
                    }
                    continue;
                }

                System.err.println("[selecrawler] fetch web: " + urlLookup);

                chrome.getDriver().get(urlLookup);

                // check wait settings: pause once for every wait regex that
                // is found in the URL the browser actually ended up on.
                final String urlActual = chrome.getDriver().getCurrentUrl();
                for (Pattern pat : waitPatterns) {
                    final Matcher mat = pat.matcher(urlActual);
                    if (mat.find()) {
                        System.out.println("waiting browser operation");
                        pause(WAIT_REGEX_SLEEP_MILLIS);
                    }
                }

                final String contents = chrome.getDriver().getPageSource();
                FileUtils.writeStringToFile(outputFile, contents, "UTF-8");

                FileUtils.writeLines(outputLogFile, "UTF-8", chrome.getLogEntries());

                // write meta finally: line 1 is the requested URL, line 2 the
                // URL the browser reports after loading. Its existence marks
                // the URL as done, so it must come after the other files.
                final List<String> metaUrlList = new ArrayList<String>();
                metaUrlList.add(urlLookup);
                metaUrlList.add(chrome.getDriver().getCurrentUrl());
                FileUtils.writeLines(outputMetaFile, "UTF-8", metaUrlList);

                getcounter++;

                pause(FETCH_INTERVAL_MILLIS);
            }
        } finally {
            if (chromeOpened) {
                chrome.close();
            }
        }
    }

    /**
     * Maps a URL to the file its page source is stored in:
     * {@code <targetDir><deviceName>/<host><path>[<url-encoded "?query">]}.
     * An empty path or a path ending in "/" gets {@code index.html} appended.
     *
     * <p>
     * NOTE(review): the target dir is concatenated directly with the device
     * name, so it presumably has to end with a path separator - confirm
     * against the settings defaults.
     * </p>
     *
     * @param deviceName
     *            device name used as directory level below the target dir.
     * @param urlLookup
     *            URL to map.
     * @return the output file for the page source of the URL.
     * @throws IOException
     *             if the URL is malformed or its query cannot be encoded.
     */
    public File getFileHtml(final String deviceName, final String urlLookup) throws IOException {
        final URL url = new URL(urlLookup);
        final String serverhostname = url.getHost();
        String path = url.getPath();
        if (path.length() == 0) {
            path = "/index.html";
        } else if (path.endsWith("/")) {
            path = path + "index.html";
        }

        if (url.getQuery() != null) {
            try {
                path += new URLCodec().encode("?" + url.getQuery());
            } catch (EncoderException e) {
                // Dropping the query silently would map different URLs to the
                // same file, so report the failure instead.
                throw new IOException("Failed to encode query of URL: " + urlLookup, e);
            }
        }

        return new File(settings.getPathTargetDir() + deviceName + "/" + serverhostname + path);
    }

    /**
     * Sleeps for the given time. On interruption the interrupt flag is
     * restored and the crawl is aborted with an
     * {@link java.io.InterruptedIOException}.
     *
     * @param millis
     *            time to sleep in milliseconds.
     * @throws IOException
     *             if the thread is interrupted while sleeping.
     */
    private static void pause(final long millis) throws IOException {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            final IOException interrupted = new java.io.InterruptedIOException(
                    "Interrupted while waiting between browser operations");
            interrupted.initCause(e);
            throw interrupted;
        }
    }
}