com.soulgalore.crawler.run.CrawlAndVerifyAssetsToCsv.java Source code

Java tutorial

Introduction

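Here is the source code for com.soulgalore.crawler.run.CrawlAndVerifyAssetsToCsv.java, a command-line runner that crawls a site, verifies the assets of the crawled pages and writes the non-working ones to a CSV file.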

Source

/******************************************************
 * Web crawler
 * 
 * 
 * Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com)
 * 
 ****************************************************** 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 * 
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 * 
 ******************************************************* 
 */
package com.soulgalore.crawler.run;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.commons.cli.ParseException;

import com.google.inject.Guice;
import com.google.inject.Injector;
import com.soulgalore.crawler.core.Crawler;
import com.soulgalore.crawler.core.CrawlerResult;
import com.soulgalore.crawler.core.assets.AssetResponse;
import com.soulgalore.crawler.core.assets.AssetsVerificationResult;
import com.soulgalore.crawler.core.assets.AssetsVerifier;
import com.soulgalore.crawler.guice.CrawlModule;
import com.soulgalore.crawler.util.StatusCode;

/**
 * Command-line runner that crawls a site, verifies the assets of the crawled pages and writes
 * every non-working asset to a CSV file.
 *
 * <p>
 * The output starts with the header {@code URL,parent,error} followed by one row per non-working
 * asset. The target file is read from the {@code filename} option and falls back to
 * {@link #DEFAULT_FILENAME} when the option is absent.
 */
public class CrawlAndVerifyAssetsToCsv extends AbstractCrawl {

    /**
     * The default file name of the result.
     */
    public static final String DEFAULT_FILENAME = "errorassets.csv";

    private final String fileName;

    /**
     * Create the runner from the command-line arguments.
     *
     * @param args the command-line arguments, handed on to the {@code AbstractCrawl} constructor
     * @throws ParseException if the arguments cannot be parsed
     */
    CrawlAndVerifyAssetsToCsv(String[] args) throws ParseException {
        super(args);
        fileName = getLine().getOptionValue("filename", DEFAULT_FILENAME);
    }

    /**
     * Crawl, verify the assets and write the non-working ones to the CSV file. Argument problems
     * are reported on standard error, one message per line.
     *
     * @param args the command-line arguments
     */
    public static void main(String[] args) {

        try {
            final CrawlAndVerifyAssetsToCsv crawl = new CrawlAndVerifyAssetsToCsv(args);
            crawl.crawl();

        } catch (ParseException e) {
            // println (not print) so the message is terminated like the one below
            System.err.println(e.getMessage());
        } catch (IllegalArgumentException e) {
            System.err.println(e.getMessage());
        }

    }

    /**
     * Run the crawl, verify the assets of the crawled pages and store the non-working assets in
     * {@link #fileName}. The crawler and the verifier are always shut down, even when the crawl,
     * the verification or the write fails.
     */
    private void crawl() {
        final Injector injector = Guice.createInjector(new CrawlModule());
        final Crawler crawler = injector.getInstance(Crawler.class);
        // Created only after the crawl has finished, so it may still be null in the finally block.
        AssetsVerifier verifier = null;

        try {
            final CrawlerResult result = crawler.getUrls(getConfiguration());
            System.out.println("Crawled  " + result.getVerifiedURLResponses().size() + " pages");

            System.out.println("Start verify assets ...");
            verifier = injector.getInstance(AssetsVerifier.class);
            final AssetsVerificationResult assetsResult =
                    verifier.verify(result.getVerifiedURLResponses(), getConfiguration());

            System.out.println(assetsResult.getWorkingAssets().size() + " assets is ok, "
                    + assetsResult.getNonWorkingAssets().size() + " is not");

            writeToFile(toCsv(assetsResult));
        } finally {
            // Nested finally: a failing crawler shutdown must not skip the verifier shutdown.
            try {
                crawler.shutdown();
            } finally {
                if (verifier != null) {
                    verifier.shutdown();
                }
            }
        }
    }

    /**
     * Render the non-working assets as CSV text: a header line followed by one
     * {@code URL,parent,error} row per asset.
     *
     * @param assetsResult the verification result holding the non-working assets
     * @return the complete CSV content, every line terminated by {@code \n}
     */
    private static String toCsv(AssetsVerificationResult assetsResult) {
        final StringBuilder builder = new StringBuilder();
        builder.append("URL,parent,error\n");

        for (AssetResponse resp : assetsResult.getNonWorkingAssets()) {
            builder.append(escapeCsv(resp.getUrl())).append(',')
                    .append(escapeCsv(resp.getReferer())).append(',')
                    .append(escapeCsv(StatusCode.toFriendlyName(resp.getResponseCode())))
                    .append('\n');
        }
        return builder.toString();
    }

    /**
     * Make a value safe to use as a single CSV field. A value containing a comma, a double quote
     * or a line break is wrapped in double quotes with embedded double quotes doubled; any other
     * value is returned as its plain string form.
     *
     * @param value the raw field value; {@code null} is rendered as the text {@code null}, which
     *        is what appending it to a {@link StringBuilder} would produce
     * @return the field text, quoted when needed
     */
    private static String escapeCsv(Object value) {
        final String text = String.valueOf(value);
        final boolean needsQuoting = text.indexOf(',') >= 0 || text.indexOf('"') >= 0
                || text.indexOf('\n') >= 0 || text.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return text;
        }
        return "\"" + text.replace("\"", "\"\"") + "\"";
    }

    /**
     * Write the content to {@link #fileName} as UTF-8. I/O failures are reported on standard
     * error and do not abort the run.
     *
     * @param content the text to store
     */
    private void writeToFile(String content) {
        Writer out = null;
        try {
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"));
            out.write(content);
        } catch (IOException e) {
            System.err.println(e);
        } finally {
            if (out != null) {
                try {
                    // close() flushes the buffer, so a failure here means the file may be incomplete
                    out.close();
                } catch (IOException e) {
                    System.err.println(e);
                }
            }
        }
    }
}