Java tutorial: SimpleStatusTool, from the Bixo crawler examples. The tool walks a crawl output directory loop by loop, tallying fetch status values and summarizing the crawldb, or (with the export option) dumping the crawldb contents.
/*
 * Copyright 2009-2012 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package bixo.examples.crawl;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

import bixo.datum.UrlStatus;
import bixo.utils.CrawlDirUtils;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryIterator;

@SuppressWarnings("deprecation")
public class SimpleStatusTool {
    private static final Logger LOGGER = Logger.getLogger(SimpleStatusTool.class);

    private static void printUsageAndExit(CmdLineParser parser) {
        parser.printUsage(System.err);
        System.exit(-1);
    }

    // Tally UrlStatus values from the status subdirectory of one loop directory.
    private static void processStatus(JobConf conf, Path curDirPath) throws IOException {
        Path statusPath = new Path(curDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusTap = new Hfs(new TextLine(), statusPath.toUri().toString());

        TupleEntryIterator iter = statusTap.openForRead(conf);

        UrlStatus[] statusValues = UrlStatus.values();
        int[] statusCounts = new int[statusValues.length];
        int totalEntries = 0;
        while (iter.hasNext()) {
            TupleEntry entry = iter.next();
            totalEntries += 1;

            // Each status line is tab-separated, with fields in StatusDatum order:
            // URL, status, headers, exception, status time, host address.
            String statusLine = entry.getString("line");
            String[] pieces = statusLine.split("\t");
            UrlStatus status = UrlStatus.valueOf(pieces[1]);
            statusCounts[status.ordinal()] += 1;
        }

        for (int i = 0; i < statusCounts.length; i++) {
            if (statusCounts[i] != 0) {
                LOGGER.info(String.format("Status %s: %d", statusValues[i].toString(), statusCounts[i]));
            }
        }
        LOGGER.info("Total status: " + totalEntries);
        LOGGER.info("");
    }

    // Summarize (or, when exportDb is set, dump) the crawldb subdirectory of one loop directory.
    private static void processCrawlDb(JobConf conf, Path curDirPath, boolean exportDb) throws IOException {
        Path crawlDbPath = new Path(curDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toUri().toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int totalEntries = 0;
        int fetchedUrls = 0;
        int unfetchedUrls = 0;
        while (iter.hasNext()) {
            TupleEntry entry = iter.next();
            totalEntries += 1;

            CrawlDbDatum datum = new CrawlDbDatum(entry);
            if (exportDb) {
                LOGGER.info(datum.toString());
            }

            if (datum.getLastFetched() == 0) {
                unfetchedUrls += 1;
            } else {
                fetchedUrls += 1;
            }
        }

        if (!exportDb) {
            LOGGER.info(String.format("%d fetched URLs", fetchedUrls));
            LOGGER.info(String.format("%d unfetched URLs", unfetchedUrls));
            LOGGER.info("Total URLs: " + totalEntries);
            LOGGER.info("");
        }
    }

    public static void main(String[] args) {
        SimpleStatusToolOptions options = new SimpleStatusToolOptions();
        CmdLineParser parser = new CmdLineParser(options);

        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            printUsageAndExit(parser);
        }

        String crawlDirName = options.getCrawlDir();

        try {
            JobConf conf = new JobConf();
            Path crawlDirPath = new Path(crawlDirName);
            FileSystem fs = crawlDirPath.getFileSystem(conf);

            if (!fs.exists(crawlDirPath)) {
                System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
                System.exit(-1);
            }

            // Skip Hadoop/Cascading DEBUG messages.
            Logger.getRootLogger().setLevel(Level.INFO);

            boolean exportDb = options.isExportDb();
            if (exportDb) {
                // Export mode only looks at the most recent loop directory.
                Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
                processCrawlDb(conf, latestCrawlDirPath, exportDb);
            } else {
                int prevLoop = -1;
                Path curDirPath = null;
                while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                    String curDirName = curDirPath.toUri().toString();
                    LOGGER.info("");
                    LOGGER.info("================================================================");
                    LOGGER.info("Processing " + curDirName);
                    LOGGER.info("================================================================");

                    int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                    if (curLoop != prevLoop + 1) {
                        LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                    }
                    prevLoop = curLoop;

                    // Process the status and crawldb in curDirPath.
                    processStatus(conf, curDirPath);
                    processCrawlDb(conf, curDirPath, exportDb);
                }
            }
        } catch (Throwable t) {
            LOGGER.error("Exception running tool", t);
            System.exit(-1);
        }
    }
}
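The listing refers to a SimpleStatusToolOptions class that is not shown here. The tool only calls getCrawlDir() and isExportDb() on it, so an args4j options class along the following lines would be enough. This is a minimal sketch, not the original source: the flag names -crawldir and -exportdb and the usage strings are assumptions.

package bixo.examples.crawl;

import org.kohsuke.args4j.Option;

public class SimpleStatusToolOptions {

    private String _crawlDir;
    private boolean _exportDb = false;

    // Hypothetical flag name; check the real options class in the Bixo examples.
    @Option(name = "-crawldir", usage = "path to the crawl output directory", required = true)
    public void setCrawlDir(String crawlDir) {
        _crawlDir = crawlDir;
    }

    public String getCrawlDir() {
        return _crawlDir;
    }

    // Hypothetical flag name; when set, the tool dumps every crawldb entry
    // from the latest loop directory instead of printing summary counts.
    @Option(name = "-exportdb", usage = "export the crawldb contents", required = false)
    public void setExportDb(boolean exportDb) {
        _exportDb = exportDb;
    }

    public boolean isExportDb() {
        return _exportDb;
    }
}

With options like these, the tool would be invoked as, e.g., SimpleStatusTool -crawldir /path/to/crawl to print per-loop summaries, or with -exportdb added to dump the latest crawldb.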
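The tool also depends on CrawlDbDatum, which wraps one crawldb TupleEntry and exposes getLastFetched() (0 meaning never fetched). The sketch below shows only that shape; the field names and types are assumptions, and the real class in the Bixo examples extends Bixo's BaseDatum and carries more fields than this.

package bixo.examples.crawl;

import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;

// Simplified stand-in for the real CrawlDbDatum; field layout is assumed.
public class CrawlDbDatum {

    // Field layout used when reading/writing the crawldb SequenceFile.
    public static final Fields FIELDS = new Fields("url", "lastFetched", "lastStatus");

    private final String _url;
    private final long _lastFetched;
    private final String _lastStatus;

    public CrawlDbDatum(TupleEntry entry) {
        _url = entry.getString("url");
        _lastFetched = entry.getLong("lastFetched");
        _lastStatus = entry.getString("lastStatus");
    }

    // 0 means the URL has never been fetched, which is how processCrawlDb()
    // separates fetched from unfetched URLs.
    public long getLastFetched() {
        return _lastFetched;
    }

    @Override
    public String toString() {
        return _url + "\t" + _lastFetched + "\t" + _lastStatus;
    }
}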