bixo.examples.webmining.WebMiningWorkflow.java Source code

Introduction

Here is the source code for bixo.examples.webmining.WebMiningWorkflow.java
Source

/*
 * Copyright 2009-2012 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.examples.webmining;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

import bixo.config.FetcherPolicy;
import bixo.config.ParserPolicy;
import bixo.config.UserAgent;
import bixo.datum.FetchedDatum;
import bixo.datum.StatusDatum;
import bixo.datum.UrlDatum;
import bixo.datum.UrlStatus;
import bixo.fetcher.SimpleHttpFetcher;
import bixo.operations.BaseScoreGenerator;
import bixo.parser.SimpleParser;
import bixo.pipes.FetchPipe;
import bixo.pipes.ParsePipe;
import bixo.urls.SimpleUrlNormalizer;
import bixo.utils.IoUtils;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.operation.filter.Limit;
import cascading.operation.filter.Limit.Context;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.cogroup.OuterJoin;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextDelimited;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;

import com.bixolabs.cascading.BaseSplitter;
import com.bixolabs.cascading.HadoopUtils;
import com.bixolabs.cascading.SplitterAssembly;
import com.bixolabs.cascading.TupleLogger;

@SuppressWarnings("deprecation")
public class WebMiningWorkflow {

    // Max URLs to fetch in local vs. distributed mode.
    // createWebMiningWorkflow() picks one of these based on HadoopUtils.isJobLocal(conf)
    // and passes it to CreateUrlDatumFromCrawlDbDatum as the overall limit, which that
    // operation then divides across the tasks of the running job.
    private static final long MAX_LOCAL_FETCH = 5;
    private static final long MAX_DISTRIBUTED_FETCH = 100;

    /**
     * Splitter used on the crawl db: entries whose last status says they still
     * need to be fetched go to the LHS, every other entry goes to the RHS.
     */
    @SuppressWarnings("serial")
    private static class SplitFetchedUnfetchedSSCrawlDatums extends BaseSplitter {

        @Override
        public String getLHSName() {
            return "unfetched crawl db datums";
        }

        /**
         * Decides which side of the split a crawl db tuple belongs to.
         *
         * @param tupleEntry a tuple that can be wrapped in a {@link CrawlDbDatum}
         * @return true (LHS, i.e. unfetched) when the datum's last status is
         *         UNFETCHED, one of the SKIPPED_* statuses listed below,
         *         ABORTED_SLOW_RESPONSE or ERROR_IOEXCEPTION; false otherwise
         */
        @Override
        public boolean isLHS(TupleEntry tupleEntry) {
            final UrlStatus lastStatus = new CrawlDbDatum(tupleEntry).getLastStatus();

            // Never attempted.
            final boolean neverFetched = lastStatus == UrlStatus.UNFETCHED;

            // Skipped during a previous loop for one reason or another.
            final boolean skipped = lastStatus == UrlStatus.SKIPPED_DEFERRED
                    || lastStatus == UrlStatus.SKIPPED_BY_SCORER
                    || lastStatus == UrlStatus.SKIPPED_BY_SCORE
                    || lastStatus == UrlStatus.SKIPPED_TIME_LIMIT
                    || lastStatus == UrlStatus.SKIPPED_INTERRUPTED
                    || lastStatus == UrlStatus.SKIPPED_INEFFICIENT;

            // Attempted, but the fetch was aborted or hit an I/O error.
            final boolean abortedOrIoError = lastStatus == UrlStatus.ABORTED_SLOW_RESPONSE
                    || lastStatus == UrlStatus.ERROR_IOEXCEPTION;

            return neverFetched || skipped || abortedOrIoError;
        }

    }

    /**
     * Cascading function that converts each incoming {@link CrawlDbDatum} tuple
     * into a {@link UrlDatum}, carrying the crawl db scores and status along as
     * payload values. It also records, per tuple, the result of
     * {@code Limit.Context.increment()} under {@code CustomFields.SKIP_BY_LIMIT_FN}
     * (presumably "true" once this task's share of the limit is used up - confirm
     * against the Cascading Limit implementation).
     */
    @SuppressWarnings("serial")
    private static class CreateUrlDatumFromCrawlDbDatum extends BaseOperation<Limit.Context>
            implements Function<Limit.Context> {

        // Overall limit for the whole job; each task gets a slice of it in prepare().
        private long _limit;

        /**
         * @param limit total limit to be shared by all tasks running this operation
         */
        public CreateUrlDatumFromCrawlDbDatum(long limit) {
            super(UrlDatum.FIELDS);
            this._limit = limit;
        }

        /**
         * Creates the per-task context and computes this task's share of the limit.
         * The flow process is cast to {@link HadoopFlowProcess}, so this operation
         * only works when run by the Hadoop planner.
         */
        @Override
        public void prepare(FlowProcess flowProcess, OperationCall<Limit.Context> operationCall) {
            super.prepare(flowProcess, operationCall);

            Limit.Context limitContext = new Limit.Context();
            operationCall.setContext(limitContext);

            HadoopFlowProcess hadoopProcess = (HadoopFlowProcess) flowProcess;

            // Number of sibling tasks on whichever side (map or reduce) we are running.
            int taskCount = hadoopProcess.isMapper()
                    ? hadoopProcess.getCurrentNumMappers()
                    : hadoopProcess.getCurrentNumReducers();
            int taskIndex = hadoopProcess.getCurrentTaskNum();

            // Every task gets the integer share of the limit...
            limitContext.limit = (long) Math.floor((double) _limit / (double) taskCount);

            // ...and the first (limit % taskCount) tasks take one extra, so the
            // per-task limits add up to the overall limit.
            long leftover = _limit % taskCount;
            if (taskIndex < leftover) {
                limitContext.limit += 1;
            }
        }

        /**
         * Emits one {@link UrlDatum} tuple for the given crawl db tuple.
         */
        @Override
        public void operate(FlowProcess flowProcess, FunctionCall<Limit.Context> funcCall) {
            CrawlDbDatum crawlDbDatum = new CrawlDbDatum(funcCall.getArguments());
            UrlDatum result = new UrlDatum(crawlDbDatum.getUrl());

            // Preserve the crawl db state as payload so it survives the fetch.
            result.setPayloadValue(CustomFields.PAGE_SCORE_FN, crawlDbDatum.getPageScore());
            result.setPayloadValue(CustomFields.LINKS_SCORE_FN, crawlDbDatum.getLinksScore());
            result.setPayloadValue(CustomFields.STATUS_FN, crawlDbDatum.getLastStatus().toString());

            // increment() mutates the shared per-task counter, so it is called exactly once per tuple.
            result.setPayloadValue(CustomFields.SKIP_BY_LIMIT_FN, funcCall.getContext().increment());

            funcCall.getOutputCollector().add(result.getTuple());
        }
    }

    /**
     * Creates the initial crawl db from a list of seed URLs.
     *
     * The seed list is loaded as a classpath resource (resolved relative to
     * {@code WebMiningWorkflow.class}). Each line is trimmed; blank lines and lines
     * starting with {@code #} are ignored. Every remaining line is normalized and
     * written to {@code crawlDbPath} as a {@link CrawlDbDatum} with status
     * {@code UrlStatus.UNFETCHED} (the remaining constructor arguments are 0 / 0.0f -
     * presumably last-fetched time, page score and links score; confirm against
     * CrawlDbDatum).
     *
     * If an {@link IOException} occurs, the partially written crawl db is removed
     * before the exception is rethrown.
     *
     * @param crawlDbPath location of the crawl db to (re)create
     * @param fileName    name of the classpath resource holding the seed URLs
     * @throws FileNotFoundException if the resource cannot be found
     * @throws IOException           if reading the seeds or writing the crawl db fails
     * @throws InterruptedException  declared for callers; not thrown by the visible code
     */
    public static void importSeedUrls(Path crawlDbPath, String fileName) throws IOException, InterruptedException {

        SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
        JobConf defaultJobConf = HadoopUtils.getDefaultJobConf();

        InputStream is = null;
        TupleEntryCollector writer = null;
        try {
            Tap urlSink = new Hfs(new TextLine(), crawlDbPath.toString(), true);
            writer = urlSink.openForWrite(defaultJobConf);

            is = WebMiningWorkflow.class.getResourceAsStream(fileName);
            if (is == null) {
                throw new FileNotFoundException("The seed urls file doesn't exist");
            }

            // The seed file ships inside the jar, so decode it with a fixed charset
            // instead of whatever the local platform default happens to be.
            List<String> lines = IOUtils.readLines(is, "UTF-8");
            for (String line : lines) {
                line = line.trim();

                // Skip blank lines as well as comments; an empty string is not a URL.
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }

                CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f,
                        0.0f);
                writer.add(datum.getTuple());
            }

            // Close exactly once: clear the reference first so neither the catch
            // block nor the finally block closes the collector a second time.
            TupleEntryCollector finished = writer;
            writer = null;
            finished.close();
        } catch (IOException e) {
            // Close the collector BEFORE removing the output; otherwise the close in
            // finally could flush/commit data back into the path we just removed.
            if (writer != null) {
                TupleEntryCollector failed = writer;
                writer = null;
                try {
                    failed.close();
                } catch (RuntimeException ignored) {
                    // Best effort - the original IOException is the one worth reporting.
                }
            }

            HadoopUtils.safeRemove(crawlDbPath.getFileSystem(defaultJobConf), crawlDbPath);
            throw e;
        } finally {
            IoUtils.safeClose(is);

            // Only still non-null when a non-IOException escaped the try block.
            if (writer != null) {
                writer.close();
            }
        }

    }

    /**
     * Builds the Cascading flow for one loop of the web mining crawl.
     *
     * The flow reads the crawl db at {@code crawlDbPath}, splits it into entries that
     * still need fetching and entries that are finished, fetches and parses the former,
     * runs {@code AnalyzeHtml} on the parsed pages, and writes four outputs under
     * {@code curLoopDirPath}: the updated crawl db, fetch status, raw fetched content
     * and analysis results (sub-directory names come from {@code CrawlConfig}).
     *
     * @param crawlDbPath    existing crawl db to read
     * @param curLoopDirPath directory for this loop's outputs
     * @param fetcherPolicy  policy handed to the {@link SimpleHttpFetcher}
     * @param userAgent      user agent handed to the {@link SimpleHttpFetcher}
     * @param options        not referenced by this method body
     * @param resetSolr      not referenced by this method body
     * @return the connected (not yet run) flow
     * @throws IOException          if the file system for {@code crawlDbPath} cannot be accessed
     * @throws InterruptedException declared for callers; not thrown by the visible code
     * @throws RuntimeException     if {@code crawlDbPath} does not exist
     */
    public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
            UserAgent userAgent, WebMiningOptions options, boolean resetSolr)
            throws IOException, InterruptedException {

        // Fetch at most MAX_LOCAL_FETCH / MAX_DISTRIBUTED_FETCH pages (depending on
        // whether the job runs locally), using the supplied fetcher policy.

        // We want to extract the cleaned up HTML, and pass that to the analyzer
        // (AnalyzeHtml). From this we'll get outlinks, page score, and any results.

        JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
        boolean isLocal = HadoopUtils.isJobLocal(conf);
        int numReducers = 1; // we always want to use a single reducer, to avoid contention
        conf.setNumReduceTasks(numReducers);
        conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
        Properties props = HadoopUtils.getDefaultProperties(WebMiningWorkflow.class, false, conf);
        FileSystem fs = crawlDbPath.getFileSystem(conf);

        // Input : the crawldb
        if (!fs.exists(crawlDbPath)) {
            throw new RuntimeException("CrawlDb not found");
        }

        Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
                crawlDbPath.toString());
        Pipe importPipe = new Pipe("import pipe");

        // Split into tuples that are to be fetched and that have already been fetched
        SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

        Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
        Pipe urlsToFetchPipe = splitter.getLHSPipe();

        // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
        // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
        // from high to low by links score.
        // TODO add unit test
        urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
        // Reuse the local/cluster decision made above rather than querying the conf again.
        long maxToFetch = isLocal ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
        urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

        BaseScoreGenerator scorer = new LinkScoreGenerator();

        // Create the sub-assembly that runs the fetch job
        int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
        fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
        fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
        fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

        FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);

        // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

        // The analyzer hangs directly off the parse tail pipe. (A separately created
        // "analyzer pipe" used to be assigned here and immediately overwritten, so it
        // never had any effect; the dead assignment has been dropped.)
        Pipe analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

        Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
        outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

        // Group the finished datums, the skipped datums, status, outlinks
        Pipe updatePipe = new CoGroup("update pipe",
                Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
                Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                        new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
                null, new OuterJoin());
        updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

        // output : loop dir specific crawldb
        Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());
        // Status
        Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
        // Content
        Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
        Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

        // PageResults
        Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
        Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

        // Create the output map that connects each tail pipe to the appropriate sink.
        Map<String, Tap> sinkMap = new HashMap<String, Tap>();
        sinkMap.put(updatePipe.getName(), crawlDbSink);
        sinkMap.put(statusPipe.getName(), statusSink);
        sinkMap.put(contentPipe.getName(), contentSink);
        sinkMap.put(resultsPipe.getName(), resultsSink);

        FlowConnector flowConnector = new FlowConnector(props);
        Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

        return flow;
    }

}