Java tutorial
/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.examples.webmining;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;

import bixo.config.BixoPlatform;
import bixo.config.FetcherPolicy;
import bixo.config.ParserPolicy;
import bixo.config.UserAgent;
import bixo.datum.FetchedDatum;
import bixo.datum.StatusDatum;
import bixo.datum.UrlDatum;
import bixo.datum.UrlStatus;
import bixo.fetcher.SimpleHttpFetcher;
import bixo.operations.BaseScoreGenerator;
import bixo.parser.SimpleParser;
import bixo.pipes.FetchPipe;
import bixo.pipes.ParsePipe;
import bixo.urls.SimpleUrlNormalizer;
import bixo.utils.IoUtils;

import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.operation.filter.Limit;
import cascading.operation.filter.Limit.Context;
import cascading.operation.regex.RegexParser;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.joiner.OuterJoin;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;

import com.scaleunlimited.cascading.BasePath;
import com.scaleunlimited.cascading.BasePlatform;
import com.scaleunlimited.cascading.BaseSplitter;
import com.scaleunlimited.cascading.SplitterAssembly;
import com.scaleunlimited.cascading.TupleLogger;
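/**
 * Builds the demo web-mining workflow: a seed list of URLs is imported into a crawl DB,
 * and a Cascading flow then fetches a bounded number of pages, parses them with Tika,
 * analyzes the resulting HTML, and writes an updated crawl DB plus status, content,
 * and page-result outputs into the current loop directory.
 */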
public class DemoWebMiningWorkflow {

    // Max URLs to fetch in local vs. distributed mode.
    private static final long MAX_LOCAL_FETCH = 5;
    private static final long MAX_DISTRIBUTED_FETCH = 100;

    @SuppressWarnings("serial")
    private static class SplitFetchedUnfetchedSSCrawlDatums extends BaseSplitter {

        @Override
        public String getLHSName() {
            return "unfetched crawl db datums";
        }

        @Override
        // LHS represents unfetched tuples
        public boolean isLHS(TupleEntry tupleEntry) {
            CrawlDbDatum datum = new CrawlDbDatum(tupleEntry);
            UrlStatus status = datum.getLastStatus();
            if (status == UrlStatus.UNFETCHED
                || status == UrlStatus.SKIPPED_DEFERRED
                || status == UrlStatus.SKIPPED_BY_SCORER
                || status == UrlStatus.SKIPPED_BY_SCORE
                || status == UrlStatus.SKIPPED_TIME_LIMIT
                || status == UrlStatus.SKIPPED_INTERRUPTED
                || status == UrlStatus.SKIPPED_INEFFICIENT
                || status == UrlStatus.ABORTED_SLOW_RESPONSE
                || status == UrlStatus.ERROR_IOEXCEPTION) {
                return true;
            }

            return false;
        }
    }

    @SuppressWarnings({ "serial", "rawtypes" })
    private static class CreateUrlDatumFromCrawlDbDatum extends BaseOperation<Limit.Context> implements Function<Limit.Context> {

        private long _limit = 0;

        public CreateUrlDatumFromCrawlDbDatum(long limit) {
            super(UrlDatum.FIELDS);
            _limit = limit;
        }

        @Override
        public void prepare(FlowProcess flowProcess, OperationCall<Limit.Context> operationCall) {
            super.prepare(flowProcess, operationCall);

            Context context = new Context();
            operationCall.setContext(context);

            int numTasks = flowProcess.getNumProcessSlices();
            int taskNum = flowProcess.getCurrentSliceNum();

            context.limit = (long) Math.floor((double) _limit / (double) numTasks);

            long remainingLimit = _limit % numTasks;
            // evenly divide limits across tasks
            context.limit += taskNum < remainingLimit ? 1 : 0;
        }

        @Override
        public void operate(FlowProcess flowProcess, FunctionCall<Limit.Context> funcCall) {
            CrawlDbDatum datum = new CrawlDbDatum(funcCall.getArguments());
            UrlDatum urlDatum = new UrlDatum(datum.getUrl());
            urlDatum.setPayloadValue(CustomFields.PAGE_SCORE_FN, datum.getPageScore());
            urlDatum.setPayloadValue(CustomFields.LINKS_SCORE_FN, datum.getLinksScore());
            urlDatum.setPayloadValue(CustomFields.STATUS_FN, datum.getLastStatus().toString());
            urlDatum.setPayloadValue(CustomFields.SKIP_BY_LIMIT_FN, funcCall.getContext().increment());

            funcCall.getOutputCollector().add(urlDatum.getTuple());
        }
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static void importSeedUrls(BasePlatform platform, BasePath crawlDbPath, String fileName) throws Exception {

        SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
        InputStream is = null;
        TupleEntryCollector writer = null;
        try {
            Tap urlSink = platform.makeTap(platform.makeTextScheme(), crawlDbPath, SinkMode.REPLACE);
            writer = urlSink.openForWrite(platform.makeFlowProcess());

            is = DemoWebMiningWorkflow.class.getResourceAsStream(fileName);
            if (is == null) {
                throw new FileNotFoundException("The seed urls file doesn't exist");
            }

            List<String> lines = IOUtils.readLines(is);
            for (String line : lines) {
                line = line.trim();
                if (line.startsWith("#")) {
                    continue;
                }

                CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
                writer.add(datum.getTuple());
            }
        } catch (IOException e) {
            crawlDbPath.delete(true);
            throw e;
        } finally {
            IoUtils.safeClose(is);
            if (writer != null) {
                writer.close();
            }
        }
    }
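    // importSeedUrls() expects a plain-text classpath resource with one URL per line.
    // Each line is trimmed, lines starting with '#' are skipped, and every URL is run
    // through SimpleUrlNormalizer before being written as an UNFETCHED CrawlDbDatum.
    // A seed file might look like this (the file name and URLs are illustrative only):
    //
    //   # seed-urls.txt
    //   http://www.scaleunlimited.com/
    //   http://www.example.com/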
    @SuppressWarnings("rawtypes")
    public static Flow createWebMiningWorkflow(BixoPlatform platform, BasePath crawlDbPath, BasePath curLoopDirPath,
            FetcherPolicy fetcherPolicy, UserAgent userAgent, DemoWebMiningOptions options) throws Exception {

        // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
        // HTML only.

        // We want to extract the cleaned up HTML, and pass that to the parser, which will
        // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
        // any results.

        boolean isLocal = platform.isLocal();

        platform.resetNumReduceTasks();
        platform.setProperty("mapred.min.split.size", 64 * 1024 * 1024);

        // Input : the crawldb
        platform.assertPathExists(crawlDbPath, "CrawlDb");

        // TODO VMa - figure out types
        // Tap inputSource = platform.makeTap(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath);
        Tap inputSource = platform.makeTap(platform.makeTextScheme(), crawlDbPath);
        Pipe importPipe = new Pipe("import pipe");

        // Apply a regex to extract the relevant fields
        RegexParser crawlDbParser = new RegexParser(CrawlDbDatum.FIELDS, "^(.*?)\t(.*?)\t(.*?)\t(.*?)\t(.*)");
        importPipe = new Each(importPipe, new Fields("line"), crawlDbParser);

        // Split into tuples that are to be fetched and that have already been fetched
        SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

        Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
        Pipe urlsToFetchPipe = splitter.getLHSPipe();

        // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
        // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
        // from high to low by links score.
        // TODO add unit test
        urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
        long maxToFetch = isLocal ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
        urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

        BaseScoreGenerator scorer = new LinkScoreGenerator();

        // Create the sub-assembly that runs the fetch job
        int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
        fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
        fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
        fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

        FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, platform.getNumReduceTasks());
        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);
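        // From here the assembly fans out: fetched content is parsed, the parsed text is
        // analyzed by AnalyzeHtml, and the analyzer output feeds both an outlinks pipe and
        // a results pipe. Everything is then CoGrouped with the already-finished datums and
        // the fetch status to rebuild the crawl DB for the next loop.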
        // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

        Pipe analyzerPipe = new Pipe("analyzer pipe");
        analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

        Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
        outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

        // Group the finished datums, the skipped datums, status, outlinks
        Pipe updatePipe = new CoGroup("update pipe",
                Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
                Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                        new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
                null, new OuterJoin());
        updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

        // output : loop dir specific crawldb
        BasePath outCrawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawlDbSink = platform.makeTap(platform.makeTextScheme(), outCrawlDbPath, SinkMode.REPLACE);
        // Status,
        BasePath statusDirPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath);
        // Content
        BasePath contentDirPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentDirPath);
        // PageResults
        BasePath resultsDirPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
        Tap resultsSink = platform.makeTap(platform.makeTextScheme(), resultsDirPath);

        // Create the output map that connects each tail pipe to the appropriate sink.
        Map<String, Tap> sinkMap = new HashMap<String, Tap>();
        sinkMap.put(updatePipe.getName(), crawlDbSink);
        sinkMap.put(statusPipe.getName(), statusSink);
        sinkMap.put(contentPipe.getName(), contentSink);
        sinkMap.put(resultsPipe.getName(), resultsSink);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

        return flow;
    }
}
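For context, here is a sketch of how a driver might call the two public methods above for a single crawl iteration. It is not part of the original example: it assumes the same package and imports as the listing, that the BixoPlatform instance and DemoWebMiningOptions are configured elsewhere, and it only uses calls that already appear above plus the default FetcherPolicy and UserAgent constructors and Flow.complete(). The resource name, loop directory name, and user-agent values are placeholders; the full example's driver also handles option parsing and multiple crawl loops.

public class DemoWebMiningDriverSketch {

    // Single-iteration driver sketch; assumes the same package and imports as the listing above.
    @SuppressWarnings("rawtypes")
    public static void runOneLoop(BixoPlatform platform, BasePath workingDirPath,
                                  DemoWebMiningOptions options) throws Exception {
        // Seed the crawl DB from a classpath resource (the resource name is illustrative).
        BasePath crawlDbPath = platform.makePath(workingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        DemoWebMiningWorkflow.importSeedUrls(platform, crawlDbPath, "/seed-urls.txt");

        // Output directory for this loop ("1" is just an illustrative naming choice).
        BasePath curLoopDirPath = platform.makePath(workingDirPath, "1");

        // Default fetch policy, plus a UserAgent that identifies the crawler to web servers
        // (agent name, email, and web address here are placeholders).
        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        UserAgent userAgent = new UserAgent("demo-webmining", "crawler@example.com", "http://example.com");

        // Build the flow and run it to completion.
        Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath,
                curLoopDirPath, fetcherPolicy, userAgent, options);
        flow.complete();
    }
}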