org.apache.nutch.admin.scheduling.AdminCrawl.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.admin.scheduling.AdminCrawl.java

Source

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.admin.scheduling;

import java.util.Arrays;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.FileStatus;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.admin.GuiConfigUtil;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.indexer.Indexer;
import org.quartz.JobDataMap;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.quartz.StatefulJob;

/**
 * Quartz job that runs one complete crawl cycle: generate a segment, fetch it,
 * update the crawl db, update the link db and index the new segment.
 *
 * <p>
 * The paths of the crawl db, link db, segments directory and configuration are
 * read from the job's {@link JobDataMap} under the keys {@code "crawldb"},
 * {@code "linkdb"}, {@code "segments"} and {@code "configuration"}.
 *
 * <p>
 * Every step creates {@code *.running} marker files in the directories it
 * works on. When the job ends, whether it succeeded or not, all files whose
 * name ends with "running" are removed from those directories again.
 */
public class AdminCrawl implements StatefulJob {

    private static final Log LOG = LogFactory.getLog(AdminCrawl.class);

    /** Name suffix that identifies the marker files created by the crawl steps. */
    private static final String RUNNING_SUFFIX = "running";

    /**
     * Accepts paths whose last name component ends with "running", ignoring
     * case.
     */
    private static class RunningPathFilter implements PathFilter {
        public boolean accept(Path path) {
            String name = path.getName();
            int offset = name.length() - RUNNING_SUFFIX.length();
            // regionMatches compares case-insensitively without consulting the
            // default locale (unlike toLowerCase()) and simply returns false
            // when the name is shorter than the suffix (negative offset).
            return name.regionMatches(true, offset, RUNNING_SUFFIX, 0, RUNNING_SUFFIX.length());
        }
    }

    /**
     * Runs the crawl cycle and afterwards removes the {@code *.running} marker
     * files.
     *
     * @param context Quartz execution context; its job data map supplies the
     *                paths described in the class comment
     * @throws JobExecutionException if any crawl step fails (the original
     *                               exception is attached as cause), or if the
     *                               crawl succeeded but the marker files could
     *                               not be removed
     */
    public void execute(JobExecutionContext context) throws JobExecutionException {
        JobDataMap jobDataMap = context.getJobDetail().getJobDataMap();
        Path crawldbFile = (PathSerializable) jobDataMap.get("crawldb");
        Path linkdbFile = (PathSerializable) jobDataMap.get("linkdb");
        Path segments = (PathSerializable) jobDataMap.get("segments");
        Path configurationFile = (PathSerializable) jobDataMap.get("configuration");

        Configuration configuration = null;
        Path segment = null;
        boolean crawlSucceeded = false;

        try {
            configuration = GuiConfigUtil.loadNewConfiguration(configurationFile);

            segment = generateSegment(crawldbFile, segments, configuration);
            fetchSegment(segment, configuration);
            updateCrawldb(crawldbFile, segment, configuration);
            updateLinkDb(linkdbFile, new Path[] { segment }, configuration);
            index(new Path(segment, "index"), crawldbFile, linkdbFile, new Path[] { segment }, configuration);

            crawlSucceeded = true;
        } catch (Exception e) {
            // Keep the cause so the scheduler log shows the real stack trace.
            throw new JobExecutionException(e.getMessage(), e);
        } finally {
            try {
                removeRunningMarkers(configuration, crawldbFile, linkdbFile, segments, segment);
            } catch (IOException e) {
                if (crawlSucceeded) {
                    throw new JobExecutionException(e.getMessage(), e);
                }
                // A crawl failure is already propagating; throwing from this
                // finally block would replace it, so only log the cleanup
                // problem.
                LOG.warn("Could not remove running marker files after failed crawl", e);
            }
        }
    }

    /**
     * Deletes the marker files (names ending with "running") from the crawl
     * db, link db, segments directory, the generated segment and its index
     * directory.
     *
     * @param configuration configuration used to obtain the file system; may be
     *                      null when loading it failed, in which case nothing
     *                      is done because no crawl step has run
     * @param segment       the generated segment; may be null when generation
     *                      failed
     * @throws IOException if listing or deleting fails
     */
    private void removeRunningMarkers(Configuration configuration, Path crawldbFile, Path linkdbFile,
            Path segments, Path segment) throws IOException {
        if (configuration == null) {
            return;
        }
        FileSystem fileSystem = FileSystem.get(configuration);
        RunningPathFilter filter = new RunningPathFilter();
        deleteMatching(crawldbFile, filter, fileSystem);
        deleteMatching(linkdbFile, filter, fileSystem);
        deleteMatching(segments, filter, fileSystem);
        if (segment != null) {
            deleteMatching(segment, filter, fileSystem);
            deleteMatching(new Path(segment, "index"), filter, fileSystem);
        }
    }

    /**
     * Deletes the entries of {@code directory} accepted by {@code filter}.
     * Does nothing when the directory is null or does not exist, e.g. the
     * index directory of a segment whose indexing step was never reached.
     */
    private void deleteMatching(Path directory, PathFilter filter, FileSystem fileSystem) throws IOException {
        if (directory == null || !fileSystem.exists(directory)) {
            return;
        }
        deleteFiles(fileSystem.listStatus(directory, filter), fileSystem);
    }

    /**
     * Deletes every path in {@code statuses}.
     *
     * @param statuses entries to delete; a null array is treated as empty
     *                 (NOTE(review): some Hadoop versions are believed to
     *                 return null from listStatus for a missing path - confirm)
     */
    private void deleteFiles(FileStatus[] statuses, FileSystem fileSystem) throws IOException {
        if (statuses == null) {
            return;
        }
        for (int i = 0; i < statuses.length; i++) {
            fileSystem.delete(statuses[i].getPath());
        }
    }

    /**
     * Creates {@code index.running} markers in every segment, the link db and
     * the crawl db, then runs the {@link Indexer}.
     *
     * @param index         directory the index is written to
     * @param crawldbFile   crawl db directory
     * @param linkdbFile    link db directory
     * @param segments      segments to index
     * @param configuration configuration used for the file system and indexer
     * @throws IOException if a marker cannot be created or indexing fails
     */
    private void index(Path index, Path crawldbFile, Path linkdbFile, Path[] segments, Configuration configuration)
            throws IOException {
        FileSystem fileSystem = FileSystem.get(configuration);
        // create running files in segments
        for (int i = 0; i < segments.length; i++) {
            fileSystem.createNewFile(new Path(segments[i], "index.running"));
        }
        // create running files in linkdb
        fileSystem.createNewFile(new Path(linkdbFile, "index.running"));
        // create running files in crawldb
        fileSystem.createNewFile(new Path(crawldbFile, "index.running"));

        Indexer indexer = new Indexer(configuration);
        indexer.index(index, crawldbFile, linkdbFile, Arrays.asList(segments));
    }

    /**
     * Creates {@code linkdb.running} markers in every segment and in the link
     * db, inverts the links of the segments into the link db, and finally
     * creates an {@code invert.done} file in every segment.
     *
     * @throws IOException if a marker cannot be created or the inversion fails
     */
    private void updateLinkDb(Path linkdbFile, Path[] segments, Configuration configuration) throws IOException {
        FileSystem fileSystem = FileSystem.get(configuration);

        // create lock files in segments
        for (int i = 0; i < segments.length; i++) {
            fileSystem.createNewFile(new Path(segments[i], "linkdb.running"));
        }
        // create lockfile in linkdb
        fileSystem.createNewFile(new Path(linkdbFile, "linkdb.running"));

        LinkDb linkDb = new LinkDb(configuration);
        // NOTE(review): flags are presumably normalize, filter, force -
        // confirm against the Nutch version in use.
        linkDb.invert(linkdbFile, segments, true, true, false);

        for (int i = 0; i < segments.length; i++) {
            fileSystem.createNewFile(new Path(segments[i], "invert.done"));
        }
    }

    /**
     * Creates {@code crawldb.running} markers in the segment and the crawl db
     * and then updates the crawl db from that single segment.
     *
     * @throws IOException if a marker cannot be created or the update fails
     */
    private void updateCrawldb(Path crawldbFile, Path segment, Configuration configuration) throws IOException {
        FileSystem fileSystem = FileSystem.get(configuration);
        fileSystem.createNewFile(new Path(segment, "crawldb.running"));
        fileSystem.createNewFile(new Path(crawldbFile, "crawldb.running"));

        CrawlDb crawlDb = new CrawlDb(configuration);
        // NOTE(review): flags are presumably normalize and filter - confirm
        // against the Nutch version in use.
        crawlDb.update(crawldbFile, new Path[] { segment }, true, true);
    }

    /**
     * Creates a {@code fetch.running} marker in the segment, fetches the
     * segment with {@code fetcher.threads.fetch} threads (default 10) and then
     * creates {@code fetch.done}; {@code parse.done} is created as well when
     * {@code fetcher.parse} is true (the default).
     *
     * @throws IOException if a marker cannot be created or fetching fails
     */
    private void fetchSegment(Path segment, Configuration configuration) throws IOException {
        FileSystem fileSystem = FileSystem.get(configuration);
        fileSystem.createNewFile(new Path(segment, "fetch.running"));

        Fetcher fetcher = new Fetcher(configuration);
        // NOTE(review): the last argument is hard-coded to true while the
        // parse.done marker below depends on fetcher.parse - confirm that this
        // difference is intended.
        fetcher.fetch(segment, configuration.getInt("fetcher.threads.fetch", 10), true);

        fileSystem.createNewFile(new Path(segment, "fetch.done"));
        if (configuration.getBoolean("fetcher.parse", true)) {
            fileSystem.createNewFile(new Path(segment, "parse.done"));
        }
    }

    /**
     * Creates {@code generate.running} markers in the segments directory and
     * the crawl db and asks the {@link Generator} for a new segment, passing
     * -1, {@code Long.MAX_VALUE} as topN and the current time.
     *
     * @return the path returned by the generator
     * @throws IOException if a marker cannot be created or generation fails
     */
    private Path generateSegment(Path crawldbFile, Path segments, Configuration configuration) throws IOException {
        FileSystem fileSystem = FileSystem.get(configuration);
        fileSystem.createNewFile(new Path(segments, "generate.running"));
        fileSystem.createNewFile(new Path(crawldbFile, "generate.running"));

        Generator generator = new Generator(configuration);
        long topN = Long.MAX_VALUE;
        return generator.generate(crawldbFile, segments, -1, topN, System.currentTimeMillis());
    }

}