org.apache.nutch.tools.proxy.SegmentHandler.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.tools.proxy.SegmentHandler.java

Source

package org.apache.nutch.tools.proxy;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolStatus;
import org.mortbay.jetty.Request;

/**
 * XXX should turn this into a plugin?
 */
/**
 * Serves previously fetched Nutch segment data over HTTP, so that crawls can be
 * replayed against a local proxy instead of the live web. For each request URI it
 * looks up the {@link CrawlDatum} and {@link Content} stored in the segment and
 * replays the original protocol status, headers and body.
 *
 * XXX should turn this into a plugin?
 */
public class SegmentHandler extends AbstractTestbedHandler {
    private static final Log LOG = LogFactory.getLog(SegmentHandler.class);
    private Segment seg;

    /** Maps Nutch protocol status codes to the HTTP status codes replayed to clients. */
    private static HashMap<Integer, Integer> protoCodes = new HashMap<Integer, Integer>();

    static {
        protoCodes.put(ProtocolStatus.ACCESS_DENIED, HttpServletResponse.SC_UNAUTHORIZED);
        protoCodes.put(ProtocolStatus.BLOCKED, HttpServletResponse.SC_SERVICE_UNAVAILABLE);
        protoCodes.put(ProtocolStatus.EXCEPTION, HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
        protoCodes.put(ProtocolStatus.FAILED, HttpServletResponse.SC_BAD_REQUEST);
        protoCodes.put(ProtocolStatus.GONE, HttpServletResponse.SC_GONE);
        protoCodes.put(ProtocolStatus.MOVED, HttpServletResponse.SC_MOVED_PERMANENTLY);
        protoCodes.put(ProtocolStatus.NOTFETCHING, HttpServletResponse.SC_BAD_REQUEST);
        protoCodes.put(ProtocolStatus.NOTFOUND, HttpServletResponse.SC_NOT_FOUND);
        protoCodes.put(ProtocolStatus.NOTMODIFIED, HttpServletResponse.SC_NOT_MODIFIED);
        protoCodes.put(ProtocolStatus.PROTO_NOT_FOUND, HttpServletResponse.SC_BAD_REQUEST);
        protoCodes.put(ProtocolStatus.REDIR_EXCEEDED, HttpServletResponse.SC_BAD_REQUEST);
        protoCodes.put(ProtocolStatus.RETRY, HttpServletResponse.SC_BAD_REQUEST);
        protoCodes.put(ProtocolStatus.ROBOTS_DENIED, HttpServletResponse.SC_FORBIDDEN);
        protoCodes.put(ProtocolStatus.SUCCESS, HttpServletResponse.SC_OK);
        protoCodes.put(ProtocolStatus.TEMP_MOVED, HttpServletResponse.SC_MOVED_TEMPORARILY);
        protoCodes.put(ProtocolStatus.WOULDBLOCK, HttpServletResponse.SC_BAD_REQUEST);
    }

    /** Accepts only the "part-*" files produced by map-reduce output formats. */
    private static class SegmentPathFilter implements PathFilter {
        public static final SegmentPathFilter INSTANCE = new SegmentPathFilter();

        @Override
        public boolean accept(Path p) {
            return p.getName().startsWith("part-");
        }

    }

    /**
     * Lazily-opened view of one Nutch segment directory. Reader arrays for each
     * sub-directory (content, crawl_fetch, ...) are opened on first use, guarded
     * by a dedicated lock per array.
     */
    private static class Segment implements Closeable {

        // Same partitioner the segment writer used, so key -> part-file mapping matches.
        private static final Partitioner<Text, Writable> PARTITIONER =
                new HashPartitioner<Text, Writable>();

        private Path segmentDir;

        private Object cLock = new Object();
        private Object crawlLock = new Object();
        private MapFile.Reader[] content;
        private MapFile.Reader[] parseText;
        private MapFile.Reader[] parseData;
        private MapFile.Reader[] crawl;
        private Configuration conf;

        /**
         * @param fs kept for API compatibility; readers resolve the FileSystem
         *           from the segment path and configuration instead.
         */
        public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {
            this.segmentDir = segmentDir;
            this.conf = conf;
        }

        /** Returns the fetch-status CrawlDatum for the url, or null if absent. */
        public CrawlDatum getCrawlDatum(Text url) throws IOException {
            MapFile.Reader[] readers;
            synchronized (crawlLock) {
                if (crawl == null)
                    crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
                readers = crawl;
            }
            return (CrawlDatum) getEntry(readers, url, new CrawlDatum());
        }

        /** Returns the raw fetched Content for the url, or null if absent. */
        public Content getContent(Text url) throws IOException {
            MapFile.Reader[] readers;
            synchronized (cLock) {
                if (content == null)
                    content = getReaders(Content.DIR_NAME);
                readers = content;
            }
            return (Content) getEntry(readers, url, new Content());
        }

        /** Open the output generated by this format. */
        private MapFile.Reader[] getReaders(String subDir) throws IOException {
            Path dir = new Path(segmentDir, subDir);
            FileSystem fs = dir.getFileSystem(conf);
            Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, SegmentPathFilter.INSTANCE));

            // sort names, so that hash partitioning works
            Arrays.sort(names);

            MapFile.Reader[] parts = new MapFile.Reader[names.length];
            boolean ok = false;
            try {
                for (int i = 0; i < names.length; i++) {
                    parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
                }
                ok = true;
            } finally {
                // Don't leak readers opened before a later constructor failed.
                if (!ok) {
                    for (MapFile.Reader r : parts) {
                        if (r != null) {
                            try {
                                r.close();
                            } catch (IOException ignored) {
                                // best-effort cleanup; the original failure is propagating
                            }
                        }
                    }
                }
            }
            return parts;
        }

        private Writable getEntry(MapFile.Reader[] readers, Text url, Writable entry) throws IOException {
            return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
        }

        /**
         * Closes every opened reader. All arrays are attempted even if one close
         * fails; the first IOException encountered is rethrown at the end.
         */
        public void close() throws IOException {
            IOException first = null;
            MapFile.Reader[][] all = { content, parseText, parseData, crawl };
            for (MapFile.Reader[] readers : all) {
                if (readers == null) {
                    continue;
                }
                for (MapFile.Reader reader : readers) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        if (first == null) {
                            first = e;
                        }
                    }
                }
            }
            if (first != null) {
                throw first;
            }
        }

    }

    public SegmentHandler(Configuration conf, Path name) throws Exception {
        seg = new Segment(FileSystem.get(conf), name, conf);
    }

    /**
     * Replays the stored response for the requested URI: protocol status is mapped
     * to an HTTP status, stored metadata is echoed as headers, and the stored body
     * bytes are written verbatim. Unknown URIs get a "Res: not found" marker header.
     */
    @Override
    public void handle(Request req, HttpServletResponse res, String target, int dispatch)
            throws IOException, ServletException {
        try {
            String uri = req.getUri().toString();
            LOG.info("URI: " + uri);
            addMyHeader(res, "URI", uri);
            Text url = new Text(uri);
            CrawlDatum cd = seg.getCrawlDatum(url);
            if (cd != null) {
                addMyHeader(res, "Res", "found");
                LOG.info("-got " + cd.toString());
                ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
                if (ps != null) {
                    Integer trCode = protoCodes.get(ps.getCode());
                    if (trCode != null) {
                        res.setStatus(trCode.intValue());
                    } else {
                        res.setStatus(HttpServletResponse.SC_OK);
                    }
                    addMyHeader(res, "ProtocolStatus", ps.toString());
                } else {
                    res.setStatus(HttpServletResponse.SC_OK);
                }
                Content c = seg.getContent(url);
                if (c == null) { // missing content
                    req.setHandled(true);
                    res.addHeader("X-Handled-By", getClass().getSimpleName());
                    return;
                }
                byte[] data = c.getContent();
                LOG.debug("-data len=" + data.length);
                Metadata meta = c.getMetadata();
                String[] names = meta.names();
                LOG.debug("- " + names.length + " meta");
                for (int i = 0; i < names.length; i++) {
                    boolean my = true;
                    char ch = names[i].charAt(0);
                    if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
                        // pretty good chance it's a standard header
                        my = false;
                    }
                    String[] values = meta.getValues(names[i]);
                    for (int k = 0; k < values.length; k++) {
                        if (my) {
                            addMyHeader(res, names[i], values[k]);
                        } else {
                            res.addHeader(names[i], values[k]);
                        }
                    }
                }
                req.setHandled(true);
                res.addHeader("X-Handled-By", getClass().getSimpleName());
                String contentType = meta.get(Metadata.CONTENT_TYPE);
                if (contentType != null) { // avoid setContentType(null) container quirks
                    res.setContentType(contentType);
                }
                res.setContentLength(data.length);
                OutputStream os = res.getOutputStream();
                os.write(data, 0, data.length);
                res.flushBuffer();
            } else {
                addMyHeader(res, "Res", "not found");
                LOG.info(" -not found " + url);
            }
        } catch (Exception e) {
            // LOG.warn already captures the full stack trace; no printStackTrace().
            LOG.warn(StringUtils.stringifyException(e));
            addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
        }
    }

}