org.apache.nutch.indexer.elastic.segment.SegmentIndexerJob.java Source code


Introduction

Here is the source code for org.apache.nutch.indexer.elastic.segment.SegmentIndexerJob.java.
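
SegmentIndexerJob is a Nutch MapReduce job: a mapper reads parsed page segments (WebPageSegment rows) out of the Gora-backed web table, and a reducer converts each segment into a NutchDocument and writes it to an Elasticsearch index, marking every indexed row with the current batch id.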

Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.indexer.elastic.segment;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;

import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.mapreduce.GoraReducer;
import org.apache.gora.query.Query;
import org.apache.gora.query.Result;
import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.NutchConstant;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchIndexWriterFactory;
import org.apache.nutch.indexer.elastic.ElasticConstants;
import org.apache.nutch.indexer.elastic.ElasticWriter;
import org.apache.nutch.indexer.solr.SolrConstants;
import org.apache.nutch.indexer.solr.SolrWriter;
import org.apache.nutch.indexer.solr.segment.SegmentSolrIndexUtil;
import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.element.DomParser;
import org.apache.nutch.parse.element.SegMentParsers;
import org.apache.nutch.parse.element.SegParserJob;
import org.apache.nutch.parse.element.SegParserReducer;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPageSegment;
import org.apache.nutch.storage.WebPageSegmentIndex;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.ToolUtil;
import org.apache.solr.common.util.DateUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

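/**
 * MapReduce job that reads parsed {@link WebPageSegment} rows from the Gora
 * datastore, converts each row to a {@link NutchDocument} and writes it to an
 * Elasticsearch index via {@link ElasticWriter}, marking every successfully
 * indexed row with the current batch id.
 */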
public class SegmentIndexerJob extends NutchTool implements Tool {

    public static final Logger LOG = LoggerFactory.getLogger(SegmentIndexerJob.class);

    private static final String FORCE_KEY = "parse.job.force";
    public static final String SEGMENT_INDEX_KEY = "segment.parse.job.index";

    public static final String SKIP_TRUNCATED = "parser.skip.truncated";
    private static final Collection<WebPageSegment.Field> FIELDS = new HashSet<WebPageSegment.Field>();

    private Configuration conf;
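    // Fields to load from the datastore for every WebPageSegment row.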
    static {
        FIELDS.add(WebPageSegment.Field.BASE_URL);
        FIELDS.add(WebPageSegment.Field.CONFIGURL);
        FIELDS.add(WebPageSegment.Field.SCORE);
        FIELDS.add(WebPageSegment.Field.FETCH_TIME);
        FIELDS.add(WebPageSegment.Field.PARSETIME);
        FIELDS.add(WebPageSegment.Field.DATATIME);
        FIELDS.add(WebPageSegment.Field.TITLE);
        FIELDS.add(WebPageSegment.Field.ROOTSITEID);
        FIELDS.add(WebPageSegment.Field.MEDIATYPEID);
        FIELDS.add(WebPageSegment.Field.MEDIALEVELID);
        FIELDS.add(WebPageSegment.Field.TOPICTYPEID);
        FIELDS.add(WebPageSegment.Field.POLICTICTYPEID);
        FIELDS.add(WebPageSegment.Field.AREAID);
        FIELDS.add(WebPageSegment.Field.extendInfoAttrs);
        FIELDS.add(WebPageSegment.Field.SEGMENTATTR);
        FIELDS.add(WebPageSegment.Field.SEGMENTCNT);
        FIELDS.add(WebPageSegment.Field.MARKER);
    }

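    /** Returns the set of WebPageSegment fields the mapper loads for each row. */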
    public static Collection<WebPageSegment.Field> getFields(Job job) {
        return FIELDS;
    }

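    /**
     * Reducer: builds a {@link NutchDocument} for each segment via
     * {@link SegmentSolrIndexUtil#index}, sends it to Elasticsearch and writes
     * the row back with a batch marker (or a cleared marker on failure).
     */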
    public static class SegIndexReducer extends GoraReducer<Text, WebPageSegmentIndex, String, WebPageSegment> {

        public static final Logger LOG = LoggerFactory.getLogger(SegIndexReducer.class);
        int rowCount = 0;
        int successCount = 0;
        int failCount = 0;
        private String batchID;
        private String batchTime;
        ElasticWriter elasticWriter = null;
        boolean indexed = false;
        private int commitSize;
        long l = 0;

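        /** Reads the batch id/time from the configuration and opens the Elasticsearch writer. */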
        public void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            batchID = NutchConstant.getBatchId(conf);
            batchTime = NutchConstant.getBatchTime(conf);
            elasticWriter = new ElasticWriter();
            indexed = conf.getBoolean(SegParserJob.SEGMENT_INDEX_KEY, false);
            elasticWriter.open(context);
            LOG.info("parserReduce-ID" + batchID + "  :" + (new Date().toLocaleString())
                    + " " + batchTime);
            commitSize = conf.getInt(ElasticConstants.COMMIT_SIZE, 300);
            NutchConstant.setupSerialStepProcess(context.getConfiguration(),
                    NutchConstant.BatchNode.segmentIndexNode, context, false);
            l = System.currentTimeMillis();
        }

        protected void reduce(Text key, Iterable<WebPageSegmentIndex> values, Context context)
                throws IOException, InterruptedException {
            String urlKey = key.toString();
            for (WebPageSegment page : values) {
                rowCount++;
                context.setStatus(getParseStatus(context));
                NutchDocument doc = SegmentSolrIndexUtil.index(urlKey, page);
                if (doc == null) {
                    failCount++;
                    page.setMarker(null); // clear the marker so the row can be picked up again
                    context.write(urlKey, page);
                    LOG.error(key + " doc is null");
                    return; // give up on the remaining values for this key
                } else {
                    successCount++;
                    long ls = System.currentTimeMillis();
                    if ((successCount % commitSize) == 0) {
                        System.out.println("indexed " + commitSize + " docs in " + (ls - l) + "ms");
                        l = ls;
                    }
                    elasticWriter.write(doc);
                    page.setMarker(new Utf8(batchID + ":" + batchTime)); // mark the row as indexed in this batch
                    context.write(urlKey, page);
                    if ((successCount % commitSize) == 0) {
                        ls = System.currentTimeMillis();
                        System.out.println("wrote back " + commitSize + " rows in " + (ls - l) + "ms");
                        l = ls;
                    }
                }
                }
            }
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            elasticWriter.close();
            LOG.info("parserReduce-ID" + batchID + "  :" + (new Date().toLocaleString())
                    + " " + batchTime);
            context.setStatus(getParseStatus(context));
            LOG.info(getParseStatus(context));
            NutchConstant.cleanupSerialStepProcess(context.getConfiguration(),
                    NutchConstant.BatchNode.segmentIndexNode, context.getTaskAttemptID().getTaskID().toString(),
                    false, true, successCount);
        }

        String getParseStatus(Context context) {
            return "read:" + rowCount + " Success:" + (rowCount - failCount) + "  " + " failed:" + failCount
                    + " <br/>" + (new Date().toLocaleString());
        }

    }

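    /**
     * Mapper: forwards {@link WebPageSegmentIndex} rows to the reducer keyed by
     * URL, tracking row counts for status reporting.
     */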
    public static class SegIndexMapper extends GoraMapper<String, WebPageSegmentIndex, Text, WebPageSegmentIndex> {
        private String batchID;
        private String batchTime;
        int rowCount = 0;
        int failCount = 0;
        long l = 0, all = 0;

        @Override
        public void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            batchID = NutchConstant.getBatchId(conf);
            batchTime = NutchConstant.getBatchTime(conf);
            LOG.info("parserMap-ID" + batchID + "  :" + (new Date().toLocaleString())
                    + " " + batchTime);
            NutchConstant.setupSerialStepProcess(context.getConfiguration(),
                    NutchConstant.BatchNode.segmentIndexNode, context, true);
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            LOG.info("parserMap-ID" + batchID + "  :" + (new Date().toLocaleString())
                    + " " + batchTime);
            context.setStatus(getParseStatus(context));
            LOG.info(getParseStatus(context));
            NutchConstant.cleanupSerialStepProcess(context.getConfiguration(),
                    NutchConstant.BatchNode.segmentIndexNode, context.getTaskAttemptID().getTaskID().toString(),
                    true, true, rowCount - failCount);
        }

        String getParseStatus(Context context) {
            return "write:" + rowCount;
        }

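        /** Skips rows without a title or outside the current batch, then emits the row keyed by URL. */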
        @Override
        public void map(String key, WebPageSegmentIndex page, Context context)
                throws IOException, InterruptedException {
            if (page.getTitle() == null)
                return;
            key = NutchConstant.getWebPageUrl(batchID, key);
            if (key == null)
                return;
            rowCount++;
            if (rowCount % 10 == 0) {
                context.setStatus(getParseStatus(context));
            }
            context.write(new Text(key), page);
        }
    }

    public SegmentIndexerJob() {

    }

    public SegmentIndexerJob(Configuration conf) {
        setConf(conf);
    }

    /**
     * Checks if the page's content is truncated.
     * 
     * @param url the URL of the page, used for logging
     * @param page the page whose content length is compared against its Content-Length header
     * @return <code>true</code> if the page is truncated; <code>false</code> when it is not,
     *         or when it could not be determined.
     */
    public static boolean isTruncated(String url, WebPage page) {
        ByteBuffer content = page.getContent();
        if (content == null) {
            return false;
        }
        Utf8 lengthUtf8 = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_LENGTH));
        if (lengthUtf8 == null) {
            return false;
        }
        String lengthStr = lengthUtf8.toString().trim();
        if (StringUtil.isEmpty(lengthStr)) {
            return false;
        }
        int inHeaderSize;
        try {
            inHeaderSize = Integer.parseInt(lengthStr);
        } catch (NumberFormatException e) {
            LOG.warn("Wrong contentlength format for " + url, e);
            return false;
        }
        int actualSize = content.limit();
        if (inHeaderSize > actualSize) {
            LOG.warn(url + " skipped. Content of size " + inHeaderSize + " was truncated to " + actualSize);
            return true;
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
        }
        return false;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

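    /**
     * Configures and runs the indexing job: registers {@link ElasticWriter},
     * wires the mapper/reducer over the web-page segment store and blocks until
     * the job completes.
     */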
    @Override
    public Map<String, Object> run(Map<String, Object> args) throws Exception {
        Boolean force = (Boolean) args.get(Nutch.ARG_FORCE);
        Integer numTasks = (Integer) args.get(Nutch.ARG_NUMTASKS);

        if (force != null) {
            getConf().setBoolean(FORCE_KEY, force);
        }
        String batchZKId = this.getConf().get(NutchConstant.BATCH_ID_KEY);
        if (NutchConstant.preparStartJob(this.getConf(), NutchConstant.BatchNode.segmentIndexNode,
                NutchConstant.BatchNode.segmentParsNode, LOG, false) == 0)
            return null;
        LOG.info("SegmentParserJob: batchId: " + batchZKId);
        LOG.info("SegmentParserJob: forced reparse:\t" + getConf().getBoolean(FORCE_KEY, false));

        NutchIndexWriterFactory.addClassToConf(getConf(), ElasticWriter.class);
        String gids = NutchConstant.getGids(getConf(), "all");
        currentJob = new NutchJob(getConf(),
                "[" + (this.getConf().get(NutchConstant.BATCH_ID_KEY)) + "]segmentIndex[" + gids + "]");
        currentJob.getConfiguration().set(NutchConstant.STEPZKBATCHTIME, new Date().toLocaleString());
        Collection<WebPageSegment.Field> fields = SegmentIndexerJob.getFields(currentJob);
        NutchConstant.setSegmentParseRules(currentJob.getConfiguration());
        WebPageSegment.initMapperJob(currentJob, fields, WebPageSegmentIndex.class, Text.class,
                WebPageSegmentIndex.class, SegIndexMapper.class, null, true);
        StorageUtils.initReducerJob(currentJob, WebPageSegment.class, SegIndexReducer.class);

        if (numTasks == null || numTasks < 1) {
            currentJob.setNumReduceTasks(
                    currentJob.getConfiguration().getInt("mapred.reduce.tasks", currentJob.getNumReduceTasks()));
        } else {
            currentJob.setNumReduceTasks(numTasks);
        }

        currentJob.waitForCompletion(true);
        NutchConstant.preparEndJob(this.getConf(), NutchConstant.BatchNode.segmentIndexNode, LOG);
        ToolUtil.recordJobStatus(null, currentJob, results);
        return results;
    }

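    /**
     * Runs the job against the given Elasticsearch URL; returns 0 on success or
     * NutchConstant.exitValue when a known failure was recorded.
     */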
    public int indexElastic(String elasticUrl, boolean force, int numTasks) throws Exception {
        try {
            LOG.info("ParserJob: starting");
            run(ToolUtil.toArgMap(Nutch.ARG_SOLR, elasticUrl, Nutch.ARG_FORCE, force, Nutch.ARG_NUMTASKS,
                    numTasks));
            LOG.info("ParserJob: success");
        } catch (Exception e) {
            if (NutchConstant.exitValue != 0) {
                return NutchConstant.exitValue;
            } else {
                throw e;
            }
        }
        return 0;
    }

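    /**
     * Command-line entry point: parses -batch (required), -index, -numTasks,
     * -crawlId and -force, then delegates to {@link #indexElastic}.
     */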
    public int run(String[] args) throws Exception {
        boolean force = false;
        if (args.length < 1) {
            System.err.println(
                    "Usage: SegmentIndexerJob <-batch batchId> [-index elastic_url] [-crawlId <id>] [-numTasks N] [-force]");
            return -1;
        }
        int numTasks = 0;
        String elasticUrl = getConf().get(SolrConstants.SEGMENT_URL, null);
        if (elasticUrl != null) {
            getConf().set(ElasticConstants.SERVER_URL, elasticUrl);
        }
        for (int i = 0; i < args.length; i++) {
            if ("-numTasks".equals(args[i])) {
                numTasks = Integer.parseInt(args[++i]);
            } else if ("-force".equals(args[i])) {
                force = true;
                getConf().setBoolean(NutchConstant.STEP_FORCE_RUN_KEY, true);
            } else if ("-crawlId".equals(args[i])) {
                getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
            } else if ("-batch".equals(args[i])) {
                String batchId1 = org.apache.commons.lang.StringUtils.lowerCase(args[i + 1]);
                if (batchId1 != null && !batchId1.equals("")) {
                    getConf().set(NutchConstant.BATCH_ID_KEY, batchId1);
                }
                i++;
            } else if ("-index".equals(args[i])) {
                i++;
                elasticUrl = args[i];
                getConf().set(ElasticConstants.SERVER_URL, args[i]);
            }
        }

        if (getConf().get(NutchConstant.BATCH_ID_KEY, null) == null) {
            throw new Exception("Missing required -batch argument");
        }
        return indexElastic(elasticUrl, force, numTasks);
    }

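    /**
     * With no arguments, runs a local debugging path that scans the web table
     * directly, parses each page and writes the documents to both Elasticsearch
     * and Solr; otherwise hands the arguments to {@link ToolRunner}.
     */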
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            String strDt = "2013-07-29 10:14:00";
            DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            String tstamp = DateUtil.getThreadLocalDateFormat().format(new Date(df.parse(strDt).getTime()));
            String ststamp = DateUtil.getThreadLocalDateFormat().format(new Date(df.parse(strDt).getTime()));
            System.err.println(strDt);
            System.err.println(tstamp);
            System.err.println(ststamp);

            Configuration conf = NutchConfiguration.create();
            if (conf == null)
                return;
            conf.set("elastic.index", "ea");
            conf.set("storage.crawl.id", "ea");
            conf.set("hbase.client.scanner.caching", "1");
            //         conf.set(SolrConstants.SERVER_URL, conf.get(SolrConstants.SEGMENT_URL));
            //         conf.set(SolrConstants.SERVER_URL, "http://localhost:8080/solr/thematic");
            DataStore<String, WebPage> store = StorageUtils.createWebStore(conf, String.class, WebPage.class);
            if (store == null)
                throw new RuntimeException("Could not create datastore");
            Query<String, WebPage> query = store.newQuery();
            if ((query instanceof Configurable)) {
                ((Configurable) query).setConf(conf);
            }
            Job job = new NutchJob(conf, "segmentParse:");
            //         query.setFields(StorageUtils.toStringArray(getFields(job)));
            query.setStartKey("com.soufun.jiahecheng0771:http/house/2910117344/housedetail.htm");
            // query.setEndKey("com.qq.news:http/a/20090313/001870.htm");
            NutchConstant.setSegmentParseRules(conf);
            NutchConstant.getSegmentParseRules(conf);
            SegMentParsers parses = new SegMentParsers(conf);
            Result<String, WebPage> rs = query.execute();
            int count = 0;
            DomParser parse = new DomParser();
            parse.setConf(conf);
            long l = System.currentTimeMillis();
            ElasticWriter elasticWriter = new ElasticWriter();
            elasticWriter.open(conf);

            SolrWriter writer = new SolrWriter();
            writer.open(conf);
            while (rs.next()) {
                long sl = System.currentTimeMillis();
                WebPage page = rs.get();
                String key = rs.getKey();
                if (page.getContent() == null)
                    continue;
                String unreverseKey = TableUtil.unreverseUrl(key);

                WebPageSegment wps = SegParserReducer.parseSegMent(parses, unreverseKey, page); // parse the page into segments
                //            System.err.println(wps);

                System.err.println("row " + count + " parsed in " + (System.currentTimeMillis() - sl)
                        + "ms, rowkey: " + key);
                if (wps != null) {
                    NutchDocument doc = SegmentSolrIndexUtil.index(key, wps);
                    if (doc != null) {
                        SegmentIndexerJob.LOG.info(key + " url:" + unreverseKey + " write to elastic");
                        elasticWriter.write(doc);
                        writer.write(doc);
                    } else {
                        SegmentIndexerJob.LOG.warn(key + " url:" + unreverseKey + " doc is null");
                    }
                }
                count++;
            }
            elasticWriter.close();
            writer.close(); // also close the Solr writer so buffered documents are flushed
            System.out.println("processed " + count + " rows in " + (System.currentTimeMillis() - l) + "ms");
            System.exit(0);
        } else {
            final int res = ToolRunner.run(NutchConfiguration.create(), new SegmentIndexerJob(), args);
            System.exit(res);
        }

    }

}
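
Usage

Below is a minimal driver sketch showing how this job might be launched programmatically. It assumes a standard Nutch/Hadoop classpath; the driver class name, batch id, Elasticsearch URL and task count are hypothetical placeholders, and the flags mirror those parsed in run(String[]) above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.elastic.segment.SegmentIndexerJob;
import org.apache.nutch.util.NutchConfiguration;

public class SegmentIndexerDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        // -batch is required; -index, -numTasks and -force are optional
        // (a hypothetical batch id and a local Elasticsearch URL are shown here).
        int res = ToolRunner.run(conf, new SegmentIndexerJob(), new String[] {
                "-batch", "20130729", "-index", "http://localhost:9200", "-numTasks", "4" });
        System.exit(res);
    }
}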