com.iflytek.spider.parse.ParseSegment.java Source code

Introduction

Here is the source code for com.iflytek.spider.parse.ParseSegment.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.iflytek.spider.parse;

import java.io.IOException;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.iflytek.avro.io.UnionData;
import com.iflytek.avro.mapreduce.AvroJob;
import com.iflytek.avro.mapreduce.input.AvroPairInputFormat;
import com.iflytek.spider.metadata.Spider;
import com.iflytek.spider.protocol.Content;
import com.iflytek.spider.util.SpiderConfiguration;

/**
 * Command-line tool that parses the content stored in crawl segments.
 *
 * <p>For every entry found under the directory given as the first argument,
 * {@link #run(String[])} removes any existing {@code crawl_parse} and
 * {@code parse_data} sub-directories and then calls {@link #parse(Path)},
 * which runs a job that reads the segment's {@code Content.DIR_NAME}
 * sub-directory, maps each record through {@link ParseMapper} and writes the
 * result into the segment directory using {@code ParseOutputFormat}.
 */
public class ParseSegment extends Configured implements Tool {
    public static final Log LOG = LogFactory.getLog(ParseSegment.class);

    /** Message printed to standard error when no argument is supplied. */
    private static final String USAGE = "Usage: ParseSegment segments";

    /** Per-segment output directories removed before a segment is parsed again. */
    private static final String[] STALE_OUTPUT_DIRS = { "crawl_parse", "parse_data" };

    /** Creates a tool without a configuration; one can be supplied later via {@code setConf}. */
    public ParseSegment() {
        this(null);
    }

    /**
     * Creates a tool bound to the given configuration.
     *
     * @param conf configuration handed to {@link Configured}; may be {@code null}
     */
    public ParseSegment(Configuration conf) {
        super(conf);
    }

    /**
     * Stores the given job configuration as this tool's configuration.
     *
     * @param job configuration to adopt
     */
    // NOTE(review): configure/close look like leftovers of the old mapred
    // Mapper contract; nothing in this file calls them - confirm before removing.
    public void configure(JobConf job) {
        setConf(job);
    }

    /** Does nothing. */
    public void close() {
    }

    /**
     * Mapper that parses one {@link Content} record and emits every parse
     * product wrapped in a {@link UnionData}, keyed by the input key.
     */
    public static class ParseMapper extends Mapper<String, Content, String, UnionData> {

        /**
         * Parses {@code value} and writes one output record per parse product.
         * Products that are {@link Outlink}s first receive the content's
         * extend data via {@code setExtend}.
         *
         * @param key     key of the content record; reused as the output key
         * @param value   content to parse
         * @param context job context that receives the output records
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void map(String key, Content value, Context context) throws IOException, InterruptedException {
            Parse parser = new ParserFactory().getParsers(key, value);
            List<?> products = parser.parse(key, value);
            for (Object product : products) {
                if (product instanceof Outlink) {
                    Outlink link = (Outlink) product;
                    // Carry the source content's extend data over to the outlink.
                    link.setExtend(value.getExtendData());
                    context.write(key, new UnionData(link));
                } else {
                    context.write(key, new UnionData(product));
                }
            }
        }

    }

    /**
     * Runs the parse job for a single segment and waits for it to finish.
     *
     * @param segment segment directory; its {@code Content.DIR_NAME} child is
     *                the job input and the segment itself is the output path
     * @throws IOException            if the job cannot be run or finishes unsuccessfully
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public void parse(Path segment) throws IOException, InterruptedException, ClassNotFoundException {

        if (LOG.isInfoEnabled()) {
            LOG.info("Parse: starting");
            LOG.info("Parse: segment: " + segment);
        }

        Job job = AvroJob.getAvroJob(getConf());
        job.setJobName("parse " + segment);

        FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
        job.getConfiguration().set(Spider.SEGMENT_NAME_KEY, segment.getName());

        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(ParseMapper.class);

        FileOutputFormat.setOutputPath(job, segment);
        job.setOutputFormatClass(ParseOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(UnionData.class);

        // waitForCompletion reports failure through its return value, not an
        // exception; surface it so a failed parse is never logged as "done".
        if (!job.waitForCompletion(true)) {
            throw new IOException("Parse: job failed for segment " + segment);
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("Parse: done");
        }
    }

    /**
     * Command-line entry point; exits the JVM with the status returned by
     * {@link #run(String[])}.
     *
     * @param args command-line arguments, passed through {@link ToolRunner}
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(SpiderConfiguration.create(), new ParseSegment(), args);
        System.exit(res);
    }

    /**
     * Parses every segment listed under the directory named by {@code args[0]}.
     * Existing {@code crawl_parse} and {@code parse_data} directories of a
     * segment are deleted before that segment is parsed.
     *
     * @param args arguments; the first one is the directory whose children are segments
     * @return {@code 0} on success, {@code -1} if no argument was given or the
     *         directory could not be listed
     * @throws Exception if file-system access or a parse job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length == 0) {
            System.err.println(USAGE);
            // Return instead of System.exit so embedding callers keep their JVM;
            // main() still turns this into the process exit status.
            return -1;
        }

        Path segmentsDir = new Path(args[0]);
        FileSystem fs = FileSystem.get(getConf());
        FileStatus[] segments = fs.listStatus(segmentsDir);
        if (segments == null) {
            // NOTE(review): some FileSystem versions appear to return null for a
            // missing path rather than throwing - guard against an NPE either way.
            System.err.println("ParseSegment: cannot list segments under " + segmentsDir);
            return -1;
        }

        for (FileStatus segment : segments) {
            Path segmentPath = segment.getPath();
            for (String dirName : STALE_OUTPUT_DIRS) {
                deleteIfExists(fs, new Path(segmentPath, dirName));
            }
            parse(segmentPath);
        }
        return 0;
    }

    /** Recursively deletes {@code path} when it exists on {@code fs}. */
    private static void deleteIfExists(FileSystem fs, Path path) throws IOException {
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
    }
}