edu.umd.cloud9.collection.ExtractHTMLFieldCollection.java Source code

Introduction

Here is the source code for edu.umd.cloud9.collection.ExtractHTMLFieldCollection.java, a Cloud9 MapReduce tool that extracts the text contained in a target HTML tag (or the whole h1-h6 heading family) from every document in a collection and writes the results as a new TREC-format collection. A sample driver showing a typical invocation follows the listing.
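
For orientation, each record the tool emits wraps the extracted field text in a TREC-style envelope. With the target tag "heading", an output record might look like the following (the docid and heading text are purely illustrative):

<DOC>
<DOCNO>doc-0042</DOCNO>
<h1>An Illustrative Heading</h1>
<h2>Another Extracted Heading</h2>
</DOC>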

Source

/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.collection;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import edu.umd.cloud9.collection.line.TextDocument;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.TrecExtractLinks.Map.LinkCounter;

/**
 * <p>
 * Tool for generating 'per-field' collections from HTML documents. The output of this
 * tool is a new collection in TREC format (a SequenceFile<LongWritable, TextDocument>)
 * that consists only of the text contained within the target tag. This is useful for
 * document-structure and field-based retrieval tasks.
 * </p>
 *
 * @author fangyue
 * @author metzler
 */
public class ExtractHTMLFieldCollection extends PowerTool {
    private static final Logger LOG = Logger.getLogger(ExtractHTMLFieldCollection.class);

    public static class MyMapper extends Mapper<LongWritable, Indexable, LongWritable, TextDocument> {
        // TODO: allow this to support user-defined regular expressions, not just the "heading" one pre-defined here
        public static class HeadingTagFilter implements NodeFilter {
            private static final long serialVersionUID = 3848416345122090905L;
            private final Pattern pattern = Pattern.compile("h[123456]", Pattern.CASE_INSENSITIVE);

            public boolean accept(Node node) {
                // Node.getText() returns the raw tag contents, so this matches
                // opening <h1>..<h6> tags exactly; closing tags (e.g. "/h1") and
                // tags carrying attributes do not match
                return pattern.matcher(node.getText()).matches();
            }
        }

        // Mapper state is kept in static fields and reused across map() calls to
        // avoid per-record allocation; this is safe only because Hadoop runs a
        // single mapper instance per task JVM.
        private static String tag;

        private static final Parser parser = new Parser();
        private static NodeFilter filter;

        private static final LongWritable myKey = new LongWritable();
        private static final TextDocument myValue = new TextDocument();

        private static final StringBuilder strBuf = new StringBuilder();

        @Override
        public void setup(Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context)
                throws IOException {
            Configuration conf = context.getConfiguration();
            tag = conf.get("Cloud9.TargetTag");

            // "heading" is a pseudo-tag covering <h1> through <h6>; any other
            // value is matched literally against the tag name
            if (tag.equalsIgnoreCase("heading")) {
                filter = new HeadingTagFilter();
            } else {
                filter = new TagNameFilter(tag);
            }
        }

        // For each input document, emit a TREC-style <DOC> record containing the
        // HTML of every node accepted by the filter.
        @Override
        public void map(LongWritable key, Indexable doc,
                Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context)
                throws IOException, InterruptedException {
            context.getCounter(LinkCounter.INPUT_DOCS).increment(1);

            if (doc.getDocid() == null || doc.getContent() == null) {
                return;
            }

            myKey.set(key.get());

            NodeList nl;
            try {
                // initialize the HTML parser with the raw document content
                parser.setInputHTML(doc.getContent());

                // parse the document, keeping only nodes accepted by the filter
                nl = parser.parse(filter);
            } catch (ParserException | StackOverflowError e) {
                // malformed or deeply nested HTML can break the parser; emit an
                // empty TREC document so the docid is still represented in the output
                context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
                myValue.setDocid(doc.getDocid());
                myValue.setContent("<DOC>\n<DOCNO>" + doc.getDocid() + "</DOCNO>\n</DOC>");
                context.write(myKey, myValue);
                return;
            }

            // wrap the extracted content in a TREC-style <DOC> envelope
            strBuf.setLength(0);
            strBuf.append("<DOC>\n<DOCNO>");
            strBuf.append(doc.getDocid());
            strBuf.append("</DOCNO>\n");

            // append the HTML of each matching node, one per line
            for (int i = 0; i < nl.size(); i++) {
                strBuf.append(nl.elementAt(i).toHtml()).append("\n");
            }
            strBuf.append("</DOC>\n");

            // create output document
            myValue.setDocid(doc.getDocid());
            myValue.setContent(strBuf.toString());

            // emit
            context.write(myKey, myValue);

            // bookkeeping
            context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1);
        }
    }

    public static final String[] RequiredParameters = { "Cloud9.InputPath", "Cloud9.InputFormat",
            "Cloud9.OutputPath", "Cloud9.TargetTag" };

    @Override
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    public ExtractHTMLFieldCollection(Configuration conf) {
        super(conf);
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Override
    public int runTool() throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf);

        String inputPath = conf.get("Cloud9.InputPath");
        String inputFormat = conf.get("Cloud9.InputFormat");
        String outputPath = conf.get("Cloud9.OutputPath");
        String tag = conf.get("Cloud9.TargetTag");

        job.setJobName("ExtractHTMLFieldCollection");

        job.setJarByClass(ExtractHTMLFieldCollection.class);
        job.setMapperClass(MyMapper.class);
        // the identity Reducer simply consolidates the mapper output into a
        // fixed number of sorted, compressed SequenceFiles
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(200);

        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
        recursivelyAddInputPaths(job, inputPath);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(TextDocument.class);

        LOG.info("ExtractFieldCollection - " + tag);
        LOG.info(" - Input path: " + inputPath);
        LOG.info(" - Input format: " + inputFormat);
        LOG.info(" - Output path: " + outputPath);
        LOG.info(" - Target tag: " + tag);

        // propagate job failure to the caller instead of always reporting success
        return job.waitForCompletion(true) ? 0 : 1;
    }

    // Recursively add every regular file under the given path as a map input.
    public static void recursivelyAddInputPaths(Job job, String path) throws IOException {
        FileSystem fs;
        try {
            fs = FileSystem.get(new URI(path), job.getConfiguration());
        } catch (URISyntaxException e) {
            throw new RuntimeException("Error recursively adding path -- " + path, e);
        }

        FileStatus[] ls = fs.listStatus(new Path(path));
        for (FileStatus status : ls) {
            // skip anything that starts with an underscore, as it often indicates
            // a log directory or another special type of Hadoop file
            if (status.getPath().getName().startsWith("_")) {
                continue;
            }

            if (status.isDir()) {
                recursivelyAddInputPaths(job, status.getPath().toString());
            } else {
                FileInputFormat.addInputPath(job, status.getPath());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        if (args.length != 4) {
            System.err.println(
                    "Usage: ExtractFieldCollection [input-path] [input-format] [output-path] [target-tag]");
            System.exit(-1);
        }

        conf.set("Cloud9.InputPath", args[0]);
        conf.set("Cloud9.InputFormat", args[1]);
        conf.set("Cloud9.OutputPath", args[2]);
        conf.set("Cloud9.TargetTag", args[3]);

        int res = ToolRunner.run(conf, new ExtractHTMLFieldCollection(conf), args);
        System.exit(res);
    }
}
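
Usage note

The tool is normally launched through its own main(), which expects the four arguments shown in the usage string above. The sketch below is a minimal standalone driver that performs the same setup programmatically; the input and output paths and the choice of input-format class are placeholders chosen for illustration, not values from the original code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import edu.umd.cloud9.collection.ExtractHTMLFieldCollection;

public class ExtractTitlesDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // All four parameters are required (see RequiredParameters above).
        conf.set("Cloud9.InputPath", "/collections/web/docs"); // placeholder path
        // Assumed here: the input is a SequenceFile of documents, e.g. the
        // output of a previous run of this tool.
        conf.set("Cloud9.InputFormat",
                "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat");
        conf.set("Cloud9.OutputPath", "/collections/web/titles-only"); // placeholder path
        conf.set("Cloud9.TargetTag", "title"); // extract <title> text; "heading" covers h1-h6

        int res = ToolRunner.run(conf, new ExtractHTMLFieldCollection(conf), new String[0]);
        System.exit(res);
    }
}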