dk.statsbiblioteket.hadoop.archeaderextractor.ARCHeaderExtractorMR.java Source code

Introduction

Here is the source code for dk.statsbiblioteket.hadoop.archeaderextractor.ARCHeaderExtractorMR.java, a Hadoop MapReduce job that extracts headers from ARC files.
Source

package dk.statsbiblioteket.hadoop.archeaderextractor;

/*
 * #%L
 * ARC Header Extractor MR
 * %%
 * Copyright (C) 2013 State and University Library, Denmark
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringWriter;
import java.util.Map;

/**
 * Hadoop MapReduce job that extracts headers from ARC files.
 *
 * <p>The job input is plain text read with {@link TextInputFormat}; every input line is
 * treated as the name of a file that is handed to {@code ARCHeaderExtractor.extract}.
 * The job output is text with the file name as key and the collected headers as value.
 *
 * <p>Usage: {@code ARCHeaderExtractorMR <input dir> <output dir>}
 */
public class ARCHeaderExtractorMR extends Configured implements Tool {

    /** Exit code returned by {@link #run(String[])} on a usage error or a failed job. */
    private static final int EXIT_FAILURE = -1;

    /**
     * Mapper that turns one input line (a file name) into a single
     * {@code (file name, headers)} record.
     */
    public static class ARCHeaderExtractorMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused between map() calls to avoid allocating new Writables per record.
        private final Text tokenValue = new Text();
        private final Text tokenKey = new Text();

        /**
         * Extracts the headers of the file named by {@code fileNameValue} and emits them
         * keyed by that file name.
         *
         * <p>Each non-empty header is prefixed with a {@code "Filename: <name>"} line.
         * When the extractor returns {@code null} a message is printed to stderr and a
         * record with an empty value is still emitted. When the file cannot be opened a
         * message is printed to stderr and nothing is emitted for that input line.
         *
         * @param offsetKey     key supplied by the input format; not used
         * @param fileNameValue input line, interpreted as the name of the file to read
         * @param context       task context the output record is written to
         * @throws IOException          if extraction or writing the record fails
         * @throws InterruptedException if writing the record is interrupted
         */
        @Override
        protected void map(LongWritable offsetKey, Text fileNameValue, Context context)
                throws IOException, InterruptedException {
            String fileName = fileNameValue.toString();
            ARCHeaderExtractor extractor = new ARCHeaderExtractor();
            StringBuilder collected = new StringBuilder();

            try {
                // NOTE(review): the Long keys are presumably record offsets; they are not
                // needed here, only the header strings are.
                Map<Long, String> headers = extractor.extract(fileName);
                if (headers != null) {
                    for (String header : headers.values()) {
                        // Skip missing or empty headers instead of failing on null.
                        if (header != null && header.length() != 0) {
                            collected.append("Filename: ").append(fileName).append('\n');
                            collected.append(header);
                        }
                    }
                } else {
                    System.err.println("No records were found in the ARC file!");
                }

                tokenValue.set(collected.toString());
                tokenKey.set(fileName);

                context.write(tokenKey, tokenValue);
            } catch (FileNotFoundException e) {
                // Best effort: one unreadable file must not fail the whole job.
                System.err.println("Not able to open file '" + fileName + "'.");
            }
        }
    }

    /**
     * Reducer (also registered as the job's combiner) that merges all header values
     * emitted for the same file name into one output record.
     */
    public static class ARCHeaderExtractorReducer extends Reducer<Text, Text, Text, Text> {

        /**
         * Concatenates every value in {@code headers}, in iteration order, and writes the
         * result under {@code fileName}.
         *
         * <p>Values are appended rather than overwritten so that no header is lost when a
         * file name occurs more than once. Concatenation is associative, which keeps the
         * class usable as a combiner.
         *
         * @param fileName key shared by all values in this group
         * @param headers  header values emitted for {@code fileName}
         * @param context  task context the merged record is written to
         * @throws IOException          if writing the record fails
         * @throws InterruptedException if writing the record is interrupted
         */
        @Override
        protected void reduce(Text fileName, Iterable<Text> headers, Context context)
                throws IOException, InterruptedException {

            Text allHeaders = new Text();

            for (Text header : headers) {
                // getBytes() exposes the backing array; only the first getLength() bytes
                // are valid content.
                allHeaders.append(header.getBytes(), 0, header.getLength());
            }

            context.write(fileName, allHeaders);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args exactly two elements: the input directory and the output directory
     * @return {@code 0} when the job completes successfully, {@code -1} when the
     *         arguments are invalid or the job fails
     * @throws Exception if the job cannot be set up or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        // Validate before creating the job so a usage error has no side effects.
        int n = (args == null) ? 0 : args.length;
        if (n != 2) {
            System.err.println(
                    "Expected exactly 2 arguments (input dir and output dir), but " + n + " were supplied.");
            // Return instead of System.exit(0): the caller decides the process exit code,
            // and a usage error must not be reported as success.
            return EXIT_FAILURE;
        }

        Configuration configuration = getConf();

        Job job = new Job(configuration, "ARC Header Extractor");
        job.setJarByClass(ARCHeaderExtractorMR.class);

        job.setMapperClass(ARCHeaderExtractorMapper.class);
        job.setCombinerClass(ARCHeaderExtractorReducer.class);
        job.setReducerClass(ARCHeaderExtractorReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Set the paths through the same format classes the job is configured with.
        TextInputFormat.addInputPath(job, new Path(args[0]));
        TextOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : EXIT_FAILURE;
    }

    /**
     * Command-line entry point; exits the JVM with the code returned by
     * {@link #run(String[])}.
     *
     * @param args command-line arguments, passed through {@link ToolRunner}
     * @throws Exception if the tool fails with an exception
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new ARCHeaderExtractorMR(), args));
    }
}