org.culturegraph.mf.cluster.job.ingest.PropIngest.java Source code

Introduction

Here is the source code for org.culturegraph.mf.cluster.job.ingest.PropIngest.java, a map-only Hadoop MapReduce job from the culturegraph project. It reads tab-separated property records from HDFS and persists them as Puts in an HBase table.
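
Each input line carries a record identifier followed by tab-separated key=value properties; the map() method below defines this contract. A hypothetical example line (<TAB> stands for a tab character; the field names are invented for illustration):

    record-001<TAB>name=Goethe<TAB>type=person

Tokens without an '=', with an empty key, or with an empty value are skipped and never reach HBase.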

Source

/*
 *  Copyright 2013 Deutsche Nationalbibliothek
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.culturegraph.mf.cluster.job.ingest;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.culturegraph.mf.cluster.sink.ComplexPutWriter;
import org.culturegraph.mf.cluster.util.AbstractJobLauncher;
import org.culturegraph.mf.cluster.util.ConfigConst;

/**
 * Reads property records from HDFS and
 * persists them in an {@link HTable}.
 *
 * @author Markus Michael Geipel
 */
public final class PropIngest extends AbstractJobLauncher {

    public static void main(final String[] args) {
        launch(new PropIngest(), args);
    }

    @Override
    protected Configuration prepareConf(final Configuration config) {

        addRequiredArguments(ConfigConst.OUTPUT_TABLE, ConfigConst.INPUT_PATH);

        final Configuration conf = HBaseConfiguration.create(config);
        // HBaseConfiguration.create copies config, so further settings must go
        // on the returned conf; speculative execution is disabled so duplicate
        // task attempts cannot write the same puts twice
        conf.setIfUnset("mapred.map.tasks.speculative.execution", "false");
        setJobName("Ingest " + conf.get(ConfigConst.INPUT_PATH) + " -> " + conf.get(ConfigConst.OUTPUT_TABLE));
        return conf;
    }

    @Override
    protected void configureJob(final Job job, final Configuration conf) throws IOException {
        job.setJarByClass(PropIngest.class);
        final Path path = new Path(conf.get(ConfigConst.INPUT_PATH));
        FileInputFormat.addInputPath(job, path);

        job.setMapperClass(IngestMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(NullOutputFormat.class);
    }

    /**
     * writes raw records and properties to htable
     */
    static final class IngestMapper<K, V> extends Mapper<LongWritable, Text, K, V> {

        private static final String INGEST = "Ingest";

        // 16 MiB client-side write buffer, flushed explicitly in cleanup()
        private static final long WRITE_BUFFER = 1024 * 1024 * 16;

        private static final String SEPARATOR = "\t";

        private final ComplexPutWriter collector = new ComplexPutWriter();
        private HTable htable;

        @Override
        protected void setup(final Context context) throws IOException, InterruptedException {
            super.setup(context);
            final Configuration conf = context.getConfiguration();

            htable = new HTable(conf, conf.get(ConfigConst.OUTPUT_TABLE));
            // buffer puts client-side; they are flushed in cleanup()
            htable.setAutoFlush(false);
            htable.setWriteBufferSize(WRITE_BUFFER);
        }

        @Override
        protected void cleanup(final Context context) throws IOException, InterruptedException {
            super.cleanup(context);
            htable.flushCommits();
            htable.close();
        }

        @Override
        public void map(final LongWritable row, final Text value, final Context context) throws IOException {

            final String record = value.toString();
            if (record.isEmpty()) {
                context.getCounter(INGEST, "empty input lines").increment(1);
                return;
            }
            collector.reset();

            // first token is the record id; the rest are key=value properties
            final String[] parts = record.split(SEPARATOR);

            collector.startRecord(parts[0]);
            for (int i = 1; i < parts.length; ++i) {
                final String prop = parts[i];
                final int cut = prop.indexOf('=');
                // keep only well-formed properties with non-empty key and value
                if (cut > 0 && cut + 1 < prop.length()) {
                    collector.literal(prop.substring(0, cut), prop.substring(cut + 1));
                }
            }
            collector.endRecord();

            final Put put = collector.getCurrentPut();
            if (put.isEmpty()) {
                context.getCounter(INGEST, "records without content").increment(1);
            } else {
                htable.put(put);
                context.getCounter(INGEST, "records ingested").increment(1);
            }
        }
    }
}
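
Example

The following stand-alone sketch reproduces the tab/equals parsing contract of IngestMapper.map() so it can be tried without a Hadoop or HBase setup. The class name and the sample line are invented for illustration and are not part of the project:

import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical demo mirroring the splitting logic of PropIngest.IngestMapper.map().
public final class PropLineDemo {

    public static void main(final String[] args) {
        // a record id followed by tab-separated key=value properties,
        // including three malformed tokens that the guard below rejects
        final String line = "record-001\tname=Goethe\ttype=person\tbroken\t=nokey\tnovalue=";

        final String[] parts = line.split("\t");
        final Map<String, String> properties = new LinkedHashMap<String, String>();
        for (int i = 1; i < parts.length; ++i) {
            final String prop = parts[i];
            final int cut = prop.indexOf('=');
            // same guard as in IngestMapper: key and value must be non-empty
            if (cut > 0 && cut + 1 < prop.length()) {
                properties.put(prop.substring(0, cut), prop.substring(cut + 1));
            }
        }

        System.out.println("record id: " + parts[0]);
        System.out.println("properties: " + properties);
        // prints:
        // record id: record-001
        // properties: {name=Goethe, type=person}
    }
}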
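
Note on newer HBase clients

HTable.setAutoFlush(false) together with flushCommits() is the batching idiom of the 0.9x-era HBase client this job was written against; HBase 1.0 and later deprecate both in favour of a BufferedMutator. Below is a minimal sketch of the equivalent buffered write, assuming a modern client; the table name and column family are placeholders, since PropIngest's actual layout is encapsulated in ComplexPutWriter:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.BufferedMutatorParams;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical sketch; not part of the culturegraph project.
public final class BufferedIngestSketch {

    public static void main(final String[] args) throws Exception {
        final Configuration conf = HBaseConfiguration.create();
        final BufferedMutatorParams params =
                new BufferedMutatorParams(TableName.valueOf("props")) // placeholder table name
                        .writeBufferSize(1024 * 1024 * 16);           // same 16 MiB buffer as above

        try (Connection connection = ConnectionFactory.createConnection(conf);
                BufferedMutator mutator = connection.getBufferedMutator(params)) {
            final Put put = new Put(Bytes.toBytes("record-001"));
            // "prop" is a placeholder column family
            put.addColumn(Bytes.toBytes("prop"), Bytes.toBytes("name"), Bytes.toBytes("Goethe"));
            mutator.mutate(put);
            // buffered mutations are flushed when the buffer fills and on close()
        }
    }
}

The mutator flushes automatically when its buffer fills and again on close(), which replaces the explicit flushCommits() call in cleanup().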