com.moz.fiji.mapreduce.lib.bulkimport.DescribedInputTextBulkImporter.java Source code

Here is the source code for com.moz.fiji.mapreduce.lib.bulkimport.DescribedInputTextBulkImporter.java

Source

/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.moz.fiji.mapreduce.lib.bulkimport;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.Set;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.moz.fiji.annotations.ApiAudience;
import com.moz.fiji.annotations.Inheritance;
import com.moz.fiji.hadoop.configurator.HadoopConf;
import com.moz.fiji.hadoop.configurator.HadoopConfigurator;
import com.moz.fiji.mapreduce.FijiTableContext;
import com.moz.fiji.mapreduce.bulkimport.FijiBulkImporter;
import com.moz.fiji.mapreduce.framework.JobHistoryCounters;
import com.moz.fiji.mapreduce.framework.FijiConfKeys;
import com.moz.fiji.schema.Fiji;
import com.moz.fiji.schema.FijiColumnName;
import com.moz.fiji.schema.FijiSchemaTable;
import com.moz.fiji.schema.FijiTable;
import com.moz.fiji.schema.FijiURI;
import com.moz.fiji.schema.avro.AvroSchema;
import com.moz.fiji.schema.avro.CellSchema;
import com.moz.fiji.schema.layout.FijiTableLayout;
import com.moz.fiji.schema.util.ResourceUtils;

/**
 * DescribedInputTextBulkImporter is an abstract base class for bulk importers that map
 * source fields in the import lines to destination Fiji columns.  Subclasses can be used
 * with FijiBulkImportJobBuilder via its withBulkImporter method.
 *
 * <p>Importing from a text file requires specifying a FijiColumnName and the source field
 * for each element to be inserted into Fiji, in addition to the raw import data.  This
 * information is provided by a {@link FijiTableImportDescriptor}, whose location is set via
 * the <code>fiji.import.text.input.descriptor.path</code> parameter ({@link #CONF_FILE}).
 *
 * <p>Use this class over text files to import data into a Fiji table.  Each line in the
 * file is treated as data for one row.  A line should generate a single EntityId to write
 * to, and any number of writes to add to that entity.  Override the
 * {@link #produce(Text, FijiTableContext)} method to generate the entities from the input
 * lines.</p>
 *
 * <p>Extensions of this class should implement the following methods:
 * <ul>
 *   <li>{@link #produce} - the actual producer code for the bulk importer goes here.</li>
 *   <li>{@link #setupImporter} - (optional) any setup specific to this bulk importer.</li>
 *   <li>{@link #cleanupImporter} - (optional) any cleanup specific to this bulk importer.</li>
 * </ul>
 *
 * <p>Extensions of this class can use the following methods to implement their producers:
 * <ul>
 *   <li>{@link #convert} - parses the text into the type associated with the column.</li>
 *   <li>{@link #incomplete} - logs and marks a row that was incomplete.</li>
 *   <li>{@link #reject} - logs and marks a row that could not be processed.</li>
 *   <li>{@link #getDestinationColumns} - retrieves the collection of destination columns.</li>
 *   <li>{@link #getSource} - retrieves the source field for one of the columns listed above.</li>
 *   <li>{@link #getEntityIdSource()} - retrieves the source field for the row's entity id.</li>
 * </ul>
 */
@ApiAudience.Public
@Inheritance.Extensible
public abstract class DescribedInputTextBulkImporter extends FijiBulkImporter<LongWritable, Text> {
    private static final Logger LOG = LoggerFactory.getLogger(DescribedInputTextBulkImporter.class);

    /**
     * Location of the writer layout file.  The file names the columns and schemas, and
     * implies the ordering of the columns in the delimited input file.
     */
    public static final String CONF_FILE = "fiji.import.text.input.descriptor.path";

    /** Configuration key for the number of lines to skip between logged reject/incomplete lines. */
    public static final String CONF_LOG_RATE = "fiji.import.text.log.rate";

    /** Maps the JSON of an INLINE cell schema to the Java class used to parse its values. */
    private static final ImmutableMap<String, Class<?>> FIJI_CELL_TYPE_TO_CLASS_MAP =
            new ImmutableMap.Builder<String, Class<?>>()
                    .put("\"boolean\"", Boolean.class)
                    .put("\"int\"", Integer.class)
                    .put("\"long\"", Long.class)
                    .put("\"float\"", Float.class)
                    .put("\"double\"", Double.class)
                    .put("\"string\"", String.class)
                    .build();

    /** Maps an Avro schema type to the Java class used to parse values for AVRO cell schemas. */
    private static final ImmutableMap<Schema.Type, Class<?>> FIJI_AVRO_TYPE_TO_CLASS_MAP =
            new ImmutableMap.Builder<Schema.Type, Class<?>>()
                    .put(Schema.Type.BOOLEAN, Boolean.class)
                    .put(Schema.Type.INT, Integer.class)
                    .put(Schema.Type.LONG, Long.class)
                    .put(Schema.Type.FLOAT, Float.class)
                    .put(Schema.Type.DOUBLE, Double.class)
                    .put(Schema.Type.STRING, String.class)
                    .build();

    /** Number of lines to skip between logged reject/incomplete lines. */
    private Long mLogRate = 1000L;

    /** Current counter of the number of incomplete lines. */
    private Long mIncompleteLineCounter = 0L;

    /** Current counter of the number of rejected lines. */
    private Long mRejectedLineCounter = 0L;

    /** Table layout of the output table. */
    private FijiTableLayout mOutputTableLayout;

    /** Table import descriptor for this bulk load. */
    private FijiTableImportDescriptor mTableImportDescriptor;

    /** Map from FijiColumnName to the Java class of the cell type. */
    private Map<FijiColumnName, Class<?>> mColumnNameClassMap;

    /**
     * {@inheritDoc}
     *
     * <p>If you override this method, you must call <tt>super.setConf(conf)</tt>.</p>
     */
    @Override
    public void setConf(Configuration conf) {
        super.setConf(conf);
    }

    /**
     * Performs validation that this table import descriptor can be applied to the output table.
     *
     * This method is final so that the validation cannot be skipped by an override.
     * Subclasses that need custom setup should override {@link #setupImporter} instead.
     *
     * {@inheritDoc}
     */
    @Override
    public final void setup(FijiTableContext context) throws IOException {
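        // Populate the @HadoopConf-annotated fields (the log rate and the input descriptor
        // path) from the job Configuration before validating against the output table layout.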
        HadoopConfigurator.configure(this);
        final Configuration conf = getConf();
        Preconditions.checkNotNull(mTableImportDescriptor);

        final FijiURI uri = FijiURI.newBuilder(conf.get(FijiConfKeys.FIJI_OUTPUT_TABLE_URI)).build();

        final Fiji fiji = Fiji.Factory.open(uri, conf);
        final FijiSchemaTable schemaTable = fiji.getSchemaTable();
        try {
            final FijiTable table = fiji.openTable(uri.getTable());
            try {
                mOutputTableLayout = table.getLayout();
            } finally {
                table.release();
            }

            Preconditions.checkNotNull(mOutputTableLayout);
            mTableImportDescriptor.validateDestination(mOutputTableLayout);

            // Retrieve the classes for all of the imported columns.
            Map<FijiColumnName, Class<?>> columnNameClassMap = Maps.newHashMap();
            for (FijiColumnName fijiColumnName : mTableImportDescriptor.getColumnNameSourceMap().keySet()) {
                CellSchema cellSchema = mOutputTableLayout.getCellSchema(fijiColumnName);
                switch (cellSchema.getType()) {
                case AVRO:
                    // Since this is for prepackaged generic bulk importers, we can assume that we
                    // want to use the default reader schema for determining the type to write as.
                    Schema.Type schemaType;
                    AvroSchema as = cellSchema.getDefaultReader();
                    if (as.getUid() != null) {
                        Schema schema = schemaTable.getSchema(as.getUid());
                        schemaType = schema.getType();
                    } else if (as.getJson() != null) {
                        Schema schema = new Schema.Parser().parse(as.getJson());
                        schemaType = schema.getType();
                    } else {
                        throw new IOException("Schema is not a UID or JSON type.");
                    }
                    if (FIJI_AVRO_TYPE_TO_CLASS_MAP.containsKey(schemaType)) {
                        columnNameClassMap.put(fijiColumnName, FIJI_AVRO_TYPE_TO_CLASS_MAP.get(schemaType));
                    } else {
                        throw new IOException("Unsupported described output type: " + cellSchema.getValue());
                    }
                    break;
                case INLINE:
                    if (FIJI_CELL_TYPE_TO_CLASS_MAP.containsKey(cellSchema.getValue())) {
                        columnNameClassMap.put(fijiColumnName,
                                FIJI_CELL_TYPE_TO_CLASS_MAP.get(cellSchema.getValue()));
                    } else {
                        throw new IOException("Unsupported described output type: " + cellSchema.getValue());
                    }
                    break;
                case CLASS:
                default:
                    throw new IOException("Unsupported described output type: " + cellSchema.getType());
                }
            }
            mColumnNameClassMap = ImmutableMap.copyOf(columnNameClassMap);
        } finally {
            ResourceUtils.releaseOrLog(fiji);
        }

        setupImporter(context);
    }

    /**
     * Extensible version of {@link FijiBulkImporter#setup} for subclasses of
     * DescribedInputTextBulkImporter.
     * Does nothing by default.
     *
     * @param context A context you can use to generate EntityIds and commit writes.
     * @throws IOException on I/O error.
     */
    public void setupImporter(FijiTableContext context) throws IOException {
    }

    /**
     * Converts a line of text to a set of writes to <code>context</code>, and
     * an EntityId for the row.
     *
     * @param line The line to parse.
     * @param context The context to write to.
     * @throws IOException if there is an error.
     */
    public abstract void produce(Text line, FijiTableContext context) throws IOException;

    /**
     * Post-processes incomplete lines (logging, keeping count, etc.).
     *
     * @param line the line that was marked incomplete by the producer.
     * @param context the context in which the incompletion occurred.
     * @param reason the reason why this line was incomplete.
     */
    public void incomplete(Text line, FijiTableContext context, String reason) {
        if (mIncompleteLineCounter % mLogRate == 0L) {
            LOG.error("Incomplete line: {} with reason: {}", line.toString(), reason);
        }
        mIncompleteLineCounter++;

        //TODO(FIJIMRLIB-9) Abort this bulk importer job early if incomplete records exceed a threshold
        context.incrementCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_INCOMPLETE);

        //TODO(FIJIMRLIB-4) Add a strict mode where we reject incomplete lines
    }

    /**
     * Post-processes rejected lines (logging, keeping count, etc.).
     *
     * @param line the line that was rejected by the producer.
     * @param context the context in which the rejection occurred.
     * @param reason the reason why this line was rejected.
     */
    public void reject(Text line, FijiTableContext context, String reason) {
        if (mRejectedLineCounter % mLogRate == 0L) {
            LOG.error("Rejecting line: {} with reason: {}", line.toString(), reason);
        }
        mRejectedLineCounter++;

        //TODO(FIJIMRLIB-9) Abort this bulk importer job early if rejected records exceed a threshold
        context.incrementCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_REJECTED);

        //TODO(FIJIMRLIB-4) Allow this to emit to a rejected output so that import can be reattempted.
    }

    /**
     * Converts the value into an object of the type associated with the specified column.
     *
     * @param fijiColumnName the destination column to infer the type from.
     * @param value the string representation of the value.
     * @return an object containing the parsed representation of the value, or the raw string
     *     if the column's type is not mapped.
     */
    public Object convert(FijiColumnName fijiColumnName, String value) {
        Class<?> clazz = mColumnNameClassMap.get(fijiColumnName);
        if (clazz == Boolean.class) {
            return Boolean.valueOf(value);
        } else if (clazz == Integer.class) {
            return Integer.valueOf(value);
        } else if (clazz == Long.class) {
            return Long.valueOf(value);
        } else if (clazz == Float.class) {
            return Float.valueOf(value);
        } else if (clazz == Double.class) {
            return Double.valueOf(value);
        }
        // String columns, and any column whose type is not mapped, keep the raw string value.
        return value;
    }

    /**
     * Subclasses should implement the {@link #produce(Text, FijiTableContext)} method instead.
     * {@inheritDoc}
     */
    @Override
    public final void produce(LongWritable fileOffset, Text line, FijiTableContext context) throws IOException {
        produce(line, context);
    }

    /** @return an unmodifiable collection of the columns for this bulk importer. */
    protected final Collection<FijiColumnName> getDestinationColumns() {
        Set<FijiColumnName> columns = mTableImportDescriptor.getColumnNameSourceMap().keySet();
        return Collections.unmodifiableSet(columns);
    }

    /**
     * Returns the source for the specified column, or null if the specified column is not a
     * destination column for this importer.
     *
     * @param fijiColumnName the requested Fiji column
     * @return the source for the requested column
     */
    protected final String getSource(FijiColumnName fijiColumnName) {
        return mTableImportDescriptor.getColumnNameSourceMap().get(fijiColumnName);
    }

    /** @return the source for the EntityId. */
    protected final String getEntityIdSource() {
        return mTableImportDescriptor.getEntityIdSource();
    }

    /** @return whether the import descriptor specifies a timestamp source that overrides the system time. */
    protected final boolean isOverrideTimestamp() {
        return null != mTableImportDescriptor.getOverrideTimestampSource();
    }

    /** @return the source field for the timestamp, or null if no override is specified. */
    protected final String getTimestampSource() {
        return mTableImportDescriptor.getOverrideTimestampSource();
    }

    /**
     * Subclasses should implement the {@link #cleanupImporter(FijiTableContext)} method instead.
     * {@inheritDoc}
     */
    @Override
    public final void cleanup(FijiTableContext context) throws IOException {
        cleanupImporter(context);
    }

    /**
     * Extensible version of {@link FijiBulkImporter#cleanup} for subclasses of
     * DescribedInputTextBulkImporter.
     * Does nothing by default.
     *
     * @param context A context you can use to generate EntityIds and commit writes.
     * @throws IOException on I/O error.
     */
    public void cleanupImporter(FijiTableContext context) throws IOException {
    }

    /**
     * Sets the log rate - the number of lines between log statements for incomplete/rejected lines.
     *
     * @param logRateString The logging rate as a string.
     */
    @HadoopConf(key = CONF_LOG_RATE, usage = "The number of lines to skip between log statements")
    protected final void setLogRate(String logRateString) {
        if (logRateString != null) {
            try {
                mLogRate = Long.parseLong(logRateString);
            } catch (NumberFormatException ne) {
                LOG.warn("Unable to parse log rate: {}", logRateString);
            }
        }
    }

    /**
     * Sets the path to the text input descriptor file and parses it.
     *
     * @param inputDescriptorFile The input descriptor path.
     * @throws RuntimeException if there's an error reading or parsing the input descriptor.
     */
    @HadoopConf(key = CONF_FILE, usage = "The input descriptor file.")
    protected final void setInputDescriptorPath(String inputDescriptorFile) {
        if (null == inputDescriptorFile || inputDescriptorFile.isEmpty()) {
            // Remind the user to specify this path.
            LOG.error("No input-descriptor path specified.");
            throw new RuntimeException("No input descriptor file specified on the Configuration."
                    + "  Did you specify the " + CONF_FILE + " variable?");
        }

        Path descriptorPath = new Path(inputDescriptorFile);
        try {
            LOG.info("Parsing input-descriptor file: {}", descriptorPath);
            FileSystem fs = descriptorPath.getFileSystem(getConf());
            FSDataInputStream inputStream = fs.open(descriptorPath);
            try {
                mTableImportDescriptor =
                        FijiTableImportDescriptor.createFromEffectiveJson(inputStream);
            } finally {
                // Ensure the descriptor stream is closed even if parsing fails.
                inputStream.close();
            }
        } catch (IOException ioe) {
            LOG.error("Could not read input-descriptor file: {}", descriptorPath);
            throw new RuntimeException("Could not read file: " + descriptorPath.toString(), ioe);
        }
    }
}
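
Example usage

For illustration, here is a minimal sketch of a concrete importer built on the class above,
followed by the configuration it would need. The colon delimiter, the assumed field order, the
ColonDelimitedBulkImporter name, and the exact FijiTableContext calls (getEntityId, put) are
assumptions made for this example rather than guarantees about the Fiji API; check the
FijiTableContext javadoc before relying on them.

package com.moz.fiji.mapreduce.lib.bulkimport.examples;

import java.io.IOException;
import java.util.Map;

import com.google.common.collect.Maps;
import org.apache.hadoop.io.Text;

import com.moz.fiji.mapreduce.FijiTableContext;
import com.moz.fiji.mapreduce.lib.bulkimport.DescribedInputTextBulkImporter;
import com.moz.fiji.schema.EntityId;
import com.moz.fiji.schema.FijiColumnName;

/**
 * Example sketch: imports colon-delimited text lines.  The mapping from source field names
 * to destination Fiji columns comes entirely from the import descriptor configured via
 * fiji.import.text.input.descriptor.path.
 */
public class ColonDelimitedBulkImporter extends DescribedInputTextBulkImporter {
    /** Assumed field order of the example input lines: user:name:email. */
    private static final String[] FIELD_ORDER = {"user", "name", "email"};

    /** {@inheritDoc} */
    @Override
    public void produce(Text line, FijiTableContext context) throws IOException {
        final String[] split = line.toString().split(":");
        if (split.length != FIELD_ORDER.length) {
            reject(line, context, "Expected " + FIELD_ORDER.length + " colon-delimited fields.");
            return;
        }

        // Index this line's values by the source field names used in the import descriptor.
        final Map<String, String> fields = Maps.newHashMap();
        for (int i = 0; i < FIELD_ORDER.length; i++) {
            fields.put(FIELD_ORDER[i], split[i]);
        }

        // Derive the row's entity id from the source field named by the descriptor.
        final EntityId entityId = context.getEntityId(fields.get(getEntityIdSource()));

        // Write every destination column, converting the raw text to the column's declared type.
        for (FijiColumnName column : getDestinationColumns()) {
            final String raw = fields.get(getSource(column));
            if (raw == null) {
                incomplete(line, context, "Missing value for column " + column.getName());
                continue;
            }
            context.put(entityId, column.getFamily(), column.getQualifier(), convert(column, raw));
        }
    }
}

Before the job runs, the Configuration must carry the descriptor path (and, optionally, a log
rate). A sketch, assuming a Hadoop Job named job is being set up:

    Configuration conf = job.getConfiguration();
    conf.set(DescribedInputTextBulkImporter.CONF_FILE, "hdfs:///path/to/import-descriptor.json");
    conf.set(DescribedInputTextBulkImporter.CONF_LOG_RATE, "5000");  // Log every 5000th bad line.

The class comment above names FijiBulkImportJobBuilder and its withBulkImporter method as the
intended way to wire such a subclass into a bulk import job.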