org.kiji.mapreduce.bulkimport.DelimitedJsonBulkImporter.java Source code

Introduction

Here is the source code for org.kiji.mapreduce.bulkimport.DelimitedJsonBulkImporter.java
Source

/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.bulkimport;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kiji.hadoop.configurator.HadoopConf;
import org.kiji.hadoop.configurator.HadoopConfigurator;
import org.kiji.mapreduce.KijiTableContext;
import org.kiji.schema.EntityId;

/**
 * <p>Bulk imports data from json-formatted delimited files to fully qualified kiji cells.
 * By default, the column values on each line are separated by a tab.  This can be set to
 * a custom character by setting the "kiji.import.text.column.separator" Configuration
 * variable.</p>
 *
 * <p>In general, all records in this file must be formatted as json.
 * For convenience, however, strings may be represented without quotes.
 * For example, both <i>"my-var"</i> and <i>my-var</i> are acceptable representations
 * for the String "my-var".</p>
 *
 * <p>NOTE(review): this class itself performs no JSON parsing; each token obtained by
 * splitting a line is handed to the table context as a plain <code>String</code>.
 * Any JSON interpretation presumably happens elsewhere -- confirm.</p>
 *
 * <p>This implementation uses the first entry on each line as the entity-id for
 * each datum following it.  Thus, it would be natural to omit this first entry
 * from being imported, since it is already used as the EntityId.
 * Override the <code>getEntityId()</code> method to specify a custom way of
 * generating EntityIds from a line of data.</p>
 *
 * @see org.kiji.mapreduce.bulkimport.TextInputDescriptorParser
 */
public class DelimitedJsonBulkImporter extends DescribedInputTextBulkImporter {
    private static final Logger LOG = LoggerFactory.getLogger(DelimitedJsonBulkImporter.class);

    /** Configuration variable that specifies the column value separator in the text input files. */
    public static final String CONF_COLUMN_DELIMITER = "kiji.import.text.column.separator";

    /** Configuration variable that specifies the cell value separator in the text input files. */
    public static final String CONF_CELL_DELIMITER = "kiji.import.text.cell.separator";

    /**
     * The string that separates the columns of data in the input file.
     * Defaults to a tab; overridden from the configuration key
     * {@link #CONF_COLUMN_DELIMITER} when <code>setConf()</code> is called.
     */
    @HadoopConf(key = CONF_COLUMN_DELIMITER)
    private String mColumnDelimiter = "\t";

    /**
     * The string that separates the qualifiers in the data within a single cell.
     * Defaults to a pipe; overridden from the configuration key
     * {@link #CONF_CELL_DELIMITER} when <code>setConf()</code> is called.
     */
    @HadoopConf(key = CONF_CELL_DELIMITER)
    private String mCellDelimiter = "|";

    /**
     * Exception thrown if a line can not be parsed to match the Schemas
     * described by DescribedInputTextBulkImporter.getColumnInfo().
     */
    protected static class MalformedLineException extends Exception {
        /** Exceptions are serializable; pin the serialized form explicitly. */
        private static final long serialVersionUID = 1L;

        /**
         * Constructor.
         *
         * @param message A message explaining this exception.
         */
        public MalformedLineException(String message) {
            super(message);
        }

        /**
         * Constructor.
         *
         * @param cause The specific cause.
         */
        public MalformedLineException(Throwable cause) {
            super(cause);
        }
    }

    /**
     * Stores the configuration in the superclass, then populates the
     * <code>@HadoopConf</code>-annotated fields of this instance from it.
     *
     * {@inheritDoc}
     */
    @Override
    public void setConf(Configuration conf) {
        super.setConf(conf);
        HadoopConfigurator.configure(this);
    }

    /**
     * <p>Splits each line on the delimiter returned by <code>getColumnDelimiter()</code>,
     * and writes each element to the column specified by
     * <code>getColumnInfo(i).getKijiColumnName()</code> for the matching position
     * <code>i</code>.  Positions whose column info reports <code>isEmpty()</code> are skipped.
     * Every write for a line uses the EntityId generated by <code>getEntityId()</code>.</p>
     *
     * <p>If a line does not split into exactly <code>getColumnInfo().size()</code> elements,
     * then this error is logged (once), and the line skipped.  No writes will be made for
     * this line, but the map task will continue.</p>
     *
     * {@inheritDoc}
     */
    @Override
    public void produce(Text line, KijiTableContext context) throws IOException {
        try {
            final String[] cells = readColumnData(line.toString());
            final EntityId entity = getEntityId(cells, context);
            // readColumnData() guarantees cells.length equals the column count,
            // so cells[i] below is always in bounds.
            final int columnCount = getColumnInfo().size();
            for (int i = 0; i < columnCount; i++) {
                if (getColumnInfo(i).isEmpty()) {
                    // No destination column is described for this position.
                    continue;
                }

                final String family = getColumnInfo(i).getKijiColumnName().getFamily();
                final String qualifier = getColumnInfo(i).getKijiColumnName().getQualifier();
                context.put(entity, family, qualifier, cells[i]);
            }
        } catch (MalformedLineException mle) {
            // Log error without failing the map task or writing this line.
            // This is the single place where a malformed line is reported.
            // TODO(WIBI-1655): Log this in a counter on the context.
            LOG.warn(mle.getMessage());
        }

        // TODO(WIBI-1654): Reinstate map-type bulk imports
        // (possibly in a different class?).
    }

    /**
     * Generates the entity id for this imported line.
     * Called within the produce() method.
     * This implementation returns the first entry in <code>entries</code> as the EntityId.
     * Override this method to specify a different EntityId during the produce() method.
     *
     * @param entries One line of input text split on the column delimiter.  This
     *     implementation reads <code>entries[0]</code>, so the array must not be empty.
     * @param context The context used by the produce() method.
     * @return The EntityId for the data that gets imported by this line.
     */
    protected EntityId getEntityId(String[] entries, KijiTableContext context) {
        return context.getEntityId(entries[0]);
    }

    /**
     * Parse the line into separate data for each column family.
     *
     * <p>The line is split with <code>StringUtils.splitPreserveAllTokens()</code>, so empty
     * values between adjacent delimiters are kept as empty strings.  Note that this call
     * treats every character of the delimiter string as a separator, so a multi-character
     * delimiter does not act as a single token.</p>
     *
     * <p>This method does not log; the caller decides how to report a malformed line.</p>
     *
     * @param line The line of input text from the file.
     * @return An array of column data, one element per described column.
     * @throws MalformedLineException if this line does not split to the same number
     *     of elements as <code>getColumnInfo().size()</code>.
     */
    private String[] readColumnData(String line) throws MalformedLineException {
        final String[] columnData =
                StringUtils.splitPreserveAllTokens(line, getColumnDelimiter());
        final int expected = getColumnInfo().size();
        if (columnData.length != expected) {
            // Line is malformed.  Throw a handlable exception so that the
            // produce() method can log the problem and skip this line.
            throw new MalformedLineException(
                    "Row with incorrect number of columns: " + line + "  Expected "
                    + expected + " but was " + columnData.length + ".");
        }
        return columnData;
    }

    /**
     * Returns the token used to separate column entries.
     *
     * @return The token used to separate column entries.
     */
    protected String getColumnDelimiter() {
        return mColumnDelimiter;
    }

    /**
     * Returns the token used to separate cell entries within a column (for map types).
     *
     * @return the token used to separate cell entries within a column (for map types).
     * @deprecated This class does not support map-type imports.
     */
    @Deprecated
    protected String getCellDelimiter() {
        return mCellDelimiter;
    }
}