org.kiji.mapreduce.lib.bulkimport.CSVBulkImporter.java Source code

Java tutorial

Introduction

Here is the source code for org.kiji.mapreduce.lib.bulkimport.CSVBulkImporter.java.

Source

/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.lib.bulkimport;

import java.io.IOException;
import java.text.ParseException;
import java.util.List;
import java.util.Map;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kiji.annotations.ApiAudience;
import org.kiji.hadoop.configurator.HadoopConf;
import org.kiji.mapreduce.KijiTableContext;
import org.kiji.mapreduce.lib.util.CSVParser;
import org.kiji.schema.EntityId;
import org.kiji.schema.KijiColumnName;

/**
 * Bulk importer that handles comma separated files.  TSVs are also supported by setting the
 * <code>kiji.import.text.field.separator</code> configuration item specified by
 * {@link #CONF_FIELD_DELIMITER}.
 *
 * For import files that do not contain a header row, a default can be specified by setting the
 * <code>kiji.import.text.column.header_row</code> configuration item specified by
 * {@link #CONF_INPUT_HEADER_ROW}.
 */
@ApiAudience.Public
public final class CSVBulkImporter extends DescribedInputTextBulkImporter {
    private static final Logger LOG = LoggerFactory.getLogger(CSVBulkImporter.class);

    /** Configuration variable for a header row containing delimited string of names of fields. */
    public static final String CONF_INPUT_HEADER_ROW = "kiji.import.text.column.header_row";

    /** Configuration variable that specifies the cell value separator in the text input files. */
    public static final String CONF_FIELD_DELIMITER = "kiji.import.text.field.separator";

    private static final String CSV_DELIMITER = ",";
    private static final String TSV_DELIMITER = "\t";

    /** The string that separates the columns of data in the input file. */
    @HadoopConf(key = CONF_FIELD_DELIMITER)
    private String mColumnDelimiter = CSV_DELIMITER;

    /**
     * Internal map of field names to field positions in the parsed line.
     * Remains null until a header row is seen (either from configuration in setupImporter(),
     * or from the first line handed to produce()).
     */
    private Map<String, Integer> mFieldMap = null;

    /** {@inheritDoc} */
    @Override
    public void setupImporter(KijiTableContext context) throws IOException {
        // Validate that the passed in delimiter is one of the supported options.
        List<String> validDelimiters = Lists.newArrayList(CSV_DELIMITER, TSV_DELIMITER);
        if (!validDelimiters.contains(mColumnDelimiter)) {
            throw new IOException(String.format("Invalid delimiter '%s' specified.  Valid options are: '%s'",
                    mColumnDelimiter, StringUtils.join(validDelimiters, "','")));
        }

        // If the header row is specified in the configuration, use it.  Otherwise the first
        // line seen by produce() will be treated as the header.
        String confInputHeaderRow = getConf().get(CONF_INPUT_HEADER_ROW);
        if (confInputHeaderRow != null) {
            initializeHeader(parseHeader(confInputHeaderRow));
        }
    }

    /**
     * Parses a header row into its component field names, converting parse failures into
     * IOExceptions with the original ParseException preserved as the cause.
     *
     * @param headerLine the raw header line to split on the configured delimiter.
     * @return the list of field names contained in the header.
     * @throws IOException if the header line cannot be parsed.
     */
    private List<String> parseHeader(String headerLine) throws IOException {
        try {
            return split(headerLine);
        } catch (ParseException pe) {
            LOG.error("Unable to parse header row: {} with exception {}", headerLine, pe.getMessage());
            // Chain the ParseException so the underlying parse failure is not lost.
            throw new IOException("Unable to parse header row: " + headerLine, pe);
        }
    }

    /**
     * Initializes the field to column position mapping for this file.
     * @param headerFields the header fields for this delimited file.
     */
    private void initializeHeader(List<String> headerFields) {
        LOG.info("Initializing field map with fields: " + StringUtils.join(headerFields, ","));
        Map<String, Integer> fieldMap = Maps.newHashMap();
        for (int index = 0; index < headerFields.size(); index++) {
            fieldMap.put(headerFields.get(index), index);
        }
        // Freeze the mapping; it must not change for the remainder of the file.
        mFieldMap = ImmutableMap.copyOf(fieldMap);
    }

    /**
     * Wrapper around CSV or TSV parsers based on the configuration of this job builder.
     * @return a list of fields split by the mColumnDelimiter.
     * @param line the line to split
     * @throws ParseException if the parser encounters an error while parsing
     */
    private List<String> split(String line) throws ParseException {
        if (CSV_DELIMITER.equals(mColumnDelimiter)) {
            return CSVParser.parseCSV(line);
        } else if (TSV_DELIMITER.equals(mColumnDelimiter)) {
            return CSVParser.parseTSV(line);
        }
        throw new ParseException("Unrecognized delimiter: " + mColumnDelimiter, 0);
    }

    /**
     * Generates the entity id for this imported line.
     * Called within the produce() method.
     * This implementation returns the field named by getEntityIdSource() as the EntityId.
     * Override this method to specify a different EntityId during the produce() method.
     *
     * @param fields One line of input text split on the column delimiter.
     * @param context The context used by the produce() method.
     * @return The EntityId for the data that gets imported by this line.
     */
    protected EntityId getEntityId(List<String> fields, KijiTableContext context) {
        //TODO(KIJIMRLIB-3) Extend this to support composite row key ids
        String rowkey = fields.get(mFieldMap.get(getEntityIdSource()));
        return context.getEntityId(rowkey);
    }

    /** {@inheritDoc} */
    @Override
    public void produce(Text value, KijiTableContext context) throws IOException {
        // A null field map means no header has been seen yet: this line is the header row.
        if (mFieldMap == null) {
            initializeHeader(parseHeader(value.toString()));
            // The header row carries no data, so don't import it.
            return;
        }

        final List<String> fields;
        try {
            fields = split(value.toString());
        } catch (ParseException pe) {
            // A malformed data line is rejected rather than failing the whole job.
            reject(value, context, pe.toString());
            return;
        }

        // The entity id depends only on the line, not the destination column, so compute it once.
        final EntityId eid = getEntityId(fields, context);
        for (KijiColumnName kijiColumnName : getDestinationColumns()) {
            String source = getSource(kijiColumnName);
            Integer fieldPosition = mFieldMap.get(source);
            if (fieldPosition == null) {
                // A source field absent from the header is a configuration error, not a bad row;
                // fail loudly instead of throwing a bare NullPointerException.
                throw new IOException(String.format(
                        "Field '%s' for column '%s' not found in header row.", source, kijiColumnName));
            }

            if (fieldPosition < fields.size()) {
                String fieldValue = fields.get(fieldPosition);
                context.put(eid, kijiColumnName.getFamily(), kijiColumnName.getQualifier(), fieldValue);
            } else {
                incomplete(value, context, "Detected trailing empty field: " + source);
            }
        }
    }
}