Java tutorial
/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.moz.fiji.mapreduce.lib.bulkimport;

import java.io.IOException;
import java.text.ParseException;
import java.util.List;
import java.util.Map;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.moz.fiji.annotations.ApiAudience;
import com.moz.fiji.hadoop.configurator.HadoopConf;
import com.moz.fiji.mapreduce.FijiTableContext;
import com.moz.fiji.mapreduce.lib.util.CSVParser;
import com.moz.fiji.schema.EntityId;
import com.moz.fiji.schema.FijiColumnName;

/**
 * Bulk importer that handles comma separated files.  TSVs are also supported by setting the
 * <code>fiji.import.text.field.separator</code> configuration item specified by
 * {@link #CONF_FIELD_DELIMITER}.  This bulk importer uses
 * {@link com.moz.fiji.mapreduce.lib.util.CSVParser} for parsing lines into fields.
 *
 * A default header row can be specified by setting the
 * <code>fiji.import.text.column.header_row</code> configuration item specified by
 * {@link #CONF_INPUT_HEADER_ROW}.  If this is not specified, this bulk importer will infer
 * headers from the first line of text encountered.  Note that within a MapReduce job this is not
 * necessarily the first line of the file, and thus this parameter should be set.
 *
 * <h2>Creating a bulk import job for CSV:</h2>
 * <p>
 *   The CSV bulk importer can be passed into a
 *   {@link com.moz.fiji.mapreduce.bulkimport.FijiBulkImportJobBuilder}.  A
 *   {@link FijiTableImportDescriptor}, which defines the mapping from the import fields to the
 *   destination Fiji columns, must be passed in as part of the job configuration.  For writing
 *   to an HFile which can later be loaded with the <code>fiji bulk-load</code> tool, the job
 *   creation looks like:
 * </p>
 * <pre><code>
 *   // Set the import descriptor file to be used for this bulk importer.
 *   conf.set(DescribedInputTextBulkImporter.CONF_FILE, "foo-test-import-descriptor.json");
 *
 *   // Set the header line.
 *   conf.set(CSVBulkImporter.CONF_INPUT_HEADER_ROW, "first,last,email,phone");
 *
 *   // Configure and create the MapReduce job.
 *   final MapReduceJob job = FijiBulkImportJobBuilder.create()
 *       .withConf(conf)
 *       .withBulkImporter(CSVBulkImporter.class)
 *       .withInput(MapReduceJobInputs.newTextMapReduceJobInput(new Path(inputFile.toString())))
 *       .withOutput(MapReduceJobOutputs.newHFileMapReduceJobOutput(mOutputTable, hfileDirPath))
 *       .build();
 * </code></pre>
 * <p>
 *   Alternately, the bulk importer can be configured to write directly to a Fiji table.  This is
 *   <em>not recommended</em> because it generates individual puts for each cell that is being
 *   written.
 *   For small jobs or tests, a direct Fiji table output job can be created by swapping out the
 *   .withOutput parameter for:
 *   <code>.withOutput(MapReduceJobOutputs
 *       .newDirectFijiTableMapReduceJobOutput(mOutputTable))</code>
 * </p>
 *
 * @see FijiTableImportDescriptor
 */
@ApiAudience.Public
public final class CSVBulkImporter extends DescribedInputTextBulkImporter {
  private static final Logger LOG = LoggerFactory.getLogger(CSVBulkImporter.class);

  /** Configuration variable for a header row containing a delimited string of field names. */
  public static final String CONF_INPUT_HEADER_ROW = "fiji.import.text.column.header_row";

  /** Configuration variable that specifies the cell value separator in the text input files. */
  public static final String CONF_FIELD_DELIMITER = "fiji.import.text.field.separator";

  private static final String CSV_DELIMITER = ",";
  private static final String TSV_DELIMITER = "\t";

  /** The string that separates the columns of data in the input file. */
  @HadoopConf(key = CONF_FIELD_DELIMITER)
  private String mColumnDelimiter = CSV_DELIMITER;

  /** Internal map of field names to field positions in the parsed line. */
  private Map<String, Integer> mFieldMap = null;

  /** {@inheritDoc} */
  @Override
  public void setupImporter(FijiTableContext context) throws IOException {
    // Validate that the passed in delimiter is one of the supported options.
    List<String> validDelimiters = Lists.newArrayList(CSV_DELIMITER, TSV_DELIMITER);
    if (!validDelimiters.contains(mColumnDelimiter)) {
      throw new IOException(
          String.format("Invalid delimiter '%s' specified.  Valid options are: '%s'",
              mColumnDelimiter, StringUtils.join(validDelimiters, "','")));
    }

    // If the header row is specified in the configuration, use that.
    if (getConf().get(CONF_INPUT_HEADER_ROW) != null) {
      List<String> fields = null;
      String confInputHeaderRow = getConf().get(CONF_INPUT_HEADER_ROW);
      try {
        fields = split(confInputHeaderRow);
      } catch (ParseException pe) {
        LOG.error("Unable to parse header row: {} with exception {}",
            confInputHeaderRow, pe.getMessage());
        throw new IOException("Unable to parse header row: " + confInputHeaderRow);
      }
      initializeHeader(fields);
    }
  }

  /**
   * Initializes the field to column position mapping for this file.
   *
   * @param headerFields the header fields for this delimited file.
   */
  private void initializeHeader(List<String> headerFields) {
    LOG.info("Initializing field map with fields: " + StringUtils.join(headerFields, ","));
    Map<String, Integer> fieldMap = Maps.newHashMap();
    for (int index = 0; index < headerFields.size(); index++) {
      fieldMap.put(headerFields.get(index), index);
    }
    mFieldMap = ImmutableMap.copyOf(fieldMap);
  }

  /**
   * Wrapper around the CSV or TSV parsers based on the configuration of this job builder.
   *
   * @param line the line to split
   * @return a list of fields split by the mColumnDelimiter.
   * @throws ParseException if the parser encounters an error while parsing
   */
  private List<String> split(String line) throws ParseException {
    if (CSV_DELIMITER.equals(mColumnDelimiter)) {
      return CSVParser.parseCSV(line);
    } else if (TSV_DELIMITER.equals(mColumnDelimiter)) {
      return CSVParser.parseTSV(line);
    }
    throw new ParseException("Unrecognized delimiter: " + mColumnDelimiter, 0);
  }

  /**
   * Generates the entity id for this imported line using the source from the import descriptor.
   * Called within the produce() method.
   *
   * @param fields One line of input text split on the column delimiter.
   * @param context The context used by the produce() method.
   * @return The EntityId for the data that gets imported by this line.
   */
  protected EntityId getEntityId(List<String> fields, FijiTableContext context) {
    // TODO(FIJIMRLIB-3): Extend this to support composite row key ids.
    String rowkey = fields.get(mFieldMap.get(getEntityIdSource()));
    return context.getEntityId(rowkey);
  }

  /**
   * Generates the timestamp for this imported line using the source from the import descriptor.
   * Called within the produce() method.
   *
   * @param fields One line of input text split on the column delimiter.
   * @return The timestamp to be used for this row of data.
   */
  protected Long getTimestamp(List<String> fields) {
    String timestampString = fields.get(mFieldMap.get(getTimestampSource()));
    Long timestamp = Long.parseLong(timestampString);
    return timestamp;
  }

  /** {@inheritDoc} */
  @Override
  public void produce(Text value, FijiTableContext context) throws IOException {
    // Treat this as the header line, since mFieldMap has not been populated yet.
    if (mFieldMap == null) {
      List<String> fields = null;
      try {
        fields = split(value.toString());
      } catch (ParseException pe) {
        LOG.error("Unable to parse header row: {} with exception {}",
            value.toString(), pe.getMessage());
        throw new IOException("Unable to parse header row: " + value.toString());
      }
      initializeHeader(fields);
      // Don't actually import this line.
      return;
    }

    List<String> fields = null;
    try {
      fields = split(value.toString());
    } catch (ParseException pe) {
      reject(value, context, pe.toString());
      return;
    }

    List<String> emptyFields = Lists.newArrayList();
    for (FijiColumnName fijiColumnName : getDestinationColumns()) {
      final EntityId eid = getEntityId(fields, context);
      String source = getSource(fijiColumnName);

      if (mFieldMap.get(source) < fields.size()) {
        String fieldValue = fields.get(mFieldMap.get(source));
        if (!fieldValue.isEmpty()) {
          String family = fijiColumnName.getFamily();
          String qualifier = fijiColumnName.getQualifier();
          if (isOverrideTimestamp()) {
            // Override the timestamp from the imported source.
            Long timestamp = getTimestamp(fields);
            context.put(eid, family, qualifier, timestamp, convert(fijiColumnName, fieldValue));
          } else {
            // Use the system time as the timestamp.
            context.put(eid, family, qualifier, convert(fijiColumnName, fieldValue));
          }
        } else {
          emptyFields.add(source);
        }
      }
    }
    if (!emptyFields.isEmpty()) {
      incomplete(value, context,
          "Record is missing fields: " + StringUtils.join(emptyFields, ","));
    }
  }
}
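The class Javadoc above shows an HFile-backed job with the default comma delimiter. The sketch below combines the two variations the Javadoc mentions but does not show together: tab-separated input (via CONF_FIELD_DELIMITER) and direct writes to the Fiji table. It is a minimal, untested sketch that assumes the same conf, inputFile, and mOutputTable variables as the Javadoc example, with the Hadoop Path class and the FijiMR job-builder classes already imported.

// Sketch only: conf, inputFile, and mOutputTable are assumed to be set up as in the
// class Javadoc example above.

// Reuse the import descriptor from the Javadoc example.
conf.set(DescribedInputTextBulkImporter.CONF_FILE, "foo-test-import-descriptor.json");

// Provide a tab-separated header row and switch the parser to TSV
// ("\t" matches the importer's TSV_DELIMITER).
conf.set(CSVBulkImporter.CONF_INPUT_HEADER_ROW, "first\tlast\temail\tphone");
conf.set(CSVBulkImporter.CONF_FIELD_DELIMITER, "\t");

// Build the job with direct table output instead of HFile output. As the Javadoc notes,
// this issues one put per cell, so prefer HFile output plus the fiji bulk-load tool for
// large imports.
final MapReduceJob job = FijiBulkImportJobBuilder.create()
    .withConf(conf)
    .withBulkImporter(CSVBulkImporter.class)
    .withInput(MapReduceJobInputs.newTextMapReduceJobInput(new Path(inputFile.toString())))
    .withOutput(MapReduceJobOutputs.newDirectFijiTableMapReduceJobOutput(mOutputTable))
    .build();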