com.moz.fiji.mapreduce.lib.bulkimport.JSONBulkImporter.java Source code

Introduction

Here is the source code for com.moz.fiji.mapreduce.lib.bulkimport.JSONBulkImporter.java
Source

/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.moz.fiji.mapreduce.lib.bulkimport;

import java.io.IOException;

import com.google.common.base.Preconditions;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.moz.fiji.annotations.ApiAudience;
import com.moz.fiji.mapreduce.FijiTableContext;
import com.moz.fiji.schema.EntityId;
import com.moz.fiji.schema.FijiColumnName;

/**
 * Bulk importer that handles JSON files.  The expected JSON file should be a set of records
 * separated by new lines
 * (see {@link org.apache.hadoop.mapreduce.lib.input.TextInputFormat TextInputFormat}).
 * Each line represents a separate JSON object to be imported into a row.  Target
 * columns whose sources are not present in the JSON object are skipped.  This bulk importer uses
 * <a href="http://code.google.com/p/google-gson/">google-gson</a> to parse lines into fields.
 *
 * Complex paths in JSON are specified by strings delimited with periods(.).
 *
 * <h2>Creating a bulk import job for JSON files:</h2>
 * <p>
 *   The bulk importer can be passed into a
 *   {@link com.moz.fiji.mapreduce.bulkimport.FijiBulkImportJobBuilder}.  A
 *   {@link FijiTableImportDescriptor}, which defines the mapping from the import fields to the
 *   destination Fiji columns, must be passed in as part of the job configuration.  For writing
 *   to an HFile which can later be loaded with the <code>fiji bulk-load</code> tool the job
 *   creation looks like:
 * </p>
 * <pre><code>
 *   // Set the import descriptor file to be used for this bulk importer.
 *   conf.set(DescribedInputTextBulkImporter.CONF_FILE, "foo-test-import-descriptor.json");
 *   // Configure and create the MapReduce job.
 *   final MapReduceJob job = FijiBulkImportJobBuilder.create()
 *       .withConf(conf)
 *       .withBulkImporter(JSONBulkImporter.class)
 *       .withInput(MapReduceJobInputs.newTextMapReduceJobInput(new Path(inputFile.toString())))
 *       .withOutput(MapReduceJobOutputs.newHFileMapReduceJobOutput(mOutputTable, hfileDirPath))
 *       .build();
 * </code></pre>
 * <p>
 *   Alternately the bulk importer can be configured to write directly to a Fiji Table.  This is
 *   <em>not recommended</em> because it generates individual puts for each cell that is being
 *   written. For small jobs or tests, a direct Fiji table output job can be created by modifying
 *   out the .withOutput parameter to:
 *   <code>.withOutput(MapReduceJobOutputs
 *       .newDirectFijiTableMapReduceJobOutput(mOutputTable))</code>
 * </p>
 *
 * @see FijiTableImportDescriptor
 */
@ApiAudience.Public
public final class JSONBulkImporter extends DescribedInputTextBulkImporter {
    private static final Logger LOG = LoggerFactory.getLogger(JSONBulkImporter.class);

    /**
     * Returns a string containing an element referenced by the specified path, or null if the
     * element isn't found.  This uses a period '.' delimited syntax similar to JSONPath
     * ({@linktourl http://goessner.net/articles/JsonPath/}).
     *
     * TODO(FIJIMRLIB-5) Use an enhanced JSONPath library for this functionality.
     *
     * @param head JsonObject that is the head of the current JSON tree.
     * @param path delimited by periods
     * @return string denoting the element at the specified path.
     */
    private String getFromPath(JsonObject head, String path) {
        Preconditions.checkNotNull(head);
        Preconditions.checkNotNull(path);

        // Split the path into components using the delimiter for tree traversal.
        String[] pathComponents = path.split("\\.");

        // After getting the path components traverse the json tree.
        JsonElement jsonElement = head;
        for (String pathComponent : pathComponents) {
            if (jsonElement.isJsonObject()) {
                JsonObject jsonObject = jsonElement.getAsJsonObject();
                if (jsonObject.has(pathComponent)) {
                    jsonElement = jsonObject.get(pathComponent);
                } else {
                    LOG.warn("Missing path component {} at current path {}.  Returning null.", pathComponent,
                            jsonObject);
                    return null;
                }
            }
        }
        if (jsonElement.isJsonPrimitive()) {
            return jsonElement.getAsString();
        }
        LOG.warn("Specified path {} is not complete for {}.  Returning null", path, head);
        return null;
    }

    /** {@inheritDoc} */
    @Override
    public void produce(Text value, FijiTableContext context) throws IOException {
        JsonObject gson = new JsonParser().parse(value.toString()).getAsJsonObject();

        for (FijiColumnName fijiColumnName : getDestinationColumns()) {
            String entityIdSource = getFromPath(gson, getEntityIdSource());
            if (entityIdSource == null) {
                LOG.error("Unable to retrieve entityId from source field: " + getEntityIdSource());
                return;
            }
            final EntityId eid = context.getEntityId(entityIdSource);
            String source = getSource(fijiColumnName);
            String fieldValue = getFromPath(gson, source);
            if (fieldValue != null) {
                String family = fijiColumnName.getFamily();
                String qualifier = fijiColumnName.getQualifier();
                if (isOverrideTimestamp()) {
                    // Override the timestamp from the imported source
                    String timestampSource = getFromPath(gson, getTimestampSource());
                    Long timestamp = Long.parseLong(timestampSource);
                    context.put(eid, family, qualifier, timestamp, convert(fijiColumnName, fieldValue));
                } else {
                    // Use the system time as the timestamp
                    context.put(eid, family, qualifier, convert(fijiColumnName, fieldValue));
                }
            } else {
                incomplete(value, context, "Detected missing field: " + source);
            }
        }
    }
}