com.cloudera.cdk.morphline.json.ExtractJsonPathsBuilder.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.cdk.morphline.json.ExtractJsonPathsBuilder.java

Source

/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.morphline.json;

import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import com.cloudera.cdk.morphline.api.Command;
import com.cloudera.cdk.morphline.api.CommandBuilder;
import com.cloudera.cdk.morphline.api.MorphlineCompilationException;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.AbstractCommand;
import com.cloudera.cdk.morphline.base.Fields;
import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.typesafe.config.Config;

/**
 * Command that uses zero or more JSON path expressions to extract values from a JSON object.
 * 
 * The JSON input object is expected to be contained in the {@link Fields#ATTACHMENT_BODY}
 * 
 * Each expression consists of a record output field name (on the left side of the colon ':') as
 * well as zero or more path steps (on the right hand side), each path step separated by a '/'
 * slash. JSON arrays are traversed with the '[]' notation.
 * 
 * The result of a path expression is a list of objects, each of which is added to the given record
 * output field.
 * 
 * The path language supports all JSON concepts, including nested structures, records, arrays, etc,
 * as well as a flatten option that collects the primitives in a subtree into a flat list.
 */
public final class ExtractJsonPathsBuilder implements CommandBuilder {

    @Override
    public Collection<String> getNames() {
        return Collections.singletonList("extractJsonPaths");
    }

    @Override
    public Command build(Config config, Command parent, Command child, MorphlineContext context) {
        return new ExtractJsonPaths(config, parent, child, context);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Nested classes:
    ///////////////////////////////////////////////////////////////////////////////
    private static final class ExtractJsonPaths extends AbstractCommand {

        private final boolean flatten;
        private final Map<String, Collection<String>> stepMap;

        private static final String ARRAY_TOKEN = "[]";

        public ExtractJsonPaths(Config config, Command parent, Command child, MorphlineContext context) {
            super(config, parent, child, context);
            ListMultimap<String, String> stepMultiMap = ArrayListMultimap.create();
            this.flatten = getConfigs().getBoolean(config, "flatten", true);
            Config paths = getConfigs().getConfig(config, "paths");
            for (Map.Entry<String, Object> entry : paths.root().unwrapped().entrySet()) {
                String fieldName = entry.getKey();
                String path = entry.getValue().toString().trim();
                if (path.contains("//")) {
                    throw new MorphlineCompilationException("No support for descendant axis available yet", config);
                }
                if (path.startsWith("/")) {
                    path = path.substring(1);
                }
                if (path.endsWith("/")) {
                    path = path.substring(0, path.length() - 1);
                }
                path = path.trim();
                for (String step : path.split("/")) {
                    step = step.trim();
                    if (step.length() > ARRAY_TOKEN.length() && step.endsWith(ARRAY_TOKEN)) {
                        step = step.substring(0, step.length() - ARRAY_TOKEN.length());
                        stepMultiMap.put(fieldName, normalize(step));
                        stepMultiMap.put(fieldName, ARRAY_TOKEN);
                    } else {
                        stepMultiMap.put(fieldName, normalize(step));
                    }
                }
            }
            this.stepMap = stepMultiMap.asMap();
            LOG.debug("stepMap: {}", stepMap);
            validateArguments();
        }

        private String normalize(String step) { // for faster subsequent query performance
            return ARRAY_TOKEN.equals(step) ? ARRAY_TOKEN : step;
        }

        @Override
        protected boolean doProcess(Record inputRecord) {
            JsonNode datum = (JsonNode) inputRecord.getFirstValue(Fields.ATTACHMENT_BODY);
            Preconditions.checkNotNull(datum);
            Record outputRecord = inputRecord.copy();

            for (Map.Entry<String, Collection<String>> entry : stepMap.entrySet()) {
                String fieldName = entry.getKey();
                List<String> steps = (List<String>) entry.getValue();
                extractPath(datum, fieldName, steps, outputRecord, 0);
            }

            // pass record to next command in chain:
            return getChild().process(outputRecord);
        }

        private void extractPath(JsonNode datum, String fieldName, List<String> steps, Record record, int level) {
            if (level >= steps.size()) {
                return;
            }
            boolean isLeaf = (level + 1 == steps.size());
            String step = steps.get(level);
            if (ARRAY_TOKEN == step) {
                if (datum.isArray()) {
                    if (isLeaf) {
                        resolve(datum, record, fieldName);
                    } else {
                        Iterator<JsonNode> iter = datum.elements();
                        while (iter.hasNext()) {
                            extractPath(iter.next(), fieldName, steps, record, level + 1);
                        }
                    }
                }
            } else if (datum.isObject()) {
                JsonNode value = datum.get(step);
                if (value != null) {
                    if (isLeaf) {
                        resolve(value, record, fieldName);
                    } else {
                        extractPath(value, fieldName, steps, record, level + 1);
                    }
                }
            }
        }

        private void resolve(JsonNode datum, Record record, String fieldName) {
            if (datum == null) {
                return;
            }

            if (flatten) {
                flatten(datum, record.get(fieldName));
                return;
            }

            if (datum.isObject()) {
                record.put(fieldName, datum);
            } else if (datum.isArray()) {
                record.put(fieldName, datum);
            } else if (datum.isTextual()) {
                record.put(fieldName, datum.asText());
            } else if (datum.isBoolean()) {
                record.put(fieldName, datum.asBoolean());
            } else if (datum.isInt()) {
                record.put(fieldName, datum.asInt());
            } else if (datum.isLong()) {
                record.put(fieldName, datum.asLong());
            } else if (datum.isShort()) {
                record.put(fieldName, datum.shortValue());
            } else if (datum.isDouble()) {
                record.put(fieldName, datum.asDouble());
            } else if (datum.isFloat()) {
                record.put(fieldName, datum.floatValue());
            } else if (datum.isBigInteger()) {
                record.put(fieldName, datum.bigIntegerValue());
            } else if (datum.isBigDecimal()) {
                record.put(fieldName, datum.decimalValue());
            } else if (datum.isNull()) {
                ; // ignore
            } else {
                record.put(fieldName, datum.toString());
            }
        }

        private void flatten(JsonNode datum, List list) {
            if (datum == null) {
                return;
            }

            if (datum.isObject()) {
                for (JsonNode child : datum) {
                    flatten(child, list);
                }
            } else if (datum.isArray()) {
                Iterator<JsonNode> iter = datum.elements();
                while (iter.hasNext()) {
                    flatten(iter.next(), list);
                }
            } else if (datum.isTextual()) {
                list.add(datum.asText());
            } else if (datum.isBoolean()) {
                list.add(datum.asBoolean());
            } else if (datum.isInt()) {
                list.add(datum.asInt());
            } else if (datum.isLong()) {
                list.add(datum.asLong());
            } else if (datum.isShort()) {
                list.add(datum.shortValue());
            } else if (datum.isDouble()) {
                list.add(datum.asDouble());
            } else if (datum.isFloat()) {
                list.add(datum.floatValue());
            } else if (datum.isBigInteger()) {
                list.add(datum.bigIntegerValue());
            } else if (datum.isBigDecimal()) {
                list.add(datum.decimalValue());
            } else if (datum.isNull()) {
                ; // ignore
            } else {
                list.add(datum.toString());
            }
        }

    }

}