org.icgc.dcc.release.job.export.util.SchemaGenerator.java Source code

Java tutorial

Introduction

Here is the source code for org.icgc.dcc.release.job.export.util.SchemaGenerator.java

Source

/*
 * Copyright (c) 2016 The Ontario Institute for Cancer Research. All rights reserved.                             
 *                                                                                                               
 * This program and the accompanying materials are made available under the terms of the GNU Public License v3.0.
 * You should have received a copy of the GNU General Public License along with                                  
 * this program. If not, see <http://www.gnu.org/licenses/>.                                                     
 *                                                                                                               
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY                           
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES                          
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT                           
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                                
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED                          
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;                               
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER                              
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN                         
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.icgc.dcc.release.job.export.util;

import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static java.lang.String.format;
import static java.util.stream.Collectors.toList;
import static org.icgc.dcc.common.core.util.stream.Collectors.toImmutableMap;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import lombok.Cleanup;
import lombok.SneakyThrows;
import lombok.val;

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.icgc.dcc.common.core.util.Splitters;
import org.icgc.dcc.release.job.export.model.ExportType;

import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Builds Spark SQL {@link StructType} schemas for export types from {@code .schema} definition files located on the
 * classpath under {@code /schemas}.
 * <p>
 * Every line of a definition file must have the form {@code <field name> : <data type name>}, where the data type
 * name is one of the names accepted by {@code parseDataType} (for example {@code StringType}). Fields that are keys
 * of {@code ExportType.getChildren()} and have no entry in the definition file are represented as arrays of the
 * child's own struct type, which is generated recursively.
 */
public class SchemaGenerator {

    /** Classpath directory that holds one {@code <export type id>.schema} file per export type. */
    private static final String SCHEMA_DEF_DIR = "/schemas";

    /**
     * Creates the Spark struct type describing {@code exportType}.
     * <p>
     * The resulting struct contains the fields declared in the export type's schema file plus one field per child of
     * the export type. Fields are ordered by the natural ordering of their names and are all nullable.
     *
     * @param exportType export type to generate the schema for
     * @return struct type with one nullable field per schema-file field and per child export type
     * @throws NullPointerException if the schema file is missing from the classpath
     * @throws IllegalStateException if a line of the schema file is not a {@code <key> : <value>} pair
     * @throws IllegalArgumentException if the schema file references an unknown data type name
     */
    public StructType createDataType(ExportType exportType) {
        val fileSchemaDef = getFileSchemaDefinitions(exportType);
        val dataTypes = convertToDataType(fileSchemaDef);
        val fieldNames = getSortedFieldNames(exportType, fileSchemaDef);

        val structFields = fieldNames.stream().map(fieldName -> {
            DataType type = dataTypes.get(fieldName);
            if (type == null) {
                // Not declared in the schema file, so the field must come from a child export type.
                // Children are modelled as an array of the child's (recursively generated) struct type.
                ExportType childType = resolveChildExportType(fieldName, exportType);
                type = DataTypes.createArrayType(createDataType(childType));
            }

            return DataTypes.createStructField(fieldName, type, true);
        }).collect(toList());

        return DataTypes.createStructType(structFields);
    }

    /**
     * Returns the union of the schema-file field names and the child field names of {@code exportType}, de-duplicated
     * and sorted in natural {@link String} order.
     */
    private static Collection<String> getSortedFieldNames(ExportType exportType, Map<String, String> fileSchema) {
        return ImmutableSortedSet.<String>naturalOrder()
                .addAll(fileSchema.keySet())
                .addAll(getChildFields(exportType))
                .build();
    }

    /**
     * Converts a {@code field name -> data type name} mapping into a {@code field name -> DataType} mapping.
     *
     * @throws IllegalArgumentException if any data type name is not recognised by {@code parseDataType}
     */
    private static Map<String, DataType> convertToDataType(Map<String, String> schemaDef) {
        return schemaDef.entrySet().stream()
                .collect(toImmutableMap(e -> e.getKey(), e -> parseDataType(e.getValue())));
    }

    /** Loads the {@code field name -> data type name} definitions of the schema file belonging to {@code exportType}. */
    private Map<String, String> getFileSchemaDefinitions(ExportType exportType) {
        return readFileSchemaDefinition(getSchemaPath(exportType));
    }

    /**
     * Looks up the child export type registered under {@code field}.
     *
     * @throws NullPointerException if {@code exportType} has no child with that name
     */
    private static ExportType resolveChildExportType(String field, ExportType exportType) {
        val childType = exportType.getChildren().get(field);
        checkNotNull(childType, "Failed to resolve ExportType for child %s", field);

        return childType;
    }

    /** Returns the names under which the children of {@code exportType} are registered. */
    private static List<String> getChildFields(ExportType exportType) {
        // The keys are all that is needed; no reason to stream over the entries.
        return Lists.newArrayList(exportType.getChildren().keySet());
    }

    /** Builds the classpath location of the schema file: {@code /schemas/<export type id>.schema}. */
    private static String getSchemaPath(ExportType exportType) {
        return SCHEMA_DEF_DIR + "/" + exportType.getId() + ".schema";
    }

    /**
     * Reads the schema file at {@code schemaPath} line by line into a {@code field name -> data type name} map.
     * <p>
     * If a field name occurs more than once, the last definition wins. I/O failures are rethrown unchecked through
     * {@code @SneakyThrows}; the reader is closed by {@code @Cleanup}.
     *
     * @throws IllegalStateException if a line is not a {@code <key> : <value>} pair
     */
    @SneakyThrows
    private Map<String, String> readFileSchemaDefinition(String schemaPath) {
        @Cleanup
        val reader = getFileSchemaDefinitionReader(schemaPath);
        val schemaDef = Maps.<String, String>newHashMap();

        String line;
        while ((line = reader.readLine()) != null) {
            val entry = parseSchemaDefinition(line);
            schemaDef.put(entry.getKey(), entry.getValue());
        }

        return schemaDef;
    }

    /**
     * Opens the classpath resource {@code schemaPath} for reading.
     *
     * @throws NullPointerException if the resource does not exist
     */
    private BufferedReader getFileSchemaDefinitionReader(String schemaPath) {
        val in = getClass().getResourceAsStream(schemaPath);
        checkNotNull(in, "Failed to read schema definition %s", schemaPath);

        // Decode explicitly as UTF-8 so that parsing does not depend on the JVM's platform default charset.
        return new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
    }

    /**
     * Parses a single schema-file line into a {@code (field name, data type name)} entry. The line is split with
     * {@code Splitters.COLON} and both parts are trimmed.
     *
     * @throws IllegalStateException if splitting does not yield exactly two parts
     */
    private static Entry<String, String> parseSchemaDefinition(String schemaDefinition) {
        val keyValue = Lists.newArrayList(Splitters.COLON.split(schemaDefinition));
        checkState(keyValue.size() == 2,
                "Incorrect format of field definition. Expected <key> : <value>. Actual: %s", schemaDefinition);

        val fieldName = keyValue.get(0).trim();
        val value = keyValue.get(1).trim();

        return Maps.immutableEntry(fieldName, value);
    }

    /**
     * Maps a data type name, as written in a schema file, to the corresponding Spark {@link DataTypes} constant.
     * Only parameterless types are supported.
     *
     * @throws IllegalArgumentException if {@code dataTypeString} is not a supported type name
     */
    private static DataType parseDataType(String dataTypeString) {
        switch (dataTypeString) {
        case "StringType":
            return DataTypes.StringType;
        case "BinaryType":
            return DataTypes.BinaryType;
        case "BooleanType":
            return DataTypes.BooleanType;
        case "DateType":
            return DataTypes.DateType;
        case "TimestampType":
            return DataTypes.TimestampType;
        case "CalendarIntervalType":
            return DataTypes.CalendarIntervalType;
        case "DoubleType":
            return DataTypes.DoubleType;
        case "FloatType":
            return DataTypes.FloatType;
        case "ByteType":
            return DataTypes.ByteType;
        case "IntegerType":
            return DataTypes.IntegerType;
        case "LongType":
            return DataTypes.LongType;
        case "ShortType":
            return DataTypes.ShortType;
        case "NullType":
            return DataTypes.NullType;
        default:
            throw new IllegalArgumentException(format("Unknown datatype %s", dataTypeString));
        }
    }

}