/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.utilities;

import com.google.common.base.Preconditions;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.util.DFSPropertiesConfiguration;
import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.common.util.TypedProperties;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.utilities.schema.SchemaProvider;
import com.uber.hoodie.utilities.sources.Source;
import com.uber.hoodie.utilities.transform.Transformer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

/**
 * Bunch of helper methods.
 */
public class UtilHelpers {

  private static Logger logger = LogManager.getLogger(UtilHelpers.class);

  /**
   * Instantiate the configured {@link Source} implementation via reflection.
   */
  public static Source createSource(String sourceClass, TypedProperties cfg, JavaSparkContext jssc,
      SparkSession sparkSession, SchemaProvider schemaProvider) throws IOException {
    try {
      return (Source) ReflectionUtils.loadClass(sourceClass,
          new Class<?>[] {TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class},
          cfg, jssc, sparkSession, schemaProvider);
    } catch (Throwable e) {
      throw new IOException("Could not load source class " + sourceClass, e);
    }
  }

  /**
   * Instantiate the configured {@link SchemaProvider} via reflection, or return null if none is configured.
   */
  public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg,
      JavaSparkContext jssc) throws IOException {
    try {
      return schemaProviderClass == null ? null
          : (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
    } catch (Throwable e) {
      throw new IOException("Could not load schema provider class " + schemaProviderClass, e);
    }
  }

  /**
   * Instantiate the configured {@link Transformer} via reflection, or return null if none is configured.
   */
  public static Transformer createTransformer(String transformerClass) throws IOException {
    try {
      return transformerClass == null ? null
          : (Transformer) ReflectionUtils.loadClass(transformerClass);
    } catch (Throwable e) {
      throw new IOException("Could not load transformer class " + transformerClass, e);
    }
  }
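
  /*
   * Usage sketch for the factory methods above (illustrative only; the concrete
   * source and schema-provider class names below are assumptions, not fixed by
   * this file, and `props`, `jssc`, and `spark` must already exist):
   *
   *   SchemaProvider provider = UtilHelpers.createSchemaProvider(
   *       "com.uber.hoodie.utilities.schema.FilebasedSchemaProvider", props, jssc);
   *   Source source = UtilHelpers.createSource(
   *       "com.uber.hoodie.utilities.sources.JsonDFSSource", props, jssc, spark, provider);
   */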

  /**
   * Read config from the given DFS path, applying any command-line property overrides on top.
   */
  public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath, List<String> overriddenProps) {
    try {
      DFSPropertiesConfiguration conf = new DFSPropertiesConfiguration(fs, cfgPath);
      if (!overriddenProps.isEmpty()) {
        logger.info("Adding overridden properties to file properties.");
        conf.addProperties(new BufferedReader(new StringReader(String.join("\n", overriddenProps))));
      }
      return conf;
    } catch (Exception e) {
      throw new HoodieException("Unable to read props file at: " + cfgPath, e);
    }
  }

  public static TypedProperties buildProperties(List<String> props) {
    TypedProperties properties = new TypedProperties();
    props.forEach(x -> {
      String[] kv = x.split("=");
      Preconditions.checkArgument(kv.length == 2);
      properties.setProperty(kv[0], kv[1]);
    });
    return properties;
  }

  /**
   * Parse Schema from file.
   *
   * @param fs File System
   * @param schemaFile Schema File
   */
  public static String parseSchema(FileSystem fs, String schemaFile) throws Exception {
    // Read schema file.
    Path p = new Path(schemaFile);
    if (!fs.exists(p)) {
      throw new Exception(String.format("Could not find - %s - schema file.", schemaFile));
    }
    long len = fs.getFileStatus(p).getLen();
    ByteBuffer buf = ByteBuffer.allocate((int) len);
    try (FSDataInputStream inputStream = fs.open(p)) {
      inputStream.readFully(0, buf.array(), 0, buf.array().length);
    }
    return new String(buf.array());
  }

  private static SparkConf buildSparkConf(String appName, String defaultMaster) {
    return buildSparkConf(appName, defaultMaster, new HashMap<>());
  }

  private static SparkConf buildSparkConf(String appName, String defaultMaster,
      Map<String, String> additionalConfigs) {
    final SparkConf sparkConf = new SparkConf().setAppName(appName);
    String master = sparkConf.get("spark.master", defaultMaster);
    sparkConf.setMaster(master);
    if (master.startsWith("yarn")) {
      sparkConf.set("spark.eventLog.overwrite", "true");
      sparkConf.set("spark.eventLog.enabled", "true");
    }
    sparkConf.setIfMissing("spark.driver.maxResultSize", "2g");
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.hadoop.mapred.output.compress", "true");
    sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");

    additionalConfigs.forEach(sparkConf::set);
    return HoodieWriteClient.registerClasses(sparkConf);
  }

  public static JavaSparkContext buildSparkContext(String appName, String defaultMaster,
      Map<String, String> configs) {
    return new JavaSparkContext(buildSparkConf(appName, defaultMaster, configs));
  }

  public static JavaSparkContext buildSparkContext(String appName, String defaultMaster) {
    return new JavaSparkContext(buildSparkConf(appName, defaultMaster));
  }

  /**
   * Build Spark Context for ingestion/compaction.
   *
   * @return a {@link JavaSparkContext} with executor memory set to {@code sparkMemory}
   */
  public static JavaSparkContext buildSparkContext(String appName, String sparkMaster, String sparkMemory) {
    SparkConf sparkConf = buildSparkConf(appName, sparkMaster);
    sparkConf.set("spark.executor.memory", sparkMemory);
    return new JavaSparkContext(sparkConf);
  }
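
  /*
   * Usage sketch for the context/client builders (illustrative only; the base
   * path, memory setting, and parallelism are hypothetical values, and
   * `schemaStr` and `props` must already exist):
   *
   *   JavaSparkContext jsc = UtilHelpers.buildSparkContext("hoodie-ingest", "yarn", "4g");
   *   HoodieWriteClient client = UtilHelpers.createHoodieClient(
   *       jsc, "/tmp/hoodie/trips", schemaStr, 2, Optional.empty(), props);
   */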

  /**
   * Build Hoodie write client.
   *
   * @param jsc Java Spark Context
   * @param basePath Base Path
   * @param schemaStr Schema
   * @param parallelism Parallelism
   */
  public static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, String schemaStr,
      int parallelism, Optional<String> compactionStrategyClass, TypedProperties properties) throws Exception {
    HoodieCompactionConfig compactionConfig = compactionStrategyClass
        .map(strategy -> HoodieCompactionConfig.newBuilder().withInlineCompaction(false)
            .withCompactionStrategy(ReflectionUtils.loadClass(strategy)).build())
        .orElse(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
        .withParallelism(parallelism, parallelism).withSchema(schemaStr).combineInput(true, true)
        .withCompactionConfig(compactionConfig)
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
        .withProps(properties).build();
    return new HoodieWriteClient(jsc, config);
  }

  /**
   * Count write errors across all statuses; returns 0 on a clean import, -1 otherwise.
   */
  public static int handleErrors(JavaSparkContext jsc, String instantTime, JavaRDD<WriteStatus> writeResponse) {
    Accumulator<Integer> errors = jsc.accumulator(0);
    writeResponse.foreach(writeStatus -> {
      if (writeStatus.hasErrors()) {
        errors.add(1);
        logger.error(String.format("Error processing records :writeStatus:%s", writeStatus.getStat().toString()));
      }
    });
    if (errors.value() == 0) {
      logger.info(String.format("Dataset imported into hoodie dataset with %s instant time.", instantTime));
      return 0;
    }
    logger.error(String.format("Import failed with %d errors.", errors.value()));
    return -1;
  }

  public static TypedProperties readConfig(InputStream in) throws IOException {
    TypedProperties defaults = new TypedProperties();
    defaults.load(in);
    return defaults;
  }
}
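
// ---------------------------------------------------------------------------
// Minimal end-to-end sketch (not part of the original class): wires the
// helpers together against a local Spark master. The property key/value,
// schema path, and base path below are hypothetical placeholders; the base
// path is assumed to already hold an initialized Hoodie dataset.
// ---------------------------------------------------------------------------
class UtilHelpersExample {

  public static void main(String[] args) throws Exception {
    // "local[2]" is only a default; an explicitly set spark.master wins.
    JavaSparkContext jssc = UtilHelpers.buildSparkContext("util-helpers-example", "local[2]");
    try {
      // Turn key=value override strings into TypedProperties.
      TypedProperties props = UtilHelpers.buildProperties(
          java.util.Arrays.asList("hoodie.datasource.write.recordkey.field=_row_key"));

      // Read an Avro schema string off the filesystem (hypothetical path).
      FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
      String schemaStr = UtilHelpers.parseSchema(fs, "/tmp/example/schema.avsc");

      // Build a write client with Bloom indexing and inline compaction disabled,
      // as configured by createHoodieClient above.
      HoodieWriteClient client = UtilHelpers.createHoodieClient(
          jssc, "/tmp/example/hoodie_dataset", schemaStr, 2, Optional.empty(), props);
    } finally {
      jssc.stop();
    }
  }
}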