Java tutorial

This tutorial walks through FileBatchSource, a complete CDAP ETL batch source plugin that reads files from any distributed file system (for example, HDFS or S3) and emits each input value as a StructuredRecord with a timestamp and a body field. The full source is reproduced below; two short, illustrative sketches follow it.
/*
 * Copyright 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.template.etl.batch.source;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.templates.plugins.PluginConfig;
import co.cask.cdap.template.etl.api.Emitter;
import co.cask.cdap.template.etl.api.PipelineConfigurer;
import co.cask.cdap.template.etl.api.batch.BatchSource;
import co.cask.cdap.template.etl.api.batch.BatchSourceContext;
import co.cask.cdap.template.etl.common.BatchFileFilter;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.Type;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;

/**
 * A {@link BatchSource} to use any distributed file system as a Source.
 */
@Plugin(type = "source")
@Name("File")
@Description("Batch source for File Systems")
public class FileBatchSource extends BatchSource<LongWritable, Object, StructuredRecord> {

  public static final String INPUT_NAME_CONFIG = "input.path.name";
  public static final String INPUT_REGEX_CONFIG = "input.path.regex";
  public static final String LAST_TIME_READ = "last.time.read";
  public static final String CUTOFF_READ_TIME = "cutoff.read.time";
  public static final String USE_TIMEFILTER = "timefilter";
  public static final Schema DEFAULT_SCHEMA = Schema.recordOf(
    "event",
    Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("body", Schema.of(Schema.Type.STRING)));

  protected static final String MAX_SPLIT_SIZE_DESCRIPTION = "Maximum split-size for each mapper in the MapReduce " +
    "Job. Defaults to 128MB.";
  protected static final String PATH_DESCRIPTION = "Path to file(s) to be read. If a directory is specified, " +
    "terminate the path name with a '/'.";
  protected static final String TABLE_DESCRIPTION = "Name of the Table that keeps track of the last time files " +
    "were read in.";
  protected static final String INPUT_FORMAT_CLASS_DESCRIPTION = "Name of the input format class, which must be a " +
    "subclass of FileInputFormat. Defaults to CombineTextInputFormat.";
  protected static final String REGEX_DESCRIPTION = "Regex to filter out filenames in the path. " +
    "To use the TimeFilter, input \"timefilter\". The TimeFilter assumes that it " +
    "is reading in files with the File log naming convention of 'YYYY-MM-DD-HH-mm-SS-Tag'. The TimeFilter " +
    "reads in files from the previous hour if the field 'timeTable' is left blank. So if it's currently " +
    "2015-06-16-15 (June 16th 2015, 3pm), it will read in files that contain 2015-06-16-14 in the filename. " +
    "If the field 'timeTable' is present, then it will read in files that haven't been read yet.";
  private static final String FILESYSTEM_PROPERTIES_DESCRIPTION = "JSON of the properties needed for the " +
    "distributed file system. The formatting needs to be as follows:\n{\n\t\"<property name>\" : " +
    "\"<property value>\", ...\n}. For example, the property names needed for S3 are \"fs.s3n.awsSecretAccessKey\" " +
    "and \"fs.s3n.awsAccessKeyId\".";
  private static final String FILESYSTEM_DESCRIPTION = "Distributed file system to read in from.";

  private static final Gson GSON = new Gson();
  private static final Logger LOG = LoggerFactory.getLogger(FileBatchSource.class);
  private static final Type ARRAYLIST_DATE_TYPE = new TypeToken<ArrayList<Date>>() { }.getType();
  private static final Type MAP_STRING_STRING_TYPE = new TypeToken<Map<String, String>>() { }.getType();
  private static final int DEFAULT_SPLIT_SIZE = 134217728; // 128MB

  private final FileBatchConfig config;
  private KeyValueTable table;
  private Date prevHour;
  private String datesToRead;

  public FileBatchSource(FileBatchConfig config) {
    this.config = config;
  }

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    // Create the tracking dataset only when incremental reads are requested
    if (config.timeTable != null) {
      pipelineConfigurer.createDataset(config.timeTable, KeyValueTable.class, DatasetProperties.EMPTY);
    }
  }

  @Override
  public void prepareRun(BatchSourceContext context) throws Exception {
    // SimpleDateFormat needs to be local because it is not threadsafe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");

    // Calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = context.getHadoopJob();
    Configuration conf = job.getConfiguration();
    if (config.fileSystemProperties != null) {
      // Pass user-supplied file system properties (e.g., S3 credentials) through to Hadoop
      Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
      for (Map.Entry<String, String> entry : properties.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
      }
    }
    if (config.fileRegex != null) {
      conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    }
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
      // Load the hours that still need to be read; on the very first run,
      // start from the epoch so all existing files are picked up
      table = context.getDataset(config.timeTable);
      datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
      if (datesToRead == null) {
        List<Date> firstRun = Lists.newArrayList(new Date(0));
        datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
      }
      List<Date> attempted = Lists.newArrayList(prevHour);
      String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
      if (!updatedDatesToRead.equals(datesToRead)) {
        table.write(LAST_TIME_READ, updatedDatesToRead);
      }
      conf.set(LAST_TIME_READ, datesToRead);
    }
    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));

    if (!Strings.isNullOrEmpty(config.inputFormatClass)) {
      ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
      @SuppressWarnings("unchecked")
      Class<? extends FileInputFormat> classType =
        (Class<? extends FileInputFormat>) classLoader.loadClass(config.inputFormatClass);
      job.setInputFormatClass(classType);
    } else {
      job.setInputFormatClass(CombineTextInputFormat.class);
    }
    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));
    long maxSplitSize;
    try {
      maxSplitSize = Long.parseLong(config.maxSplitSize);
    } catch (NumberFormatException e) {
      maxSplitSize = DEFAULT_SPLIT_SIZE;
    }
    CombineTextInputFormat.setMaxInputSplitSize(job, maxSplitSize);
  }

  @Override
  public void transform(KeyValue<LongWritable, Object> input, Emitter<StructuredRecord> emitter) throws Exception {
    // Wrap each input value in the default schema: a timestamp plus the raw text
    StructuredRecord output = StructuredRecord.builder(DEFAULT_SCHEMA)
      .set("ts", System.currentTimeMillis())
      .set("body", input.getValue().toString())
      .build();
    emitter.emit(output);
  }

  @Override
  public void onRunFinish(boolean succeeded, BatchSourceContext context) {
    // If the run failed while the TimeFilter is in use, put the attempted hours
    // back into the tracking table so the next run retries them
    if (!succeeded && table != null && USE_TIMEFILTER.equals(config.fileRegex)) {
      List<Date> existing = GSON.fromJson(Bytes.toString(table.read(LAST_TIME_READ)), ARRAYLIST_DATE_TYPE);
      List<Date> failed = GSON.fromJson(datesToRead, ARRAYLIST_DATE_TYPE);
      failed.add(prevHour);
      failed.addAll(existing);
      table.write(LAST_TIME_READ, GSON.toJson(failed, ARRAYLIST_DATE_TYPE));
    }
  }

  /**
   * Config class that contains all the properties needed for the file source.
   */
  public static class FileBatchConfig extends PluginConfig {
    @Name("fileSystem")
    @Description(FILESYSTEM_DESCRIPTION)
    private String fileSystem;

    @Name("fileSystemProperties")
    @Nullable
    @Description(FILESYSTEM_PROPERTIES_DESCRIPTION)
    private String fileSystemProperties;

    @Name("path")
    @Description(PATH_DESCRIPTION)
    private String path;

    @Name("fileRegex")
    @Nullable
    @Description(REGEX_DESCRIPTION)
    private String fileRegex;

    @Name("timeTable")
    @Nullable
    @Description(TABLE_DESCRIPTION)
    private String timeTable;

    @Name("inputFormatClass")
    @Nullable
    @Description(INPUT_FORMAT_CLASS_DESCRIPTION)
    private String inputFormatClass;

    @Name("maxSplitSize")
    @Nullable
    @Description(MAX_SPLIT_SIZE_DESCRIPTION)
    private String maxSplitSize;

    public FileBatchConfig(String fileSystem, String path, @Nullable String regex, @Nullable String timeTable,
                           @Nullable String inputFormatClass, @Nullable String fileSystemProperties,
                           @Nullable String maxSplitSize) {
      this.fileSystem = fileSystem;
      this.fileSystemProperties = fileSystemProperties;
      this.path = path;
      this.fileRegex = regex;
      this.timeTable = timeTable;
      this.inputFormatClass = inputFormatClass;
      this.maxSplitSize = maxSplitSize;
    }
  }
}
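To make the configuration concrete, here is a minimal sketch of how values map onto FileBatchConfig. Everything in it is hypothetical: the bucket path, the tracking-dataset name, and the credential placeholders are invented for illustration, and in a real deployment the ETL template constructs the config from the pipeline's JSON properties rather than from hand-written Java. The S3 property names are the ones called out in FILESYSTEM_PROPERTIES_DESCRIPTION above.

    import co.cask.cdap.template.etl.batch.source.FileBatchSource;
    import com.google.common.collect.ImmutableMap;
    import com.google.gson.Gson;

    public class FileBatchSourceExample {
      public static void main(String[] args) {
        // fileSystemProperties is a JSON string of Hadoop configuration entries;
        // the two S3 keys below are the ones named in the plugin's description.
        String fsProperties = new Gson().toJson(ImmutableMap.of(
            "fs.s3n.awsAccessKeyId", "<your-access-key>",        // placeholder
            "fs.s3n.awsSecretAccessKey", "<your-secret-key>"));  // placeholder

        FileBatchSource.FileBatchConfig config = new FileBatchSource.FileBatchConfig(
            "s3n",                          // fileSystem
            "s3n://my-bucket/logs/",        // path: a directory, so it ends with '/'
            FileBatchSource.USE_TIMEFILTER, // fileRegex: "timefilter" enables the TimeFilter
            "fileReadTracker",              // timeTable: hypothetical tracking-dataset name
            null,                           // inputFormatClass: defaults to CombineTextInputFormat
            fsProperties,                   // fileSystemProperties
            null);                          // maxSplitSize: defaults to 128MB
        FileBatchSource source = new FileBatchSource(config);
        System.out.println("Constructed " + source.getClass().getSimpleName());
      }
    }

Note the interplay of the nullable settings: leaving inputFormatClass and maxSplitSize null falls back to CombineTextInputFormat and 128MB splits, while passing USE_TIMEFILTER as the regex together with a timeTable name turns on incremental, hour-by-hour reads as described in REGEX_DESCRIPTION.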
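One detail worth isolating is the cutoff computation in prepareRun: the source subtracts one hour from the pipeline's logical start time and rounds down to the hour boundary, and that yyyy-MM-dd-HH value becomes the prefix the TimeFilter matches against filenames. This standalone sketch, using an arbitrary example timestamp, reproduces just that arithmetic so the rounding can be checked in isolation:

    import java.text.SimpleDateFormat;
    import java.util.Calendar;
    import java.util.Date;
    import java.util.concurrent.TimeUnit;

    public class PrevHourDemo {
      public static void main(String[] args) {
        // Pretend the pipeline's logical start time is 2015-06-16 15:27:43 UTC.
        long logicalStartTime = 1434468463000L; // arbitrary example value
        Date prevHour = new Date(logicalStartTime - TimeUnit.HOURS.toMillis(1));
        Calendar cal = Calendar.getInstance();
        cal.setTime(prevHour);
        cal.set(Calendar.MINUTE, 0);
        cal.set(Calendar.SECOND, 0);
        cal.set(Calendar.MILLISECOND, 0);
        prevHour = cal.getTime();
        // Prints the previous hour rounded down, which with a UTC JVM is
        // "2015-06-16-14" -- the prefix the TimeFilter matches in filenames.
        System.out.println(new SimpleDateFormat("yyyy-MM-dd-HH").format(prevHour));
      }
    }

Because Calendar.getInstance() uses the JVM's default time zone, the printed hour depends on where the job runs; the UTC result above matches the 2015-06-16 example given in REGEX_DESCRIPTION.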