Java tutorial: sampling a Pangool Schema with SchemaSampler
package com.splout.db.hadoop;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 - 2013 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.utils.TaskAttemptContextFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;
import java.util.List;

/**
 * Small utility that samples a Pangool Schema from an arbitrary InputFormat<ITuple, NullWritable>.
 * It reads the first Tuple of the first input split and extracts the Schema from it.
 */
public class SchemaSampler {

  private final static Log log = LogFactory.getLog(SchemaSampler.class);

  public static Schema sample(Configuration conf, Path input, InputFormat<ITuple, NullWritable> inputFormat)
      throws IOException, InterruptedException {
    Schema schema = null;

    // Sample the schema from the input path using the provided InputFormat
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    FileInputFormat.setInputPaths(job, input);

    // Get the first input split
    List<InputSplit> inputSplits = inputFormat.getSplits(job);
    if (inputSplits == null || inputSplits.size() == 0) {
      throw new IOException("Given input format doesn't produce any input split. Can't sample first record. PATH: "
          + input);
    }
    InputSplit inputSplit = inputSplits.get(0);

    // Build a dummy task attempt context so the RecordReader can be initialized outside a real task
    TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
    TaskAttemptContext attemptContext;
    try {
      attemptContext = TaskAttemptContextFactory.get(conf, attemptId);
    } catch (Exception e) {
      throw new IOException(e);
    }

    RecordReader<ITuple, NullWritable> rReader = inputFormat.createRecordReader(inputSplit, attemptContext);
    rReader.initialize(inputSplit, attemptContext);

    if (!rReader.nextKeyValue()) {
      throw new IOException("Can't read first record of first input split of the given path [" + input + "].");
    }

    // Finally, get the sampled schema from the first Tuple
    schema = rReader.getCurrentKey().getSchema();
    log.info("Sampled schema from [" + input + "] : " + schema);
    rReader.close();

    return schema;
  }
}
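For context, here is a minimal sketch of how a driver program might call SchemaSampler.sample. The class name SchemaSamplerExample, the variable tupleInputFormat, and the factory method createTupleInputFormat are illustrative placeholders, not part of the library: any InputFormat that emits <ITuple, NullWritable> pairs can be plugged in.

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;

import java.io.IOException;

public class SchemaSamplerExample {

  public static void main(String[] args) throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Path input = new Path(args[0]);

    // Placeholder: supply any InputFormat that produces <ITuple, NullWritable> pairs,
    // e.g. one of Pangool's Tuple-based input formats for your data.
    InputFormat<ITuple, NullWritable> tupleInputFormat = createTupleInputFormat();

    // Sample the schema by reading the first record of the first split
    Schema schema = SchemaSampler.sample(conf, input, tupleInputFormat);
    System.out.println("Sampled schema: " + schema);
  }

  // Hypothetical factory method: replace with the concrete InputFormat for your data source.
  private static InputFormat<ITuple, NullWritable> createTupleInputFormat() {
    throw new UnsupportedOperationException("Provide a concrete InputFormat<ITuple, NullWritable> here");
  }
}

Note that because sample() only reads a single record, it is cheap to run in a job driver before launching the actual MapReduce job, which is how schema discovery at submission time is typically done.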