Java tutorial
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cascading.hive;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.type.CoercibleType;

import java.io.IOException;
import java.io.OutputStream;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe;
import org.apache.hadoop.hive.serde2.columnar.ColumnarStruct;
import org.apache.hadoop.hive.serde2.lazy.LazyBinary;
import org.apache.hadoop.hive.serde2.lazy.LazyBoolean;
import org.apache.hadoop.hive.serde2.lazy.LazyByte;
import org.apache.hadoop.hive.serde2.lazy.LazyDouble;
import org.apache.hadoop.hive.serde2.lazy.LazyFloat;
import org.apache.hadoop.hive.serde2.lazy.LazyHiveDecimal;
import org.apache.hadoop.hive.serde2.lazy.LazyInteger;
import org.apache.hadoop.hive.serde2.lazy.LazyLong;
import org.apache.hadoop.hive.serde2.lazy.LazyShort;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.hive.serde2.lazy.LazyTimestamp;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

/**
 * A {@link Scheme} subclass for the RCFile (Record Columnar File) format. RCFile partitions data
 * horizontally (rows) and vertically (columns), so a job can fetch only the specific columns it needs
 * during processing and avoid the disk I/O penalty of reading every column.
 * This class was mainly developed for writing Cascading output in RCFile format so that Hive can
 * consume it afterwards. It also supports reading RCFile, with the same column-pruning optimization
 * Hive applies (less HDFS_BYTES_READ, less CPU).
 */
public class RCFile extends Scheme<JobConf, RecordReader, OutputCollector, Object[], Object[]> {

    private transient ColumnarSerDe serde;
    private String[] types;
    /* column ids (starting from zero), concatenated with commas. */
    private String selectedColIds = null;
    /* regular expression for a comma separated string of column ids. */
    private static final Pattern COMMA_SEPARATED_IDS = Pattern.compile("^([0-9]+,)*[0-9]+$");

    /**
     * Construct an instance of RCFile using the specified arrays of field names and types.
     * @param names field names
     * @param types field types
     */
    public RCFile(String[] names, String[] types) {
        this(names, types, null);
    }

    /**
     * Construct an instance of RCFile using the specified arrays of field names and types.
     * @param names field names
     * @param types field types
     * @param selectedColIds a list of column ids (starting from 0) that explicitly specifies which columns will be read
     */
    public RCFile(String[] names, String[] types, String selectedColIds) {
        super(new Fields(names), new Fields(names));
        this.types = types;
        this.selectedColIds = selectedColIds;
        validate();
    }

    /**
     * Construct an instance of RCFile using a Hive table schema. The schema should be a space and comma
     * separated string describing the Hive columns, e.g.:
     *   uid BIGINT, name STRING, description STRING
     * specifies 3 fields.
     * @param hiveScheme hive table schema
     */
    public RCFile(String hiveScheme) {
        this(hiveScheme, null);
    }

    /**
     * Construct an instance of RCFile using a Hive table schema. The schema should be a space and comma
     * separated string describing the Hive columns, e.g.:
     *   uid BIGINT, name STRING, description STRING
     * specifies 3 fields.
     * @param hiveScheme hive table schema
     * @param selectedColIds a list of column ids (starting from 0) that explicitly specifies which columns will be read
     */
    public RCFile(String hiveScheme, String selectedColIds) {
        ArrayList<String>[] lists = HiveSchemaUtil.parse(hiveScheme);
        Fields fields = new Fields(lists[0].toArray(new String[lists[0].size()]));
        setSinkFields(fields);
        setSourceFields(fields);
        this.types = lists[1].toArray(new String[lists[1].size()]);
        this.selectedColIds = selectedColIds;
        validate();
    }

    private void validate() {
        if (types.length != getSourceFields().size()) {
            throw new IllegalArgumentException("number of fields and number of field types do not match.");
        }
        if (selectedColIds != null) {
            if (!COMMA_SEPARATED_IDS.matcher(selectedColIds).find()) {
                throw new IllegalArgumentException("selected column ids must be in comma separated format");
            }
        }
    }

    @Override
    public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
                               JobConf conf) {
        conf.setInputFormat(RCFileInputFormat.class);
        if (selectedColIds != null) {
            conf.set(HiveProps.HIVE_SELECTD_COLUMN_IDS, selectedColIds);
        }
    }

    @Override
    public void sourcePrepare(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall)
            throws IOException {
        if (serde == null) {
            try {
                serde = new ColumnarSerDe();
                serde.initialize(flowProcess.getConfigCopy(), getProps());
            } catch (SerDeException e) {
                throw new RuntimeException("Unable to initialize SerDe.", e);
            }
        }
        sourceCall.setContext(new Object[2]);
        sourceCall.getContext()[0] = sourceCall.getInput().createKey();
        sourceCall.getContext()[1] = sourceCall.getInput().createValue();
    }

    private Properties getProps() {
        Properties props = new Properties();
        Fields fields = getSourceFields();
        StringBuilder sb = new StringBuilder();
        StringBuilder sbType = new StringBuilder();
        for (int i = 0; i < fields.size(); i++) {
            sb.append(fields.get(i)).append(",");
            sbType.append(types[i]).append(",");
        }
        sb.deleteCharAt(sb.length() - 1);
        sbType.deleteCharAt(sbType.length() - 1);
        props.put(HiveProps.HIVE_COLUMNS, sb.toString());
        props.put(HiveProps.HIVE_COLUMN_TYPES, sbType.toString());
        return props;
    }

    @Override
    public void sourceCleanup(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) {
        sourceCall.setContext(null);
    }

    private boolean sourceReadInput(SourceCall<Object[], RecordReader> sourceCall) throws IOException {
        Object[] context = sourceCall.getContext();
        return sourceCall.getInput().next(context[0], context[1]);
    }

    @Override
    public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall)
            throws IOException {
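        // Read path: pull the next (key, value) pair from the RecordReader, let the ColumnarSerDe
        // deserialize the row into a ColumnarStruct, then convert each Hive lazy field into a plain
        // Java object before adding it to the outgoing tuple.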
        if (!sourceReadInput(sourceCall)) {
            return false;
        }
        Tuple tuple = sourceCall.getIncomingEntry().getTuple();
        BytesRefArrayWritable value = (BytesRefArrayWritable) sourceCall.getContext()[1];
        try {
            ColumnarStruct struct = (ColumnarStruct) serde.deserialize(value);
            ArrayList<Object> objects = struct.getFieldsAsList();
            tuple.clear();
            for (Object o : objects) {
                // Each field has to be converted explicitly; otherwise downstream serialization fails
                // because no serializer can be loaded for the Hive lazy types.
                tuple.add(sourceField(o));
            }
            return true;
        } catch (SerDeException e) {
            throw new IOException(e);
        }
    }

    @Override
    public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
                             JobConf conf) {
        conf.setOutputKeyClass(WritableComparable.class);
        conf.setOutputValueClass(BytesRefArrayWritable.class);
        conf.setOutputFormat(RCFileOutputFormat.class);
        conf.set(HiveProps.HIVE_COLUMN_NUMBER, String.valueOf(getSinkFields().size()));
    }

    @Override
    public void sinkPrepare(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall)
            throws IOException {
        sinkCall.setContext(new Object[3]);
        sinkCall.getContext()[0] = new ByteStream.Output();
        sinkCall.getContext()[1] = new BytesRefArrayWritable();
        sinkCall.getContext()[2] = new BytesRefWritable[getSinkFields().size()];
    }

    @Override
    public void sinkCleanup(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall) {
        sinkCall.setContext(null);
    }

    @Override
    public void sink(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall)
            throws IOException {
        Tuple tuple = sinkCall.getOutgoingEntry().getTuple();
        ByteStream.Output byteStream = (ByteStream.Output) sinkCall.getContext()[0];
        BytesRefArrayWritable rowWritable = (BytesRefArrayWritable) sinkCall.getContext()[1];
        BytesRefWritable[] colValRefs = (BytesRefWritable[]) sinkCall.getContext()[2];
        if (tuple.size() != colValRefs.length) {
            throw new RuntimeException("tuple size and length of column buffer do not match.");
        }
        byteStream.reset();
        int startPos = 0;
        for (int i = 0; i < colValRefs.length; i++) {
            // Serialize each field into the shared byte stream and record the (offset, length)
            // slice that belongs to column i.
            colValRefs[i] = new BytesRefWritable();
            rowWritable.set(i, colValRefs[i]);
            sinkField(byteStream, tuple.getObject(i), tuple.getTypes()[i]);
            colValRefs[i].set(byteStream.getData(), startPos, byteStream.getCount() - startPos);
            startPos = byteStream.getCount();
        }
        sinkCall.getOutput().collect(null, rowWritable);
    }

    private void sinkField(OutputStream out, Object field, Type fieldType) throws IOException {
        if (field == null) {
            return; // just leave the column empty
        }
        if (fieldType instanceof CoercibleType) {
            CoercibleType<?> coercible = (CoercibleType<?>) fieldType;
            out.write(coercible.coerce(field, String.class).toString().getBytes());
        } else if (field instanceof byte[]) {
            // The Hive serde serializes the binary type as a Base64 encoded string.
            out.write(Base64.encodeBase64((byte[]) field));
        } else {
            out.write(field.toString().getBytes());
        }
        // TODO: need to handle more cases
    }

    /*
     * Convert Hive lazy objects to plain Java objects.
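     * For example, LazyString becomes String, LazyInteger becomes Integer, and LazyTimestamp becomes
     * java.sql.Timestamp.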
     */
    private Object sourceField(Object value) {
        if (value instanceof LazyString) {
            value = ((LazyString) value).getWritableObject().toString();
        } else if (value instanceof LazyInteger) {
            value = ((LazyInteger) value).getWritableObject().get();
        } else if (value instanceof LazyLong) {
            value = ((LazyLong) value).getWritableObject().get();
        } else if (value instanceof LazyFloat) {
            value = ((LazyFloat) value).getWritableObject().get();
        } else if (value instanceof LazyDouble) {
            value = ((LazyDouble) value).getWritableObject().get();
        } else if (value instanceof LazyBoolean) {
            value = ((LazyBoolean) value).getWritableObject().get();
        } else if (value instanceof LazyByte) {
            value = (int) ((LazyByte) value).getWritableObject().get();
        } else if (value instanceof LazyShort) {
            value = ((LazyShort) value).getWritableObject().get();
        } else if (value instanceof LazyBinary) {
            value = ((LazyBinary) value).getWritableObject().getBytes();
        } else if (value instanceof LazyHiveDecimal) {
            value = ((LazyHiveDecimal) value).getWritableObject().getHiveDecimal();
        } else if (value instanceof LazyTimestamp) {
            value = ((LazyTimestamp) value).getWritableObject().getTimestamp();
        }
        // TODO: need to handle more types
        return value;
    }
}
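The listing above is the complete scheme. As a quick illustration of how it might be wired into a job, here is a minimal, hypothetical sketch of a Cascading 2.x flow that copies an RCFile-backed Hive table to another RCFile location using this scheme. The example class name, the HDFS paths, the flow name, and the three-column schema are made up for illustration; the sketch assumes the standard Cascading Hadoop classes (Hfs, Pipe, HadoopFlowConnector, SinkMode, AppProps) are on the classpath alongside the cascading.hive package shown above.

package cascading.hive.examples;

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.hive.RCFile;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;

/** Hypothetical usage sketch: copy an RCFile-backed Hive table to a new RCFile location. */
public class RCFileCopyExample {

    public static void main(String[] args) {
        // Describe the table with a Hive-style schema string; the column names become Cascading fields.
        RCFile scheme = new RCFile("uid BIGINT, name STRING, description STRING");

        // Example input and output locations; replace with real HDFS paths.
        Tap source = new Hfs(scheme, "/user/hive/warehouse/users");
        Tap sink = new Hfs(scheme, "/tmp/users_copy", SinkMode.REPLACE);

        // A pass-through pipe: every tuple read from the source is written unchanged to the sink.
        Pipe pipe = new Pipe("copy-users");

        Properties properties = new Properties();
        AppProps.setApplicationJarClass(properties, RCFileCopyExample.class);

        Flow flow = new HadoopFlowConnector(properties).connect("rcfile-copy", source, sink, pipe);
        flow.complete();
    }
}

Because the sink side writes RCFile as well, the copied data remains directly queryable by Hive once a table is declared over the output path; passing a comma separated id string such as "0,2" as the second constructor argument would instead restrict the read to those columns.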