Java tutorial: a custom Hadoop InputSplit (Cubert's RubixInputSplit)
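The class below, from LinkedIn Cubert's rubix storage layer, shows the three jobs a custom Hadoop InputSplit has to do: report its length, report its preferred host locations, and serialize itself as a Writable so the framework can ship it to task nodes. It also implements Configurable so that the job Configuration is injected before deserialization, which the key deserializer depends on.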
/*
 * (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package com.linkedin.cubert.io.rubix;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapreduce.InputSplit;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.io.BlockSerializationType;
import com.linkedin.cubert.utils.ClassCache;

/**
 * An InputSplit over a single block of a rubix file. The split is Writable so the
 * framework can ship it to task nodes, and Configurable so that a Configuration is
 * injected before readFields() runs (building the key deserializer requires it).
 */
public class RubixInputSplit<K, V> extends InputSplit implements Writable, Configurable
{
    /* Maximum number of preferred hosts reported for this split */
    final static int MAX_LOCATIONS = 5;

    private K key;
    private Path filename;
    private long offset;
    private long length;
    private long blockId;
    private long numRecords;
    private Class<K> keyClass;
    private Class<V> valueClass;
    private Serializer<K> keySerializer;
    private Configuration conf;
    private BlockSchema schema;
    private BlockSerializationType blockSerializationType;

    /* Lazily computed in getLocations() */
    private String[] hostnames = null;

    public RubixInputSplit()
    {
        // No-arg constructor required for Writable deserialization
    }

    public RubixInputSplit(Configuration conf,
                           Path filename,
                           K key,
                           long offset,
                           long length,
                           long blockId,
                           long numRecords,
                           Class<K> keyClass,
                           Class<V> valueClass,
                           BlockSchema schema,
                           BlockSerializationType blockSerializationType)
    {
        this.conf = conf;
        this.key = key;
        this.filename = filename;
        this.offset = offset;
        this.length = length;
        this.blockId = blockId;
        this.numRecords = numRecords;
        this.keyClass = keyClass;
        this.valueClass = valueClass;
        this.schema = schema;
        this.blockSerializationType = blockSerializationType;

        SerializationFactory serializationFactory = new SerializationFactory(conf);
        keySerializer = serializationFactory.getSerializer(keyClass);
    }

    @Override
    public long getLength() throws IOException, InterruptedException
    {
        return length;
    }

    @Override
    public String[] getLocations() throws IOException, InterruptedException
    {
        if (hostnames == null)
        {
            /* Obtain the FileSystem object and the FileStatus for the split's file */
            FileSystem fileSystem = FileSystem.get(conf);
            FileStatus fileStatus = fileSystem.getFileStatus(filename);

            /*
             * Obtain the block locations for the split. This also provides the offset
             * and length information for each block.
             */
            final BlockLocation[] blockLocations =
                    fileSystem.getFileBlockLocations(fileStatus, offset, length);

            /*
             * Collect all hosts in a map and accumulate the number of bytes each host
             * would serve for this split.
             */
            Map<String, Long> hostMap = new HashMap<String, Long>();
            for (BlockLocation bl : blockLocations)
            {
                /* Clip the block's range to the split's [offset, offset + length) range */
                final long start = Math.max(bl.getOffset(), offset);
                final long end = Math.min(offset + length, bl.getOffset() + bl.getLength());
                final long nRelevantBytes = end - start;

                for (String host : bl.getHosts())
                {
                    Long soFar = hostMap.get(host);
                    hostMap.put(host, (soFar == null ? 0L : soFar) + nRelevantBytes);
                }
            }

            /* Sort the hosts in decreasing order of relevant bytes */
            final Set<Map.Entry<String, Long>> entries = hostMap.entrySet();
            @SuppressWarnings("unchecked")
            final Map.Entry<String, Long>[] hostLengthPairs =
                    entries.toArray(new Map.Entry[entries.size()]);
            Arrays.sort(hostLengthPairs, new Comparator<Map.Entry<String, Long>>()
            {
                @Override
                public int compare(Map.Entry<String, Long> e1, Map.Entry<String, Long> e2)
                {
                    /* compareTo avoids the int overflow of casting a long difference */
                    return e2.getValue().compareTo(e1.getValue());
                }
            });

            /* Populate the hostnames array with at most MAX_LOCATIONS of the best hosts */
            final int nHost = Math.min(hostLengthPairs.length, MAX_LOCATIONS);
            hostnames = new String[nHost];
            for (int i = 0; i < nHost; ++i)
            {
                hostnames[i] = hostLengthPairs[i].getKey();
            }
        }
        return hostnames;
    }

    public K getKey()
    {
        return key;
    }

    public Path getFilename()
    {
        return filename;
    }

    public long getOffset()
    {
        return offset;
    }

    public Class<V> getValueClass()
    {
        return valueClass;
    }

    public BlockSchema getSchema()
    {
        return schema;
    }

    public long getBlockId()
    {
        return blockId;
    }

    public long getNumRecords()
    {
        return numRecords;
    }

    public BlockSerializationType getBlockSerializationType()
    {
        return blockSerializationType;
    }

    @Override
    public void write(DataOutput out) throws IOException
    {
        /* Serialize the key into a byte array and write it length-prefixed */
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        keySerializer.open(bos);
        keySerializer.serialize(key);
        byte[] keyBytes = bos.toByteArray();
        out.writeInt(keyBytes.length);
        out.write(keyBytes);

        out.writeUTF(filename.toString());
        out.writeLong(offset);
        out.writeLong(length);
        out.writeLong(blockId);
        out.writeLong(numRecords);
        out.writeUTF(keyClass.getName());
        out.writeUTF(valueClass.getName());

        /* The schema travels as JSON text; the serialization type as its ordinal */
        ObjectMapper mapper = new ObjectMapper();
        out.writeUTF(mapper.writeValueAsString(schema.toJson()));
        out.writeInt(blockSerializationType.ordinal());
    }

    @SuppressWarnings("unchecked")
    @Override
    public void readFields(DataInput in) throws IOException
    {
        /* Read the length-prefixed key bytes; the key itself is decoded below */
        int keyBytesLen = in.readInt();
        byte[] keyBytes = new byte[keyBytesLen];
        in.readFully(keyBytes, 0, keyBytesLen);

        filename = new Path(in.readUTF());
        offset = in.readLong();
        length = in.readLong();
        blockId = in.readLong();
        numRecords = in.readLong();

        try
        {
            keyClass = (Class<K>) ClassCache.forName(in.readUTF());
            valueClass = (Class<V>) ClassCache.forName(in.readUTF());

            /* Needs conf, which the framework injects via setConf() before this call */
            SerializationFactory serializationFactory = new SerializationFactory(conf);
            Deserializer<K> keyDeserializer =
                    serializationFactory.getDeserializer(keyClass);
            ByteArrayInputStream bis = new ByteArrayInputStream(keyBytes);
            keyDeserializer.open(bis);
            key = keyDeserializer.deserialize(null);

            ObjectMapper mapper = new ObjectMapper();
            schema = new BlockSchema(mapper.readValue(in.readUTF(), JsonNode.class));
            blockSerializationType = BlockSerializationType.values()[in.readInt()];
        }
        catch (ClassNotFoundException e)
        {
            /* Surface the failure instead of swallowing it with printStackTrace() */
            throw new IOException(e);
        }
    }

    @Override
    public void setConf(Configuration conf)
    {
        this.conf = conf;
    }

    @Override
    public Configuration getConf()
    {
        /* Return the stored configuration (the original auto-generated stub returned null) */
        return conf;
    }

    @Override
    public String toString()
    {
        return String.format("RubixInputSplit [key=%s, filename=%s, offset=%s, length=%s]",
                             key, filename, offset, length);
    }
}
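To make the Writable plumbing concrete, here is a minimal sketch of the round trip the framework performs when shipping a split to a task. The roundTrip helper is hypothetical (it is not part of Cubert), and it assumes an already-constructed RubixInputSplit; DataOutputBuffer and DataInputBuffer are Hadoop's standard in-memory buffer utilities.

package com.linkedin.cubert.io.rubix;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class RubixInputSplitRoundTrip
{
    /* Hypothetical helper: serialize a split and read it back, as the framework would */
    public static <K, V> RubixInputSplit<K, V> roundTrip(RubixInputSplit<K, V> split,
                                                         Configuration conf)
            throws IOException
    {
        DataOutputBuffer out = new DataOutputBuffer();
        /* Writes key bytes, path, offsets, class names, schema JSON, serialization type */
        split.write(out);

        RubixInputSplit<K, V> copy = new RubixInputSplit<K, V>();
        /* Must precede readFields(): the key Deserializer is built from the Configuration */
        copy.setConf(conf);

        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        /* Reconstruct the split on the "task" side */
        copy.readFields(in);
        return copy;
    }
}

The ordering in the sketch is the point: setConf() must run before readFields(), because readFields() builds a SerializationFactory from the Configuration to decode the key. This is exactly why the class implements Configurable, so the MapReduce framework can inject the job Configuration into a freshly instantiated split before asking it to deserialize itself.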