Java tutorial
/*
 * Copyright (C) 2012-2013 Unister GmbH
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package com.unister.semweb.drums.bucket.hashfunction;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.io.IOUtils;

import com.unister.semweb.drums.bucket.hashfunction.util.RangeHashSorter;
import com.unister.semweb.drums.storable.AbstractKVStorable;
import com.unister.semweb.drums.util.ByteArrayComparator;
import com.unister.semweb.drums.util.KeyUtils;

/**
 * This hashFunction maps an element to a specific range. The ranges are not overlapping. It is not needed, that the
 * ranges are consecutive.
 * <p>
 * Each range is described by its maximal key ({@link #getUpperBound(int)}) and by the name of the file it is stored
 * in ({@link #getFilename(int)}). The function can be written to and read from a tab-separated, human-readable text
 * format, see {@link #store(OutputStream)} and {@link #load(InputStream)}.
 *
 * @author Martin Nettling
 */
public class RangeHashFunction extends AbstractHashFunction {
    private static final long serialVersionUID = 4288827206276176844L;

    /**
     * Maps a (lower-case) header code to the number of bytes it stands for. Built once instead of on every call of
     * {@link #stringToByteCount(String)}.
     */
    private static final Map<String, Integer> BYTE_COUNT_BY_CODE = createByteCountByCode();

    /** the file where the hashfunction is stored human-readable */
    private String hashFunctionFile;

    /** the key composition (size in bytes of each part of the key). E.g. 2 4 2 8 or char int char long */
    private int keyComposition[];

    /** the maximal key of each range; index-aligned with {@link #filenames} and {@link #bucketIds} */
    private byte[][] maxRangeValues;

    /** the bucket id of each range; ranges that share a file name share a bucket id */
    private int[] bucketIds;

    /** the file name of each range */
    private String[] filenames;

    /**
     * This constructor instantiates a new {@link RangeHashFunction} with the given number of ranges. It tries to size
     * all ranges equally between the smallest and the largest key.
     *
     * @param minKey
     *            the smallest expected key
     * @param maxKey
     *            the largest expected key
     * @param ranges
     *            the number of ranges
     * @param hashFunctionFilename
     *            the filename of the file, where to store the hash-function
     */
    public RangeHashFunction(byte[] minKey, byte[] maxKey, int ranges, String hashFunctionFilename) {
        this.hashFunctionFile = hashFunctionFilename;
        this.buckets = ranges;
        this.initHashFunction(minKey, maxKey, ranges);
    }

    /**
     * This constructor instantiates a new {@link RangeHashFunction} with the given number of ranges. It tries to size
     * all ranges equally within the complete available space of numbers.
     *
     * @param ranges
     *            the number of ranges
     * @param keySize
     *            the size in bytes of the key
     * @param hashFunctionFilename
     *            the filename of the file, where to store the hash-function
     */
    public RangeHashFunction(int ranges, int keySize, String hashFunctionFilename) {
        this.hashFunctionFile = hashFunctionFilename;
        this.buckets = ranges;
        // smallest key: all bytes 0x00 (array default); largest key: all bytes 0xFF
        byte[] min = new byte[keySize];
        byte[] max = new byte[keySize];
        Arrays.fill(max, (byte) -1);
        this.initHashFunction(min, max, ranges);
    }

    /**
     * Initializes range bounds, default file names ("0.db", "1.db", ...) and a byte-wise key composition, then sorts
     * the ranges. The range bounds are delegated to {@link KeyUtils#getMaxValsPerRange}.
     */
    private void initHashFunction(byte[] minKey, byte[] maxKey, int ranges) {
        this.maxRangeValues = KeyUtils.getMaxValsPerRange(minKey, maxKey, ranges);
        this.filenames = new String[ranges];
        for (int i = 0; i < ranges; i++) {
            filenames[i] = i + ".db";
        }
        // without further knowledge every byte of the key is treated as its own part
        this.keyComposition = new int[minKey.length];
        Arrays.fill(keyComposition, 1);
        sort();
    }

    /**
     * This method instantiates a new {@link RangeHashFunction} by the given rangeValues. The given array should contain
     * only the maximal allowed value per bucket. The minimal value will be the direct successor of the previous maximal
     * value. Remember: the array will be handled circular.
     * <p>
     * The given arrays are used directly (not copied) and are sorted by this constructor.
     *
     * @param rangeValues
     *            the maximum keys for all buckets; must contain at least one key
     * @param filenames
     *            the filenames for all buckets
     * @param hashFunctionFilename
     *            the file name of the range hash function
     * @throws IllegalArgumentException
     *             if <code>rangeValues</code> is empty
     */
    public RangeHashFunction(byte[][] rangeValues, String[] filenames, String hashFunctionFilename) {
        if (rangeValues.length == 0) {
            throw new IllegalArgumentException("At least one range value is needed.");
        }
        this.hashFunctionFile = hashFunctionFilename;
        this.buckets = rangeValues.length;
        this.maxRangeValues = rangeValues;
        this.filenames = filenames;
        this.keyComposition = new int[rangeValues[0].length];
        Arrays.fill(keyComposition, 1);
        sort();
    }

    /**
     * This method instantiates a new {@link RangeHashFunction} by the given {@link File}. The file must have the
     * format written by {@link #store(OutputStream)}: a header line followed by one line per range, containing the
     * maximal key of the range and its file name. The minimal value of a range will be the direct successor of the
     * previous maximal value. Remember: the array will be handled circular.
     *
     * @param file
     *            the file, which contains the maximal keys
     * @throws IOException
     *             if the file can not be read or its content can not be decoded
     */
    public RangeHashFunction(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            load(in);
        } finally {
            // load(...) does not close the stream, so it has to be released here
            in.close();
        }
        this.hashFunctionFile = file.getAbsolutePath();
    }

    /** Sorts the max range values together with the corresponding file names and regenerates the bucket ids. */
    private void sort() {
        // NOTE(review): RangeHashSorter is expected to reorder both arrays in place; the result is not read back.
        RangeHashSorter sortMachine = new RangeHashSorter(maxRangeValues, filenames);
        sortMachine.quickSort();
        generateBucketIds();
    }

    /**
     * Returns the File, where the HashFunction is stored human-readable
     *
     * @return the name of the file
     */
    public String getHashFunctionFile() {
        return this.hashFunctionFile;
    }

    /**
     * generates the correct index structure, namely the bucketIds to the already initialized filenames and
     * maxRangeValues
     */
    private void generateBucketIds() {
        // generate indexes for buckets, needed if two different ranges belong to the same file
        this.buckets = 0;
        bucketIds = new int[filenames.length];
        HashMap<String, Integer> tmpSeenFilenames = new HashMap<String, Integer>();
        for (int i = 0; i < filenames.length; i++) {
            if (!tmpSeenFilenames.containsKey(filenames[i])) {
                // "buckets" temporarily serves as the counter for the next unused bucket id
                tmpSeenFilenames.put(filenames[i], this.buckets++);
            }
            bucketIds[i] = tmpSeenFilenames.get(filenames[i]);
        }
        // finally the number of buckets is set to the number of ranges (not to the number of distinct files)
        this.buckets = bucketIds.length;
    }

    /**
     * @param bucketId
     *            the index of the range
     * @return the maximal key in the bucket with the given bucketId.
     */
    public byte[] getUpperBound(int bucketId) {
        return maxRangeValues[bucketId];
    }

    /** Determines the bucket id to the given <code>key</code>. */
    @Override
    public int getBucketId(byte[] key) {
        int index = searchBucketIndex(key, 0, maxRangeValues.length - 1);
        return bucketIds[index];
    }

    /**
     * Searches for the given <code>key</code> in {@link #maxRangeValues} and returns the index of the corresponding
     * range. Remember: this may not be the bucketId
     *
     * @param key
     *            the key to look for
     * @param leftIndex
     *            the index of the first range to consider
     * @param rightIndex
     *            the index of the last range to consider (inclusive)
     * @return the index of the first range whose maximal key is not smaller than <code>key</code>, or 0 if the key
     *         is larger than the maximal key at <code>rightIndex</code> (the ranges are handled circular)
     */
    protected int searchBucketIndex(byte[] key, int leftIndex, int rightIndex) {
        if (KeyUtils.compareKey(key, maxRangeValues[rightIndex]) > 0) {
            return 0;
        }
        // The upper bound of Arrays.binarySearch is exclusive, so the range at rightIndex is not inspected. That is
        // fine: the key is known to be <= maxRangeValues[rightIndex], hence the insertion point is at most rightIndex.
        int idx = Arrays.binarySearch(maxRangeValues, leftIndex, rightIndex, key, new ByteArrayComparator());
        idx = idx < 0 ? -idx - 1 : idx;
        if (idx > rightIndex) {
            // defensive check kept from the original implementation
            return -1;
        } else {
            return idx;
        }
    }

    /** Gets the bucket id from the given data. */
    @Override
    public int getBucketId(AbstractKVStorable key) {
        return getBucketId(key.getKey());
    }

    /** Get the file name of the given bucket. */
    @Override
    public String getFilename(int bucketId) {
        return filenames[bucketId];
    }

    /**
     * Returns the hash function in the same tab-separated format that {@link #store(OutputStream)} writes: a header
     * line followed by one line per range.
     */
    @Override
    public String toString() {
        return toTabSeparatedString();
    }

    /**
     * Builds the human-readable representation: a header line ("b" per key byte, then "filename") followed by one
     * line per range. Every line is terminated by a line feed.
     */
    private String toTabSeparatedString() {
        StringBuilder sb = new StringBuilder();
        int keyLength = maxRangeValues.length == 0 ? 0 : maxRangeValues[0].length;
        for (int i = 0; i < keyLength; i++) {
            sb.append('b').append('\t');
        }
        sb.append("filename").append('\t').append('\n');
        for (int i = 0; i < maxRangeValues.length; i++) {
            sb.append(makeOneLine(maxRangeValues[i], filenames[i])).append('\n');
        }
        return sb.toString();
    }

    /**
     * Concatenates the given range value and the file name to one string. It is used to write the hash function file.
     * Each byte is written as its signed decimal value, the columns are separated by tabs.
     */
    private String makeOneLine(byte[] value, String filename) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < value.length; i++) {
            sb.append(value[i]).append('\t');
        }
        sb.append(filename);
        return sb.toString();
    }

    /**
     * Looks up a range by its file name.
     *
     * @param dbFilename
     *            the file name to look for
     * @return the index of the first range that is stored in the given file, or -1 if there is no such range
     */
    @Override
    public int getBucketId(String dbFilename) {
        for (int i = 0; i < filenames.length; i++) {
            if (filenames[i].equals(dbFilename)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Replaces one bucket line within the {@link RangeHashFunction} with the lines given. The new ranges get file
     * names derived from the file name of the replaced range (see {@link #generateFileName(int, String)}). Afterwards
     * the ranges are sorted again.
     *
     * @param bucketId
     *            the index of the range to replace
     * @param keysToInsert
     *            the maximal keys of the ranges that replace the old one
     * @throws IllegalArgumentException
     *             if <code>bucketId</code> is not a valid index
     */
    public void replace(int bucketId, byte[][] keysToInsert) {
        if (bucketId < 0 || bucketId >= maxRangeValues.length) {
            throw new IllegalArgumentException("Invalid bucketId: " + bucketId);
        }
        int oldSize = this.getNumberOfBuckets();
        int keptRanges = oldSize - 1;
        int newSize = keptRanges + keysToInsert.length;
        String replacedFilename = this.getFilename(bucketId);

        byte[][] newMaxRangeValues = new byte[newSize][];
        String[] newFileNames = new String[newSize];

        // first all ranges that stay untouched ...
        int k = 0;
        for (int i = 0; i < oldSize; i++) {
            if (i != bucketId) {
                newMaxRangeValues[k] = this.getUpperBound(i);
                newFileNames[k] = this.getFilename(i);
                k++;
            }
        }
        // ... then the new ranges; the order does not matter, because everything is sorted afterwards
        for (int i = keptRanges; i < newSize; i++) {
            k = i - keptRanges;
            newMaxRangeValues[i] = keysToInsert[k];
            newFileNames[i] = generateFileName(k, replacedFilename);
        }
        this.maxRangeValues = newMaxRangeValues;
        this.filenames = newFileNames;
        sort();
    }

    /**
     * @return the ranges of this hash function. The internal array is returned, not a copy.
     */
    public byte[][] getRanges() {
        return this.maxRangeValues;
    }

    /**
     * generates a new filename for a subbucket from the given oldName. The index of the subbucket is inserted in
     * front of the file extension, e.g. "1.db" becomes "1_0.db". A dot within a directory name is not taken for the
     * begin of an extension.
     *
     * @param subBucket
     *            the index of the subbucket
     * @param oldName
     *            the file name of the bucket that is split
     * @return the file name for the subbucket
     */
    protected String generateFileName(int subBucket, String oldName) {
        int dotPos = oldName.lastIndexOf(".");
        int slashPos = Math.max(oldName.lastIndexOf("/"), oldName.lastIndexOf("\\"));
        String prefix;
        String suffix;
        if (dotPos > slashPos) {
            prefix = oldName.substring(0, dotPos);
            suffix = oldName.substring(dotPos);
        } else {
            prefix = oldName;
            suffix = "";
        }
        return prefix + "_" + subBucket + suffix;
    }

    /**
     * Makes a copy of the current {@link RangeHashFunction}. The ranges and the file names are copied, so the copy
     * does not share its arrays with this instance. <b>Note: the file name is also copied. Make sure that you
     * don't overwrite the file if you change one of the functions.</b>
     *
     * @return a copy of this {@link RangeHashFunction}
     */
    public RangeHashFunction copy() {
        byte[][] rangesCopy = new byte[maxRangeValues.length][];
        for (int i = 0; i < maxRangeValues.length; i++) {
            rangesCopy[i] = maxRangeValues[i].clone();
        }
        return new RangeHashFunction(rangesCopy, filenames.clone(), hashFunctionFile);
    }

    /**
     * The header could contain characters which are not numbers. Some of them can be translated into bytes. E.g.
     * char would be two byte. The lookup is case-insensitive.
     *
     * @param code
     *            the code to look for
     * @return the size of the given code in bytes, or 0 if the code is unknown (or <code>null</code>)
     */
    public static int stringToByteCount(String code) {
        if (code == null) {
            return 0;
        }
        // Locale.ROOT keeps the lookup independent of the default locale (e.g. the Turkish dotless i)
        Integer byteCount = BYTE_COUNT_BY_CODE.get(code.toLowerCase(Locale.ROOT));
        return byteCount == null ? 0 : byteCount.intValue();
    }

    /** Creates the immutable lookup table used by {@link #stringToByteCount(String)}. */
    private static Map<String, Integer> createByteCountByCode() {
        Map<String, Integer> codes = new HashMap<String, Integer>();
        codes.put("b", 1);
        codes.put("byte", 1);
        codes.put("bool", 1);
        codes.put("boolean", 1);
        codes.put("c", 2);
        codes.put("char", 2);
        codes.put("character", 2);
        codes.put("i", 4);
        codes.put("int", 4);
        codes.put("integer", 4);
        codes.put("f", 4);
        codes.put("float", 4);
        codes.put("d", 8);
        codes.put("double", 8);
        codes.put("l", 8);
        codes.put("long", 8);
        codes.put("1", 1);
        codes.put("2", 2);
        codes.put("3", 3);
        codes.put("4", 4);
        codes.put("5", 5);
        codes.put("6", 6);
        codes.put("7", 7);
        codes.put("8", 8);
        return Collections.unmodifiableMap(codes);
    }

    /**
     * Writes the hash function, represented as tuples (range, filename) into the file that is linked with the
     * HashFunction. The content of the file is overwritten.
     *
     * @throws IOException
     *             if the file can not be opened or written
     */
    public void writeToFile() throws IOException {
        FileOutputStream fos = new FileOutputStream(new File(this.hashFunctionFile));
        try {
            store(fos);
        } finally {
            // store(...) already closes the stream; closing it again is harmless and covers early failures
            fos.close();
        }
    }

    /**
     * Writes the hash function in its tab-separated, human-readable format to the given stream. The stream is closed
     * afterwards, even if writing fails. The text is encoded with the platform's default charset.
     *
     * @param os
     *            the stream to write to
     * @throws IOException
     *             if writing fails
     */
    @Override
    public void store(OutputStream os) throws IOException {
        try {
            os.write(toTabSeparatedString().getBytes());
        } finally {
            os.close();
        }
    }

    /**
     * Reads the hash function from the given stream. The expected format is the one written by
     * {@link #store(OutputStream)}: a tab-separated header line, which describes the composition of the key (see
     * {@link #stringToByteCount(String)}) and ends with the column for the file name, followed by one line per range.
     * The stream is not closed by this method. The state of this object is only changed if the complete input could
     * be decoded.
     *
     * @param in
     *            the stream to read from
     * @throws IOException
     *             if the stream can not be read, is empty, or contains a header or a line that can not be decoded
     */
    @Override
    public void load(InputStream in) throws IOException {
        List<String> readData = IOUtils.readLines(in);
        if (readData.isEmpty()) {
            throw new IOException("Header could not be read. The input is empty.");
        }

        // analyze header; its last column belongs to the file name
        String[] header = readData.get(0).split("\t");
        int keySize = 0;
        int[] newKeyComposition = new int[header.length - 1];
        for (int i = 0; i < newKeyComposition.length; i++) {
            int e = stringToByteCount(header[i]);
            if (e == 0) {
                throw new IOException("Header could not be read. Could not decode " + header[i]);
            }
            newKeyComposition[i] = e;
            keySize += e;
        }

        int numberOfRanges = readData.size() - 1;
        byte[][] newMaxRangeValues = new byte[numberOfRanges][];
        String[] newFilenames = new String[numberOfRanges];
        for (int i = 0; i < numberOfRanges; i++) {
            String line = readData.get(i + 1);
            String[] columns = line.split("\t");
            if (columns.length <= newKeyComposition.length) {
                throw new IOException("Line " + (i + 2) + " could not be read. Expected "
                        + (newKeyComposition.length + 1) + " columns, but found " + columns.length + ": " + line);
            }
            newMaxRangeValues[i] = new byte[keySize];
            // we need an offset for the current part of the key
            int keyPartOffset = -1;
            for (int k = 0; k < newKeyComposition.length; k++) {
                long tmp;
                try {
                    tmp = Long.parseLong(columns[k]);
                } catch (NumberFormatException ex) {
                    throw new IOException("Line " + (i + 2) + " could not be read. Could not decode " + columns[k],
                            ex);
                }
                // set the offset on the last byte of the current part of the key
                keyPartOffset += newKeyComposition[k];
                // start from the lowest bits of the read long value and use them for the last byte (= lowest byte) of
                // the current part of the key. Then take the next bits and the second lowest byte
                for (int b = 0; b < newKeyComposition[k]; b++) {
                    newMaxRangeValues[i][keyPartOffset - b] = (byte) tmp;
                    tmp = tmp >> 8;
                }
            }
            newFilenames[i] = columns[newKeyComposition.length];
        }

        this.keyComposition = newKeyComposition;
        this.maxRangeValues = newMaxRangeValues;
        this.filenames = newFilenames;
        // sort() also regenerates the bucket ids
        this.sort();
    }
}