Java tutorial
package com.yahoo.glimmer.util; /* * Copyright (c) 2012 Yahoo! Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software distributed under the License is * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and limitations under the License. * See accompanying LICENSE file. */ import it.unimi.dsi.big.util.LongBigListSignedStringMap; import it.unimi.dsi.bits.TransformationStrategies; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.SafelyCloseable; import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.sux4j.mph.AbstractHashFunction; import it.unimi.dsi.sux4j.mph.HollowTrieMonotoneMinimalPerfectHashFunction; import java.io.DataOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.SequenceInputStream; import java.nio.charset.Charset; import java.util.AbstractCollection; import java.util.Arrays; import java.util.Iterator; import java.util.NoSuchElementException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.Switch; import com.martiansoftware.jsap.UnflaggedOption; import com.martiansoftware.jsap.stringparsers.ForNameStringParser; public class ComputeHashTool extends Configured implements Tool { private final static Logger LOGGER = Logger.getLogger(ComputeHashTool.class); private static final String SRC_FILES_ARG = "srcFilenames"; private static final String SIGNED_ARG = "signed"; private static final String UNSIGNED_ARG = "unsigned"; private static final String WRITE_INFO_ARG = "info"; private static final String NUMBER_OF_ELEMENTS_ARG = "numElements"; private static final String FILE_ENCODING_ARG = "encoding"; public static final FsPermission ALL_PERMISSIONS = new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL); private static final String DOT_UNSIGNED = ".map"; private static final String DOT_SIGNED = ".smap"; private static final String DOT_MAPINFO = ".mapinfo"; @Override public int run(String[] args) throws Exception { final SimpleJSAP jsap = new SimpleJSAP(ComputeHashTool.class.getName(), "Builds a hash function.", new Parameter[] { new Switch(SIGNED_ARG, SIGNED_ARG.charAt(0), SIGNED_ARG, "Generate signed hashes."), new Switch(UNSIGNED_ARG, UNSIGNED_ARG.charAt(0), UNSIGNED_ARG, "Generate unsiged hashes."), new Switch(WRITE_INFO_ARG, WRITE_INFO_ARG.charAt(0), WRITE_INFO_ARG, "Write a .info tab seperated text file with size/width info in."), new FlaggedOption(NUMBER_OF_ELEMENTS_ARG, JSAP.LONG_PARSER, null, JSAP.NOT_REQUIRED, NUMBER_OF_ELEMENTS_ARG.charAt(0), NUMBER_OF_ELEMENTS_ARG, "Sign the hash with a hash width of w bits."), new FlaggedOption(FILE_ENCODING_ARG, ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, FILE_ENCODING_ARG.charAt(0), FILE_ENCODING_ARG, "Set the input file encoding(default is UTF-8)."), new UnflaggedOption(SRC_FILES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The filenames (or HDFS dirs if building hashes) to work with.") }); JSAPResult jsapResult = jsap.parse(args); if (jsap.messagePrinted()) { throw new IllegalArgumentException(""); } String[] srcFilenames = jsapResult.getStringArray(SRC_FILES_ARG); Long numElements = null; if (jsapResult.contains(NUMBER_OF_ELEMENTS_ARG)) { numElements = jsapResult.getLong(NUMBER_OF_ELEMENTS_ARG); } boolean generateUnsigned = true; boolean generateSigned = false; Charset srcFileCharset = (Charset) jsapResult.getObject(FILE_ENCODING_ARG); if (jsapResult.getBoolean(SIGNED_ARG)) { generateSigned = true; if (jsapResult.getBoolean(UNSIGNED_ARG)) { LOGGER.info("Building unsigned and signed hashes for " + srcFileCharset.displayName() + " files:" + Arrays.toString(srcFilenames)); } else { LOGGER.info("Building signed hashes for " + srcFileCharset.displayName() + " files:" + Arrays.toString(srcFilenames)); generateUnsigned = false; } } else { LOGGER.info("Building unsigned hashes for " + srcFileCharset.displayName() + " files:" + srcFilenames); } Configuration conf = getConf(); // This need to be set if you want to read from a local HDFS system. //conf.set("fs.default.name","hdfs://127.0.0.1:9000/"); JobConf job = new JobConf(conf, ComputeHashTool.class); FileSystem fs = FileSystem.get(job); for (String srcFilename : srcFilenames) { LOGGER.info("Building hash of " + srcFilename); buildHash(fs, srcFilename, numElements, generateUnsigned, generateSigned, srcFileCharset, jsapResult.getBoolean(WRITE_INFO_ARG, false)); } return 0; } public long buildHash(FileSystem fs, String srcFilename, Long numElements, boolean generateUnsigned, boolean generateSigned, final Charset charset, boolean writeInfoFile) throws IOException, ClassNotFoundException { final MapReducePartInputStreamEnumeration inputStreamEnumeration; try { inputStreamEnumeration = new MapReducePartInputStreamEnumeration(fs, new Path(srcFilename)); } catch (IOException e) { throw new RuntimeException("Failed to open " + srcFilename, e); } LineReaderCollection inCollection = new LineReaderCollection(new LineReaderCollection.ReaderFactory() { @Override public Reader newReader() { inputStreamEnumeration.reset(); return new InputStreamReader(new SequenceInputStream(inputStreamEnumeration), charset); } }); String destFilename = inputStreamEnumeration.removeCompressionSuffixIfAny(srcFilename); Path unsigendPath = new Path(destFilename + DOT_UNSIGNED); HollowTrieMonotoneMinimalPerfectHashFunction<CharSequence> unsignedHash; if (generateUnsigned) { // if (numElements != null) { // LOGGER.info("\tBuilding unsigned hash with given number of elements:" + numElements); // } else { // LOGGER.info("\tBuilding unsigned hash. Getting number of elements from collection..."); // long timeToGetSize = System.currentTimeMillis(); // numElements = inCollection.size64(); // timeToGetSize = System.currentTimeMillis() - timeToGetSize; // LOGGER.info("\tNumber of elements is " + numElements + " found in " + timeToGetSize / 1000 + " seconds"); // } // unsignedHash = new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(inCollection, numElements, TransformationStrategies.prefixFreeUtf16()); unsignedHash = new HollowTrieMonotoneMinimalPerfectHashFunction<CharSequence>(inCollection, TransformationStrategies.prefixFreeUtf32()); LOGGER.info("\tSaving unsigned hash as " + unsigendPath.toString()); writeMapToFile(unsignedHash, fs, unsigendPath); } else { LOGGER.info("\tLoading unsigned hash from " + unsigendPath.toString()); unsignedHash = (HollowTrieMonotoneMinimalPerfectHashFunction<CharSequence>) readMpHashFromFile(fs, unsigendPath); } if (generateSigned) { LOGGER.info("\tBuilding signed hash..."); // ShiftAddXorSignedStringMap signedHash = new ShiftAddXorSignedStringMap(inCollection.iterator(), unsignedHash, signatureWidth); Path signedPath = new Path(destFilename + DOT_SIGNED); DataOutputStream signedDataOutputStream = null; try { signedDataOutputStream = new DataOutputStream( new FastBufferedOutputStream(createOutputStream(fs, signedPath))); LongBigListSignedStringMap.sign(inCollection.iterator(), signedDataOutputStream, null); } finally { if (signedDataOutputStream != null) { signedDataOutputStream.close(); } } LOGGER.info("\tSaving signed hash as " + signedPath.toString()); } if (writeInfoFile) { Path infoPath = new Path(destFilename + DOT_MAPINFO); FSDataOutputStream infoStream = fs.create(infoPath, true);// overwrite fs.setPermission(infoPath, ALL_PERMISSIONS); OutputStreamWriter infoWriter = new OutputStreamWriter(infoStream); infoWriter.write("size\t"); infoWriter.write(Long.toString(unsignedHash.size64())); infoWriter.write("\n"); infoWriter.write("unsignedBits\t"); infoWriter.write(Long.toString((unsignedHash).numBits())); infoWriter.write("\n"); if (generateSigned) { infoWriter.write("signedWidth\t64\n"); } infoWriter.close(); infoStream.close(); } return unsignedHash.size64(); } private static OutputStream createOutputStream(FileSystem fs, Path path) throws IOException { FSDataOutputStream outStream = fs.create(path, true);// overwrite; fs.setPermission(path, ALL_PERMISSIONS); return outStream; } private static void writeMapToFile(AbstractObject2LongFunction<CharSequence> object, FileSystem fs, Path path) throws IOException { OutputStream outStream = createOutputStream(fs, path); try { ObjectOutputStream oOutStream = null; try { oOutStream = new ObjectOutputStream(outStream); oOutStream.writeObject(object); } finally { if (oOutStream != null) { oOutStream.close(); } } } finally { if (outStream != null) { outStream.close(); } } } @SuppressWarnings("unchecked") private static AbstractHashFunction<CharSequence> readMpHashFromFile(FileSystem fs, Path path) throws IOException, ClassNotFoundException { FSDataInputStream inStream = null; try { inStream = fs.open(path); ObjectInputStream oInStream = null; try { oInStream = new ObjectInputStream(inStream); Object object = oInStream.readObject(); return (AbstractHashFunction<CharSequence>) object; } finally { if (oInStream != null) { oInStream.close(); } } } finally { if (inStream != null) { inStream.close(); } } } public static void main(String[] args) throws Exception { int ret = ToolRunner.run(new ComputeHashTool(), args); System.exit(ret); } /** * Presents a Reader as a Collection of MutableStrings with each line read * from the Reader as an element of the Collection. * {@link LineReaderCollection.ReaderFactory.newReader} is called each time * {@link LineReaderCollection.iterator} is called. Only the current line * and next line are held in memory. * * Note that {@link LineReaderCollection.LineReaderIterator.next} always * returns the same instance of MutableString(but with different contents) * for each instance of {@link LineReaderCollection.LineReaderIterator}. * * @author tep * */ private static class LineReaderCollection extends AbstractCollection<MutableString> implements Size64 { private final ReaderFactory readerFactory; private long size = -1; public LineReaderCollection(ReaderFactory readerFactory) { this.readerFactory = readerFactory; } public interface ReaderFactory { public Reader newReader(); } private class LineReaderIterator implements Iterator<MutableString>, SafelyCloseable { private FastBufferedReader fbr; private MutableString current = new MutableString(); private MutableString next = new MutableString(); private boolean advance = true; public LineReaderIterator(Reader reader) { fbr = new FastBufferedReader(reader); } @Override public boolean hasNext() { if (fbr == null) { return false; } if (advance) { try { if (fbr.readLine(next) == null) { close(); return false; } } catch (IOException e) { throw new RuntimeException(e); } advance = false; } return true; } @Override public MutableString next() { if (advance) { if (!hasNext()) { throw new NoSuchElementException("Size is " + size); } } current.replace(next); advance = true; return current; } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public void close() throws IOException { // This gets called multiple times.. if (fbr != null) { fbr.close(); fbr = null; advance = false; } } } @Override public LineReaderIterator iterator() { return new LineReaderIterator(readerFactory.newReader()); } @Override public int size() { long size64 = size64(); if (size64 > Integer.MAX_VALUE) { throw new IndexOutOfBoundsException( "LineReaderCollection.size() called on a instance with more than Integer.MAX_VALUE elements. Use Size64.size64() instead."); } return (int) size64; } @Override public long size64() { if (size == -1l) { LineReaderIterator i = iterator(); size = 0l; while (i.hasNext()) { size++; i.next(); } try { i.close(); } catch (IOException e) { throw new RuntimeException(e); } } return size; } } }