com.yahoo.glimmer.util.ComputeHashTool.java Source code

Java tutorial

Introduction

Here is the source code for com.yahoo.glimmer.util.ComputeHashTool.java

Source

package com.yahoo.glimmer.util;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software distributed under the License is 
 *  distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and limitations under the License.
 *  See accompanying LICENSE file.
 */

import it.unimi.dsi.big.util.LongBigListSignedStringMap;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.SafelyCloseable;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.sux4j.mph.AbstractHashFunction;
import it.unimi.dsi.sux4j.mph.HollowTrieMonotoneMinimalPerfectHashFunction;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.SequenceInputStream;
import java.nio.charset.Charset;
import java.util.AbstractCollection;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;

public class ComputeHashTool extends Configured implements Tool {
    /** Log4j logger for this tool. */
    private final static Logger LOGGER = Logger.getLogger(ComputeHashTool.class);
    // JSAP parameter ids. For the switches and flagged options, run() also uses the id as the
    // long flag and its first character as the short flag.
    private static final String SRC_FILES_ARG = "srcFilenames";
    private static final String SIGNED_ARG = "signed";
    private static final String UNSIGNED_ARG = "unsigned";
    private static final String WRITE_INFO_ARG = "info";
    private static final String NUMBER_OF_ELEMENTS_ARG = "numElements";
    private static final String FILE_ENCODING_ARG = "encoding";
    /** Permission built from FsAction.ALL for all three constructor arguments; set on every file this tool creates. */
    public static final FsPermission ALL_PERMISSIONS = new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL);
    // Suffixes appended to the destination filename in buildHash() for the unsigned hash,
    // the signatures file and the optional info file respectively.
    private static final String DOT_UNSIGNED = ".map";
    private static final String DOT_SIGNED = ".smap";
    private static final String DOT_MAPINFO = ".mapinfo";

    /**
     * Parses the command line and builds the requested hashes for each source filename.
     *
     * Unsigned hashes are generated unless only the "signed" switch is given; signed hashes are
     * generated only when the "signed" switch is given.
     *
     * @param args the command line arguments, parsed with JSAP.
     * @return 0 once every source filename has been processed.
     * @throws IllegalArgumentException if JSAP printed a usage or error message while parsing.
     * @throws Exception any failure raised while building a hash.
     */
    @Override
    public int run(String[] args) throws Exception {
        final SimpleJSAP jsap = new SimpleJSAP(ComputeHashTool.class.getName(), "Builds a hash function.",
                new Parameter[] {
                        new Switch(SIGNED_ARG, SIGNED_ARG.charAt(0), SIGNED_ARG, "Generate signed hashes."),
                        new Switch(UNSIGNED_ARG, UNSIGNED_ARG.charAt(0), UNSIGNED_ARG, "Generate unsigned hashes."),
                        new Switch(WRITE_INFO_ARG, WRITE_INFO_ARG.charAt(0), WRITE_INFO_ARG,
                                "Write a .mapinfo tab separated text file with size/width info in."),
                        new FlaggedOption(NUMBER_OF_ELEMENTS_ARG, JSAP.LONG_PARSER, null, JSAP.NOT_REQUIRED,
                                NUMBER_OF_ELEMENTS_ARG.charAt(0), NUMBER_OF_ELEMENTS_ARG,
                                "The number of elements (lines) in the source files, if already known."),
                        new FlaggedOption(FILE_ENCODING_ARG, ForNameStringParser.getParser(Charset.class), "UTF-8",
                                JSAP.NOT_REQUIRED, FILE_ENCODING_ARG.charAt(0), FILE_ENCODING_ARG,
                                "Set the input file encoding(default is UTF-8)."),
                        new UnflaggedOption(SRC_FILES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
                                JSAP.GREEDY, "The filenames (or HDFS dirs if building hashes) to work with.") });

        JSAPResult jsapResult = jsap.parse(args);
        if (jsap.messagePrinted()) {
            throw new IllegalArgumentException("Command line not accepted (a usage or error message was printed).");
        }

        String[] srcFilenames = jsapResult.getStringArray(SRC_FILES_ARG);

        Long numElements = null;
        if (jsapResult.contains(NUMBER_OF_ELEMENTS_ARG)) {
            numElements = jsapResult.getLong(NUMBER_OF_ELEMENTS_ARG);
        }

        Charset srcFileCharset = (Charset) jsapResult.getObject(FILE_ENCODING_ARG);
        boolean generateSigned = jsapResult.getBoolean(SIGNED_ARG);
        // Unsigned is the default; it is only skipped when "signed" is requested on its own.
        boolean generateUnsigned = !generateSigned || jsapResult.getBoolean(UNSIGNED_ARG);

        String hashKinds;
        if (!generateSigned) {
            hashKinds = "unsigned";
        } else if (generateUnsigned) {
            hashKinds = "unsigned and signed";
        } else {
            hashKinds = "signed";
        }
        // Arrays.toString() so the filenames are logged rather than the array's identity.
        LOGGER.info("Building " + hashKinds + " hashes for " + srcFileCharset.displayName() + " files:"
                + Arrays.toString(srcFilenames));

        Configuration conf = getConf();
        // This need to be set if you want to read from a local HDFS system.
        //conf.set("fs.default.name","hdfs://127.0.0.1:9000/");
        JobConf job = new JobConf(conf, ComputeHashTool.class);
        FileSystem fs = FileSystem.get(job);
        boolean writeInfoFile = jsapResult.getBoolean(WRITE_INFO_ARG, false);
        for (String srcFilename : srcFilenames) {
            LOGGER.info("Building hash of " + srcFilename);
            buildHash(fs, srcFilename, numElements, generateUnsigned, generateSigned, srcFileCharset, writeInfoFile);
        }
        return 0;
    }

    /**
     * Builds (or loads) the unsigned hash for one source and optionally writes the signatures
     * file and an info file next to it.
     *
     * The output filenames are derived from the source filename (after
     * removeCompressionSuffixIfAny() has been applied) plus the ".map", ".smap" and ".mapinfo"
     * suffixes.
     *
     * @param fs the file system to read the source from and write the results to.
     * @param srcFilename the source file or directory; each line is one element.
     * @param numElements currently not used by this method.
     * @param generateUnsigned if true the unsigned hash is built and saved; if false it is loaded
     *            from the existing ".map" file instead.
     * @param generateSigned if true the signatures file is written.
     * @param charset the character set used to decode the source.
     * @param writeInfoFile if true a tab separated ".mapinfo" file is written.
     * @return the number of elements in the unsigned hash.
     * @throws IOException if reading or writing fails.
     * @throws ClassNotFoundException if an existing unsigned hash cannot be deserialized.
     * @throws RuntimeException if the source cannot be opened.
     */
    public long buildHash(FileSystem fs, String srcFilename, Long numElements, boolean generateUnsigned,
            boolean generateSigned, final Charset charset, boolean writeInfoFile)
            throws IOException, ClassNotFoundException {
        final MapReducePartInputStreamEnumeration inputStreamEnumeration;
        try {
            inputStreamEnumeration = new MapReducePartInputStreamEnumeration(fs, new Path(srcFilename));
        } catch (IOException e) {
            throw new RuntimeException("Failed to open " + srcFilename, e);
        }

        // Each call to iterator() on the collection re-reads the source from the start.
        LineReaderCollection inCollection = new LineReaderCollection(new LineReaderCollection.ReaderFactory() {
            @Override
            public Reader newReader() {
                inputStreamEnumeration.reset();
                return new InputStreamReader(new SequenceInputStream(inputStreamEnumeration), charset);
            }
        });

        String destFilename = inputStreamEnumeration.removeCompressionSuffixIfAny(srcFilename);
        Path unsignedPath = new Path(destFilename + DOT_UNSIGNED);

        HollowTrieMonotoneMinimalPerfectHashFunction<CharSequence> unsignedHash;
        if (generateUnsigned) {
            // The hollow trie constructor takes no element count, so numElements is not needed here.
            unsignedHash = new HollowTrieMonotoneMinimalPerfectHashFunction<CharSequence>(inCollection,
                    TransformationStrategies.prefixFreeUtf32());
            LOGGER.info("\tSaving unsigned hash as " + unsignedPath.toString());
            writeMapToFile(unsignedHash, fs, unsignedPath);
        } else {
            LOGGER.info("\tLoading unsigned hash from " + unsignedPath.toString());
            unsignedHash = (HollowTrieMonotoneMinimalPerfectHashFunction<CharSequence>) readMpHashFromFile(fs,
                    unsignedPath);
        }

        if (generateSigned) {
            LOGGER.info("\tBuilding signed hash...");
            Path signedPath = new Path(destFilename + DOT_SIGNED);
            // Log the destination before writing so the path is visible even if the write fails.
            LOGGER.info("\tSaving signed hash as " + signedPath.toString());
            DataOutputStream signedDataOutputStream = new DataOutputStream(
                    new FastBufferedOutputStream(createOutputStream(fs, signedPath)));
            try {
                LongBigListSignedStringMap.sign(inCollection.iterator(), signedDataOutputStream, null);
            } finally {
                signedDataOutputStream.close();
            }
        }

        if (writeInfoFile) {
            Path infoPath = new Path(destFilename + DOT_MAPINFO);
            // Explicit charset so the file does not depend on the platform default encoding.
            OutputStreamWriter infoWriter = new OutputStreamWriter(createOutputStream(fs, infoPath),
                    Charset.forName("UTF-8"));
            try {
                infoWriter.write("size\t");
                infoWriter.write(Long.toString(unsignedHash.size64()));
                infoWriter.write("\n");
                infoWriter.write("unsignedBits\t");
                infoWriter.write(Long.toString(unsignedHash.numBits()));
                infoWriter.write("\n");
                if (generateSigned) {
                    infoWriter.write("signedWidth\t64\n");
                }
            } finally {
                // Closing the writer flushes it and closes the underlying stream, even on failure.
                infoWriter.close();
            }
        }

        return unsignedHash.size64();
    }

    /**
     * Creates (overwriting any existing file) the file at the given path, sets
     * {@link #ALL_PERMISSIONS} on it and returns the stream to write to.
     *
     * @param fs the file system to create the file on.
     * @param path the file to create.
     * @return the open output stream; the caller is responsible for closing it.
     * @throws IOException if the file cannot be created or its permissions cannot be set.
     */
    private static OutputStream createOutputStream(FileSystem fs, Path path) throws IOException {
        FSDataOutputStream outStream = fs.create(path, true);// overwrite;
        boolean permissionSet = false;
        try {
            fs.setPermission(path, ALL_PERMISSIONS);
            permissionSet = true;
            return outStream;
        } finally {
            if (!permissionSet) {
                // Don't leak the freshly created stream when setting the permission fails.
                try {
                    outStream.close();
                } catch (IOException closeFailure) {
                    // Log rather than rethrow so the original failure is the one that propagates.
                    LOGGER.warn("Failed to close " + path + " after failing to set its permissions", closeFailure);
                }
            }
        }
    }

    /**
     * Serializes the given function to a newly created file using Java object serialization.
     *
     * @param object the function to serialize.
     * @param fs the file system to write to.
     * @param path the file to create (overwritten if it exists).
     * @throws IOException if the file cannot be created or written.
     */
    private static void writeMapToFile(AbstractObject2LongFunction<CharSequence> object, FileSystem fs, Path path)
            throws IOException {
        final OutputStream rawStream = createOutputStream(fs, path);
        try {
            final ObjectOutputStream serializer = new ObjectOutputStream(rawStream);
            try {
                serializer.writeObject(object);
            } finally {
                serializer.close();
            }
        } finally {
            // Also covers the case where the ObjectOutputStream could not be constructed.
            rawStream.close();
        }
    }

    /**
     * Reads a hash function previously written with Java object serialization.
     *
     * NOTE(review): this uses native Java deserialization, so the file should only ever be one
     * produced by this tool.
     *
     * @param fs the file system to read from.
     * @param path the file holding the serialized hash function.
     * @return the deserialized hash function.
     * @throws IOException if the file cannot be opened or read.
     * @throws ClassNotFoundException if the class of the serialized object cannot be found.
     */
    @SuppressWarnings("unchecked")
    private static AbstractHashFunction<CharSequence> readMpHashFromFile(FileSystem fs, Path path)
            throws IOException, ClassNotFoundException {
        final FSDataInputStream rawStream = fs.open(path);
        try {
            final ObjectInputStream deserializer = new ObjectInputStream(rawStream);
            try {
                return (AbstractHashFunction<CharSequence>) deserializer.readObject();
            } finally {
                deserializer.close();
            }
        } finally {
            // Also covers the case where the ObjectInputStream could not be constructed.
            rawStream.close();
        }
    }

    /**
     * Command line entry point: runs the tool through Hadoop's ToolRunner and exits with the
     * status it returns.
     *
     * @param args the command line arguments.
     * @throws Exception if the tool fails.
     */
    public static void main(String[] args) throws Exception {
        final ComputeHashTool tool = new ComputeHashTool();
        System.exit(ToolRunner.run(tool, args));
    }

    /**
     * Presents a Reader as a Collection of MutableStrings with each line read
     * from the Reader as an element of the Collection.
     * {@link LineReaderCollection.ReaderFactory#newReader()} is called each time
     * {@link LineReaderCollection#iterator()} is called. Only the current line
     * and next line are held in memory.
     * 
     * Note that {@link LineReaderCollection.LineReaderIterator#next()} always
     * returns the same instance of MutableString(but with different contents)
     * for each instance of {@link LineReaderCollection.LineReaderIterator}.
     * 
     * @author tep
     * 
     */
    private static class LineReaderCollection extends AbstractCollection<MutableString> implements Size64 {
        private final ReaderFactory readerFactory;
        /** Cached number of lines; -1 until size64() has completed a full pass over the input. */
        private long size = -1;

        public LineReaderCollection(ReaderFactory readerFactory) {
            this.readerFactory = readerFactory;
        }

        /** Supplies a fresh Reader over the same input for every iteration. */
        public interface ReaderFactory {
            public Reader newReader();
        }

        private class LineReaderIterator implements Iterator<MutableString>, SafelyCloseable {
            /** The underlying reader; null once the iterator has been closed. */
            private FastBufferedReader fbr;
            /** The instance handed out by next(); its contents are overwritten on every call. */
            private MutableString current = new MutableString();
            /** Read-ahead buffer filled by hasNext(). */
            private MutableString next = new MutableString();
            /** True when the read-ahead buffer does not hold an undelivered line. */
            private boolean advance = true;

            public LineReaderIterator(Reader reader) {
                fbr = new FastBufferedReader(reader);
            }

            /**
             * Reads one line ahead if necessary. The reader is closed as soon as the end of the
             * input is reached.
             *
             * @throws RuntimeException wrapping any IOException raised while reading or closing.
             */
            @Override
            public boolean hasNext() {
                if (fbr == null) {
                    return false;
                }

                if (advance) {
                    try {
                        if (fbr.readLine(next) == null) {
                            close();
                            return false;
                        }
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                    advance = false;
                }
                return true;
            }

            /**
             * Returns the next line in the shared MutableString instance.
             *
             * @throws NoSuchElementException if there are no more lines.
             */
            @Override
            public MutableString next() {
                if (advance) {
                    if (!hasNext()) {
                        throw new NoSuchElementException("Size is " + size);
                    }
                }
                current.replace(next);
                advance = true;
                return current;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }

            @Override
            public void close() throws IOException {
                // This gets called multiple times..
                if (fbr != null) {
                    fbr.close();
                    fbr = null;
                    // 'advance' is deliberately left untouched: a line that was already read
                    // ahead can still be delivered, and once the input is exhausted next()
                    // throws NoSuchElementException instead of returning stale contents.
                }
            }
        }

        @Override
        public LineReaderIterator iterator() {
            return new LineReaderIterator(readerFactory.newReader());
        }

        /**
         * @throws IndexOutOfBoundsException if there are more than Integer.MAX_VALUE elements.
         */
        @Override
        public int size() {
            long size64 = size64();
            if (size64 > Integer.MAX_VALUE) {
                throw new IndexOutOfBoundsException(
                        "LineReaderCollection.size() called on a instance with more than Integer.MAX_VALUE elements.  Use Size64.size64() instead.");
            }
            return (int) size64;
        }

        /**
         * Returns the number of lines, counting them with a full pass over the input on the
         * first call and returning the cached value afterwards.
         *
         * @throws RuntimeException wrapping any IOException raised while reading or closing.
         */
        @Override
        public long size64() {
            if (size == -1L) {
                LineReaderIterator i = iterator();
                // Count into a local so a failure part way through cannot leave a partial
                // count cached in the field.
                long count = 0L;
                try {
                    while (i.hasNext()) {
                        count++;
                        i.next();
                    }
                } finally {
                    try {
                        i.close();
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
                size = count;
            }
            return size;
        }
    }
}