hack.VectorScan.java Source code

Java tutorial

Introduction

Here is the source code for hack.VectorScan.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package hack;

import lsh.mahout.core.Hash;
import lsh.mahout.core.SimplexSpace;
import lsh.mahout.core.SparseHash;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.util.HelpFormatter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class VectorScan {
    SimplexSpace<String> space = null;
    static int DIMS = 45000;

    private static final Logger log = LoggerFactory.getLogger(VectorScan.class);

    private VectorScan() {
    }

    public static void main(String[] args) throws Exception {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        Option seqOpt = obuilder.withLongName("seqFile").withRequired(false)
                .withArgument(abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create())
                .withDescription("The Sequence File containing the Clusters").withShortName("s").create();
        Option outputOpt = obuilder.withLongName("output").withRequired(false)
                .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
                .withDescription("The output file.  If not specified, dumps to the console").withShortName("o")
                .create();
        Option substringOpt = obuilder.withLongName("substring").withRequired(false)
                .withArgument(abuilder.withName("substring").withMinimum(1).withMaximum(1).create())
                .withDescription("The number of chars of the asFormatString() to print").withShortName("b")
                .create();
        Option countOpt = obuilder.withLongName("count").withRequired(false)
                .withDescription("Report the count only").withShortName("c").create();
        Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
                .create();

        Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(substringOpt)
                .withOption(countOpt).withOption(helpOpt).create();

        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            CommandLine cmdLine = parser.parse(args);

            if (cmdLine.hasOption(helpOpt)) {

                printHelp(group);
                return;
            }

            boolean doCount = false;
            if (cmdLine.hasOption(countOpt))
                doCount = true;

            if (cmdLine.hasOption(seqOpt)) {
                Path path = new Path(cmdLine.getValue(seqOpt).toString());
                Configuration conf = new Configuration();
                FileSystem fs = FileSystem.get(path.toUri(), conf);
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
                int start = 4;
                int end = 5;
                int samples = 1000;
                SimplexSpace<String>[] spaces = makeSpaces(start, end, doCount);
                try {
                    int sub = Integer.MAX_VALUE;
                    if (cmdLine.hasOption(substringOpt)) {
                        sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
                    }
                    Text key = (Text) reader.getKeyClass().asSubclass(Writable.class).newInstance();
                    VectorWritable value = (VectorWritable) reader.getValueClass().asSubclass(Writable.class)
                            .newInstance();
                    int count = 0;
                    while (reader.next(key, value)) {
                        String text = key.toString();
                        Vector v = value.get();
                        //            int size = v.size();
                        //            int density = v.getNumNondefaultElements();
                        //            System.out.println(size + "," + density);
                        addSpaces(spaces, start, text, v);
                        count++;
                        if (count % 1000 == 0)
                            System.out.print(".");
                        if (count == samples)
                            break;
                    }
                    if (doCount)
                        printSpacesCounts(spaces, start, samples);
                    else
                        printSpacesFull(spaces, start, samples);
                } finally {
                }
            }

        } catch (OptionException e) {
            log.error("Exception", e);
            printHelp(group);
        }

    }

    private static void printSpacesCounts(SimplexSpace<String>[] spaces, int start, int samples) {
        //print differences between vectors and counts, if vectors are stored.
        for (int i = start; i < spaces.length; i++) {
            System.out.println("#" + i);
            System.out.println("non-single: " + spaces[i].getNonSingleHashes(true));
            System.out.println("max:        " + spaces[i].getMaxHashes(true));
            System.out.println("count:      " + spaces[i].getCount(true));
            System.out
                    .println("range:      (" + spaces[i].getMinHash(true) + "," + spaces[i].getMaxHash(true) + ")");
        }
    }

    private static void printSpacesFull(SimplexSpace<String>[] spaces, int start, int samples) {
        //print differences between vectors and counts, if vectors are stored.
        for (int i = start; i < spaces.length; i++) {
            System.out.println("#" + i);
            System.out.println("non-single: " + spaces[i].getNonSingleHashes(false));
            System.out.println("max:        " + spaces[i].getMaxHashes(false));
            System.out.println("count:      " + spaces[i].getCount(false));
            System.out.println(
                    "range:      (" + spaces[i].getMinHash(false) + "," + spaces[i].getMaxHash(false) + ")");
            System.out.println("dups:       " + (samples - spaces[i].getCount(false)));
        }
    }

    private static void addSpaces(SimplexSpace<String>[] spaces, int start, String key, Vector v) {
        lsh.mahout.core.SparseHash sh = (SparseHash) spaces[start].getHashLOD(v);
        for (int lod = start; lod < spaces.length; lod++) {
            Hash spot = new SparseHash(sh, lod);
            if (null != spaces[lod]) {
                spaces[lod].addHash(v, spot, null);
                // set key as payload
            }
        }
    }

    private static SimplexSpace<String>[] makeSpaces(int start, int n, boolean doCount) {
        //    Hasher hasher = new OrthonormalHasher(DIMS, 0.001d);
        lsh.mahout.core.Hasher hasher = new lsh.mahout.core.VertexTransitiveHasher(DIMS, 0.001d);
        DistanceMeasure measure = new EuclideanDistanceMeasure();
        SimplexSpace<String>[] spaces = new SimplexSpace[n];
        for (int i = start; i < n; i++) {
            SimplexSpace<String> space = new SimplexSpace<String>(hasher, DIMS, measure, false, doCount);
            spaces[i] = space;
            space.setLOD(i);
        }
        return spaces;
    }

    private static void printHelp(Group group) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.setGroup(group);
        formatter.print();
    }
}