org.apache.mahout.graph.AdjacencyMatrixJob.java Source code

Introduction

Here is the source code for org.apache.mahout.graph.AdjacencyMatrixJob.java.
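
The class is a Hadoop MapReduce job from Apache Mahout that computes the adjacency matrix of a graph. It takes a text file listing all vertices (one numeric ID per line) and a directory of edge files ("vertexA,vertexB" per line, tab- or comma-separated), assigns each vertex a contiguous matrix index, and writes the matrix rows as SequenceFiles mapping IntWritable row indices to sparse VectorWritable row vectors. A usage sketch follows the listing below.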

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.graph;

import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.map.OpenIntIntHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * <p>Distributed computation of the adjacency matrix of a graph; see http://en.wikipedia.org/wiki/Adjacency_matrix</p>
 *
 * <p>This job outputs {@link org.apache.hadoop.io.SequenceFile}s with an {@link IntWritable} as key and a {@link VectorWritable} as value.</p>
 *
 * <p>Command line arguments specific to this class are:</p>
 *
 * <ol>
 *   <li>--output=(path): output path where the resulting matrix and the number of vertices should be written</li>
 *   <li>--vertices=(path): file containing a list of all vertices</li>
 *   <li>--edges=(path): directory containing the edges of the graph</li>
 *   <li>--symmetric=(boolean): produce a symmetric adjacency matrix (corresponds to an undirected graph)</li>
 * </ol>
 *
 * <p>General command line options are documented in {@link AbstractJob}.</p>
 *
 * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.</p>
 */
public class AdjacencyMatrixJob extends AbstractJob {

    private static final Logger log = LoggerFactory.getLogger(AdjacencyMatrixJob.class);

    public static final String NUM_VERTICES = "numVertices.bin";
    public static final String ADJACENCY_MATRIX = "adjacencyMatrix";
    public static final String VERTEX_INDEX = "vertexIndex";

    static final String NUM_VERTICES_PARAM = AdjacencyMatrixJob.class.getName() + ".numVertices";
    static final String VERTEX_INDEX_PARAM = AdjacencyMatrixJob.class.getName() + ".vertexIndex";
    static final String SYMMETRIC_PARAM = AdjacencyMatrixJob.class.getName() + ".symmetric";

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new AdjacencyMatrixJob(), args);
    }

    @Override
    public int run(String[] args) throws Exception {

        addOption("vertices", null, "a text file containing all vertices of the graph (one per line)", true);
        addOption("edges", null, "text files containing the edges of the graph (vertexA,vertexB per line)", true);
        addOption("symmetric", null, "produce a symmetric adjacency matrix (corresponds to an undirected graph)",
                String.valueOf(false));

        addOutputOption();

        Map<String, List<String>> parsedArgs = parseArguments(args);
        if (parsedArgs == null) {
            return -1;
        }

        Path vertices = new Path(getOption("vertices"));
        Path edges = new Path(getOption("edges"));
        boolean symmetric = Boolean.parseBoolean(getOption("symmetric"));

        log.info("Indexing vertices sequentially, this might take a while...");
        int numVertices = indexVertices(vertices, getOutputPath(VERTEX_INDEX));

        Preconditions.checkArgument(numVertices > 0, "Found no vertices to index");
        HadoopUtil.writeInt(numVertices, getOutputPath(NUM_VERTICES), getConf());

        log.info("Found " + numVertices + " vertices, creating adjacency matrix...");
        Job createAdjacencyMatrix = prepareJob(edges, getOutputPath(ADJACENCY_MATRIX), TextInputFormat.class,
                VectorizeEdgesMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
                IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
        createAdjacencyMatrix.setCombinerClass(VectorSumReducer.class);
        Configuration createAdjacencyMatrixConf = createAdjacencyMatrix.getConfiguration();
        createAdjacencyMatrixConf.set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
        createAdjacencyMatrixConf.set(VERTEX_INDEX_PARAM, getOutputPath(VERTEX_INDEX).toString());
        createAdjacencyMatrixConf.setBoolean(SYMMETRIC_PARAM, symmetric);

        boolean succeeded = createAdjacencyMatrix.waitForCompletion(true);

        return succeeded ? 0 : -1;
    }

    //TODO do this in parallel?
    /**
     * Sequentially assigns a contiguous index [0, numVertices) to each vertex ID and writes
     * (index, vertexID) pairs to a {@link SequenceFile} at {@code indexPath}.
     */
    private int indexVertices(Path verticesPath, Path indexPath) throws IOException {
        FileSystem fs = FileSystem.get(verticesPath.toUri(), getConf());
        SequenceFile.Writer writer = null;
        int index = 0;

        try {
            writer = SequenceFile.createWriter(fs, getConf(), indexPath, IntWritable.class, IntWritable.class);

            for (FileStatus fileStatus : fs.listStatus(verticesPath)) {
                InputStream in = null;
                try {
                    in = HadoopUtil.openStream(fileStatus.getPath(), getConf());
                    for (String line : new FileLineIterable(in)) {
                        // one numeric vertex ID per line; assign it the next free index
                        writer.append(new IntWritable(index++), new IntWritable(Integer.parseInt(line)));
                    }
                } finally {
                    Closeables.closeQuietly(in);
                }
            }
        } finally {
            Closeables.closeQuietly(writer);
        }

        return index;
    }

    /**
     * Emits one sparse single-entry row fragment per edge; {@link VectorSumReducer} (used as both
     * combiner and reducer) sums the fragments into complete matrix rows.
     */
    static class VectorizeEdgesMapper extends Mapper<LongWritable, Text, IntWritable, VectorWritable> {

        private int numVertices;
        private OpenIntIntHashMap vertexIDsToIndex;
        private boolean symmetric;

        private final IntWritable row = new IntWritable();

        private static final Pattern SEPARATOR = Pattern.compile("[\t,]");

        @Override
        protected void setup(Context ctx) throws IOException, InterruptedException {
            Configuration conf = ctx.getConfiguration();
            numVertices = Integer.parseInt(conf.get(NUM_VERTICES_PARAM));
            symmetric = conf.getBoolean(SYMMETRIC_PARAM, false);
            Path vertexIndexPath = new Path(conf.get(VERTEX_INDEX_PARAM));
            vertexIDsToIndex = new OpenIntIntHashMap(numVertices);
            // load the (index, vertexID) pairs written by indexVertices() and invert them
            // into a vertexID -> matrix index lookup
            for (Pair<IntWritable, IntWritable> indexAndVertexID : new SequenceFileIterable<IntWritable, IntWritable>(
                    vertexIndexPath, true, conf)) {
                vertexIDsToIndex.put(indexAndVertexID.getSecond().get(), indexAndVertexID.getFirst().get());
            }
        }

        @Override
        protected void map(LongWritable offset, Text line, Context ctx)
                throws IOException, InterruptedException {

            String[] tokens = SEPARATOR.split(line.toString());
            int rowIndex = vertexIDsToIndex.get(Integer.parseInt(tokens[0]));
            int columnIndex = vertexIDsToIndex.get(Integer.parseInt(tokens[1]));

            // emit a single-entry row fragment for this edge
            Vector partialMatrixRow = new SequentialAccessSparseVector(numVertices, 1);
            row.set(rowIndex);
            partialMatrixRow.setQuick(columnIndex, 1);
            ctx.write(row, new VectorWritable(partialMatrixRow));

            // for a symmetric (undirected) matrix, also emit the mirrored entry,
            // taking care not to emit self-loops twice
            if (symmetric && rowIndex != columnIndex) {
                partialMatrixRow = new SequentialAccessSparseVector(numVertices, 1);
                row.set(columnIndex);
                partialMatrixRow.setQuick(rowIndex, 1);
                ctx.write(row, new VectorWritable(partialMatrixRow));
            }
        }
    }

}
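
Usage example

Here is a minimal sketch of driving the job programmatically and reading the resulting matrix rows back. The input and output paths and the part-r-00000 file name are illustrative assumptions, not part of the class above; the ToolRunner invocation mirrors what main() does, and the SequenceFileIterable usage mirrors how the mapper's setup() reads the vertex index.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.graph.AdjacencyMatrixJob;
import org.apache.mahout.math.VectorWritable;

public class AdjacencyMatrixExample {

    public static void main(String[] args) throws Exception {
        // run the job; all paths here are placeholders for this sketch
        ToolRunner.run(new AdjacencyMatrixJob(), new String[] {
                "--vertices", "/graph/vertices.txt", // one numeric vertex ID per line
                "--edges", "/graph/edges",           // text files with "vertexA,vertexB" lines
                "--output", "/graph/out",
                "--symmetric", "true"                // undirected graph
        });

        // the matrix rows land under <output>/adjacencyMatrix; we read a single part file
        // here, whose name (part-r-00000) is an assumption about the reducer output
        Configuration conf = new Configuration();
        Path matrixPart = new Path("/graph/out/" + AdjacencyMatrixJob.ADJACENCY_MATRIX + "/part-r-00000");
        for (Pair<IntWritable, VectorWritable> rowAndVector
                : new SequenceFileIterable<IntWritable, VectorWritable>(matrixPart, true, conf)) {
            System.out.println("row " + rowAndVector.getFirst().get() + ": " + rowAndVector.getSecond().get());
        }
    }
}

Note that because VectorSumReducer sums the per-edge row fragments, duplicate edges in the input are not deduplicated: each occurrence of an edge adds 1 to the corresponding matrix entry.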