org.sf.xrime.algorithms.layout.radialtree.Raw2OutLabeledAdjVertexTransformer.java Source code

Introduction

Here is the source code for org.sf.xrime.algorithms.layout.radialtree.Raw2OutLabeledAdjVertexTransformer.java
Source

/*
 * Copyright (C) liuchangyan@BUPT. 2009.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.sf.xrime.algorithms.layout.radialtree;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;

import org.sf.xrime.ProcessorExecutionException;
import org.sf.xrime.Transformer;
import org.sf.xrime.model.edge.Edge;
import org.sf.xrime.model.vertex.LabeledAdjVertex;

/**
 * Calculate outgoing adjacent vertexes lists for authors from newsmth.net based on article
 * Re: relationships and add the a label using in  RadialTree Layout Algorithm. The incoming 
 * data is line-oriented, in text format, the output data serialized according to Writable 
 * interface.
 * 
 * We choose to generate these lists from raw data instead of incoming adjacent vertexes
 * list.
 * @author liu chang yan
 */
public class Raw2OutLabeledAdjVertexTransformer extends Transformer {
    /**
     * Default constructor.
     */
    public Raw2OutLabeledAdjVertexTransformer() {
        super();
    }

    /**
     * Normal constructor.
     * @param src
     * @param dest
     */
    public Raw2OutLabeledAdjVertexTransformer(Path src, Path dest) {
        super(src, dest);
    }

    /**
     * For each input line, emit a list of AdjVertexes, i.e., a Re:er and its outgoing adjacent
     * vertex (the original author). This list will have redundant entries removed.
     */
    public static class MapClass extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, LabeledAdjVertex> {

        private Text friend = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, LabeledAdjVertex> output,
                Reporter reporter) throws IOException {
            // Get the input line.
            String line = value.toString().trim();

            // Get re-ers.
            int reer_startindex = line.lastIndexOf(" ");
            if (reer_startindex < 0 || line.charAt(reer_startindex + 1) != '+')
                return; // Input line is error.
            String reer_line = line.substring(reer_startindex + 2);

            // Get the original author name.
            String substr_before_reer = line.substring(0, reer_startindex).trim();
            int orig_startindex = substr_before_reer.lastIndexOf(" ");
            if (orig_startindex < 0)
                return; // Input line is error.
            String author_name = substr_before_reer.substring(orig_startindex + 1).trim();

            // Tokenize the re-ers' part.
            StringTokenizer itr = new StringTokenizer(reer_line, "+");
            HashSet<String> friend_set = new HashSet<String>();
            // The Re: authors, i.e., the friends of original author.
            while (itr.hasMoreTokens()) {
                String friend_name = itr.nextToken();
                friend_set.add(friend_name);
            }

            // Remove the author from the friend list.
            friend_set.remove(author_name);

            // Add those friends as a list of AdjVertexes.
            Iterator<String> iterator;
            for (iterator = friend_set.iterator(); iterator.hasNext();) {
                String friend_name = iterator.next();
                friend.set(friend_name);
                // Create an instance of AdjVertex.
                LabeledAdjVertex adj = new LabeledAdjVertex();
                adj.setId(friend_name);
                Edge edge = new Edge();
                edge.setFrom(friend_name);
                edge.setTo(author_name);
                adj.addEdge(edge);
                RadialTreeLabel label = (RadialTreeLabel) adj.getLabel(RadialTreeLabel.RadialTreeLabelPathsKey);
                label = new RadialTreeLabel();
                adj.setLabel(RadialTreeLabel.RadialTreeLabelPathsKey, label);
                output.collect(friend, adj);
            }

            // Considering those who only publish articles, not re articles (including those who also
            // never got re'ed).
            LabeledAdjVertex adj = new LabeledAdjVertex();
            adj.setId(author_name);
            output.collect(new Text(author_name), adj);
        }
    }

    /**
     * Merge the sublists of outgoing adjacent vertexes of a Re:er into one.
     */
    public static class ReduceClass extends MapReduceBase
            implements Reducer<Text, LabeledAdjVertex, Text, LabeledAdjVertex> {

        public void reduce(Text key, Iterator<LabeledAdjVertex> values,
                OutputCollector<Text, LabeledAdjVertex> output, Reporter reporter) throws IOException {
            // The set used to accommodate the original authors.
            HashSet<String> orig_author_set = new HashSet<String>();

            // Each value represents a list of original authors. Accumulate them into the set.
            while (values.hasNext()) {
                LabeledAdjVertex authors = values.next();
                List<Edge> edges = authors.getEdges();
                if (edges != null) {
                    for (Iterator<Edge> iterator = edges.iterator(); iterator.hasNext();) {
                        // Get the original author from the edge and put it into the set.
                        orig_author_set.add(iterator.next().getTo());
                    }
                }
            }

            // Create the AdjVertex instance.
            LabeledAdjVertex adj = new LabeledAdjVertex();
            String friend_name = key.toString();
            // Set the vertex.
            adj.setId(friend_name);

            // Add original authors.
            Iterator<String> iterator;
            for (iterator = orig_author_set.iterator(); iterator.hasNext();) {
                String author_name = iterator.next();
                Edge edge = new Edge();
                edge.setFrom(friend_name);
                edge.setTo(author_name);
                adj.addEdge(edge);
            }
            // Add RadialTree Label
            RadialTreeLabel label = (RadialTreeLabel) adj.getLabel(RadialTreeLabel.RadialTreeLabelPathsKey);
            label = new RadialTreeLabel();
            adj.setLabel(RadialTreeLabel.RadialTreeLabelPathsKey, label);
            output.collect(key, adj);
        }
    }

    @Override
    public void execute() throws ProcessorExecutionException {
        JobConf jobConf = new JobConf(conf, Raw2OutLabeledAdjVertexTransformer.class);
        jobConf.setJobName("Smth - OutLabeledAdjVertexesList");

        jobConf.setMapperClass(MapClass.class);
        jobConf.setCombinerClass(ReduceClass.class);
        jobConf.setReducerClass(ReduceClass.class);

        // the keys are author names (strings)
        jobConf.setOutputKeyClass(Text.class);
        // the values are adjacent vertexes (Writable)
        jobConf.setOutputValueClass(LabeledAdjVertex.class);

        // makes the file format suitable for machine processing.
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(jobConf, srcPath);
        FileOutputFormat.setOutputPath(jobConf, destPath);

        jobConf.setNumMapTasks(mapperNum);
        jobConf.setNumReduceTasks(reducerNum);

        try {
            this.runningJob = JobClient.runJob(jobConf);
        } catch (IOException e) {
            throw new ProcessorExecutionException(e);
        }
    }

    public static void main(String[] args) {
        try {
            int res = ToolRunner.run(new Raw2OutLabeledAdjVertexTransformer(), args);
            System.exit(res);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}