edu.berkeley.chukwa_xtrace.XtrExtract.java Source code

Introduction

Here is the source code for edu.berkeley.chukwa_xtrace.XtrExtract.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.berkeley.chukwa_xtrace;

import org.apache.hadoop.chukwa.ChunkImpl;
import org.apache.hadoop.chukwa.ChukwaArchiveKey;
import org.apache.hadoop.chukwa.extraction.demux.processor.mapper.AbstractProcessor;
import org.apache.hadoop.chukwa.extraction.engine.ChukwaRecord;
import org.apache.hadoop.chukwa.extraction.engine.ChukwaRecordKey;
import org.apache.hadoop.chukwa.extraction.engine.Record;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;

import edu.berkeley.xtrace.reporting.Report;
import edu.berkeley.xtrace.*;

/**
 * MapReduce job to process X-Trace reports coming out of Chukwa demux.
 * 
 * The map phase unwraps the Chukwa records; the reduce phase does trace
 * reconstruction. The report's task ID is used as the reduce sort key.
 */
public class XtrExtract extends Configured implements Tool {

    /**
     * The Hadoop docs recommend subclassing ArrayWritable with a no-arg
     * constructor that fixes the element class, so the framework can
     * instantiate the value type when deserializing reducer output.
     */
    public static class TextArrayWritable extends ArrayWritable {
        public TextArrayWritable() {
            super(Text.class);
        }
    }

    public static final String OUTLINK_FIELD = "__xtr_outlinks";
    static Logger log = Logger.getLogger(XtrExtract.class);

    /**
     * With more than 50,000 reports in a single trace, switch to an
     * on-disk sort instead of the in-memory topological sort.
     */
    static final int MAX_IN_MEMORY_REPORTS = 50 * 1000;

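    /**
     * Map phase: parse the X-Trace report out of each chunk or record and
     * emit the raw report text, keyed by the report's task ID.
     */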
    public static class MapClass extends Mapper<Object, Object, BytesWritable, Text> {

        public MapClass() {
            System.out.println("starting xtrace map");
        }

        @Override
        protected void map(Object k, Object v, Mapper<Object, Object, BytesWritable, Text>.Context context)
                throws IOException, InterruptedException {
            Counter unparseableReport = context.getCounter("app", "unparseable chunks");

            Text t;
            BytesWritable bw;

            if (k instanceof ChukwaArchiveKey && v instanceof ChunkImpl) {
                ChunkImpl value = (ChunkImpl) v;

                try { //parse inside the try, to handle the case where not all input is x-trace
                    Report xtrReport = Report.createFromString(new String(value.getData()));
                    bw = new BytesWritable(xtrReport.getMetadata().getTaskId().get());
                } catch (Exception e) {
                    unparseableReport.increment(1);
                    return;
                }

                //FIXME: can probably optimize the above lines by doing a search in the raw bytes
                t = new Text(value.getData());
            } else if (k instanceof ChukwaRecordKey && v instanceof ChukwaRecord) {
                ChukwaRecord value = (ChukwaRecord) v;
                try { //likewise guard against records that aren't x-trace reports
                    Report xtrReport = Report.createFromString(value.getValue(Record.bodyField));
                    bw = new BytesWritable(xtrReport.getMetadata().getTaskId().get());
                } catch (Exception e) {
                    unparseableReport.increment(1);
                    return;
                }
                //FIXME: can probably optimize the above lines by doing a search in the raw bytes
                t = new Text(value.getValue(Record.bodyField));
            } else {
                log.error("unexpected key/value types: " + k.getClass().getCanonicalName() + " and "
                        + v.getClass().getCanonicalName());
                return;
            }
            context.write(bw, t);
        }
    }

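    /**
     * Reduce phase: gather all reports for one task ID, topologically sort
     * them with CausalGraph, and emit the sorted reports as a single
     * TextArrayWritable.
     */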
    public static class Reduce extends Reducer<BytesWritable, Text, BytesWritable, ArrayWritable> {

        public Reduce() {
        }

        /**
         * Note that loading everything into hashtables means we implicitly
         * suppress duplicate-but-identical reports.
         */
        @Override
        protected void reduce(BytesWritable taskID, Iterable<Text> values,
                Reducer<BytesWritable, Text, BytesWritable, ArrayWritable>.Context context)
                throws IOException, InterruptedException {
            String taskIDString = IoUtil.bytesToString(taskID.getBytes());
            //in both cases, the key is the OpId string
            Map<String, Report> reports = new LinkedHashMap<String, Report>();

            Counter reportCounter = context.getCounter("app", "distinct reports");
            Counter edgeCounter = context.getCounter("app", "edges");
            Counter badEdgeCounter = context.getCounter("app", "reference to missing report");
            Counter dupCounter = context.getCounter("app", "duplicate report");

            int dups = 0, numReports = 0;

            for (Text rep_text : values) {
                Report r = Report.createFromString(rep_text.toString());
                numReports++;

                if (numReports >= MAX_IN_MEMORY_REPORTS) {
                    //too many reports to sort in memory; an external sort
                    //would be needed here, but isn't implemented yet, so
                    //give up on this trace.
                    return;
                }
                if (reports.containsKey(r.getMetadata().getOpIdString()))
                    dups++;
                reports.put(r.getMetadata().getOpIdString(), r);
            }

            reportCounter.increment(reports.size());
            dupCounter.increment(dups);
            CausalGraph g = new CausalGraph(reports);

            PtrReverse reverser = new PtrReverse();
            List<Report> sortedReports = g.topoSort(reverser);
            int sortedLen = sortedReports.size();
            if (sortedLen != reports.size()) {
                if (sortedLen > 0)
                    log.warn(taskIDString + ": I only sorted " + sortedLen + " items, but expected "
                            + reports.size() + "; is the graph cyclic?");
                else
                    log.warn(taskIDString + ": every event in graph has a predecessor; perhaps "
                            + "the start event isn't in the input set?");
            }
            log.debug(taskIDString + ": " + reverser.edgeCount + " total edges");
            edgeCounter.increment(reverser.edgeCount);
            badEdgeCounter.increment(reverser.badCount);

            Text[] finalOutput = new Text[sortedReports.size()];
            int i = 0;
            for (Report r : sortedReports)
                finalOutput[i++] = new Text(r.toString());

            TextArrayWritable out = new TextArrayWritable();
            out.set(finalOutput);
            context.write(taskID, out);
        } //end reduce

    }//end reduce class

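    /**
     * Callback handed to CausalGraph.topoSort(): for each report it follows
     * the inbound "Edge" fields, records the matching forward pointer
     * (OUTLINK_FIELD) on each parent, and tallies total and dangling edges.
     */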
    public static class PtrReverse {
        int badCount = 0;
        int edgeCount = 0;

        public int setupForwardPointers(Map<String, Report> reports, Report r, String myOpID) {
            int parentCount = 0;
            int localBadCount = 0;
            for (String inLink : r.get("Edge")) {
                //sanitize data from the old, nonconformant C++ implementation
                if (inLink.contains(","))
                    inLink = inLink.substring(0, inLink.indexOf(','));

                Report parent = reports.get(inLink);
                if (parent != null) {
                    parent.put(OUTLINK_FIELD, myOpID);
                    parentCount++;
                } else { //no match
                    if (!inLink.equals("0000000000000000")) {
                        log.info("no sign of parent: " + inLink);
                        localBadCount++;
                    }
                    //else quietly suppress the all-zero (null) parent ID
                }
            }
            //count only this report's own edges; adding the cumulative
            //badCount field here would double-count bad edges across calls
            badCount += localBadCount;
            edgeCount += localBadCount + parentCount;
            return parentCount;
        }
    }

    @Override
    public int run(String[] arg) throws Exception {
        Job extractor = new Job(getConf());

        extractor.setMapperClass(MapClass.class);

        extractor.setReducerClass(Reduce.class);
        extractor.setJobName("x-trace reconstructor");
        extractor.setJarByClass(this.getClass());

        extractor.setMapOutputKeyClass(BytesWritable.class);
        extractor.setMapOutputValueClass(Text.class);

        extractor.setOutputKeyClass(BytesWritable.class);
        extractor.setOutputValueClass(TextArrayWritable.class);

        extractor.setInputFormatClass(SequenceFileInputFormat.class);
        extractor.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
        FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));
        System.out.println("looks OK.  Submitting.");
        extractor.submit();
        //    extractor.waitForCompletion(false);
        return 0;

    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new XtrExtract(), args);
        System.exit(res);
    }

}
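
Usage

The job takes two arguments: an input directory of demux sequence files and an output directory. Its output is a SequenceFile keyed by task ID, where each value is a TextArrayWritable holding one trace's reports in causal order. Below is a minimal sketch of reading that output back; the class name, the argument handling, and the assumption that it lives in the edu.berkeley.chukwa_xtrace package are illustrative, not part of the original source.

package edu.berkeley.chukwa_xtrace;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;

public class ReadXtrOutput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //args[0] names one output file of the job, e.g. .../part-r-00000
        Path part = new Path(args[0]);
        SequenceFile.Reader reader =
                new SequenceFile.Reader(FileSystem.get(conf), part, conf);
        BytesWritable key = new BytesWritable();
        XtrExtract.TextArrayWritable value = new XtrExtract.TextArrayWritable();
        while (reader.next(key, value)) {
            //each value holds one trace's reports, already topologically sorted
            for (String report : value.toStrings())
                System.out.println(report);
        }
        reader.close();
    }
}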