org.apache.gora.goraci.Verify.java Source code

Introduction

Here is the source code for org.apache.gora.goraci.Verify.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gora.goraci;

import org.apache.gora.goraci.generated.CINode;
import org.apache.gora.goraci.generated.Flushed;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.apache.avro.util.Utf8;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.query.Query;
import org.apache.gora.query.Result;
import org.apache.gora.store.DataStore;
import org.apache.gora.store.DataStoreFactory;
import org.apache.gora.util.GoraException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * A MapReduce job that verifies that the linked list generated by
 * {@link org.apache.gora.goraci.Generator} does not have any holes.
 *
 * <p>For every node read from the store the mapper emits one "definition" record keyed by the node's
 * own key and, when the node's {@code prev} is non-negative, one "reference" record keyed by
 * {@code prev}. The reducer then classifies each key as referenced, unreferenced or undefined. An
 * undefined key is one that some node points at but for which no definition record was seen.
 */
public class Verify extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(Verify.class);

    /** Marker value the mapper emits to say "this key is defined"; any other value is a referencing key. */
    private static final VLongWritable DEF = new VLongWritable(-1);

    /** Configuration property used to hand the {@code client:count} flushed entries to the mappers. */
    private static final String FLUSHED_KEY = "org.apache.gora.goraci.verify.flushed";

    /** Serialization class that must be present in {@code io.serializations} for this job. */
    private static final String JAVA_SERIALIZATION = "org.apache.hadoop.io.serializer.JavaSerialization";

    /** The most recently created job; {@code null} until {@link #start(Path, int, boolean)} has been called. */
    private Job job;

    /**
     * Emits a definition record for each node and a reference record for the node it points back to.
     * When flushed entries are present in the job configuration, nodes whose client is unknown or whose
     * count is not below the flushed count for that client are skipped and counted as
     * {@link Counts#IGNORED}.
     */
    public static class VerifyMapper extends GoraMapper<Long, CINode, LongWritable, VLongWritable> {
        // Writable instances are reused across map() calls to avoid per-record allocation.
        private final LongWritable row = new LongWritable();
        private final LongWritable ref = new LongWritable();
        private final VLongWritable vrow = new VLongWritable();

        /** Flushed count per client, or {@code null} when no flushed entries were configured (no filtering). */
        private Map<Utf8, Long> flushed = null;

        /**
         * Loads the optional {@code client:count} entries from the job configuration.
         *
         * @throws IllegalArgumentException if an entry has no {@code ':'} separator
         * @throws NumberFormatException if the count part of an entry is not a long
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);

            String[] entries = context.getConfiguration().getStrings(FLUSHED_KEY);

            if (entries != null && entries.length > 0) {
                flushed = new HashMap<Utf8, Long>();
                for (String entry : entries) {
                    // Split on the LAST ':' so that a client id which itself contains ':' still parses;
                    // the count is always the trailing component.
                    int sep = entry.lastIndexOf(':');
                    if (sep < 0) {
                        throw new IllegalArgumentException(
                                "Malformed flushed entry, expected <client>:<count> but saw '" + entry + "'");
                    }
                    flushed.put(new Utf8(entry.substring(0, sep)), Long.parseLong(entry.substring(sep + 1)));
                }
            }
        }

        /**
         * Writes {@code (key, DEF)} for the node and, if {@code prev >= 0}, {@code (prev, key)}.
         */
        @Override
        protected void map(Long key, CINode node, Context context) throws IOException, InterruptedException {
            if (flushed != null) {
                Long count = flushed.get(node.getClient());
                if (count == null || node.getCount() >= count) {
                    context.getCounter(Counts.IGNORED).increment(1);
                    return;
                }
            }

            row.set(key);
            context.write(row, DEF);

            if (node.getPrev() >= 0) {
                ref.set(node.getPrev());
                vrow.set(key);
                context.write(ref, vrow);
            }
        }
    }

    /** Job counters maintained by the mapper and reducer. */
    public static enum Counts {
        UNREFERENCED, UNDEFINED, REFERENCED, CORRUPT, IGNORED
    }

    /**
     * Classifies each key by whether a definition record and/or reference records were seen for it.
     * Keys that are referenced but not defined are written to the job output as
     * {@code <key in hex> <comma separated referencing keys in hex>}.
     */
    public static class VerifyReducer extends Reducer<LongWritable, VLongWritable, Text, Text> {
        // Reused between reduce() calls; cleared at the start of each call.
        private final ArrayList<Long> refs = new ArrayList<Long>();

        @Override
        public void reduce(LongWritable key, Iterable<VLongWritable> values, Context context)
                throws IOException, InterruptedException {

            int defCount = 0;

            refs.clear();
            for (VLongWritable type : values) {
                if (type.get() == DEF.get()) {
                    defCount++;
                } else {
                    refs.add(type.get());
                }
            }

            // TODO check for more than one def, should not happen

            if (defCount == 0 && refs.size() > 0) {
                // this is bad, found a node that is referenced but not defined. It must have been lost,
                // emit some info about this node for debugging purposes.

                StringBuilder sb = new StringBuilder();
                String comma = "";
                for (Long ref : refs) {
                    sb.append(comma);
                    comma = ",";
                    sb.append(String.format("%016x", ref));
                }

                context.write(new Text(String.format("%016x", key.get())), new Text(sb.toString()));
                context.getCounter(Counts.UNDEFINED).increment(1);

            } else if (defCount > 0 && refs.size() == 0) {
                // node is defined but not referenced
                context.getCounter(Counts.UNREFERENCED).increment(1);
            } else {
                // node is defined and referenced
                context.getCounter(Counts.REFERENCED).increment(1);
            }

        }
    }

    /**
     * Command line entry point: {@code [-c] <output dir> <num reducers>}.
     *
     * @param args command line arguments
     * @return {@code -1} if the arguments could not be parsed, otherwise {@code 0} when the job
     *         succeeded and {@code 1} when it failed
     */
    @Override
    public int run(String[] args) throws Exception {

        Options options = new Options();
        options.addOption("c", "concurrent", false, "run concurrently with generation");

        CommandLine cmd;
        String outputDir;
        int numReducers;
        try {
            cmd = new GnuParser().parse(options, args);
            String[] positional = cmd.getArgs();
            if (positional.length != 2) {
                throw new ParseException("Did not see expected # of arguments, saw " + positional.length);
            }
            outputDir = positional[0];
            try {
                numReducers = Integer.parseInt(positional[1]);
            } catch (NumberFormatException e) {
                // Route a bad reducer count through the same usage message as any other bad argument.
                throw new ParseException("<num reducers> must be an integer, saw " + positional[1]);
            }
        } catch (ParseException e) {
            System.err.println("Failed to parse command line " + e.getMessage());
            System.err.println();
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(getClass().getSimpleName() + " <output dir> <num reducers>", options);
            // Return the failure code instead of calling System.exit() so that embedding callers are
            // not terminated; main() turns the return value into the process exit status.
            return -1;
        }

        return run(outputDir, numReducers, cmd.hasOption("c"));
    }

    /**
     * Runs the verification job and waits for it to finish.
     *
     * @return {@code 0} if the job succeeded, {@code 1} otherwise
     */
    public int run(String outputDir, int numReducers, boolean concurrent) throws Exception {
        return run(new Path(outputDir), numReducers, concurrent);
    }

    /**
     * Runs the verification job and waits for it to finish.
     *
     * @return {@code 0} if the job succeeded, {@code 1} otherwise
     */
    public int run(Path outputDir, int numReducers, boolean concurrent) throws Exception {
        start(outputDir, numReducers, concurrent);

        boolean success = job.waitForCompletion(true);

        return success ? 0 : 1;
    }

    /**
     * Configures and submits the verification job without waiting for it to complete.
     *
     * @param outputDir directory the reducer output is written to
     * @param numReducers number of reduce tasks
     * @param concurrent when {@code true}, the flushed table is read and passed to the mappers so that
     *        they can skip nodes that are not covered by a flush; when {@code false} only the
     *        {@code prev} field is queried
     */
    public void start(Path outputDir, int numReducers, boolean concurrent)
            throws GoraException, IOException, Exception {
        LOG.info("Running Verify with outputDir=" + outputDir + ", numReducers=" + numReducers);

        // NOTE(review): the store is opened with a fresh Configuration rather than getConf();
        // confirm that this is intentional.
        DataStore<Long, CINode> store = DataStoreFactory.getDataStore(Long.class, CINode.class,
                new Configuration());
        try {
            job = new Job(getConf());
            Configuration jobConf = job.getConfiguration();

            String serializations = jobConf.get("io.serializations");
            if (!serializations.contains(JAVA_SERIALIZATION)) {
                jobConf.set("io.serializations", serializations + "," + JAVA_SERIALIZATION);
            }

            job.setJobName("Link Verifier");
            job.setNumReduceTasks(numReducers);
            job.setJarByClass(getClass());

            Query<Long, CINode> query = store.newQuery();
            if (!concurrent) {
                // no concurrency filtering, only need prev field
                query.setFields("prev");
            } else {
                readFlushed(jobConf);
            }

            GoraMapper.initMapperJob(job, query, store, LongWritable.class, VLongWritable.class,
                    VerifyMapper.class, true);

            jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);

            job.setReducerClass(VerifyReducer.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            TextOutputFormat.setOutputPath(job, outputDir);
        } finally {
            // Release the store even when job configuration fails part-way through.
            store.close();
        }

        job.submit();
    }

    /**
     * @return whether the submitted job has finished
     * @throws IllegalStateException if no job has been started yet
     */
    public boolean isComplete() throws IOException {
        return submittedJob().isComplete();
    }

    /**
     * @return whether the submitted job finished successfully
     * @throws IllegalStateException if no job has been started yet
     */
    public boolean isSuccessful() throws IOException {
        return submittedJob().isSuccessful();
    }

    /**
     * Blocks until the submitted job finishes.
     *
     * @return whether the job succeeded
     * @throws IllegalStateException if no job has been started yet
     */
    public boolean waitForCompletion() throws IOException, InterruptedException, ClassNotFoundException {
        return submittedJob().waitForCompletion(true);
    }

    /** Returns the current job, failing with a descriptive error when none has been created. */
    private Job submittedJob() {
        if (job == null) {
            throw new IllegalStateException("You should call run() first");
        }
        return job;
    }

    /**
     * Reads every row of the flushed table and stores it in {@code conf} as {@code client:count}
     * strings for {@link VerifyMapper} to pick up.
     */
    private void readFlushed(Configuration conf) throws Exception {
        DataStore<Utf8, Flushed> flushedTable = DataStoreFactory.getDataStore(Utf8.class, Flushed.class, conf);
        try {
            Query<Utf8, Flushed> query = flushedTable.newQuery();
            Result<Utf8, Flushed> result = flushedTable.execute(query);

            ArrayList<String> flushedEntries = new ArrayList<String>();
            try {
                while (result.next()) {
                    flushedEntries.add(result.getKey() + ":" + result.get().getCount());
                }
            } finally {
                result.close();
            }

            conf.setStrings(FLUSHED_KEY, flushedEntries.toArray(new String[flushedEntries.size()]));
        } finally {
            flushedTable.close();
        }
    }

    /**
     * Checks the counters of the job against the expected outcome and logs every mismatch.
     *
     * @param expectedReferenced the value the {@link Counts#REFERENCED} counter must have
     * @return {@code true} only if the referenced count matches and both the unreferenced and
     *         undefined counts are zero
     * @throws IllegalStateException if no job has been started yet
     */
    public boolean verify(long expectedReferenced) throws Exception {
        Counters counters = submittedJob().getCounters();

        Counter referenced = counters.findCounter(Counts.REFERENCED);
        Counter unreferenced = counters.findCounter(Counts.UNREFERENCED);
        Counter undefined = counters.findCounter(Counts.UNDEFINED);

        boolean success = true;
        //assert
        if (expectedReferenced != referenced.getValue()) {
            LOG.error("Expected referenced count does not match with actual referenced count. "
                    + "expected referenced=" + expectedReferenced + " ,actual=" + referenced.getValue());
            success = false;
        }

        if (unreferenced.getValue() > 0) {
            LOG.error("Unreferenced nodes were not expected. Unreferenced count=" + unreferenced.getValue());
            success = false;
        }

        if (undefined.getValue() > 0) {
            LOG.error("Found an undefined node. Undefined count=" + undefined.getValue());
            success = false;
        }

        return success;
    }

    /** Runs the tool through {@link ToolRunner} and exits with its return code. */
    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new Verify(), args);
        System.exit(ret);
    }
}