cmd.tdbloader4.java Source code

Java tutorial

Introduction

Here is the source code for cmd.tdbloader4.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd;

import static org.apache.jena.tdbloader4.Constants.OUTPUT_PATH_POSTFIX_1;
import static org.apache.jena.tdbloader4.Constants.OUTPUT_PATH_POSTFIX_2;
import static org.apache.jena.tdbloader4.Constants.OUTPUT_PATH_POSTFIX_3;
import static org.apache.jena.tdbloader4.Constants.OUTPUT_PATH_POSTFIX_4;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.jena.tdbloader4.Constants;
import org.apache.jena.tdbloader4.FirstDriver;
import org.apache.jena.tdbloader4.FourthDriver;
import org.apache.jena.tdbloader4.SecondDriver;
import org.apache.jena.tdbloader4.ThirdDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hp.hpl.jena.graph.Graph;
import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.sparql.core.DatasetGraph;
import com.hp.hpl.jena.tdb.TDBFactory;
import com.hp.hpl.jena.tdb.TDBLoader;
import com.hp.hpl.jena.tdb.base.file.Location;
import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
import com.hp.hpl.jena.tdb.sys.SetupTDB;
import com.hp.hpl.jena.tdb.transaction.DatasetGraphTransaction;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;

public class tdbloader4 extends Configured implements Tool {

    private static final Logger log = LoggerFactory.getLogger(tdbloader4.class);

    public tdbloader4() {
        super();
        log.debug("constructed with no configuration.");
    }

    public tdbloader4(Configuration configuration) {
        super(configuration);
        log.debug("constructed with configuration.");
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        Configuration configuration = getConf();
        configuration.set(Constants.RUN_ID, String.valueOf(System.currentTimeMillis()));
        boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
                Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
        boolean copyToLocal = configuration.getBoolean(Constants.OPTION_COPY_TO_LOCAL,
                Constants.OPTION_COPY_TO_LOCAL_DEFAULT);
        boolean verify = configuration.getBoolean(Constants.OPTION_VERIFY, Constants.OPTION_VERIFY_DEFAULT);
        boolean runLocal = configuration.getBoolean(Constants.OPTION_RUN_LOCAL, Constants.OPTION_RUN_LOCAL_DEFAULT);

        FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
        if (overrideOutput) {
            fs.delete(new Path(args[1]), true);
            fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_1), true);
            fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_2), true);
            fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_3), true);
            fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_4), true);
        }

        if ((copyToLocal) || (runLocal)) {
            File path = new File(args[1]);
            path.mkdirs();
        }

        Tool first = new FirstDriver(configuration);
        int status = first.run(new String[] { args[0], args[1] + OUTPUT_PATH_POSTFIX_1 });
        if (status != 0) {
            return status;
        }

        createOffsetsFile(fs, args[1] + OUTPUT_PATH_POSTFIX_1, args[1] + OUTPUT_PATH_POSTFIX_1);
        Path offsets = new Path(args[1] + OUTPUT_PATH_POSTFIX_1, Constants.OFFSETS_FILENAME);
        DistributedCache.addCacheFile(offsets.toUri(), configuration);

        Tool second = new SecondDriver(configuration);
        status = second.run(new String[] { args[0], args[1] + OUTPUT_PATH_POSTFIX_2 });
        if (status != 0) {
            return status;
        }

        Tool third = new ThirdDriver(configuration);
        status = third.run(new String[] { args[1] + OUTPUT_PATH_POSTFIX_2, args[1] + OUTPUT_PATH_POSTFIX_3 });
        if (status != 0) {
            return status;
        }

        Tool fourth = new FourthDriver(configuration);
        status = fourth.run(new String[] { args[1] + OUTPUT_PATH_POSTFIX_3, args[1] + OUTPUT_PATH_POSTFIX_4 });
        if (status != 0) {
            return status;
        }

        if (copyToLocal) {
            Tool download = new download(configuration);
            download.run(
                    new String[] { args[1] + OUTPUT_PATH_POSTFIX_2, args[1] + OUTPUT_PATH_POSTFIX_4, args[1] });
        }

        if (verify) {
            DatasetGraphTDB dsgMem = load(args[0]);
            Location location = new Location(args[1]);

            if (!copyToLocal) {
                // TODO: this is a sort of a cheat and it could go away (if it turns out to be too slow)!
                download.fixNodeTable2(location);
            }

            DatasetGraphTDB dsgDisk = SetupTDB.buildDataset(location);
            boolean isomorphic = isomorphic(dsgMem, dsgDisk);
            System.out.println("> " + isomorphic);
        }

        return status;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new tdbloader4(), args);
        System.exit(exitCode);
    }

    private void createOffsetsFile(FileSystem fs, String input, String output) throws IOException {
        log.debug("Creating offsets file...");
        Map<Long, Long> offsets = new TreeMap<Long, Long>();
        FileStatus[] status = fs.listStatus(new Path(input));
        for (FileStatus fileStatus : status) {
            Path file = fileStatus.getPath();
            if (file.getName().startsWith("part-r-")) {
                log.debug("Processing: {}", file.getName());
                BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(file)));
                String line = in.readLine();
                String[] tokens = line.split("\\s");
                long partition = Long.valueOf(tokens[0]);
                long offset = Long.valueOf(tokens[1]);
                log.debug("Partition {} has offset {}", partition, offset);
                offsets.put(partition, offset);
            }
        }

        Path outputPath = new Path(output, Constants.OFFSETS_FILENAME);
        PrintWriter out = new PrintWriter(new OutputStreamWriter(fs.create(outputPath)));
        for (Long partition : offsets.keySet()) {
            out.println(partition + "\t" + offsets.get(partition));
        }
        out.close();
        log.debug("Offset file created.");
    }

    public static boolean isomorphic(DatasetGraph dsgMem, DatasetGraph dsgDisk) {
        if (!dsgMem.getDefaultGraph().isIsomorphicWith(dsgDisk.getDefaultGraph()))
            return false;
        Iterator<Node> graphsMem = dsgMem.listGraphNodes();
        Iterator<Node> graphsDisk = dsgDisk.listGraphNodes();

        Set<Node> seen = new HashSet<Node>();

        while (graphsMem.hasNext()) {
            Node graphNode = graphsMem.next();
            if (dsgDisk.getGraph(graphNode) == null)
                return false;
            if (!dsgMem.getGraph(graphNode).isIsomorphicWith(dsgDisk.getGraph(graphNode)))
                return false;
            seen.add(graphNode);
        }

        while (graphsDisk.hasNext()) {
            Node graphNode = graphsDisk.next();
            if (!seen.contains(graphNode)) {
                if (dsgMem.getGraph(graphNode) == null)
                    return false;
                if (!dsgMem.getGraph(graphNode).isIsomorphicWith(dsgDisk.getGraph(graphNode)))
                    return false;
            }
        }

        return true;
    }

    public static DatasetGraphTDB load(String inputPath) {
        List<String> urls = new ArrayList<String>();
        for (File file : new File(inputPath).listFiles()) {
            if (file.isFile()) {
                urls.add(file.getAbsolutePath());
            }
        }
        DatasetGraphTransaction dsg = (DatasetGraphTransaction) TDBFactory.createDatasetGraph();
        TDBLoader.load(dsg.getBaseDatasetGraph(), urls);

        return dsg.getBaseDatasetGraph();
    }

    public static String dump(DatasetGraph dsgMem, DatasetGraph dsgDisk) {
        StringBuffer sb = new StringBuffer();
        sb.append("\n");

        if (!dsgMem.getDefaultGraph().isIsomorphicWith(dsgDisk.getDefaultGraph())) {
            sb.append("Default graphs are not isomorphic [FAIL]\n");
            sb.append("    First:\n");
            dump(sb, dsgMem.getDefaultGraph());
            sb.append("    Second:\n");
            dump(sb, dsgDisk.getDefaultGraph());
        } else {
            sb.append("Default graphs are isomorphic [OK]\n");
        }

        Iterator<Node> graphsMem = dsgMem.listGraphNodes();
        Iterator<Node> graphsDisk = dsgDisk.listGraphNodes();
        Set<Node> seen = new HashSet<Node>();

        while (graphsMem.hasNext()) {
            Node graphNode = graphsMem.next();
            if (dsgDisk.getGraph(graphNode) == null)
                sb.append(graphNode + " is present only in one dataset. [FAIL]\n");
            if (!dsgMem.getGraph(graphNode).isIsomorphicWith(dsgDisk.getGraph(graphNode))) {
                sb.append("\n" + graphNode + " graphs are not isomorphic [FAIL]\n");
                sb.append("    First:\n");
                dump(sb, dsgMem.getGraph(graphNode));
                sb.append("    Second:\n");
                dump(sb, dsgDisk.getGraph(graphNode));
            }
            seen.add(graphNode);
        }

        while (graphsDisk.hasNext()) {
            Node graphNode = graphsDisk.next();
            if (!seen.contains(graphNode)) {
                if (dsgMem.getGraph(graphNode) == null)
                    sb.append(graphNode + " is present only in one dataset. [FAIL]\n");
                if (!dsgMem.getGraph(graphNode).isIsomorphicWith(dsgDisk.getGraph(graphNode))) {
                    sb.append("\n" + graphNode + " graphs are not isomorphic [FAIL]\n");
                    sb.append("    First:\n");
                    dump(sb, dsgMem.getGraph(graphNode));
                    sb.append("    Second:\n");
                    dump(sb, dsgDisk.getGraph(graphNode));
                }
            }
        }

        return sb.toString();
    }

    private static void dump(StringBuffer sb, Graph graph) {
        ExtendedIterator<Triple> iter = graph.find(Node.ANY, Node.ANY, Node.ANY);
        while (iter.hasNext()) {
            Triple triple = iter.next();
            sb.append(triple).append("\n");
        }
    }

}