knowledgeMiner.preprocessing.KnowledgeMinerPreprocessor.java Source code

Java tutorial

Introduction

Here is the source code for knowledgeMiner.preprocessing.KnowledgeMinerPreprocessor.java

Source

/*******************************************************************************
 * Copyright (C) 2013 University of Waikato, Hamilton, New Zealand.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *    Sam Sarjant - initial API and implementation
 ******************************************************************************/
package knowledgeMiner.preprocessing;

import graph.core.CommonConcepts;
import graph.core.DirectedAcyclicGraph;
import io.ResourceAccess;
import io.ontology.OntologySocket;
import io.resources.WMISocket;

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import knowledgeMiner.ConceptMiningTask;
import knowledgeMiner.ConceptModule;
import knowledgeMiner.KnowledgeMiner;
import knowledgeMiner.WeightedHeuristic;
import knowledgeMiner.mapping.CycMapper;
import knowledgeMiner.mapping.MappingHeuristic;
import knowledgeMiner.mining.CycMiner;
import knowledgeMiner.mining.MinedInformation;
import knowledgeMiner.mining.MiningHeuristic;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.slf4j.LoggerFactory;

import util.UtilityMethods;
import util.collection.WeightedSet;
import util.serialisation.DefaultSerialisationMechanism;
import util.serialisation.FSTSerialisationMechanism;
import util.serialisation.SerialisationMechanism;
import cyc.OntologyConcept;

/**
 * Precomputes mappings and minings for concepts/articles to be used later. Also
 * has the capability to read the precomputed data.
 * 
 * @author Sam Sarjant
 */
public class KnowledgeMinerPreprocessor {
    /**
     * Root directory under which all precomputed heuristic results are
     * stored (see {@link #makeFilename(String, int)}).
     */
    private static final File DIR_PATH = new File("precomputedHeuristics");

    /** The lazily-created singleton returned by {@link #getInstance()}. */
    private static KnowledgeMinerPreprocessor instance_;

    /**
     * Global switch. While false, loading, recording, progress counting and
     * writing of precomputed results are all no-ops in this class.
     */
    public static boolean ENABLE_PREPROCESSING = true;

    /**
     * Chunk size (2^13 = 8192). Used both to group indices into storage
     * sub-folders and as the number of processed items between periodic
     * flushes of the results to disk.
     */
    public static final int NUM_MAPPINGS = (int) Math.pow(2, 13);

    /**
     * The data being precomputed. Heuristic name => the result store holding
     * the concept/article indices and their precomputed data.
     */
    private Map<String, HeuristicResult> heuristicResults_;

    /** The current heuristics being processed. */
    private Collection<? extends WeightedHeuristic> heuristics_;

    /** The mapper link. */
    private CycMapper mapper_;

    /** The miner link. */
    private CycMiner miner_;

    /** The number of preprocessed concepts/articles. */
    private int numWritten_;

    /**
     * If true, {@link #isProcessed(String, int)} always reports false so that
     * stored results are recomputed. Set from the "f" command-line option.
     */
    private boolean overwriteMode_ = false;

    /** The start time of the processing. */
    private long startTime_;

    /**
     * Private singleton constructor. Links to the miner and mapper of the
     * {@link KnowledgeMiner} instance and registers an empty result store,
     * keyed by heuristic name, for every mapping heuristic (both directions)
     * and every mining heuristic.
     */
    @SuppressWarnings("rawtypes")
    private KnowledgeMinerPreprocessor() {
        miner_ = KnowledgeMiner.getInstance().getMiner();
        mapper_ = KnowledgeMiner.getInstance().getMapper();

        heuristicResults_ = new HashMap<String, HeuristicResult>();

        // Ontology -> Wikipedia mapping heuristics
        for (MappingHeuristic toWikiHeuristic : mapper_.getCycToWikiMappingSuite().getHeuristics()) {
            String name = toWikiHeuristic.getHeuristicName();
            heuristicResults_.put(name, new HeuristicResult(name));
        }

        // Wikipedia -> ontology mapping heuristics
        for (MappingHeuristic toCycHeuristic : mapper_.getWikiToCycMappingSuite().getHeuristics()) {
            String name = toCycHeuristic.getHeuristicName();
            heuristicResults_.put(name, new HeuristicResult(name));
        }

        // Mining heuristics
        for (MiningHeuristic minerHeuristic : miner_.getMiningHeuristics()) {
            String name = minerHeuristic.getHeuristicName();
            heuristicResults_.put(name, new HeuristicResult(name));
        }
    }

    /**
     * Loops through all Wikipedia/Ontology indices, queueing a
     * {@link PrecomputationTask} for every eligible entry on a fixed-size
     * thread pool, and blocks until every queued task has completed.
     * <p>
     * When the task type is CYC_TO_WIKI the ontology is traversed and a
     * concept is queued only if it resolves to a constant, is not marked
     * ephemeral and is not a predicate. For every other task type the
     * Wikipedia pages are traversed and only articles and disambiguation
     * pages are queued.
     * 
     * @param taskType
     *            The type of task to process.
     * @param heuristics
     *            The heuristics to process with.
     * @param reverseOrder
     *            If true, traversal starts from a high ID and walks backwards
     *            using the "previous" lookups; otherwise it starts at 0 and
     *            walks forwards.
     */
    private void precomputeAll(PrecomputationTaskType taskType, Collection<? extends WeightedHeuristic> heuristics,
            boolean reverseOrder) {
        // If the next ID cannot be fetched this many times in a row the
        // traversal is abandoned; retrying forever on the same ID would
        // never make progress.
        final int maxConsecutiveFetchFailures = 10;

        boolean loopOntology = taskType == PrecomputationTaskType.CYC_TO_WIKI;

        // Set up the iterator
        OntologySocket ontology = (loopOntology) ? ResourceAccess.requestOntologySocket() : null;
        WMISocket wmi = (loopOntology) ? null : ResourceAccess.requestWMISocket();

        // Set up an executor and add all concepts to the execution queue.
        ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors
                .newFixedThreadPool(Math.max(1, KnowledgeMiner.getNumThreads()));
        // NOTE(review): 35000000 is presumably an upper bound above every
        // existing ID - confirm against the data source.
        int id = (reverseOrder) ? 35000000 : 0;
        int fetchFailures = 0;

        while (true) {
            // Get next thing. Kept separate from the processing step so a
            // failure here (where 'id' has not advanced) can be bounded.
            int nextID;
            try {
                if (loopOntology)
                    nextID = (reverseOrder) ? ontology.getPrevNode(id) : ontology.getNextNode(id);
                else
                    nextID = (reverseOrder) ? wmi.getPrevArticle(id) : wmi.getNextArticle(id);
                fetchFailures = 0;
            } catch (Exception e) {
                e.printStackTrace();
                fetchFailures++;
                if (fetchFailures >= maxConsecutiveFetchFailures) {
                    System.err.println("Could not advance past ID " + id + " after " + fetchFailures
                            + " attempts. Aborting traversal.");
                    break;
                }
                continue;
            }

            // A negative ID signals the end of the traversal
            if (nextID < 0)
                break;
            id = nextID;

            try {
                if (loopOntology) {
                    // If the concept is not a predicate or ephemeral, process
                    // it.
                    String constant = ontology.findConceptByID(id);
                    if (constant != null
                            && ontology.getProperty(id, true, DirectedAcyclicGraph.EPHEMERAL_MARK) == null
                            && !ontology.evaluate(null, CommonConcepts.ISA.getID(), constant,
                                    CommonConcepts.PREDICATE.getID())) {
                        PrecomputationTask preTask = new PrecomputationTask(
                                new ConceptModule(new OntologyConcept(constant, id)), heuristics, taskType, this);
                        executor.execute(preTask);
                    }
                } else {
                    String type = wmi.getPageType(id);
                    // If it's an article or disambiguation, process it. A
                    // page with no known type is skipped.
                    if (type != null
                            && (type.equals(WMISocket.TYPE_ARTICLE) || type.equals(WMISocket.TYPE_DISAMBIGUATION))) {
                        PrecomputationTask preTask = new PrecomputationTask(new ConceptModule(id), heuristics,
                                taskType, this);
                        executor.execute(preTask);
                    }
                }
            } catch (Exception e) {
                // Best effort: a failure on one entry must not stop the
                // traversal; 'id' has already advanced.
                e.printStackTrace();
            }
        }

        // Wait for completion
        executor.shutdown();
        try {
            if (executor.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS))
                return;
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Restore the interrupt flag so callers can observe it
            Thread.currentThread().interrupt();
        }
        System.err.println("Error precomputing tasks.");
    }

    /**
     * Records the ontology-to-Wikipedia mapping results of a heuristic
     * against the ID of the mapped concept. The data is held in memory via
     * {@link #recordData(String, int, Object)}.
     * 
     * @param concept
     *            The module whose ontology concept was mapped.
     * @param mappings
     *            The mapping data to record.
     * @param heuristic
     *            The heuristic that produced the mappings.
     */
    protected void writeCycMappedData(ConceptModule concept, WeightedSet<Integer> mappings,
            MappingHeuristic<OntologyConcept, Integer> heuristic) {
        String heuristicName = heuristic.getHeuristicName();
        int conceptID = concept.getConcept().getID();
        recordData(heuristicName, conceptID, mappings);
    }

    /**
     * Records the information mined by a heuristic against the ID of the
     * mined article. The data is held in memory via
     * {@link #recordData(String, int, Object)}.
     * 
     * @param article
     *            The module whose article was mined.
     * @param info
     *            The mined information to record.
     * @param heuristic
     *            The heuristic that produced the information.
     */
    protected void writeMinedData(ConceptModule article, MinedInformation info, MiningHeuristic heuristic) {
        String heuristicName = heuristic.getHeuristicName();
        int articleID = article.getArticle();
        recordData(heuristicName, articleID, info);
    }

    /**
     * Records the Wikipedia-to-ontology mapping results of a heuristic
     * against the ID of the mapped article. The data is held in memory via
     * {@link #recordData(String, int, Object)}.
     * 
     * @param article
     *            The module whose article was mapped.
     * @param mappings
     *            The mapping data to record.
     * @param heuristic
     *            The heuristic that produced the mappings.
     */
    protected void writeWikiMappedData(ConceptModule article, WeightedSet<OntologyConcept> mappings,
            MappingHeuristic<Integer, OntologyConcept> heuristic) {
        String heuristicName = heuristic.getHeuristicName();
        int articleID = article.getArticle();
        recordData(heuristicName, articleID, mappings);
    }

    /**
     * Gets the precomputed result of a heuristic for a given index, loading
     * it from its serialised file if it is not already held in memory.
     * 
     * @param heuristicName
     *            The name of the heuristic to get the result for.
     * @param index
     *            The index (article/concept ID) of the result.
     * @return The precomputed result, or null if preprocessing is disabled,
     *         the heuristic name is not a registered heuristic, or no result
     *         is available for the index.
     */
    public Object getLoadHeuristicResult(String heuristicName, int index) {
        if (!ENABLE_PREPROCESSING)
            return null;

        // An unregistered (or null) heuristic name has no result store;
        // treat it as "nothing precomputed" rather than failing.
        HeuristicResult result = heuristicResults_.get(heuristicName);
        if (result == null)
            return null;
        return result.getLoadHeuristicResults(index);
    }

    /**
     * Notes that one more concept/article has been processed. Every
     * {@link ConceptMiningTask#UPDATE_INTERVAL} items a status line is
     * printed and logged, and every {@link #NUM_MAPPINGS} items the recorded
     * results are flushed to disk. Does nothing while preprocessing is
     * disabled.
     * <p>
     * Synchronised so that the counter update and the modulo checks are
     * atomic when several worker threads report progress at once; otherwise
     * increments could be lost and a flush skipped or repeated. Note that a
     * periodic flush runs while the monitor is held.
     */
    public synchronized void incrementProcessed() {
        if (!ENABLE_PREPROCESSING)
            return;
        numWritten_++;
        if (numWritten_ % ConceptMiningTask.UPDATE_INTERVAL == 0) {
            long elapsedTime = System.currentTimeMillis() - startTime_;
            String status = numWritten_ + " processed. " + UtilityMethods.toTimeFormat(elapsedTime) + " runtime.";
            System.out.println("\n" + status);
            LoggerFactory.getLogger("STATUS").info(status);
        }

        if (numWritten_ % NUM_MAPPINGS == 0) {
            System.out.println("Writing heuristics to file.");
            writeHeuristics();
        }
    }

    /**
     * Checks whether a stored result file already exists for the given
     * heuristic and ID.
     * 
     * @param heuristicName
     *            The name of the heuristic to check for.
     * @param id
     *            The id of the thing being checked.
     * @return True if a result file exists for the heuristic and ID. Always
     *         false in overwrite mode or while preprocessing is disabled.
     */
    public boolean isProcessed(String heuristicName, int id) {
        boolean consultDisk = !overwriteMode_ && ENABLE_PREPROCESSING;
        if (!consultDisk)
            return false;

        File storedResult = makeFilename(heuristicName, id);
        return storedResult.exists();
    }

    /**
     * Precomputes the mapping/mining for a single input, or for every
     * concept/article when no input is given. Enables preprocessing as a side
     * effect.
     * 
     * @param cm
     *            The input to map/mine. If null, every concept/article is
     *            processed via the thread pool.
     * @param heuristics
     *            The heuristics to process with.
     * @param taskType
     *            The task type to perform.
     * @param reverseOrder
     *            If the data should be precomputed in reverse order (only
     *            relevant when processing everything).
     */
    public void precomputeData(ConceptModule cm, Collection<? extends WeightedHeuristic> heuristics,
            PrecomputationTaskType taskType, boolean reverseOrder) {
        ENABLE_PREPROCESSING = true;

        // A single input is processed directly on the calling thread
        if (cm != null) {
            PrecomputationTask singleTask = new PrecomputationTask(cm, heuristics, taskType, this);
            singleTask.run();
            return;
        }

        // No input given: run through everything via threads
        precomputeAll(taskType, heuristics, reverseOrder);
    }

    /**
     * A general function for recording data. Type information is deliberately
     * general to allow flexibility. The data is held in memory until
     * {@link #writeHeuristics()} is called.
     * <p>
     * Nothing is recorded if the heuristic name or data is null, the index is
     * negative, preprocessing is disabled, or the heuristic name is not one
     * of the registered heuristics (the latter is reported on stderr).
     * 
     * @param heuristicName
     *            The name of the heuristic to record the data for.
     * @param index
     *            The index (article/concept) that was processed.
     * @param data
     *            The data linked to the index.
     */
    public void recordData(String heuristicName, int index, Object data) {
        if (heuristicName == null || index < 0 || data == null || !ENABLE_PREPROCESSING)
            return;

        HeuristicResult result = heuristicResults_.get(heuristicName);
        if (result == null) {
            // No store was registered for this name at construction time
            System.err.println("Unknown heuristic '" + heuristicName + "'. Data for index " + index
                    + " was not recorded.");
            return;
        }
        result.recordResult(index, data);
    }

    /**
     * Runs a precomputation as described by the parsed command line, then
     * writes the results to file.
     * <p>
     * Options read: "m" (mine articles), "w" (map articles to ontology) or
     * "o" (map ontology to articles) select the task type, checked in that
     * order; "i" restricts processing to a single article (or a single
     * concept if "o" is present); "h" restricts the heuristics by name; "f"
     * forces recomputation of stored results; "R" processes in reverse order.
     * If a name given to "h" matches no available heuristic the JVM exits
     * with status 1.
     * 
     * @param parse
     *            The parsed command line.
     * @throws IllegalArgumentException
     *             If none of "m", "w" or "o" is given, as there is then no
     *             task type or heuristic set to run.
     * @throws NumberFormatException
     *             If the value of "i" is not an integer.
     */
    public void run(CommandLine parse) {
        // Sort out article and heuristics
        ENABLE_PREPROCESSING = true;
        startTime_ = System.currentTimeMillis();
        overwriteMode_ = parse.hasOption("f");
        numWritten_ = 0;
        ConceptModule cm = null;
        boolean reverseOrder = false;
        if (parse.getOptionValue("i") != null) {
            int i = Integer.parseInt(parse.getOptionValue("i"));
            if (parse.hasOption("o"))
                cm = new ConceptModule(new OntologyConcept(i));
            else
                cm = new ConceptModule(i);
        }
        if (parse.hasOption("R"))
            reverseOrder = true;

        PrecomputationTaskType taskType = null;
        if (parse.hasOption("m")) {
            // Mining article
            heuristics_ = new ArrayList<>(miner_.getMiningHeuristics());
            taskType = PrecomputationTaskType.MINE;
        } else if (parse.hasOption("w")) {
            // Mapping article to concept
            heuristics_ = new ArrayList<>(mapper_.getWikiToCycMappingSuite().getHeuristics());
            taskType = PrecomputationTaskType.WIKI_TO_CYC;
        } else if (parse.hasOption("o")) {
            // Mapping concept to article
            heuristics_ = new ArrayList<>(mapper_.getCycToWikiMappingSuite().getHeuristics());
            taskType = PrecomputationTaskType.CYC_TO_WIKI;
        }

        // Without a task type there are no heuristics to run (or to filter
        // with "h"), so fail fast rather than traversing with nulls.
        if (taskType == null)
            throw new IllegalArgumentException(
                    "No task type given. Specify one of -m (mine), -w (article to ontology) or -o (ontology to article).");

        // Parse the heuristics
        if (parse.hasOption("h")) {
            String[] heuristicArgs = parse.getOptionValues("h");
            Collection<WeightedHeuristic> filteredHeuristics = new ArrayList<>();
            // Find the heuristic by name
            for (String arg : heuristicArgs) {
                boolean found = false;
                for (WeightedHeuristic obj : heuristics_) {
                    if (arg.equals(obj.getHeuristicName())) {
                        filteredHeuristics.add(obj);
                        found = true;
                        break;
                    }
                }

                if (!found) {
                    System.err.println("Could not find heuristic: " + arg);
                    System.err.println("Available heuristics: " + heuristics_.toString());
                    System.exit(1);
                }
            }

            heuristics_ = filteredHeuristics;
        }
        System.out.println("Beginning precomputation with heuristics: " + heuristics_);
        precomputeData(cm, heuristics_, taskType, reverseOrder);

        // Write files
        writeHeuristics();

        long elapsedTime = System.currentTimeMillis() - startTime_;
        System.out.println("\n\nProcessing complete. " + UtilityMethods.toTimeFormat(elapsedTime) + " runtime.");
    }

    /**
     * Flushes the recorded results of every heuristic to file and then
     * resets the FST serialiser to clear its serialisation cache. Does
     * nothing while preprocessing is disabled.
     */
    public void writeHeuristics() {
        if (ENABLE_PREPROCESSING) {
            // Write every heuristic
            for (HeuristicResult resultStore : heuristicResults_.values()) {
                resultStore.writeHeuristic();
            }

            // Clear the serialisation cache
            FSTSerialisationMechanism fstSerialiser = (FSTSerialisationMechanism) SerialisationMechanism.FST
                    .getSerialiser();
            fstSerialiser.reset();
        }
    }

    /**
     * Gets the singleton {@link KnowledgeMinerPreprocessor}, creating it on
     * the first call. On creation the FST serialiser is also requested
     * (presumably to initialise it up front - confirm against
     * {@link SerialisationMechanism}).
     * <p>
     * This method performs no synchronisation.
     * 
     * @return The {@link KnowledgeMinerPreprocessor} instance; never null.
     */
    public static KnowledgeMinerPreprocessor getInstance() {
        if (instance_ != null)
            return instance_;

        instance_ = new KnowledgeMinerPreprocessor();
        SerialisationMechanism.FST.getSerialiser();
        return instance_;
    }

    /**
     * Command-line entry point for precomputation. Enables preprocessing,
     * parses the arguments and hands them to {@link #run(CommandLine)} on the
     * singleton instance. Any exception is printed and the JVM exits with
     * status 1.
     * 
     * @param args
     *            The args to determine which mode to run in.
     */
    public static void main(String[] args) {
        ENABLE_PREPROCESSING = true;
        Options cliOptions = buildOptions();

        CommandLineParser cliParser = new BasicParser();
        try {
            CommandLine parsedArgs = cliParser.parse(cliOptions, args);
            getInstance().run(parsedArgs);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Builds the set of command-line options understood by
     * {@link #main(String[])}.
     * 
     * @return The option definitions.
     */
    private static Options buildOptions() {
        Options cliOptions = new Options();
        cliOptions.addOption("m", false, "If precomputing mined data.");
        cliOptions.addOption("w", false, "If mapping from article to ontology.");
        cliOptions.addOption("o", false, "If mapping from ontology to article.");
        cliOptions.addOption("f", false, "Force all heuristics to be run, even if they have stored results.");
        cliOptions.addOption("i", true, "The article/concept to mine. Defaults to all.");
        cliOptions.addOption("R", false, "If the process should run in reverse order.");

        // The heuristic option accepts up to 20 names
        Option heuristicOption = new Option("h", true, "The heuristic(s) to use. Defaults to all.");
        heuristicOption.setArgs(20);
        cliOptions.addOption(heuristicOption);
        return cliOptions;
    }

    /**
     * Basic helper method for consistent file naming. Results are organised
     * as {@code <heuristic>/<chunk>/<index>.dat} beneath the storage
     * directory, where the chunk is the index rounded towards zero to a
     * multiple of {@link #NUM_MAPPINGS}.
     * 
     * @param heuristic
     *            The name of the heuristic the file belongs to.
     * @param index
     *            The index (article/concept ID) the file is for.
     * @return The file location for the heuristic and index.
     */
    public static File makeFilename(String heuristic, int index) {
        // Organise into folders
        int chunkStart = index - (index % NUM_MAPPINGS);
        String relativePath = heuristic + File.separator + chunkStart + File.separator + index + ".dat";
        return new File(DIR_PATH, relativePath);
    }

    private class HeuristicResult {
        /** The indices that have changed and need to be serialised. */
        private Collection<Integer> changed_;

        /** The name of the heuristic this result set is for. */
        private String heuristicName_;

        /** HashMap results by ID. */
        private Map<Integer, Object> resultMap_;

        private Lock writeLock_;

        /**
         * Constructor for a new HeuristicResult
         */
        public HeuristicResult(String heuristicName) {
            heuristicName_ = heuristicName;
            resultMap_ = new HashMap<>();
            writeLock_ = new ReentrantLock();
            changed_ = new HashSet<>();
        }

        private Object loadResult(int index) {
            File location = makeFilename(heuristicName_, index);
            Object deser = null;
            try {
                deser = SerialisationMechanism.FST.getSerialiser().deserialize(location);
            } catch (Exception e) {
                e.printStackTrace();
                // Remove the file and treat as unprocessed
                location.delete();
                return null;
            }
            if (deser != null) {
                try {
                    writeLock_.lock();
                    resultMap_.put(index, deser);
                } finally {
                    writeLock_.unlock();
                }
            }
            return deser;
        }

        /**
         * Clears the results from memory.
         */
        public void clear() {
            resultMap_.clear();
            changed_.clear();
        }

        /**
         * Gets or loads the mapping containing the preprocessed results.
         * 
         * @param index
         *            The index to get the map for.
         * @return The mapping that would contain the index.
         */
        public Object getLoadHeuristicResults(int index) {
            // First attempt to load it
            if (resultMap_.containsKey(index))
                return resultMap_.get(index);
            return loadResult(index);
        }

        /**
         * Records a result for the index and returns the old value (if any).
         * 
         * @param index
         *            The index to record the value for.
         * @param data
         *            The value to record.
         * @return The old value (if any).
         */
        public void recordResult(int index, Object data) {
            try {
                writeLock_.lock();
                Object oldVal = resultMap_.put(index, data);
                if (oldVal == null || !oldVal.equals(data))
                    changed_.add(index);
            } finally {
                writeLock_.unlock();
            }
        }

        /**
         * Writes the current results to file.
         */
        public void writeHeuristic() {
            // If there's no data to serialise, continue.
            if (resultMap_.isEmpty())
                return;

            try {
                writeLock_.lock();
                for (Integer index : changed_) {
                    // Set up file details
                    File location = makeFilename(heuristicName_, index);
                    // Serialise it.
                    try {
                        SerialisationMechanism.FST.getSerialiser().serialize(resultMap_.get(index), location,
                                DefaultSerialisationMechanism.NORMAL);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                clear();
            } finally {
                writeLock_.unlock();
            }
        }
    }
}