knowledgeMiner.mining.wikipedia.CategoryMembershipMiner.java Source code

Java tutorial

Introduction

Here is the source code for knowledgeMiner.mining.wikipedia.CategoryMembershipMiner.java

Source

/*******************************************************************************
 * Copyright (C) 2013 University of Waikato, Hamilton, New Zealand.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *    Sam Sarjant - initial API and implementation
 ******************************************************************************/
package knowledgeMiner.mining.wikipedia;

import io.ontology.OntologySocket;
import io.resources.WMISocket;

import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import knowledgeMiner.mapping.CycMapper;
import knowledgeMiner.mapping.textToCyc.TextToCyc_TextSearch;
import knowledgeMiner.mining.CycMiner;
import knowledgeMiner.mining.HeuristicProvenance;
import knowledgeMiner.mining.InformationType;
import knowledgeMiner.mining.MinedInformation;
import knowledgeMiner.mining.PartialAssertion;
import knowledgeMiner.mining.SentenceParserHeuristic;

import org.apache.commons.lang3.StringUtils;

import util.collection.WeightedSet;
import cyc.CycConstants;
import cyc.OntologyConcept;

/**
 * A Wikipedia article heuristic that mines the titles of the categories an
 * article belongs to. Category titles matching the birth/death patterns are
 * converted directly into dated assertions; every other (sufficiently
 * distinct) category title is passed to the sentence miner as a short piece of
 * text.
 * 
 * @author Sam Sarjant
 */
public class CategoryMembershipMiner extends WikipediaArticleMiningHeuristic {
    /** Matches titles such as "1950 births" or "1950s birth"; group 1 is the number. */
    private static final Pattern BIRTH_PATTERN = Pattern.compile("(\\d{1,4})s? births?");

    /** Matches titles such as "1950 deaths" or "1950s death"; group 1 is the number. */
    private static final Pattern DEATH_PATTERN = Pattern.compile("(\\d{1,4})s? deaths?");

    /** Not read anywhere in this class; kept public for external users. */
    public static boolean wikifyText_ = true;

    /** The " stub" / " stubs" marker that is stripped from category titles. */
    private static final Pattern STUB_MARKER = Pattern.compile(" stubs?");

    /**
     * Category titles within this edit distance of the article title are
     * considered too similar to the article itself and are not mined.
     */
    private static final int MAX_TITLE_EDIT_DISTANCE = 3;

    /**
     * Creates the heuristic.
     * 
     * @param mapper
     *            The mapper handed on to the superclass.
     * @param miner
     *            The miner handed on to the superclass.
     */
    public CategoryMembershipMiner(CycMapper mapper, CycMiner miner) {
        // NOTE(review): the meaning of the leading 'true' flag is defined by
        // the superclass constructor, which is not visible here.
        super(true, mapper, miner);
    }

    /**
     * Walks over every category of the article and mines it, either through
     * the special birth/death handling or by parsing the title as text.
     * 
     * @param info
     *            The info identifying the article and receiving the mined
     *            assertions.
     * @param informationRequested
     *            Not used by this heuristic.
     * @param wmi
     *            The WMI access.
     * @param ontology
     *            The ontology access.
     * @throws Exception
     *             Should something go awry.
     */
    @Override
    protected void mineArticleInternal(MinedInformation info, int informationRequested, WMISocket wmi,
            OntologySocket ontology) throws Exception {
        final int articleId = info.getArticle();
        final String articleTitle = wmi.getPageTitle(articleId, true);

        for (Integer categoryId : wmi.getArticleCategories(articleId)) {
            // Drop the word 'stub(s)' from the category title.
            final String title = STUB_MARKER.matcher(wmi.getPageTitle(categoryId, true)).replaceAll("");

            // The checks short-circuit in this order:
            // 1. the category is named exactly like the article,
            // 2. the category was consumed by the special birth/death parsing,
            // 3. the category title is nearly the same as the article title
            // (getLevenshteinDistance yields -1 only when the distance
            // exceeds the threshold).
            final boolean skip = title.equals(articleTitle)
                    || parseSpecial(title, info, ontology, wmi)
                    || StringUtils.getLevenshteinDistance(articleTitle, title, MAX_TITLE_EDIT_DISTANCE) != -1;

            if (!skip) {
                // Everything else is treated as a chunk of text to be parsed.
                miner_.mineSentence(SentenceParserHeuristic.SENTENCE_PREFIX + title + ".", false, info, this,
                        ontology, wmi);
            }
        }
    }

    /**
     * Handles category titles that are parsed with dedicated patterns instead
     * of the NLP parser. The birth pattern is tried first, then the death
     * pattern.
     * 
     * @param categoryTitle
     *            The title of the category.
     * @param info
     *            The info to add assertion(s) to.
     * @param ontology
     *            The ontology access.
     * @param wmi
     *            The WMI access.
     * @return True if a birth or death assertion was added to the info; false
     *         otherwise (also when a pattern matched but nothing could be
     *         mapped for the captured number).
     * @throws IllegalAccessException
     *             Should something go awry.
     */
    private boolean parseSpecial(String categoryTitle, MinedInformation info, OntologySocket ontology,
            WMISocket wmi) throws IllegalAccessException {
        // '||' keeps the death predicate from being looked up unless the
        // birth attempt produced nothing.
        return addDatedAssertion(categoryTitle, info, BIRTH_PATTERN, CycConstants.BIRTH_DATE.getConcept(),
                ontology, wmi)
                || addDatedAssertion(categoryTitle, info, DEATH_PATTERN, CycConstants.DEATH_DATE.getConcept(),
                        ontology, wmi);
    }

    /**
     * Builds a dated assertion for the title and, if one could be built,
     * records it on the info.
     * 
     * @param categoryTitle
     *            The title of the category.
     * @param info
     *            The info to add the assertion to.
     * @param titlePattern
     *            The pattern the title has to match.
     * @param predicate
     *            The predicate of the assertion.
     * @param ontology
     *            The ontology access.
     * @param wmi
     *            The WMI access.
     * @return True if an assertion was created and added.
     * @throws IllegalAccessException
     *             Should something go awry.
     */
    private boolean addDatedAssertion(String categoryTitle, MinedInformation info, Pattern titlePattern,
            OntologyConcept predicate, OntologySocket ontology, WMISocket wmi) throws IllegalAccessException {
        PartialAssertion dated = createDatedAssertion(categoryTitle, info, titlePattern, predicate, ontology, wmi);
        if (dated == null) {
            return false;
        }
        info.addAssertion(dated);
        return true;
    }

    /**
     * Creates an assertion linking the article to the concept mapped from the
     * number captured by the given title pattern.
     * 
     * @param categoryTitle
     *            The title of the category.
     * @param info
     *            The info supplying the self reference used as an argument.
     * @param titlePattern
     *            The pattern that must match the whole title; its first group
     *            is the text that gets mapped.
     * @param predicate
     *            The predicate of the assertion.
     * @param ontology
     *            The ontology access.
     * @param wmi
     *            The WMI access.
     * @return The assertion, or null if the title does not match or the
     *         captured text maps to nothing.
     * @throws IllegalAccessException
     *             Should something go awry.
     */
    @SuppressWarnings("unchecked")
    private PartialAssertion createDatedAssertion(String categoryTitle, MinedInformation info, Pattern titlePattern,
            OntologyConcept predicate, OntologySocket ontology, WMISocket wmi) throws IllegalAccessException {
        Matcher titleMatcher = titlePattern.matcher(categoryTitle);
        if (!titleMatcher.matches()) {
            return null;
        }

        HeuristicProvenance provenance = new HeuristicProvenance(this, categoryTitle);
        WeightedSet<OntologyConcept> mapped = mapper_.mapViaHeuristic(titleMatcher.group(1),
                TextToCyc_TextSearch.class, wmi, ontology);
        // Should only return 0-1 result
        if (mapped.isEmpty()) {
            return null;
        }
        return new PartialAssertion(predicate, provenance, info.getMappableSelfRef(), mapped.iterator().next());
    }

    /**
     * Flags the taxonomic information type in the given array.
     * 
     * @param infoTypes
     *            The flags to set, indexed by {@link InformationType} ordinal.
     */
    @Override
    protected void setInformationTypes(boolean[] infoTypes) {
        infoTypes[InformationType.TAXONOMIC.ordinal()] = true;
    }

}