org.seasr.meandre.components.analytics.socialnetworking.AbstractLinkCreationComponent.java Source code

Introduction

Here is the source code for org.seasr.meandre.components.analytics.socialnetworking.AbstractLinkCreationComponent.java
Source

/**
*
* University of Illinois/NCSA
* Open Source License
*
* Copyright (c) 2008, NCSA.  All rights reserved.
*
* Developed by:
* The Automated Learning Group
* University of Illinois at Urbana-Champaign
* http://www.seasr.org
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal with the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimers.
*
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
*
* Neither the names of The Automated Learning Group, University of
* Illinois at Urbana-Champaign, nor the names of its contributors may
* be used to endorse or promote products derived from this Software
* without specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE
* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*
*/

package org.seasr.meandre.components.analytics.socialnetworking;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.WordUtils;
import org.meandre.annotations.ComponentInput;
import org.meandre.annotations.ComponentProperty;
import org.meandre.core.ComponentContext;
import org.meandre.core.ComponentContextException;
import org.meandre.core.ComponentContextProperties;
import org.seasr.datatypes.core.BasicDataTypes.Strings;
import org.seasr.datatypes.core.BasicDataTypes.StringsArray;
import org.seasr.datatypes.core.BasicDataTypesTools;
import org.seasr.datatypes.core.KeyValuePair;
import org.seasr.datatypes.core.Names;
import org.seasr.meandre.components.abstracts.AbstractStreamingExecutableComponent;
import org.seasr.meandre.components.nlp.opennlp.OpenNLPNamedEntity;
import org.seasr.meandre.support.components.tuples.SimpleTuple;
import org.seasr.meandre.support.components.tuples.SimpleTuplePeer;

/**
 * Abstract skeleton component that provides link creation services.
 * Extend this class to create a complete component that outputs the result in a specific format.
 *
 * @author Boris Capitanu
 *
 */

public abstract class AbstractLinkCreationComponent extends AbstractStreamingExecutableComponent {

    //------------------------------ INPUTS ------------------------------------------------------

    // Input port that delivers the tuples to analyze; executeCallBack casts the
    // received payload to StringsArray and converts it to a Strings[].
    // NOTE(review): the description text names package org.seasr.datatypes, while this
    // file imports org.seasr.datatypes.core.BasicDataTypes - confirm which one is current.
    @ComponentInput(name = Names.PORT_TUPLES, description = "Set of tuples."
            + "<br>TYPE: org.seasr.datatypes.BasicDataTypes.StringsArray")
    protected static final String IN_TUPLES = Names.PORT_TUPLES;

    // Input port that delivers the tuple meta data; executeCallBack casts the payload
    // to Strings and wraps it in a SimpleTuplePeer to look up field indexes by name.
    @ComponentInput(name = Names.PORT_META_TUPLE, description = "Meta data for tuples."
            + "<br>TYPE: org.seasr.datatypes.BasicDataTypes.Strings")
    protected static final String IN_META_TUPLE = Names.PORT_META_TUPLE;

    //------------------------------ PROPERTIES --------------------------------------------------

    // Comma-delimited list of entity types to keep. initializeCallBack splits the value
    // on ',' and trims each item; executeCallBack then keeps only tuples whose type is
    // contained in the resulting set (exact, case-sensitive String match).
    @ComponentProperty(description = "Entity types (comma delimited list).", name = Names.PROP_ENTITIES, defaultValue = "person")
    protected static final String PROP_ENTITIES = Names.PROP_ENTITIES;

    // Maximum sentence-id difference at which two entities are still linked.
    // Parsed with Integer.parseInt in initializeCallBack, which rejects negative values.
    @ComponentProperty(description = "Maximum sentence distance whereby entities are marked as related.", name = Names.PROP_OFFSET, defaultValue = "1")
    protected static final String PROP_OFFSET = Names.PROP_OFFSET;

    // Parsed with Boolean.parseBoolean. When true, executeCallBack collapses whitespace
    // in entity values, strips non-alphabetic characters from "person" values and
    // applies WordUtils.capitalizeFully before the entity is recorded.
    @ComponentProperty(description = "Set to 'true' to apply normalization functions to entities.", name = "normalize_entities", defaultValue = "true")
    protected static final String PROP_NORMALIZE_ENTITIES = "normalize_entities";

    // Parsed with Boolean.parseBoolean. When true, entities that have neither inward
    // nor outward links are dropped right before the output is generated.
    @ComponentProperty(description = "Set to 'true' to remove uncorrelated entities.", name = "remove_uncorrelated_entities", defaultValue = "true")
    protected static final String PROP_REMOVE_EMPTY = "remove_uncorrelated_entities";

    //--------------------------------------------------------------------------------------------

    // Matches each maximal run of non-whitespace characters (captured in group 1);
    // executeCallBack re-joins the matches with single spaces to normalize whitespace.
    protected static final Pattern REGEXP_NONWHITESPACE = Pattern.compile("([^\\s]+)");
    // Captures each run of \p{Alpha} characters (group 1); used for "person" values so
    // that everything between alphabetic runs is replaced by a single space.
    // NOTE(review): \p{Alpha} is ASCII-only unless UNICODE_CHARACTER_CLASS is set, so
    // accented letters are dropped from names - confirm this is intended.
    protected static final Pattern REGEXP_PERSON = Pattern.compile("(?:(\\p{Alpha}+)\\s*)");

    // Entity types to keep, populated from PROP_ENTITIES in initializeCallBack.
    protected Set<String> _entityTypes;
    // Maximum sentence distance for linking, populated from PROP_OFFSET (never negative
    // once initializeCallBack has completed successfully).
    protected int _offset;
    // Set to true by startStream and back to false by endStream; while true,
    // executeCallBack accumulates entities without pushing output.
    protected boolean _isStreaming;
    // Value of PROP_REMOVE_EMPTY.
    protected boolean _removeUncorrelatedEntities;
    // Value of PROP_NORMALIZE_ENTITIES.
    protected boolean _normalizeEntities;

    // All entities recorded so far. Each entity maps to itself so that an equal,
    // previously recorded instance (which carries the link counts) can be retrieved.
    // Cleared by reset().
    protected final Map<Entity, Entity> _entities = new HashMap<Entity, Entity>();

    //--------------------------------------------------------------------------------------------

    /**
     * Reads the component properties and prepares the component for execution.
     * <p>
     * The sentence offset must parse as an integer and be non-negative; the entity
     * type list is split on commas and each item is trimmed. The component always
     * starts in non-streaming mode.
     *
     * @param ccp the component context properties to read the settings from
     * @throws Exception if a property is missing or malformed, or if the offset is negative
     */
    @Override
    public void initializeCallBack(ComponentContextProperties ccp) throws Exception {
        super.initializeCallBack(ccp);

        // The offset is stored first and validated afterwards
        final int maxSentenceDistance = Integer.parseInt(getPropertyOrDieTrying(PROP_OFFSET, ccp));
        _offset = maxSentenceDistance;
        if (maxSentenceDistance < 0) {
            final String message =
                    String.format("Property '%s' must be greater than or equal to zero", PROP_OFFSET);
            throw new ComponentContextException(message);
        }

        final String rawEntityTypes = getPropertyOrDieTrying(PROP_ENTITIES, ccp);

        _normalizeEntities = Boolean.parseBoolean(getPropertyOrDieTrying(PROP_NORMALIZE_ENTITIES, ccp));
        _removeUncorrelatedEntities = Boolean.parseBoolean(getPropertyOrDieTrying(PROP_REMOVE_EMPTY, ccp));

        // Collect the trimmed entity type names into a fresh set
        final Set<String> wantedTypes = new HashSet<String>();
        final String[] typeNames = rawEntityTypes.split(",");
        for (int i = 0; i < typeNames.length; i++) {
            wantedTypes.add(typeNames[i].trim());
        }
        _entityTypes = wantedTypes;

        _isStreaming = false;
    }

    //
    // TODO: allow component properties to configure which tuple fields are read
    // (e.g. sentenceId, type, text) instead of hard-coding them.
    //

    /**
     * Processes one batch of tuples and records which entities occur close to each other.
     * <p>
     * For every tuple whose type is listed in {@code _entityTypes}, the entity text is
     * optionally normalized, the entity is looked up (or registered) in {@code _entities},
     * and it is linked to every entity currently held in a sliding window of sentences.
     * Each entity already in the window gets an outward link to the new entity and the
     * new entity gets the matching inward link. A sentence leaves the window once its id
     * is more than {@code _offset} below the id of the sentence being processed.
     * <p>
     * When the component is not streaming, the output is generated and pushed at the end
     * of this call; while streaming, entities keep accumulating until {@code endStream}.
     *
     * @param cc the component context that supplies the meta tuple and the tuples
     * @throws Exception if the inputs cannot be read or a sentence id is not an integer
     */
    @Override
    public void executeCallBack(ComponentContext cc) throws Exception {
        Strings inMetaTuple = (Strings) cc.getDataComponentFromInput(IN_META_TUPLE);
        SimpleTuplePeer tuplePeer = new SimpleTuplePeer(inMetaTuple);
        console.fine("Input meta tuple: " + tuplePeer.toString());

        StringsArray inTuples = (StringsArray) cc.getDataComponentFromInput(IN_TUPLES);
        Strings[] tuples = BasicDataTypesTools.stringsArrayToJavaArray(inTuples);

        final int sentenceIdIdx = tuplePeer.getIndexForFieldName(OpenNLPNamedEntity.SENTENCE_ID_FIELD);
        final int typeIdx = tuplePeer.getIndexForFieldName(OpenNLPNamedEntity.TYPE_FIELD);
        final int textIdx = tuplePeer.getIndexForFieldName(OpenNLPNamedEntity.TEXT_FIELD);

        // Linked list of sentences keyed by sentence id - the HashSet is the set of entities in that sentence
        LinkedList<KeyValuePair<Integer, HashSet<Entity>>> sentencesWindow =
                new LinkedList<KeyValuePair<Integer, HashSet<Entity>>>();

        // Note: The algorithm used to mark entities as adjacent if they fall within the specified sentence distance
        //       relies on a sliding-window of sentences that are within the 'adjacency' range. As new sentences are
        //       considered, the window moves to the right and old sentences that are now too far fall out of scope.

        SimpleTuple tuple = tuplePeer.createTuple();
        for (Strings t : tuples) {
            tuple.setValues(t);

            // Kept as a primitive so that every comparison below is numeric. Comparing two boxed
            // Integers with != compares references, which only works for small cached values.
            final int sentenceId = Integer.parseInt(tuple.getValue(sentenceIdIdx));
            final String tupleType = tuple.getValue(typeIdx);
            String tupleValue = tuple.getValue(textIdx);

            // Skip entities of types we're not interested in
            if (!_entityTypes.contains(tupleType)) {
                continue;
            }

            if (_normalizeEntities) {
                // Normalize whitespaces: join the non-whitespace runs with single spaces
                StringBuilder sb = new StringBuilder();
                Matcher nonWhitespaceMatcher = REGEXP_NONWHITESPACE.matcher(tupleValue);
                while (nonWhitespaceMatcher.find()) {
                    sb.append(" ").append(nonWhitespaceMatcher.group(1));
                }

                // Nothing but whitespace: ignore the tuple
                if (sb.length() == 0) {
                    continue;
                }
                tupleValue = sb.substring(1);

                // Normalize people's names: keep only the alphabetic runs
                if (tupleType.toLowerCase().equals("person")) {
                    sb = new StringBuilder();
                    Matcher personMatcher = REGEXP_PERSON.matcher(tupleValue);
                    while (personMatcher.find()) {
                        sb.append(" ").append(personMatcher.group(1));
                    }

                    // No alphabetic characters at all: ignore the tuple
                    if (sb.length() == 0) {
                        continue;
                    }
                    tupleValue = sb.substring(1);

                    // ignore names with 1 character
                    if (tupleValue.length() == 1) {
                        continue;
                    }
                }

                tupleValue = WordUtils.capitalizeFully(tupleValue);
            }

            // Create an object for the entity, then swap it for the previously recorded
            // instance (if any) so that link counts accumulate on a single object
            Entity entity = new Entity(tupleType, tupleValue);
            Entity oldEntity = _entities.get(entity);
            if (oldEntity == null) {
                _entities.put(entity, entity);
            } else {
                entity = oldEntity;
            }

            // Remove all sentences (together with any entities they contained) from the set
            // of sentences that are too far from the current sentence of this entity
            while (!sentencesWindow.isEmpty()
                    && sentenceId - sentencesWindow.peek().getKey() > _offset) {
                sentencesWindow.remove();
            }

            // Reuse the last window entry when it belongs to the current sentence,
            // otherwise append a new entry for this sentence at the end of the window
            HashSet<Entity> sentenceEntities;
            if (!sentencesWindow.isEmpty()
                    && sentencesWindow.getLast().getKey().intValue() == sentenceId) {
                sentenceEntities = sentencesWindow.getLast().getValue();
            } else {
                sentenceEntities = new HashSet<Entity>();
                sentencesWindow.addLast(
                        new KeyValuePair<Integer, HashSet<Entity>>(sentenceId, sentenceEntities));
            }

            // Iterate through all the sentences in the window and all the entities in each sentence
            for (KeyValuePair<Integer, HashSet<Entity>> kvp : sentencesWindow) {
                for (Entity e : kvp.getValue()) {
                    // ignore self-references
                    if (e.equals(entity)) {
                        continue;
                    }

                    // ... and mark the new entity as being adjacent to all the entities in the window
                    e.addOutwardLink(entity);
                    entity.addInwardLink(e);
                }
            }

            // Add the new entity to the window
            sentenceEntities.add(entity);
        }

        if (!_isStreaming) {
            generateAndPushOutputInternal();
        }
    }

    /**
     * Intentionally empty: no cleanup is performed when the component is disposed.
     *
     * @param ccp the component context properties (unused)
     * @throws Exception never thrown by this implementation
     */
    @Override
    public void disposeCallBack(ComponentContextProperties ccp) throws Exception {
        // nothing to dispose
    }

    //--------------------------------------------------------------------------------------------

    /**
     * Always reports {@code true}.
     * NOTE(review): presumably this tells the framework that the component accumulates
     * state across executions while streaming - confirm against the base class contract.
     *
     * @return {@code true}
     */
    @Override
    public boolean isAccumulator() {
        final boolean accumulates = true;
        return accumulates;
    }

    /**
     * Switches the component into streaming mode and discards any entities recorded so far.
     *
     * @throws Exception declared by the overridden method; not thrown by this implementation
     */
    @Override
    public void startStream() throws Exception {
        // From now on executeCallBack only accumulates; output is pushed by endStream
        _isStreaming = true;

        // Start the stream with an empty entity map
        reset();
    }

    /**
     * Generates and pushes the output for everything accumulated during the stream,
     * then switches the component back to non-streaming mode.
     *
     * @throws Exception if generating or pushing the output fails; in that case the
     *         component is left in streaming mode
     */
    @Override
    public void endStream() throws Exception {
        // Flush the accumulated entities first ...
        generateAndPushOutputInternal();

        // ... and only then leave streaming mode
        _isStreaming = false;
    }

    /**
     * Finalizes the recorded entities, delegates output generation to
     * {@link #generateAndPushOutput()} and then clears the recorded state.
     * <p>
     * When {@code _removeUncorrelatedEntities} is set, entities that have neither
     * inward nor outward links are removed before the output is generated.
     *
     * @throws Exception if output generation fails; the recorded state is not cleared then
     */
    private void generateAndPushOutputInternal() throws Exception {
        final String className = getClass().getSimpleName();
        console.entering(className, "generateAndPushOutput");

        console.info(String.format("Number of nodes: %d", _entities.size()));

        if (_removeUncorrelatedEntities) {
            // Collect first, remove afterwards, so the map is not modified while iterating
            final List<Entity> isolated = new ArrayList<Entity>();
            for (Entity candidate : _entities.keySet()) {
                if (!candidate.hasInwardLinks() && !candidate.hasOutwardLinks()) {
                    isolated.add(candidate);
                }
            }

            for (Entity node : isolated) {
                _entities.remove(node);
            }

            console.info(String.format("Number of nodes after removing uncorrelated nodes: %d", _entities.size()));
        }

        generateAndPushOutput();

        console.exiting(className, "generateAndPushOutput");

        reset();
    }

    /**
     * Forgets every entity recorded so far, together with the links stored on them.
     */
    private void reset() {
        // The entity map is the only accumulated state
        _entities.clear();
    }

    //--------------------------------------------------------------------------------------------

    /**
     * Produces the component output in the concrete format of the subclass.
     * <p>
     * Called once per non-streaming execution, or once at the end of a stream, after
     * uncorrelated entities have been removed (when that option is enabled). The
     * recorded entities are cleared right after this method returns.
     * NOTE(review): implementations presumably read the entities and their links from
     * {@code _entities} - confirm against the concrete subclasses.
     *
     * @throws Exception if the output cannot be generated or pushed
     */
    protected abstract void generateAndPushOutput() throws Exception;

    //--------------------------------------------------------------------------------------------

    class Entity {
        private final String _type;
        private final String _value;
        private final Map<Entity, Integer> _in;
        private final Map<Entity, Integer> _out;
        private Integer _id;

        public Entity(String type, String value) {
            _type = type;
            _value = value;
            _in = new HashMap<Entity, Integer>();
            _out = new HashMap<Entity, Integer>();
            _id = null;
        }

        public String getType() {
            return _type;
        }

        public String getValue() {
            return _value;
        }

        public void setId(int id) {
            _id = id;
        }

        public Integer getId() {
            return _id;
        }

        public void addInwardLink(Entity fromEntity) {
            Integer count = _in.get(fromEntity);
            if (count == null)
                count = 0;
            _in.put(fromEntity, count + 1);
        }

        public void addOutwardLink(Entity toEntity) {
            Integer count = _out.get(toEntity);
            if (count == null)
                count = 0;
            _out.put(toEntity, count + 1);
        }

        public boolean hasInwardLinks() {
            return _in.size() > 0;
        }

        public boolean hasOutwardLinks() {
            return _out.size() > 0;
        }

        public Map<Entity, Integer> getInwardLinks() {
            return _in;
        }

        public Map<Entity, Integer> getOutwardLinks() {
            return _out;
        }

        @Override
        public int hashCode() {
            return (_type + _value).hashCode();
        }

        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof Entity) || obj == null)
                return false;
            Entity other = (Entity) obj;
            return (_type + _value).equals(other.getType() + other.getValue());
        }

        @Override
        public String toString() {
            return String.format("%s (%s)", _value, _type);
        }
    }
}