/*
 * Copyright (c) 2011, Benjamin Adrian <benjamin.horak@gmail.com>
 * German Research Center for Artificial Intelligence (DFKI) <info@dfki.de>
 * All rights reserved.
 *
 * This file is part of SCOOBIE.
 *
 * SCOOBIE is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * SCOOBIE is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along
 * with SCOOBIE. If not, see <http://www.gnu.org/licenses/>.
 */
package de.dfki.km.perspecting.obie.model;

import java.io.File;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;

import de.dfki.km.perspecting.obie.transducer.model.SuffixArray;
import de.dfki.km.perspecting.obie.vocabulary.Language;
import de.dfki.km.perspecting.obie.vocabulary.MediaType;
import edu.uci.ics.jung.graph.DirectedGraph;
import gnu.trove.TIntHashSet;
import gnu.trove.TIntObjectHashMap;

/**
 * The {@link Document} is the base document representation along the
 * extraction pipeline.
 *
 * @author adrian
 */
public class Document implements Iterable<Token> {

    private URI uri;
    private String plainTextContent;
    private String content;
    private MediaType mimeType = MediaType.TEXT;
    private Language language = Language.UNKNOWN;
    private DataSheet data = new DataSheet();

    private final Logger log = Logger.getLogger(Document.class.getName());

    public Document(String content, URI uri, MediaType mimeType,
            Language language) throws Exception {
        this.content = content;
        this.uri = uri;
        this.mimeType = mimeType;
        this.language = language;
        normalizeContent();
    }

    public Document(File file, URI uri, MediaType mimeType, Language language)
            throws Exception {
        this(FileUtils.readFileToString(file), uri, mimeType, language);
    }

    private void normalizeContent() throws Exception {
        switch (mimeType) {
        case HTML:
        case XHTML:
            plainTextContent = extractPlainTextFromHtml(content);
            break;
        case TEXT:
        default:
            plainTextContent = content;
        }
    }
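    /*
     * Usage sketch (illustrative, not part of the original SCOOBIE sources):
     * constructing a Document from an in-memory HTML string. The URI is a
     * hypothetical example value; MediaType and Language constants are the
     * ones imported above.
     *
     *     Document doc = new Document("<html><body>Hello</body></html>",
     *             new URI("http://example.org/hello"), MediaType.HTML,
     *             Language.UNKNOWN);
     *     String plain = doc.getPlainTextContent(); // markup replaced by spaces
     */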
    /**
     * Gets the pure plain text out of an HTML text. All HTML markup is
     * replaced by spaces: first the head, then any remaining javascript
     * elements (including their content) in the body, and finally all
     * remaining HTML tags. Because every markup character is replaced by
     * exactly one space, character offsets into the plain text are also
     * valid offsets into the original document, so absolute positioning is
     * possible.
     *
     * @param text
     *            content of the HTML document as text
     * @return text where all HTML was replaced by spaces
     */
    private String extractPlainTextFromHtml(String text) {
        Collection<Pattern> patterns = new ArrayList<Pattern>(3);

        // Delete the head, then all remaining javascript elements that might
        // exist in the body, then all remaining HTML tags. The reluctant
        // quantifier .*? makes the match non-greedy, i.e. takes the shortest
        // match; DOTALL lets . also match new lines.
        patterns.add(Pattern.compile("<head.*?/head>", Pattern.CASE_INSENSITIVE
                | Pattern.UNICODE_CASE | Pattern.DOTALL));
        patterns.add(Pattern.compile("<script.*?/script>",
                Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE
                        | Pattern.DOTALL));
        patterns.add(Pattern.compile("<.+?>", Pattern.CASE_INSENSITIVE
                | Pattern.UNICODE_CASE));

        StringBuffer s = new StringBuffer(text);

        // Apply all patterns in order.
        for (Pattern p : patterns) {
            Matcher matcher = p.matcher(s);
            // As long as the matcher finds another occurrence of the pattern,
            // replace it by the same number of spaces but keep new lines (the
            // regex . does not match line terminators, so replaceAll leaves
            // them alone). The replacement preserves the buffer length, which
            // keeps the running matcher's offsets valid.
            while (matcher.find()) {
                s.replace(matcher.start(), matcher.end(),
                        matcher.group().replaceAll(".", " "));
            }
        }
        return s.toString();
    }

    /**
     * @return the uri
     */
    public URI getUri() {
        return uri;
    }

    /**
     * @param uri
     *            the uri to set
     */
    public void setUri(URI uri) {
        this.uri = uri;
    }

    /**
     * @return the plainTextContent
     */
    public final String getPlainTextContent() {
        return plainTextContent;
    }

    public String getContent() {
        return content;
    }

    /**
     * @return the mimeType
     */
    public MediaType getMimeType() {
        return mimeType;
    }

    /**
     * @return the language
     */
    public Language getLanguage() {
        return language;
    }

    /**
     * @param language
     *            the language to set
     */
    public void setLanguage(Language language) {
        this.language = language;
    }

    /**
     * The literals-subject graph contains links from literals recognized in
     * the text to RDF subjects of triples that have the literal value as
     * object.
     */
    private DirectedGraph<Integer, RDFEdge> graph;
    private DirectedGraph<Integer, RDFEdge> predictionGraph;
    private String template;
    private FilterContext filterContext;
    private Set<Set<Integer>> literalSubjectPairs;
    private SuffixArray suffixArray;
    private DoubleMatrix ambiguityScores;
    private DoubleMatrix relevanceScores;

    public DoubleMatrix getAmbiguityScores() {
        return ambiguityScores;
    }

    public void setAmbiguityScores(DoubleMatrix ambiguityScores) {
        this.ambiguityScores = ambiguityScores;
    }

    public FilterContext getFilterContext() {
        return filterContext;
    }

    public Set<Set<Integer>> getLiteralSubjectPairs() {
        return literalSubjectPairs;
    }

    public DataSheet getData() {
        return data;
    }

    public DirectedGraph<Integer, RDFEdge> getGraph() {
        return graph;
    }

    public void setGraph(DirectedGraph<Integer, RDFEdge> graph) {
        this.graph = graph;
    }

    public DirectedGraph<Integer, RDFEdge> getPredictionGraph() {
        return predictionGraph;
    }

    public void setPredictionGraph(DirectedGraph<Integer, RDFEdge> graph) {
        this.predictionGraph = graph;
    }

    /**
     * Removes all {@link SemanticEntity} annotations whose subject index is
     * contained in the given array of unresolved subjects.
     */
    public void removeUnresolvedSubjects(int[] subjects) {
        TIntHashSet _subjects = new TIntHashSet(subjects);
        for (int tokenIndex : data.getIntegerKeys(TokenSequence.SUBJECT)) {
            List<SemanticEntity> values = data.get(TokenSequence.SUBJECT,
                    tokenIndex);
            Set<Integer> indexes = new HashSet<Integer>();
            int i = 0;
            for (SemanticEntity value : values) {
                int subject = value.getSubjectIndex();
                if (_subjects.contains(subject)) {
                    indexes.add(i);
                }
                i++;
            }
            // Null out the marked entries first, then compact the list, so
            // the collected indexes stay valid during removal.
            for (int j : indexes) {
                log.fine("removed entity: " + values.get(j));
                values.set(j, null);
            }
            while (values.remove(null))
                ;
        }
    }
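    /*
     * Illustrative sketch, not part of the original SCOOBIE sources: shows
     * why extractPlainTextFromHtml replaces markup by spaces instead of
     * deleting it. Offsets computed on the plain text remain valid offsets
     * into the raw HTML content. Run with assertions enabled (-ea).
     */
    private static void demoOffsetPreservation() throws Exception {
        String html = "<html><body><p>Hello</p></body></html>";
        Document doc = new Document(html, new URI("http://example.org/demo"),
                MediaType.HTML, Language.UNKNOWN);
        // "Hello" starts at the same character offset in both views, because
        // every markup character was replaced by exactly one space.
        assert doc.getPlainTextContent().indexOf("Hello") == html
                .indexOf("Hello");
        // The overall length is preserved as well.
        assert doc.getPlainTextContent().length() == html.length();
    }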
    /**
     * Returns all RDF subjects with matching literal property values in the
     * text.
     */
    public List<TokenSequence<SemanticEntity>> getResolvedSubjects() {
        // collection that will be returned as result
        List<TokenSequence<SemanticEntity>> entities = new ArrayList<TokenSequence<SemanticEntity>>();

        HashMap<Integer, TokenSequence<SemanticEntity>> map = new HashMap<Integer, TokenSequence<SemanticEntity>>();

        for (int tokenIndex : data.getIntegerKeys(TokenSequence.SUBJECT)) {
            List<SemanticEntity> values = data.get(TokenSequence.SUBJECT,
                    tokenIndex);
            assert values != null;
            for (SemanticEntity value : values) {
                int subject = value.getSubjectIndex();
                if (value.getPosition().equals("B")) {
                    // A "B" (begin) tag starts a new token sequence for this
                    // subject; an already open sequence for the same subject
                    // is finished first.
                    TokenSequence<SemanticEntity> entity = map.get(subject);
                    if (entity != null) {
                        entities.add(map.remove(subject));
                    }
                    entity = new TokenSequence<SemanticEntity>(value);
                    entity.addToken(new Token(tokenIndex, this));
                    map.put(subject, entity);
                } else {
                    // An "I" (inside) tag continues the open sequence.
                    map.get(subject).addToken(new Token(tokenIndex, this));
                }
            }
        }
        entities.addAll(map.values());
        return entities;
    }
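    /*
     * Usage sketch (illustrative only): listing the resolved subjects after
     * the extraction pipeline has filled the data sheet. getValue(),
     * getSubjectIndex() and getTokens() are the accessors already used
     * elsewhere in this class.
     *
     *     for (TokenSequence<SemanticEntity> e : doc.getResolvedSubjects()) {
     *         System.out.println("subject " + e.getValue().getSubjectIndex()
     *                 + " covers " + e.getTokens().size() + " token(s)");
     *     }
     */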
    /**
     * Returns for a given set of types the best correlating cluster label,
     * or null if the types do not map to exactly one label.
     *
     * @param clusters
     *            mapping between RDF type and assigned cluster label.
     * @param types
     *            the types of the RDF subject(s)
     */
    private Integer getClusterLabel(Map<Integer, Integer> clusters,
            Set<Integer> types) {
        Set<Integer> labels = new HashSet<Integer>();
        for (Integer t : types) {
            labels.add(clusters.get(t));
        }
        if (labels.size() == 1) {
            return labels.iterator().next();
        }
        return null;
    }

    /**
     * Returns the entities whose RDF types map unambiguously to a single
     * cluster label.
     *
     * @param clusters
     *            mapping between RDF type and assigned cluster label.
     * @param rootLabels
     *            labels that are ignored for disambiguation.
     * @return entities carrying their unambiguous cluster label.
     */
    public List<TokenSequence<SemanticEntity>> getUnambiguoslyTypedEntities(
            Map<Integer, Integer> clusters, Set<Integer> rootLabels) {

        TIntObjectHashMap<List<TokenSequence<SemanticEntity>>> m = new TIntObjectHashMap<List<TokenSequence<SemanticEntity>>>();
        List<TokenSequence<SemanticEntity>> out = new ArrayList<TokenSequence<SemanticEntity>>();

        // Aggregate equally positioned token sequences over their different
        // semantic entities.
        for (TokenSequence<SemanticEntity> e : getResolvedSubjects()) {
            List<TokenSequence<SemanticEntity>> se = m.get(e.getStart());
            if (se == null) {
                se = new ArrayList<TokenSequence<SemanticEntity>>();
                m.put(e.getStart(), se);
            }
            se.add(e);
        }

        // Filter unambiguously typed entities.
        for (int key : m.keys()) {
            HashSet<Integer> types = new HashSet<Integer>();
            TokenSequence<SemanticEntity> phrase = null;
            for (TokenSequence<SemanticEntity> se : m.get(key)) {
                phrase = se;
                for (TIntDoubleTuple t : se.getValue().getTypeIndex()) {
                    types.add(t.key);
                }
            }
            types.removeAll(rootLabels);
            Integer label = getClusterLabel(clusters, types);
            if (label != null) {
                SemanticEntity semEnt = new SemanticEntity();
                semEnt.addTypeIndex(label, 1.0);
                TokenSequence<SemanticEntity> ts = new TokenSequence<SemanticEntity>(
                        semEnt);
                // copy tokens of phrase
                for (Token t : phrase.getTokens()) {
                    ts.addToken(t);
                }
                out.add(ts);
            }
        }
        return out;
    }

    /**
     * Returns all token sequences recognized as RDF property values.
     */
    public List<TokenSequence<SemanticEntity>> getRetrievedPropertyValues() {
        List<TokenSequence<SemanticEntity>> entities = new ArrayList<TokenSequence<SemanticEntity>>();
        HashMap<String, TokenSequence<SemanticEntity>> map = new HashMap<String, TokenSequence<SemanticEntity>>();

        for (int tokenIndex : this.data.getIntegerKeys(TokenSequence.PROPERTY)) {
            List<SemanticEntity> values = this.data.get(TokenSequence.PROPERTY,
                    tokenIndex);
            if (values != null) {
                for (SemanticEntity value : values) {
                    // Key a sequence by property and literal value, so that
                    // different values of the same property stay separate.
                    String key = Integer.toString(value.getPropertyIndex())
                            + Integer.toString(value.getLiteralValueIndex());
                    if (value.getPosition().equals("B")) {
                        TokenSequence<SemanticEntity> entity = map.get(key);
                        if (entity != null) {
                            entities.add(map.remove(key));
                        }
                        entity = new TokenSequence<SemanticEntity>(value);
                        entity.addToken(new Token(tokenIndex, this));
                        map.put(key, entity);
                    } else {
                        map.get(key).addToken(new Token(tokenIndex, this));
                    }
                }
            } else {
                entities.addAll(map.values());
                map.clear();
            }
        }
        entities.addAll(map.values());
        return entities;
    }
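    /*
     * Usage sketch (illustrative only, with hypothetical type and label
     * indexes): if types 7 and 9 were both assigned to cluster label 1, an
     * entity typed {7, 9} is unambiguous and is returned carrying label 1;
     * an entity whose types span two different clusters is dropped.
     *
     *     Map<Integer, Integer> clusters = new HashMap<Integer, Integer>();
     *     clusters.put(7, 1);
     *     clusters.put(9, 1);
     *     Set<Integer> rootLabels = new HashSet<Integer>();
     *     List<TokenSequence<SemanticEntity>> typed =
     *             doc.getUnambiguoslyTypedEntities(clusters, rootLabels);
     */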
    /**
     * Returns all token sequences recognized as entity types.
     */
    public List<TokenSequence<SemanticEntity>> getEntityTypes() {
        List<TokenSequence<SemanticEntity>> entities = new ArrayList<TokenSequence<SemanticEntity>>();
        HashMap<Integer, TokenSequence<SemanticEntity>> map = new HashMap<Integer, TokenSequence<SemanticEntity>>();

        for (int tokenIndex : this.data.getIntegerKeys(TokenSequence.TYPE)) {
            List<SemanticEntity> values = this.data.get(TokenSequence.TYPE,
                    tokenIndex);
            if (values != null) {
                for (SemanticEntity value : values) {
                    int property = value.getPropertyIndex();
                    if (value.getPosition().equals("B")) {
                        TokenSequence<SemanticEntity> entity = map
                                .get(property);
                        if (entity != null) {
                            entities.add(map.remove(property));
                        }
                        entity = new TokenSequence<SemanticEntity>(value);
                        entity.addToken(new Token(tokenIndex, this));
                        map.put(property, entity);
                    } else {
                        map.get(property).addToken(new Token(tokenIndex, this));
                    }
                }
            } else {
                entities.addAll(map.values());
                map.clear();
            }
        }
        entities.addAll(map.values());
        return entities;
    }

    public List<TokenSequence<String>> getNounPhrases() {
        List<TokenSequence<String>> phrases = new ArrayList<TokenSequence<String>>();
        TokenSequence<String> phrase = null;

        for (int tokenIndex : this.data
                .getIntegerKeys(TokenSequence.NOUN_PHRASE)) {
            String tag = this.data.get(TokenSequence.NOUN_PHRASE, tokenIndex);
            // TODO: Fix CRF: transitions O -> I-NP are never allowed!
            if (tag.equals("I-NP")) {
                // If there has been no B-NP before, we change this I-NP to a
                // B-NP and proceed with it as if it were a B-NP.
                if (phrase == null)
                    tag = "B-NP";
                else
                    phrase.addToken(new Token(tokenIndex, this));
            }
            // Caution: don't use an else here, since we need to check for
            // B-NP even though we already found an I-NP (see comment above).
            if (tag.equals("B-NP")) {
                if (phrase != null) {
                    phrases.add(phrase);
                }
                phrase = new TokenSequence<String>("B-NP");
                phrase.addToken(new Token(tokenIndex, this));
            }
        }
        if (phrase != null) {
            phrases.add(phrase);
            phrase = null;
        }
        return phrases;
    }

    public List<TokenSequence<Integer>> getSentences() {
        TreeMap<Integer, TokenSequence<Integer>> sentences = new TreeMap<Integer, TokenSequence<Integer>>();

        for (Entry<String, Integer> token : this.data
                .integerEntries(TokenSequence.SENTENCE)) {
            int start = Integer.parseInt(token.getKey());
            TokenSequence<Integer> sentence = sentences.get(token.getValue());
            if (sentence == null) {
                sentence = new TokenSequence<Integer>(token.getValue());
                sentences.put(token.getValue(), sentence);
            }
            sentence.addToken(new Token(start, this));
        }
        return new ArrayList<TokenSequence<Integer>>(sentences.values());
    }

    public TokenSequence<Integer> getSentence(int index) {
        TokenSequence<Integer> sentence = new TokenSequence<Integer>(index);
        for (Entry<String, Integer> s : this.data
                .integerEntries(TokenSequence.SENTENCE)) {
            int start = Integer.parseInt(s.getKey());
            if (s.getValue() == index) {
                sentence.addToken(new Token(start, this));
            }
        }
        return sentence;
    }

    /**
     * @return the tokens
     */
    public List<Token> getTokens() {
        TreeSet<Token> tokens = new TreeSet<Token>();
        for (String key : this.data.getKeys(TokenSequence.TOKEN)) {
            tokens.add(new Token(Integer.parseInt(key), this));
        }
        return new ArrayList<Token>(tokens);
    }

    /**
     * @return the tokens between the character offsets start and end, or an
     *         empty list if no token run matches both offsets exactly
     */
    public List<Token> getTokens(int start, int end) {
        ArrayList<Token> list = new ArrayList<Token>();
        ArrayList<Integer> keys = new ArrayList<Integer>(
                this.data.getIntegerKeys(TokenSequence.TOKEN));

        for (int i = 0; i < keys.size() && keys.get(i) <= start; i++) {
            if (keys.get(i) == start) {
                Token t;
                int i1 = i;
                do {
                    t = new Token(keys.get(i1), this);
                    if (t.getEnd() <= end) {
                        list.add(t);
                    }
                    i1++;
                } while (t.getEnd() <= end && i1 < keys.size());
            }
        }
        // Discard partial matches that do not reach the requested end offset.
        if (!list.isEmpty() && list.get(list.size() - 1).getEnd() != end) {
            list.clear();
        }
        return list;
    }

    public String getTemplate() {
        return template;
    }

    /**
     * @param template
     *            the template to set
     */
    public void setTemplate(String template) {
        this.template = template;
    }

    public void setFilterContext(FilterContext filterContext) {
        this.filterContext = filterContext;
    }

    public void setLiteralsSubjectPairs(Set<Set<Integer>> literalSubjectPairs) {
        this.literalSubjectPairs = literalSubjectPairs;
    }

    public void setSuffixArray(SuffixArray suffixArray) {
        this.suffixArray = suffixArray;
    }

    public SuffixArray getSuffixArray() {
        return suffixArray;
    }

    @Override
    public Iterator<Token> iterator() {
        return getTokens().iterator();
    }
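    /*
     * Usage sketch (illustrative only): the chunk tags follow the usual BIO
     * convention, so for the tag sequence B-NP I-NP O B-NP getNounPhrases()
     * yields two noun phrases. Iterating the document itself walks its
     * tokens via iterator() above.
     *
     *     for (TokenSequence<String> np : doc.getNounPhrases()) {
     *         System.out.println("phrase starts at token " + np.getStart());
     *     }
     *     for (Token token : doc) {
     *         // token-level processing
     *     }
     */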
    public void setRelevanceScores(DoubleMatrix matrix) {
        this.relevanceScores = matrix;
    }

    public DoubleMatrix getRelevanceScores() {
        return relevanceScores;
    }
}