Java tutorial: SCOOBIE's TextCorpus class
/*
 * Copyright (c) 2011, Benjamin Adrian <benjamin.horak@gmail.com>
 * German Research Center for Artificial Intelligence (DFKI) <info@dfki.de>
 * All rights reserved.
 *
 * This file is part of SCOOBIE.
 *
 * SCOOBIE is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * SCOOBIE is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along
 * with SCOOBIE. If not, see <http://www.gnu.org/licenses/>.
 */

package de.dfki.km.perspecting.obie.corpus;

import gnu.trove.TDoubleFunction;
import gnu.trove.TIntDoubleHashMap;
import gnu.trove.TIntHashSet;
import gnu.trove.TIntIntHashMap;
import gnu.trove.TIntIntProcedure;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.logging.Logger;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.IndexSearcher;

import de.dfki.km.perspecting.obie.model.Document;
import de.dfki.km.perspecting.obie.model.DocumentProcedure;
import de.dfki.km.perspecting.obie.model.SemanticEntity;
import de.dfki.km.perspecting.obie.model.Token;
import de.dfki.km.perspecting.obie.model.TokenSequence;
import de.dfki.km.perspecting.obie.transducer.RDFLiteralSpotting;
import de.dfki.km.perspecting.obie.vocabulary.Language;
import de.dfki.km.perspecting.obie.vocabulary.MediaType;
import de.dfki.km.perspecting.obie.workflow.Pipeline;

/**
 * A {@link TextCorpus} wraps a collection of text.
 *
 * @author adrian
 */
public class TextCorpus {

    /**
     * The location of this corpus in the file system.
     */
    protected File corpus;

    protected ArrayList<InputStream> files = new ArrayList<InputStream>();

    protected MediaType corpusMediaType;

    protected Language language;

    private static final String SPACE = " ";

    protected final Logger log = Logger.getLogger(TextCorpus.class.getName());

    private MediaType corpusFileMediaType;

    /**
     * Creates a new TextCorpus.
     *
     * @param corpusFile
     *            file or directory containing the text documents
     * @param corpusFileMediaType
     *            media type of the corpus container (e.g. ZIP or DIRECTORY)
     * @param corpusMediaType
     *            media type of the contained documents
     * @param language
     *            language of the contained documents
     * @throws Exception
     */
    public TextCorpus(File corpusFile, MediaType corpusFileMediaType,
            MediaType corpusMediaType, Language language) throws Exception {
        this.corpus = corpusFile;
        this.corpusFileMediaType = corpusFileMediaType;
        this.corpusMediaType = corpusMediaType;
        this.language = language;
        log.info("creating corpus on " + corpusFile.getAbsolutePath());
    }

    /**
     * Applies the given {@link DocumentProcedure} to every document of this
     * corpus and collects the procedure's results.
     *
     * @param p
     *            procedure to apply to each document
     * @return list of the values returned per document
     * @throws Exception
     */
    @SuppressWarnings("unchecked")
    public List<?> forEach(DocumentProcedure<?> p) throws Exception {
        @SuppressWarnings("rawtypes")
        List l = new ArrayList();
        for (Entry<URI, InputStream> in : getEntries().entrySet()) {
            InputStreamReader reader = new InputStreamReader(in.getValue());
            log.info("processing entry: " + in.getKey().toString());
            l.add(p.process(reader, in.getKey()));
            reader.close();
        }
        return l;
    }

    /**
     * Returns a Lucene index on this {@link TextCorpus}.
     *
     * @param dir
     *            the directory the index is stored in
     * @param reindex
     *            if <code>true</code>, an existing index will be re-created
     * @return access to the Lucene index
     * @throws Exception
     */
    public IndexSearcher getLuceneIndex(File dir, boolean reindex)
            throws Exception {

        if (dir.exists()) {
            if (reindex) {
                FileUtils.deleteDirectory(dir);
                log.info("deleted directory: " + dir);
            } else {
                return new IndexSearcher(dir.getAbsolutePath());
            }
        }

        dir.mkdirs();
        log.info("created directory: " + dir);

        final WhitespaceAnalyzer analyser = new WhitespaceAnalyzer();
        final IndexWriter indexWriter = new IndexWriter(dir, analyser, true,
                MaxFieldLength.LIMITED);

        // Index each corpus document as a single "text" field.
        forEach(new DocumentProcedure<String>() {
            @Override
            public String process(Reader doc, URI uri) throws Exception {
                org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
                document.add(new Field("text", doc, TermVector.YES));
                indexWriter.addDocument(document, analyser);
                log.fine("indexed: " + document);
                return uri.toString();
            }
        });
        log.info("indexed: " + indexWriter.numDocs() + " documents");
        indexWriter.commit();
        indexWriter.close();
        return new IndexSearcher(dir.getAbsolutePath());
    }

    /**
     * Calculates an inverse document frequency over this corpus for each RDF
     * property whose literal values the pipeline spots.
     *
     * @param pipe
     *            pipeline used to spot RDF literal values
     * @return map from property index to inverse document frequency
     * @throws Exception
     */
    @SuppressWarnings("unchecked")
    public TIntDoubleHashMap getDocumentFrequency(final Pipeline pipe)
            throws Exception {

        // Count, per document, how often each property's values were spotted.
        final List<TIntIntHashMap> results = (List<TIntIntHashMap>) forEach(new DocumentProcedure<TIntIntHashMap>() {
            @Override
            public TIntIntHashMap process(final Reader file, final URI uri)
                    throws Exception {
                final Document document = pipe.createDocument(file, uri,
                        getMediatype(), "SELECT * WHERE {?s ?p ?o}",
                        getLanguage());
                final TIntIntHashMap stats = new TIntIntHashMap();
                // Run the pipeline only until the RDF literal spotting step
                // has executed; later steps are not needed here.
                for (int step = 0; pipe.hasNext(step) && step < 8; step = pipe
                        .execute(step, document)) {
                    if (step > 0
                            && pipe.getTranducer(step - 1).getClass()
                                    .equals(RDFLiteralSpotting.class)) {
                        for (final TokenSequence<SemanticEntity> se : document
                                .getRetrievedPropertyValues()) {
                            stats.adjustOrPutValue(se.getValue()
                                    .getPropertyIndex(), 1, 1);
                        }
                        break;
                    }
                }
                return stats;
            }
        });

        // Count in how many documents each property occurred at least once.
        final TIntDoubleHashMap propertyIDF = new TIntDoubleHashMap();
        for (TIntIntHashMap indexedDoc : results) {
            indexedDoc.forEachEntry(new TIntIntProcedure() {
                @Override
                public boolean execute(int property, int value) {
                    propertyIDF.adjustOrPutValue(property, 1.0, 1.0);
                    return true;
                }
            });
        }

        // Turn the document counts into inverse document frequencies.
        propertyIDF.transformValues(new TDoubleFunction() {
            @Override
            public double execute(double value) {
                return ((double) results.size()) / (value + 1);
            }
        });

        return propertyIDF;
    }

    /**
     * Annotates each token of this corpus with the RDF type cluster of the
     * entities it matches and writes the result in a column-based format
     * (token, POS tag, phrase tag, label) to the given file. Sentences are
     * separated by empty lines.
     *
     * @param corpus
     *            output file for the labeled corpus
     * @param pipeline
     *            pipeline used to annotate the documents
     * @param template
     *            SPARQL template passed to the pipeline
     * @return a {@link LabeledTextCorpus} on the written file
     * @throws Exception
     */
    public LabeledTextCorpus labelRDFTypes(final File corpus,
            final Pipeline pipeline, final String template) throws Exception {

        final BufferedWriter writer = new BufferedWriter(new FileWriter(corpus));

        this.forEach(new DocumentProcedure<String>() {
            @Override
            public String process(Reader doc, URI uri) throws Exception {
                Document document = pipeline.createDocument(doc, uri,
                        corpusMediaType, template, language);

                // Run the complete pipeline on the document.
                for (int step = 0; pipeline.hasNext(step); step = pipeline
                        .execute(step, document))
                    ;

                // Remember the token positions at which sentences end.
                TIntHashSet sentenceBoundaries = new TIntHashSet();
                for (TokenSequence<Integer> sentence : document.getSentences()) {
                    sentenceBoundaries.add(sentence.getEnd());
                }

                for (Token token : document) {
                    String word = token.toString();
                    String pos = token.getPartOfSpeechTag();
                    String phrase = token.getNounPhraseTag();

                    // Map the token's recognized RDF types to a type cluster.
                    int label = -1;
                    int[] types = token.getTypes(0.0).toArray();
                    if (types.length > 0) {
                        label = pipeline.getKnowledgeBase().getCluster(types);
                    }

                    writer.append(word);
                    writer.append(SPACE);
                    writer.append(pos);
                    writer.append(SPACE);
                    writer.append(phrase);
                    writer.append(SPACE);

                    if (label > 0) {
                        writer.append(Integer.toString(label));
                    } else {
                        writer.append(LabeledTextCorpus.OUTSIDE_ANY_LABEL);
                    }
                    writer.newLine();

                    // Separate sentences by an empty line.
                    if (sentenceBoundaries.contains(token.getEnd())) {
                        writer.newLine();
                    }
                }
                writer.flush();
                return uri.toString();
            }
        });
        writer.close();
        return new LabeledTextCorpus(corpus, MediaType.TEXT, this);
    }

    /**
     * @return a map from document URI to an input stream on that document's
     *         content, for each file in the corpus
     * @throws IOException
     * @throws ZipException
     */
    protected HashMap<URI, InputStream> getEntries() throws Exception {
        HashMap<URI, InputStream> entries = new HashMap<URI, InputStream>();

        if (corpusFileMediaType == MediaType.ZIP) {
            ZipFile zippedCorpusDir = new ZipFile(corpus);
            Enumeration<? extends ZipEntry> zipEntries = zippedCorpusDir
                    .entries();
            while (zipEntries.hasMoreElements()) {
                ZipEntry zipEntry = zipEntries.nextElement();
                if (!zipEntry.isDirectory()) {
                    String uriValue = corpus.toURI().toString() + "/";
                    String entryName = zipEntry.getName();
                    uriValue += URLEncoder.encode(entryName, "utf-8");
                    entries.put(new URI(uriValue),
                            zippedCorpusDir.getInputStream(zipEntry));
                }
            }
        } else if (corpusFileMediaType == MediaType.DIRECTORY) {
            for (File f : corpus.listFiles()) {
                entries.put(f.toURI(), new FileInputStream(f));
            }
        }
        return entries;
    }

    public File getCorpus() {
        return corpus;
    }

    public Language getLanguage() {
        return language;
    }

    public MediaType getMediatype() {
        return corpusMediaType;
    }

    public MediaType getCorpusFileMediaType() {
        return corpusFileMediaType;
    }
}
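
To see how the class is meant to be used, here is a minimal usage sketch. It is not part of the original source: the paths /data/corpus and /data/corpus-index are placeholders, and it assumes the Language enum provides an EN constant and that the corpus directory contains plain-text files. A labelRDFTypes() call would additionally need a fully configured Pipeline, whose setup is not shown in this listing.

import java.io.File;

import org.apache.lucene.search.IndexSearcher;

import de.dfki.km.perspecting.obie.corpus.TextCorpus;
import de.dfki.km.perspecting.obie.vocabulary.Language;
import de.dfki.km.perspecting.obie.vocabulary.MediaType;

public class TextCorpusExample {

    public static void main(String[] args) throws Exception {
        // Wrap a directory of plain-text files as a corpus.
        // Paths and Language.EN are assumptions for this sketch.
        TextCorpus corpus = new TextCorpus(new File("/data/corpus"),
                MediaType.DIRECTORY, MediaType.TEXT, Language.EN);

        // Build (or rebuild, reindex = true) a Lucene index over the corpus.
        IndexSearcher searcher = corpus.getLuceneIndex(
                new File("/data/corpus-index"), true);
        System.out.println("indexed documents: " + searcher.maxDoc());
        searcher.close();
    }
}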