de.dfki.km.perspecting.obie.corpus.BBCNatureCorpus.java Source code

Introduction

Here is the source code for de.dfki.km.perspecting.obie.corpus.BBCNatureCorpus.java
Source

/*
Copyright (c) 2011, 
Benjamin Adrian <benjamin.horak@gmail.com>
German Research Center for Artificial Intelligence (DFKI) <info@dfki.de>
    
All rights reserved.
    
This file is part of SCOOBIE.
    
SCOOBIE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
    
SCOOBIE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with SCOOBIE.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.dfki.km.perspecting.obie.corpus;

import java.io.File;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URI;
import java.net.URLDecoder;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.io.FileUtils;
import org.openrdf.model.Statement;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.model.vocabulary.RDFS;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.repository.sail.SailRepositoryConnection;
import org.openrdf.rio.RDFFormat;
import org.openrdf.sail.memory.MemoryStore;

import de.dfki.km.perspecting.obie.vocabulary.Language;
import de.dfki.km.perspecting.obie.vocabulary.MediaType;

/**
 * Labeled corpus over the BBC Nature data set: a text corpus paired with
 * RDF/XML label files from which the ground-truth resource URIs are read.
 */
public class BBCNatureCorpus extends LabeledTextCorpus {

    /** Base URI handed to the RDF parser for resolving relative references. */
    private static final String BASE_URI = "http://www.bbc.co.uk/";

    /** Predicate URI of Dublin Core "title", queried in addition to rdfs:label. */
    private static final String DC_TERMS_TITLE = "http://purl.org/dc/terms/title";

    /**
     * Creates a corpus whose labels are stored in a zip archive.
     *
     * @param labelFolder
     *            the label archive; it is registered with media type
     *            {@link MediaType#ZIP}
     * @param corpus
     *            the text corpus the labels belong to
     * @throws Exception
     *             if the super class fails to initialize
     */
    public BBCNatureCorpus(File labelFolder, TextCorpus corpus) throws Exception {
        super(labelFolder, MediaType.ZIP, corpus);
    }

    /**
     * Creates the corpus from the default archive locations
     * <code>../corpora/bbc_nature/bbc_nature_labels.zip</code> and
     * <code>../corpora/bbc_nature/bbc_nature_text.zip</code>, both resolved
     * relative to the working directory.
     *
     * @throws Exception
     *             if the super class or the text corpus fails to initialize
     */
    public BBCNatureCorpus() throws Exception {
        this(new File("../corpora/bbc_nature/bbc_nature_labels.zip"), new TextCorpus(
                new File("../corpora/bbc_nature/bbc_nature_text.zip"), MediaType.ZIP, MediaType.HTML, Language.EN));
    }

    /**
     * Returns the content of the label file that belongs to a document.
     * <p>
     * For directory based labels the file denoted by <code>uri</code> is read.
     * For zipped labels the entry name is the last path segment of
     * <code>uri</code>, URL-decoded, with every occurrence of
     * <code>"text"</code> replaced by <code>"rdf"</code>.
     * <p>
     * In both cases the content is read completely into memory, so the
     * returned reader does not keep any file or archive handle open.
     *
     * @param uri
     *            the URI of the document whose labels are requested
     * @return a reader over the complete content of the label file
     * @throws Exception
     *             if no label entry exists for <code>uri</code>, if the label
     *             media type is neither directory nor zip, or if reading fails
     */
    public Reader getGroundTruth(final URI uri) throws Exception {
        if (labelFileMediaType == MediaType.DIRECTORY) {
            return new StringReader(FileUtils.readFileToString(new File(uri)));
        } else if (labelFileMediaType == MediaType.ZIP) {
            final ZipFile zipFile = new ZipFile(labelFolder);
            try {
                String[] entryName = uri.toURL().getFile().split("/");
                ZipEntry entry = zipFile
                        .getEntry(URLDecoder.decode(entryName[entryName.length - 1], "utf-8").replace("text", "rdf"));

                if (entry == null) {
                    throw new Exception("did not found labels for: " + uri.toString());
                }
                log.info("found labels for: " + uri.toString());

                // Closing a ZipFile invalidates every stream obtained from it,
                // so the entry has to be consumed before the archive is closed.
                // NOTE(review): decodes with the platform default charset, as
                // before; confirm whether the label files are always UTF-8.
                return new StringReader(readFully(new InputStreamReader(zipFile.getInputStream(entry))));
            } finally {
                // Previously the archive was never closed, neither on success
                // nor when the entry was missing.
                zipFile.close();
            }
        } else {
            throw new Exception("Unsupported media format for labels: " + labelFileMediaType + ". "
                    + "Please use zip or plain directories instead.");
        }
    }

    /**
     * This method is a hook for inserting a label extraction from different
     * label files.
     * <p>
     * The input is parsed as RDF/XML into a temporary in-memory repository.
     * The result lists the subject of every <code>rdfs:label</code> statement
     * followed by the subject of every <code>dcterms:title</code> statement,
     * one per line. Duplicates are not removed.
     *
     * @param in
     *            reader over an RDF/XML document; it is not closed here
     * @return a reader over the newline separated subject URIs
     * @throws Exception
     *             if the repository cannot be set up or the input cannot be
     *             parsed
     */
    protected Reader extractLabels(Reader in) throws Exception {

        final SailRepository sr = new SailRepository(new MemoryStore());
        sr.initialize();
        try {
            final SailRepositoryConnection conn = sr.getConnection();
            try {
                conn.add(in, BASE_URI, RDFFormat.RDFXML);

                final StringBuilder b = new StringBuilder();

                for (Statement stmt : conn.getStatements(null, RDFS.LABEL, null, true).asList()) {
                    b.append(stmt.getSubject().stringValue());
                    b.append("\n");
                }
                for (Statement stmt : conn.getStatements(null, new URIImpl(DC_TERMS_TITLE), null, true)
                        .asList()) {
                    b.append(stmt.getSubject().stringValue());
                    b.append("\n");
                }
                return new StringReader(b.toString());
            } finally {
                // Release the connection even if parsing or querying fails.
                conn.close();
            }
        } finally {
            // The repository only lives for this call; shut it down so the
            // memory store is released.
            sr.shutDown();
        }
    }

    /**
     * Reads all characters from <code>reader</code> and closes it afterwards.
     */
    private static String readFully(Reader reader) throws java.io.IOException {
        try {
            final StringBuilder content = new StringBuilder();
            final char[] buffer = new char[4096];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
            return content.toString();
        } finally {
            reader.close();
        }
    }

}