de.tudarmstadt.ukp.dkpro.keyphrases.bookindexing.evaluation.phrasematch.LineReader.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.keyphrases.bookindexing.evaluation.phrasematch.LineReader.java
Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl-3.0.txt
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.keyphrases.bookindexing.evaluation.phrasematch;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.LineIterator;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;

/**
 * Retrieves the strings associated with a document by reading a line-based
 * text file.
 * <p>
 * The file is looked up as {@code <pathName>/<document base name>.<suffix>},
 * where the document base name is derived from the document URI stored in the
 * {@link DocumentMetaData} of the CAS. Every non-empty line, after trimming
 * surrounding whitespace, yields one string.
 *
 * @author Mateusz Parzonka
 *
 */
public class LineReader implements StringReader {

    /** Encoding used when the caller does not specify one. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    private final String pathName;
    private final String suffix;
    private final boolean lowercase;
    private final String encoding;

    /**
     * Creates a reader that decodes the files as UTF-8.
     *
     * @param pathName
     *            directory containing the files to read
     * @param suffix
     *            file extension, with or without the leading dot
     * @param lowercase
     *            whether every line is converted to lower case
     */
    public LineReader(String pathName, String suffix, boolean lowercase) {
        this(pathName, suffix, lowercase, DEFAULT_ENCODING);
    }

    /**
     * Creates a reader that decodes the files with the given encoding.
     *
     * @param pathName
     *            directory containing the files to read
     * @param suffix
     *            file extension, with or without the leading dot
     * @param lowercase
     *            whether every line is converted to lower case
     * @param encoding
     *            name of the character encoding of the files
     */
    public LineReader(String pathName, String suffix, boolean lowercase, String encoding) {
        this.pathName = pathName;
        this.suffix = suffix;
        this.lowercase = lowercase;
        this.encoding = encoding;
    }

    /**
     * Returns the distinct strings of {@link #getListOfStrings(JCas)} in their
     * natural (sorted) order.
     *
     * @param jcas
     *            the CAS whose document metadata identifies the file to read
     * @return a sorted set of the non-empty lines of the file
     * @throws AnalysisEngineProcessException
     *             if the file cannot be located or opened
     */
    @Override
    public Set<String> getSetOfStrings(JCas jcas) throws AnalysisEngineProcessException {
        return new TreeSet<String>(getListOfStrings(jcas));
    }

    /**
     * Reads the file belonging to the document of the given CAS and returns
     * its non-empty lines in file order. Lines are trimmed and, if configured,
     * converted to lower case.
     *
     * @param jcas
     *            the CAS whose document metadata identifies the file to read
     * @return the non-empty lines of the file; duplicates are kept
     * @throws AnalysisEngineProcessException
     *             if the document URI is malformed, the file path cannot be
     *             resolved, or the file cannot be opened
     */
    @Override
    public List<String> getListOfStrings(JCas jcas) throws AnalysisEngineProcessException {

        File file = new File(getPath(getDocumentBaseName(jcas)));
        LineIterator lineIterator;
        try {
            lineIterator = FileUtils.lineIterator(file, encoding);
        } catch (IOException e) {
            // Pass the IOException as the direct cause, consistent with
            // getDocumentBaseName, instead of hiding it in a bare Throwable.
            throw new AnalysisEngineProcessException(e);
        }

        List<String> goldList = new ArrayList<String>();
        try {
            while (lineIterator.hasNext()) {
                String line = lineIterator.nextLine().trim();
                if (!line.isEmpty()) {
                    if (lowercase) {
                        // NOTE(review): toLowerCase() uses the default locale; kept
                        // as is because the casing applied to the phrases these
                        // lines are compared with is not visible here - confirm.
                        line = line.toLowerCase();
                    }
                    goldList.add(line);
                }
            }
        } finally {
            LineIterator.closeQuietly(lineIterator);
        }
        return goldList;
    }

    /**
     * @param jcas
     *            the CAS carrying the {@link DocumentMetaData}
     * @return the document basename from the parsed document-URI-path.
     * @throws AnalysisEngineProcessException
     *             if the document URI is not a syntactically valid URI
     */
    private String getDocumentBaseName(JCas jcas) throws AnalysisEngineProcessException {
        try {
            URI uri = new URI(DocumentMetaData.get(jcas).getDocumentUri());
            return FilenameUtils.getBaseName(uri.getPath());
        } catch (URISyntaxException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Builds the path of the file for the given base name: the base name is
     * appended to the configured directory and the suffix is added, preceded
     * by a dot unless it already starts with one.
     *
     * @param baseName
     *            file name without directory and extension
     * @return the path using the separators of the current system
     * @throws AnalysisEngineProcessException
     *             if directory and base name cannot be combined into a valid
     *             path
     */
    private String getPath(String baseName) throws AnalysisEngineProcessException {
        String base = FilenameUtils.concat(pathName, baseName);
        if (base == null) {
            // concat signals an invalid combination with null; without this
            // check the string concatenation below would produce "null.<suffix>".
            throw new AnalysisEngineProcessException(new IOException(
                    "Cannot resolve file for base name '" + baseName + "' in path '" + pathName + "'"));
        }
        String extension = suffix.startsWith(".") ? suffix : "." + suffix;
        return FilenameUtils.separatorsToSystem(base + extension);
    }

}