edu.ehu.galan.lite.model.Corpus.java Source code

Java tutorial

Introduction

Here is the source code for edu.ehu.galan.lite.model.Corpus.java

Source

package edu.ehu.galan.lite.model;

/*
 * Copyright (C) 2014 Angel Conde Manjon neuw84 at gmail.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.File;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Iterator;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A Corpus contains a List of Documents in a language
 *
 * @author Angel Conde Manjon
 */
public class Corpus {

    private ArrayDeque<Document> docList;
    private String language;
    private String rootFolder;
    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     *
     * @param pLanguage
     */
    public Corpus(String pLanguage) {
        language = pLanguage;
        docList = new ArrayDeque<>();

    }

    /**
     *
     * @param pDocument
     */
    public void addDocument(Document pDocument) {
        getDocQueue().add(pDocument);
    }

    /**
     *
     * @return
     */
    public Iterator<Document> getIterator() {
        return getDocQueue().iterator();
    }

    /**
     * @return the docList
     */
    public ArrayDeque<Document> getDocQueue() {
        return docList;
    }

    /**
     * @param docList the docList to set
     */
    public void setDocQueue(ArrayDeque<Document> docList) {
        this.docList = docList;
    }

    /**
     * @return the language
     */
    public String getLanguage() {
        return language;
    }

    /**
     * @param language the language to set
     */
    public void setLanguage(String language) {
        this.language = language;
    }

    /**
     * Loads a corpus from a given folder and scans recursively all the documents inside it,
     * then it reorder using the file name (uses FilesUtils from CommonsIO), you need to choose
     * the source of knowledge to be used (Wikipedia or Wordnet)
     * @param pRootFolder
     * @param pType 
     */

    public void loadCorpus(String pRootFolder, Document.SourceType pType) {
        File file = new File(pRootFolder);
        rootFolder = pRootFolder;
        Collection<File> documents = FileUtils.listFiles(file, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE);
        documents.stream().sorted((d1, d2) -> d1.getName().compareTo(d2.getName()))
                .forEachOrdered((f) -> docList.add(new Document(f.getAbsolutePath(), f.getName())));
    }

    /**
     * @return the rootFolder
     */
    public String getRootFolder() {
        return rootFolder;
    }

    /**
     * @param rootFolder the rootFolder to set
     */
    public void setRootFolder(String rootFolder) {
        this.rootFolder = rootFolder;
    }

}