edu.ehu.galan.cvalue.model.Document.java Source code

Java tutorial

Introduction

Here is the source code for edu.ehu.galan.cvalue.model.Document.java

Source

package edu.ehu.galan.cvalue.model;

/*
 *    Document.java
 *    Copyright (C) 2014 Angel Conde, neuw84 at gmail dot com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A document represents the piece of a corpus containing text.
 *
 * @author Angel Conde Manjon
 */
public class Document {

    private transient String path;
    private transient List<String> sentenceList;
    private transient List<LinkedList<Token>> tokenList;
    private String name;
    private transient List<Term> termList;
    private transient static final Logger logger = LoggerFactory.getLogger(Document.class);

    /**
     *
     * @param pPath
     * @param pName
     */
    public Document(String pPath, String pName) {
        path = pPath;
        name = pName;
        termList = new ArrayList<>();
    }

    /**
     * @return the path
     */
    public String getPath() {
        return path;
    }

    /**
     * @param path the path to set
     */
    public void setPath(String path) {
        this.path = path;
    }

    /**
     * @return the sentenceList
     */
    public List<String> getSentenceList() {
        return sentenceList;
    }

    /**
     * @param sentenceList the sentenceList to set
     */
    public void setSentenceList(List<String> sentenceList) {
        this.sentenceList = sentenceList;
    }

    /**
     * @return the tokenList
     */
    public List<LinkedList<Token>> getTokenList() {
        return tokenList;
    }

    /**
     * @param tokenList the tokenList to set
     */
    public void List(List<LinkedList<Token>> tokenList) {
        this.tokenList = tokenList;
    }

    /**
     * @return the name
     */
    public String getName() {
        return name;
    }

    /**
     * @param name the name to set
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     *
     * @return
     */
    public List<Term> getTermList() {
        return termList;
    }

    /**
     * Tries to convert the content of this document to UTF-8 using java
     * CharsetDecoders
     */
    public void convertToUTF8() {
        FileInputStream istream = null;
        Writer out = null;
        try {
            istream = new FileInputStream(path);
            BufferedInputStream in = new BufferedInputStream(istream);
            CharsetDecoder charsetDecoder = Charset.forName("UTF-8").newDecoder();
            charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
            charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
            Reader inputReader = new InputStreamReader(in, charsetDecoder);
            StringWriter writer = new StringWriter();
            IOUtils.copy(inputReader, writer);
            String theString = writer.toString();
            FileUtils.deleteQuietly(new File(path));
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"));
            out.write(theString);
            out.close();
            //            System.out.println("");
        } catch (FileNotFoundException ex) {
            logger.error("Error converting the file to utf8", ex);
        } catch (IOException ex) {
            logger.error("Error converting the file to utf8", ex);
        } finally {
            try {
                if (out != null) {
                    out.close();
                }
                if (istream != null) {
                    istream.close();
                }
            } catch (IOException ex) {
                logger.error("Error converting the file to utf8", ex);
            }
        }

    }

    /**
     * @param termList the termList to set
     */
    public void setTermList(List<Term> termList) {
        this.termList = termList;
    }

}