com.joliciel.talismane.machineLearning.TextFileWordList.java Source code

Java tutorial

Introduction

Here is the source code for com.joliciel.talismane.machineLearning.TextFileWordList.java

Source

///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2014 Joliciel Informatique
//
//This file is part of Talismane.
//
//Talismane is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Talismane is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Talismane.  If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.talismane.machineLearning;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.joliciel.talismane.utils.JolicielException;
import com.joliciel.talismane.utils.LogUtils;

/**
 * An external word list read from a text file, decoded as UTF-8.<br/>
 * The first line must be exactly "Type: WordList", otherwise an exception gets thrown
 * (this includes the case of an empty file).<br/>
 * The default name will be the filename.<br/>
 * If a line starts with the string "Name: ", the default name will be replaced by the
 * remainder of that line; if several such lines exist, the last one wins.<br/>
 * Empty lines and all lines starting with # are skipped.<br/>
 * All other lines contain words, and are stored verbatim (no trimming).
 * @author Assaf Urieli
 *
 */
public class TextFileWordList implements ExternalWordList {
    private static final long serialVersionUID = 1L;
    private static final Log LOG = LogFactory.getLog(TextFileWordList.class);

    /** Mandatory content of the first line of a word list file. */
    private static final String TYPE_HEADER = "Type: WordList";
    /** Prefix of a line which overrides the default (file-based) name. */
    private static final String NAME_PREFIX = "Name: ";
    /** Prefix of a comment line. */
    private static final String COMMENT_PREFIX = "#";

    List<String> wordList = new ArrayList<String>();

    private String name;

    /**
     * Reads the word list contained in the file provided.
     *
     * @param file the text file to read, expected to be encoded in UTF-8
     * @throws JolicielException if the file is empty or its first line is not "Type: WordList"
     * @throws RuntimeException wrapping the underlying {@link IOException} if the file
     *         cannot be opened or a read error occurs while scanning it
     */
    public TextFileWordList(File file) {
        Scanner scanner = null;
        try {
            this.name = file.getName();

            scanner = new Scanner(
                    new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")));

            // An empty file has no first line at all: treat it as a missing header
            // rather than letting Scanner throw a bare NoSuchElementException.
            String firstLine = scanner.hasNextLine() ? scanner.nextLine() : null;
            rethrowReadError(scanner);
            if (!TYPE_HEADER.equals(firstLine)) {
                throw new JolicielException("A word list file must start with \"Type: WordList\"");
            }

            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                if (line.length() == 0 || line.startsWith(COMMENT_PREFIX)) {
                    continue;
                }
                if (line.startsWith(NAME_PREFIX)) {
                    this.name = line.substring(NAME_PREFIX.length());
                    continue;
                }
                wordList.add(line);
            }

            // Scanner reports a read failure as "no more input"; make sure a
            // truncated read is not silently accepted as a complete word list.
            rethrowReadError(scanner);
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            throw new RuntimeException(e);
        } finally {
            // Closing the scanner also closes the wrapped reader and file stream.
            if (scanner != null) {
                scanner.close();
            }
        }
    }

    /**
     * Surfaces any I/O error which the scanner swallowed while reading.
     *
     * @param scanner the scanner to check
     * @throws IOException the last exception thrown by the scanner's underlying reader, if any
     */
    private static void rethrowReadError(Scanner scanner) throws IOException {
        IOException readError = scanner.ioException();
        if (readError != null) {
            throw readError;
        }
    }

    /**
     * The name of this word list: the filename by default, or the value of the
     * last "Name: " line found in the file, or whatever was passed to
     * {@link #setName(String)}.
     */
    public String getName() {
        return name;
    }

    /**
     * Replaces the name of this word list.
     *
     * @param name the new name
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * The words read from the file, in file order. This is the internal list
     * itself, not a copy.
     */
    @Override
    public List<String> getWordList() {
        return wordList;
    }

}