nlp.corpora.NUSSMSCorpus.java Source code

Java tutorial

Introduction

Here is the source code for nlp.corpora.NUSSMSCorpus.java

Source

package nlp.corpora;

import org.apache.commons.io.FilenameUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/*
Copyright 2017 Cloudminds Technology, Inc
    
Author: vishwas.mruthyunjaya@cloudminds.com
    
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program ; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA  02111-1307 USA
 */

/****************************************************************************************
 * Created by Vishwas Mruthyunjaya on 1/19/17.
 */
public class NUSSMSCorpus {
    /** Directory holding the corpus XML files; set in main() from corpora.properties. */
    private String directoryName;
    /** Directory receiving the generated "_parsed.txt" files; set in main() from corpora.properties. */
    private String writeToDirectory;

    /************************************************************************************
     * Writes the extracted messages to a text file, one message per line,
     * which is the format required for the ChatBot.
     *
     * Failures are reported on the console and do not propagate, so one
     * unwritable file does not stop the remaining files from being processed.
     *
     * @param parsedLines the message texts to write, in order
     * @param fileName    path of the text file to create (overwritten if present)
     */
    private void writeFile(List<String> parsedLines, String fileName) {

        // Encode explicitly as UTF-8: FileWriter would use the platform default
        // charset, which can corrupt non-ASCII message text on some systems.
        try (BufferedWriter bufferedWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8))) {
            for (String parsedLine : parsedLines) {
                bufferedWriter.write(parsedLine);
                bufferedWriter.newLine();
            }
        } catch (IOException e) {
            System.out.println("In Method writeFile(): Error with " + fileName + ": " + e);
            e.printStackTrace();
        }
    }

    /****************************************************************
     * Reads one NUS SMS Corpus XML file and extracts the trimmed content
     * of the first "text" element found under every "message" element.
     *
     * Messages without a "text" element are skipped. Any parser or I/O
     * failure is reported on the console and whatever was collected so
     * far (possibly nothing) is returned.
     *
     * @param fileName path of the XML file to read
     * @return the extracted message texts, in document order; never null
     */
    private List<String> readFile(String fileName) {

        List<String> rawData = new ArrayList<>();
        DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
        try {
            DocumentBuilder builder = builderFactory.newDocumentBuilder();
            // Use parse(File): the parse(String) overload takes a URI, so a plain
            // path containing spaces, '#' or '%' could fail or resolve wrongly.
            Document document = builder.parse(new File(fileName));
            document.getDocumentElement().normalize();
            NodeList posts = document.getElementsByTagName("message");
            for (int i = 0; i < posts.getLength(); i++) {
                Node nNode = posts.item(i);
                if (nNode.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element eElement = (Element) nNode;
                // item(0) is null when the message has no "text" child
                Node textNode = eElement.getElementsByTagName("text").item(0);
                if (textNode != null) {
                    rawData.add(textNode.getTextContent().trim());
                }
            }
        } catch (ParserConfigurationException e) {
            System.out.println(e.toString());
            System.out.println("Warning: In method readFile(), exception with builder for " + fileName);
        } catch (IOException e) {
            System.out.println(e.toString());
            System.out.println("Warning: In method readFile(), could not read the file " + fileName);
        } catch (SAXException e) {
            System.out.println(e.toString());
            System.out.println("Warning: In method readFile(), error occurred while parsing " + fileName);
        }
        return rawData;
    }

    /******************************************************************************************************
     * Lists the files in directoryName and, for every regular file with an
     * "xml" extension (any letter case), writes its extracted messages to
     * writeToDirectory as "<base name>_parsed.txt".
     *
     * The output directory is created when it does not exist yet. Problems
     * with either directory are reported on the console and end the run.
     */
    private void readAllFiles() {

        File directory = new File(directoryName);
        File[] files = directory.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read
            System.out.println("Warning: In method readAllFiles(), cannot list directory " + directoryName);
            return;
        }

        File outputDirectory = new File(writeToDirectory);
        if (!outputDirectory.isDirectory() && !outputDirectory.mkdirs()) {
            System.out.println("Warning: In method readAllFiles(), cannot create directory " + writeToDirectory);
            return;
        }

        for (File file : files) {
            if (!file.isFile() || !"xml".equalsIgnoreCase(FilenameUtils.getExtension(file.getName()))) {
                continue;
            }
            List<String> parsedLines = readFile(file.getAbsolutePath());
            File parsedFile = new File(outputDirectory,
                    FilenameUtils.getBaseName(file.getAbsolutePath()) + "_parsed.txt");
            writeFile(parsedLines, parsedFile.getPath());
        }
    }

    /******************************************************************************************************
     * main() instantiates class and runs functionality.
     *
     * Reads "corpora.properties" from the working directory and expects the
     * keys "nusSMSCorpusDirectoryName" (input) and
     * "nusSMSCorpusParsedDirectoryName" (output). When the file cannot be
     * loaded or a key is missing, the problem is reported and nothing is
     * processed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        Properties prop = new Properties();
        // try-with-resources so the properties stream is always closed
        try (InputStream input = new FileInputStream("corpora.properties")) {
            prop.load(input);
        } catch (IOException e) {
            System.out.println("Problem loading resource file " + e);
            e.printStackTrace();
            return;
        }

        String inputDirectory = prop.getProperty("nusSMSCorpusDirectoryName");
        String outputDirectory = prop.getProperty("nusSMSCorpusParsedDirectoryName");
        if (inputDirectory == null || inputDirectory.trim().isEmpty()
                || outputDirectory == null || outputDirectory.trim().isEmpty()) {
            // Without both paths there is nothing sensible to do; new File(null) would throw
            System.out.println("Problem loading resource file: nusSMSCorpusDirectoryName and "
                    + "nusSMSCorpusParsedDirectoryName must both be set in corpora.properties");
            return;
        }

        NUSSMSCorpus nusSMSCorpus = new NUSSMSCorpus();
        nusSMSCorpus.directoryName = inputDirectory;
        nusSMSCorpus.writeToDirectory = outputDirectory;
        nusSMSCorpus.readAllFiles();
    }
}