nlp.main.NER_Main.java Source code

Java tutorial

Introduction

Here is the source code for nlp.main.NER_Main.java

Source

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                             * 
 * This Named Entity Recognition system identifies names of persons,           *
 * locations and organizations, as well as time expressions and numeric        *
 * expressions, in Tamil text.                                                 *
 *                                                                             *
 *  Copyright (C) 2015-2016  University of Moratuwa                            *
 *                                                                             *
 * This program is free software: you can redistribute it and/or modify        *
 * it under the terms of the GNU General Public License as published by        *
 * the Free Software Foundation, either version 3 of the License, or           *
 * (at your option) any later version.                                         *
 *                                                                             *
 * This program is distributed in the hope that it will be useful,             *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of              *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               *
 * GNU General Public License for more details.                                *
 *                                                                             *
 * You should have received a copy of the GNU General Public License           *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.       *
 *                                                                             * 
 *                                                                             * 
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

package nlp.main;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import nlp.gazetteer.RuleBaseTagger;
import nlp.morph.noun.MainNounDetetectionLayer;
import nlp.orthographic.OrthographicFeatureCreation;
import nlp.prefix.PrefixFeatureCreation;

/**
 * Main execution class of the NER system. It integrates all the related
 * features to build the model input and runs the external classifier scripts.
 *
 * <p>Every entry point works through fixed files under {@code resources/} and
 * shell scripts under {@code scripts/}, both resolved against the current
 * working directory. All methods are static and share those files, so
 * concurrent invocations would overwrite each other's intermediate data.
 *
 * @author University of Moratuwa, Computer Science and Engineering, Group-12 CSE'11
 */
public class NER_Main {

    /** Word / POS tag / morphological tag, one token per line, blank line between sentences. */
    private static final String POS_FILE = "resources/POS.txt";
    /** One token per line, blank line between sentences. */
    private static final String WORDS_FILE = "resources/words.txt";
    /** Copy of the feature script output (the assembled test data). */
    private static final String TEST_DATA_FILE = "resources/test_data.txt";
    private static final String TAGGER_MODEL = "Models/tamil.tagger";
    /** Second argument handed to {@code PrefixFeatureCreation.prefixGeneration}. */
    private static final int PREFIX_LENGTH = 4;

    private static final Logger LOGGER = Logger.getLogger(NER_Main.class.getName());

    /** Accumulated progress; written by {@link #setProgress(double)}. */
    public static volatile Double progress = 0.0;
    private static final PropertyChangeSupport changes = new PropertyChangeSupport(NER_Main.class);
    /** Guards the read-modify-write of {@link #progress}; volatile alone does not make it atomic. */
    private static final Object PROGRESS_LOCK = new Object();

    /**
     * Registers a listener that is notified on every call to {@link #setProgress(double)}.
     *
     * @param l the listener to add
     */
    public static void addPropertyChangeListener(PropertyChangeListener l) {
        changes.addPropertyChangeListener(l);
    }

    /**
     * Adds {@code pro} to the accumulated {@link #progress} (despite the name,
     * this is an increment, not an assignment), prints the new total and
     * notifies the registered listeners.
     *
     * @param pro the amount to add to the current progress
     */
    public static void setProgress(double pro) {
        final double current;
        synchronized (PROGRESS_LOCK) {
            current = progress + pro;
            progress = current;
        }
        System.out.println(current);
        // The event is fired outside the lock so listeners never run while it is held.
        // The misspelled property name and the (value - 1, value + 1) payload are kept
        // as-is because existing listeners may depend on them; old and new always
        // differ, so PropertyChangeSupport never suppresses the event.
        changes.fireIndexedPropertyChange("progess", 0, current - 1, current + 1);
    }

    /**
     * Builds the test data for the basic feature set: writes the POS-tagged
     * corpus, calls {@code RuleBaseTagger.getTestData} on the word file and
     * collects the output of {@code scripts/./test.sh}.
     *
     * @param data raw text to be tagged
     * @return the lines ({@code String}) printed by the script, or {@code null}
     *         if an I/O error occurred or the thread was interrupted
     */
    public static ArrayList createTestData(String data) {
        try {
            writeTaggedCorpus(data);
            RuleBaseTagger.getTestData(WORDS_FILE);
            return runFeatureScript("scripts/./test.sh");
        } catch (IOException | InterruptedException ex) {
            reportFailure(ex);
        }
        return null;
    }

    /**
     * Same as {@link #createTestData(String)} but additionally generates prefix
     * features and uses {@code scripts/./test_with_prefix.sh}.
     *
     * @param data raw text to be tagged
     * @param i    currently ignored: the original code passed the same prefix
     *             length for every value. NOTE(review): the two branches were
     *             identical, so a different length may have been intended.
     * @return the lines ({@code String}) printed by the script, or {@code null}
     *         if an I/O error occurred or the thread was interrupted
     */
    public static ArrayList createTestDataWithPrefix(String data, int i) {
        try {
            writeTaggedCorpus(data);
            RuleBaseTagger.getTestData(WORDS_FILE);
            PrefixFeatureCreation.prefixGeneration(WORDS_FILE, PREFIX_LENGTH);
            return runFeatureScript("scripts/./test_with_prefix.sh");
        } catch (IOException | InterruptedException ex) {
            reportFailure(ex);
        }
        return null;
    }

    /**
     * Builds the test data from the morphological analysis alone (no Stanford
     * tagger), then generates prefix and orthographic features.
     *
     * @param data  raw text to be tagged
     * @param index 0 or 2 selects {@code test_with_prefix_OurPOS_ortho.sh};
     *              any other value selects {@code test_with_prefix_OurPOS_ortho2.sh}
     * @return the lines ({@code String}) printed by the script, or {@code null}
     *         if an I/O error occurred or the thread was interrupted
     */
    public static ArrayList createTestDataWithPrefixandOrthograph(String data, int index) {
        try {
            writeMorphCorpus(data);
            RuleBaseTagger.getTestData(WORDS_FILE);
            PrefixFeatureCreation.prefixGeneration(WORDS_FILE, PREFIX_LENGTH);
            OrthographicFeatureCreation.createFeature(WORDS_FILE);
            String script = (index == 0 || index == 2)
                    ? "scripts/./test_with_prefix_OurPOS_ortho.sh"
                    : "scripts/./test_with_prefix_OurPOS_ortho2.sh";
            return runFeatureScript(script);
        } catch (IOException | InterruptedException ex) {
            reportFailure(ex);
        }
        return null;
    }

    /**
     * To get the output by giving the preprocessed test data
     *
     * @param data Data to be tagged
     * @param index Selection of classifier: 0-1 use the basic features, 2-3 add
     *              prefix features, 4 and above use the morphology-based POS
     *              with prefix and orthographic features
     * @return the tagged data, one whitespace-split row per output line, or
     *         {@code null} if preprocessing or the classifier run failed
     */
    public static ArrayList<String[]> getOutput(String data, int index) {
        try {
            setProgress(0);
            final ArrayList<?> prepared;
            if (index < 2) {
                prepared = createTestData(data);
            } else if (index < 4) {
                prepared = createTestDataWithPrefix(data, index - 2);
            } else {
                prepared = createTestDataWithPrefixandOrthograph(data, index - 4);
            }
            if (prepared == null) {
                // Preprocessing failed (already logged); classifying stale files would be misleading.
                return null;
            }

            Process proc = startScript(classifierScript(index));
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream()))) {
                ArrayList<String[]> output = new ArrayList<>();
                String line;
                while ((line = reader.readLine()) != null) {
                    output.add(line.split("\\s+"));
                }
                proc.waitFor();
                return output;
            } finally {
                // No-op after a normal exit; stops the child if reading or waiting failed.
                proc.destroy();
            }
        } catch (IOException | InterruptedException ex) {
            reportFailure(ex);
        }
        return null;

    }

    /**
     * Demo entry point: preprocesses a hard-coded sample and prints the output
     * of the MIRA classifier script.
     *
     * @param args unused
     * @throws IOException          if the classifier script cannot be started or read
     * @throws InterruptedException if interrupted while waiting for the script
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // NOTE(review): the sample below consists mostly of '?' characters; the original
        // Tamil text appears to have been lost to an encoding problem - restore from source control.
        createTestData(
                "? ??? ?? ? ? ??? ? ? 2002? ?? ??? ?? ??? ? ??? ?? ?? ?? ?[?? ] ?? ??? ?? ?? ????. ???? ??? ??? 59 ?? ? ? ??? 790 ??? 254 ????? ?? ??? ?? ????? ?? ??. ???? ?? ?? ? ?? ?? ? ?? ?? ??????. ?? ?? ?? ?? ???? ??? ? ? ???? 140 ?? ? ???? ??  ?? ??, \"?\" ?? ??  ? ??? ??.");

        Process proc = startScript("scripts/./mira.sh");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.print(line + "\n");
            }
            proc.waitFor();
        } finally {
            proc.destroy();
        }
    }

    /**
     * Morphologically analyses and POS-tags {@code data}, writing
     * word/POS/morph rows to {@link #POS_FILE} and bare words to
     * {@link #WORDS_FILE}. Progress advances by 25 in total, split evenly
     * over the sentences.
     */
    private static void writeTaggedCorpus(String data) throws IOException, InterruptedException {
        // The output files are opened (and truncated) before the analysis runs, as before.
        // NOTE(review): PrintWriter(File) uses the platform default charset; for Tamil
        // text an explicit UTF-8 is probably wanted, but the readers of these files
        // are not visible here - confirm before changing.
        try (PrintWriter posWriter = new PrintWriter(new File(POS_FILE));
                PrintWriter wordWriter = new PrintWriter(new File(WORDS_FILE))) {
            MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL);
            List<List<String[]>> morphed = MainNounDetetectionLayer.getMorph(data);
            double step = (1.0 / morphed.size()) * 25;

            for (List<String[]> analysed : morphed) {
                List<HasWord> sentence = new ArrayList<>();
                for (String[] token : analysed) {
                    sentence.add(new Word(token[0]));
                }

                int position = 0;
                for (TaggedWord tagged : tagger.tagSentence(sentence)) {
                    // word()/tag() instead of splitting toString() on '/', which
                    // corrupted the columns for tokens that themselves contain a slash.
                    String word = tagged.word();
                    posWriter.write(word + "\t" + tagged.tag() + "\t" + analysed.get(position)[1] + "\n");
                    wordWriter.write(word + "\n");
                    position++;
                }
                posWriter.write("\n");
                wordWriter.write("\n");
                setProgress(step);
            }
        }
    }

    /**
     * Writes the morphological analysis of {@code data} directly: element 0 of
     * each token array is the word, element 1 is written as its tag. Progress
     * advances by 25 in total, split evenly over the sentences.
     */
    private static void writeMorphCorpus(String data) throws IOException, InterruptedException {
        try (PrintWriter posWriter = new PrintWriter(new File(POS_FILE));
                PrintWriter wordWriter = new PrintWriter(new File(WORDS_FILE))) {
            List<List<String[]>> morphed = MainNounDetetectionLayer.getMorph(data);
            double step = (1.0 / morphed.size()) * 25;

            for (List<String[]> analysed : morphed) {
                for (String[] token : analysed) {
                    String row = token[0] + "\t" + token[1] + "\n";
                    posWriter.write(row);
                    System.out.println(row);
                    wordWriter.write(token[0] + "\n");
                }
                setProgress(step);
                posWriter.write("\n");
                wordWriter.write("\n");
            }
        }
    }

    /**
     * Runs a feature-assembly script, mirrors its standard output into
     * {@link #TEST_DATA_FILE} and returns the same lines.
     */
    private static ArrayList<String> runFeatureScript(String script) throws IOException, InterruptedException {
        Process proc = startScript(script);
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream()));
                PrintWriter testDataSet = new PrintWriter(new File(TEST_DATA_FILE))) {
            ArrayList<String> lines = new ArrayList<>();
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
                testDataSet.write(line + "\n");
            }
            proc.waitFor();
            return lines;
        } finally {
            // No-op after a normal exit; stops the child if reading or waiting failed.
            proc.destroy();
        }
    }

    /**
     * Starts {@code script} with its standard error forwarded to this JVM's
     * standard error, so the child cannot stall on an undrained stderr pipe.
     */
    private static Process startScript(String script) throws IOException {
        return new ProcessBuilder(script)
                .redirectError(ProcessBuilder.Redirect.INHERIT)
                .start();
    }

    /** Maps the classifier selection used by {@link #getOutput(String, int)} to its script. */
    private static String classifierScript(int index) {
        switch (index) {
            case 0:
                return "scripts/./mira.sh";
            case 1:
                return "scripts/./crf.sh";
            case 2:
                return "scripts/./mira_with_prefix.sh";
            case 3:
                return "scripts/./crf_with_prefix.sh";
            case 4:
                return "scripts/./mira_with_prefix_OurPOS.sh";
            case 6:
                return "scripts/./crf_with_prefix_OurPOS.sh";
            case 5:
            default:
                // NOTE(review): every index other than 0-4 and 6 (notably 7) falls back to
                // the MIRA "OurPOS2" script. Given the mira/crf pairing of the other indices,
                // index 7 looks like it was meant to run a CRF "OurPOS2" script - confirm
                // that such a script exists before changing this.
                return "scripts/./mira_with_prefix_OurPOS2.sh";
        }
    }

    /** Logs a failure and, for an interruption, restores the thread's interrupt flag. */
    private static void reportFailure(Exception ex) {
        if (ex instanceof InterruptedException) {
            Thread.currentThread().interrupt();
        }
        LOGGER.log(Level.SEVERE, null, ex);
    }

}