edu.usc.polar.NLTKRest.java Source code

Java tutorial

Introduction

Here is the source code for the class edu.usc.polar.NLTKRest (file NLTKRest.java).

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package edu.usc.polar;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.Set;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ner.nltk.NLTKNERecogniser;
import org.apache.tika.sax.BodyContentHandler;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

/**
 * Batch driver that walks a directory tree, parses every file with Apache Tika, runs
 * {@link NLTKNERecogniser} over the extracted text and collects the results as JSON.
 *
 * <p>Results are written in batches: each output file {@code NER_<n>.json} holds the entries
 * for at most {@value #FILES_PER_JSON} processed documents, wrapped as
 * {@code {"NER_NLTKRest":[...]}}.
 *
 * <p>All state lives in unsynchronised static fields, so the class is meant for
 * single-threaded use.
 *
 * @author Sneh
 */
public class NLTKRest {

    /** Root of the corpus; stripped from file names in error messages. */
    private static final String CORPUS_ROOT = "C:\\Users\\Snehal\\Documents\\TREC-Data";
    /** Data directory; replaced by {@code polar.usc.edu} to build each document's "DOI". */
    private static final String DATA_ROOT = CORPUS_ROOT + "\\Data";
    /** Path prefix of the output files; the batch number and {@code .json} are appended. */
    private static final String OUTPUT_PREFIX =
            "C:\\Users\\Snehal\\Documents\\tikaSimilarityTestSet\\NLTKRest\\NER_";
    /** Number of documents after which a new output file is started. */
    private static final int FILES_PER_JSON = 1000;

    /** Documents handed to {@link #ApacheNLTKRest} since the current output file was opened. */
    public static int counter = 0;
    /** Sequence number of the current output file (used in its name). */
    public static long jsonCount = 0;
    /** Writer for the current output file; {@code null} while no output file is open. */
    public static FileWriter jsonFile;
    /** Current output file; {@code null} until the first file is encountered. */
    public static File file;
    /** Entries accumulated for the current output file. */
    public static JSONArray jsonArray = new JSONArray();
    /** Recogniser shared by all documents. */
    public static NLTKNERecogniser ner = new NLTKNERecogniser();

    /**
     * Processes the hard-coded input directory and writes the last (partial) batch.
     *
     * @param args command-line arguments; forwarded to {@link #dir} but otherwise unused
     */
    public static void main(String args[]) {
        try {
            String doc = DATA_ROOT + "\\1\\";
            System.out.println(ner.getEntityTypes() + " \t " + ner.isAvailable());
            dir(doc, args);
            // The final batch usually holds fewer than FILES_PER_JSON entries and has not
            // been written by a rotation yet.
            flushCurrentOutput();
        } catch (Exception e) {
            System.out.println("Error" + e.toString());
            e.printStackTrace();
        }
    }

    /**
     * Parses one file, runs entity recognition on its text and appends the result to
     * {@link #jsonArray}. Failures are reported on stdout and the document is skipped.
     *
     * @param doc  absolute path of the file to process
     * @param args unused
     */
    @SuppressWarnings("unchecked") // json-simple's JSONObject/JSONArray are raw Map/List types
    public static void ApacheNLTKRest(String doc, String args[]) {
        try {
            AutoDetectParser parser = new AutoDetectParser();
            // NOTE(review): the no-arg BodyContentHandler caps the extracted text (100,000
            // characters per the Tika docs); larger documents presumably end up in the catch
            // below and are skipped - confirm this is intended.
            BodyContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();

            // Close the stream per document; the traversal may touch thousands of files.
            try (InputStream stream = new FileInputStream(doc)) {
                parser.parse(stream, handler, metadata);
            }
            String text = handler.toString();
            String metaValue = metadata.toString();
            System.out.println("Desc:: " + metaValue);

            String name = doc.replace(DATA_ROOT, "polar.usc.edu");
            Map<String, Set<String>> list = tikaNLTKRest(text);
            System.out.println(list);

            JSONObject jsonObj = new JSONObject();
            jsonObj.put("DOI", name);
            jsonObj.put("metadata", metaValue.replaceAll("\\s\\s+|\n|\t", " "));

            JSONObject tempObj = new JSONObject();
            for (Map.Entry<String, Set<String>> entry : list.entrySet()) {
                System.out.println("\"" + entry.getKey() + "/" + ":\"" + entry.getValue() + "\"");
                // Copy the Set into a JSONArray: json-simple 1.1.1 serialises only Map, List
                // and JSONAware values as JSON and would emit a bare Set via toString().
                JSONArray values = new JSONArray();
                values.addAll(entry.getValue());
                tempObj.put(entry.getKey(), values);
            }
            JSONArray tempArray = new JSONArray();
            tempArray.add(tempObj);
            jsonObj.put("NER", tempArray);
            jsonArray.add(jsonObj);
        } catch (Exception e) {
            // Literal replace(), not replaceAll(): the Windows path is not a valid regex
            // ("\U" is an illegal escape) and would make this handler itself throw.
            System.out.println("ERROR : NLTKRest" + "|File Name"
                    + doc.replace(CORPUS_ROOT, "") + " direct" + e.toString());
        }
    }

    /**
     * Recursively processes {@code path}: a regular file is processed directly, a directory
     * is traversed depth-first. Exceptions are reported on stderr and end the traversal of
     * {@code path} only.
     *
     * @param path file or directory to process
     * @param args forwarded to {@link #ApacheNLTKRest}; otherwise unused
     */
    public static void dir(String path, String[] args) {
        try {
            File root = new File(path);
            if (root.isFile()) {
                processFile(root, args);
                return;
            }
            File[] list = root.listFiles();
            if (list == null) {
                // Not a directory, or it could not be listed.
                return;
            }
            for (File f : list) {
                if (f.isDirectory()) {
                    dir(f.getAbsolutePath(), args);
                    System.out.println("Dir:" + f.getAbsoluteFile());
                } else {
                    processFile(f, args);
                }
            }
        } catch (Exception e) {
            System.err.println("ERROR : NLTKRest|Path " + path + " " + e.toString());
        }
    }

    /**
     * Parses a file with Tika and returns its extracted text.
     *
     * @param doc path of the file to parse
     * @return the extracted text, or {@code "ERROR - "} followed by the exception description
     *         if parsing failed
     */
    public static String parseExample(String doc) {
        try {
            AutoDetectParser parser = new AutoDetectParser();
            BodyContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();

            try (InputStream stream = new FileInputStream(doc)) {
                parser.parse(stream, handler, metadata);
            }
            return handler.toString();
        } catch (Exception e) {
            return "ERROR - " + e.toString();
        }
    }

    /**
     * Runs the shared recogniser over a text.
     *
     * @param doc the text to analyse (despite the name, not a path)
     * @return whatever {@code ner.recognise(doc)} returns: a map whose values are sets of strings
     */
    public static Map<String, Set<String>> tikaNLTKRest(String doc) {
        return ner.recognise(doc);
    }

    /** Rotates the output file if due, then processes {@code f} unless it is a .DS_Store file. */
    private static void processFile(File f, String[] args) throws IOException {
        rotateOutputIfNeeded();
        if (!f.getName().equals(".DS_Store")) {
            ApacheNLTKRest(f.getAbsolutePath(), args);
            counter++;
        }
    }

    /** Starts a new output file on first use or once the current batch is full. */
    private static void rotateOutputIfNeeded() throws IOException {
        if (counter < FILES_PER_JSON && file != null) {
            return;
        }
        counter = 0;
        jsonCount++;
        file = new File(OUTPUT_PREFIX + jsonCount + ".json");
        flushCurrentOutput();
        jsonFile = new FileWriter(file);
        jsonArray = new JSONArray();
    }

    /** Writes the accumulated entries to the open output file, if any, and closes it. */
    private static void flushCurrentOutput() throws IOException {
        if (jsonFile == null) {
            return;
        }
        // try-with-resources closes the writer even when one of the writes fails.
        try (FileWriter out = jsonFile) {
            out.write("{\"NER_NLTKRest\":");
            out.write(jsonArray.toJSONString());
            out.write("}");
        } finally {
            // Never leave a closed writer behind for the next flush to trip over.
            jsonFile = null;
        }
    }

}