org.apache.tika.parser.ner.NamedEntityParserTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.parser.ner.NamedEntityParserTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ner;

import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
import org.apache.tika.parser.ner.regex.RegexNERecogniser;
import org.json.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.junit.Test;

import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;

import static org.junit.Assume.assumeTrue;

/**
 * Test case for {@link NamedEntityParser}.
 *
 * <p>{@link #testParse()} is a corpus-annotation run rather than an assertion-based test:
 * it reads JSON documents from a directory, runs the configured Tika parser over each
 * document's {@code content} field and writes the recognised named entities back into
 * the same file under the {@code OpenNLP} key.
 */
public class NamedEntityParserTest {

    public static final String CONFIG_FILE = "tika-config.xml";

    /** System property that overrides the directory of JSON documents to annotate. */
    private static final String INPUT_DIR_PROPERTY = "ner.test.inputDir";

    /** Directory used when {@link #INPUT_DIR_PROPERTY} is not set. */
    private static final String DEFAULT_INPUT_DIR =
            "/Users/AravindMac/Desktop/polardata_json_grobid/application_pdf";

    /** Upper bound on the number of documents handled in one run. */
    private static final int MAX_FILES = 3573;

    /** Entity types read from the {@code NER_<TYPE>} metadata keys and written to the JSON. */
    private static final String[] ENTITY_TYPES = {"PERSON", "LOCATION", "ORGANIZATION", "DATE"};

    /** Explicit charset so the bytes handed to Tika do not depend on the platform default. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Annotates up to {@link #MAX_FILES} JSON documents of the input directory with the
     * named entities found in their {@code content} field.
     *
     * <p>The test is skipped (not silently passed) when the input directory cannot be
     * listed. Any I/O or parsing failure propagates and fails the test instead of being
     * printed and swallowed.
     *
     * @throws Exception if the Tika configuration cannot be loaded or a document cannot be
     *                   read, parsed or written
     */
    @Test
    public void testParse() throws Exception {
        File root = new File(System.getProperty(INPUT_DIR_PROPERTY, DEFAULT_INPUT_DIR));
        // listFiles() returns null when the path is missing or is not a directory.
        File[] listDir = root.listFiles();
        assumeTrue(listDir != null);

        //test config is added to resources directory
        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
        Tika tika = new Tika(config);
        JSONParser parser = new JSONParser();

        int count = 0;
        for (File file : listDir) {
            if (count >= MAX_FILES) {
                break;
            }
            // Skip the Finder metadata file and anything that is not a regular file.
            if (file.getName().equals(".DS_Store") || !file.isFile()) {
                continue;
            }
            count += 1;
            System.out.println(count);
            annotateFile(tika, parser, file);
        }
    }

    /**
     * Reads one JSON document, extracts the named entities of its {@code content} field
     * and rewrites the file with the entities added under the {@code OpenNLP} key.
     *
     * <p>The file is only opened for writing after all reading and parsing succeeded, so a
     * failure never leaves a truncated document behind. Documents whose top-level value is
     * not a JSON object, or that have no string {@code content} field, are left untouched.
     *
     * @param tika   facade configured with the named-entity parser
     * @param parser JSON parser used to read the document
     * @param file   JSON document to annotate in place
     * @throws Exception if the document cannot be read, parsed or written
     */
    @SuppressWarnings("unchecked") // json-simple's JSONObject is a raw Map, so put() is unchecked
    private static void annotateFile(Tika tika, JSONParser parser, File file) throws Exception {
        Object parsed;
        FileReader in = new FileReader(file);
        try {
            parsed = parser.parse(in);
        } finally {
            in.close();
        }
        if (!(parsed instanceof JSONObject)) {
            System.err.println("Skipping " + file + ": top-level JSON value is not an object");
            return;
        }
        JSONObject jsonObject = (JSONObject) parsed;

        Object content = jsonObject.get("content");
        if (!(content instanceof String)) {
            System.err.println("Skipping " + file + ": no string \"content\" field");
            return;
        }
        String text = (String) content;

        // A fresh map per document: sharing one map across documents would carry the
        // entities of earlier files over into later ones.
        Map<String, Map<Integer, String>> entitiesByType =
                new HashMap<String, Map<Integer, String>>();

        Metadata md = new Metadata();
        // Tika.parse returns a Reader over the extracted text; it must be closed so the
        // parsing resources behind it are released.
        Reader extracted = tika.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), md);
        try {
            for (String type : ENTITY_TYPES) {
                Map<Integer, String> indexed = indexDistinctValues(md, "NER_" + type);
                if (!indexed.isEmpty()) {
                    entitiesByType.put(type, indexed);
                }
            }
        } finally {
            extracted.close();
        }

        if (!entitiesByType.isEmpty()) {
            JSONArray array = new JSONArray();
            array.put(entitiesByType);
            //Add the NER entities to the json under the OpenNLP key as a JSON array.
            jsonObject.put("OpenNLP", array);
        }

        System.out.println(jsonObject);

        // Serialise first, then open the writer: FileWriter truncates the file on open.
        String serialized = jsonObject.toJSONString();
        BufferedWriter bw = new BufferedWriter(new FileWriter(file));
        try {
            bw.write(serialized);
        } finally {
            bw.close();
        }
    }

    /**
     * Collects the distinct values stored under a metadata key and numbers them from zero.
     *
     * @param md          metadata populated by the parser
     * @param metadataKey key whose values are collected, e.g. {@code NER_PERSON}
     * @return map from running index to value; empty when the key has no values. The
     *         numbering follows hash-set iteration order and is therefore arbitrary.
     */
    private static Map<Integer, String> indexDistinctValues(Metadata md, String metadataKey) {
        HashSet<String> distinct = new HashSet<String>(Arrays.asList(md.getValues(metadataKey)));
        Map<Integer, String> indexed = new HashMap<Integer, String>();
        int index = 0;
        for (String value : distinct) {
            indexed.put(index, value);
            index++;
        }
        return indexed;
    }

    // NOTE(review): the test below was already disabled in the original source; the
    // reason is not recorded here.
    /*@Test
    public void testNerChain() throws Exception {
    String classNames = OpenNLPNERecogniser.class.getName()
            + "," + RegexNERecogniser.class.getName();
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
    TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
    Tika tika = new Tika(config);
    String text = "University of Southern California (USC), is located in Los Angeles ." +
            " Campus is busy from monday to saturday";
    Metadata md = new Metadata();
    tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);
    HashSet<String> keys = new HashSet<String>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
    System.out.println(keys);
    assumeTrue(keys.contains("monday"));
        
    keys.clear();
    keys.addAll(Arrays.asList(md.getValues("NER_LOCATION")));
    System.out.println(keys);
    assumeTrue(keys.contains("Los Angeles"));
        
     }*/
}