tr.edu.gsu.nerwip.tools.corpus.ArticleCompletion.java Source code

Java tutorial

Introduction

Here is the source code for tr.edu.gsu.nerwip.tools.corpus.ArticleCompletion.java

Source

package tr.edu.gsu.nerwip.tools.corpus;

/*
 * Nerwip - Named Entity Extraction in Wikipedia Pages
 * Copyright 2011 Yasa Akbulut, Burcu Kpeliolu & Vincent Labatut
 * Copyright 2012 Burcu Kpeliolu, Samet Atda & Vincent Labatut
 * Copyright 2013 Samet Atda & Vincent Labatut
 * Copyright 2014-15 Vincent Labatut
 * 
 * This file is part of Nerwip - Named Entity Extraction in Wikipedia Pages.
 * 
 * Nerwip - Named Entity Extraction in Wikipedia Pages is free software: you can 
 * redistribute it and/or modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 * 
 * Nerwip - Named Entity Extraction in Wikipedia Pages is distributed in the hope 
 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 
 * License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Nerwip - Named Entity Extraction in Wikipedia Pages.  
 * If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;

import org.xml.sax.SAXException;

import tr.edu.gsu.nerwip.data.article.Article;
import tr.edu.gsu.nerwip.data.article.ArticleCategory;
import tr.edu.gsu.nerwip.data.entity.AbstractEntity;
import tr.edu.gsu.nerwip.data.entity.Entities;
import tr.edu.gsu.nerwip.data.entity.EntityType;
import tr.edu.gsu.nerwip.evaluation.ArticleList;
import tr.edu.gsu.nerwip.recognition.ConverterException;
import tr.edu.gsu.nerwip.recognition.external.AbstractExternalConverter;
import tr.edu.gsu.nerwip.retrieval.ArticleRetriever;
import tr.edu.gsu.nerwip.retrieval.reader.ReaderException;
import tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader;
import tr.edu.gsu.nerwip.tools.file.FileNames;
import tr.edu.gsu.nerwip.tools.file.FileTools;
import tr.edu.gsu.nerwip.tools.log.HierarchicalLogger;
import tr.edu.gsu.nerwip.tools.log.HierarchicalLoggerManager;
import tr.edu.gsu.nerwip.tools.string.StringTools;

/**
 * This class contains various methods used to complete pre-existing
 * articles and make them compatible with subsequent versions of Nerwip.
 *  
 * @author Vincent Labatut
 */
@SuppressWarnings("unused")
public class ArticleCompletion {
    /**
     * Method used to do some punctual processing: uncomment the
     * desired maintenance operation below and run this class directly.
     * Each operation rewrites articles in place, so back up the corpus
     * before enabling one.
     * 
     * @param args
     *       None needed.
     * 
     * @throws Exception 
     *       Whatever.
     */
    public static void main(String[] args) throws Exception {
        logger.setName("Article-Completion");

        //      removeArticleCategories();
        //      insertArticleCategories();
        //      completeArticleCategories();
        //      insertArticleTitles();
    }

    /////////////////////////////////////////////////////////////////
    // LOGGER      /////////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** Common object used for logging */
    private static HierarchicalLogger logger = HierarchicalLoggerManager.getHierarchicalLogger();

    /////////////////////////////////////////////////////////////////
    // CATEGORIES   /////////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /**
     * Removes the categories of all existing
     * articles in the corpus, by reading then rewriting each
     * article (presumably the write step drops categories when
     * loading is disabled — see the note below).
     * <br/>
     * Note: category loading must be disabled first,
     * by adding comments in class Article.
     * 
     * @throws IOException 
     *       Problem while accessing the files.
     * @throws SAXException 
     *       Problem while accessing the files.
     * @throws ParseException 
     *       Problem while accessing the files.
     */
    private static void removeArticleCategories() throws ParseException, SAXException, IOException {
        logger.log("Resetting categories in articles");
        logger.increaseOffset();

        List<File> files = ArticleLists.getArticleList();
        for (File file : files) {
            // each article is stored in a folder named after it
            String name = file.getName();
            logger.log("Processing article '" + name + "'");
            logger.increaseOffset();

            // read then rewrite: with category loading commented out in
            // Article, the rewritten file ends up without categories
            Article article = Article.read(name);
            article.write();

            logger.decreaseOffset();
        }

        logger.decreaseOffset();
        logger.log("Categories reset");
    }

    /**
     * This methods allows setting the category of 
     * articles already retrieved and manually annotated
     * for evaluation. This way, the annotation can be
     * performed overall, or in function of the category.
     * The categories must be listed in a tab-separated file
     * ("cats.txt" in the output folder) in which each line
     * contains the article folder name, a gender field
     * (currently ignored), and the category name.
     * 
     * @throws ParseException
     *       Problem while accessing the files.
     * @throws SAXException
     *       Problem while accessing the files.
     * @throws IOException
     *       Problem while accessing the files.
     */
    private static void insertArticleCategories() throws ParseException, SAXException, IOException {
        logger.log("Setting categories in articles");
        logger.increaseOffset();

        File file = new File(FileNames.FO_OUTPUT + File.separator + "cats" + FileNames.EX_TXT);
        logger.log("Reading file " + file);

        // try-with-resources guarantees the stream chain is closed even if
        // an exception is thrown while processing a line (the original code
        // leaked all three resources in that case); the charset is explicit
        // instead of the platform default — assumes the file is UTF-8, TODO
        // confirm against how the corpus files were produced
        try (FileInputStream fis = new FileInputStream(file);
                InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
                Scanner scanner = new Scanner(isr)) {
            logger.increaseOffset();
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String temp[] = line.split("\\t");
                // a well-formed line has three tab-separated fields:
                // folder name, gender, category; skip anything shorter
                // instead of crashing on temp[2]
                if (temp.length < 3) {
                    logger.log("Skipping malformed line '" + line + "'");
                    continue;
                }
                String name = temp[0];
                String folderPath = FileNames.FO_OUTPUT + File.separator + name;
                File folderFile = new File(folderPath);
                if (folderFile.exists()) {
                    //            String gender = temp[1];
                    // category names are stored uppercase in the enum
                    String categoryStr = temp[2].toUpperCase(Locale.ENGLISH);
                    ArticleCategory category = ArticleCategory.valueOf(categoryStr);
                    //category = StringTools.initialize(category);
                    logger.log("Processing '" + name + "': cat=" + category);
                    List<ArticleCategory> categories = new ArrayList<ArticleCategory>();
                    categories.add(category);

                    Article article = Article.read(name);
                    article.setCategories(categories);
                    article.write();
                } else
                    logger.log("Processing '" + temp[0] + "': folder not found");
            }
            logger.decreaseOffset();
        }

        logger.decreaseOffset();
        logger.log("Categories set");
    }

    /**
     * Reads existing articles, and applies the new
     * method ({@link WikipediaReader#getArticleCategories})
     * to set their categories, then rewrites each article.
     * 
     * @throws ParseException
     *       Problem while accessing the files.
     * @throws SAXException
     *       Problem while accessing the files.
     * @throws IOException
     *       Problem while accessing the files.
     * @throws org.apache.http.ParseException
     *       Problem while accessing the files.
     * @throws org.json.simple.parser.ParseException
     *       Problem while accessing the files.
     */
    private static void completeArticleCategories() throws ParseException, SAXException, IOException,
            org.apache.http.ParseException, org.json.simple.parser.ParseException {
        logger.log("Completing categories in articles");
        logger.increaseOffset();

        WikipediaReader reader = new WikipediaReader();

        List<File> files = ArticleLists.getArticleList();
        for (File file : files) {
            String name = file.getName();
            logger.log("Processing article '" + name + "'");
            logger.increaseOffset();
            Article article = Article.read(name);

            // delegate category detection to the reader (presumably a
            // remote Wikipedia lookup — so this needs network access)
            List<ArticleCategory> categories = reader.getArticleCategories(article);
            article.setCategories(categories);
            logger.log("Detected categories: " + categories.toString());

            article.write();
            logger.log("Completed article written");
            logger.decreaseOffset();
        }

        logger.decreaseOffset();
        logger.log("Categories completed");
    }

    /////////////////////////////////////////////////////////////////
    // TITLES      /////////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** Number of leading characters of the raw text used as a title */
    private static final int TITLE_LENGTH = 25;

    /**
     * This methods allows setting the title of 
     * articles already retrieved and manually annotated
     * for evaluation. We use the beginning of the first sentence
     * of the article (at most {@value #TITLE_LENGTH} characters)
     * to initialize the title.
     * 
     * @throws ParseException
     *       Problem while accessing the files.
     * @throws SAXException
     *       Problem while accessing the files.
     * @throws IOException
     *       Problem while accessing the files.
     */
    private static void insertArticleTitles() throws ParseException, SAXException, IOException {
        logger.log("Setting title in articles");
        logger.increaseOffset();

        logger.increaseOffset();
        List<File> files = ArticleLists.getArticleList();
        for (File file : files) {
            String name = file.getName();
            Article article = Article.read(name);
            String rawText = article.getRawText();
            // bound the cut by the text length: the original substring(0,25)
            // threw StringIndexOutOfBoundsException on articles shorter
            // than 25 characters
            int end = Math.min(TITLE_LENGTH, rawText.length());
            String sentence = rawText.substring(0, end);

            logger.log("Processing article " + name + ": '" + sentence + "'");
            article.setTitle(sentence);
            article.write();
        }
        logger.decreaseOffset();

        logger.decreaseOffset();
        logger.log("Titles set");
    }
}