package tr.edu.gsu.nerwip.recognition.internal.modelless.subee;

/*
 * Nerwip - Named Entity Extraction in Wikipedia Pages
 * Copyright 2011 Yasa Akbulut, Burcu Küpelioğlu & Vincent Labatut
 * Copyright 2012 Burcu Küpelioğlu, Samet Atdağ & Vincent Labatut
 * Copyright 2013 Samet Atdağ & Vincent Labatut
 * Copyright 2014-15 Vincent Labatut
 *
 * This file is part of Nerwip - Named Entity Extraction in Wikipedia Pages.
 *
 * Nerwip - Named Entity Extraction in Wikipedia Pages is free software: you can
 * redistribute it and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * Nerwip - Named Entity Extraction in Wikipedia Pages is distributed in the hope
 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
 * License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Nerwip - Named Entity Extraction in Wikipedia Pages.
 * If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import tr.edu.gsu.nerwip.data.article.Article;
import tr.edu.gsu.nerwip.data.entity.AbstractEntity;
import tr.edu.gsu.nerwip.data.entity.EntityType;
import tr.edu.gsu.nerwip.recognition.RecognizerException;
import tr.edu.gsu.nerwip.recognition.RecognizerName;
import tr.edu.gsu.nerwip.recognition.internal.modelless.AbstractModellessInternalRecognizer;
import tr.edu.gsu.nerwip.tools.file.FileNames;
import tr.edu.gsu.nerwip.tools.file.FileTools;
import tr.edu.gsu.nerwip.tools.freebase.FbCommonTools;
import tr.edu.gsu.nerwip.tools.freebase.FbTypeTools;
import tr.edu.gsu.nerwip.tools.string.StringTools;

/**
 * This class implements our own NER tool, called Subee. It takes advantage of
 * hyperlinks present in Wikipedia pages to identify entities in the text, and
 * of Freebase to select their type.
 * <br/>
 * Recommended parameter values:
 * <ul>
 *     <li>{@code additionalOccurrences}: {@code true}</li>
 *     <li>{@code useTitle}: {@code true}</li>
 *     <li>{@code notableType}: {@code true}</li>
 *     <li>{@code useAcronyms}: {@code true}</li>
 *     <li>{@code discardDemonyms}: {@code true}</li>
 * </ul>
 * <br/>
 * <b>Note:</b> if you use this tool, make sure you set up your Freebase key
 * in class {@link FbCommonTools}.
 *
 * @author Yasa Akbulut
 * @author Vincent Labatut
 */
public class Subee extends AbstractModellessInternalRecognizer<List<AbstractEntity<?>>, SubeeConverter> {
    /**
     * Builds and sets up an object representing
     * Subee, our NER tool taking advantage of text
     * containing hyperlinks.
     *
     * @param additionalOccurrences
     *     Whether or not the tool should annotate the additional occurrences
     *     of some entity.
     * @param useTitle
     *     Whether or not the tool should use the article title to infer
     *     the person name.
     * @param notableType
     *     Whether the tool should use the single notable type provided by Freebase,
     *     or all available Freebase types.
     * @param useAcronyms
     *     On their first occurrence, certain entities are followed by the associated
     *     acronym: this option allows searching for them in the rest of the text.
     * @param discardDemonyms
     *     Ignore entities whose string value corresponds to a demonym, i.e. the adjective
     *     associated to a place, or the name of its inhabitants. Subee generally takes them
     *     for the place itself, leading to an increased number of false positives.
     */
    public Subee(boolean additionalOccurrences, boolean useTitle, boolean notableType, boolean useAcronyms, boolean discardDemonyms) {
        super(false, false, false);

        this.additionalOccurrences = additionalOccurrences;
        this.useTitle = useTitle;
        this.notableType = notableType;
        this.useAcronyms = useAcronyms;
        this.discardDemonyms = discardDemonyms;

        // init converter
        converter = new SubeeConverter(getFolder());
    }
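    // Example usage (a minimal sketch, not part of the original class): the
    // recommended configuration from the class documentation enables every option.
    // Note that detectEntities() is protected and is invoked by the recognizer
    // framework, not by the caller directly.
    //
    //   Subee subee = new Subee(true, true, true, true, true);
    //
    // Assumption: the surrounding Nerwip pipeline supplies the Article objects,
    // and a valid Freebase key has been set in FbCommonTools.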
    /////////////////////////////////////////////////////////////////
    // NAME          /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    @Override
    public RecognizerName getName() {
        return RecognizerName.SUBEE;
    }

    /////////////////////////////////////////////////////////////////
    // FOLDER        /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    @Override
    public String getFolder() {
        String result = getName().toString();
        result = result + "_" + "addOcc=" + additionalOccurrences;
        result = result + "_" + "useTtl=" + useTitle;
        result = result + "_" + "ntblType=" + notableType;
        result = result + "_" + "useAcro=" + useAcronyms;
        result = result + "_" + "discDemo=" + discardDemonyms;
        return result;
    }
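    // For instance, assuming RecognizerName.SUBEE's string form is "SUBEE",
    // the recommended all-true configuration yields the folder name (illustrative):
    //   SUBEE_addOcc=true_useTtl=true_ntblType=true_useAcro=true_discDemo=true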
    /////////////////////////////////////////////////////////////////
    // ENTITIES      /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** List of entity types recognized by Subee */
    private static final List<EntityType> HANDLED_TYPES = Arrays.asList(
        EntityType.LOCATION,
        EntityType.ORGANIZATION,
        EntityType.PERSON);

    @Override
    public List<EntityType> getHandledEntityTypes() {
        return HANDLED_TYPES;
    }

    /////////////////////////////////////////////////////////////////
    // PROCESSING    /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    @Override
    protected List<AbstractEntity<?>> detectEntities(Article article) throws RecognizerException {
        logger.increaseOffset();
        List<AbstractEntity<?>> result = new ArrayList<AbstractEntity<?>>();

        try {
            // detect and process hyperlinks
            logger.log("Detect and process hyperlinks");
            List<AbstractEntity<?>> sureEntities = processHyperlinks(article);

            // look for additional occurrences of these entities
            List<AbstractEntity<?>> possibleEntities = new ArrayList<AbstractEntity<?>>();
            if (additionalOccurrences) {
                logger.log("Look for additional occurrences");
                possibleEntities = processOccurrences(article, sureEntities);
            }
            else
                logger.log("Ignore additional occurrences");

            // process the name of the person described in the processed article
            if (useTitle) {
                logger.log("Process the name of this article's main person");
                List<AbstractEntity<?>> temp = processMainName(article);
                possibleEntities.addAll(temp);
            }
            else
                logger.log("Ignore article title");

            // build result list by merging both lists (sure and possible entities)
            result = mergeEntityLists(sureEntities, possibleEntities);
        }
        catch (ParserException e) {
            e.printStackTrace();
            throw new RecognizerException(e.getMessage());
        }
        catch (ClientProtocolException e) {
            e.printStackTrace();
            throw new RecognizerException(e.getMessage());
        }
        catch (ParseException e) {
            e.printStackTrace();
            throw new RecognizerException(e.getMessage());
        }
        catch (IOException e) {
            e.printStackTrace();
            throw new RecognizerException(e.getMessage());
        }
        catch (org.json.simple.parser.ParseException e) {
            e.printStackTrace();
            throw new RecognizerException(e.getMessage());
        }

        logger.decreaseOffset();
        return result;
    }

    /////////////////////////////////////////////////////////////////
    // TITLE         /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** Whether or not the title should be used to infer the person's name */
    private boolean useTitle;

    /**
     * Handles the name of the person described in the processed article. For this purpose,
     * we consider the article title and name, as well as the first sentence, which generally
     * starts with the full name of the person.
     *
     * @param article
     *     Article to process.
     * @return
     *     List of possible entities based on the analysis of the article title and name.
     *
     * @throws ClientProtocolException
     *     Problem while accessing Freebase.
     * @throws ParseException
     *     Problem while accessing Freebase.
     * @throws IOException
     *     Problem while accessing Freebase.
     * @throws org.json.simple.parser.ParseException
     *     Problem while accessing Freebase.
     */
    private List<AbstractEntity<?>> processMainName(Article article) throws ClientProtocolException, ParseException, IOException, org.json.simple.parser.ParseException {
        logger.increaseOffset();
        List<AbstractEntity<?>> result = new ArrayList<AbstractEntity<?>>();
        String rawText = article.getRawText();

        // init candidate strings with article name and title
        Set<String> candidateStrings = new TreeSet<String>();
        String articleTitle = article.getTitle();
        //debug
        //if(articleTitle.equals("Alfred Lothar Wegener"))
        //    System.out.print("");
        logger.log("Article title: " + articleTitle);
        candidateStrings.add(articleTitle);
        String articleName = article.getName();
        logger.log("Article name: " + articleName);
        articleName = articleName.replace('_', ' ').trim();
        candidateStrings.add(articleName);

        // process the beginning of the first sentence:
        // we look for the string before the first parenthesis (usually containing birth info);
        // if there's none, we just ignore this potential information source
        Pattern p = Pattern.compile("^[^\\.]+?\\(");
        Matcher m = p.matcher(rawText);
        if (m.find()) {
            int startPos = m.start();
            if (startPos == 0) {
                int endPos = m.end();
                String persName = rawText.substring(0, endPos - 1);
                persName = persName.trim();
                int wordCount = persName.length() - persName.replaceAll(" ", "").length();
                if (wordCount > 6)
                    logger.log("Not able to extract person name from first sentence (too many words before the parenthesis): \"" + rawText.substring(0, 75) + "\"");
                else {
                    logger.log("Person name: " + persName);
                    candidateStrings.add(persName);
                }
            }
        }
        else
            logger.log("Not able to extract person name from first sentence (can't find the parenthesis): \"" + rawText.substring(0, 75) + "\"");

        // possibly remove double quotes (especially for the nicknames)
        List<String> nickFull = new ArrayList<String>();
        Set<String> copy = new TreeSet<String>(candidateStrings);
        candidateStrings.clear();
        for (String candidateString : copy) {
            if (candidateString.contains("\"")) {
                nickFull.add(candidateString);
                candidateString = candidateString.replaceAll("\"", "");
            }
            candidateStrings.add(candidateString);
        }

        // possibly remove an indication in parenthesis at the end (especially for the titles)
        copy = new TreeSet<String>(candidateStrings);
        candidateStrings.clear();
        for (String candidateString : copy) {
            if (candidateString.endsWith(")")) {
                String temp[] = candidateString.split("\\(");
                candidateString = temp[0].trim();
            }
            candidateStrings.add(candidateString);
        }
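        // Illustration (added comment, not in the original source): for the title
        // "Alfred Lothar Wegener", the loops below generate, among others, the
        // following candidate strings:
        //   "Wegener", "Lothar Wegener", "Alfred Lothar Wegener"  (suffixes)
        //   "Alfred Wegener"                                      (very first + very last name)
        //   "A. L. Wegener", "A.L. Wegener"                       (initials variants)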
        // add the last name alone; only with the preceding word; only with the 2 preceding words, etc.
        copy = new TreeSet<String>(candidateStrings);
        for (String candidateString : copy) {
            String split[] = candidateString.split(" ");
            for (int i = split.length - 1; i >= 0; i--) {
                String temp = "";
                for (int j = i; j < split.length; j++)
                    temp = temp + split[j] + " ";
                temp = temp.trim();
                candidateStrings.add(temp);
            }
        }

        // add very first and very last names (for more than 2 words)
        copy = new TreeSet<String>(candidateStrings);
        for (String candidateString : copy) {
            String split[] = candidateString.split(" ");
            if (split.length > 2) {
                String temp = split[0] + " " + split[split.length - 1];
                candidateStrings.add(temp);
            }
        }

        // add variants with initials instead of firstnames
        copy = new TreeSet<String>(candidateStrings);
        for (String candidateString : copy) {
            String split[] = candidateString.split(" ");
            if (split.length > 1) {
                String initials1 = "";
                String initials2 = "";
                for (int i = 0; i < split.length - 1; i++) {
                    initials1 = initials1 + split[i].substring(0, 1).toUpperCase(Locale.ENGLISH) + ". ";
                    initials2 = initials2 + split[i].substring(0, 1).toUpperCase(Locale.ENGLISH) + ".";
                }
                initials1 = initials1 + split[split.length - 1];
                initials2 = initials2 + " " + split[split.length - 1];
                candidateStrings.add(initials1);
                candidateStrings.add(initials2);
            }
        }

        // add the original version of the nicknames
        candidateStrings.addAll(nickFull);

        // look for similar strings in the text
        for (String expr : candidateStrings) {
            String escapedStr = Pattern.quote(expr);
            p = Pattern.compile("\\b" + escapedStr + "\\b");
            m = p.matcher(rawText);
            while (m.find()) {
                int startPos = m.start();
                int endPos = m.end();
                String valueStr = m.group();
                AbstractEntity<?> ent = AbstractEntity.build(EntityType.PERSON, startPos, endPos, RecognizerName.SUBEE, valueStr);
                result.add(ent);
            }
        }

        if (result.isEmpty())
            logger.log("WARNING: title not found at all in the text, which is unusual");

        logger.decreaseOffset();
        return result;
    }
    /////////////////////////////////////////////////////////////////
    // HYPERLINKS    /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** HTML name of hyperlink elements */
    private static final String TAG_LINK = "a";
    /** HTML start tag, used for parsing the linked text */
    private static final String TAG_PAR_START = "<p>";
    /** HTML end tag, used for parsing the linked text */
    private static final String TAG_PAR_END = "</p>";
    /** Whether acronyms should be searched for, or not */
    private boolean useAcronyms;

    /**
     * Takes advantage of hyperlinks in the text, in order
     * to detect entities. Most of the time, in a Wikipedia
     * article, the hyperlink is defined only for the very
     * first occurrence of the entity. For this reason,
     * an additional processing is required to find the possible
     * other occurrences (cf. {@link #processOccurrences(Article, List)}).
     *
     * @param article
     *     Processed article.
     * @return
     *     The list of entities detected by this method.
     *
     * @throws ParserException
     *     Problem while parsing the hyperlinks.
     * @throws ClientProtocolException
     *     Problem while accessing Freebase.
     * @throws ParseException
     *     Problem while accessing Freebase.
     * @throws IOException
     *     Problem while accessing Freebase.
     * @throws org.json.simple.parser.ParseException
     *     Problem while accessing Freebase.
     */
    private List<AbstractEntity<?>> processHyperlinks(Article article) throws ParserException, ClientProtocolException, ParseException, IOException, org.json.simple.parser.ParseException {
        logger.increaseOffset();
        List<AbstractEntity<?>> result = new ArrayList<AbstractEntity<?>>();

        // parse linked text to automatically get hyperlink list
        logger.log("Get hyperlink list");
        String linkedText = article.getLinkedText();
        Parser parser = new Parser(TAG_PAR_START + linkedText + TAG_PAR_END);
        NodeList linkList = parser.parse(new TagNameFilter(TAG_LINK));
        int offset = TAG_PAR_START.length();

        // process each hyperlink
        logger.log("Process each hyperlink");
        logger.increaseOffset();
        for (int i = 0; i < linkList.size(); i++) {
            LinkTag linkTag = (LinkTag) linkList.elementAt(i);
            String valueStr = linkTag.getLinkText();
            int length = valueStr.length();
            String test = linkTag.toHtml();
            logger.log("Hyperlink '" + test + "'");

            // get type from Freebase
            EntityType type = null;
            // only process strings with an uppercase initial
            if (StringTools.hasInitial(valueStr)) {
                String hyperlink = linkTag.getLink();
                String[] linkParts = hyperlink.split("/");
                String lastPart = linkParts[linkParts.length - 1];
                String wikipediaTitle = URLDecoder.decode(lastPart, "UTF-8"); //TODO we may take advantage of this to automatically detect the type
                String wikipediaTitleEscaped = FbCommonTools.escapeMqlKey(wikipediaTitle); //TODO or this
                logger.log("Wikipedia title: " + wikipediaTitle);
                logger.log("Escaped Wikipedia title: " + wikipediaTitleEscaped);

                // use only the notable type
                if (notableType) {
                    String possibleType = FbTypeTools.getNotableType(wikipediaTitleEscaped);
                    if (possibleType == null)
                        logger.log("No notable Freebase type found for \"" + valueStr + "\"");
                    else {
                        List<String> possibleTypes = new ArrayList<String>();
                        possibleTypes.add(possibleType);
                        type = retrieveEntityType(possibleTypes);
                    }
                }

                // use all available types
                if (type == null) {
                    List<String> possibleTypes = FbTypeTools.getAllTypes(wikipediaTitleEscaped);
                    logger.log("Possible types: " + possibleTypes.toString());
                    if (possibleTypes.isEmpty())
                        logger.log("WARNING: no Freebase type found at all for \"" + valueStr + "\"");
                    else
                        type = retrieveEntityType(possibleTypes);
                }
            }
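            // Note (added explanation): 'offset' converts positions in the linked
            // (HTML) text into positions in the raw text. It starts at the length
            // of the artificial "<p>" prefix (3), and after each processed link it
            // grows by the length of the HTML markup, i.e. test.length() - length.
            // For example, a link rendered as <a href="...">Paris</a> increases the
            // offset by its full HTML length minus the 5 characters of "Paris".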
            // set up the entity position
            int startPos = linkTag.getStartPosition() - offset;
            int endPos = startPos + length;
            offset = offset + test.length() - length;

            //debug
            //String text = article.getRawText();
            //String valueStr2 = text.substring(startPos,endPos);
            //boolean test2 = valueStr.equals(valueStr2);
            //if(!test2)
            //    System.out.println("ERROR: entity and article do not match (position problem)");

            // no type: we can't create the entity
            if (type == null) {
                logger.log("WARNING: no entity was created, because no type could be identified for \"" + valueStr + "\"");
            }
            // otherwise, we try
            else {
                // ignore if purely numerical
                if (StringTools.hasNoLetter(valueStr))
                    logger.log("The string is only numerical (no letters) so no entity is created for " + valueStr);
                // ignore if recognized as a location/organization but actually a demonym
                else if (discardDemonyms && (type == EntityType.LOCATION || type == EntityType.ORGANIZATION) && DEMONYMS.contains(valueStr))
                    logger.log("The string is in the demonym list, so no entity is created for " + valueStr);
                else {
                    //debug
                    //if(valueStr.equalsIgnoreCase("Irish"))
                    //    System.out.print("");

                    // possibly look for an acronym
                    if (useAcronyms) {
                        // only organizations and locations have relevant acronyms
                        // (for a person, acronyms usually correspond to titles or awards)
                        if (type == EntityType.ORGANIZATION || type == EntityType.LOCATION) {
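                            // Illustration (added comment): for a link text such as
                            // "North Atlantic Treaty Organization (NATO)", the first
                            // branch below creates an extra ORGANIZATION entity for
                            // "NATO" and trims the parenthesized part from the original
                            // string; the second branch handles the case where the
                            // acronym follows the hyperlink in the surrounding text.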
                            // check if there's an acronym inside the entity name itself
                            Pattern r = Pattern.compile("\\([^\\(a-z]+?\\)$"); // must be in uppercase
                            Matcher m = r.matcher(valueStr);
                            if (m.find()) {
                                // create an additional entity (acronym) with the same type
                                int last = m.groupCount();
                                String acro = m.group(last);
                                int l = acro.length();
                                acro = acro.substring(1, l - 1);
                                int s = startPos + m.start(last) + 1;
                                int e = startPos + m.end(last) - 1;
                                if (!StringTools.hasNoLetter(acro)) {
                                    //debug
                                    //String valueStr3 = text.substring(s,e);
                                    //boolean test3 = acro.equals(valueStr3);
                                    //if(!test3)
                                    //    System.out.println("ERROR: entity acronym and article do not match (position problem)");
                                    AbstractEntity<?> entity = AbstractEntity.build(type, s, e, RecognizerName.SUBEE, acro);
                                    result.add(entity);
                                    logger.log("Creation of an extra entity (acronym) " + entity);
                                }
                                // remove the acronym from the original string
                                valueStr = valueStr.substring(0, valueStr.length() - l).trim();
                                endPos = startPos + valueStr.length();
                            }
                            // check if there's an acronym right after the entity
                            else {
                                r = Pattern.compile("\\([^\\(a-z]+?\\)"); // must be in uppercase
                                m = r.matcher(linkedText);
                                if (m.find(linkTag.getEndTag().getEndPosition() - TAG_PAR_START.length())) {
                                    // possibly create an additional entity (acronym) with the same type
                                    int last = m.groupCount();
                                    String acro = m.group(last);
                                    acro = acro.substring(1, acro.length() - 1);
                                    int s = m.start(last) - 1 - (offset - TAG_PAR_END.length()) + 1; // actually <a/> and not <p/>, but same length...
                                    // the acronym must be right after the original entity
                                    if (s == endPos + 2 && !StringTools.hasNoLetter(acro)) {
                                        int e = m.end(last) - 1 - (offset - TAG_PAR_END.length()) - 1;
                                        //debug
                                        //String valueStr3 = text.substring(s,e);
                                        //boolean test3 = acro.equals(valueStr3);
                                        //if(!test3)
                                        //    System.out.println("ERROR: entity acronym and article do not match (position problem)");
                                        AbstractEntity<?> entity = AbstractEntity.build(type, s, e, RecognizerName.SUBEE, acro);
                                        result.add(entity);
                                        logger.log("Creation of an extra entity (acronym) " + entity);
                                    }
                                }
                            }
                        }
                    }

                    // create the entity
                    AbstractEntity<?> entity = AbstractEntity.build(type, startPos, endPos, RecognizerName.SUBEE, valueStr);
                    result.add(entity);
                    logger.log("Creation of the entity " + entity);
                }
            }
        }

        logger.decreaseOffset();
        logger.decreaseOffset();
        return result;
    }

    /////////////////////////////////////////////////////////////////
    // ENTITY TYPES  /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** Whether or not to use Freebase notable types (instead of all FB types) */
    private boolean notableType;
    /** Prefix used for the map files */
    protected static String FILE_PREFIX = "fb.";
    /** Name of the file containing the list of ignored FB types */
    protected static String FILE_IGNORED = "ignored";
    /** Map to convert Freebase types to EntityType values */
    protected static final Map<String, EntityType> TYPE_MAP = new HashMap<String, EntityType>();

    @Override
    protected void prepareRecognizer() throws RecognizerException {
        try {
            loadTypeMaps();
            loadUnknownTypes();
            if (discardDemonyms)
                loadDemonyms();
        }
        catch (FileNotFoundException e) {
            throw new RecognizerException(e.getMessage());
        }
    }

    /**
     * Initializes the conversion map with some predefined
     * files. Each file contains a list of FB types associated
     * (mainly) to a specific type. An additional file contains
     * a list of ignored types (for debugging purposes, and to
     * ease the future completion of these files).
     *
     * @throws FileNotFoundException
     *     Problem while accessing one of the map files.
     */
    private synchronized void loadTypeMaps() throws FileNotFoundException {
        if (TYPE_MAP.isEmpty()) {
            logger.log("Loading type maps");
            logger.increaseOffset();

            // set up the list of types
            String base = FileNames.FO_SUBEE + File.separator;
            List<EntityType> types = new ArrayList<EntityType>(HANDLED_TYPES);
            types.add(null); // for the ignored types

            // process each corresponding file
            for (EntityType type : types) {
                // open file
                String name = FILE_IGNORED;
                if (type != null)
                    name = type.toString().toLowerCase();
                String filePath = base + FILE_PREFIX + name + FileNames.EX_TXT;
                logger.log("Processing file " + filePath);
                Scanner scanner = FileTools.openTextFileRead(filePath);

                // read the content and add to the conversion map
                while (scanner.hasNextLine()) {
                    String string = scanner.nextLine().trim();
                    TYPE_MAP.put(string, type);
                }
                scanner.close();
            }

            logger.decreaseOffset();
            logger.log("Type maps loading complete");
        }
    }
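    // For illustration (the contents shown are hypothetical examples, not the
    // actual distribution files; this assumes EntityType.PERSON.toString() is
    // "PERSON" and FileNames.EX_TXT denotes the ".txt" extension): a map file
    // named "fb.person.txt" would contain one Freebase type per line, e.g.
    //   /people/person
    //   /people/deceased_person
    // and every line read from it is mapped to EntityType.PERSON by loadTypeMaps().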
    /**
     * This method receives a list of Freebase types, and
     * infers the corresponding {@link EntityType}.
     *
     * @param fbTypes
     *     List of Freebase types.
     * @return
     *     Corresponding {@link EntityType}, or {@code null} if none could be found.
     */
    protected synchronized EntityType retrieveEntityType(List<String> fbTypes) {
        logger.increaseOffset();
        Set<String> knownKeys = TYPE_MAP.keySet();

        // retrieve a list of EntityTypes corresponding to the Freebase types
        List<EntityType> types = new ArrayList<EntityType>();
        for (String fbType : fbTypes) {
            // first try to use the existing map
            EntityType type = TYPE_MAP.get(fbType);
            if (type != null)
                types.add(type);
            // otherwise, try to use the type name (rough)
            else {
                // person
                if (fbType.endsWith("person"))
                    types.add(EntityType.PERSON);
                // location
                else if (fbType.endsWith("location"))
                    types.add(EntityType.LOCATION);
                // organization
                else if (fbType.endsWith("organization"))
                    types.add(EntityType.ORGANIZATION);
                else if (fbType.endsWith("governmental_body"))
                    types.add(EntityType.ORGANIZATION);
                else if (fbType.endsWith("collective"))
                    types.add(EntityType.ORGANIZATION);

                // possibly add to the list of unknown types
                if (!knownKeys.contains(fbType))
                    updateUnknownTypes(fbType);
            }
        }

        // determine the final type by prioritizing them
        EntityType result = null;
        if (types.contains(EntityType.ORGANIZATION))
            result = EntityType.ORGANIZATION;
        else if (types.contains(EntityType.LOCATION))
            result = EntityType.LOCATION;
        else if (types.contains(EntityType.PERSON))
            result = EntityType.PERSON;

        logger.decreaseOffset();
        return result;
    }
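    // Worked example (added comment): for fbTypes = ["/people/person", "/sports/sports_team"],
    // and assuming the map associates "/sports/sports_team" with ORGANIZATION, both
    // PERSON and ORGANIZATION are collected; the priority order above
    // (ORGANIZATION > LOCATION > PERSON) then makes the method return ORGANIZATION.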
    /////////////////////////////////////////////////////////////////
    // UNKNOWN FREEBASE TYPES    /////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** Freebase types not recognized by Subee (for debugging purposes) */
    protected static Set<String> UNKNOWN_TYPES = new TreeSet<String>();

    /**
     * Loads the existing list of unknown Freebase types.
     * This list is supposed to be processed manually,
     * in order to complete the other FB-related files of
     * Subee. The goal is to associate an EntityType value
     * to all FB types.
     */
    private synchronized void loadUnknownTypes() {
        if (UNKNOWN_TYPES.isEmpty()) {
            logger.log("Loading unknown Freebase types");
            logger.increaseOffset();

            // set up file path
            String path = FileNames.FO_SUBEE + File.separator + FileNames.FI_UNKNOWN_TYPES;
            File file = new File(path);

            // retrieve existing unknown types
            try {
                Scanner scanner = FileTools.openTextFileRead(file);
                while (scanner.hasNextLine()) {
                    String line = scanner.nextLine().trim();
                    UNKNOWN_TYPES.add(line);
                }
                scanner.close();
            }
            catch (FileNotFoundException e) {
                e.printStackTrace();
            }

            logger.decreaseOffset();
            logger.log("Loading complete");
        }
    }

    /**
     * Adds the specified type to the list of unknown FB types,
     * updating both the memory and the file versions of this list.
     *
     * @param fbType
     *     New unknown Freebase type.
     */
    protected synchronized void updateUnknownTypes(String fbType) {
        if (!UNKNOWN_TYPES.contains(fbType)   // type not already in the list
            && !fbType.startsWith("/user/")   // not a user type
            && !fbType.startsWith("/m/"))     // not a coded type
        {
            // add to the memory list
            UNKNOWN_TYPES.add(fbType);

            // set up file path
            String path = FileNames.FO_SUBEE + File.separator + FileNames.FI_UNKNOWN_TYPES;
            File file = new File(path);

            // create the print writer
            try {
                // open the file in append mode
                FileOutputStream fos = new FileOutputStream(file, true);
                OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
                PrintWriter printWriter = new PrintWriter(osw);

                // write the new type
                printWriter.println(fbType);
                printWriter.flush(); // just a precaution

                // close the stream
                printWriter.close();
            }
            catch (FileNotFoundException e) {
                e.printStackTrace();
            }
            catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
        }
    }

    /////////////////////////////////////////////////////////////////
    // OCCURRENCES   /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** Whether or not the tool should try to detect additional occurrences of linked entities */
    private boolean additionalOccurrences;

    /**
     * Receives the entities detected thanks to the hyperlinks, and tries
     * to find their other occurrences in the text.
     *
     * @param article
     *     Article to process.
     * @param sureEntities
     *     Entities already detected, corresponding to hyperlinks.
     * @return
     *     A new list of possible entities, to be merged later with the sure entities.
     */
    private List<AbstractEntity<?>> processOccurrences(Article article, List<AbstractEntity<?>> sureEntities) {
        logger.increaseOffset();
        String rawText = article.getRawText();
        List<AbstractEntity<?>> result = new ArrayList<AbstractEntity<?>>();

//        // sort entities by type (we want to prioritize them)
//        logger.log("Sort entity by type");
//        TreeSet<AbstractEntity<?>> temp = new TreeSet<AbstractEntity<?>>(new Comparator<AbstractEntity<?>>() {
//            @Override
//            public int compare(AbstractEntity<?> o1, AbstractEntity<?> o2) {
//                int result = 0;
//                EntityType t1 = o1.getType();
//                EntityType t2 = o2.getType();
//                if(t1==EntityType.ORGANIZATION && t2!=EntityType.ORGANIZATION
//                    || t1==EntityType.PERSON && t2==EntityType.LOCATION)
//                    result = -1;
//                else if(t2==EntityType.ORGANIZATION && t1!=EntityType.ORGANIZATION
//                    || t2==EntityType.PERSON && t1==EntityType.LOCATION)
//                    result = 1;
//                else
//                    result = o1.compareTo(o2);
//                return result;
//            }
//        });
//        temp.addAll(sureEntities);

        // look for additional occurrences
        logger.log("Look for additional occurrences");
        for (AbstractEntity<?> entity : sureEntities) {
            String valueStr = entity.getStringValue();
            // look for the entity in the text
            String escapedStr = Pattern.quote(valueStr);
            Pattern p = Pattern.compile("\\b" + escapedStr + "\\b");
            Matcher m = p.matcher(rawText);
            while (m.find()) {
                int startPos = m.start();
//                // don't use the same position for several entities
//                if(!positionAlreadyUsed(startPos, result)) // this test is now done later
                {
                    int endPos = m.end();
                    EntityType type = entity.getType();
                    AbstractEntity<?> ent = AbstractEntity.build(type, startPos, endPos, RecognizerName.SUBEE, valueStr);
                    result.add(ent);
                }
            }
        }

        logger.decreaseOffset();
        return result;
    }
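    // Illustration (added comment): if the hyperlink pass produced a sure LOCATION
    // entity "Paris", the loop above also annotates every other whole-word
    // occurrence of "Paris" in the raw text as a possible LOCATION entity;
    // Pattern.quote() keeps special characters in entity names from being
    // interpreted as regex operators.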
    /**
     * Merges two lists of entities: <i>sure</i> entities identified based on hyperlinks alone,
     * and <i>possible</i> entities identified using other means. If some possible entity overlaps
     * with a sure one, then only the sure one is kept. If several possible entities overlap,
     * then the longest one (in terms of string length) is kept.
     *
     * @param sureEntities
     *     Entities for which we are reasonably sure.
     * @param possibleEntities
     *     Entities for which we are less sure.
     * @return
     *     Result of the merging of both lists.
     */
    private List<AbstractEntity<?>> mergeEntityLists(List<AbstractEntity<?>> sureEntities, List<AbstractEntity<?>> possibleEntities) {
        logger.log("Start merging sure and possible entity lists");
        logger.increaseOffset();
        ArrayList<AbstractEntity<?>> result = new ArrayList<AbstractEntity<?>>();

        // add all sure entities
        logger.log("Add all sure entities (" + sureEntities.size() + " entities)");
        result.addAll(sureEntities);

        // remove overlapping possible entities (keeping the longest ones)
        logger.log("Remove overlapping possible entities (" + possibleEntities.size() + " entities)");
        filterRedundancy(possibleEntities);
        logger.log("Removal complete (" + possibleEntities.size() + " entities remaining)");

        // add to the result only the possible entities with no overlap with sure ones
        logger.log("Adding remaining entities to the sure ones, avoiding overlaps");
        for (AbstractEntity<?> entity : possibleEntities) {
            AbstractEntity<?> e = positionAlreadyUsed(entity, sureEntities);
            if (e == null)
                result.add(entity);
        }

        logger.decreaseOffset();
        logger.log("Merging complete: " + result.size() + " entities in total");
        return result;
    }

    /////////////////////////////////////////////////////////////////
    // DEMONYMS      /////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////
    /** Whether demonyms should be discarded ({@code true}) or processed like any other string ({@code false}) */
    private boolean discardDemonyms;
    /** Set of demonyms (loaded from a file) */
    private static final Set<String> DEMONYMS = new TreeSet<String>();

    /**
     * Loads the list of demonyms. The file is supposed to contain only
     * unambiguous demonyms, i.e. strings which are not at the same time
     * the adjective and the name of the place, since we want to keep
     * actual locations.
     */
    private synchronized void loadDemonyms() {
        if (DEMONYMS.isEmpty()) {
            logger.log("Loading demonyms");
            logger.increaseOffset();

            // set up file path
            String path = FileNames.FO_CUSTOM_LISTS + File.separator + FileNames.FI_DEMONYMS;
            File file = new File(path);

            // retrieve demonyms
            try {
                Scanner scanner = FileTools.openTextFileRead(file);
                while (scanner.hasNextLine()) {
                    String line = scanner.nextLine().trim();
                    DEMONYMS.add(line);
                }
                scanner.close();
            }
            catch (FileNotFoundException e) {
                e.printStackTrace();
            }

            logger.decreaseOffset();
            logger.log("Loading complete");
        }
    }
}