it.uniud.ailab.dcore.DistillerFactory.java Source code

Introduction

Here is the source code for it.uniud.ailab.dcore.DistillerFactory.java
Source

/*
 * Copyright (C) 2015 Artificial Intelligence
 * Laboratory @ University of Udine.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package it.uniud.ailab.dcore;

import it.uniud.ailab.dcore.annotation.annotators.*;
import it.uniud.ailab.dcore.io.GramPrinter;
import it.uniud.ailab.dcore.io.SentencePrinter;
import it.uniud.ailab.dcore.utils.FileSystem;
import it.uniud.ailab.dcore.wrappers.external.*;
import java.io.File;
import java.io.IOException;
import java.util.Locale;
import org.springframework.beans.factory.BeanDefinitionStoreException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.context.support.FileSystemXmlApplicationContext;

/**
 * A simple factory that generates the default Distiller configuration either
 * via XML configuration file or via Java code. This class is also meant as a
 * tutorial for the users of this library, who can learn how to instantiate
 * the Distiller object by studying this source code.
 *
 * @author Marco Basaldella
 */
public class DistillerFactory {

    /**
     * Name under which the Distiller bean is looked up in every XML
     * configuration loaded by this factory.
     */
    private static final String DISTILLER_BEAN = "distiller";

    /**
     * Instantiates a Distiller object using the default XML configuration; if
     * it's not available, uses the safer (but less precise) code configuration,
     * which excludes TagMe and inference from the distillation process.
     *
     * @return a Distiller ready to work.
     * @throws BeanDefinitionStoreException if the default XML configuration
     * could not be loaded for a reason other than an I/O failure (for
     * example, because its content is invalid).
     */
    public static Distiller getDefault() {
        try {
            return getDefaultXML();
        } catch (BeanDefinitionStoreException bsde) {
            // An I/O cause means that the configuration file does not exist
            // or is not accessible: in that case, and only in that case, the
            // code-based fallback configuration is used.
            if (bsde.getCause() instanceof IOException) {
                System.out.println("Distiller config file not found: using fallback configuration");
                return getDefaultCode();
            }
            // Any other cause (including no cause at all) is most likely an
            // error inside the configuration file: let the developer see it.
            throw bsde;
        }
    }

    /**
     * Instantiates a Distiller object using the default evaluation
     * configuration, read from the {@code eval.xml} classpath resource.
     *
     * @return a Distiller ready to work.
     * @throws RuntimeException if the evaluation configuration cannot be read
     * because of an I/O failure; the original {@link IOException} is set as
     * the cause.
     * @throws BeanDefinitionStoreException if the configuration could not be
     * loaded for any other reason.
     */
    public static Distiller getDefaultEval() {
        try {
            return distillerFrom(new ClassPathXmlApplicationContext("eval.xml"));
        } catch (BeanDefinitionStoreException bsde) {
            Throwable cause = bsde.getCause();
            if (cause instanceof IOException) {
                // the configuration file does not exist or is not accessible:
                // there is no fallback for the evaluation pipeline.
                throw new RuntimeException("FATAL: Impossible to load the default evaluation pipeline.", cause);
            }
            throw bsde;
        }
    }

    /**
     * Instantiates a Distiller object using the specified configuration and
     * returns it.
     *
     * @param path the path where the config file is located; it may be a
     * relative path, in which case it is resolved by the Java platform.
     * @return a Distiller ready to work.
     * @throws NullPointerException if {@code path} is {@code null}.
     */
    public static Distiller loadFromXML(File path) {
        if (path == null) {
            throw new NullPointerException("path must not be null");
        }

        // FileSystemXmlApplicationContext treats plain paths as relative ones,
        // so the location is handed over as an explicit "file:" URL instead.
        //
        // File.toURI() is used rather than concatenating "file://" with the
        // absolute path: it produces an absolute URI and takes care of
        // platform-specific separators (e.g. Windows drive letters and
        // backslashes) and of characters that need escaping (e.g. spaces).
        String location = path.toURI().toString();
        return distillerFrom(new FileSystemXmlApplicationContext(location));
    }

    /**
     * Instantiates a Distiller object using the default configuration and
     * returns it. Please note that you should create a config.xml file and copy
     * the content of default.xml inside it to get the framework to work.
     *
     * @return a Distiller ready to work.
     */
    public static Distiller getDefaultXML() {
        return distillerFrom(new ClassPathXmlApplicationContext("config.xml"));
    }

    /**
     * Instantiates a Distiller object using a configuration packaged in the
     * Distiller JAR file and returns it.
     *
     * @param configPath the path of the pipeline
     * @return a Distiller ready to work.
     */
    public static Distiller loadFromPackagedXML(String configPath) {
        return distillerFrom(new ClassPathXmlApplicationContext(configPath));
    }

    /**
     * Instantiates a Distiller object configured entirely via Java code,
     * without reading any XML file. This is the configuration used by
     * {@link #getDefault()} when the default XML file is not accessible.
     * <p>
     * The same pipeline instance is registered for both English and Italian.
     *
     * @return a Distiller ready to work.
     */
    public static Distiller getDefaultCode() {
        Distiller d = new Distiller();

        // set the language detector tool
        d.setLanguageDetector(new CybozuLanguageDetectorAnnotator());

        // build the pipeline
        Pipeline p = new Pipeline();

        // split the text
        p.addStage(new OpenNlpBootstrapperAnnotator());

        // annotate tokens with stemming
        p.addStage(new PorterStemmerAnnotator());

        // Uncomment the lines below to add wikipedia tags to tokens using
        // the TagMe service
        // TagMeTokenAnnotator tagme = new TagMeTokenAnnotator();
        // tagme.setApiKey("INSERT KEY HERE");
        // p.addStage(tagme);

        // generate ngrams
        p.addStage(new SimpleNGramGeneratorAnnotator());

        // remove stopwords
        p.addStage(new StopwordSimpleFilterAnnotator());

        // annotate ngrams
        p.addStage(new StatisticalAnnotator());

        // Uncomment to use TagMe
        // p.addStage(new TagMeGramAnnotator());

        // Uncomment to use the emotional intensity annotator.
        // This way you'll see how different annotators lead to different
        // keyphrases detection
        // p.addStage(new SyuzhetAnnotator());

        // evaluate ngram features
        p.addStage(newBaseEvaluator());

        // Uncomment the line below to infer concepts.
        // Watch out: the inference process sends lots of requests to Wikipedia,
        // so it significantly slows down the process
        // p.addStage(new WikipediaInferenceAnnotator());

        // filter results
        p.addStage(new SkylineGramFilterAnnotator());

        // Uncomment to remove redundant grams
        // p.addStage(new GramMergerAnnotator());

        // print the results
        p.addStage(new GramPrinter());
        p.addStage(new SentencePrinter());

        d.addPipeline(Locale.ENGLISH, p);
        d.addPipeline(Locale.ITALIAN, p);

        return d;
    }

    /**
     * Instantiates a Distiller object configured via Java code whose pipeline
     * is bootstrapped by the {@code StanfordBootstrapperAnnotator} and that
     * also runs the coreference resolution and NER chunking annotators. The
     * pipeline is registered for English only.
     *
     * @return a Distiller ready to work.
     */
    public static Distiller getStanfordCode() {
        Distiller d = new Distiller();

        // set the language detector tool
        d.setLanguageDetector(new CybozuLanguageDetectorAnnotator());

        // build the pipeline
        Pipeline p = new Pipeline();

        // split the text
        p.addStage(new StanfordBootstrapperAnnotator());

        // annotate tokens with stemming
        p.addStage(new PorterStemmerAnnotator());

        // Uncomment the lines below to add wikipedia tags to tokens using
        // the TagMe service
        // TagMeTokenAnnotator tagme = new TagMeTokenAnnotator();
        // tagme.setApiKey("INSERT KEY HERE");
        // p.addStage(tagme);

        // generate ngrams
        p.addStage(new SimpleNGramGeneratorAnnotator());

        // remove stopwords
        p.addStage(new StopwordSimpleFilterAnnotator());

        // annotate ngrams
        p.addStage(new StatisticalAnnotator());
        p.addStage(new CoreferenceResolverAnnotator());
        p.addStage(new ChunkingNerAnnotator());

        // Uncomment to use TagMe
        // p.addStage(new TagMeGramAnnotator());

        // Uncomment to use the emotional intensity annotator.
        // This way you'll see how different annotators lead to different
        // keyphrases detection
        // p.addStage(new SyuzhetAnnotator());

        // evaluate ngram features: the base weights plus the ones for the
        // features produced by the coreference and NER annotators.
        // NOTE(review): the base weights already sum to 1.0, so the total
        // here is 1.6 - confirm the evaluator does not expect a normalized sum.
        LinearEvaluatorAnnotator evaluator = newBaseEvaluator();
        evaluator.addWeight(CoreferenceResolverAnnotator.NUMBER_OF_REFERENCE, 0.2);
        evaluator.addWeight(CoreferenceResolverAnnotator.IN_ANAPHORA, 0.2);
        evaluator.addWeight(ChunkingNerAnnotator.IS_NER, 0.2);

        p.addStage(evaluator);

        // print the results
        p.addStage(new GramPrinter());

        d.addPipeline(Locale.ENGLISH, p);

        return d;
    }

    /**
     * Retrieves the Distiller bean from an already loaded application context.
     *
     * @param context the context to read the bean from.
     * @return the bean registered under the name {@code "distiller"}.
     */
    private static Distiller distillerFrom(ApplicationContext context) {
        return (Distiller) context.getBean(DISTILLER_BEAN);
    }

    /**
     * Creates a linear evaluator carrying the feature weights shared by all
     * the code-based configurations of this factory.
     *
     * @return a new evaluator with the six base weights already added.
     */
    private static LinearEvaluatorAnnotator newBaseEvaluator() {
        LinearEvaluatorAnnotator evaluator = new LinearEvaluatorAnnotator();
        evaluator.addWeight(StatisticalAnnotator.DEPTH, 0.15);
        evaluator.addWeight(StatisticalAnnotator.HEIGHT, 0.25);
        evaluator.addWeight(StatisticalAnnotator.LIFESPAN, 0.1);
        evaluator.addWeight(StatisticalAnnotator.FREQUENCY_SENTENCE, 0.1);
        evaluator.addWeight(GenericNGramGeneratorAnnotator.NOUNVALUE, 0.3);
        evaluator.addWeight(GenericWikipediaAnnotator.WIKIFLAG, 0.1);
        return evaluator;
    }

}