gate.termraider.bank.AbstractTermbank.java Source code

Java tutorial

Introduction

Here is the source code for gate.termraider.bank.AbstractTermbank.java

Source

/*
 *  Copyright (c) 2008--2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: AbstractTermbank.java 17718 2014-03-20 20:40:06Z adamfunk $
 */
package gate.termraider.bank;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.Resource;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.gui.ActionsPublisher;
import gate.termraider.gui.ActionSaveCsv;
import gate.termraider.output.CsvGenerator;
import gate.termraider.util.ScoreType;
import gate.termraider.util.Term;
import gate.termraider.util.TermComparatorByDescendingScore;
import gate.util.GateException;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import javax.swing.Action;
import org.apache.commons.lang.StringEscapeUtils;

public abstract class AbstractTermbank extends AbstractBank implements ActionsPublisher {

    private static final long serialVersionUID = -1044054380153036770L;

    // additional CREOLE init parameters
    protected Set<String> inputAnnotationTypes;

    // transient to allow serialization
    protected transient List<Action> actionsList;

    protected Map<ScoreType, Map<Term, Number>> scores;
    protected Map<Term, Set<String>> termDocuments;
    public static final String RAW_SUFFIX = ".raw";

    private List<Term> termsByDescendingScore;
    protected boolean descendingScoresDone = false;

    protected List<ScoreType> scoreTypes;
    private Number minDefaultScore, maxDefaultScore;

    public Resource init() throws ResourceInstantiationException {
        prepare();
        initializeScoreTypes();
        // Above method must be set in each subclass;
        // now we check it has been done.
        if (this.scoreTypes.size() == 0) {
            throw new ResourceInstantiationException("No score types found in " + this.toString());
        }
        resetScores();
        processCorpora();
        calculateScores();
        return this;
    }

    public void cleanup() {
        super.cleanup();
    }

    public List<ScoreType> getScoreTypes() {
        return this.scoreTypes;
    }

    public Number getScore(ScoreType type, Term term) {
        Map<Term, Number> ss = this.getScores(type);
        if (ss.containsKey(term)) {
            return ss.get(term);
        }

        // implied else
        return 0;
    }

    public Collection<Term> getTerms() {
        return this.getDefaultScores().keySet();
    }

    public ScoreType getDefaultScoreType() {
        return this.scoreTypes.get(0);
    }

    public Map<Term, Number> getDefaultScores() {
        return this.scores.get(getDefaultScoreType());
    }

    protected abstract void initializeScoreTypes();

    public List<Term> getTermsByDescendingScore() {
        // lazy computation
        if (!descendingScoresDone) {
            termsByDescendingScore = new ArrayList<Term>(this.getTerms());
            Collections.sort(termsByDescendingScore,
                    new TermComparatorByDescendingScore(scores.get(this.getDefaultScoreType())));
            descendingScoresDone = true;
        }
        return this.termsByDescendingScore;
    }

    public Map<Term, Set<String>> getTermDocuments() {
        return this.termDocuments;
    }

    public Map<ScoreType, Number> getScoreMap(Term term) {
        Map<ScoreType, Number> result = new HashMap<ScoreType, Number>();
        for (ScoreType st : this.scoreTypes) {
            result.put(st, this.scores.get(st).get(term));
        }
        return result;
    }

    public Map<Term, Number> getScores(ScoreType st) {
        return this.scores.get(st);
    }

    public Set<String> getDocumentsForTerm(Term term) {
        if (this.termDocuments.containsKey(term)) {
            return this.termDocuments.get(term);
        }

        // implied else: empty set
        return new HashSet<String>();
    }

    private void findMinAndMaxDefaultScores() {
        Collection<Number> values = this.getDefaultScores().values();
        if (values.isEmpty()) {
            minDefaultScore = new Integer(0);
            maxDefaultScore = new Integer(1);
        } else {
            minDefaultScore = values.iterator().next();
            maxDefaultScore = values.iterator().next();
            for (Number n : values) {
                if (n.doubleValue() < minDefaultScore.doubleValue()) {
                    minDefaultScore = n;
                }

                if (n.doubleValue() > maxDefaultScore.doubleValue()) {
                    maxDefaultScore = n;
                }
            }
        }
    }

    public Number getMinScore() {
        // lazy calculation
        if (minDefaultScore == null) {
            findMinAndMaxDefaultScores();
        }
        return minDefaultScore;
    }

    public Number getMaxScore() {
        // lazy calculation
        if (maxDefaultScore == null) {
            findMinAndMaxDefaultScores();
        }
        return maxDefaultScore;
    }

    protected void prepare() throws ResourceInstantiationException {
        if ((corpora == null) || (corpora.size() == 0)) {
            throw new ResourceInstantiationException("No corpora given");
        }

        this.types = new TreeSet<String>();
        this.languages = new TreeSet<String>();
    }

    protected void createActions() {
        actionsList = new ArrayList<Action>();
        actionsList.add(new ActionSaveCsv("Save as CSV...", this));
    }

    protected void processCorpora() {
        for (Corpus corpus : corpora) {
            processCorpus(corpus);
            if (debugMode) {
                System.out.println(
                        "Termbank: added corpus " + corpus.getName() + " with " + corpus.size() + " documents");
            }
        }
    }

    protected void processCorpus(Corpus corpus) {
        for (int i = 0; i < corpus.size(); i++) {
            boolean wasLoaded = corpus.isDocumentLoaded(i);
            Document document = (Document) corpus.get(i);

            processDocument(document);

            // datastore safety
            if (!wasLoaded) {
                corpus.unloadDocument(document);
                Factory.deleteResource(document);
            }
        }
    }

    /* BEHOLD THE GUBBINS to distinguish the various types of Termbanks */

    protected abstract void resetScores();

    protected abstract void processDocument(Document document);

    /**
     * This also needs to fill types and languages
     */
    protected abstract void calculateScores();

    public abstract Map<String, String> getMiscDataForGui();

    /* Methods for saving as CSV */

    public void saveAsCsv(Number threshold, File outputFile) throws GateException {
        CsvGenerator.generateAndSaveCsv(this, threshold, outputFile);
    }

    /**
     * Convenience method to save everything in the termbank.
     * @param outputFile
     * @throws GateException
     */
    public void saveAsCsv(File outputFile) throws GateException {
        saveAsCsv(this.getMinScore(), outputFile);
    }

    @Override
    public List<Action> getActions() {
        // lazy instantiation because actionsList is transient
        if (actionsList == null) {
            createActions();
        }

        return this.actionsList;
    }

    public String getCsvHeader() {
        StringBuilder sb = new StringBuilder();
        sb.append(StringEscapeUtils.escapeCsv("Term"));
        sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
        sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
        for (ScoreType type : this.scoreTypes) {
            sb.append(',').append(StringEscapeUtils.escapeCsv(type.toString()));
        }
        sb.append(getCsvSubheader());
        return sb.toString();
    }

    /**
     * TODO: This is not right (columns).
     * Should be overridden as necessary, for totals etc.
     * Must start with a newline.
     * @return
     */
    protected String getCsvSubheader() {
        StringBuilder sb = new StringBuilder();
        sb.append('\n');
        sb.append(',').append(StringEscapeUtils.escapeCsv("_TOTAL_DOCS_"));
        sb.append(',').append(StringEscapeUtils.escapeCsv(""));
        sb.append(',').append(StringEscapeUtils.escapeCsv(""));
        sb.append(',').append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocumentCount())));
        return sb.toString();
    }

    public String getCsvLine(Term term) {
        StringBuilder sb = new StringBuilder();
        sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
        sb.append(',').append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
        sb.append(',').append(StringEscapeUtils.escapeCsv(term.getType()));
        for (ScoreType type : this.scoreTypes) {
            sb.append(',').append(StringEscapeUtils.escapeCsv(this.getScore(type, term).toString()));
        }
        return sb.toString();
    }

    /***** CREOLE PARAMETERS *****/

    @CreoleParameter(comment = "input annotation types", defaultValue = "SingleWord;MultiWord")
    public void setInputAnnotationTypes(Set<String> names) {
        this.inputAnnotationTypes = names;
    }

    public Set<String> getInputAnnotationTypes() {
        return this.inputAnnotationTypes;
    }

}