eu.project.ttc.resources.GeneralLanguageResource.java Source code

Java tutorial

Introduction

Here is the source code for eu.project.ttc.resources.GeneralLanguageResource.java

Source

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package eu.project.ttc.resources;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Scanner;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.uima.resource.DataResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Objects;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;

/**
 * A general language resource loaded from a text stream in which every line
 * has the form {@code lemma::pattern::frequency}.
 *
 * <p>Entries are indexed by lemma. After loading, the resource is either in
 * "corpus words" mode (the {@code PARAM_NB_CORPUS_WORDS} key was found and its
 * frequency is exposed through {@link #getNbCorpusWords()}) or in cumulated
 * frequency mode (the key was absent, see {@link #isCumulatedFrequencyMode()}).
 */
public class GeneralLanguageResource implements GeneralLanguage {
    private static final Logger LOGGER = LoggerFactory.getLogger(GeneralLanguageResource.class);

    /**
     * The number of WordAnnotation in the corpus that has been used
     * to produce this general language resource.
     *
     * This field is used as the reference for frequency normalization.
     * It keeps its initial value of -1 as long as no loaded resource
     * contains the PARAM_NB_CORPUS_WORDS key.
     */
    private int nbCorpusWords = -1;

    /** Set to true by load() when the PARAM_NB_CORPUS_WORDS key is missing. */
    private boolean cumulatedFrequencyMode = false;

    /** Sum of the frequencies of all the lines read so far. */
    private int cumulatedFrequency;

    /** All loaded entries, indexed by lemma. */
    private Multimap<String, Entry> frequencies;

    /** The loaded lemmas that contain no space character. Null until load() is called. */
    private Set<String> words;

    /**
     * One line of the general language resource: a lemma, its pattern and
     * its frequency.
     *
     * <p>Two entries are equal when they have the same lemma and the same
     * pattern; the frequency does not take part in equality.
     */
    public class Entry {
        private final String lemma;
        private final String pattern;
        private final int frequency;

        public Entry(String lemma, String pattern, int frequency) {
            super();
            this.lemma = lemma;
            this.pattern = pattern;
            this.frequency = frequency;
        }

        public String getLemma() {
            return lemma;
        }

        public String getPattern() {
            return pattern;
        }

        public int getFrequency() {
            return frequency;
        }

        @Override
        public int hashCode() {
            return Objects.hashCode(this.lemma, this.pattern);
        }

        /**
         * Compares this entry with another one on (lemma, pattern), i.e. on
         * the same fields as {@link #hashCode()}.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (!(obj instanceof Entry))
                return false;
            Entry other = (Entry) obj;
            // Compare this entry's fields with the other entry's fields
            // (and not the other entry's lemma with its own pattern).
            return Objects.equal(this.lemma, other.lemma)
                    && Objects.equal(this.pattern, other.pattern);
        }
    }

    /**
     * Creates an empty resource. {@link #load(InputStream)} or
     * {@link #load(DataResource)} must be called to populate it.
     */
    public GeneralLanguageResource() {
        this.cumulatedFrequency = 0;
        this.frequencies = HashMultimap.create();
    }

    /**
     * Loads the resource from the input stream of the given data resource.
     *
     * @param data the data resource providing the input stream
     * @throws ResourceInitializationException if the stream cannot be
     *         obtained or its content cannot be loaded
     */
    @Override
    public void load(DataResource data) throws ResourceInitializationException {
        try {
            load(data.getInputStream());
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Reads the given UTF-8 stream line by line. Every line must be made of
     * exactly three items separated by {@code ::} (lemma, pattern, frequency).
     *
     * <p>The scanner built on the stream is closed when this method returns.
     *
     * @param inputStream the stream to read
     * @throws ResourceInitializationException wrapping any exception raised
     *         while reading, including a malformed line or a frequency that
     *         is not an integer
     */
    public void load(InputStream inputStream) throws ResourceInitializationException {
        words = Sets.newHashSet();
        Scanner scanner = null;
        try {
            scanner = new Scanner(inputStream, "UTF-8");
            scanner.useDelimiter("\n");
            int index = 0;
            while (scanner.hasNext()) {
                index++;
                String line = scanner.next();
                String[] items = line.split("::");
                if (items.length == 3) {
                    String lemma = items[0].trim();
                    // Only lemmas without any space are registered as single words.
                    if (!lemma.contains(" "))
                        this.words.add(lemma);
                    int frequency = Integer.parseInt(items[2].trim());
                    this.cumulatedFrequency += frequency;
                    this.frequencies.put(lemma, new Entry(lemma, items[1], frequency));
                } else {
                    throw new IOException("Wrong general language format at line " + index + ": " + line);
                }
            }
            this.words = ImmutableSet.copyOf(this.words);

            if (this.frequencies.containsKey(PARAM_NB_CORPUS_WORDS))
                this.nbCorpusWords = this.frequencies.get(PARAM_NB_CORPUS_WORDS).iterator().next().getFrequency();
            else {
                LOGGER.warn("No such key for in GeneralLanguage resource {}", PARAM_NB_CORPUS_WORDS);
                LOGGER.warn("Switch to cumulatedFrequency mode");
                this.cumulatedFrequencyMode = true;
            }
        } catch (Exception e) {
            throw new ResourceInitializationException(e);
        } finally {
            IOUtils.closeQuietly(scanner);
        }
    }

    /**
     * Returns the frequency of the entry matching the given lemma and pattern.
     *
     * @param lemma the lemma to look up (lower-cased before the lookup)
     * @param pattern the pattern the entry must have
     * @return the frequency of the matching entry, or 1 when there is none
     */
    @Override
    public int getFrequency(String lemma, String pattern) {
        Entry e = getEntry(lemma, pattern);
        return e == null ? 1 : e.getFrequency();
    }

    /**
     * Finds the entry registered under the lower-cased lemma whose pattern
     * equals the given pattern.
     *
     * @param lemma the lemma to look up (lower-cased before the lookup)
     * @param pattern the pattern the entry must have
     * @return the matching entry, or null when there is none
     */
    public Entry getEntry(String lemma, String pattern) {
        for (Entry e : this.frequencies.get(lemma.toLowerCase()))
            if (e.getPattern().equals(pattern))
                return e;
        return null;
    }

    /**
     * @return the frequency read from the PARAM_NB_CORPUS_WORDS entry,
     *         or -1 when no such entry has been loaded
     */
    @Override
    public int getNbCorpusWords() {
        return nbCorpusWords;
    }

    /**
     * Returns the frequency of the first entry registered under the
     * lower-cased key divided by the cumulated frequency, or
     * {@code 1.0 / cumulatedFrequency} when the key has no entry.
     *
     * @param entry the lemma to look up (lower-cased before the lookup)
     * @return the normalized frequency
     * @throws IllegalStateException when the resource is not in cumulated
     *         frequency mode
     */
    @Override
    @Deprecated
    public double getNormalizedFrequency(String entry) {
        if (cumulatedFrequencyMode) {
            Collection<Entry> l = this.frequencies.get(entry.toLowerCase());
            if (l.isEmpty()) {
                return 1.0 / this.cumulatedFrequency;
            } else {
                // Widen to double first so that the division is not an integer division.
                double freq = l.iterator().next().getFrequency();
                return freq / this.cumulatedFrequency;
            }
        } else
            throw new IllegalStateException("Not in cumulatedFrequencyMode. Should call #getFrequency() instead");
    }

    /**
     * @return true when the last load found no PARAM_NB_CORPUS_WORDS key
     */
    @Override
    public boolean isCumulatedFrequencyMode() {
        return cumulatedFrequencyMode;
    }

    /**
     * @param word the word to test
     * @return true when the word is one of the loaded lemmas without space
     */
    @Override
    public boolean findSingleWord(String word) {
        return this.words.contains(word);
    }

    /**
     * @return the loaded lemmas that contain no space character
     *         (an immutable set once load() has completed)
     */
    @Override
    public Set<String> getWords() {
        return words;
    }
}