pl.edu.icm.coansys.kwdextraction.langident.Profile.java Source code

Java tutorial

Introduction

Here is the source code for pl.edu.icm.coansys.kwdextraction.langident.Profile.java

Source

/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2013 ICM-UW
 * 
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
    
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */
package pl.edu.icm.coansys.kwdextraction.langident;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.HashCodeBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Ranked list of character n-grams describing a text.
 *
 * <p>A profile holds at most {@link #PROFILE_CUTOFF} n-grams when it is built
 * from a text; the n-grams are ordered from the most to the least frequent.
 * Two profiles are compared with {@link #distance(Profile, Profile)}, which
 * sums how far each n-gram of the first profile is displaced in the second.
 * Judging by the package name, profiles are used for language identification.
 *
 * <p>Word boundaries are encoded with an underscore: {@code "_ab"} is a word
 * prefix and {@code "ab_"} a word suffix.
 *
 * @author Lukasz Bolikowski (bolo@icm.edu.pl)
 * @author Gra (Golebiewski Radoslaw A.) r.golebiewski@icm.edu.pl
 */
public class Profile {

    protected static final char SEPARATION_CHAR = ' ';
    protected static final String UNTERLINE_STRING = "_";
    protected static final String SPACE_STRING = " ";
    protected static final String EMPTY_STRING = "";
    protected static final String LETTERS = "; letters: ";
    protected static final String PROCESSED_WORDS = "Processed words: ";
    private static final Logger LOG = LoggerFactory.getLogger(Profile.class);
    /** Maximum number of n-grams kept when a profile is generated from a text. */
    public static final int PROFILE_CUTOFF = 400;
    /** Length of the longest n-gram extracted from the inside of a word. */
    public static final int MAX_GRAM = 5;

    /** Runs of characters that are neither letters nor apostrophes; compiled once. */
    private static final Pattern NON_WORD_CHARS = Pattern.compile("[^\\p{L}']+");

    /** N-grams in rank order (index 0 is the most frequent one). */
    private final List<String> data;
    /** Reverse lookup: n-gram to its index in {@link #data}. */
    private final Map<String, Integer> hash;

    /**
     * An n-gram together with its occurrence count, ordered so that a sorted
     * collection yields the most frequent n-grams first.
     */
    private static class TreeElement implements Comparable<TreeElement> {

        private final String s;
        private final int c;

        TreeElement(final String s, final int c) {
            this.s = s;
            this.c = c;
        }

        /**
         * Orders by descending count; ties are broken by the natural
         * (ascending) order of the n-gram strings.
         */
        @Override
        public int compareTo(final TreeElement o) {
            if (c > o.c) {
                return -1;
            }
            if (c < o.c) {
                return 1;
            }
            return s.compareTo(o.s);
        }

        @Override
        public int hashCode() {
            return HashCodeBuilder.reflectionHashCode(this);
        }

        @Override
        public boolean equals(Object obj) {
            return EqualsBuilder.reflectionEquals(this, obj);
        }
    }

    /**
     * Reads profile from a stream. The bytes are decoded as UTF-8.
     *
     * @param stream Stream to read profile from
     * @throws IOException if reading from the stream fails
     */
    public Profile(final InputStream stream) throws IOException {
        this(new InputStreamReader(stream, Charset.forName("UTF-8")));
    }

    /**
     * Reads profile from a reader.
     *
     * <p>Each line contributes at most one n-gram: the run of non-whitespace
     * characters at the very beginning of the line. Anything after the first
     * whitespace character is ignored up to the next {@code '\n'}; lines that
     * are empty or start with whitespace contribute nothing. The reader is
     * consumed until end of input and is not closed.
     *
     * @param reader Reader to read profile from
     * @throws IOException if reading from the reader fails
     */
    public Profile(final Reader reader) throws IOException {
        data = new ArrayList<String>(PROFILE_CUTOFF);
        hash = new HashMap<String, Integer>(PROFILE_CUTOFF);

        int ch = reader.read();
        while (ch != -1) {
            // Collect the leading non-whitespace run of the current line.
            final StringBuilder gram = new StringBuilder();
            while (ch != -1 && !isWhitespace((char) ch)) {
                gram.append((char) ch);
                ch = reader.read();
            }

            if (gram.length() > 0) {
                final String g = gram.toString();
                hash.put(g, Integer.valueOf(data.size()));
                data.add(g);
            }

            // Discard the remainder of the line.
            while (ch != -1 && ch != '\n') {
                ch = reader.read();
            }
            // Step over the line terminator to the first character of the next line.
            if (ch == '\n') {
                ch = reader.read();
            }
        }
    }

    /**
     * Tells whether a character belongs to the regular-expression class
     * {@code \s}, i.e. is one of space, tab, line feed, vertical tab,
     * form feed or carriage return.
     */
    private static boolean isWhitespace(final char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\u000B' || c == '\f' || c == '\r';
    }

    /**
     * Writes the profile to a stream as UTF-8, one n-gram per line, in rank
     * order. The stream is flushed but not closed. Write failures are not
     * thrown (the underlying {@link PrintStream} swallows them); they are
     * reported as a warning in the log instead.
     *
     * @param stream Stream to write profile to
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    public void store(final OutputStream stream) throws UnsupportedEncodingException {
        final PrintStream printStream = new PrintStream(stream, false, "UTF-8");
        for (final String gram : data) {
            printStream.println(gram);
        }
        // checkError() flushes the stream and reports any swallowed IOException.
        if (printStream.checkError()) {
            LOG.warn("Failed to write the complete profile to the output stream");
        }
    }

    /** Increments the occurrence count of {@code g} in {@code grams}. */
    private static void addGram(final Map<String, Integer> grams, final String g) {
        final Integer c = grams.get(g);
        grams.put(g, Integer.valueOf((c == null) ? 1 : c.intValue() + 1));
    }

    /**
     * Generates profile from a text.
     *
     * <p>The text is lower-cased (locale-independently), every run of
     * characters other than letters and apostrophes becomes a word separator,
     * and for each word the following n-grams are counted: prefixes and
     * suffixes of length 1 to {@code MAX_GRAM - 1} marked with an underscore,
     * and all substrings of length 1 to {@code MAX_GRAM}. The
     * {@link #PROFILE_CUTOFF} most frequent n-grams form the profile.
     *
     * @param txt Text to profile
     */
    public Profile(final String txt) {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. Turkish would map 'I' to a dotless 'i').
        final String text = NON_WORD_CHARS.matcher(txt.toLowerCase(Locale.ROOT)).replaceAll(SPACE_STRING);
        final String[] words = text.split(SPACE_STRING);

        int processedWords = 0;
        int processedLetters = 0;
        final Map<String, Integer> grams = new HashMap<String, Integer>();
        for (final String word : words) {
            final int len = word.length();
            if (len == 0) {
                // Produced by a leading separator or an empty text; not a word.
                continue;
            }

            // Word-boundary n-grams: "_pre" and "suf_".
            for (int n = 1; n <= MAX_GRAM - 1 && n <= len; n++) {
                addGram(grams, UNTERLINE_STRING + word.substring(0, n));
                addGram(grams, word.substring(len - n, len) + UNTERLINE_STRING);
            }

            // Plain n-grams from every position inside the word.
            for (int n = 1; n <= MAX_GRAM; n++) {
                for (int i = 0; i + n <= len; i++) {
                    addGram(grams, word.substring(i, i + n));
                }
            }
            processedWords += 1;
            processedLetters += len;
        }

        LOG.debug(PROCESSED_WORDS + "{}" + LETTERS + "{}", processedWords, processedLetters);

        // Sort the n-grams by frequency and keep the PROFILE_CUTOFF best ones.
        final TreeSet<TreeElement> ranked = new TreeSet<TreeElement>();
        for (final Map.Entry<String, Integer> me : grams.entrySet()) {
            ranked.add(new TreeElement(me.getKey(), me.getValue().intValue()));
        }

        final int newSize = Math.min(ranked.size(), PROFILE_CUTOFF);
        data = new ArrayList<String>(newSize);
        hash = new HashMap<String, Integer>(newSize);
        for (final TreeElement te : ranked) {
            if (data.size() >= newSize) {
                break;
            }
            hash.put(te.s, Integer.valueOf(data.size()));
            data.add(te.s);
        }
    }

    /**
     * Computes the out-of-place distance from {@code p1} to {@code p2}.
     *
     * <p>Every n-gram of {@code p1} contributes the absolute difference
     * between its rank in {@code p1} and its rank in {@code p2}, or
     * {@link #PROFILE_CUTOFF} when it is absent from {@code p2}. Each slot
     * {@code p1} is short of {@code PROFILE_CUTOFF} entries adds another
     * {@code PROFILE_CUTOFF}. The measure is not symmetric.
     *
     * @param p1 profile whose n-grams are looked up
     * @param p2 profile the n-grams are looked up in
     * @return the distance; never negative
     */
    public static int distance(final Profile p1, final Profile p2) {
        final int size = p1.data.size();
        int distance = 0;
        for (int i = 0; i < size; i++) {
            final Integer j = p2.hash.get(p1.data.get(i));
            distance += (j == null) ? PROFILE_CUTOFF : Math.abs(i - j.intValue());
        }
        // A profile loaded from a stream may exceed PROFILE_CUTOFF entries;
        // the shortfall penalty must not turn into a negative bonus then.
        distance += Math.max(0, PROFILE_CUTOFF - size) * PROFILE_CUTOFF;
        return distance;
    }

    /**
     * Computes the distance from this profile to another one.
     *
     * @param other profile to compare with
     * @return {@code distance(this, other)}
     */
    public int distance(final Profile other) {
        return distance(this, other);
    }

    /**
     * Returns the n-grams in rank order, each followed by a single space
     * (so a non-empty profile yields a string with a trailing space).
     */
    @Override
    public String toString() {
        final StringBuilder builder = new StringBuilder();
        for (final String gram : data) {
            builder.append(gram);
            builder.append(SEPARATION_CHAR);
        }
        return builder.toString();
    }
}