pt.ua.tm.trigner.documents.Documents.java Source code

Java tutorial

Introduction

Here is the source code for pt.ua.tm.trigner.documents.Documents.java

Source

/*
 * Copyright (c) 2012 David Campos, University of Aveiro and Erasmus Medical Center.
 *
 * This project is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
 * To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/.
 *
 * This project is a free software, you are free to copy, distribute, change and transmit it. However, you may not use
 * it for commercial purposes.
 *
 * It is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package pt.ua.tm.trigner.documents;

import com.google.common.collect.Multimap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pt.ua.tm.gimli.corpus.Corpus;
import pt.ua.tm.gimli.corpus.Sentence;
import pt.ua.tm.gimli.corpus.Token;

import java.io.*;
import java.util.*;

/**
 * An ordered, serializable list of {@link Corpus} instances with helpers to look documents up by
 * identifier, split the list into consecutive or random partitions, prune token features, and
 * read/write the whole collection using Java object serialization.
 *
 * <p>Serialization note: this class does not declare a {@code serialVersionUID}, so the default one
 * is derived from the class shape (implemented interfaces and non-private members included).
 * Adding, removing or re-typing non-private members may therefore make previously written streams
 * unreadable. This is why the redundant {@code Iterable} clause and {@link #iterator()} override are
 * kept.
 *
 * @author david
 */
public class Documents extends ArrayList<Corpus> implements Iterable<Corpus>, Serializable {

    /**
     * {@link org.slf4j.Logger} to be used in the class.
     */
    private static final Logger logger = LoggerFactory.getLogger(Documents.class);

    /**
     * Creates an empty collection of documents.
     */
    public Documents() {
        super();
    }

    /**
     * Creates a collection holding the elements of {@code c}, in the order returned by its iterator.
     *
     * @param c the documents to copy into the new collection
     * @throws NullPointerException if {@code c} is {@code null}
     */
    public Documents(Collection<? extends Corpus> c) {
        super(c);
    }

    /**
     * Deserializes a {@code Documents} instance previously stored with {@link #write(OutputStream)}.
     *
     * <p>The stream is always closed by this method, whether or not reading succeeds.
     *
     * <p>NOTE(review): this relies on Java-native deserialization, which must only be fed trusted
     * data.
     *
     * @param input the stream to read from; closed on return
     * @return the deserialized documents
     * @throws IOException            if the stream cannot be read
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     * @throws ClassCastException     if the stream does not contain a {@code Documents} object
     */
    public static Documents read(InputStream input) throws IOException, ClassNotFoundException {
        // Both resources are declared so that 'input' is closed even if the ObjectInputStream
        // constructor itself fails (e.g. on a corrupt stream header).
        try (InputStream in = input; ObjectInputStream obj = new ObjectInputStream(in)) {
            return (Documents) obj.readObject();
        }
    }

    /**
     * Finds the first corpus whose {@code getIdentifier()} equals the given identifier.
     *
     * @param id the identifier to look for
     * @return the first matching corpus, or {@code null} if there is none (or {@code id} is
     * {@code null})
     */
    public Corpus getByID(String id) {
        if (id == null) {
            return null;
        }
        for (Corpus c : this) {
            // Comparing from 'id' keeps the lookup safe for corpora without an identifier.
            if (id.equals(c.getIdentifier())) {
                return c;
            }
        }
        return null;
    }

    /**
     * Splits this collection, in its current order, into consecutive partitions sized according to
     * the given ratios.
     *
     * <p>Partition {@code i} receives {@code (int) (ratios[i] * size())} documents; documents left
     * over because of truncation (or because the ratios sum to less than 1) go to the last
     * partition.
     *
     * @param ratios the fraction of documents for each partition
     * @return one new {@code Documents} per ratio, sharing the {@link Corpus} instances of this list
     * @throws IllegalArgumentException if {@code ratios} is empty, contains values that yield a
     *                                  negative size, or asks for more documents than available
     */
    public Documents[] splitInOrder(double[] ratios) {
        return splitInOrder(ratios, this);
    }

    /**
     * Converts ratios into cumulative end offsets over {@code documents} and delegates to the
     * offset-based split.
     */
    private Documents[] splitInOrder(double[] ratios, Documents documents) {
        if (ratios.length == 0) {
            throw new IllegalArgumentException("At least one ratio must be provided.");
        }

        final int size = documents.size();
        int[] boundaries = new int[ratios.length];
        int total = 0;
        for (int i = 0; i < ratios.length; i++) {
            // Truncation is intentional; the remainder is handed to the last partition below.
            total += (int) (ratios[i] * size);
            boundaries[i] = total;
        }

        if (total > size) {
            throw new IllegalArgumentException("The provided ratios " + Arrays.toString(ratios)
                    + " select " + total + " documents, but only " + size + " are available.");
        }

        // The last partition always extends to the end of the list.
        boundaries[boundaries.length - 1] = size;

        return splitInOrder(boundaries, documents);
    }

    /**
     * Splits this collection, in its current order, into consecutive partitions delimited by the
     * given cumulative end offsets.
     *
     * <p>Partition {@code i} holds the elements from {@code numbers[i - 1]} (0 for the first
     * partition) inclusive up to {@code numbers[i]} exclusive. For example, a list of 10 documents
     * split with {@code {7, 10}} yields partitions of 7 and 3 documents.
     *
     * @param numbers non-decreasing end offsets; the last one must equal {@code size()}
     * @return one new {@code Documents} per offset, sharing the {@link Corpus} instances of this list
     * @throws IllegalArgumentException if {@code numbers} is empty, is not non-negative and
     *                                  non-decreasing, or does not end at {@code size()}
     */
    public Documents[] splitInOrder(int[] numbers) {
        return splitInOrder(numbers, this);
    }

    /**
     * Cuts {@code documents} into consecutive partitions at the given cumulative end offsets.
     */
    private Documents[] splitInOrder(int[] numbers, Documents documents) {
        if (numbers.length == 0) {
            throw new IllegalArgumentException("At least one split boundary must be provided.");
        }

        // Validate everything up front so a bad request fails with a clear message instead of an
        // exception raised from subList() half-way through.
        final int size = documents.size();
        int previous = 0;
        for (int boundary : numbers) {
            if (boundary < previous) {
                throw new IllegalArgumentException(
                        "Split boundaries must be non-negative and non-decreasing: "
                                + Arrays.toString(numbers));
            }
            previous = boundary;
        }
        if (previous != size) {
            throw new IllegalArgumentException("The last split boundary (" + previous
                    + ") must equal the number of documents (" + size + ").");
        }

        Documents[] parts = new Documents[numbers.length];
        int start = 0;
        for (int i = 0; i < numbers.length; i++) {
            int end = numbers[i];
            parts[i] = new Documents(documents.subList(start, end));
            start = end;
        }
        return parts;
    }

    /**
     * Same as {@link #splitInOrder(double[])}, but applied to a randomly shuffled copy of this
     * collection. This collection itself is not reordered.
     *
     * @param ratios the fraction of documents for each partition
     * @return one new {@code Documents} per ratio
     * @throws IllegalArgumentException under the same conditions as {@link #splitInOrder(double[])}
     */
    public Documents[] splitRandom(double[] ratios) {
        return splitInOrder(ratios, shuffledCopy());
    }

    /**
     * Same as {@link #splitInOrder(int[])}, but applied to a randomly shuffled copy of this
     * collection. This collection itself is not reordered.
     *
     * @param numbers non-decreasing cumulative end offsets; the last one must equal {@code size()}
     * @return one new {@code Documents} per offset
     * @throws IllegalArgumentException under the same conditions as {@link #splitInOrder(int[])}
     */
    public Documents[] splitRandom(int[] numbers) {
        return splitInOrder(numbers, shuffledCopy());
    }

    /**
     * Returns a shallow copy of this collection in random order.
     */
    private Documents shuffledCopy() {
        Documents copy = new Documents(this);
        // Default randomness avoids the identical permutations a millisecond-clock seed produces
        // for calls made within the same millisecond.
        Collections.shuffle(copy);
        return copy;
    }

    /**
     * Removes, from every token of every sentence of every corpus, all features whose key is not
     * listed in {@code except}.
     *
     * <p>The multimap returned by {@code Token.getFeaturesMap()} is modified in place.
     *
     * @param except the feature keys to keep
     * @throws NullPointerException if {@code except} is {@code null}
     */
    public void cleanFeatures(final String[] except) {
        // A hash set gives constant-time membership checks for each feature key.
        Set<String> keep = new HashSet<>(Arrays.asList(except));
        for (Corpus corpus : this) {
            for (Sentence sentence : corpus) {
                for (Token token : sentence) {
                    Multimap<String, String> features = token.getFeaturesMap();
                    // keySet() is a live view: dropping a key drops all of its values.
                    features.keySet().retainAll(keep);
                }
            }
        }
    }

    /**
     * Returns the iterator inherited from {@link ArrayList}.
     *
     * <p>Functionally redundant, but kept because removing a public method would alter the default
     * {@code serialVersionUID} of this class.
     */
    @Override
    public Iterator<Corpus> iterator() {
        return super.iterator();
    }

    /**
     * Serializes this collection to the given stream using Java object serialization.
     *
     * <p>The stream is always closed by this method, whether or not writing succeeds.
     *
     * @param output the stream to write to; closed on return
     * @throws IOException if the collection cannot be written
     */
    public void write(OutputStream output) throws IOException {
        // Both resources are declared so that 'output' is closed even if the ObjectOutputStream
        // constructor itself fails.
        try (OutputStream out = output; ObjectOutputStream obj = new ObjectOutputStream(out)) {
            obj.writeObject(this);
        }
    }
}