opennlp.tools.postag.POSSample.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.postag.POSSample.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.postag;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InvalidFormatException;

/**
 * Represents an pos-tagged sentence.
 */
public class POSSample implements Serializable {

    private List<String> sentence;

    private List<String> tags;

    private final String[][] additionalContext;

    public POSSample(String[] sentence, String[] tags) {
        this(sentence, tags, null);
    }

    public POSSample(List<String> sentence, List<String> tags) {
        this(sentence, tags, null);
    }

    public POSSample(List<String> sentence, List<String> tags, String[][] additionalContext) {
        this.sentence = Collections.unmodifiableList(sentence);
        this.tags = Collections.unmodifiableList(tags);

        checkArguments();
        String[][] ac;
        if (additionalContext != null) {
            ac = new String[additionalContext.length][];

            for (int i = 0; i < additionalContext.length; i++) {
                ac[i] = new String[additionalContext[i].length];
                System.arraycopy(additionalContext[i], 0, ac[i], 0, additionalContext[i].length);
            }
        } else {
            ac = null;
        }
        this.additionalContext = ac;
    }

    public POSSample(String[] sentence, String[] tags, String[][] additionalContext) {
        this(Arrays.asList(sentence), Arrays.asList(tags), additionalContext);
    }

    private void checkArguments() {
        if (sentence.size() != tags.size()) {
            throw new IllegalArgumentException("There must be exactly one tag for each token. tokens: "
                    + sentence.size() + ", tags: " + tags.size());
        }

        if (sentence.contains(null)) {
            throw new IllegalArgumentException("null elements are not allowed in sentence tokens!");
        }
        if (tags.contains(null)) {
            throw new IllegalArgumentException("null elements are not allowed in tags!");
        }
    }

    public String[] getSentence() {
        return sentence.toArray(new String[sentence.size()]);
    }

    public String[] getTags() {
        return tags.toArray(new String[tags.size()]);
    }

    public String[][] getAddictionalContext() {
        return this.additionalContext;
    }

    @Override
    public String toString() {

        StringBuilder result = new StringBuilder();

        for (int i = 0; i < getSentence().length; i++) {
            result.append(getSentence()[i]);
            result.append('_');
            result.append(getTags()[i]);
            result.append(' ');
        }

        if (result.length() > 0) {
            // get rid of last space
            result.setLength(result.length() - 1);
        }

        return result.toString();
    }

    public static POSSample parse(String sentenceString) throws InvalidFormatException {

        String[] tokenTags = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

        String[] sentence = new String[tokenTags.length];
        String[] tags = new String[tokenTags.length];

        for (int i = 0; i < tokenTags.length; i++) {
            int split = tokenTags[i].lastIndexOf("_");

            if (split == -1) {
                throw new InvalidFormatException("Cannot find \"_\" inside token '" + tokenTags[i] + "'!");
            }

            sentence[i] = tokenTags[i].substring(0, split);
            tags[i] = tokenTags[i].substring(split + 1);
        }

        return new POSSample(sentence, tags);
    }

    @Override
    public int hashCode() {
        return Objects.hash(Arrays.hashCode(getSentence()), Arrays.hashCode(getTags()));
    }

    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }

        if (obj instanceof POSSample) {
            POSSample a = (POSSample) obj;

            return Arrays.equals(getSentence(), a.getSentence()) && Arrays.equals(getTags(), a.getTags());
        }

        return this == obj;
    }
}