Example usage for opennlp.tools.util Span getType

List of usage examples for opennlp.tools.util Span getType

Introduction

On this page you can find example usages of opennlp.tools.util.Span.getType().

Prototype

public String getType() 

Source Link

Document

Retrieves the type of the span.

Usage

From source file:com.civis.utils.opennlp.models.address.AddressFinderMe.java

/**
 * {@inheritDoc}/*from   w w  w .  j a  v a2 s  .com*/
 */
@Override
public List<AddressSpan> find(String[] tokens) {
    Span[] spans = this.find(tokens, EMPTY);
    if (spans.length == 0) {
        // try to find address with zip code.
        return tryToFindAddressByZip(tokens);
    } else {
        List<Span> fullAddressSpans = new ArrayList<>();
        for (Span span : spans) {
            String spanType = span.getType();
            if (spanType.contains(AddressSpan.PREFIX_TYPE_ADDRESS)) {
                fullAddressSpans.add(span);
            }
        }

        //find probabilities for address
        double[] addressSpanProbs = this.probs(fullAddressSpans);

        //3. add founded contact persons to the result list
        List<AddressSpan> addressSpans = new ArrayList<>();
        for (int i = 0; i < fullAddressSpans.size(); i++) {
            Span fullAddressSpan = fullAddressSpans.get(i);
            double probability = addressSpanProbs[i];
            AddressSpan addressSpan = new AddressSpanBuilder(fullAddressSpan, probability, tokens)
                    .setCountries(countries).setCsvAddressData(csvAddressDataList).build();
            if (addressSpan.isValid()) {
                addressSpans.add(addressSpan);
            }
        }

        return removeDuplicated(addressSpans);
    }
}

From source file:org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java

/**
 * Extracts noun-phrase n-grams from the given piece of input text.
 *
 * <p>The text is split into sentences, each sentence is tokenized, POS-tagged and
 * chunked, and every chunk of type {@code "NP"} is handed to {@code extractNGramPos}
 * and {@code extractNGrams}, which fill the result list.
 *
 * @param text A Text object containing the input from where to extract NP n-grams
 * @return A list of SurfaceFormOccurrence objects.
 */
protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
    String intext = text.text();
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel) posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel);

    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(intext).toString();
        // Start of this sentence within the whole input text.
        int start = sentSpan.getStart();

        // Token spans are computed on the sentence alone, so they are relative to it.
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        }

        String[] tags = posTagger.tag(tokens);
        Span[] chunks = chunker.chunkAsSpans(tokens, tags);
        for (Span chunk : chunks) {
            if ("NP".equals(chunk.getType())) {
                // Note: getStart()/getEnd() of a chunk span only give the start and end token
                // indexes of the chunk, not character positions. The token index range
                // (end made inclusive) goes to extractNGramPos; the token spans and the
                // sentence start go to extractNGrams, presumably so that it can compute
                // offsets from the beginning of the input text (verify against that helper).
                List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() - 1);
                extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst);
            }
        }
    }
    return npNgramSFLst;
}