Example usage for opennlp.tools.util Span getEnd

List of usage examples for opennlp.tools.util Span getEnd

Introduction

On this page you can find example usage of opennlp.tools.util Span getEnd.

Prototype

public int getEnd() 

Source Link

Document

Return the end of a span.

Usage

From source file:opennlp.tools.util.Span.java

  /**
 * Returns true is the specified span crosses this span.
 * /*  w w w  . java2s.c om*/
 * @param s The span to compare with this span.
 * 
 * @return true is the specified span overlaps this span and contains a 
 * non-overlapping section; false otherwise.
 */
public boolean crosses(Span s) {
  int sstart = s.getStart();
  //either s's start is in this or this' start is in s
  return !this.contains(s) && !s.contains(this) && 
    (getStart() <= sstart && sstart < getEnd() ||
    sstart <= getStart() && getStart() < s.getEnd());
}

From source file:opennlp.tools.apps.relevanceVocabs.PhraseProcessor.java

public ArrayList<String> getNounPhrases(Parse p) {
    // Accumulates the covered text of every noun phrase in the parse tree.
    ArrayList<String> result = new ArrayList<String>();

    for (Parse child : p.getChildren()) {
        if (child.getType().equals("NP") && allChildNodesArePOSTags(child)) {
            // Leaf-level NP: slice its covered text out of the sentence.
            Span span = child.getSpan();
            result.add(p.getText().substring(span.getStart(), span.getEnd()));
        } else if (!child.isPosTag()) {
            // Non-terminal child: search its subtree recursively.
            result.addAll(getNounPhrases(child));
        }
    }

    return result;
}

From source file:opennlp.tools.apps.relevanceVocabs.PhraseProcessor.java

/**
 * Collects the covered text of every verb phrase (type starting with "VB")
 * in the given parse tree.
 *
 * @param p the parse tree to search
 * @return the text of each verb phrase found, in traversal order
 */
public ArrayList<String> getVerbPhrases(Parse p) {
    ArrayList<String> verbPhrases = new ArrayList<String>();

    Parse[] subparses = p.getChildren();
    for (int pi = 0; pi < subparses.length; pi++) {

        if (subparses[pi].getType().startsWith("VB") && allChildNodesArePOSTags(subparses[pi])) {
            Span _span = subparses[pi].getSpan();
            verbPhrases.add(p.getText().substring(_span.getStart(), _span.getEnd()));
        } else if (!subparses[pi].isPosTag()) {
            // BUG FIX: this previously recursed into getNounPhrases(), so
            // nested subtrees contributed noun phrases to the verb-phrase
            // result (copy-paste error from getNounPhrases above).
            verbPhrases.addAll(getVerbPhrases(subparses[pi]));
        }
    }

    return verbPhrases;
}

From source file:org.apache.lucene.analysis.jate.OpenNLPTokenizer.java

/**
 * Emits the next token by walking word-by-word through the pre-computed
 * sentence and word spans. On the first call the full input is loaded and
 * segmented; returns false (and re-arms {@code first} so a reused stream
 * reloads) once every word of every sentence has been emitted.
 */
@Override
public final boolean incrementToken() throws IOException {
    // Lazy initialization: segment the whole input on the first call.
    if (first) {
        loadAll();
        restartAtBeginning();
        first = false;
    }
    if (sentences.length == 0) {
        // Empty input: nothing to emit.
        first = true;
        return false;
    }
    int sentenceOffset = sentences[indexSentence].getStart();
    if (wordSet == null) {
        wordSet = words[indexSentence];
    }
    clearAttributes();

    while (indexSentence < sentences.length) {
        // Skip ahead to the next sentence that still has unconsumed words.
        while (indexWord == wordSet.length) {
            indexSentence++;
            if (indexSentence < sentences.length) {
                wordSet = words[indexSentence];
                indexWord = 0;
                sentenceOffset = sentences[indexSentence].getStart();
            } else {
                // All sentences consumed.
                first = true;
                return false;
            }
        }
        // set termAtt from private buffer
        Span sentence = sentences[indexSentence];
        Span word = wordSet[indexWord];

        // Word spans are sentence-relative; add the sentence start to get
        // the absolute character position in fullText.
        int spot = sentence.getStart() + word.getStart();
        termAtt.setEmpty();
        int termLength = word.getEnd() - word.getStart();
        if (termAtt.buffer().length < termLength) {
            termAtt.resizeBuffer(termLength);
        }
        termAtt.setLength(termLength);
        char[] buffer = termAtt.buffer();
        finalOffset = correctOffset(sentenceOffset + word.getEnd());
        int start = correctOffset(word.getStart() + sentenceOffset);

        // Copy the term's characters out of the original text.
        for (int i = 0; i < termLength; i++) {
            buffer[i] = fullText[spot + i];
        }

        //safeguard tweak to avoid invalid token offsets, see issue 26 on github
        if (finalOffset - start > termLength) {
            offsetAtt.setOffset(start, start + termLength);
            LOG.warn(
                    "Invalid token start and end offsets diff greater than term length. End offset is reset to be start+tokenlength. "
                            + "start=" + start + ", invalid end=" + finalOffset + ", termlength=" + termLength
                            + ". See Issue 26 on JATE webpage");
        } else
            offsetAtt.setOffset(start, finalOffset);

        addSentenceContext(sentenceContextAtt, indexWord, indexWord, null, indexSentence);

        indexWord++;

        return true;
    }
    first = true;
    return false;
}

From source file:org.apache.lucene.analysis.jate.OpenNLPTokenizer.java

/**
 * Tokenizes sentence {@code i} of the detected sentences and stores the
 * resulting word spans (offsets relative to the sentence) in words[i].
 */
void splitWords(int i) {
    Span bounds = sentences[i];
    int length = bounds.getEnd() - bounds.getStart();
    // Materialize just this sentence's characters from the full text.
    String sentenceText = new String(fullText, bounds.getStart(), length);
    words[i] = tokenizerOp.tokenizePos(sentenceText);
}

From source file:org.dbpedia.spotlight.spot.NESpotter.java

/**
 * Runs the given OpenNLP name-finder model over the text and wraps each
 * detected name as a SurfaceFormOccurrence tagged with the given type URI.
 *
 * @param nameFinderModel the trained TokenNameFinderModel to apply
 * @param text the input document
 * @param oType the ontology type URI recorded as a "type" feature on each occurrence
 * @return the occurrences found, with offsets relative to the whole text
 */
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    // Despite the variable name, getStart() of each detected span is the
    // sentence's start offset in the full text; used to globalize offsets.
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }

    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);

    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
    Tokenizer tokenizer = new SimpleTokenizer();
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];

        // extract the names in the current sentence
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);
        // NOTE(review): probabilities are fetched but never used below.
        double[] probs = finder.probs();

        if (nameSpans != null && nameSpans.length > 0) {
            for (Span span : nameSpans) {
                // Rebuild the surface form by joining the span's tokens.
                StringBuilder buf = new StringBuilder();
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                if (surfaceFormStr.contains(".")) {
                    // Presumably re-derives the phrase from the original
                    // sentence when periods break naive token joining —
                    // TODO confirm against correctPhrase's implementation.
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }

                // Name spans are token indexes; map to character offsets via
                // sentence-relative token spans plus the sentence's position.
                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
                // NOTE(review): entEnd is computed but never used; the
                // occurrence below is constructed from entStart only.
                int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd();

                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }

    }
    // Drop document-level adaptive context so the next call starts fresh.
    finder.clearAdaptiveData();

    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}

From source file:org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java

/**Extracts noun-phrase n-grams from the given piece of input text. 
 * @param text  A Text object containing the input from where to extract NP n-grams
 * @return A list of SurfaceFormOccurrence objects.
 *//*from  w  w  w.j  a  v  a 2  s . c  om*/
protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
    String intext = text.text();
    //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n");
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel) posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel);

    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(intext).toString();
        int start = sentSpan.getStart();
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        // System.out.println("\n\nTokens:");
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
            // System.out.println(tokens[i]);
        }
        String[] tags = posTagger.tag(tokens);
        Span[] chunks = chunker.chunkAsSpans(tokens, tags);
        for (Span chunk : chunks) {
            if ("NP".equals(chunk.getType())) {
                //Note: getStart()/getEnd() methods of Chunk spans only give the start and end token indexes of the chunk.
                //The actual Start/End positions of the chunk in the sentence need to be extracted from POS sentenceSpans.
                //They are offsets from the begining of the sentence in question. Need to add the start postion of the sentence
                //to compute the actual start/end offsets from the begining of the input text.
                int begin = tokSpans[chunk.getStart()].getStart();
                int end = tokSpans[chunk.getEnd() - 1].getEnd();
                List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() + -1);
                extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst);
            }
        }
    }
    return npNgramSFLst;
}

From source file:org.wso2.uima.collectionProccesingEngine.analysisEngines.LocationIdentifier.java

/**
 * Detects location mentions in the document text and adds one
 * LocationIdentification annotation per detected location to the CAS.
 *
 * @param jcas the CAS holding the document text to analyze
 * @throws AnalysisEngineProcessException declared by the UIMA contract
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    String text = jcas.getDocumentText();
    Span[] sentSpans = sentenceDetector.sentPosDetect(jcas.getDocumentText());

    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(text).toString();
        int start = sentSpan.getStart();
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        }

        logger.debug("Tweet Text: " + jcas.getDocumentText());
        Span[] locationSpans = locationFinder.find(tokens);
        for (Span location : locationSpans) {
            // BUG FIX: a single annotation was previously created outside
            // this loop and mutated/re-indexed for every location, so all
            // detections collapsed into one annotation carrying only the
            // last span's offsets. Create a fresh annotation per location.
            LocationIdentification annotation = new LocationIdentification(jcas);
            // Token spans are sentence-relative; add the sentence start to
            // get document offsets.
            annotation.setBegin(start + tokSpans[location.getStart()].getStart());
            annotation.setEnd(start + tokSpans[location.getEnd() - 1].getEnd());
            annotation.addToIndexes(jcas);
            logger.info("Location Detected : " + annotation.getCoveredText());
        }

        if (locationSpans.length == 0) {
            logger.info("Location Unable to be Detected");
        }

    }
}