Example usage for opennlp.tools.util Span getStart

List of usage examples for opennlp.tools.util Span getStart

Introduction

On this page you can find example usage for opennlp.tools.util Span getStart.

Prototype

public int getStart() 

Source Link

Document

Return the start of a span.

Usage

From source file:opennlp.tools.util.Span.java

  /**
   * Compares this span with another object for equality.
   * Two spans are considered equal exactly when both their start and end
   * offsets match.
   *
   * NOTE(review): hashCode() is not visible in this chunk — confirm it is
   * overridden consistently with this equals() elsewhere in the class.
   *
   * @param o the object to compare against
   * @return true if {@code o} is a {@code Span} with the same start and end
   */
public boolean equals(Object o) {
  if (o == this) {
    return true;
  }
  if (!(o instanceof Span)) {
    return false;
  }
  Span other = (Span) o;
  return getStart() == other.getStart() && getEnd() == other.getEnd();
}

From source file:com.civis.utils.opennlp.models.address.AddressSpanBuilder.java

/**
 * Splits the token range covered by {@code originalSpan} into address parts:
 * street, street number, zip, city and (optionally) country.
 *
 * Assumes the tokens are ordered as "street... number zip city [country]" —
 * TODO confirm against createStreetSpan's contract.
 *
 * @param tokens the tokenized address text
 */
private void parse(String[] tokens) {
    // Street name: boundary detection is delegated to createStreetSpan.
    Span streetSpan = createStreetSpan(originalSpan.getStart(), originalSpan.getEnd(), tokens);
    street = buildString(streetSpan, tokens);
    // Street number: the single token immediately after the street.
    Span streetNumberSpan = new Span(streetSpan.getEnd(), streetSpan.getEnd() + 1);
    streetNumber = buildString(streetNumberSpan, tokens);
    // Zip: the single token after the street number.
    Span zipSpan = new Span(streetNumberSpan.getEnd(), streetNumberSpan.getEnd() + 1);
    zip = buildString(zipSpan, tokens);
    // Strip punctuation that may cling to the zip token.
    zip = zip.replaceAll("[+.^:,]", "");
    if (StringUtils.isBlank(zip)) {
        // token included only special chars like , or .
        // try the next token as the zip instead;
        // use case: "Lindenstr. 19 , 12207 Berlin"
        zipSpan = new Span(zipSpan.getStart() + 1, zipSpan.getEnd() + 1);
        zip = buildString(zipSpan, tokens);
    }

    // Look the zip up in the CSV address data; a hit fixes the city and
    // implies a German address.
    CSVAddressData csvAddressData = findAddressDataByZip(zip);
    if (csvAddressData != null) {
        city = csvAddressData.getCity();
        country = "Deutschland";
    } else {
        // No zip match: everything after the zip is "city [country]".
        String cityAndMaybeCountry = buildString(zipSpan.getEnd(), originalSpan.getEnd(), tokens);
        country = tryToFindCountry(cityAndMaybeCountry);
        if (country == null) {
            // no country found, so the remaining string is just the city
            city = cityAndMaybeCountry;
        } else {
            city = cityAndMaybeCountry.replace(country, "").trim();
        }
    }
}

From source file:opennlp.tools.apps.relevanceVocabs.PhraseProcessor.java

/**
 * Recursively collects the covered text of every noun phrase ("NP") node
 * whose children are all POS tags (i.e. leaf-level noun phrases).
 *
 * @param p the parse (sub)tree to scan
 * @return the text of each leaf-level NP, in traversal order
 */
public ArrayList<String> getNounPhrases(Parse p) {
    ArrayList<String> nounphrases = new ArrayList<String>();

    Parse[] subparses = p.getChildren();
    for (int pi = 0; pi < subparses.length; pi++) {
        if (subparses[pi].getType().equals("NP") && allChildNodesArePOSTags(subparses[pi])) {
            // Leaf-level NP: extract its covered text via its character span.
            Span span = subparses[pi].getSpan();
            nounphrases.add(p.getText().substring(span.getStart(), span.getEnd()));
        } else if (!subparses[pi].isPosTag()) {
            // Interior node: recurse. (Redundant cast to Parse removed —
            // subparses is already Parse[].)
            nounphrases.addAll(getNounPhrases(subparses[pi]));
        }
    }

    return nounphrases;
}

From source file:opennlp.tools.apps.relevanceVocabs.PhraseProcessor.java

/**
 * Recursively collects the covered text of every verb phrase (type starting
 * with "VB") whose children are all POS tags (i.e. leaf-level verb phrases).
 *
 * @param p the parse (sub)tree to scan
 * @return the text of each leaf-level verb phrase, in traversal order
 */
public ArrayList<String> getVerbPhrases(Parse p) {
    ArrayList<String> verbPhrases = new ArrayList<String>();

    Parse[] subparses = p.getChildren();
    for (int pi = 0; pi < subparses.length; pi++) {
        if (subparses[pi].getType().startsWith("VB") && allChildNodesArePOSTags(subparses[pi])) {
            // Leaf-level verb phrase: extract its covered text via its span.
            Span span = subparses[pi].getSpan();
            verbPhrases.add(p.getText().substring(span.getStart(), span.getEnd()));
        } else if (!subparses[pi].isPosTag()) {
            // BUG FIX: the original recursed via getNounPhrases(), a
            // copy-paste error that collected noun phrases from subtrees
            // instead of verb phrases.
            verbPhrases.addAll(getVerbPhrases(subparses[pi]));
        }
    }

    return verbPhrases;
}

From source file:org.apache.lucene.analysis.jate.OpenNLPTokenizer.java

/**
 * Emits the next word token from the pre-computed sentence and word spans.
 * Word spans are relative to their sentence; the sentence start offset is
 * added to produce absolute positions in {@code fullText}.
 *
 * @return true if a token was produced, false when the input is exhausted
 * @throws IOException declared for the TokenStream contract (propagated from loadAll)
 */
@Override
public final boolean incrementToken() throws IOException {
    if (first) {
        // First call (or restart after exhaustion): load the full text plus
        // sentence/word spans and reset the cursors.
        loadAll();
        restartAtBeginning();
        first = false;
    }
    if (sentences.length == 0) {
        // Nothing to emit; arm "first" so a reused stream reloads.
        first = true;
        return false;
    }
    int sentenceOffset = sentences[indexSentence].getStart();
    if (wordSet == null) {
        wordSet = words[indexSentence];
    }
    clearAttributes();

    // Effectively runs once per call: either emits a token and returns true,
    // or exhausts all sentences in the inner advance and returns false.
    while (indexSentence < sentences.length) {
        // Current sentence exhausted: advance to the next sentence that
        // still has words, updating the sentence-relative offset base.
        while (indexWord == wordSet.length) {
            indexSentence++;
            if (indexSentence < sentences.length) {
                wordSet = words[indexSentence];
                indexWord = 0;
                sentenceOffset = sentences[indexSentence].getStart();
            } else {
                first = true;
                return false;
            }
        }
        // set termAtt from private buffer
        Span sentence = sentences[indexSentence];
        Span word = wordSet[indexWord];

        // Absolute index of the word's first character in fullText.
        int spot = sentence.getStart() + word.getStart();
        termAtt.setEmpty();
        int termLength = word.getEnd() - word.getStart();
        if (termAtt.buffer().length < termLength) {
            termAtt.resizeBuffer(termLength);
        }
        termAtt.setLength(termLength);
        char[] buffer = termAtt.buffer();
        finalOffset = correctOffset(sentenceOffset + word.getEnd());
        int start = correctOffset(word.getStart() + sentenceOffset);

        // Copy the token's characters into the term attribute buffer.
        for (int i = 0; i < termLength; i++) {
            buffer[i] = fullText[spot + i];
        }

        //safeguard tweak to avoid invalid token offsets, see issue 26 on github
        if (finalOffset - start > termLength) {
            // Clamp the end offset so (end - start) never exceeds the term
            // length; log the anomaly instead of producing invalid offsets.
            offsetAtt.setOffset(start, start + termLength);
            LOG.warn(
                    "Invalid token start and end offsets diff greater than term length. End offset is reset to be start+tokenlength. "
                            + "start=" + start + ", invalid end=" + finalOffset + ", termlength=" + termLength
                            + ". See Issue 26 on JATE webpage");
        } else
            offsetAtt.setOffset(start, finalOffset);

        addSentenceContext(sentenceContextAtt, indexWord, indexWord, null, indexSentence);

        indexWord++;

        return true;
    }
    first = true;
    return false;
}

From source file:org.apache.lucene.analysis.jate.OpenNLPTokenizer.java

/**
 * Tokenizes sentence {@code i} of the buffered text and stores the resulting
 * word spans (offsets are relative to the sentence start).
 */
void splitWords(int i) {
    Span bounds = sentences[i];
    int length = bounds.getEnd() - bounds.getStart();
    String sentenceText = new String(fullText, bounds.getStart(), length);
    words[i] = tokenizerOp.tokenizePos(sentenceText);
}

From source file:org.dbpedia.spotlight.spot.NESpotter.java

/**
 * Runs the given name-finder model over the text, sentence by sentence, and
 * converts each detected name into a SurfaceFormOccurrence carrying its
 * absolute character offset in the input.
 *
 * @param nameFinderModel the trained token name finder model
 * @param text the input text wrapper
 * @param oType ontology type URI stored as the occurrence's "type" feature
 * @return the surface-form occurrences found across all sentences
 */
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    // Character offset of each sentence's start within the full text.
    // NOTE(review): sized sentences.length + 1 but only sentenceEndings.length
    // entries are filled — confirm the extra slot is intentional.
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }

    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);

    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
    Tokenizer tokenizer = new SimpleTokenizer();
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];

        // extract the names in the current sentence
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);
        // NOTE(review): probs is never read in this method.
        double[] probs = finder.probs();

        if (nameSpans != null && nameSpans.length > 0) {
            for (Span span : nameSpans) {
                // Rebuild the name's surface string from its tokens,
                // space-separated; name spans index tokens, not characters.
                StringBuilder buf = new StringBuilder();
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                if (surfaceFormStr.contains(".")) {
                    // Token-joined text may mangle abbreviations; realign
                    // against the original sentence.
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }

                // tokenspan maps token indexes back to sentence-relative
                // character offsets; sentencePositions lifts them to
                // absolute offsets in the full text.
                // NOTE(review): entEnd is computed but unused below.
                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
                int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd();

                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }

    }
    // Forget adaptive (document-level) context so the finder starts clean.
    finder.clearAdaptiveData();

    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}

From source file:org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java

/**
 * Extracts noun-phrase n-grams from the given piece of input text.
 *
 * @param text a Text object containing the input from which to extract NP n-grams
 * @return a list of SurfaceFormOccurrence objects
 */
protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
    String intext = text.text();
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel) posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel);

    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(intext).toString();
        int start = sentSpan.getStart();
        // Token spans are sentence-relative; the POS tags feed the chunker.
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        }
        String[] tags = posTagger.tag(tokens);
        Span[] chunks = chunker.chunkAsSpans(tokens, tags);
        for (Span chunk : chunks) {
            if ("NP".equals(chunk.getType())) {
                // Chunk spans hold token indexes, not character offsets.
                // Resolution to absolute text offsets happens inside
                // extractNGrams via tokSpans and the sentence start offset.
                // (Removed unused locals begin/end and the "+ -1" oddity.)
                List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() - 1);
                extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst);
            }
        }
    }
    return npNgramSFLst;
}

From source file:org.wso2.uima.collectionProccesingEngine.analysisEngines.LocationIdentifier.java

/**
 * Detects location mentions in the document text and adds one
 * LocationIdentification annotation per detected location to the CAS indexes.
 *
 * @param jcas the CAS holding the document text to annotate
 * @throws AnalysisEngineProcessException per the UIMA AnalysisComponent contract
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    String text = jcas.getDocumentText();
    Span[] sentSpans = sentenceDetector.sentPosDetect(jcas.getDocumentText());

    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(text).toString();
        int start = sentSpan.getStart();
        // Token spans are sentence-relative; "start" lifts them to
        // document-absolute offsets below.
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        }

        logger.debug("Tweet Text: " + jcas.getDocumentText());
        Span[] locationSpans = locationFinder.find(tokens);
        for (Span location : locationSpans) {
            // BUG FIX: the original created a single annotation outside the
            // loop and reused it for every location, so all indexed entries
            // shared (and were overwritten to) the last location's bounds.
            // Each location now gets its own annotation instance.
            LocationIdentification annotation = new LocationIdentification(jcas);
            annotation.setBegin(start + tokSpans[location.getStart()].getStart());
            annotation.setEnd(start + tokSpans[location.getEnd() - 1].getEnd());
            annotation.addToIndexes(jcas);
            logger.info("Location Detected : " + annotation.getCoveredText());
        }

        if (locationSpans.length == 0) {
            logger.info("Location Unable to be Detected");
        }

    }
}