List of usage examples for opennlp.tools.util.Span#getStart()
public int getStart()
From source file:opennlp.tools.util.Span.java
/**
 * Compares this span with another object for equality.
 *
 * @param o the object to compare against
 * @return {@code true} if {@code o} is a {@link Span} covering exactly the
 *         same start and end offsets as this one
 */
public boolean equals(Object o) {
    // Identity short-circuit: a span is always equal to itself.
    if (o == this) {
        return true;
    }
    if (o instanceof Span) {
        Span other = (Span) o;
        return getStart() == other.getStart() && getEnd() == other.getEnd();
    }
    return false;
}
From source file:com.civis.utils.opennlp.models.address.AddressSpanBuilder.java
private void parse(String[] tokens) { Span streetSpan = createStreetSpan(originalSpan.getStart(), originalSpan.getEnd(), tokens); street = buildString(streetSpan, tokens); Span streetNumberSpan = new Span(streetSpan.getEnd(), streetSpan.getEnd() + 1); streetNumber = buildString(streetNumberSpan, tokens); Span zipSpan = new Span(streetNumberSpan.getEnd(), streetNumberSpan.getEnd() + 1); zip = buildString(zipSpan, tokens);//w w w .j a v a2s . c o m zip = zip.replaceAll("[+.^:,]", ""); if (StringUtils.isBlank(zip)) { // token include only special chars like , or . //try next zip token // use case Lindenstr. 19 , 12207 Berlin zipSpan = new Span(zipSpan.getStart() + 1, zipSpan.getEnd() + 1); zip = buildString(zipSpan, tokens); } CSVAddressData csvAddressData = findAddressDataByZip(zip); if (csvAddressData != null) { city = csvAddressData.getCity(); country = "Deutschland"; } else { String cityAndMaybeCountry = buildString(zipSpan.getEnd(), originalSpan.getEnd(), tokens); country = tryToFindCountry(cityAndMaybeCountry); if (country == null) { // no country found, means rest string is a city string city = cityAndMaybeCountry; } else { city = cityAndMaybeCountry.replace(country, "").trim(); } } }
From source file:opennlp.tools.apps.relevanceVocabs.PhraseProcessor.java
public ArrayList<String> getNounPhrases(Parse p) { ArrayList<String> nounphrases = new ArrayList<String>(); Parse[] subparses = p.getChildren(); for (int pi = 0; pi < subparses.length; pi++) { if (subparses[pi].getType().equals("NP") && allChildNodesArePOSTags(subparses[pi])) { Span _span = subparses[pi].getSpan(); nounphrases.add(p.getText().substring(_span.getStart(), _span.getEnd())); } else if (!((Parse) subparses[pi]).isPosTag()) nounphrases.addAll(getNounPhrases(subparses[pi])); }// w w w . j av a2 s .c om return nounphrases; }
From source file:opennlp.tools.apps.relevanceVocabs.PhraseProcessor.java
public ArrayList<String> getVerbPhrases(Parse p) { ArrayList<String> verbPhrases = new ArrayList<String>(); Parse[] subparses = p.getChildren(); for (int pi = 0; pi < subparses.length; pi++) { if (subparses[pi].getType().startsWith("VB") && allChildNodesArePOSTags(subparses[pi])) { Span _span = subparses[pi].getSpan(); verbPhrases.add(p.getText().substring(_span.getStart(), _span.getEnd())); } else if (!((Parse) subparses[pi]).isPosTag()) verbPhrases.addAll(getNounPhrases(subparses[pi])); }//from w w w . ja v a 2s . c om return verbPhrases; }
From source file:org.apache.lucene.analysis.jate.OpenNLPTokenizer.java
/**
 * Lucene tokenizer contract: advance to the next token, filling the term,
 * offset, and sentence-context attributes from the pre-computed sentence
 * and word spans. Returns false (and re-arms {@code first}) when the text
 * is exhausted.
 *
 * State carried between calls: {@code indexSentence}/{@code indexWord}
 * are the cursor, {@code wordSet} caches the current sentence's word
 * spans, {@code finalOffset} the last emitted end offset.
 */
@Override
public final boolean incrementToken() throws IOException {
    // Lazy one-time initialization on the first call after (re)start.
    if (first) {
        loadAll();
        restartAtBeginning();
        first = false;
    }
    if (sentences.length == 0) {
        first = true;
        return false;
    }
    // Word spans are relative to their sentence; add the sentence start to
    // map them into full-text coordinates.
    int sentenceOffset = sentences[indexSentence].getStart();
    if (wordSet == null) {
        wordSet = words[indexSentence];
    }
    clearAttributes();
    while (indexSentence < sentences.length) {
        // Current sentence exhausted: advance to the next non-empty one.
        while (indexWord == wordSet.length) {
            indexSentence++;
            if (indexSentence < sentences.length) {
                wordSet = words[indexSentence];
                indexWord = 0;
                sentenceOffset = sentences[indexSentence].getStart();
            } else {
                first = true;
                return false;
            }
        }
        // set termAtt from private buffer
        Span sentence = sentences[indexSentence];
        Span word = wordSet[indexWord];
        // Absolute index of the word's first character in fullText.
        int spot = sentence.getStart() + word.getStart();
        termAtt.setEmpty();
        int termLength = word.getEnd() - word.getStart();
        if (termAtt.buffer().length < termLength) {
            termAtt.resizeBuffer(termLength);
        }
        termAtt.setLength(termLength);
        char[] buffer = termAtt.buffer();
        finalOffset = correctOffset(sentenceOffset + word.getEnd());
        int start = correctOffset(word.getStart() + sentenceOffset);
        // Copy the token's characters out of the full text.
        for (int i = 0; i < termLength; i++) {
            buffer[i] = fullText[spot + i];
        }
        //safeguard tweak to avoid invalid token offsets, see issue 26 on github
        if (finalOffset - start > termLength) {
            offsetAtt.setOffset(start, start + termLength);
            LOG.warn(
                    "Invalid token start and end offsets diff greater than term length. End offset is reset to be start+tokenlength. "
                            + "start=" + start + ", invalid end=" + finalOffset + ", termlength=" + termLength
                            + ". See Issue 26 on JATE webpage");
            /* String wordStr = new String(buffer, 0, offsetAtt.endOffset() - offsetAtt.startOffset());
               System.out.println(wordStr); */
        } else
            offsetAtt.setOffset(start, finalOffset);
        addSentenceContext(sentenceContextAtt, indexWord, indexWord, null, indexSentence);
        //System.out.println(sentenceContextAtt.getPayload().utf8ToString()+","+new String(buffer,0, termAtt.length()));
        indexWord++;
        return true;
    }
    first = true;
    return false;
}
From source file:org.apache.lucene.analysis.jate.OpenNLPTokenizer.java
/**
 * Tokenizes sentence {@code i} of the full text and caches the resulting
 * word spans (sentence-relative offsets) in {@code words[i]}.
 */
void splitWords(int i) {
    Span sentenceSpan = sentences[i];
    int begin = sentenceSpan.getStart();
    int length = sentenceSpan.getEnd() - begin;
    String sentenceText = new String(fullText, begin, length);
    words[i] = tokenizerOp.tokenizePos(sentenceText);
}
From source file:org.dbpedia.spotlight.spot.NESpotter.java
/**
 * Runs an OpenNLP name finder over the text sentence by sentence and
 * converts every detected name span into a {@link SurfaceFormOccurrence}
 * whose offset is absolute within the input text.
 *
 * @param nameFinderModel the token-name-finder model to apply
 * @param text            the input text wrapper
 * @param oType           ontology type recorded as the "type" feature on
 *                        each occurrence
 * @return the occurrences found, in document order
 */
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    // sentencePositions[k] = absolute offset of sentence k's start; used to
    // translate sentence-relative token offsets into document offsets.
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }
    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);
    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
    Tokenizer tokenizer = new SimpleTokenizer();
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];
        //LOG.debug("Sentence: " + sentence);
        // extract the names in the current sentence
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);
        // NOTE(review): probs are computed but unused here — presumably kept
        // for debugging; confirm before removing.
        double[] probs = finder.probs();
        if (nameSpans != null && nameSpans.length > 0) {
            //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString());
            //System.out.println("NameSpans: " +(new ArrayList(Arrays.asList(nameSpans))).toString());
            for (Span span : nameSpans) {
                // Rebuild the surface form by joining the covered tokens with
                // single spaces (name spans are token ranges, not char ranges).
                StringBuilder buf = new StringBuilder();
                //System.out.println("StartSpan: " + span.getStart() + " EndSpan: " + span.getEnd());
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    //System.out.println(tokens[i] + " appended to " + buf.toString());
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                if (surfaceFormStr.contains(".")) {
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }
                // Map the first/last token's char offsets back to document
                // coordinates via the sentence start position.
                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
                int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd();
                /* System.out.println("\n\nRR-NE Found = " + buf.toString());
                   System.out.println("Start = " + entStart);
                   System.out.println("End = " + entEnd);
                   System.out.println("Sentence = " + sentence);
                   System.out.println("Text = " + text); */
                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }
    }
    // Drop per-document adaptive state so the finder is fresh for the next text.
    finder.clearAdaptiveData();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}
From source file:org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java
/**Extracts noun-phrase n-grams from the given piece of input text. * @param text A Text object containing the input from where to extract NP n-grams * @return A list of SurfaceFormOccurrence objects. *//* w ww . j a v a2 s.c o m*/ protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) { String intext = text.text(); //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n"); List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>(); SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel); TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel); POSTaggerME posTagger = new POSTaggerME((POSModel) posModel); ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel); Span[] sentSpans = sentenceDetector.sentPosDetect(intext); for (Span sentSpan : sentSpans) { String sentence = sentSpan.getCoveredText(intext).toString(); int start = sentSpan.getStart(); Span[] tokSpans = tokenizer.tokenizePos(sentence); String[] tokens = new String[tokSpans.length]; // System.out.println("\n\nTokens:"); for (int i = 0; i < tokens.length; i++) { tokens[i] = tokSpans[i].getCoveredText(sentence).toString(); // System.out.println(tokens[i]); } String[] tags = posTagger.tag(tokens); Span[] chunks = chunker.chunkAsSpans(tokens, tags); for (Span chunk : chunks) { if ("NP".equals(chunk.getType())) { //Note: getStart()/getEnd() methods of Chunk spans only give the start and end token indexes of the chunk. //The actual Start/End positions of the chunk in the sentence need to be extracted from POS sentenceSpans. //They are offsets from the begining of the sentence in question. Need to add the start postion of the sentence //to compute the actual start/end offsets from the begining of the input text. 
int begin = tokSpans[chunk.getStart()].getStart(); int end = tokSpans[chunk.getEnd() - 1].getEnd(); List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() + -1); extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst); } } } return npNgramSFLst; }
From source file:org.wso2.uima.collectionProccesingEngine.analysisEngines.LocationIdentifier.java
/**
 * UIMA analysis step: detects sentences in the document text, tokenizes
 * each one, runs the location name finder over the tokens, and indexes one
 * {@link LocationIdentification} annotation per detected location with
 * document-absolute begin/end offsets.
 *
 * @param jcas the CAS holding the document text to annotate
 * @throws AnalysisEngineProcessException per the AnalysisComponent contract
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    String text = jcas.getDocumentText();
    Span[] sentSpans = sentenceDetector.sentPosDetect(jcas.getDocumentText());
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(text).toString();
        // Absolute offset of this sentence within the document.
        int start = sentSpan.getStart();
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        }
        logger.debug("Tweet Text: " + jcas.getDocumentText());
        Span[] locationSpans = locationFinder.find(tokens);
        for (Span location : locationSpans) {
            // BUG FIX: the original created ONE annotation outside this loop
            // and mutated/re-indexed it for every location, so all indexed
            // entries shared a single object holding only the last span's
            // offsets. Create a fresh annotation per detected location.
            LocationIdentification annotation = new LocationIdentification(jcas);
            annotation.setBegin(start + tokSpans[location.getStart()].getStart());
            annotation.setEnd(start + tokSpans[location.getEnd() - 1].getEnd());
            annotation.addToIndexes(jcas);
            logger.info("Location Detected : " + annotation.getCoveredText());
        }
        if (locationSpans.length == 0) {
            logger.info("Location Unable to be Detected");
        }
    }
}