List of usage examples for opennlp.tools.util.Span.getCoveredText
public CharSequence getCoveredText(CharSequence text)
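Before the project examples below, here is a minimal self-contained sketch of what getCoveredText does: a Span holds begin/end character offsets into some text (begin inclusive, end exclusive), and getCoveredText returns the slice of the given text that the span covers. The text and offsets here are made up for illustration.

import opennlp.tools.util.Span;

public class SpanCoveredTextDemo {
    public static void main(String[] args) {
        String text = "OpenNLP spans index characters.";
        // begin inclusive, end exclusive: covers characters 0..6, i.e. "OpenNLP"
        Span span = new Span(0, 7);
        // getCoveredText returns the substring of the given text covered by the span
        CharSequence covered = span.getCoveredText(text);
        System.out.println(covered); // prints: OpenNLP
    }
}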
From source file:edu.stanford.muse.index.NER.java
/**
 * A triple is <entity, start char offset (inclusive), end char offset (not inclusive)>.
 * See http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String)
 */
private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets(
        String documentText) {
    try {
        NER.initialize();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }

    // 0xA0 (non-breaking space) is seen often and generates a lot of annoying messages
    if (documentText.indexOf("\u00A0") >= 0)
        documentText = documentText.replaceAll("\\xA0", " ");

    // replace i18n chars with space; they cause annoying NER messages and perhaps slow down NER
    if (REMOVE_I18N_CHARS)
        documentText = cleanI18NChars(documentText);

    List<Pair<String, String>> namedEntities = new ArrayList<Pair<String, String>>(); // (token, type) pairs
    List<Triple<String, Integer, Integer>> allTriples = new ArrayList<Triple<String, Integer, Integer>>(); // (string, start, end) triples

    // do NER sentence by sentence -- much faster than doing the entire documentText at once
    Span[] sentenceSpans = sFinder.sentPosDetect(documentText);
    for (Span sentenceSpan : sentenceSpans) {
        int sentenceStartOffset = sentenceSpan.getStart();
        String sentence = sentenceSpan.getCoveredText(documentText).toString();
        if (sentence.length() > 2000)
            continue; // that's not a reasonable sentence, could be a uuencoded-something

        // convert the sentence to tokens because that's what the name finders need
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokSpans.length; i++)
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();

        // find the actual spans (in terms of token indexes) that represent names
        Span[] pSpans = pFinder.find(tokens);
        Span[] lSpans = lFinder.find(tokens);
        Span[] oSpans = oFinder.find(tokens);
        List<Triple<String, Integer, Integer>> sentenceTriples = new ArrayList<Triple<String, Integer, Integer>>();
        for (Span span : pSpans)
            sentenceTriples.add(new Triple<String, Integer, Integer>("PERSON", span.getStart(), span.getEnd()));
        for (Span span : lSpans)
            sentenceTriples.add(new Triple<String, Integer, Integer>("LOCATION", span.getStart(), span.getEnd()));
        for (Span span : oSpans)
            sentenceTriples.add(new Triple<String, Integer, Integer>("ORGANIZATION", span.getStart(), span.getEnd()));

        for (Triple<String, Integer, Integer> t : sentenceTriples) {
            String type = t.first();
            if (type == null)
                type = "UNKNOWN"; // we see type = null sometimes
            allTypes.add(type);
            int startTok = t.second();
            int endTok = t.third();
            String namedEntity = sentence.substring(tokSpans[startTok].getStart(), tokSpans[endTok - 1].getEnd());

            // we tend to see a lot of annoying [Hi Sam] or [Dear Caroline] phrases;
            // surprising that NER can't already handle them
            if (namedEntity.toLowerCase().startsWith("hi "))
                namedEntity = namedEntity.substring("hi ".length()).trim();
            if (namedEntity.toLowerCase().startsWith("hello "))
                namedEntity = namedEntity.substring("hello ".length()).trim();
            if (namedEntity.toLowerCase().startsWith("dear "))
                namedEntity = namedEntity.substring("dear ".length()).trim();
            if (namedEntity.toLowerCase().startsWith("cheers "))
                namedEntity = namedEntity.substring("cheers ".length()).trim();
            if (namedEntity.toLowerCase().startsWith("thanks "))
                namedEntity = namedEntity.substring("thanks ".length()).trim();

            if (DictUtils.tabooNames.contains(namedEntity.toLowerCase()))
                continue;
            if (!nameFilterPass(namedEntity))
                continue;
            if (namedEntity.length() < MIN_NAME_LENGTH || namedEntity.length() > MAX_NAME_LENGTH)
                continue; // drop names that are implausibly short or long

            namedEntities.add(new Pair<String, String>(namedEntity, type));
            if (log.isDebugEnabled())
                log.debug(t.first() + " : [" + t.second() + ":" + t.third() + "] " + namedEntity);
        }

        // sentence triple offsets cannot be used directly; they must first be converted to
        // offsets within the entire document by adding sentenceStartOffset
        for (Triple<String, Integer, Integer> t : sentenceTriples) {
            int startTok = t.second();
            int endTok = t.third();
            int start = tokSpans[startTok].getStart(), end = tokSpans[endTok - 1].getEnd();
            allTriples.add(new Triple<String, Integer, Integer>(t.getFirst(), sentenceStartOffset + start,
                    sentenceStartOffset + end));
        }
    }
    return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(namedEntities),
            allTriples);
}
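The conversion this example performs twice, mapping a name finder's token-index span through the tokenizer's character spans and shifting by the sentence's offset in the document, recurs in the examples below. A minimal sketch of that conversion as a standalone helper; the helper name is hypothetical and not part of the source above:

// Hypothetical helper: convert a token-index span (as returned by a name finder)
// into a document-level character span, using the tokenizer's character spans for
// the sentence plus the sentence's start offset within the document.
static Span tokenSpanToDocCharSpan(Span tokenSpan, Span[] tokSpans, int sentenceStartOffset) {
    int start = sentenceStartOffset + tokSpans[tokenSpan.getStart()].getStart();
    int end = sentenceStartOffset + tokSpans[tokenSpan.getEnd() - 1].getEnd();
    return new Span(start, end, tokenSpan.getType());
}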
From source file:org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java
/**
 * Extracts noun-phrase n-grams from the given piece of input text.
 * @param text A Text object containing the input from which to extract NP n-grams
 * @return A list of SurfaceFormOccurrence objects.
 */
protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
    String intext = text.text();
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel) posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel);

    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(intext).toString();
        int start = sentSpan.getStart();
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++)
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();

        String[] tags = posTagger.tag(tokens);
        Span[] chunks = chunker.chunkAsSpans(tokens, tags);
        for (Span chunk : chunks) {
            if ("NP".equals(chunk.getType())) {
                // Note: getStart()/getEnd() of chunk spans give the start and end *token*
                // indexes of the chunk, not character offsets. The chunk's character
                // positions come from the tokenizer spans and are offsets from the
                // beginning of the sentence; the sentence's start position must be added
                // to get offsets from the beginning of the input text.
                int begin = tokSpans[chunk.getStart()].getStart();
                int end = tokSpans[chunk.getEnd() - 1].getEnd();
                List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() - 1);
                extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst);
            }
        }
    }
    return npNgramSFLst;
}
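Since begin and end above are character offsets within the sentence, the chunk's surface string could also be recovered with getCoveredText rather than String.substring. A small sketch reusing the variable names from the example; illustrative only, not part of the original source:

// Inside the "NP" branch above: wrap the computed character offsets in a Span
// and let getCoveredText pull the noun phrase out of the sentence.
Span npCharSpan = new Span(begin, end, "NP");
String npText = npCharSpan.getCoveredText(sentence).toString();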
From source file:org.wso2.uima.collectionProccesingEngine.analysisEngines.LocationIdentifier.java
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    String text = jcas.getDocumentText();
    logger.debug("Tweet Text: " + text);

    Span[] sentSpans = sentenceDetector.sentPosDetect(text);
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(text).toString();
        int start = sentSpan.getStart();
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++)
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();

        Span[] locationSpans = locationFinder.find(tokens);
        for (Span location : locationSpans) {
            // create one annotation per detected location; the finder's token-index spans
            // are mapped to document character offsets via the tokenizer spans plus the
            // sentence's start offset
            LocationIdentification annotation = new LocationIdentification(jcas);
            annotation.setBegin(start + tokSpans[location.getStart()].getStart());
            annotation.setEnd(start + tokSpans[location.getEnd() - 1].getEnd());
            annotation.addToIndexes(jcas);
            logger.info("Location Detected : " + annotation.getCoveredText());
        }
        if (locationSpans.length == 0) {
            logger.info("Location Unable to be Detected");
        }
    }
}
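The offsets the annotation receives could be cross-checked against the document text with getCoveredText. A hedged sketch mirroring the loop above; the logging call is illustrative, not from the source:

// Rebuild a document-level Span from the computed offsets and read the
// covered text directly from the full document.
Span docSpan = new Span(start + tokSpans[location.getStart()].getStart(),
        start + tokSpans[location.getEnd() - 1].getEnd(), location.getType());
logger.debug("Covered location text: " + docSpan.getCoveredText(text));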