List of usage examples for the opennlp.tools.util.Span method getEnd()
public int getEnd()
From source file:com.civprod.writerstoolbox.testarea.UnsupervisedDiscourseSegmentation.java
public static List<List<String>> segment(Document<?> inDocument, SentenceDetector inSentenceDetector, StringTokenizer inStringTokenizer) { List<String> concatenateTokens = concatenateTokens(inDocument, inSentenceDetector, inStringTokenizer); List<String> stemmAndFilterList = TokenUtil.stemmAndFilterList(concatenateTokens); List<List<String>> splitIntoFixLengthLists = splitIntoFixLengthLists(stemmAndFilterList, 20); List<Counter<String>> counters = splitIntoFixLengthLists.parallelStream() .map((List<String> curSentence) -> CounterUtils.count(curSentence)).collect(Collectors.toList()); List<Double> cosineSimilarity = new ArrayList<>(counters.size() - 20); for (int i = 0; i < (counters.size() - 20); i++) { cosineSimilarity.add(cosineSimilarityStemmedAndFiltered(Counter.join(counters.subList(i, i + 10)), Counter.join(counters.subList(i + 11, i + 20)))); }//from ww w .jav a 2s .co m List<Double> valleys = new ArrayList<>(cosineSimilarity.size() - 2); for (int i = 0; i < valleys.size(); i++) { double ya1 = cosineSimilarity.get(i); double ya2 = cosineSimilarity.get(i + 1); double ya3 = cosineSimilarity.get(i + 2); valleys.add((ya1 - ya2) + (ya3 - ya2)); } SummaryStatistics valleyStatistics = valleys.parallelStream().collect(SummaryStatisticCollector.instance); double cutoffThreshold = valleyStatistics.getMean() - valleyStatistics.getStandardDeviation(); int lastLocation = 0; List<Span> spans = new ArrayList<>(1); for (int i = 0; i < valleys.size(); i++) { double curValley = valleys.get(i); if (curValley < cutoffThreshold) { int curLocation = (i + 11) * 20; spans.add(new Span(lastLocation, curLocation)); lastLocation = curLocation; } } spans.add(new Span(lastLocation, concatenateTokens.size())); return spans.parallelStream() .map((Span curSpan) -> concatenateTokens.subList(curSpan.getStart(), curSpan.getEnd())) .collect(Collectors.toList()); }
From source file:edu.stanford.muse.index.NER.java
public static void testOpenNLP() { try {/* www. ja v a2 s .c om*/ String s = Util.readFile("/tmp/in"); /* List<Pair<String,Float>> pairs = NER.namesFromText(s); for (Pair<String,Float> p: pairs) { System.out.println (p); } System.out.println ("-----"); */ InputStream pis = Config.getResourceAsStream("en-ner-person.bin"); TokenNameFinderModel pmodel = new TokenNameFinderModel(pis); InputStream lis = Config.getResourceAsStream("en-ner-location.bin"); TokenNameFinderModel lmodel = new TokenNameFinderModel(lis); InputStream ois = Config.getResourceAsStream("en-ner-organization.bin"); TokenNameFinderModel omodel = new TokenNameFinderModel(ois); InputStream tokenStream = Config.getResourceAsStream("en-token.bin"); TokenizerModel modelTokenizer = new TokenizerModel(tokenStream); TokenizerME tokenizer = new TokenizerME(modelTokenizer); Span[] tokSpans = tokenizer.tokenizePos(s); // Util.tokenize(s).toArray(new String[0]); String tokens[] = new String[tokSpans.length]; for (int i = 0; i < tokSpans.length; i++) tokens[i] = s.substring(tokSpans[i].getStart(), tokSpans[i].getEnd()); NameFinderME pFinder = new NameFinderME(pmodel); Span[] pSpans = pFinder.find(tokens); NameFinderME lFinder = new NameFinderME(lmodel); Span[] lSpans = lFinder.find(tokens); NameFinderME oFinder = new NameFinderME(omodel); Span[] oSpans = oFinder.find(tokens); System.out.println("Names found:"); for (Span span : pSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } System.out.println("Locations found:"); for (Span span : lSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } System.out.println("Orgs found:"); for (Span span : oSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } } catch (IOException e) { e.printStackTrace(); } }
From source file:edu.stanford.muse.index.NER.java
/** * triple is a set of <entity, start char offset (inclusive), end char offset (not inclusive). * see http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String) *///www .j a v a2 s .com private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets( String documentText) { try { NER.initialize(); } catch (Exception e) { Util.print_exception(e, log); } if (documentText.indexOf("\u00A0") > 0) documentText = documentText.replaceAll("\\xA0", " "); // 0xA0 is seen often and generates a lot of annoying messages. // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER? if (REMOVE_I18N_CHARS) documentText = cleanI18NChars(documentText); List<Pair<String, String>> namedEntities = new ArrayList<Pair<String, String>>(); // token-type pairs List<Triple<String, Integer, Integer>> allTriples = new ArrayList<Triple<String, Integer, Integer>>(); // string, start-end pairs Span sentenceSpans[] = sFinder.sentPosDetect(documentText); // do NER sentence by sentence -- much faster than doing the entire documentText at once for (Span sentenceSpan : sentenceSpans) { int sentenceStartOffset = sentenceSpan.getStart(); String sentence = sentenceSpan.getCoveredText(documentText).toString(); if (sentence.length() > 2000) continue; // that's not a reasonable sentence, could be a uuencoded-something. 
// convert sentence to tokens cos that's what the name finders need Span[] tokSpans = tokenizer.tokenizePos(sentence); String tokens[] = new String[tokSpans.length]; for (int i = 0; i < tokSpans.length; i++) tokens[i] = tokSpans[i].getCoveredText(sentence).toString(); // find the actual spans (in terms of tokens) that represent names Span[] pSpans = pFinder.find(tokens); Span[] lSpans = lFinder.find(tokens); Span[] oSpans = oFinder.find(tokens); List<Triple<String, Integer, Integer>> sentenceTriples = new ArrayList<Triple<String, Integer, Integer>>(); // string, start-end pairs for (Span span : pSpans) sentenceTriples.add(new Triple<String, Integer, Integer>("PERSON", span.getStart(), span.getEnd())); for (Span span : lSpans) sentenceTriples .add(new Triple<String, Integer, Integer>("LOCATION", span.getStart(), span.getEnd())); for (Span span : oSpans) sentenceTriples .add(new Triple<String, Integer, Integer>("ORGANIZATION", span.getStart(), span.getEnd())); for (Triple<String, Integer, Integer> t : sentenceTriples) { String type = t.first(); if (type == null) type = "UNKNOWN"; // we see type = null sometimes #!@#$ allTypes.add(type); int startTok = t.second(); int endTok = t.third(); String namedEntity = sentence.substring(tokSpans[startTok].getStart(), tokSpans[endTok - 1].getEnd()); // we tend to see a lot of annoying [Hi Sam] or [Dear Caroline] phrases. surprising NER can't handle it already. 
if (namedEntity.toLowerCase().startsWith("hi ")) namedEntity = namedEntity.substring("hi ".length()).trim(); if (namedEntity.toLowerCase().startsWith("hello ")) namedEntity = namedEntity.substring("hello ".length()).trim(); if (namedEntity.toLowerCase().startsWith("dear ")) namedEntity = namedEntity.substring("dear ".length()).trim(); if (namedEntity.toLowerCase().startsWith("cheers ")) namedEntity = namedEntity.substring("cheers ".length()).trim(); if (namedEntity.toLowerCase().startsWith("thanks ")) namedEntity = namedEntity.substring("thanks ".length()).trim(); if (DictUtils.tabooNames.contains(namedEntity.toLowerCase())) continue; if (!nameFilterPass(namedEntity)) continue; if (namedEntity.length() < MIN_NAME_LENGTH || namedEntity.length() > MAX_NAME_LENGTH) // drop it continue; namedEntities.add(new Pair<String, String>(namedEntity, type)); if (log.isDebugEnabled()) log.debug(t.first() + " : [" + t.second() + ":" + t.third() + "] " + namedEntity); } // sentence triple offsets cannot be used directly ... have to be first converted to the right offset within the entire document by adding sentenceStartOffset for (Triple<String, Integer, Integer> t : sentenceTriples) { int startTok = t.second(); int endTok = t.third(); int start = tokSpans[startTok].getStart(), end = tokSpans[endTok - 1].getEnd(); //allTriples.add(new Triple<String, Integer, Integer>(t.getFirst(), sentenceStartOffset + t.getSecond(), sentenceStartOffset + t.getThird())); allTriples.add(new Triple<String, Integer, Integer>(t.getFirst(), sentenceStartOffset + start, sentenceStartOffset + end)); } } return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(namedEntities), allTriples); }
From source file:com.civis.utils.opennlp.models.address.AddressSpanBuilder.java
/** Builds the token string covered by the span, delegating to the index-based overload. */
private String buildString(Span span, String[] tokens) {
    int from = span.getStart();
    int to = span.getEnd();
    return buildString(from, to, tokens);
}
From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java
@Override protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException { for (Span sSpan : sentenceModelProvider.getResource().sentPosDetect(aText)) { createSentence(aJCas, sSpan.getStart() + aZoneBegin, sSpan.getEnd() + aZoneBegin); for (Span tSpan : tokenModelProvider.getResource() .tokenizePos(aText.substring(sSpan.getStart(), sSpan.getEnd()))) { createToken(aJCas, tSpan.getStart() + sSpan.getStart() + aZoneBegin, tSpan.getEnd() + sSpan.getStart() + aZoneBegin); }//from w ww.java 2 s .co m } }
From source file:com.civis.utils.opennlp.models.address.AddressSpanBuilder.java
// Parses an address out of the tokens covered by originalSpan, filling the
// street, streetNumber, zip, city and country fields.
// Layout assumed: <street tokens> <house number> <zip> <city [country]>.
// NOTE(review): token positions are inferred, not validated — confirm inputs
// always follow this layout.
private void parse(String[] tokens) {
    // Street: everything createStreetSpan carves out of the original span.
    Span streetSpan = createStreetSpan(originalSpan.getStart(), originalSpan.getEnd(), tokens);
    street = buildString(streetSpan, tokens);
    // House number: the single token immediately after the street.
    Span streetNumberSpan = new Span(streetSpan.getEnd(), streetSpan.getEnd() + 1);
    streetNumber = buildString(streetNumberSpan, tokens);
    // Zip: the single token after the house number, stripped of punctuation.
    Span zipSpan = new Span(streetNumberSpan.getEnd(), streetNumberSpan.getEnd() + 1);
    zip = buildString(zipSpan, tokens);
    zip = zip.replaceAll("[+.^:,]", "");
    if (StringUtils.isBlank(zip)) {
        // Token included only special chars like , or . — try the next token as zip.
        // Use case: "Lindenstr. 19 , 12207 Berlin".
        zipSpan = new Span(zipSpan.getStart() + 1, zipSpan.getEnd() + 1);
        zip = buildString(zipSpan, tokens);
    }
    // Prefer the city recorded for this zip in the CSV lookup table; a hit also
    // implies the address is German.
    CSVAddressData csvAddressData = findAddressDataByZip(zip);
    if (csvAddressData != null) {
        city = csvAddressData.getCity();
        country = "Deutschland";
    } else {
        // No CSV hit: the remaining tokens hold the city and possibly a country name.
        String cityAndMaybeCountry = buildString(zipSpan.getEnd(), originalSpan.getEnd(), tokens);
        country = tryToFindCountry(cityAndMaybeCountry);
        if (country == null) {
            // No country found, means rest string is a city string.
            city = cityAndMaybeCountry;
        } else {
            city = cityAndMaybeCountry.replace(country, "").trim();
        }
    }
}
From source file:opennlp.tools.util.Span.java
/**
 * Returns true if the specified span is contained by this span.
 * Identical spans are considered to contain each other.
 *
 * @param s The span to compare with this span.
 *
 * @return true if the specified span is contained by this span;
 * false otherwise.
 */
public boolean contains(Span s) {
    boolean startsInside = start <= s.getStart();
    boolean endsInside = s.getEnd() <= end;
    return startsInside && endsInside;
}
From source file:opennlp.tools.util.Span.java
/** * Checks if the specified span is equal to the current span. *///from w w w . j a v a2 s . c om public boolean equals(Object o) { boolean result; if (o == this) { result = true; } else if (o instanceof Span) { Span s = (Span) o; result = getStart() == s.getStart() && getEnd() == s.getEnd(); } else { result = false; } return result; }
From source file:opennlp.tools.util.Span.java
/**
 * Compares the specified span to the current span.
 * Spans are ordered by ascending start offset; ties are broken by descending
 * end offset, so the longer of two spans starting at the same place sorts first.
 */
public int compareTo(Object o) {
    Span other = (Span) o;
    if (getStart() != other.getStart()) {
        return getStart() < other.getStart() ? -1 : 1;
    }
    // Same start: the span with the larger end offset comes first.
    if (getEnd() != other.getEnd()) {
        return getEnd() > other.getEnd() ? -1 : 1;
    }
    return 0;
}
From source file:opennlp.tools.util.Span.java
/** * Returns true if the specified span intersects with this span. * //from ww w . jav a2 s . co m * @param s The span to compare with this span. * * @return true is the spans overlap; false otherwise. */ public boolean intersects(Span s) { int sstart = s.getStart(); //either s's start is in this or this' start is in s return this.contains(s) || s.contains(this) || getStart() <= sstart && sstart < getEnd() || sstart <= getStart() && getStart() < s.getEnd(); }