List of usage examples for the opennlp.tools.util.Span method getEnd()
public int getEnd()
From source file:com.civprod.writerstoolbox.testarea.UnsupervisedDiscourseSegmentation.java
public static List<List<String>> segment(Document<?> inDocument, SentenceDetector inSentenceDetector, StringTokenizer inStringTokenizer) { List<String> concatenateTokens = concatenateTokens(inDocument, inSentenceDetector, inStringTokenizer); List<String> stemmAndFilterList = TokenUtil.stemmAndFilterList(concatenateTokens); List<List<String>> splitIntoFixLengthLists = splitIntoFixLengthLists(stemmAndFilterList, 20); List<Counter<String>> counters = splitIntoFixLengthLists.parallelStream() .map((List<String> curSentence) -> CounterUtils.count(curSentence)).collect(Collectors.toList()); List<Double> cosineSimilarity = new ArrayList<>(counters.size() - 20); for (int i = 0; i < (counters.size() - 20); i++) { cosineSimilarity.add(cosineSimilarityStemmedAndFiltered(Counter.join(counters.subList(i, i + 10)), Counter.join(counters.subList(i + 11, i + 20)))); }//from ww w .jav a 2s .co m List<Double> valleys = new ArrayList<>(cosineSimilarity.size() - 2); for (int i = 0; i < valleys.size(); i++) { double ya1 = cosineSimilarity.get(i); double ya2 = cosineSimilarity.get(i + 1); double ya3 = cosineSimilarity.get(i + 2); valleys.add((ya1 - ya2) + (ya3 - ya2)); } SummaryStatistics valleyStatistics = valleys.parallelStream().collect(SummaryStatisticCollector.instance); double cutoffThreshold = valleyStatistics.getMean() - valleyStatistics.getStandardDeviation(); int lastLocation = 0; List<Span> spans = new ArrayList<>(1); for (int i = 0; i < valleys.size(); i++) { double curValley = valleys.get(i); if (curValley < cutoffThreshold) { int curLocation = (i + 11) * 20; spans.add(new Span(lastLocation, curLocation)); lastLocation = curLocation; } } spans.add(new Span(lastLocation, concatenateTokens.size())); return spans.parallelStream() .map((Span curSpan) -> concatenateTokens.subList(curSpan.getStart(), curSpan.getEnd())) .collect(Collectors.toList()); }
From source file:edu.stanford.muse.index.NER.java
public static void testOpenNLP() { try {/* www. ja v a2 s .c om*/ String s = Util.readFile("/tmp/in"); /* List<Pair<String,Float>> pairs = NER.namesFromText(s); for (Pair<String,Float> p: pairs) { System.out.println (p); } System.out.println ("-----"); */ InputStream pis = Config.getResourceAsStream("en-ner-person.bin"); TokenNameFinderModel pmodel = new TokenNameFinderModel(pis); InputStream lis = Config.getResourceAsStream("en-ner-location.bin"); TokenNameFinderModel lmodel = new TokenNameFinderModel(lis); InputStream ois = Config.getResourceAsStream("en-ner-organization.bin"); TokenNameFinderModel omodel = new TokenNameFinderModel(ois); InputStream tokenStream = Config.getResourceAsStream("en-token.bin"); TokenizerModel modelTokenizer = new TokenizerModel(tokenStream); TokenizerME tokenizer = new TokenizerME(modelTokenizer); Span[] tokSpans = tokenizer.tokenizePos(s); // Util.tokenize(s).toArray(new String[0]); String tokens[] = new String[tokSpans.length]; for (int i = 0; i < tokSpans.length; i++) tokens[i] = s.substring(tokSpans[i].getStart(), tokSpans[i].getEnd()); NameFinderME pFinder = new NameFinderME(pmodel); Span[] pSpans = pFinder.find(tokens); NameFinderME lFinder = new NameFinderME(lmodel); Span[] lSpans = lFinder.find(tokens); NameFinderME oFinder = new NameFinderME(omodel); Span[] oSpans = oFinder.find(tokens); System.out.println("Names found:"); for (Span span : pSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } System.out.println("Locations found:"); for (Span span : lSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } System.out.println("Orgs found:"); for (Span span : oSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } } catch (IOException e) { e.printStackTrace(); } }
From source file:edu.stanford.muse.index.NER.java
/** * triple is a set of <entity, start char offset (inclusive), end char offset (not inclusive). * see http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String) *///www .j a v a2 s .com private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets( String documentText) { try { NER.initialize(); } catch (Exception e) { Util.print_exception(e, log); } if (documentText.indexOf("\u00A0") > 0) documentText = documentText.replaceAll("\\xA0", " "); // 0xA0 is seen often and generates a lot of annoying messages. // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER? if (REMOVE_I18N_CHARS) documentText = cleanI18NChars(documentText); List<Pair<String, String>> namedEntities = new ArrayList<Pair<String, String>>(); // token-type pairs List<Triple<String, Integer, Integer>> allTriples = new ArrayList<Triple<String, Integer, Integer>>(); // string, start-end pairs Span sentenceSpans[] = sFinder.sentPosDetect(documentText); // do NER sentence by sentence -- much faster than doing the entire documentText at once for (Span sentenceSpan : sentenceSpans) { int sentenceStartOffset = sentenceSpan.getStart(); String sentence = sentenceSpan.getCoveredText(documentText).toString(); if (sentence.length() > 2000) continue; // that's not a reasonable sentence, could be a uuencoded-something. 
// convert sentence to tokens cos that's what the name finders need Span[] tokSpans = tokenizer.tokenizePos(sentence); String tokens[] = new String[tokSpans.length]; for (int i = 0; i < tokSpans.length; i++) tokens[i] = tokSpans[i].getCoveredText(sentence).toString(); // find the actual spans (in terms of tokens) that represent names Span[] pSpans = pFinder.find(tokens); Span[] lSpans = lFinder.find(tokens); Span[] oSpans = oFinder.find(tokens); List<Triple<String, Integer, Integer>> sentenceTriples = new ArrayList<Triple<String, Integer, Integer>>(); // string, start-end pairs for (Span span : pSpans) sentenceTriples.add(new Triple<String, Integer, Integer>("PERSON", span.getStart(), span.getEnd())); for (Span span : lSpans) sentenceTriples .add(new Triple<String, Integer, Integer>("LOCATION", span.getStart(), span.getEnd())); for (Span span : oSpans) sentenceTriples .add(new Triple<String, Integer, Integer>("ORGANIZATION", span.getStart(), span.getEnd())); for (Triple<String, Integer, Integer> t : sentenceTriples) { String type = t.first(); if (type == null) type = "UNKNOWN"; // we see type = null sometimes #!@#$ allTypes.add(type); int startTok = t.second(); int endTok = t.third(); String namedEntity = sentence.substring(tokSpans[startTok].getStart(), tokSpans[endTok - 1].getEnd()); // we tend to see a lot of annoying [Hi Sam] or [Dear Caroline] phrases. surprising NER can't handle it already. 
if (namedEntity.toLowerCase().startsWith("hi ")) namedEntity = namedEntity.substring("hi ".length()).trim(); if (namedEntity.toLowerCase().startsWith("hello ")) namedEntity = namedEntity.substring("hello ".length()).trim(); if (namedEntity.toLowerCase().startsWith("dear ")) namedEntity = namedEntity.substring("dear ".length()).trim(); if (namedEntity.toLowerCase().startsWith("cheers ")) namedEntity = namedEntity.substring("cheers ".length()).trim(); if (namedEntity.toLowerCase().startsWith("thanks ")) namedEntity = namedEntity.substring("thanks ".length()).trim(); if (DictUtils.tabooNames.contains(namedEntity.toLowerCase())) continue; if (!nameFilterPass(namedEntity)) continue; if (namedEntity.length() < MIN_NAME_LENGTH || namedEntity.length() > MAX_NAME_LENGTH) // drop it continue; namedEntities.add(new Pair<String, String>(namedEntity, type)); if (log.isDebugEnabled()) log.debug(t.first() + " : [" + t.second() + ":" + t.third() + "] " + namedEntity); } // sentence triple offsets cannot be used directly ... have to be first converted to the right offset within the entire document by adding sentenceStartOffset for (Triple<String, Integer, Integer> t : sentenceTriples) { int startTok = t.second(); int endTok = t.third(); int start = tokSpans[startTok].getStart(), end = tokSpans[endTok - 1].getEnd(); //allTriples.add(new Triple<String, Integer, Integer>(t.getFirst(), sentenceStartOffset + t.getSecond(), sentenceStartOffset + t.getThird())); allTriples.add(new Triple<String, Integer, Integer>(t.getFirst(), sentenceStartOffset + start, sentenceStartOffset + end)); } } return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(namedEntities), allTriples); }
From source file:com.civis.utils.opennlp.models.address.AddressSpanBuilder.java
/** Builds the token string covered by the span, delegating to the index-based overload. */
private String buildString(Span span, String[] tokens) {
    int from = span.getStart();
    int to = span.getEnd();
    return buildString(from, to, tokens);
}
From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java
@Override protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException { for (Span sSpan : sentenceModelProvider.getResource().sentPosDetect(aText)) { createSentence(aJCas, sSpan.getStart() + aZoneBegin, sSpan.getEnd() + aZoneBegin); for (Span tSpan : tokenModelProvider.getResource() .tokenizePos(aText.substring(sSpan.getStart(), sSpan.getEnd()))) { createToken(aJCas, tSpan.getStart() + sSpan.getStart() + aZoneBegin, tSpan.getEnd() + sSpan.getStart() + aZoneBegin); }//from w ww.java 2 s .co m } }
From source file:com.civis.utils.opennlp.models.address.AddressSpanBuilder.java
// Parses an address out of the tokens covered by originalSpan, filling the
// street, streetNumber, zip, city and country fields.
// Layout assumed: <street tokens> <house number> <zip> <city [country]>.
// NOTE(review): token positions are inferred, not validated — confirm inputs
// always follow this layout.
private void parse(String[] tokens) {
    // Street: everything createStreetSpan carves out of the original span.
    Span streetSpan = createStreetSpan(originalSpan.getStart(), originalSpan.getEnd(), tokens);
    street = buildString(streetSpan, tokens);
    // House number: the single token immediately after the street.
    Span streetNumberSpan = new Span(streetSpan.getEnd(), streetSpan.getEnd() + 1);
    streetNumber = buildString(streetNumberSpan, tokens);
    // Zip: the single token after the house number, stripped of punctuation.
    Span zipSpan = new Span(streetNumberSpan.getEnd(), streetNumberSpan.getEnd() + 1);
    zip = buildString(zipSpan, tokens);
    zip = zip.replaceAll("[+.^:,]", "");
    if (StringUtils.isBlank(zip)) {
        // Token included only special chars like , or . — try the next token as zip.
        // Use case: "Lindenstr. 19 , 12207 Berlin".
        zipSpan = new Span(zipSpan.getStart() + 1, zipSpan.getEnd() + 1);
        zip = buildString(zipSpan, tokens);
    }
    // Prefer the city recorded for this zip in the CSV lookup table; a hit also
    // implies the address is German.
    CSVAddressData csvAddressData = findAddressDataByZip(zip);
    if (csvAddressData != null) {
        city = csvAddressData.getCity();
        country = "Deutschland";
    } else {
        // No CSV hit: the remaining tokens hold the city and possibly a country name.
        String cityAndMaybeCountry = buildString(zipSpan.getEnd(), originalSpan.getEnd(), tokens);
        country = tryToFindCountry(cityAndMaybeCountry);
        if (country == null) {
            // No country found, means rest string is a city string.
            city = cityAndMaybeCountry;
        } else {
            city = cityAndMaybeCountry.replace(country, "").trim();
        }
    }
}
From source file:opennlp.tools.util.Span.java
/**
 * Returns true if the specified span is contained by this span.
 * Identical spans are considered to contain each other.
 *
 * @param s The span to compare with this span.
 *
 * @return true if the specified span is contained by this span;
 * false otherwise.
 */
public boolean contains(Span s) {
    boolean startsInside = start <= s.getStart();
    boolean endsInside = s.getEnd() <= end;
    return startsInside && endsInside;
}
From source file:opennlp.tools.util.Span.java
/** * Checks if the specified span is equal to the current span. *///from w w w . j a v a2 s . c om public boolean equals(Object o) { boolean result; if (o == this) { result = true; } else if (o instanceof Span) { Span s = (Span) o; result = getStart() == s.getStart() && getEnd() == s.getEnd(); } else { result = false; } return result; }
From source file:opennlp.tools.util.Span.java
/**
 * Compares the specified span to the current span.
 * Spans are ordered by ascending start offset; ties are broken by descending
 * end offset, so the longer of two spans starting at the same place sorts first.
 */
public int compareTo(Object o) {
    Span other = (Span) o;
    if (getStart() != other.getStart()) {
        return getStart() < other.getStart() ? -1 : 1;
    }
    // Same start: the span with the larger end offset comes first.
    if (getEnd() != other.getEnd()) {
        return getEnd() > other.getEnd() ? -1 : 1;
    }
    return 0;
}
From source file:opennlp.tools.util.Span.java
/** * Returns true if the specified span intersects with this span. * //from ww w . jav a2 s . co m * @param s The span to compare with this span. * * @return true is the spans overlap; false otherwise. */ public boolean intersects(Span s) { int sstart = s.getStart(); //either s's start is in this or this' start is in s return this.contains(s) || s.contains(this) || getStart() <= sstart && sstart < getEnd() || sstart <= getStart() && getStart() < s.getEnd(); }