org.structr.files.text.FulltextTokenizer.java Source code

Introduction

Here is the source code for org.structr.files.text.FulltextTokenizer.java
Source

 /**
  * Copyright (C) 2010-2016 Structr GmbH
  *
  * This file is part of Structr <http://structr.org>.
  *
  * Structr is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Affero General Public License as
  * published by the Free Software Foundation, either version 3 of the
  * License, or (at your option) any later version.
  *
  * Structr is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Affero General Public License for more details.
  *
  * You should have received a copy of the GNU Affero General Public License
  * along with Structr.  If not, see <http://www.gnu.org/licenses/>.
  */
 package org.structr.files.text;

 import java.io.IOException;
 import java.io.Writer;
 import java.util.LinkedHashSet;
 import java.util.Locale;
 import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import java.util.regex.Pattern;
 import org.apache.commons.lang.StringUtils;
 import org.apache.tika.language.LanguageIdentifier;
 import org.structr.core.Services;
 import org.structr.core.app.StructrApp;

 /**
  *
  *
  */
 public class FulltextTokenizer extends Writer {

     private static final Logger logger = Logger.getLogger(FulltextTokenizer.class.getName());
     public static final Set<Character> SpecialChars = new LinkedHashSet<>();

     private final int wordCountLimit = Services
             .parseInt(StructrApp.getConfigurationValue(Services.APPLICATION_FILESYSTEM_INDEXING_LIMIT), 50_000);
     private final int wordMinLength = Services
             .parseInt(StructrApp.getConfigurationValue(Services.APPLICATION_FILESYSTEM_INDEXING_MINLENGTH), 4);
     private final int wordMaxLength = Services
             .parseInt(StructrApp.getConfigurationValue(Services.APPLICATION_FILESYSTEM_INDEXING_MAXLENGTH), 40);
     private final StringBuilder rawText = new StringBuilder();
     private final StringBuilder wordBuffer = new StringBuilder();
     private final Set<String> words = new LinkedHashSet<>();
     private String language = "de";
     private String fileName = null;
     private char lastCharacter = 0;
     private int consecutiveCharCount = 0;
     private int wordCount = 0;

static {

   SpecialChars.add('_');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('-');
   SpecialChars.add('%');
   SpecialChars.add('/');
   SpecialChars.add('@');
   SpecialChars.add('$');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('');
   SpecialChars.add('.');
   SpecialChars.add(',');
   SpecialChars.add('\'');
   SpecialChars.add('\"');
   SpecialChars.add('`');
}

     public FulltextTokenizer(final String fileName) {
         this.fileName = fileName;
     }

     @Override
     public void write(final char[] cbuf, final int off, final int len) throws IOException {

         if (wordCount < wordCountLimit) {

             final int limit = off + len;
             final int length = Math.min(limit, cbuf.length);

             for (int i = off; i < length; i++) {

                 final char c = cbuf[i];

                 // remove occurrences of more than 10 identical chars in a row
                 if (c == lastCharacter) {

                     if (consecutiveCharCount++ >= 10) {
                         continue;
                     }

                 } else {

                     consecutiveCharCount = 0;
                 }

                 if (!Character.isAlphabetic(c) && !Character.isDigit(c) && !SpecialChars.contains(c)) {

                     flush();

                     if (Character.isWhitespace(c)) {

                         rawText.append(c);

                     } else {

                         rawText.append(" ");
                     }

                 } else {

                     wordBuffer.append(c);
                     rawText.append(c);
                 }

                 lastCharacter = c;
             }
         }
     }

     public String getLanguage() {
         return language;
     }

     public String getRawText() {
         return rawText.toString();
     }

     public Set<String> getWords() {
         return words;
     }

     @Override
     public void flush() throws IOException {

         final String word = wordBuffer.toString().trim();
         if (StringUtils.isNotBlank(word)) {

             // check for numbers
             if (word.contains(".") || word.contains(",")) {

                 // try to separate numbers
                 if (word.matches("[\\-0-9\\.,]+")) {

                     addWord(word);

                 } else {

                     final String[] parts = word.split("[\\.,]+");
                     final int len = parts.length;

                     for (int i = 0; i < len; i++) {

                         final String part = parts[i].trim();

                         if (StringUtils.isNotBlank(part)) {

                             addWord(part.toLowerCase());
                         }
                     }
                 }

             } else {

                 addWord(word.toLowerCase());
             }
         }

         wordBuffer.setLength(0);
     }

     @Override
     public void close() throws IOException {

         flush();

         final LanguageIdentifier identifier = new LanguageIdentifier(rawText.toString());
         if (identifier.isReasonablyCertain()) {

             language = identifier.getLanguage();
         }
     }

     public int getWordCount() {
         return wordCount;
     }

     // ----- private methods -----
     private void addWord(final String word) {

         final int length = word.length();
         if (length >= wordMinLength && length <= wordMaxLength) {

             words.add(word);

             wordCount++;

             if (wordCount > wordCountLimit) {

                 logger.log(Level.INFO,
                         "Indexing word count of {0} reached for {1}, no more words will be indexed. Set {2} in structr.conf to increase this limit.",

                         new Object[] { wordCountLimit, fileName, Services.APPLICATION_FILESYSTEM_INDEXING_LIMIT });
             }
         }
     }
 }