org.structr.files.text.FulltextIndexingAgent.java Source code

Java tutorial

Introduction

Here is the source code for org.structr.files.text.FulltextIndexingAgent.java

Source

/**
 * Copyright (C) 2010-2016 Structr GmbH
 *
 * This file is part of Structr <http://structr.org>.
 *
 * Structr is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * Structr is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with Structr.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.structr.files.text;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.index.Index;
import org.structr.agent.Agent;
import org.structr.agent.ReturnValue;
import org.structr.agent.Task;
import org.structr.core.Services;
import org.structr.core.app.StructrApp;
import org.structr.core.entity.Principal;
import static org.structr.core.graph.NodeInterface.owner;
import org.structr.core.graph.NodeService;
import org.structr.core.graph.Tx;
import org.structr.web.entity.Indexable;
import org.structr.web.entity.User;

/**
 *
 *
 */
/**
 * Agent that performs fulltext indexing of {@link Indexable} entities.
 * Content is extracted with Apache Tika, tokenized, filtered against a
 * language-specific stop word list and written both to the Neo4j fulltext
 * index and to the entity's {@code indexedWords} property.
 *
 * <p>Stop word lists are loaded once, in the static initializer, from the
 * bundled {@code /stopwords/stop-words.zip} resource.</p>
 */
public class FulltextIndexingAgent extends Agent<Indexable> {

    private static final Logger logger = Logger.getLogger(FulltextIndexingAgent.class.getName());

    // two-letter language code (e.g. "en") => stop words for that language
    private static final Map<String, Set<String>> languageStopwordMap = new LinkedHashMap<>();
    public static final String TASK_NAME = "FulltextIndexing";

    /**
     * Indexes every node of the given task if its type matches {@link #TASK_NAME}.
     *
     * @param task the task whose nodes should be indexed
     * @return {@code Success} if the task type matched, {@code Abort} otherwise
     * @throws Throwable never thrown here directly; declared by the interface
     */
    @Override
    public ReturnValue processTask(final Task<Indexable> task) throws Throwable {

        if (TASK_NAME.equals(task.getType())) {

            for (final Indexable file : task.getNodes()) {

                doIndexing(file);
            }

            return ReturnValue.Success;
        }

        return ReturnValue.Abort;
    }

    @Override
    public Class getSupportedTaskType() {
        return FulltextIndexingTask.class;
    }

    @Override
    public boolean createEnclosingTransaction() {
        // transactions are opened manually in doIndexing() to keep them small
        return false;
    }

    // ----- private methods -----
    /**
     * Extracts the fulltext content of the given entity, tokenizes it and
     * stores the resulting words (minus stop words) in the fulltext index
     * as well as in the entity's {@code indexedWords} property.
     * All errors are caught and logged; indexing failures never propagate.
     *
     * @param file the entity to index
     */
    private void doIndexing(final Indexable file) {

        boolean parsingSuccessful = false;
        InputStream inputStream = null;
        String fileName = "unknown file";

        try {

            // fetch stream and name in a separate, short transaction
            try (final Tx tx = StructrApp.getInstance().tx()) {

                inputStream = file.getInputStream();
                fileName = file.getName();

                tx.success();
            }

            if (inputStream != null) {

                final FulltextTokenizer tokenizer = new FulltextTokenizer(fileName);

                try (final InputStream is = inputStream) {

                    final AutoDetectParser parser = new AutoDetectParser();

                    parser.parse(is, new BodyContentHandler(tokenizer), new Metadata());
                    parsingSuccessful = true;
                }

                // only do indexing when parsing was successful
                if (parsingSuccessful) {

                    try (Tx tx = StructrApp.getInstance().tx()) {

                        // don't modify access time when indexing is finished
                        file.getSecurityContext().preventModificationOfAccessTime();

                        // save raw extracted text
                        file.setProperty(Indexable.extractedContent, tokenizer.getRawText());

                        // tokenize the file name (fixed: previously called the
                        // agent's own getName() instead of the file's name)
                        tokenizer.write(fileName);

                        // tokenize owner name, e-mail address and twitter name
                        final Principal _owner = file.getProperty(owner);
                        if (_owner != null) {

                            final String ownerName = _owner.getName();
                            if (ownerName != null) {

                                tokenizer.write(ownerName);
                            }

                            final String eMail = _owner.getProperty(User.eMail);
                            if (eMail != null) {

                                tokenizer.write(eMail);
                            }

                            final String twitterName = _owner.getProperty(User.twitterName);
                            if (twitterName != null) {

                                tokenizer.write(twitterName);
                            }
                        }

                        tx.success();
                    }

                    // index document excluding stop words
                    final NodeService nodeService = Services.getInstance().getService(NodeService.class);
                    final Index<Node> fulltextIndex = nodeService.getNodeIndex(NodeService.NodeIndex.fulltext);
                    final String indexKeyName = Indexable.indexedWords.jsonName();
                    final Node node = file.getNode();
                    final Set<String> indexedWords = new TreeSet<>();

                    // fall back to an empty set when no stop word list exists
                    // for the detected language (was an NPE before)
                    Set<String> stopWords = languageStopwordMap.get(tokenizer.getLanguage());
                    if (stopWords == null) {
                        stopWords = Collections.emptySet();
                    }

                    logger.log(Level.INFO, "Indexing {0}..", fileName);

                    try (Tx tx = StructrApp.getInstance().tx()) {

                        // remove node from index (in case of previous indexing runs)
                        fulltextIndex.remove(node, indexKeyName);

                        for (final String rawWord : tokenizer.getWords()) {

                            // strip double quotes and skip stop words
                            final String word = StringUtils.strip(rawWord, "\"");
                            if (!stopWords.contains(word)) {

                                indexedWords.add(word);
                                fulltextIndex.add(node, indexKeyName, word);
                            }
                        }

                        tx.success();
                    }

                    // store indexed words separately
                    try (Tx tx = StructrApp.getInstance().tx()) {

                        // don't modify access time when indexing is finished
                        file.getSecurityContext().preventModificationOfAccessTime();

                        // store indexed words
                        file.setProperty(Indexable.indexedWords,
                                indexedWords.toArray(new String[indexedWords.size()]));

                        tx.success();
                    }

                    logger.log(Level.INFO, "Indexing of {0} finished, {1} words extracted",
                            new Object[] { fileName, tokenizer.getWordCount() });

                }
            }

        } catch (final Throwable t) {

            // include the throwable so the stack trace is not lost
            logger.log(Level.WARNING, "Indexing of " + fileName + " failed", t);
        }
    }

    static {

        // load language-specific stop word lists from the bundled archive;
        // entry names are expected to look like "..._<lang>.txt"
        final InputStream source =
                FulltextIndexingAgent.class.getResourceAsStream("/stopwords/stop-words.zip");
        if (source == null) {

            // a missing resource previously caused an ExceptionInInitializerError
            logger.log(Level.WARNING,
                    "Stop word archive /stopwords/stop-words.zip not found, stop words will not be filtered");

        } else {

            try (final ZipInputStream zis = new ZipInputStream(new BufferedInputStream(source))) {

                for (ZipEntry entry = zis.getNextEntry(); entry != null; entry = zis.getNextEntry()) {

                    if (!entry.isDirectory()) {

                        final String entryName = entry.getName();
                        if (entryName.contains("_") && entryName.endsWith(".txt")) {

                            // the two-letter language code follows the last underscore
                            final int langPos = entryName.lastIndexOf("_") + 1;
                            final String language = entryName.substring(langPos, langPos + 2);

                            Set<String> stopwordSet = languageStopwordMap.get(language);
                            if (stopwordSet == null) {

                                stopwordSet = new LinkedHashSet<>();
                                languageStopwordMap.put(language, stopwordSet);
                            }

                            // read stopword set
                            for (final String word : IOUtils.readLines(zis)) {
                                stopwordSet.add(word.trim());
                            }
                        }
                    }
                }

            } catch (IOException ioex) {

                // log instead of printStackTrace()
                logger.log(Level.WARNING, "Unable to read stop word archive", ioex);
            }
        }
    }
}