org.sakaiproject.search.util.DocumentIndexingUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.sakaiproject.search.util.DocumentIndexingUtils.java

Source

/**********************************************************************************
 * $URL: https://source.sakaiproject.org/svn/search/trunk/search-impl/impl/src/java/org/sakaiproject/search/component/dao/impl/SearchIndexBuilderWorkerDaoJdbcImpl.java $
 * $Id: SearchIndexBuilderWorkerDaoJdbcImpl.java 103115 2012-01-13 14:05:23Z david.horwitz@uct.ac.za $
 ***********************************************************************************
 *
 * Copyright (c) 2003, 2004, 2005, 2006, 2007, 2008 The Sakai Foundation
 *
 * Licensed under the Educational Community License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.opensource.org/licenses/ECL-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************************/
package org.sakaiproject.search.util;

import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.sakaiproject.search.api.EntityContentProducer;
import org.sakaiproject.search.api.SearchService;
import org.sakaiproject.search.api.StoredDigestContentProducer;

/**
 * Utilities for indexing documents
 * @author dhorwitz
 * @since 1.5.0
 *
 */
public class DocumentIndexingUtils {

    private static final Log log = LogFactory.getLog(DocumentIndexingUtils.class);

    /**
     * Index a Sakai entity to a document
     * @param ref the reference of the entity 
     * @param digestStorageUtil
     * @param sep the {@link EntityContentProducer} to index the document
     * @param the URL of the server
     * @return The Lucene document suitable for adding to the Index
     */
    public static Document createIndexDocument(String ref, DigestStorageUtil digestStorageUtil,
            EntityContentProducer sep, String serverURL, Reader contentReader) {

        Document doc = new Document();

        String container = sep.getContainer(ref);
        if (container == null)
            container = ""; //$NON-NLS-1$

        //The reference of the Sakai object stored and index unanalyzed
        doc.add(new Field(SearchService.FIELD_REFERENCE, CompressionTools.compressString(filterNull(ref)),
                Field.Store.YES));
        doc.add(new Field(SearchService.FIELD_REFERENCE, filterNull(ref), Field.Store.NO,
                Field.Index.NOT_ANALYZED));
        log.debug("added " + ref + " as the reference field");
        log.debug("container is:" + container);
        //The date of indexing
        String timeStamp = String.valueOf(System.currentTimeMillis());
        doc.add(new Field(SearchService.DATE_STAMP, timeStamp, Field.Store.NO, Field.Index.NOT_ANALYZED));
        doc.add(new Field(SearchService.DATE_STAMP, CompressionTools.compressString(timeStamp), Field.Store.YES));

        //The Container
        //TODO: check if this is ever used
        doc.add(new Field(SearchService.FIELD_CONTAINER, filterNull(container), Field.Store.NO,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field(SearchService.FIELD_CONTAINER, CompressionTools.compressString(filterNull(container)),
                Field.Store.YES));

        //the type of the object
        //TODO: check if this is ever used
        /*doc.add(new Field(SearchService.FIELD_TYPE,
        filterNull(sep.getType(ref)),
        Field.Store.COMPRESS, Field.Index.NOT_ANALYZED));
        */
        //the opject subtype
        //TODO: check if this is ever used
        /*doc.add(new Field(SearchService.FIELD_SUBTYPE,
        filterNull(sep.getSubType(ref)),
        Field.Store.COMPRESS, Field.Index.NOT_ANALYZED));
        */

        // add last part of the index as this is the filename
        String idIndex = sep.getId(ref);
        if (idIndex != null && idIndex.indexOf("/") > 0) {
            idIndex = idIndex.substring(idIndex.lastIndexOf("/"));
        }
        idIndex = filterPunctuation(idIndex);

        doc.add(new Field(SearchService.FIELD_CONTENTS, idIndex, Field.Store.NO, Field.Index.ANALYZED,
                Field.TermVector.YES));

        // add the title 
        String title = filterPunctuation(sep.getTitle(ref));
        doc.add(new Field(SearchService.FIELD_CONTENTS, title, Field.Store.NO, Field.Index.ANALYZED,
                Field.TermVector.YES));

        if (sep.isContentFromReader(ref)) {
            contentReader = sep.getContentReader(ref);
            if (log.isDebugEnabled()) {
                log.debug("Adding Content for " + ref + " using " + contentReader);
            }
            doc.add(new Field(SearchService.FIELD_CONTENTS, contentReader, Field.TermVector.YES));
        } else {
            String content = sep.getContent(ref);
            //its possible that there is no content to index
            if (content != null && content.trim().length() > 0) {
                if (log.isDebugEnabled()) {
                    log.debug("Adding Content for " + ref + " as [" + content + "]");
                }
                int docCount = digestStorageUtil.getDocCount(ref) + 1;
                doc.add(new Field(SearchService.FIELD_CONTENTS, filterNull(content), Field.Store.NO,
                        Field.Index.ANALYZED, Field.TermVector.YES));
                if (sep instanceof StoredDigestContentProducer) {
                    doc.add(new Field(SearchService.FIELD_DIGEST_COUNT, Integer.valueOf(docCount).toString(),
                            Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
                    digestStorageUtil.saveContentToStore(ref, content, docCount);
                    if (docCount > 2) {
                        digestStorageUtil.cleanOldDigests(ref);
                    }
                }
            }
        }

        //The document title
        String docTitle = filterNull(sep.getTitle(ref));
        doc.add(new Field(SearchService.FIELD_TITLE, docTitle, Field.Store.NO, Field.Index.ANALYZED,
                Field.TermVector.YES));

        doc.add(new Field(SearchService.FIELD_TITLE, CompressionTools.compressString(docTitle), Field.Store.YES));

        // The tool - used for limiting searches
        //does not need to be stored
        doc.add(new Field(SearchService.FIELD_TOOL, filterNull(sep.getTool()), Field.Store.NO,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field(SearchService.FIELD_TOOL, CompressionTools.compressString(filterNull(sep.getTool())),
                Field.Store.YES));
        //The document URL - should not be indexed
        doc.add(new Field(SearchService.FIELD_URL,
                CompressionTools.compressString(filterUrl(filterNull(sep.getUrl(ref)), serverURL)),
                Field.Store.YES));
        //the owning site
        String siteId = filterNull(sep.getSiteId(ref));
        doc.add(new Field(SearchService.FIELD_SITEID, siteId, Field.Store.NO, Field.Index.NOT_ANALYZED));

        doc.add(new Field(SearchService.FIELD_SITEID, CompressionTools.compressString(siteId), Field.Store.YES));

        // add the custom properties

        Map<String, ?> m = sep.getCustomProperties(ref);
        if (m != null) {
            Set<?> entries = m.entrySet();
            for (Iterator<?> cprops = entries.iterator(); cprops.hasNext();) {
                Entry<String, ?> entry = (Entry<String, ?>) cprops.next();
                String key = entry.getKey();
                Object value = entry.getValue();
                String[] values = null;
                if (value instanceof String) {
                    values = new String[1];
                    values[0] = (String) value;
                }
                if (value instanceof String[]) {
                    values = (String[]) value;
                }
                if (values == null) {
                    log.info("Null Custom Properties value has been suppled by " //$NON-NLS-1$
                            + sep + " in index " //$NON-NLS-1$
                            + key);
                } else {
                    for (int i = 0; i < values.length; i++) {
                        if (key.startsWith("T")) {
                            key = key.substring(1);
                            String val = (filterNull(values[i]));
                            doc.add(new Field(key, val, Field.Store.NO, Field.Index.ANALYZED,
                                    Field.TermVector.YES));
                            doc.add(new Field(key, CompressionTools.compressString(val), Field.Store.YES));
                        } else {
                            String val = filterNull(values[i]);
                            doc.add(new Field(key, val, Field.Store.NO, Field.Index.NOT_ANALYZED));
                            doc.add(new Field(key, CompressionTools.compressString(val), Field.Store.YES));
                        }
                    }
                }
            }
        }

        log.debug("Indexing Document " + doc); //$NON-NLS-1$
        return doc;
    }

    /**
     * @param title
     * @return
     */
    private static String filterNull(String s) {
        if (s == null) {
            return "";
        }
        return s;
    }

    private static String filterPunctuation(String term) {
        if (term == null) {
            return "";
        }
        char[] endTerm = term.toCharArray();
        for (int i = 0; i < endTerm.length; i++) {
            if (!Character.isLetterOrDigit(endTerm[i])) {
                endTerm[i] = ' ';
            }
        }
        return new String(endTerm);
    }

    /**
     * @param string
     * @return
     */
    private static String filterUrl(String url, String serverURL) {
        if (url != null && url.startsWith(serverURL)) {
            String absUrl = url.substring(serverURL.length());
            if (!absUrl.startsWith("/")) {
                absUrl = "/" + absUrl;
            }
            return absUrl;
        }
        return url;
    }

}