org.eclipse.orion.internal.server.search.Indexer.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.orion.internal.server.search.Indexer.java

Source

/*******************************************************************************
 * Copyright (c) 2010, 2011 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/
package org.eclipse.orion.internal.server.search;

import java.io.*;
import java.util.*;
import org.apache.solr.client.solrj.*;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import org.eclipse.core.filesystem.*;
import org.eclipse.core.runtime.*;
import org.eclipse.core.runtime.jobs.Job;
import org.eclipse.orion.internal.server.core.IOUtilities;
import org.eclipse.orion.internal.server.servlets.Activator;
import org.eclipse.orion.internal.server.servlets.ProtocolConstants;
import org.eclipse.orion.internal.server.servlets.workspace.WebProject;
import org.eclipse.orion.internal.server.servlets.workspace.authorization.AuthorizationService;
import org.eclipse.orion.server.core.LogHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The indexer is responsible for keeping the solr/lucene index up to date.
 * It currently does this by naively polling the file system on a periodic basis:
 * each run walks every project, re-indexes the files whose last-modified
 * timestamp is not already recorded in the index, and then reschedules itself.
 */
public class Indexer extends Job {

    private static final long DEFAULT_DELAY = 60000;//one minute
    private static final long MAX_SEARCH_SIZE = 300000;//don't index files larger than 300,000 bytes

    /**
     * Character set used to decode file contents. Stated explicitly so the indexed
     * text does not depend on the default charset of the JVM running the server.
     */
    private static final String CONTENT_ENCODING = "UTF-8"; //$NON-NLS-1$

    /**
     * Lower-case file extensions whose contents are indexed. This is a whitelist:
     * a file with any other extension, or with no extension at all, is skipped.
     */
    private static final Set<String> INDEXED_FILE_TYPES = Collections.unmodifiableSet(new HashSet<String>(
            Arrays.asList("css", "js", "html", "txt", "xml", "java", "properties", "php", "htm", "project",
                    "conf", "pl", "sh", "text", "xhtml", "mf", "manifest")));

    private static final Logger logger = LoggerFactory.getLogger(Indexer.class);

    private final SolrServer server;

    /**
     * Creates an indexing job that writes to the given solr server. The job is
     * marked as a system job; it is not scheduled until {@link #ensureUpdated()}
     * (or <code>schedule</code>) is called.
     *
     * @param server the solr server that documents are added to and queried from
     */
    public Indexer(SolrServer server) {
        super("Indexing"); //$NON-NLS-1$
        this.server = server;
        setSystem(true);
    }

    /**
     * Returns whether this job belongs to the search job family.
     */
    @Override
    public boolean belongsTo(Object family) {
        return SearchActivator.JOB_FAMILY.equals(family);
    }

    /**
     * Recursively adds all indexable files in the given directory to the provided list.
     * Children whose name starts with a dot are ignored (files and directories alike),
     * as are files rejected by {@link #skip(IFileInfo)}. If listing a directory fails,
     * the failure is logged and that directory is abandoned; files collected so far
     * remain in the list.
     *
     * @param dir the directory to traverse
     * @param files the list that matching files are appended to
     */
    private void collectFiles(IFileStore dir, List<IFileStore> files) {
        try {
            for (IFileStore child : dir.childStores(EFS.NONE, null)) {
                if (child.getName().startsWith(".")) //$NON-NLS-1$
                    continue;
                IFileInfo info = child.fetchInfo();
                if (info.isDirectory())
                    collectFiles(child, files);
                else if (!skip(info))
                    files.add(child);
            }
        } catch (CoreException e) {
            handleIndexingFailure(e);
        }
    }

    /**
     * Schedules this job to run after the default delay.
     */
    public void ensureUpdated() {
        schedule(DEFAULT_DELAY);
    }

    /**
     * Reads the given file and returns its contents decoded as UTF-8.
     * A failure to open or read the file is logged rather than thrown; in that
     * case whatever was read before the failure (possibly the empty string) is returned.
     *
     * @param file the file to read
     * @return the decoded contents, never <code>null</code>
     */
    private String getContentsAsString(IFileStore file) {
        StringWriter writer = new StringWriter();
        try {
            InputStream in = file.openInputStream(EFS.NONE, null);
            //NOTE(review): the boolean arguments presumably ask pipe to close the reader but not the writer - confirm against IOUtilities
            IOUtilities.pipe(new InputStreamReader(in, CONTENT_ENCODING), writer, true, false);
        } catch (IOException e) {
            handleIndexingFailure(e);
        } catch (CoreException e) {
            handleIndexingFailure(e);
        }
        return writer.toString();
    }

    /**
     * Helper method for handling failures that occur while indexing.
     * The failure is logged as an error status; it is never rethrown.
     */
    private void handleIndexingFailure(Throwable t) {
        LogHelper.log(new Status(IStatus.ERROR, SearchActivator.PI_SEARCH, "Error during search indexing", t)); //$NON-NLS-1$
    }

    /**
     * Indexes every modified file of the given project and commits the result.
     * Files are added to the server one at a time; a failure on one file is logged
     * and does not stop the remaining files from being processed.
     *
     * @param project the project to index
     * @param monitor polled for cancellation before the project and before each file
     * @throws OperationCanceledException if the monitor reports cancellation
     */
    private void indexProject(WebProject project, SubMonitor monitor) {
        if (logger.isDebugEnabled())
            logger.debug("Indexing project id: " + project.getId() + " name: " + project.getName()); //$NON-NLS-1$ //$NON-NLS-2$
        checkCanceled(monitor);
        IFileStore projectStore;
        try {
            projectStore = project.getProjectStore();
        } catch (CoreException e) {
            //TODO implement indexing of remote content
            handleIndexingFailure(e);
            return;
        }
        //project location is always a directory
        IPath projectLocation = new Path(Activator.LOCATION_FILE_SERVLET).append(project.getId())
                .addTrailingSeparator();
        //length of the project's own URI, used below to cut each file URI down to its project-relative part
        int projectLocationLength = projectStore.toURI().toString().length();
        //gather all files
        final List<IFileStore> toIndex = new ArrayList<IFileStore>();
        collectFiles(projectStore, toIndex);
        int unmodifiedCount = 0, indexedCount = 0;
        //the users are the same for every file in the project, so look them up once
        List<String> users = findUsers(projectLocation);
        //add each file to the index
        for (IFileStore file : toIndex) {
            checkCanceled(monitor);
            IFileInfo fileInfo = file.fetchInfo();
            if (!isModified(file, fileInfo)) {
                unmodifiedCount++;
                continue;
            }
            indexedCount++;
            String fileURI = file.toURI().toString();
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField(ProtocolConstants.KEY_ID, fileURI);
            doc.addField(ProtocolConstants.KEY_NAME, fileInfo.getName());
            doc.addField(ProtocolConstants.KEY_LENGTH, Long.toString(fileInfo.getLength()));
            doc.addField(ProtocolConstants.KEY_DIRECTORY, Boolean.toString(fileInfo.isDirectory()));
            doc.addField(ProtocolConstants.KEY_LAST_MODIFIED, Long.toString(fileInfo.getLastModified()));
            //we add the server-relative location so the server can be moved without affecting the index
            IPath fileLocation = projectLocation.append(fileURI.substring(projectLocationLength));
            doc.addField(ProtocolConstants.KEY_LOCATION, fileLocation.toString());
            doc.addField("Text", getContentsAsString(file)); //$NON-NLS-1$
            if (users != null)
                for (String user : users)
                    doc.addField(ProtocolConstants.KEY_USER_NAME, user);
            try {
                server.add(doc);
            } catch (Exception e) {
                handleIndexingFailure(e);
            }
        }
        try {
            server.commit();
        } catch (Exception e) {
            handleIndexingFailure(e);
        }
        if (logger.isDebugEnabled())
            logger.debug("\tIndexed: " + indexedCount + " Unchanged:  " + unmodifiedCount); //$NON-NLS-1$ //$NON-NLS-2$
    }

    /**
     * Returns the users the authorization service reports as having rights on the
     * given project location. The caller treats a <code>null</code> result as "no users".
     */
    private List<String> findUsers(IPath projectLocation) {
        return AuthorizationService.findUserWithRights(projectLocation.toString());
    }

    /**
     * Returns whether the given file should be left out of the index: it is either
     * larger than {@link #MAX_SEARCH_SIZE} bytes, has no extension, or has an
     * extension that is not in {@link #INDEXED_FILE_TYPES}.
     */
    private boolean skip(IFileInfo fileInfo) {
        if (fileInfo.getLength() > MAX_SEARCH_SIZE)
            return true;
        String extension = new Path(fileInfo.getName()).getFileExtension();
        if (extension == null)
            return true;
        //use a fixed locale: the default-locale lower-casing of e.g. "INI" under a Turkish locale would not match
        return !INDEXED_FILE_TYPES.contains(extension.toLowerCase(Locale.ENGLISH));
    }

    /**
     * Returns whether the given file needs to be (re-)indexed. The index is queried
     * for a document with this file's id and its current last-modified timestamp;
     * the file counts as modified when no such document is found. If the query
     * itself fails, the failure is logged and the file is reported as modified.
     *
     * @param file the file to check
     * @param fileInfo the current info of that file
     */
    private boolean isModified(IFileStore file, IFileInfo fileInfo) {
        try {
            //if there is no match, then the file last modified doesn't match last index so assume it was modified
            StringBuilder qString = new StringBuilder(ProtocolConstants.KEY_ID);
            qString.append(':');
            qString.append(ClientUtils.escapeQueryChars(file.toURI().toString()));
            qString.append(" AND "); //$NON-NLS-1$
            qString.append(ProtocolConstants.KEY_LAST_MODIFIED);
            qString.append(':');
            qString.append(Long.toString(fileInfo.getLastModified()));
            SolrQuery query = new SolrQuery(qString.toString());
            //only the id field is requested; just the hit count is used
            query.setParam(CommonParams.FL, ProtocolConstants.KEY_ID);
            QueryResponse response = server.query(query);
            return response.getResults().getNumFound() == 0;
        } catch (SolrServerException e) {
            handleIndexingFailure(e);
            //attempt to re-index
            return true;
        }
    }

    /**
     * Throws an {@link OperationCanceledException} if the given monitor has been canceled.
     */
    private void checkCanceled(IProgressMonitor monitor) {
        if (monitor.isCanceled())
            throw new OperationCanceledException();
    }

    /**
     * Indexes all projects and then reschedules this job. If indexing is canceled,
     * the {@link OperationCanceledException} propagates out of this method and the
     * rescheduling below is not reached.
     */
    @Override
    protected IStatus run(IProgressMonitor monitor) {
        long start = System.currentTimeMillis();
        List<WebProject> projects = WebProject.allProjects();
        SubMonitor progress = SubMonitor.convert(monitor, projects.size());
        for (WebProject project : projects) {
            indexProject(project, progress.newChild(1));
        }
        long duration = System.currentTimeMillis() - start;
        if (logger.isDebugEnabled())
            logger.debug("Indexed " + projects.size() + " projects in " + duration + "ms"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
        //reschedule the indexing - throttle so the job never runs more than 5% of the time
        long delay = Math.max(DEFAULT_DELAY, duration * 20);
        if (logger.isDebugEnabled())
            logger.debug("Rescheduling indexing in " + delay + "ms"); //$NON-NLS-1$//$NON-NLS-2$
        schedule(delay);
        return Status.OK_STATUS;
    }
}