org.dspace.search.IndexDenIndex.java Source code

Introduction

Here is the source code for org.dspace.search.IndexDenIndex.java
Source

/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.search;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.List;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.flaptor.indextank.apiclient.IndexAlreadyExistsException;
import com.flaptor.indextank.apiclient.IndexDoesNotExistException;
import com.flaptor.indextank.apiclient.InvalidSyntaxException;
import com.flaptor.indextank.apiclient.IndexTankClient;
import com.flaptor.indextank.apiclient.IndexTankClient.Index;
import com.flaptor.indextank.apiclient.IndexTankClient.Query;
import com.flaptor.indextank.apiclient.IndexTankClient.SearchResults;
import com.flaptor.indextank.apiclient.MaximumIndexesExceededException;

import org.apache.lucene.document.DateTools;

import com.google.common.base.Strings;

import org.dspace.core.Constants;
import org.dspace.core.LogManager;

import static org.dspace.search.DSIndexer.*;

/**
 * IndexDenIndex provides indexing and querying services backed by
 * IndexDen, a hosted search-as-a-service index. It is deliberately basic
 * and serves as a demonstrator for using search SaaS.
 * 
 * @author richardrodgers
 */
public class IndexDenIndex implements IndexService {
    private static final Logger log = LoggerFactory.getLogger(IndexDenIndex.class);

    private static final String NOT_ANALYZED = "not-analyzed";
    private static final String ANALYZED = "analyzed";
    private static final String NONE = "no";

    // search field schema - hard-coded here, but could easily be made more configurable
    // RLR - this schema is incomplete and has errors - again, current use is as a demonstrator
    private static final Map<String, FieldConfig> schema = new HashMap<String, FieldConfig>() {
        {
            put(LAST_INDEXED_FIELD, new FieldConfig(LAST_INDEXED_FIELD, "text", true, NOT_ANALYZED));
            put(DOCUMENT_STATUS_FIELD, new FieldConfig(DOCUMENT_STATUS_FIELD, "text", true, NOT_ANALYZED));
            put(DOCUMENT_KEY, new FieldConfig(DOCUMENT_KEY, "text", true, NOT_ANALYZED));
            put("search.resourcetype", new FieldConfig("search.resourcetype", "text", true, NOT_ANALYZED));
            put("search.resourceid", new FieldConfig("search.resourceid", "text", true, NONE));
            put("name", new FieldConfig("name", "text", false, ANALYZED));
            put("default", new FieldConfig("default", "text", false, ANALYZED));
            put("abstract", new FieldConfig("abstract", "text", false, ANALYZED));
            put("author", new FieldConfig("author", "text", false, ANALYZED));
            put("title", new FieldConfig("title", "text", false, ANALYZED));
            put("text", new FieldConfig("text", "text", false, ANALYZED));
            put("location", new FieldConfig("location", "text", false, ANALYZED));
            // following are templates
            put("sort_", new FieldConfig("sort_", "text", false, NOT_ANALYZED));
        }
    };

    private IndexTankClient client;
    private Index index;

    public IndexDenIndex() {
    }

    /**
     * Is stale checks the lastModified time stamp in the database and the index
     * to determine if the index is stale.
     * 
     * @param lastModified
     * @return document status true if index date earlier than lastModified
     * @throws IOException
     */
    @Override
    public boolean isDocumentStale(String documentKey, Date lastModified) throws IOException {

        String queryStr = "handle:" + documentKey;

        try {
            SearchResults results = index.search(Query.forString(queryStr).withFetchFields("timestamp"));
            // compare with lastmodified
            if (results.matches > 0) {
                Map<String, Object> docMap = results.results.get(0);
                String ts = (String) docMap.get("timestamp");
                return (Long.valueOf(ts) < (lastModified.getTime() / 1000));
            }
        } catch (InvalidSyntaxException isE) {
        }
        return true;
    }

    @Override
    public void doTask(IndexingTask task) throws IOException {

        switch (task.getAction()) {
        case DELETE:
            try {
                index.deleteDocument(task.getFieldValue(DOCUMENT_KEY));
            } catch (IndexDoesNotExistException idneE) {
            }
            break;
        case UPDATE:
            // for this simple demo, just add each field individually, then lump all in 'text' field (the default) 
            Map<String, String> fields = new HashMap<>();
            StringBuilder textSb = new StringBuilder();
            // add in fields
            for (String key : task.getFieldKeys()) {
                // get config for field
                FieldConfig fc = schema.get(key);
                if (fc != null) {
                    for (String value : task.getFieldValues(key)) {
                        mapValue(value, fc, fields);
                        // all get mapped to 'text' index
                        textSb.append(value).append(" ");
                    }
                } else {
                    log.error("Invalid field map - field: '" + key + "' undefined in schema");
                }
            }
            mapValue(textSb.toString(), schema.get("text"), fields);
            // likewise any streams - not yet implemented
            /*
            for (String key : task.getStreamKeys()) {
            for (InputStream is : task.getStreamValues(key)) {
               doc.add(new Field("default", new BufferedReader(new InputStreamReader(is))));
            }
            }
            commit(task.getFieldValue(DOCUMENT_KEY), doc, true);
            */
            try {
                /*
                // RLR DEBUG
                for (String fname : fields.keySet()) {
                    log.info("add2doc: " + fname + " :" + fields.get(fname));
                }
                */
                index.addDocument(task.getFieldValue(DOCUMENT_KEY), fields);
            } catch (IndexDoesNotExistException idneE) {
            }
            break;
        case TX_BEGIN:
            // not implemented setBatchProcessingMode(true);
            break;
        case TX_END:
            // not implemented setBatchProcessingMode(false);
            break;
        case PURGE:
            try {
                index.delete();
            } catch (IndexDoesNotExistException idneE) {
            }
            break;
        default:
            break;
        }
    }

    @Override
    public QueryResults doQuery(QueryArgs args) throws IOException {
        String queryStr = args.getQuery();
        QueryResults qr = new QueryResults();
        List<String> hitHandles = new ArrayList<String>();
        List<Integer> hitIds = new ArrayList<Integer>();
        List<Integer> hitTypes = new ArrayList<Integer>();

        // set up the QueryResults object
        qr.setHitHandles(hitHandles);
        qr.setHitIds(hitIds);
        qr.setHitTypes(hitTypes);
        qr.setStart(args.getStart());
        qr.setPageSize(args.getPageSize());
        qr.setEtAl(args.getEtAl());

        // massage the query string a bit
        queryStr = DSQuery.checkEmptyQuery(queryStr); // change nulls to an empty string
        queryStr = DSQuery.stripHandles(queryStr); // remove handles from query string
        queryStr = DSQuery.stripAsterisk(queryStr); // remove asterisk from beginning of string

        try {
            SearchResults results = index.search(Query.forString(queryStr).withFetchFields("handle",
                    "search.resourcetype", "search.resourceid"));
            qr.setHitCount((int) results.matches);
            // We now have a bunch of hits - snip out a 'window'
            // defined in start, count and return the handles
            // from that window, if there are enough hits
            if (args.getStart() < results.matches) {
                // get as many as we can, up to the window size
                // how many are available after snipping off at offset 'start'?
                int hitsRemaining = (int) results.matches - args.getStart();
                int hitsToProcess = (hitsRemaining < args.getPageSize()) ? hitsRemaining : args.getPageSize();

                for (int i = args.getStart(); i < (args.getStart() + hitsToProcess); i++) {
                    Map<String, Object> result = results.results.get(i);

                    String resourceId = (String) result.get("search.resourceid");
                    String resourceType = (String) result.get("search.resourcetype");

                    String handleText = (String) result.get("handle");
                    String handleType = (String) result.get("type");

                    switch (Integer.parseInt(resourceType != null ? resourceType : handleType)) {
                    case Constants.ITEM:
                        hitTypes.add(Constants.ITEM);
                        break;

                    case Constants.COLLECTION:
                        hitTypes.add(Constants.COLLECTION);
                        break;

                    case Constants.COMMUNITY:
                        hitTypes.add(Constants.COMMUNITY);
                        break;
                    }

                    hitHandles.add(handleText);
                    hitIds.add(resourceId == null ? null : Integer.parseInt(resourceId));
                }
            }
        } catch (InvalidSyntaxException isE) {
        }

        return qr;
    }

    @Override
    public void init(String config) {
        // config is API URL
        client = new IndexTankClient(config);
        index = client.getIndex("index1");
        try {
            if (!index.exists()) {
                // RLR TEST only - should likely be false in production
                boolean apiPublic = true;
                //index = client.createIndex("index1", new IndexTankClient.IndexConfiguration().enablePublicSearch(apiPublic));
                index = client.createIndex("index1");
            }
        } catch (IndexAlreadyExistsException e) {
            // just checked - should not see this
            throw new IllegalStateException("Could not create search index: " + e.getMessage(), e);
        } catch (IOException | MaximumIndexesExceededException e) {
            throw new IllegalStateException("Could not create search index: " + e.getMessage(), e);
        }
    }

    private void mapValue(String value, FieldConfig fc, Map<String, String> fields) {
        if ("timestamp".equals(fc.fieldType)) {
            Date date = toDate(value);
            if (date != null) {
                fields.put(fc.fieldName, DateTools.dateToString(date, DateTools.Resolution.SECOND));
                fields.put(fc.fieldName + ".year", DateTools.dateToString(date, DateTools.Resolution.YEAR));
            }
        } else if ("date".equals(fc.fieldType)) {
            Date date = toDate(value);
            if (date != null) {
                fields.put(fc.fieldName, DateTools.dateToString(date, DateTools.Resolution.DAY));
                fields.put(fc.fieldName + ".year", DateTools.dateToString(date, DateTools.Resolution.YEAR));
            }
        } else {
            // all other cases - just add one field with untransformed value
            fields.put(fc.fieldName, value);
        }
    }

    /**
     * Helper function to retrieve a date using a best guess of the potential date encodings on a field
     *  
     * @param t
     * @return
     */
    private static Date toDate(String t) {

        List<String> fmts = new ArrayList<String>();
        // Choose the likely date formats based on string length
        switch (t.length()) {
        case 4:
            fmts.add("yyyy");
            break;
        case 6:
            fmts.add("yyyyMM");
            break;
        case 7:
            fmts.add("yyyy-MM");
            break;
        case 8:
            fmts.add("yyyyMMdd");
            fmts.add("yyyy MMM");
            break;
        case 10:
            fmts.add("yyyy-MM-dd");
            break;
        case 11:
            fmts.add("yyyy MM dd");
            break;
        case 20:
            fmts.add("yyyy-MM-dd'T'HH:mm:ss'Z'");
            break;
        default:
            fmts.add("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
            break;
        }

        for (String fmt : fmts) {
            try {
                // Parse the date
                DateTimeFormatter formatter = DateTimeFormat.forPattern(fmt);
                DateTime dt = formatter.parseDateTime(t);
                return dt.toDate();
            } catch (IllegalArgumentException pe) {
                log.error("Unable to parse date format", pe);
            }
        }
        return null;
    }

    private static class FieldConfig {
        String fieldName;
        String fieldType;
        boolean store;
        String index;

        public FieldConfig(String fieldName, String fieldType, boolean store, String index) {
            this.fieldName = fieldName;
            this.fieldType = fieldType;
            this.store = store;
            this.index = index;
        }
    }
}