info.extensiblecatalog.OAIToolkit.oai.dataproviders.LuceneFacadeDataProvider.java Source code

Introduction

Here is the source code for info.extensiblecatalog.OAIToolkit.oai.dataproviders.LuceneFacadeDataProvider.java
Source

/**
  * Copyright (c) 2009 University of Rochester
  *
  * This program is free software; you can redistribute it and/or modify it under the terms of the MIT/X11 license. The text of the  
  * license can be found at http://www.opensource.org/licenses/mit-license.php and copy of the license can be found on the project
  * website http://www.extensiblecatalog.org/. 
  *
  */

package info.extensiblecatalog.OAIToolkit.oai.dataproviders;

import java.io.IOException;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.Version;

import info.extensiblecatalog.OAIToolkit.DTOs.DataTransferObject;
import info.extensiblecatalog.OAIToolkit.DTOs.RecordDTO;
import info.extensiblecatalog.OAIToolkit.DTOs.ResumptionTokenDTO;
import info.extensiblecatalog.OAIToolkit.DTOs.SetToRecordDTO;
import info.extensiblecatalog.OAIToolkit.db.ResumptionTokensMgr;
import info.extensiblecatalog.OAIToolkit.utils.ApplInfo;
import info.extensiblecatalog.OAIToolkit.utils.Logging;
import info.extensiblecatalog.OAIToolkit.utils.TextUtil;

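/**
 * Lucene-backed data provider: answers single-record lookups and paged record
 * listings from the index reached through {@code ApplInfo.luceneSearcher}.
 * Listings are served either from a per-request Lucene search or from a
 * statically cached set of document ids (the "cached full harvest").
 *
 * @author Peter Kiraly
 *
 */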
public class LuceneFacadeDataProvider extends BasicFacadeDataProvider implements FacadeDataProvider {

    /** The programmer's log object */
    // NOTE: programmer_log must stay declared before prglog; prglog's initializer reads it.
    private static String programmer_log = "programmer";
    private static final Logger prglog = Logging.getLogger(programmer_log);
    //private static Logger logger = Logging.getLogger();

    /** Manager of resumption_token records */
    private static ResumptionTokensMgr tokenMgr = new ResumptionTokensMgr();

    /** Lucene query for the current request; built from the request parameters or restored from a stored resumption token. */
    private String queryString;
    /** Result of the standard (non-cached) search run in prepareQuery(); set to null there when the search throws. */
    private TopDocs hits;
    /** Position of the next record to hand out; nextRecord() increments it on every call. */
    private int currentRecord;
    /** Exclusive upper bound for currentRecord within the current page. */
    private int lastRecord;
    /** Timing accumulators (milliseconds); reset by selectRecords() and added to by nextRecord(). */
    private long getIdTime = 0;
    private long doc2RecordTime = 0;
    private long getDocTime = 0;

    // we want to keep a full harvest in memory for fast initial harvesting (first/initial harvest since server started)
    // One bit per Lucene document id that matched the full-harvest query; null until initializeCachedFullHarvest() succeeds.
    static private BitSet cachedFullHarvestIds = null;
    // Latest datestamp of the index at the time the cache was built (Lucene date format).
    static private String cachedFullHarvestExpiry = null;
    // Earliest datestamp of the index at the time the cache was built.
    static private Date cachedFullHarvestEarliestDate = null;
    // Searcher over a clone of the index reader; used to load the documents named by cachedFullHarvestIds.
    static private IndexSearcher cachedFullHarvestIndexSearcher = null;
    // Resumption tokens issued for requests that were served from the cache.
    // NOTE(review): plain HashSet in static state, read and written without synchronization — confirm the threading model.
    static private Set<String> cachedFullHarvestTokenIds = new HashSet<String>();
    // vars used to handle cachedFullHarvest
    // true when the current request is served from the cached full harvest
    private boolean cachedFullHarvest = false;
    // bit index in cachedFullHarvestIds from which the next set bit is looked up
    private int tempIndex;

    /**
     * Builds the static "cached full harvest" once: a {@link BitSet} holding the Lucene
     * document id of every non-deleted record (restricted to the configured organization
     * code, if any), plus a searcher over a cloned index reader and the index's earliest and
     * latest datestamps.
     *
     * <p>Does nothing when the cache already exists. On any failure the error is logged, the
     * cloned reader is closed again and none of the static cache fields are modified, so a
     * later call can retry from a clean state. {@code cachedFullHarvestIds} is assigned last,
     * after everything else has been set up successfully.
     */
    synchronized static public void initializeCachedFullHarvest() {
        if (cachedFullHarvestIds != null) {
            return;
        }

        final IndexReader indexReader;
        try {
            indexReader = ApplInfo.luceneSearcher.getIndexReader().clone(true);
        } catch (CorruptIndexException e1) {
            prglog.error("[PRG] " + e1);
            return;
        } catch (IOException e1) {
            prglog.error("[PRG] " + e1);
            return;
        }

        boolean cacheBuilt = false;
        try {
            final Date earliestDate = TextUtil
                    .luceneToDate(ApplInfo.luceneSearcher.getEarliestDatestamp());
            final String expiry = ApplInfo.luceneSearcher.getLatestDatestamp();

            BooleanQuery query = new BooleanQuery();

            // don't include deleted records
            query.add((Query) new TermQuery(new Term("is_deleted", "false")), Occur.MUST);

            // do we need to filter based on orgCode?
            if (ApplInfo.getOrgCodeFilter() != null) {
                query.add((Query) new TermQuery(new Term("repository_code", ApplInfo.getOrgCodeFilter())),
                        Occur.MUST);
            }

            // Collect into a local bit set; the static field is only assigned once the search has finished.
            final BitSet ids = new BitSet(indexReader.maxDoc());
            final IndexSearcher searcher = new IndexSearcher(indexReader);
            searcher.search(query, new Collector() {
                private int docBase;

                // ignore scorer
                @Override
                public void setScorer(Scorer scorer) {
                }

                // accept docs out of order (for a BitSet it doesn't matter)
                @Override
                public boolean acceptsDocsOutOfOrder() {
                    return true;
                }

                // doc is relative to the current segment reader; docBase makes it index-wide
                @Override
                public void collect(int doc) {
                    ids.set(doc + docBase);
                }

                @Override
                public void setNextReader(IndexReader reader, int docBase) {
                    this.docBase = docBase;
                }
            });

            cachedFullHarvestIndexSearcher = searcher;
            cachedFullHarvestEarliestDate = earliestDate;
            cachedFullHarvestExpiry = expiry;
            cachedFullHarvestIds = ids;
            cacheBuilt = true;

            prglog.info("[PRG] Initial Full Harvest Cache created successfully.");

        } catch (ParseException pe) {
            prglog.error("[PRG] " + pe);
        } catch (IOException e) {
            prglog.error("[PRG] " + e);
        } finally {
            if (!cacheBuilt) {
                // The clone is ours; without this every failed attempt would leave one more reader open.
                try {
                    indexReader.close();
                } catch (IOException closeError) {
                    prglog.error("[PRG] " + closeError);
                }
            }
        }
    }

    /**
     * Returns the earliest datestamp reported by the Lucene searcher, converted with
     * {@code TextUtil.luceneToTimestamp} and {@code TextUtil.timestampToUTC}.
     *
     * @return the converted datestamp, or an empty string when any exception occurs
     *         (the exception is logged)
     */
    public String getEarliestDatestamp() {
        String earliestUtc;
        try {
            earliestUtc = TextUtil.timestampToUTC(
                    TextUtil.luceneToTimestamp(ApplInfo.luceneSearcher.getEarliestDatestamp()));
        } catch (Exception e) {
            prglog.error("[PRG] " + e);
            earliestUtc = "";
        }
        return earliestUtc;
    }

    /**
     * Looks up a single record by its XC OAI identifier.
     *
     * @param xcOaiId the identifier passed to {@code getRecordByXcOaiID}
     * @return a list holding the matching record, or an empty list when no document is
     *         found or the document's {@code repository_code} does not equal the
     *         configured organization code filter
     */
    public List<DataTransferObject> getRecord(String xcOaiId) {
        List<DataTransferObject> list = new ArrayList<DataTransferObject>();

        // the searcher reports the Lucene document id through this one-element array
        Integer docId[] = new Integer[1];
        Document doc = ApplInfo.luceneSearcher.getRecordByXcOaiID(xcOaiId, docId);
        if (doc == null) {
            return list;
        }

        // make sure this record is part of the orgCode subset!
        // Comparing from the filter side keeps a document without a repository_code
        // field from raising a NullPointerException; such a document simply doesn't match.
        final String orgCode = ApplInfo.getOrgCodeFilter();
        if (orgCode != null && !orgCode.equals(doc.get("repository_code"))) {
            return list;
        }

        list.add(doc2RecordDTO(doc, docId[0]));
        return list;
    }

    /**
     * Looks up a single record by id.
     *
     * <p>{@code recordType} is only logged and {@code filter} is not used; the lookup
     * itself is done by id alone.
     *
     * @param id         the id passed to {@code getRecordByID}; also used as the id of the returned DTO
     * @param recordType logged, otherwise unused here
     * @param filter     unused here
     * @return a list holding the matching record, or an empty list when no document is
     *         found or the document's {@code repository_code} does not equal the
     *         configured organization code filter
     */
    public List<DataTransferObject> getRecord(Integer id, Integer recordType, List<String> filter) {
        prglog.info("[PRG] id: " + id + ", recordType: " + recordType);
        List<DataTransferObject> list = new ArrayList<DataTransferObject>();

        Document doc = ApplInfo.luceneSearcher.getRecordByID(id);
        if (doc == null) {
            return list;
        }

        // make sure this record is part of the orgCode subset!
        // Comparing from the filter side keeps a document without a repository_code
        // field from raising a NullPointerException; such a document simply doesn't match.
        final String orgCode = ApplInfo.getOrgCodeFilter();
        if (orgCode != null && !orgCode.equals(doc.get("repository_code"))) {
            return list;
        }

        list.add(doc2RecordDTO(doc, id));
        return list;
    }

    /**
     * Positions the record cursor at the start of the current page and resets the timing
     * accumulators. Delegates to {@link #selectRecordsCachedFullHarvest()} when the request
     * is served from the cached full harvest.
     *
     * <p>For a standard query the page holds at most {@code recordLimit} records and never
     * more than the search returned. When the search in {@code prepareQuery()} failed
     * ({@code hits} is null) the page is empty instead of raising a NullPointerException.
     */
    public void selectRecords() {
        if (cachedFullHarvest) {
            selectRecordsCachedFullHarvest();
            return;
        }

        final int available = (hits == null) ? 0 : hits.scoreDocs.length;
        lastRecord = recordLimit;
        if (lastRecord > available) {
            lastRecord = available;
        }
        currentRecord = 0; // count each iteration

        getIdTime = 0;
        doc2RecordTime = 0;
        getDocTime = 0;
    }

    /**
     * Positions the record cursor for a page served from the cached full harvest and resets
     * the timing accumulators.
     *
     * <p>The page covers the cached records numbered {@code offset} (inclusive) to
     * {@code offset + recordLimit} (exclusive), capped at the number of cached records.
     * {@code tempIndex} is set to the bit index of the {@code offset}-th set bit (counting
     * from zero), or to -1 when the cache holds no more than {@code offset} records. The
     * search stops at the end of the bit set rather than starting over from bit 0.
     */
    public void selectRecordsCachedFullHarvest() {
        final int totalCached = cachedFullHarvestIds.cardinality();
        lastRecord = offset + recordLimit;
        if (lastRecord > totalCached) {
            lastRecord = totalCached;
        }
        currentRecord = offset; // count each iteration

        // Walk from the first set bit and skip `offset` set bits. Document ids in the bit set
        // are not contiguous, so the position cannot be computed directly from the offset.
        int bit = cachedFullHarvestIds.nextSetBit(0);
        for (int skipped = 0; skipped < offset && bit >= 0; skipped++) {
            bit = cachedFullHarvestIds.nextSetBit(bit + 1);
        }
        tempIndex = bit; // keep track of the current bit (not always incremental!)

        getIdTime = 0;
        doc2RecordTime = 0;
        getDocTime = 0;
    }

    /**
     * Tells whether the current page still has a record to hand out.
     *
     * @return true while the cursor is below the page's upper bound
     */
    public boolean hasNextRecord() {
        return lastRecord > currentRecord;
    }

    /**
     * Tells whether records remain beyond the current page.
     *
     * <p>For a standard query, {@code prepareQuery()} asks for {@code recordLimit + 1} hits,
     * so more than {@code recordLimit} hits means another page exists. When that search
     * failed ({@code hits} is null) there is nothing more to return.
     *
     * @return true when at least one further record exists after this page
     */
    public boolean hasMoreRecords() {
        if (cachedFullHarvest) {
            return hasMoreRecordsCachedFullHarvest();
        }
        return hits != null && hits.scoreDocs.length > recordLimit;
    }

    /**
     * Cached-full-harvest variant of {@code hasMoreRecords()}.
     *
     * @return true when the cache holds more records than the current page's upper bound
     */
    public boolean hasMoreRecordsCachedFullHarvest() {
        final int totalCached = cachedFullHarvestIds.cardinality();
        return lastRecord < totalCached;
    }

    /**
     * Returns the next record of the current page and advances the cursor. Delegates to
     * {@link #nextRecordCachedFullHarvest()} when serving from the cached full harvest.
     *
     * <p>The three timing accumulators are updated separately: looking up the document id
     * ({@code getIdTime}), loading the document ({@code getDocTime}) and converting it to a
     * DTO ({@code doc2RecordTime}).
     *
     * @return the record, or null when anything goes wrong (the exception is logged and the
     *         cursor still advances)
     */
    public DataTransferObject nextRecord() {
        if (cachedFullHarvest) {
            return nextRecordCachedFullHarvest();
        }
        RecordDTO recordDTO = null;
        try {
            final long idStart = System.currentTimeMillis();
            final int id = hits.scoreDocs[currentRecord].doc;
            final long docStart = System.currentTimeMillis();
            getIdTime += (docStart - idStart);

            Document doc = ApplInfo.luceneSearcher.getDoc(id);
            final long convertStart = System.currentTimeMillis();
            getDocTime += (convertStart - docStart);

            recordDTO = doc2RecordDTO(doc, id);
            doc2RecordTime += (System.currentTimeMillis() - convertStart);
        } catch (Exception e) {
            prglog.error("[PRG] " + e);
        }
        currentRecord++;
        return recordDTO;
    }

    /**
     * Returns the next record of a page served from the cached full harvest and advances
     * both the record cursor and the bit cursor.
     *
     * <p>The three timing accumulators are updated separately: finding the next document id
     * in the bit set ({@code getIdTime}), loading the document ({@code getDocTime}) and
     * converting it to a DTO ({@code doc2RecordTime}).
     *
     * @return the record, or null when anything goes wrong (the exception is logged and the
     *         record cursor still advances)
     */
    public DataTransferObject nextRecordCachedFullHarvest() {
        RecordDTO recordDTO = null;
        try {
            final long idStart = System.currentTimeMillis();
            final int id = cachedFullHarvestIds.nextSetBit(tempIndex);
            if (id >= 0) {
                // Advance before loading the document, so that a document that fails to load
                // is skipped rather than retried for every remaining record of the page.
                tempIndex = id + 1;
            }
            final long docStart = System.currentTimeMillis();
            getIdTime += (docStart - idStart);

            Document doc = cachedFullHarvestIndexSearcher.doc(id);
            final long convertStart = System.currentTimeMillis();
            getDocTime += (convertStart - docStart);

            recordDTO = doc2RecordDTO(doc, id);
            doc2RecordTime += (System.currentTimeMillis() - convertStart);
        } catch (Exception e) {
            prglog.error("[PRG] " + e);
        }
        currentRecord++;
        return recordDTO;
    }

    /**
     * Returns the set membership of a record, looked up by id.
     *
     * @param recordId the id passed to {@code getRecordByID}
     * @return a list with one set-to-record DTO, or an empty list when the DTO could not
     *         be built
     */
    public List<DataTransferObject> getSetsOfRecord(Integer recordId) {
        final Document luceneDoc = ApplInfo.luceneSearcher.getRecordByID(recordId);
        final List<DataTransferObject> memberships = new ArrayList<DataTransferObject>();
        final DataTransferObject membership = doc2SetToRecordDTO(luceneDoc, recordId);
        if (membership != null) {
            memberships.add(membership);
        }
        return memberships;
    }

    /**
     * Returns the set membership of a record, looked up by id and record type.
     *
     * <p>Only the first result of {@code getRecordByIDAndRecordType} is used; each result
     * is a pair whose element 0 is logged as the document id and whose element 1 is the
     * document.
     *
     * @param recordId   the record id
     * @param recordType the record type
     * @return a list with one set-to-record DTO, or an empty list when the lookup returns
     *         nothing or the DTO could not be built
     */
    public List<DataTransferObject> getSetsOfRecord(Integer recordId, Integer recordType) {
        List<DataTransferObject> sets = new ArrayList<DataTransferObject>();

        List<Object[]> docs = ApplInfo.luceneSearcher.getRecordByIDAndRecordType(recordId, recordType);
        if (docs == null || docs.isEmpty()) {
            // Nothing matched: report "no sets" instead of failing on docs.get(0).
            prglog.info("[PRG] no document found for recordId: " + recordId + ", recordType: " + recordType);
            return sets;
        }

        Object[] pair = docs.get(0);
        prglog.info("[PRG] docId: " + pair[0]);
        Document doc = (Document) pair[1];
        prglog.info("[PRG] doc: " + doc);

        DataTransferObject dto = doc2SetToRecordDTO(doc, recordId);
        if (dto != null) {
            sets.add(dto);
        }
        return sets;
    }

    /**
     * Fetches the XML of a record from the Lucene searcher.
     *
     * @param recordId   the record id
     * @param recordType the record type
     * @return whatever {@code luceneSearcher.getXmlOfRecord} returns for the pair
     */
    public String getXmlOfRecord(Integer recordId, Integer recordType) {
        final String xml = ApplInfo.luceneSearcher.getXmlOfRecord(recordId, recordType);
        return xml;
    }

    /**
     * Prepares the record listing for the current request.
     *
     * <p>With a resumption token, the stored query and metadata prefix are restored from
     * the token; the request continues on the cached full harvest when the token was
     * issued for it. Without a token, the query is built from {@code from}, {@code until}
     * and {@code set}, and the cached full harvest is chosen when no set is requested and
     * the requested date range covers the whole cache. Otherwise a standard Lucene search
     * sorted by {@code xc_id} is run for {@code recordLimit + 1} hits; when that search
     * throws, the exception is logged and {@code hits} is set to null.
     *
     * <p>Side effects: may set {@code badResumptionTokenError}, {@code queryString},
     * {@code metadataPrefix}, {@code until}, {@code cachedFullHarvest} and {@code hits}.
     *
     * @return 1 when the cached full harvest is used, 0 for a standard query, -1 when the
     *         resumption token is not registered for the cached full harvest while
     *         {@code initialHarvest == 1}
     */
    public int prepareQuery() {
        if (null != tokenId) {
            ResumptionTokenDTO tokenDTO = getSQLsFromResumptionToken(tokenId);
            if (tokenDTO == null) {
                badResumptionTokenError = true;
            } else {
                queryString = tokenDTO.getQuery();
                metadataPrefix = tokenDTO.getMetadataPrefix();
            }
            if (cachedFullHarvestTokenIds.contains(tokenId)) {
                cachedFullHarvest = true;
            } else if (initialHarvest == 1) {
                // Uh-oh.  This harvester using this resumption token had used the cached full harvest prior to this server's restart
                // This means it's STALE.  We need to throw an exception here.
                //TODO: throw invalid resumption token error
                prglog.warn(
                        "[PRG] A prior harvester is attempting to harvest via STALE (no longer viable) cached full harvest resumptionToken.");
                return -1;
            }
        } else {
            extractQueriesFromParameters(from, until, set);
            // queryString can still be null here when the query could not be built
            if (queryString == null || 0 >= queryString.length()) {
                prglog.error("[PRG] query string is null");
            }

            // Can we use the cached full harvest? (fast!)
            initializeCachedFullHarvest();
            if (cachedFullHarvestIds == null) {

                prglog.warn("[PRG] The cached full harvest was not created for some reason (???)");

            } else if (set == null) {
                // All three flags start pessimistic; the cache is used only when every one is cleared.
                boolean fromIsTooRecent = true;
                boolean untilIsTooRecent = true;
                boolean untilIsTooOld = true;
                if (until == null) {
                    until = TextUtil.nowInUTC();
                }

                try {
                    // Any record modified after the cache was built and before "until" would be
                    // missing from the cache.
                    String expiryRangeQuery = "+modification_date:{\"" + cachedFullHarvestExpiry + "\" TO \""
                            + TextUtil.utcToMysqlTimestamp(until) + "\"}";
                    TopDocs h = ApplInfo.luceneSearcher.search(expiryRangeQuery);
                    if (h.totalHits < 1) {
                        untilIsTooRecent = false;
                    }

                    // "until" must lie after the latest datestamp captured with the cache
                    Date uts = TextUtil.utcToDate(until);
                    Date lts = TextUtil.luceneToDate(cachedFullHarvestExpiry);
                    if (uts.after(lts)) {
                        untilIsTooOld = false;
                    }
                } catch (ParseException pe) {
                    prglog.error("[PRG] " + pe);
                }

                if (from == null) {
                    fromIsTooRecent = false;
                } else {
                    try {
                        // "from" must lie before the earliest datestamp captured with the cache
                        Date fts = TextUtil.utcToDate(from);
                        if (fts.before(cachedFullHarvestEarliestDate)) {
                            fromIsTooRecent = false;
                        }
                    } catch (ParseException pe) {
                        prglog.error("[PRG] " + pe);
                    }
                }
                if (!fromIsTooRecent && !untilIsTooRecent && !untilIsTooOld) {
                    cachedFullHarvest = true;
                }
                prglog.info("fromIsTooRecent:" + fromIsTooRecent + " untilIsTooRecent:" + untilIsTooRecent
                        + " untilIsTooOld:" + untilIsTooOld);
            }
        }

        if (cachedFullHarvest) {

            prglog.info("[PRG] We are using the cached full harvest for extra speed! (That's good!)");

            // for all others, we perform the search each time
        } else {

            prglog.info("[PRG] We are not using the cached full harvest. (Standard query.)");

            Sort sort = new Sort(new SortField("xc_id", SortField.INT));
            try {
                // query recordLimit+1 (one extra) so that way we'll know if we're done with our list
                if (lastRecordRead > 0) {
                    String lastReadXcId = String.format("%d", lastRecordRead);
                    hits = ApplInfo.luceneSearcher.searchRange(queryString, "xc_id", Integer.valueOf(lastReadXcId),
                            null, false, false, sort, recordLimit + 1);
                } else {
                    hits = ApplInfo.luceneSearcher.search(queryString, sort, recordLimit + 1);
                }
            } catch (Exception ex) {
                // Keep the best-effort behaviour (no hits), but leave a trace of the cause.
                prglog.error("[PRG] " + ex);
                hits = null;
            }
        }

        return cachedFullHarvest ? 1 : 0;

    }

    /**
     * Counts all records matching the current query, independent of paging.
     *
     * <p>For a standard query the query string is parsed with a {@link KeywordAnalyzer}
     * (default field {@code id}) and the count is the number of bits returned by
     * {@code searchForBits}.
     *
     * @return the number of matching records; 0 when there is no query string or it cannot
     *         be parsed (the problem is logged)
     */
    public int getTotalRecordCount() {
        if (cachedFullHarvest) {
            return getTotalRecordCountCachedFullHarvest();
        }
        if (queryString == null) {
            prglog.error("[PRG] query string is null");
            return 0;
        }

        final Sort sort = null; // no ordering needed for counting
        final QueryParser parser = new QueryParser(Version.LUCENE_30, "id", new KeywordAnalyzer());
        final Query query;
        try {
            query = parser.parse(queryString);
        } catch (org.apache.lucene.queryParser.ParseException e) {
            prglog.error("[PRG] " + e);
            return 0;
        }
        BitSet ids = ApplInfo.luceneSearcher.searchForBits(query, sort);
        return ids.cardinality();
    }

    /**
     * Cached-full-harvest variant of {@code getTotalRecordCount()}.
     *
     * @return the number of document ids held in the cached full harvest
     */
    public int getTotalRecordCountCachedFullHarvest() {
        final int totalCached = cachedFullHarvestIds.cardinality();
        return totalCached;
    }

    /**
     * Returns the metadata prefix of the current request. When none is set yet and a
     * resumption token is present, the prefix stored with the token is loaded and kept.
     *
     * @return the metadata prefix, or null when it is neither set nor recoverable from
     *         the resumption token
     */
    public String getMetadataPrefix() {
        if (metadataPrefix == null && tokenId != null) {
            final ResumptionTokenDTO storedToken = getSQLsFromResumptionToken(tokenId);
            if (storedToken != null) {
                metadataPrefix = storedToken.getMetadataPrefix();
            }
        }
        return metadataPrefix;
    }

    /**
     * Loads the stored resumption token record whose id is the given token string.
     *
     * @param resumptionToken the token as received; expected to be the decimal id of a
     *                        stored token record
     * @return the stored token, or null when the token is not a valid integer, no record
     *         is found, or the lookup fails (failures are logged)
     */
    private ResumptionTokenDTO getSQLsFromResumptionToken(String resumptionToken) {

        final Integer tokenRecordId;
        try {
            tokenRecordId = Integer.valueOf(resumptionToken);
        } catch (NumberFormatException e) {
            // A malformed token is treated like an unknown token rather than left to propagate.
            prglog.info("[PRG] token not found");
            return null;
        }

        ResumptionTokenDTO tokenDTO = new ResumptionTokenDTO();
        tokenDTO.setId(tokenRecordId);
        try {
            List<DataTransferObject> tokens = tokenMgr.get(tokenDTO);
            if (null == tokens) {
                prglog.info("[PRG] token not found");
                return null;
            }
            prglog.info("[PRG] tokens.size: " + tokens.size());
            if (tokens.isEmpty()) {
                prglog.info("[PRG] token not found");
                return null;
            }
            return (ResumptionTokenDTO) tokens.get(0);
        } catch (Exception e) {
            prglog.error("[PRG] " + e);
            return null;
        }
    }

    /**
     * Builds the Lucene query string for a request without resumption token and stores it
     * in {@code queryString}.
     *
     * <p>The query combines: {@code +is_deleted:false} when no {@code from} is given; an
     * inclusive {@code modification_date} range from {@code from} (or the first
     * {@code modification_date} term of the index) to {@code until} (or now); the set id
     * when {@code set} is a known set name; and the organization code filter when one is
     * configured.
     *
     * <p>{@code queryString} is reset to the empty string first, so that a date that cannot
     * be parsed leaves an empty query behind rather than null or the query of an earlier
     * call.
     *
     * @param from  lower date bound in UTC form, or null
     * @param until upper date bound in UTC form, or null for "now"
     * @param set   set name, or null; a name not present in {@code ApplInfo.setIdsByName}
     *              adds no clause
     */
    private void extractQueriesFromParameters(String from, String until, String set) {
        queryString = "";

        StringBuilder queryBuffer = new StringBuilder();
        if (null == from) {
            // if this is a clean harvest, there is no need to serve
            // deleted records
            queryBuffer.append("+is_deleted:false");
        }

        // if until is not set, we set it implicitly to "now"
        try {
            if (null == until) {
                until = TextUtil.utcToMysqlTimestamp(TextUtil.nowInUTC());
            } else {
                until = TextUtil.utcToMysqlTimestamp(until);
            }
        } catch (ParseException e) {
            prglog.error("[PRG]" + e);
            return;
        }

        if (null != from || null != until) {
            prglog.info("[PRG] " + from + ", " + until);
            if (null == from) {
                from = ApplInfo.luceneSearcher.showFirstTerm("modification_date");
            } else {
                try {
                    from = TextUtil.utcToMysqlTimestamp(from);
                } catch (ParseException e) {
                    prglog.error("[PRG]" + e);
                    return;
                }
            }
            prglog.info("[PRG] " + from + ", " + until);

            if (queryBuffer.length() > 0) {
                queryBuffer.append(" AND ");
            }

            queryBuffer.append("+modification_date:[\"" + from + "\" TO \"" + until + "\"]");
        }
        if (null != set) {
            // checking the set's existence
            if (ApplInfo.setIdsByName.containsKey(set)) {
                int setId = ApplInfo.setIdsByName.get(set);
                if (queryBuffer.length() > 0) {
                    queryBuffer.append(" AND ");
                }
                queryBuffer.append("+set:" + setId);
            }
        }

        // do we need to filter by orgCode?
        if (ApplInfo.getOrgCodeFilter() != null) {
            queryBuffer.append(" AND +repository_code:\"" + ApplInfo.getOrgCodeFilter() + "\"");
        }

        queryString = queryBuffer.toString();
        prglog.info("[PRG] " + queryString);
    }

    /**
     * Stores the current query and metadata prefix as a new resumption token record and
     * returns its id as the token string. When the request is served from the cached full
     * harvest, the token is also registered in {@code cachedFullHarvestTokenIds}.
     *
     * @return the new resumption token, or null when storing fails (the failure is logged)
     */
    public String storeResumptionToken() {
        ResumptionTokenDTO tokenDTO = new ResumptionTokenDTO();
        tokenDTO.setQuery(queryString);
        tokenDTO.setQueryForCount("");
        tokenDTO.setMetadataPrefix(metadataPrefix);
        tokenDTO.setCreationDate(new Timestamp(new Date().getTime()));

        try {
            List<Integer> intids = tokenMgr.insert(tokenDTO);
            prglog.info("intids.get(0)" + intids.get(0));
            String resumptionToken = String.valueOf(intids.get(0));
            prglog.info("Resumption Token: " + resumptionToken);
            // we need to keep track of cached harvests based on resumption token
            if (cachedFullHarvest) {
                cachedFullHarvestTokenIds.add(resumptionToken);
            }

            return resumptionToken;
        } catch (Exception e) {
            // Covers SQLException from the insert as well as an empty id list;
            // logged through the programmer's log like the other failures in this class.
            prglog.error("[PRG] " + e);
        }
        return null;
    }

    /**
     * Converts a Lucene document into a {@link RecordDTO}.
     *
     * <p>The creation and modification dates are copied only when the corresponding field
     * is present; a date that cannot be parsed is logged and the conversion continues. The
     * fields {@code xc_id} and {@code record_type} are parsed as integers.
     *
     * @param doc the Lucene document to read the stored fields from
     * @param id  the id given to the new DTO
     * @return the populated DTO
     */
    private RecordDTO doc2RecordDTO(Document doc, int id) {
        final RecordDTO record = new RecordDTO(id);

        final String created = doc.get("creation_date");
        final String modified = doc.get("modification_date");
        try {
            if (created != null) {
                record.setCreationDate(TextUtil.luceneToTimestamp(created));
            }
            if (modified != null) {
                record.setModificationDate(TextUtil.luceneToTimestamp(modified));
            }
        } catch (ParseException e) {
            prglog.error("[PRG] " + e);
        }

        record.setExternalId(doc.get("external_id"));
        record.setXcOaiId(doc.get("xc_oaiid"));
        record.setXcId(Integer.parseInt(doc.get("xc_id")));
        record.setIsDeleted(Boolean.valueOf(doc.get("is_deleted")));
        record.setRecordType(Integer.parseInt(doc.get("record_type")));
        return record;
    }

    /**
     * Builds the set-to-record link for a record from the document's {@code set} field.
     *
     * @param doc      the Lucene document, may be null
     * @param recordId the record id stored in the DTO
     * @return a DTO carrying the record id and, when {@code doc} is not null, the set id;
     *         null when the {@code set} field cannot be read as an integer
     */
    private SetToRecordDTO doc2SetToRecordDTO(Document doc, Integer recordId) {
        final SetToRecordDTO membership = new SetToRecordDTO();
        membership.setRecordId(recordId.intValue());

        // without a document only the record id is known
        if (doc == null) {
            return membership;
        }

        try {
            membership.setSetId(Integer.parseInt(doc.get("set")));
        } catch (Exception e) {
            prglog.info("[PRG] exception in doc2SetToRecordDTO; Couldn't parse 'set' property; recordId: "
                    + recordId + "\n\ndoc: " + doc + "\n");
            return null;
        }
        return membership;
    }

    /**
     * @return milliseconds accumulated in converting documents to record DTOs since the
     *         last record selection
     */
    public long getDoc2RecordTime() {
        return this.doc2RecordTime;
    }

    /**
     * @return the value of the {@code getIdTime} accumulator (milliseconds) since the last
     *         record selection
     */
    public long getIdTime() {
        return this.getIdTime;
    }

    /**
     * @return the value of the {@code getDocTime} accumulator (milliseconds) since the last
     *         record selection
     */
    public long getDocTime() {
        return this.getDocTime;
    }

}