au.org.ala.bhl.service.IndexingService.java Source code

Java tutorial

Introduction

Here is the source code for au.org.ala.bhl.service.IndexingService.java

Source

/*******************************************************************************
 * Copyright (C) 2011 Atlas of Living Australia
 * All Rights Reserved.
 *   
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *   
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ******************************************************************************/
package au.org.ala.bhl.service;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.codehaus.jackson.JsonNode;

import au.org.ala.bhl.ItemDescriptor;
import au.org.ala.bhl.ItemStatus;

/**
 * Service object that facades the process of interacting with the SOLR index.
 * <p>
 * The SOLR server is created lazily on first use. When the configured server
 * location starts with <code>http://</code> or <code>https://</code> a remote
 * HTTP client is used; any other non-empty value is treated as the home
 * directory of an embedded SOLR instance.
 * <p>
 * Creation and shutdown of the server are guarded by a private lock.
 * 
 * @author baird
 * 
 */
public class IndexingService extends AbstractService {

    /** Number of pages sent to SOLR between intermediate commits. */
    private static final int COMMIT_INTERVAL = 100;

    /** Any run of four consecutive digits. */
    private static final Pattern SINGLE_YEAR_PATTERN = Pattern.compile("(\\d{4})");
    /** Two four digit years separated by a single non-digit, e.g. "1899-1901". */
    private static final Pattern YEAR_RANGE_PATTERN = Pattern.compile("(\\d{4})\\s*[^\\d]\\s*(\\d{4})");
    /** A four digit year followed by a two digit end year, e.g. "1899-01". */
    private static final Pattern ABBREV_RANGE_PATTERN = Pattern.compile("(\\d{4})\\s*[^\\d]\\s*(\\d{2})");

    private final String _serverURL;
    private final DocumentCacheService _docCache;
    /** Guards {@link #_solrServer} and {@link #_coreContainer}. */
    private final Object _serverLock = new Object();
    private SolrServer _solrServer;
    private CoreContainer _coreContainer;

    /**
     * CTOR
     * 
     * @param serverUrl
     *            either an http(s) URL of a remote SOLR server, or the path of
     *            a local SOLR home directory
     * @param docCache
     *            the document cache that items, pages and metadata are read from
     */
    public IndexingService(String serverUrl, DocumentCacheService docCache) {
        _serverURL = serverUrl;
        _docCache = docCache;
    }

    /**
     * Shuts down the embedded SOLR core container, if one was created. Nothing
     * is released when a remote HTTP server is in use. Calling this method more
     * than once is harmless.
     */
    public void shutdown() {
        log("Shutting down indexing service...");
        synchronized (_serverLock) {
            CoreContainer container = _coreContainer;
            if (container != null) {
                // Clear both references first so that a later call neither
                // reuses the dead server nor shuts the container down twice.
                _solrServer = null;
                _coreContainer = null;
                container.shutdown();
            }
        }
    }

    /**
     * Returns the Solr server API facade, creating it on first use.
     * 
     * @return the shared server instance
     * @throws IllegalStateException
     *             if no server location was configured
     * @throws RuntimeException
     *             wrapping any failure raised while creating the server
     */
    private SolrServer createSolrServer() {
        synchronized (_serverLock) {
            if (_solrServer == null) {
                if (StringUtils.isEmpty(_serverURL)) {
                    throw new IllegalStateException("Neither a local SOLR path or a SOLR HTTP Url was specified!");
                }
                try {
                    if (_serverURL.startsWith("http://") || _serverURL.startsWith("https://")) {
                        _solrServer = new CommonsHttpSolrServer(_serverURL);
                    } else {
                        // Note that the following property could be set through JVM level arguments too
                        System.setProperty("solr.solr.home", _serverURL);
                        CoreContainer.Initializer initializer = new CoreContainer.Initializer();
                        _coreContainer = initializer.initialize();
                        _solrServer = new EmbeddedSolrServer(_coreContainer, "");
                    }
                } catch (Exception ex) {
                    throw new RuntimeException(ex);
                }
            }
            // Read inside the lock so a concurrent shutdown() cannot slip in
            // between creation and return.
            return _solrServer;
        }
    }

    /**
     * Indexes an item that exists in the document cache.
     * <p>
     * Every page of the item is sent to SOLR, with a commit every
     * {@link #COMMIT_INTERVAL} pages and a final commit at the end, after which
     * the item status is set to {@link ItemStatus#INDEXED}. Items with no cache
     * directory, no page archive file or no pages are skipped. Failures while
     * indexing are logged and not propagated to the caller.
     * 
     * @param item
     *            the item to index
     */
    public void indexItem(final ItemDescriptor item) {

        String itemPathStr = _docCache.getItemDirectoryPath(item.getInternetArchiveId());

        final SolrServer server = createSolrServer();

        log("Indexing pages %s for item %s", itemPathStr, item.getItemId());

        try {
            File itemPath = new File(itemPathStr);
            if (!itemPath.exists() || !itemPath.isDirectory()) {
                log("Ignoring item (cache directory not found: %s): %s", itemPathStr, item.getItemId());
                return;
            }

            File archive = _docCache.getPageArchiveFile(item);
            if (!archive.exists()) {
                log("Ignoring partial or empty item (no archive file found): %s", item.getInternetArchiveId());
                return;
            }

            final AtomicInteger pageCount = new AtomicInteger(0);
            _docCache.forEachItemPage(item, new CachedItemPageHandler() {

                public void startItem(String itemId) {
                }

                public void onPage(String iaId, String pageId, String text) {
                    indexPage(item, pageId, text, server);
                    // Every page handed to us is counted, including pages whose
                    // text is empty and which indexPage therefore skips.
                    int pagesSoFar = pageCount.incrementAndGet();
                    if (pagesSoFar % COMMIT_INTERVAL == 0) {
                        try {
                            server.commit();
                        } catch (Exception ex) {
                            throw new RuntimeException(ex);
                        }
                    }
                }

                public void endItem(String itemId) {
                }
            });

            if (pageCount.get() > 0) {
                server.commit();
                getItemsService().setItemStatus(item.getItemId(), ItemStatus.INDEXED, pageCount.get());
                log("%s pages indexed for item: %s", pageCount, item.getItemId());
            } else {
                log("Ignoring empty item (no pages): %s", item.getItemId());
            }

        } catch (Exception ex) {
            // Best effort: a failing item must not abort the caller, but say
            // which item failed rather than only dumping a stack trace.
            log("Failed to index item %s: %s", item.getItemId(), ex);
            ex.printStackTrace();
        }
    }

    /**
     * Send a page of text to SOLR for indexing. Pages with null or empty text
     * are skipped. Item metadata, and title metadata when item metadata is
     * present, are added to the document when the document cache supplies them.
     * 
     * @param item
     *            the item the page belongs to
     * @param pageId
     *            the page identifier, used as the document id
     * @param pageText
     *            the text of the page
     * @param server
     *            the server to add the document to
     * @throws RuntimeException
     *             wrapping any failure raised while adding the document
     */
    private void indexPage(ItemDescriptor item, String pageId, String pageText, SolrServer server) {
        if (StringUtils.isEmpty(pageText)) {
            return;
        }

        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", pageId, 1.0f);
        doc.addField("name", item.getTitle(), 1.0f);
        doc.addField("text", pageText);
        doc.addField("internetArchiveId", item.getInternetArchiveId());
        doc.addField("itemId", item.getItemId());
        doc.addField("pageId", pageId, 1.0f);
        doc.addField("pageUrl", String.format("http://bhl.ala.org.au/pageimage/%s", pageId));

        JsonNode metadata = _docCache.getItemMetaData(item);
        if (metadata != null) {
            addItemMetadata(doc, metadata);
            JsonNode titleData = _docCache.getTitleMetaData(item);
            if (titleData != null) {
                addTitleMetadata(doc, titleData);
            }
        }

        try {
            server.add(doc);
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Copies item level metadata (year range, volume, contributor, source and
     * primary title id) onto the document. Fields that are missing from the
     * metadata are skipped.
     * 
     * @param doc
     *            the document being built
     * @param metadata
     *            the item metadata
     */
    private void addItemMetadata(SolrInputDocument doc, JsonNode metadata) {
        YearRange range = parseYearRange(textValue(metadata, "Year"));
        if (range != null) {
            doc.addField("startYear", range.startYear);
            doc.addField("endYear", range.endYear);
        }

        addField(metadata, "Volume", "volume", doc);
        addField(metadata, "Contributor", "contributor", doc);
        addField(metadata, "Source", "source", doc);

        JsonNode titleId = metadata.get("PrimaryTitleID");
        if (titleId != null) {
            doc.addField("titleId", titleId.asInt());
        }
    }

    /**
     * Copies title level metadata (full title, publisher, authors, subjects and
     * publication year range) onto the document. Fields that are missing from
     * the metadata are skipped.
     * 
     * @param doc
     *            the document being built
     * @param titleData
     *            the title metadata
     */
    private void addTitleMetadata(SolrInputDocument doc, JsonNode titleData) {
        // Full title
        addField(titleData, "FullTitle", "fullTitle", doc);
        // Publisher Name
        addField(titleData, "PublisherName", "publisherName", doc);
        // Publisher Place
        addField(titleData, "PublisherPlace", "publisherPlace", doc);

        JsonNode authors = titleData.get("Authors");

        // Author(s)
        for (String author : selectField(authors, "Name")) {
            doc.addField("author", author);
        }

        // Author ID
        for (String authorId : selectField(authors, "CreatorID")) {
            doc.addField("authorId", authorId);
        }

        // Subject(s)
        for (String subject : selectField(titleData.get("Subjects"), "SubjectText")) {
            doc.addField("subject", subject);
        }

        // Publication Dates
        YearRange range = parseYearRange(textValue(titleData, "PublicationDate"));
        if (range != null) {
            doc.addField("publicationStartYear", range.startYear);
            doc.addField("publicationEndYear", range.endYear);
        }
    }

    /**
     * Adds the text value of a JSON field to the document, unless the field is
     * missing or its text value is null or empty.
     * 
     * @param obj
     *            the JSON object to read from
     * @param jsonfield
     *            the name of the field in the JSON object
     * @param indexField
     *            the name of the field in the index
     * @param doc
     *            the document being built
     */
    private void addField(JsonNode obj, String jsonfield, String indexField, SolrInputDocument doc) {
        String value = textValue(obj, jsonfield);
        if (!StringUtils.isEmpty(value)) {
            doc.addField(indexField, value);
        }
    }

    /**
     * Null-safe lookup of the text value of a field.
     * 
     * @param obj
     *            the JSON object to read from, may be null
     * @param field
     *            the name of the field
     * @return the text value, or null if the object or field is missing or the
     *         field has no text value
     */
    private static String textValue(JsonNode obj, String field) {
        if (obj == null) {
            return null;
        }
        JsonNode node = obj.get(field);
        if (node == null) {
            return null;
        }
        return node.getTextValue();
    }

    /**
     * Collects the non-empty text of one field from every element of a JSON
     * array.
     * 
     * @param parent
     *            the array node, may be null
     * @param textField
     *            the name of the field to read from each element
     * @return the collected values; empty (never null) if parent is null or is
     *         not an array
     */
    private List<String> selectField(JsonNode parent, String textField) {
        List<String> results = new ArrayList<String>();
        if (parent == null || !parent.isArray()) {
            return results;
        }
        for (JsonNode child : parent) {
            JsonNode value = child.get(textField);
            // Skip absent values and explicit JSON nulls so that no placeholder
            // text ends up in the index.
            if (value == null || value.isNull()) {
                continue;
            }
            String text = value.asText();
            if (!StringUtils.isEmpty(text)) {
                results.add(text);
            }
        }
        return results;
    }

    /**
     * Extracts a year range from free text. The following forms are tried in
     * order, and the first that applies wins:
     * <ol>
     * <li>a string consisting only of digits is used as both start and end year</li>
     * <li><code>YYYY-YYYY</code> (any single non-digit separator)</li>
     * <li><code>YYYY-YY</code>, where the end year takes its first two digits
     * from the start year</li>
     * <li>the first run of four consecutive digits, used as both start and end
     * year</li>
     * </ol>
     * 
     * @param range
     *            the text to parse, may be null
     * @return the parsed range, or null if the text is null or empty, contains
     *         no recognisable year, or is an all-digit value too large for an
     *         int
     */
    public static YearRange parseYearRange(String range) {

        if (StringUtils.isEmpty(range)) {
            return null;
        }

        if (StringUtils.isNumeric(range)) {
            try {
                return new YearRange(range, range);
            } catch (NumberFormatException ex) {
                // All digits, but too many of them to fit an int: not a usable year.
                return null;
            }
        }

        // Look for YYYY-YYYY
        Matcher m = YEAR_RANGE_PATTERN.matcher(range);
        if (m.find()) {
            return new YearRange(m.group(1), m.group(2));
        }

        // Look for YYYY-YY
        // NOTE(review): a value such as "1999-12" is read as the range 1999 to
        // 1912, never as a year and month - confirm this is intended.
        m = ABBREV_RANGE_PATTERN.matcher(range);
        if (m.find()) {
            String start = m.group(1);
            return new YearRange(start, start.substring(0, 2) + m.group(2));
        }

        // Look for any 4 consecutive digits!
        m = SINGLE_YEAR_PATTERN.matcher(range);
        if (m.find()) {
            return new YearRange(m.group(1), m.group(1));
        }

        return null;
    }

    /**
     * Simple holder for a start year and an end year.
     */
    public static class YearRange {

        /** Creates a range with both years set to zero. */
        public YearRange() {
        }

        /**
         * @param start
         *            the start year
         * @param end
         *            the end year
         */
        public YearRange(int start, int end) {
            startYear = start;
            endYear = end;
        }

        /**
         * @param start
         *            the start year as a decimal string
         * @param end
         *            the end year as a decimal string
         * @throws NumberFormatException
         *             if either value cannot be parsed as an int
         */
        public YearRange(String start, String end) {
            startYear = Integer.parseInt(start);
            endYear = Integer.parseInt(end);
        }

        public int startYear;
        public int endYear;
    }

}