fr.paris.lutece.plugins.document.modules.solr.indexer.SolrDocIndexer.java Source code

Introduction

Here is the source code for fr.paris.lutece.plugins.document.modules.solr.indexer.SolrDocIndexer.java
Source

/*
 * Copyright (c) 2002-2014, Mairie de Paris
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice
 *     and the following disclaimer.
 *
 *  2. Redistributions in binary form must reproduce the above copyright notice
 *     and the following disclaimer in the documentation and/or other materials
 *     provided with the distribution.
 *
 *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
 *     contributors may be used to endorse or promote products derived from
 *     this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * License 1.0
 */
package fr.paris.lutece.plugins.document.modules.solr.indexer;

import fr.paris.lutece.plugins.document.business.Document;
import fr.paris.lutece.plugins.document.business.DocumentHome;
import fr.paris.lutece.plugins.document.business.DocumentType;
import fr.paris.lutece.plugins.document.business.DocumentTypeHome;
import fr.paris.lutece.plugins.document.business.attributes.DocumentAttribute;
import fr.paris.lutece.plugins.document.business.attributes.DocumentAttributeHome;
import fr.paris.lutece.plugins.document.business.category.Category;
import fr.paris.lutece.plugins.document.business.portlet.DocumentListPortletHome;
import fr.paris.lutece.plugins.document.business.portlet.DocumentPortletHome;
import fr.paris.lutece.plugins.document.service.publishing.PublishingService;
import fr.paris.lutece.plugins.document.utils.DocumentIndexerUtils;
import fr.paris.lutece.plugins.leaflet.business.GeolocItem;
import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexer;
import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexerFactory;
import fr.paris.lutece.plugins.search.solr.business.field.Field;
import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexer;
import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexerService;
import fr.paris.lutece.plugins.search.solr.indexer.SolrItem;
import fr.paris.lutece.plugins.search.solr.util.SolrConstants;
import fr.paris.lutece.portal.business.page.Page;
import fr.paris.lutece.portal.business.page.PageHome;
import fr.paris.lutece.portal.business.portlet.Portlet;
import fr.paris.lutece.portal.business.portlet.PortletHome;
import fr.paris.lutece.portal.service.plugin.PluginService;
import fr.paris.lutece.portal.service.spring.SpringContextService;
import fr.paris.lutece.portal.service.util.AppException;
import fr.paris.lutece.portal.service.util.AppLogService;
import fr.paris.lutece.portal.service.util.AppPropertiesService;
import fr.paris.lutece.portal.web.admin.PluginAdminPageJspBean;
import fr.paris.lutece.util.url.UrlItem;

import org.apache.commons.lang.StringUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * Solr indexer for the documents of the Lutece document plugin.
 *
 */
public class SolrDocIndexer implements SolrIndexer {
    /** Name of this indexer's bean (presumably its Spring bean id — confirm against the plugin context file). */
    public static final String BEAN_NAME = "document-solr.solrDocIndexer";
    // Not used
    // private static final String PARAMETER_SOLR_DOCUMENT_ID = "solr_document_id";
    // URL parameter carrying the portlet id in the document URLs built by this class
    private static final String PARAMETER_PORTLET_ID = "portlet_id";
    // Property read by isEnable() to switch this indexer on or off
    private static final String PROPERTY_INDEXER_ENABLE = "solr.indexer.document.enable";
    // Property holding the character limit applied to the extracted document content
    private static final String PROPERTY_DOCUMENT_MAX_CHARS = "document-solr.indexer.document.characters.limit";
    // Properties returned by getName(), getDescription() and getVersion()
    private static final String PROPERTY_NAME = "document-solr.indexer.name";
    private static final String PROPERTY_DESCRIPTION = "document-solr.indexer.description";
    private static final String PROPERTY_VERSION = "document-solr.indexer.version";
    // URL parameter carrying the document id in the URLs built by this class
    private static final String PARAMETER_DOCUMENT_ID = "document_id";
    // URL parameter carrying the attribute id, used when a binary attribute is indexed by URL
    private static final String PARAMETER_ATTRIBUTE_ID = "id_attribute";
    // Shared, mutable list filled by the constructor and returned as-is by getResourcesName()
    private static final List<String> LIST_RESSOURCES_NAME = new ArrayList<String>();
    // Suffix appended to a resource id by getResourceUid()
    private static final String SHORT_NAME = "doc";
    // Log message prefix; the failing document id is appended by the callers
    private static final String DOC_INDEXATION_ERROR = "[SolrDocIndexer] An error occured during the indexation of the document number ";

    // Attribute type codes compared (case-insensitively) with DocumentAttribute.getCodeAttributeType()
    private static final String PARAMETER_TYPE_NUMERICTEXT = "numerictext";
    private static final String PARAMETER_TYPE_GEOLOC = "geoloc";

    /**
     * Maximum number of characters kept when extracting the text of a
     * document, or {@code null} when the property is missing, blank or not a
     * valid integer. A {@code null} value makes the item builder fall back to
     * the no-argument {@code BodyContentHandler}.
     */
    private static final Integer PARAMETER_DOCUMENT_MAX_CHARS = readDocumentMaxChars();

    /**
     * Reads the character limit from the application properties.
     * <p>
     * Parsing is done defensively: an unset or malformed property must not
     * make the static initialisation of this class fail.
     *
     * @return the configured limit, or {@code null} if none is usable
     */
    private static Integer readDocumentMaxChars() {
        String strLimit = AppPropertiesService.getProperty(PROPERTY_DOCUMENT_MAX_CHARS);

        if (StringUtils.isBlank(strLimit)) {
            return null;
        }

        try {
            return Integer.valueOf(strLimit.trim());
        } catch (NumberFormatException e) {
            AppLogService.error("[SolrDocIndexer] Invalid value for property " + PROPERTY_DOCUMENT_MAX_CHARS
                    + " : " + strLimit, e);

            return null;
        }
    }

    /**
     * Creates a new SolrDocIndexer and registers the document resource type
     * in the shared list of resource names.
     */
    public SolrDocIndexer() {
        // The list is static: register the type only once, otherwise every new
        // instance would append a duplicate entry.
        if (!LIST_RESSOURCES_NAME.contains(DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE)) {
            LIST_RESSOURCES_NAME.add(DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE);
        }
    }

    /**
     * Tells whether this indexer is enabled.
     *
     * @return {@code true} when the enable property is set to "true"
     *         (case-insensitive), {@code false} otherwise
     */
    @Override
    public boolean isEnable() {
        String strEnabled = AppPropertiesService.getProperty(PROPERTY_INDEXER_ENABLE);

        // parseBoolean is true only for a non-null, case-insensitive "true"
        return Boolean.parseBoolean(strEnabled);
    }

    /**
     * Indexes every document published in a document-list portlet or in a
     * document portlet.
     * <p>
     * A document published in several portlets is written once, with the
     * first portlet in which it is found. A failure on one document is
     * logged and reported in the returned list; the other documents are
     * still processed.
     *
     * @return the error messages collected during the indexation (empty if none)
     */
    @Override
    public List<String> indexDocuments() {
        List<String> lstErrors = new ArrayList<String>();
        // Ids of the documents already written; a Set keeps the duplicate check cheap
        Set<Integer> setIndexedIds = new HashSet<Integer>();

        // Work on a copy: the list returned by PortletHome is not ours to extend
        List<Portlet> portletList = new ArrayList<Portlet>(
                PortletHome.findByType(DocumentListPortletHome.getInstance().getPortletTypeId()));
        portletList.addAll(PortletHome.findByType(DocumentPortletHome.getInstance().getPortletTypeId()));

        for (Portlet portlet : portletList) {
            for (Document d : PublishingService.getInstance().getPublishedDocumentsByPortletId(portlet.getId())) {
                int nIdDocument = d.getId();

                // Already indexed through another portlet: skip before reloading it
                if (setIndexedIds.contains(nIdDocument)) {
                    continue;
                }

                try {
                    // Reload the document by id before building the item
                    // (presumably the published list only carries partial data — confirm)
                    Document document = DocumentHome.findByPrimaryKey(nIdDocument);

                    if (document != null) {
                        // Generates the item to index
                        SolrItem item = getItem(portlet, document);

                        if (item != null) {
                            SolrIndexerService.write(item);
                        }

                        // Only marked once processed, so a failed document is
                        // retried if it appears in another portlet
                        setIndexedIds.add(nIdDocument);
                    }
                } catch (Exception e) {
                    lstErrors.add(SolrIndexerService.buildErrorMessage(e));
                    AppLogService.error(DOC_INDEXATION_ERROR + nIdDocument, e);
                }
            }
        }

        return lstErrors;
    }

    /**
     * Indexes a list of documents for the given portlet.
     * <p>
     * Only documents that exist and whose published status equals {@code 0}
     * are written to the index. The first failure is logged and aborts the
     * whole call with an exception.
     *
     * @param portlet the portlet the documents belong to, passed to the item builder
     * @param listIdDocument the ids of the documents to index; {@code null}
     *            and {@code null} entries are ignored
     * @return the list of error messages; empty on normal completion, because
     *         any failure is rethrown
     * @throws Exception if the indexation of a document fails; the original
     *             failure is attached as the cause
     */
    public List<String> indexListDocuments(Portlet portlet, List<Integer> listIdDocument) throws Exception {
        List<String> lstErrors = new ArrayList<String>();

        if (listIdDocument == null) {
            return lstErrors;
        }

        for (Integer nIdDocument : listIdDocument) {
            if (nIdDocument == null) {
                continue;
            }

            try {
                Document document = DocumentHome.findByPrimaryKey(nIdDocument);

                // Generates the item to index
                if (document != null && document.getPublishedStatus() == 0) {
                    SolrItem item = getItem(portlet, document);

                    if (item != null) {
                        SolrIndexerService.write(item);
                    }
                }
            } catch (Exception e) {
                lstErrors.add(SolrIndexerService.buildErrorMessage(e));
                AppLogService.error(DOC_INDEXATION_ERROR + nIdDocument, e);

                // Keep the message and the cause so the caller can diagnose the failure
                throw new Exception(DOC_INDEXATION_ERROR + nIdDocument, e);
            }
        }

        return lstErrors;
    }

    /**
     * Builds the Solr item describing a document.
     * <p>
     * The item carries the document uid, dates, type, summary, title,
     * categories, XML content, URL and the text extracted from its searchable
     * attributes. The role is always set to "none".
     *
     * @param portlet The portlet the document is published in; may be
     *            {@code null}, in which case no portlet id is added to the
     *            item nor to its URL
     * @param document The document
     * @return The item
     * @throws IOException if the content cannot be read by the parser
     */
    private SolrItem getItem(Portlet portlet, Document document) throws IOException {
        // the item
        SolrItem item = new SolrItem();
        item.setUid(getResourceUid(String.valueOf(document.getId()), DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE));
        item.setDate(document.getDateModification());
        item.setType(document.getType());
        item.setSummary(document.getSummary());
        item.setTitle(document.getTitle());
        item.setSite(SolrIndexerService.getWebAppName());
        item.setRole("none");

        if (portlet != null) {
            item.setDocPortletId(document.getId() + SolrConstants.CONSTANT_AND + portlet.getId());
        }

        item.setXmlContent(document.getXmlValidatedContent());

        // URL of the document; the portlet parameter is only added when a portlet is known
        UrlItem url = new UrlItem(SolrIndexerService.getBaseUrl());
        url.addParameter(PARAMETER_DOCUMENT_ID, document.getId());

        if (portlet != null) {
            url.addParameter(PARAMETER_PORTLET_ID, portlet.getId());
        }

        item.setUrl(url.getUrl());

        // Date Hierarchy, formatted as "year/month/day/" (Calendar months are 0-based, hence the + 1)
        GregorianCalendar calendar = new GregorianCalendar();
        calendar.setTime(document.getDateModification());
        item.setHieDate(calendar.get(GregorianCalendar.YEAR) + "/" + (calendar.get(GregorianCalendar.MONTH) + 1)
                + "/" + calendar.get(GregorianCalendar.DAY_OF_MONTH) + "/");

        List<String> categorie = new ArrayList<String>();

        for (Category cat : document.getCategories()) {
            categorie.add(cat.getName());
        }

        item.setCategorie(categorie);

        // The content: the raw text is run through the HTML parser to strip the markup
        String strContentToIndex = getContentToIndex(document, item);
        ContentHandler handler;

        // NOTE(review): per the Tika documentation a bounded BodyContentHandler raises a
        // SAXException once the limit is reached, which ends up as an AppException below
        // instead of a truncated content — confirm this is the intended behaviour.
        if (PARAMETER_DOCUMENT_MAX_CHARS != null) {
            handler = new BodyContentHandler(PARAMETER_DOCUMENT_MAX_CHARS);
        } else {
            handler = new BodyContentHandler();
        }

        Metadata metadata = new Metadata();

        try {
            // Explicit charset: the result must not depend on the platform default encoding
            new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes("UTF-8")), handler,
                    metadata, new ParseContext());
        } catch (SAXException e) {
            throw new AppException("Error during document parsing.", e);
        } catch (TikaException e) {
            throw new AppException("Error during document parsing.", e);
        }

        item.setContent(handler.toString());

        return item;
    }

    /**
     * Gets the content to index.
     * <p>
     * Concatenates the title, the value of every searchable attribute and the
     * XML metadata of the document. As a side effect, a dynamic field is
     * added to the item for each searchable text attribute, and for each
     * binary attribute that has no file indexer.
     *
     * @param document The document
     * @param item The Solr item, enriched with dynamic fields
     * @return The content
     */
    private static String getContentToIndex(Document document, SolrItem item) {
        StringBuilder sbContentToIndex = new StringBuilder();

        // Null values are skipped everywhere: appending them would index the word "null"
        if (document.getTitle() != null) {
            sbContentToIndex.append(document.getTitle());
        }

        sbContentToIndex.append(" ");

        for (DocumentAttribute attribute : document.getAttributes()) {
            if (!attribute.isSearchable()) {
                continue;
            }

            if (attribute.isBinary()) {
                appendBinaryAttribute(sbContentToIndex, document, attribute, item);
            } else {
                appendTextAttribute(sbContentToIndex, document, attribute, item);
            }
        }

        // Index Metadata
        if (document.getXmlMetadata() != null) {
            sbContentToIndex.append(document.getXmlMetadata());
        }

        return sbContentToIndex.toString();
    }

    /**
     * Appends the text of a non-binary attribute to the content and adds the
     * matching dynamic field to the item.
     *
     * @param sbContentToIndex The content being built
     * @param document The document owning the attribute
     * @param attribute The non-binary attribute
     * @param item The Solr item
     */
    private static void appendTextAttribute(StringBuilder sbContentToIndex, Document document,
            DocumentAttribute attribute, SolrItem item) {
        String strCodeType = attribute.getCodeAttributeType();
        String strTextValue = attribute.getTextValue();
        boolean bGeoloc = PARAMETER_TYPE_GEOLOC.equalsIgnoreCase(strCodeType);

        if (bGeoloc) {
            // Geojson attribute, put the address as text if it exists
            GeolocItem geolocItem = null;

            try {
                geolocItem = GeolocItem.fromJSON(strTextValue);
            } catch (IOException e) {
                // Pass the exception itself so the stack trace is logged
                AppLogService.error("SolrDocumentIndexer, error parsing JSON", e);
            }

            if (geolocItem != null && geolocItem.getAddress() != null) {
                sbContentToIndex.append(geolocItem.getAddress());
            }
        } else if (strTextValue != null) {
            // Text attributes
            sbContentToIndex.append(strTextValue);
        }

        sbContentToIndex.append(" ");

        // Dynamic Field
        if (PARAMETER_TYPE_NUMERICTEXT.equalsIgnoreCase(strCodeType)) {
            item.addDynamicField(attribute.getCode(), parseNumericValue(strTextValue));
        } else if (bGeoloc) {
            item.addDynamicFieldGeoloc(attribute.getCode(), strTextValue, document.getCodeDocumentType());
        } else {
            item.addDynamicField(attribute.getCode(), strTextValue);
        }
    }

    /**
     * Appends the text extracted from a binary attribute, or, when no file
     * indexer handles its content type, adds a dynamic field holding a URL to
     * the attribute.
     *
     * @param sbContentToIndex The content being built
     * @param document The document owning the attribute
     * @param attribute The binary attribute
     * @param item The Solr item
     */
    private static void appendBinaryAttribute(StringBuilder sbContentToIndex, Document document,
            DocumentAttribute attribute, SolrItem item) {
        // Gets indexer depending on the ContentType (ie: "application/pdf" should use a PDF indexer)
        IFileIndexerFactory factoryIndexer = (IFileIndexerFactory) SpringContextService
                .getBean(IFileIndexerFactory.BEAN_FILE_INDEXER_FACTORY);
        IFileIndexer indexer = factoryIndexer.getIndexer(attribute.getValueContentType());

        if (indexer != null) {
            byte[] binaryValue = attribute.getBinaryValue();

            // Nothing to extract when the attribute has no binary value
            if (binaryValue != null) {
                try {
                    ByteArrayInputStream bais = new ByteArrayInputStream(binaryValue);
                    String strBinaryContent = indexer.getContentToIndex(bais);

                    if (strBinaryContent != null) {
                        sbContentToIndex.append(strBinaryContent);
                    }

                    sbContentToIndex.append(" ");
                    bais.close();
                } catch (IOException e) {
                    AppLogService.error(e.getMessage(), e);
                }
            }
        } else {
            AppLogService.debug("No indexer found. Url to this data will be given instead");

            String strName = attribute.getCode() + "_" + attribute.getCodeAttributeType() + "_url";
            UrlItem url = new UrlItem(SolrIndexerService.getBaseUrl());
            url.addParameter(PARAMETER_DOCUMENT_ID, document.getId());
            url.addParameter(PARAMETER_ATTRIBUTE_ID, attribute.getId());
            item.addDynamicField(strName, url.getUrl());
        }
    }

    /**
     * Converts the text of a numeric attribute to a number.
     *
     * @param strValue The raw attribute value, may be {@code null}
     * @return The parsed value, or 0 when the value is null, blank, not made
     *         of digits only, or too large for a long
     */
    private static Long parseNumericValue(String strValue) {
        if (strValue == null) {
            return Long.valueOf(0L);
        }

        String strTrimmed = strValue.trim();

        // The emptiness test must be done after trimming: isNumeric( "" ) is true in commons-lang 2
        if (strTrimmed.length() == 0 || !StringUtils.isNumeric(strTrimmed)) {
            return Long.valueOf(0L);
        }

        try {
            return Long.valueOf(strTrimmed);
        } catch (NumberFormatException e) {
            // Digits only, but out of the long range: same fallback as a non numeric value
            return Long.valueOf(0L);
        }
    }

    //GETTERS & SETTERS
    /**
     * Gives the name of this indexer, as configured in the application
     * properties.
     *
     * @return the indexer name
     */
    @Override
    public String getName() {
        String strName = AppPropertiesService.getProperty(PROPERTY_NAME);

        return strName;
    }

    /**
     * Gives the version of this indexer, as configured in the application
     * properties.
     *
     * @return the indexer version
     */
    @Override
    public String getVersion() {
        String strVersion = AppPropertiesService.getProperty(PROPERTY_VERSION);

        return strVersion;
    }

    /**
     * Gives the description of this indexer, as configured in the application
     * properties.
     *
     * @return the indexer description
     */
    @Override
    public String getDescription() {
        String strDescription = AppPropertiesService.getProperty(PROPERTY_DESCRIPTION);

        return strDescription;
    }

    /**
     * Lists one facet-enabled field per attribute of every document type.
     * <p>
     * Each field is named after the attribute code followed by the dynamic
     * text field suffix, and labelled with the attribute name.
     *
     * @return the additional fields
     */
    @Override
    public List<Field> getAdditionalFields() {
        Iterator<DocumentType> itTypes = DocumentTypeHome.findAll().iterator();
        List<Field> lstAdditionalFields = new ArrayList<Field>();

        while (itTypes.hasNext()) {
            DocumentType documentType = itTypes.next();

            // Loads the attributes of the type before reading them
            DocumentAttributeHome.setDocumentTypeAttributes(documentType);

            for (DocumentAttribute attr : documentType.getAttributes()) {
                String strFieldName = attr.getCode() + SolrItem.DYNAMIC_TEXT_FIELD_SUFFIX;

                Field facetField = new Field();
                facetField.setEnableFacet(true);
                facetField.setDescription(attr.getDescription());
                facetField.setIsFacet(true);
                facetField.setName(strFieldName);
                facetField.setLabel(attr.getName());

                lstAdditionalFields.add(facetField);
            }
        }

        return lstAdditionalFields;
    }

    /**
     * Builds the Solr item for a document published in a given portlet, with
     * the following fields: url, document/portlet id, date, uid, content,
     * title, type, role and site.
     *
     * @param document the document to index
     * @param strUrl the url of the document
     * @param strRole the lutece role of the page associated to the document
     * @param strPortletDocumentId the document id concatenated to the portlet
     *            id with a & in the middle
     * @return the built item
     * @throws IOException if the content cannot be read by the parser
     * @throws InterruptedException kept in the signature; nothing visible in
     *             this method throws it
     */
    private SolrItem getDocument(Document document, String strUrl, String strRole, String strPortletDocumentId)
            throws IOException, InterruptedException {
        // make a new, empty item
        SolrItem item = new SolrItem();

        item.setUrl(strUrl);
        item.setDocPortletId(strPortletDocumentId);
        item.setDate(document.getDateModification());

        // The uid identifies the resource in the index
        String strIdDocument = String.valueOf(document.getId());
        item.setUid(getResourceUid(strIdDocument, DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE));

        // The raw text is run through the HTML parser to strip the markup
        String strContentToIndex = getContentToIndex(document, item);
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try {
            // Explicit charset: the result must not depend on the platform default encoding
            new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes("UTF-8")), handler,
                    metadata, new ParseContext());
        } catch (SAXException e) {
            throw new AppException("Error during document parsing.", e);
        } catch (TikaException e) {
            throw new AppException("Error during document parsing.", e);
        }

        // The tag-stripped content
        item.setContent(handler.toString());

        item.setTitle(document.getTitle());
        item.setType(document.getType());
        item.setRole(strRole);
        item.setSite(SolrIndexerService.getWebAppName());

        return item;
    }

    /**
     * Builds one Solr item per portlet in which the given document is
     * published.
     *
     * @param strIdDocument the document id, as a decimal integer
     * @return the items, one per portlet; empty when the document does not
     *         exist or is not published in any portlet
     * @throws NumberFormatException if the id is not a valid integer
     * @throws RuntimeException wrapping any failure raised while building an item
     */
    @Override
    public List<SolrItem> getDocuments(String strIdDocument) {
        List<SolrItem> lstItems = new ArrayList<SolrItem>();

        int nIdDocument = Integer.parseInt(strIdDocument);
        Document document = DocumentHome.findByPrimaryKey(nIdDocument);

        // Unknown document: nothing to index (the other lookups of this class also check for null)
        if (document == null) {
            return lstItems;
        }

        for (Portlet portlet : PublishingService.getInstance()
                .getPortletsByDocumentId(Integer.toString(nIdDocument))) {
            try {
                UrlItem url = new UrlItem(SolrIndexerService.getBaseUrl());
                url.addParameter(PARAMETER_DOCUMENT_ID, nIdDocument);
                url.addParameter(PARAMETER_PORTLET_ID, portlet.getId());

                String strPortletDocumentId = nIdDocument + "&" + portlet.getId();

                // The role comes from the page hosting the portlet
                Page page = PageHome.getPage(portlet.getPageId());

                lstItems.add(getDocument(document, url.getUrl(), page.getRole(), strPortletDocumentId));
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        return lstItems;
    }

    /**
     * Gives the names of the resources handled by this indexer.
     *
     * @return the shared list of resource names (the list itself, not a copy)
     */
    @Override
    public List<String> getResourcesName() {
        final List<String> lstResourcesName = LIST_RESSOURCES_NAME;

        return lstResourcesName;
    }

    /**
     * Builds the uid of a resource: the resource id, followed by the
     * underscore constant and the short name of this indexer.
     *
     * @param strResourceId the resource id
     * @param strResourceType the resource type (not used to build the uid)
     * @return the uid
     */
    @Override
    public String getResourceUid(String strResourceId, String strResourceType) {
        return new StringBuilder(strResourceId)
                .append(SolrConstants.CONSTANT_UNDERSCORE)
                .append(SHORT_NAME)
                .toString();
    }
}