Java tutorial
/* * Copyright (c) 2002-2014, Mairie de Paris * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright notice * and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice * and the following disclaimer in the documentation and/or other materials * provided with the distribution. * * 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * License 1.0 */ package fr.paris.lutece.plugins.document.service.search; import fr.paris.lutece.plugins.document.business.Document; import fr.paris.lutece.plugins.document.business.DocumentHome; import fr.paris.lutece.plugins.document.business.DocumentTypeHome; import fr.paris.lutece.plugins.document.business.attributes.DocumentAttribute; import fr.paris.lutece.plugins.document.business.portlet.DocumentListPortletHome; import fr.paris.lutece.plugins.document.service.publishing.PublishingService; import fr.paris.lutece.plugins.document.utils.IntegerUtils; import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexer; import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexerFactory; import fr.paris.lutece.portal.business.page.Page; import fr.paris.lutece.portal.business.page.PageHome; import fr.paris.lutece.portal.business.portlet.Portlet; import fr.paris.lutece.portal.business.portlet.PortletHome; import fr.paris.lutece.portal.service.search.IndexationService; import fr.paris.lutece.portal.service.search.SearchIndexer; import fr.paris.lutece.portal.service.search.SearchItem; import fr.paris.lutece.portal.service.spring.SpringContextService; import fr.paris.lutece.portal.service.util.AppException; import fr.paris.lutece.portal.service.util.AppLogService; import fr.paris.lutece.portal.service.util.AppPropertiesService; import fr.paris.lutece.util.ReferenceItem; import fr.paris.lutece.util.url.UrlItem; import org.apache.commons.lang.StringUtils; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.sax.BodyContentHandler; import 
org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.ByteArrayInputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;


/**
 * Document Indexer.
 *
 * Search indexer that builds one Lucene document per (document, portlet)
 * pair for the documents published in document list portlets.
 */
public class DocumentIndexer implements SearchIndexer {
    public static final String INDEXER_NAME = "DocumentIndexer";
    public static final String SHORT_NAME = "dcm";
    private static final String INDEXER_DESCRIPTION = "Indexer service for documents";
    private static final String INDEXER_VERSION = "1.0.0";
    private static final String PROPERTY_PAGE_BASE_URL = "document.documentIndexer.baseUrl";
    private static final String PROPERTY_INDEXER_ENABLE = "document.documentIndexer.enable";
    private static final String PARAMETER_DOCUMENT_ID = "document_id";
    private static final String PARAMETER_PORTLET_ID = "portlet_id";
    private static final String JSP_PAGE_ADVANCED_SEARCH = "jsp/site/Portal.jsp?page=advanced_search";

    /**
     * Indexes every document published in every document list portlet.
     * A failure while building one Lucene document is reported through
     * {@link IndexationService#error} and does not stop the indexing of the
     * remaining documents.
     *
     * @throws java.io.IOException i/o exception
     * @throws java.lang.InterruptedException interrupted exception
     */
    @Override
    public void indexDocuments() throws IOException, InterruptedException {
        String strBaseUrl = AppPropertiesService.getProperty(PROPERTY_PAGE_BASE_URL);

        for (Portlet portlet : PortletHome.findByType(DocumentListPortletHome.getInstance().getPortletTypeId())) {
            Page page = PageHome.getPage(portlet.getPageId());

            for (Document d : PublishingService.getInstance().getPublishedDocumentsByPortletId(portlet.getId())) {
                // Reload the full object to get all its searchable attributes
                Document document = DocumentHome.findByPrimaryKey(d.getId());

                if (document == null) {
                    // The published reference no longer resolves to a document:
                    // skip it instead of aborting the whole indexing run with a NullPointerException.
                    AppLogService.error("Document ID : " + d.getId() + " - Portlet ID : " + portlet.getId()
                        + " - document not found, skipped by " + INDEXER_NAME);

                    continue;
                }

                org.apache.lucene.document.Document doc = null;

                try {
                    doc = getDocument(document, buildDocumentUrl(strBaseUrl, document.getId(), portlet.getId()),
                            page.getRole(), buildPortletDocumentId(document.getId(), portlet.getId()));
                } catch (Exception e) {
                    String strMessage = "Document ID : " + document.getId() + " - Portlet ID : " + portlet.getId();
                    IndexationService.error(this, e, strMessage);
                }

                if (doc != null) {
                    IndexationService.write(doc);
                }
            }
        }
    }

    /**
     * Returns the Lucene documents built for a given document id, one per
     * portlet in which the document is published.
     *
     * @param strIdDocument the document id
     * @return the Lucene documents; an empty list if the document cannot be found
     * @throws IOException i/o exception
     * @throws InterruptedException interrupted exception
     */
    @Override
    public List<org.apache.lucene.document.Document> getDocuments(String strIdDocument)
        throws IOException, InterruptedException {
        List<org.apache.lucene.document.Document> listDocs = new ArrayList<org.apache.lucene.document.Document>();

        int nIdDocument = IntegerUtils.convert(strIdDocument);
        Document document = DocumentHome.findByPrimaryKey(nIdDocument);

        if (document == null) {
            // Nothing to index for an unknown id; getDocument would otherwise fail on the null document.
            return listDocs;
        }

        String strBaseUrl = AppPropertiesService.getProperty(PROPERTY_PAGE_BASE_URL);

        for (Portlet portlet : PublishingService.getInstance().getPortletsByDocumentId(strIdDocument)) {
            Page page = PageHome.getPage(portlet.getPageId());

            listDocs.add(getDocument(document, buildDocumentUrl(strBaseUrl, nIdDocument, portlet.getId()),
                    page.getRole(), buildPortletDocumentId(nIdDocument, portlet.getId())));
        }

        return listDocs;
    }

    /**
     * Returns the indexer service name
     * @return the indexer service name
     */
    @Override
    public String getName() {
        return INDEXER_NAME;
    }

    /**
     * Returns the indexer service version
     * @return The indexer service version
     */
    @Override
    public String getVersion() {
        return INDEXER_VERSION;
    }

    /**
     * Returns the indexer service description
     * @return The indexer service description
     */
    @Override
    public String getDescription() {
        return INDEXER_DESCRIPTION;
    }

    /**
     * Tells whether the service is enabled or not. The value is read from
     * the {@code document.documentIndexer.enable} property, which defaults
     * to "true".
     *
     * @return true if enabled, otherwise false
     */
    @Override
    public boolean isEnable() {
        String strEnable = AppPropertiesService.getProperty(PROPERTY_INDEXER_ENABLE, "true");

        // Case-insensitive comparison with "true"
        return Boolean.parseBoolean(strEnable);
    }

    /**
     * Builds the Lucene document used to index one document published in one
     * portlet. The following {@link SearchItem} fields are set: url,
     * document/portlet id, date, uid, contents, title, type, role, metadata
     * and summary.
     *
     * @param document the document to index
     * @param strUrl the url of the document
     * @param strRole the lutece role of the page associated to the document
     * @param strPortletDocumentId the document id concatenated to the portlet id
     *            with a &amp; in the middle
     * @return the built Document
     * @throws IOException The IO Exception
     * @throws InterruptedException The InterruptedException
     * @throws AppException if the document content cannot be parsed
     */
    public static org.apache.lucene.document.Document getDocument(Document document, String strUrl,
        String strRole, String strPortletDocumentId) throws IOException, InterruptedException {
        // make a new, empty document
        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

        // Field type shared by the identifier-like fields below: derived from
        // StringField.TYPE_STORED (value stored and indexed as a single token), with norms kept.
        FieldType ft = new FieldType(StringField.TYPE_STORED);
        ft.setOmitNorms(false);

        // Add the url of the document.
        doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));

        // Add the PortletDocumentId.
        doc.add(new Field(SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft));

        // Add the last modification date of the document, at day resolution.
        String strDate = DateTools.dateToString(document.getDateModification(), DateTools.Resolution.DAY);
        doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

        // Add the uid as a field, so that index can be incrementally maintained.
        String strIdDocument = String.valueOf(document.getId());
        doc.add(new Field(SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft));

        // Add the tag-stripped contents as a tokenized, indexed but not stored field.
        String strContent = extractText(getContentToIndex(document));
        doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));

        // Add the title as a separate Text field, so that it can be searched
        // separately.
        FieldType ft2 = new FieldType(TextField.TYPE_STORED);
        ft2.setOmitNorms(true);
        doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ft2));

        doc.add(new Field(SearchItem.FIELD_TYPE, document.getType(), ft));

        doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

        // Lucene field constructors reject null values: a document without
        // summary is indexed with an empty one instead of failing.
        String strSummary = StringUtils.defaultString(document.getSummary());

        // add metadata (mapped to summary)
        doc.add(new Field(SearchItem.FIELD_METADATA, strSummary, TextField.TYPE_NOT_STORED));
        doc.add(new StoredField(SearchItem.FIELD_SUMMARY, strSummary));

        // return the document
        return doc;
    }

    /**
     * Builds the url of a document displayed in a portlet.
     *
     * @param strBaseUrl the base url
     * @param nIdDocument the document id
     * @param nIdPortlet the portlet id
     * @return the url with the document and portlet id parameters
     */
    private static String buildDocumentUrl(String strBaseUrl, int nIdDocument, int nIdPortlet) {
        UrlItem url = new UrlItem(strBaseUrl);
        url.addParameter(PARAMETER_DOCUMENT_ID, nIdDocument);
        url.addParameter(PARAMETER_PORTLET_ID, nIdPortlet);

        return url.getUrl();
    }

    /**
     * Builds the identifier of a document within a portlet:
     * {@code <document id>_dcm&<portlet id>}.
     *
     * @param nIdDocument the document id
     * @param nIdPortlet the portlet id
     * @return the portlet document id
     */
    private static String buildPortletDocumentId(int nIdDocument, int nIdPortlet) {
        return nIdDocument + "_" + SHORT_NAME + "&" + nIdPortlet;
    }

    /**
     * Runs the HTML parser on the content and returns the text of its body.
     *
     * @param strContentToIndex the raw (HTML) content
     * @return the text collected by the body content handler
     * @throws IOException if the content cannot be read
     * @throws AppException if the content cannot be parsed
     */
    private static String extractText(String strContentToIndex) throws IOException {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try {
            // NOTE(review): getBytes() uses the platform default charset; left unchanged
            // because it interacts with the parser's own encoding detection - confirm.
            new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
        } catch (SAXException e) {
            // Keep the cause so that the parsing failure can be diagnosed
            throw new AppException("Error during document parsing.", e);
        } catch (TikaException e) {
            throw new AppException("Error during document parsing.", e);
        }

        // The content is read back from the handler because the parser has
        // replaced the encoded characters (e.g. &eacute;) by the corresponding
        // special characters (e.g. the accented letter itself).
        return handler.toString();
    }

    /**
     * Gets the content to index from the document: its title, the value of
     * every searchable attribute (text value, or text extracted from the
     * binary value when a file indexer exists for its content type) and the
     * XML metadata, separated by spaces.
     *
     * @param document the document to index
     * @return the content
     */
    private static String getContentToIndex(Document document) {
        StringBuilder sbContentToIndex = new StringBuilder();
        sbContentToIndex.append(document.getTitle());

        for (DocumentAttribute attribute : document.getAttributes()) {
            if (!attribute.isSearchable()) {
                continue;
            }

            if (!attribute.isBinary()) {
                // Text attributes; a null value must not be indexed as the word "null"
                sbContentToIndex.append(" ");
                sbContentToIndex.append(StringUtils.defaultString(attribute.getTextValue()));
            } else {
                // Binary file attribute
                // Gets indexer depending on the ContentType (ie: "application/pdf" should use a PDF indexer)
                IFileIndexerFactory factoryIndexer = (IFileIndexerFactory) SpringContextService
                        .getBean(IFileIndexerFactory.BEAN_FILE_INDEXER_FACTORY);
                IFileIndexer indexer = factoryIndexer.getIndexer(attribute.getValueContentType());
                byte[] binaryValue = attribute.getBinaryValue();

                // No indexer for this content type, or no file content: nothing to extract
                if ((indexer != null) && (binaryValue != null)) {
                    try {
                        ByteArrayInputStream bais = new ByteArrayInputStream(binaryValue);
                        sbContentToIndex.append(" ");
                        sbContentToIndex.append(indexer.getContentToIndex(bais));
                        bais.close();
                    } catch (IOException e) {
                        // Best effort: the file content is skipped, the rest of the document is still indexed
                        AppLogService.error(e.getMessage(), e);
                    }
                }
            }
        }

        // Index Metadata
        sbContentToIndex.append(" ");
        sbContentToIndex.append(StringUtils.defaultString(document.getXmlMetadata()));

        return sbContentToIndex.toString();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public List<String> getListType() {
        List<String> typeList = new ArrayList<String>();

        // One entry per document type name
        for (ReferenceItem item : DocumentTypeHome.getDocumentTypesList()) {
            typeList.add(item.getName());
        }

        return typeList;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public String getSpecificSearchAppUrl() {
        return JSP_PAGE_ADVANCED_SEARCH;
    }
}