fr.paris.lutece.plugins.dila.modules.solr.utils.parsers.DilaSolrPublicParser.java Source code

Java tutorial

Introduction

Here is the source code for fr.paris.lutece.plugins.dila.modules.solr.utils.parsers.DilaSolrPublicParser.java

Source

/*
 * Copyright (c) 2002-2014, Mairie de Paris
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice
 *     and the following disclaimer.
 *
 *  2. Redistributions in binary form must reproduce the above copyright notice
 *     and the following disclaimer in the documentation and/or other materials
 *     provided with the distribution.
 *
 *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
 *     contributors may be used to endorse or promote products derived from
 *     this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * License 1.0
 */
package fr.paris.lutece.plugins.dila.modules.solr.utils.parsers;

import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexerService;
import fr.paris.lutece.plugins.search.solr.indexer.SolrItem;
import fr.paris.lutece.portal.service.content.XPageAppService;
import fr.paris.lutece.portal.service.util.AppLogService;
import fr.paris.lutece.portal.service.util.AppPropertiesService;
import fr.paris.lutece.util.url.UrlItem;

import org.apache.commons.lang.StringUtils;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.File;
import java.io.IOException;

import java.text.ParseException;
import java.text.SimpleDateFormat;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Locale;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

/**
 * Parser for public cards (dila)
 */
public class DilaSolrPublicParser extends DefaultHandler {
    // -------------
    // - Constants -
    // -------------
    // Plugin name
    private static final String PROPERTY_PLUGIN_NAME = "dila.plugin.name";

    // Index keys
    private static final String PROPERTY_INDEXING_XML_BASE_VAR = "archive.dir.path.extract.final";
    private static final String PROPERTY_INDEXING_FRAGMENT = "dila.indexing.";
    private static final String PROPERTY_LIST_INDEX_KEYS_FRAGMENT = "listIndexKeys";

    // XPath comparisons
    private static final String PROPERTY_XPATH_CARD = "dila.parser.xpath.public.card";
    private static final String PROPERTY_XPATH_DATE = "dila.parser.xpath.public.date";
    private static final String PROPERTY_XPATH_TITLE = "dila.parser.xpath.public.title";
    private static final String PROPERTY_XPATH_THEME = "dila.parser.xpath.public.theme";
    private static final String PROPERTY_XPATH_AUDIENCE = "dila.parser.xpath.public.audience";
    private static final String PROPERTY_ATTRIBUTE_ID = "dila.parser.xpath.public.attribute.id";

    // Index type
    private static final String PROPERTY_INDEXING_TYPE = "dila-solr.indexing.publicType";

    // Site name
    private static final String PROPERTY_SITE = "lutece.name";

    // Paths contents
    private static final String PROPERTY_PATH_ID = "dila.parser.path.id";
    private static final String PROPERTY_PATH_CATEGORY = "dila.parser.path.category";

    // Strings
    private static final String STRING_EMPTY = "";
    private static final String STRING_SLASH = "/";
    private static final String STRING_SPACE = " ";
    private static final String STRING_POINT = ".";

    // -------------
    // - Variables -
    // -------------
    // List of Solr items
    private List<SolrItem> _listSolrItems;

    // XPath
    private String _strXPath;

    // Contents
    private String _strId;
    private String _strDate;
    private String _strType;
    private String _strTitle;
    private String _strSite;
    private String _strProdUrl;
    private String _strTheme;
    private String _strAudience;
    private String _strContents;

    /**
     * Initializes and launches the parsing of the public cards (public
     * constructor)
     */
    public DilaSolrPublicParser() {
        // Gets the list of CDC index keys
        String strIndexKeys = AppPropertiesService
                .getProperty(PROPERTY_INDEXING_FRAGMENT + PROPERTY_LIST_INDEX_KEYS_FRAGMENT);

        // Initializes the Solr Item list
        _listSolrItems = new ArrayList<SolrItem>();

        // Initializes the indexing type
        _strType = AppPropertiesService.getProperty(PROPERTY_INDEXING_TYPE);

        // Initializes the site
        _strSite = AppPropertiesService.getProperty(PROPERTY_SITE);

        // Initializes the prod url
        _strProdUrl = SolrIndexerService.getBaseUrl();

        try {
            // Initializes the SAX parser
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser parser = factory.newSAXParser();

            // Splits the list of CDC index keys
            String[] splitKeys = strIndexKeys.split(",");

            for (int i = 0; i < splitKeys.length; i++) {
                // Gets the XML index file path
                String strXmlDirectory = AppPropertiesService
                        .getProperty(splitKeys[i] + STRING_POINT + PROPERTY_INDEXING_XML_BASE_VAR);
                File xmlPath = new File(strXmlDirectory);
                // Launches the parsing of all files in this directory
                parseAllPublicCards(xmlPath, parser);
            }
        } catch (ParserConfigurationException e) {
            AppLogService.error(e.getMessage(), e);
        } catch (SAXException e) {
            AppLogService.error(e.getMessage(), e);
        }
    }

    /**
     * Launches the parsing on each public card
     *
     * @param fileBasePath the base path
     * @param parser the SAX parser
     */
    private void parseAllPublicCards(File fileBasePath, SAXParser parser) {
        if (fileBasePath.isFile()) {
            // Launches the parsing of this public card (with the current handler)
            try {
                parser.parse(fileBasePath.getAbsolutePath(), this);
            } catch (SAXException e) {
                AppLogService.error(e.getMessage(), e);
            } catch (IOException e) {
                AppLogService.error(e.getMessage(), e);
            }
        } else {
            // Processes all the files of the current directory
            File[] files = fileBasePath.listFiles();

            for (File fileCurrent : files) {
                // Launches the parsing on each public card (recursive)
                parseAllPublicCards(fileCurrent, parser);
            }
        }
    }

    /**
     * Event received when starting the parsing operation
     *
     * @throws SAXException any SAX exception
     */
    public void startDocument() throws SAXException {
        // Initializes the XPATH
        _strXPath = STRING_EMPTY;

        // Initializes the contents
        _strId = STRING_EMPTY;
        _strDate = STRING_EMPTY;
        _strTitle = STRING_EMPTY;
        _strTheme = STRING_EMPTY;
        _strAudience = STRING_EMPTY;
        _strContents = STRING_EMPTY;
    }

    /**
     * Event received at the end of the parsing operation
     *
     * @throws SAXException any SAX exception
     */
    public void endDocument() throws SAXException {
        // Sets the ID 

        // Sets the full URL
        UrlItem url = new UrlItem(_strProdUrl);
        url.addParameter(XPageAppService.PARAM_XPAGE_APP, AppPropertiesService.getProperty(PROPERTY_PLUGIN_NAME));
        url.addParameter(AppPropertiesService.getProperty(PROPERTY_PATH_ID), _strId);
        url.addParameter(AppPropertiesService.getProperty(PROPERTY_PATH_CATEGORY), _strAudience);

        // Converts the date from "dd MMMMM yyyy" to "yyyyMMdd"
        Locale locale = Locale.FRENCH;
        Date dateUpdate = null;

        if (StringUtils.isNotEmpty(_strDate)) {
            try {
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd", locale);
                String strDate = _strDate.split(STRING_SPACE)[1];
                dateUpdate = dateFormat.parse(strDate);

                dateFormat.applyPattern("yyyyMMdd");
            } catch (ParseException e) {
                dateUpdate = null;
            }
        } else {
            dateUpdate = null;
        }

        if (StringUtils.isNotEmpty(_strId)) {
            // Creates a new lucene document
            SolrItem item = new SolrItem();

            item.setUrl(url.getUrl());
            item.setDate(dateUpdate);
            item.setUid(_strId);
            item.setContent(_strContents);
            item.setTitle(_strTitle);
            item.setType(_strType);
            item.setSite(_strSite);

            String[] categories = new String[] { _strAudience };
            item.setCategorie(Arrays.asList(categories));

            // Adds the new item to the list
            _listSolrItems.add(item);
        }
    }

    /**
     * Event received at the start of an element
     *
     * @param uri the Namespace URI
     * @param localName the local name
     * @param qName the qualified XML name
     * @param atts the attributes attached to the element
     *
     * @throws SAXException any SAX exception
     */
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
        // Updates the XPath
        _strXPath += (STRING_SLASH + qName);

        // Gets the URL (attribute)
        String strXPathCard = AppPropertiesService.getProperty(PROPERTY_XPATH_CARD);

        if ((_strXPath != null) && _strXPath.equals(strXPathCard)) {
            String strAttributeId = AppPropertiesService.getProperty(PROPERTY_ATTRIBUTE_ID);
            _strId = atts.getValue(strAttributeId);
        }
    }

    /**
     * Event received at the end of an element
     *
     * @param uri the Namespace URI
     * @param localName the local name
     * @param qName the qualified XML name
     *
     * @throws SAXException any SAX exception
     */
    public void endElement(String uri, String localName, String qName) throws SAXException {
        // Updates the XPath
        _strXPath = _strXPath.substring(0, _strXPath.lastIndexOf(STRING_SLASH));
    }

    /**
     * Event received when the analyzer encounters text (between two tags)
     *
     * @param ch the characters from the XML document
     * @param start the start position in the array
     * @param length the number of characters to read from the array
     *
     * @throws SAXException any SAX exception
     */
    public void characters(char[] ch, int start, int length) throws SAXException {
        // Gets the XPath comparisons properties
        String strXPathDate = AppPropertiesService.getProperty(PROPERTY_XPATH_DATE);
        String strXPathTitle = AppPropertiesService.getProperty(PROPERTY_XPATH_TITLE);
        String strXPathTheme = AppPropertiesService.getProperty(PROPERTY_XPATH_THEME);
        String strXPathAudience = AppPropertiesService.getProperty(PROPERTY_XPATH_AUDIENCE);

        // Gets the date
        if ((_strXPath != null) && _strXPath.equals(strXPathDate)) {
            _strDate += new String(ch, start, length);
        }

        // Gets the title
        else if ((_strXPath != null) && _strXPath.equals(strXPathTitle)) {
            _strTitle += new String(ch, start, length);
        }

        // Gets the theme
        else if ((_strXPath != null) && _strXPath.equals(strXPathTheme)) {
            if ((_strTheme != null) && !_strTheme.equals(STRING_EMPTY)) {
                _strTheme += (STRING_SPACE + new String(ch, start, length));
            } else {
                _strTheme += new String(ch, start, length);
            }
        }

        // Gets the audience
        else if ((_strXPath != null) && _strXPath.equals(strXPathAudience)) {
            _strAudience += new String(ch, start, length);
        }

        // Gets the contents
        if ((_strContents != null) && !_strContents.equals(STRING_EMPTY)) {
            _strContents += (STRING_SPACE + new String(ch, start, length));
        } else {
            _strContents += new String(ch, start, length);
        }
    }

    /**
     * Gets the list of Solr items
     *
     * @return The list of Solr items
     */
    public List<SolrItem> getPublicSolrItems() {
        return _listSolrItems;
    }
}