org.eclipse.smila.integration.solr.SolrPipelet.java Source code

Introduction

Here is the source code for org.eclipse.smila.integration.solr.SolrPipelet.java
Source

/*******************************************************************************
 * Copyright (c) 2010 Empolis GmbH and brox IT Solutions GmbH. 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    Igor Novakovic (Empolis GmbH) - initial implementation
 *******************************************************************************/

package org.eclipse.smila.integration.solr;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xerces.dom.DOMImplementationImpl;
import org.eclipse.smila.blackboard.Blackboard;
import org.eclipse.smila.blackboard.path.Path;
import org.eclipse.smila.datamodel.id.Id;
import org.eclipse.smila.datamodel.id.dom.IdBuilder;
import org.eclipse.smila.datamodel.record.Annotation;
import org.eclipse.smila.datamodel.record.Literal;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.SearchMessage;
import org.eclipse.smila.processing.SearchPipelet;
import org.eclipse.smila.processing.SimplePipelet;
import org.eclipse.smila.processing.configuration.PipeletConfiguration;
import org.eclipse.smila.processing.parameters.ParameterAccessor;
import org.eclipse.smila.processing.parameters.SearchAnnotations;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;

/**
 * This pipelet integrates Apache Solr's REST API and is designed to be used
 * both in processing and in search pipelines.
 */
public class SolrPipelet implements SimplePipelet, SearchPipelet {

    /** Execution modes of the indexing {@code process} method. */
    public enum ExecutionMode {
        ADD, DELETE
    }

    /** Character encoding used for request bodies and URL encoding. */
    public static final String UTF8 = "utf-8";

    // --- Solr endpoint (currently fixed to a local instance) ---
    private static final String HTTP_LOCALHOST = "http://localhost";
    private static final String SOLR_WEBAPP = ":8983/solr/";
    private static final String UPDATE = "update";
    private static final String SELECT = "select?";

    // --- HTTP methods and headers ---
    private static final String GET = "GET";
    private static final String POST = "POST";
    private static final String CONTENT_TYPE = "Content-Type";
    private static final String CONTENT_LENGTH = "Content-Length";
    private static final String TEXT_XML_CHARSET = "text/xml; charset=";

    // --- Element and attribute names of the XML update message ---
    private static final String ADD = "add";
    private static final String DELETE = "delete";
    private static final String FIELD = "field";
    private static final String OVERWRITE = "overwrite";
    /** Used both as XML attribute name and as configuration property name. */
    private static final String COMMIT_WITHIN = "commitWithin";

    // --- Record attribute name prefixes that are skipped when indexing ---
    private static final String META_DATA = "MetaData";
    private static final String RESPONSE_HEADER = "ResponseHeader";

    // --- Configuration property names read in configure() ---
    private static final String SHARDS = "shards";
    private static final String HIGHLIGHT = "highlight";
    private static final String HIGHLIGHT_PARAMS = "highlight.params";
    private static final String EXECUTION_MODE = "executionMode";
    private static final String ALLOW_DOUBLETS = "allowDoublets";
    private static final String INDEX_NAME = "indexName";

    /** Query term sent to Solr when the search request carries no query. */
    private static final String DEFAULT_SEARCH_TERM = "SMILA";

    // --- Shared XML factories ---
    private static final SAXParserFactory pf = SAXParserFactory.newInstance();
    private static final TransformerFactory TRANSFORMER_FACTORY = TransformerFactory.newInstance();

    // --- Search settings ---
    private String[] _shards;
    private boolean _highlight = true;
    private String[] _highlightParams;

    // --- Indexing settings ---
    private ExecutionMode _mode = ExecutionMode.ADD;
    private boolean _allowDoublets;
    private int _commitWithin = 10000;
    private String _indexName = "";

    private Log _log = LogFactory.getLog(SolrPipelet.class);

    /**
     * Indexing entry point: builds one XML update message ({@code add} or
     * {@code delete}, depending on the configured execution mode) containing
     * all given records and POSTs it to the Solr update handler.
     * 
     * @param blackboard
     *            the blackboard holding the records' attributes
     * @param recordIds
     *            the ids of the records to send; may be {@code null} or empty,
     *            in which case nothing is sent
     * @return the unchanged {@code recordIds}
     * @throws ProcessingException
     *             if the connection cannot be opened, or if building or
     *             sending the update message fails
     */
    public Id[] process(Blackboard blackboard, Id[] recordIds) throws ProcessingException {
        if (recordIds == null || recordIds.length == 0) {
            // Nothing to index; avoid a pointless round trip to Solr.
            return recordIds;
        }
        final HttpURLConnection conn = openUpdateConnection();
        String updateXMLMessage = null;
        Id currentId = null;
        try {
            final DOMImplementation impl = DOMImplementationImpl.getDOMImplementation();
            final Document document = impl.createDocument(null, SolrResponseHandler.SOLR, null);

            // NOTE(review): Solr's XML update format documents deletes as
            // <delete><id>...</id></delete>. This code emits <doc>/<field>
            // children and the overwrite attribute in DELETE mode as well;
            // verify DELETE mode against the target Solr version.
            final Element command = document.createElement(_mode == ExecutionMode.ADD ? ADD : DELETE);
            command.setAttribute(OVERWRITE, String.valueOf(!_allowDoublets));
            command.setAttribute(COMMIT_WITHIN, String.valueOf(_commitWithin));

            // One formatter per call: SimpleDateFormat is not thread-safe, so
            // it must not be shared between invocations.
            final SimpleDateFormat dateFormat = new SimpleDateFormat(SolrResponseHandler.DATE_FORMAT_PATTERN);
            for (Id id : recordIds) {
                currentId = id;
                command.appendChild(createDocElement(document, blackboard, id, dateFormat));
            }

            updateXMLMessage = serialize(command);
            if (_log.isDebugEnabled()) {
                _log.debug(updateXMLMessage);
            }
            final String response = postUpdate(conn, updateXMLMessage);
            if (_log.isDebugEnabled()) {
                _log.debug("Response:\n" + response);
            }
        } catch (Exception e) {
            final String msg = "Error while processing record '" + currentId + "' for index '" + _indexName
                    + "': '" + e.getMessage() + "'.";
            _log.error(msg, e);
            if (_log.isDebugEnabled()) {
                dumpFailedMessage(currentId, updateXMLMessage);
            }
            throw new ProcessingException(msg, e);
        } finally {
            conn.disconnect();
        }
        return recordIds;
    }

    /**
     * Opens and configures (but does not yet connect) the HTTP connection to
     * the Solr update handler.
     */
    private HttpURLConnection openUpdateConnection() throws ProcessingException {
        try {
            final URL url = new URL(HTTP_LOCALHOST + SOLR_WEBAPP + UPDATE);
            final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod(POST);
            conn.setRequestProperty(CONTENT_TYPE, TEXT_XML_CHARSET + UTF8);
            conn.setUseCaches(false);
            conn.setDoOutput(true);
            conn.setDoInput(true);
            conn.setReadTimeout(10000);
            return conn;
        } catch (Exception e) {
            final String msg = "Error while opening Solr connection: '" + e.getMessage() + "'";
            _log.error(msg, e);
            throw new ProcessingException(msg, e);
        }
    }

    /**
     * Creates the {@code doc} element for one record: the URL-encoded record
     * id plus one {@code field} element per literal of every attribute whose
     * name does not start with the meta data or response header prefix.
     */
    private Element createDocElement(Document document, Blackboard blackboard, Id id, SimpleDateFormat dateFormat)
            throws Exception {
        final Element doc = document.createElement(SolrResponseHandler.DOC);

        // The id is serialized to its string form and URL-encoded before it
        // is stored in the id field.
        final String idXML = new IdBuilder().idToString(id);
        doc.appendChild(createField(document, SolrResponseHandler.ID, URLEncoder.encode(idXML, UTF8)));

        final Iterator<String> attrNames = blackboard.getAttributeNames(id);
        while (attrNames.hasNext()) {
            final String attrName = attrNames.next();
            if (attrName.startsWith(META_DATA) || attrName.startsWith(RESPONSE_HEADER)) {
                continue;
            }
            final Iterator<Literal> literals = blackboard.getLiterals(id, new Path(attrName)).iterator();
            while (literals.hasNext()) {
                final Literal value = literals.next();
                final String stringValue;
                if (Literal.DataType.DATE.equals(value.getDataType())) {
                    stringValue = dateFormat.format(value.getDateValue());
                } else if (Literal.DataType.DATETIME.equals(value.getDataType())) {
                    stringValue = dateFormat.format(value.getDateTimeValue());
                } else {
                    stringValue = replaceNonXMLChars(value.getStringValue());
                }
                if (stringValue == null) {
                    // A literal without a string value cannot be written as
                    // a text node; skip it instead of failing the batch.
                    continue;
                }
                doc.appendChild(createField(document, attrName, stringValue));
            }
        }
        return doc;
    }

    /** Creates a {@code field} element with the given name attribute and text content. */
    private Element createField(Document document, String name, String value) {
        final Element field = document.createElement(FIELD);
        field.setAttribute(SolrResponseHandler.NAME, name);
        final Text text = document.createTextNode(value);
        field.appendChild(text);
        return field;
    }

    /** Serializes the given element to an XML string (indented when debug logging is on). */
    private String serialize(Element element) throws Exception {
        final Transformer transformer = TRANSFORMER_FACTORY.newTransformer();
        if (_log.isDebugEnabled()) {
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        }
        final Writer writer = new StringWriter();
        transformer.transform(new DOMSource(element), new StreamResult(writer));
        return writer.toString();
    }

    /**
     * Writes the update message to the connection as UTF-8 and returns the
     * response body.
     */
    private String postUpdate(HttpURLConnection conn, String message) throws Exception {
        // Encode once so that the announced length matches the bytes actually
        // sent (the character count differs for non-ASCII content).
        final byte[] payload = message.getBytes(UTF8);
        conn.setRequestProperty(CONTENT_LENGTH, Integer.toString(payload.length));

        final DataOutputStream os = new DataOutputStream(conn.getOutputStream());
        try {
            os.write(payload);
            os.flush();
        } finally {
            os.close();
        }

        final BufferedReader rd = new BufferedReader(new InputStreamReader(conn.getInputStream(), UTF8));
        try {
            final StringBuilder response = new StringBuilder();
            String line;
            while ((line = rd.readLine()) != null) {
                response.append(line).append('\n');
            }
            return response.toString();
        } finally {
            rd.close();
        }
    }

    /**
     * Best-effort debugging aid: writes the failed update message to
     * {@code <idHash>.xml}. Failures here are logged and never replace the
     * original error.
     */
    private void dumpFailedMessage(Id id, String message) {
        if (id == null || message == null) {
            // The failure happened before a record or a message was available.
            return;
        }
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(id.getIdHash() + ".xml");
            fos.write(message.getBytes(UTF8));
            fos.flush();
        } catch (Exception dumpError) {
            _log.warn("Could not dump failed update message for record '" + id + "'", dumpError);
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (Exception ignored) {
                    // Nothing sensible left to do for a debug dump file.
                }
            }
        }
    }

    /**
     * Search entry point: translates the query of the given search message
     * into a Solr select request, lets a {@code SolrResponseHandler} parse
     * the response, stores the total number of hits in the query's result
     * annotation and sets the collected record ids on the message.
     * 
     * @param blackboard
     *            the blackboard holding the query record
     * @param message
     *            the search message; returned untouched if it has no query
     * @return the given message
     * @throws ProcessingException
     *             if the request cannot be built or sent, or the response
     *             cannot be parsed
     */
    public SearchMessage process(Blackboard blackboard, SearchMessage message) throws ProcessingException {
        if (!message.hasQuery()) {
            return message;
        }
        final Id queryId = message.getQuery();
        final ParameterAccessor parameters = new ParameterAccessor(blackboard, queryId);
        final String query = parameters.getQuery();
        final int resultSize = parameters.getResultSize();
        final int resultOffset = parameters.getResultOffset();
        final List<String> resultAttributes = parameters.getResultAttributes();
        // Threshold seems not to be implemented in Solr, so it is ignored
        // for now.

        final List<Id> rIds = new ArrayList<Id>();
        HttpURLConnection conn = null;
        InputStream is = null;
        try {
            final URL url = new URL(buildSearchURL(query, resultOffset, resultSize, resultAttributes));
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod(GET);
            conn.setReadTimeout(10000);
            conn.connect();

            is = conn.getInputStream();
            final SAXParser parser = pf.newSAXParser();
            final SolrResponseHandler srh = new SolrResponseHandler(blackboard, rIds);
            parser.parse(is, srh);

            final String totalHits = Integer.toString(srh.noOfHits);
            Annotation resultAnno = blackboard.getAnnotation(queryId, null, SearchAnnotations.RESULT);
            if (resultAnno == null) {
                resultAnno = blackboard.createAnnotation(queryId);
                blackboard.setAnnotation(queryId, null, SearchAnnotations.RESULT, resultAnno);
            }
            resultAnno.setNamedValue(SearchAnnotations.TOTAL_HITS, totalHits);
        } catch (Exception e) {
            final String msg = "Error while processing search request: '" + e.getMessage() + "'.";
            _log.error(msg, e);
            throw new ProcessingException(msg, e);
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (Exception closeError) {
                    _log.debug("Could not close Solr response stream", closeError);
                }
            }
            // conn is still null when building or opening the URL failed; an
            // unguarded call here would hide the real error behind an NPE.
            if (conn != null) {
                conn.disconnect();
            }
        }
        message.setRecords(rIds);
        return message;
    }

    /**
     * Assembles the select URL from the configured shards and highlighting
     * settings and the given request parameters.
     */
    private String buildSearchURL(String query, int resultOffset, int resultSize, List<String> resultAttributes)
            throws Exception {
        final StringBuilder searchURL = new StringBuilder(HTTP_LOCALHOST).append(SOLR_WEBAPP).append(SELECT);

        // Read the field once; it can be replaced through setShards().
        final String[] shards = _shards;
        if (shards != null && shards.length > 0) {
            // Only emit the parameter when there is at least one shard, and
            // separate entries without a trailing comma.
            searchURL.append("shards=");
            for (int i = 0; i < shards.length; i++) {
                if (i > 0) {
                    searchURL.append(',');
                }
                searchURL.append(shards[i]).append(SOLR_WEBAPP);
            }
        }
        if (_highlight) {
            searchURL.append("&hl=true");
        }
        if (_highlightParams != null) {
            // Highlight parameters are appended verbatim.
            // NOTE(review): presumably configured as ready-made key=value
            // pairs; confirm against the pipelet configuration.
            for (String hp : _highlightParams) {
                searchURL.append('&').append(hp);
            }
        }
        searchURL.append("&start=").append(resultOffset);
        searchURL.append("&rows=").append(resultSize);

        // Include requested attributes and scores into the result.
        if (resultAttributes != null) {
            // The id has to be requested explicitly.
            searchURL.append("&fl=id,");
            for (String ra : resultAttributes) {
                searchURL.append(URLEncoder.encode(ra, UTF8)).append(',');
            }
            searchURL.append("score");
        }

        searchURL.append("&indent=true&q=");
        if (query != null) {
            searchURL.append(URLEncoder.encode(query, UTF8));
        } else {
            // Some value has to be set for the query, otherwise Solr answers
            // with error 500.
            searchURL.append(DEFAULT_SEARCH_TERM);
        }
        return searchURL.toString();
    }

    /**
     * Reads the search and indexing settings from the pipelet configuration.
     * Properties that are not set keep their current values. Boolean and
     * integer properties are accepted both as typed values and as their
     * string representation.
     * 
     * @param config
     *            the pipelet configuration
     * @throws ProcessingException
     *             if the execution mode is unknown or a boolean/integer
     *             property has a value that cannot be interpreted
     */
    public void configure(PipeletConfiguration config) throws ProcessingException {
        // Search configuration parameters
        _shards = config.getPropertyStringValues(SHARDS);
        Object configValue = config.getPropertyFirstValue(HIGHLIGHT);
        if (configValue != null) {
            _highlight = toBoolean(HIGHLIGHT, configValue);
        }
        _highlightParams = config.getPropertyStringValues(HIGHLIGHT_PARAMS);

        // Indexing configuration parameters
        configValue = config.getPropertyFirstValue(EXECUTION_MODE);
        if (configValue != null) {
            final String mode = configValue.toString().trim();
            try {
                _mode = ExecutionMode.valueOf(mode);
            } catch (IllegalArgumentException iae) {
                final String msg = "Error while configuring SolrPipelet: mode '" + mode + "' is not supported!";
                _log.error(msg);
                throw new ProcessingException(msg, iae);
            }
        }
        configValue = config.getPropertyFirstValue(ALLOW_DOUBLETS);
        if (configValue != null) {
            _allowDoublets = toBoolean(ALLOW_DOUBLETS, configValue);
        }
        configValue = config.getPropertyFirstValue(COMMIT_WITHIN);
        if (configValue != null) {
            _commitWithin = toInt(COMMIT_WITHIN, configValue);
        }

        // Currently this pipelet does not support more than one
        // index i.e. core, but we read this parameter anyway.
        // Note: In Solr an index is called a core.
        configValue = config.getPropertyFirstValue(INDEX_NAME);
        if (configValue != null) {
            _indexName = configValue.toString();
        }

        // Pass own reference to a JMX agent, but do this
        // only if we are in search pipeline (i.e. shards are configured).
        if (_shards != null && _shards.length != 0) {
            new SolrPipeletAgent(this);
        }
    }

    /** Converts a configuration value to a boolean; accepts {@code Boolean} or "true"/"false". */
    private boolean toBoolean(String property, Object value) throws ProcessingException {
        if (value instanceof Boolean) {
            return ((Boolean) value).booleanValue();
        }
        final String text = value.toString().trim();
        if ("true".equalsIgnoreCase(text)) {
            return true;
        }
        if ("false".equalsIgnoreCase(text)) {
            return false;
        }
        final String msg = "Error while configuring SolrPipelet: value '" + text + "' of property '" + property
                + "' is not a boolean!";
        _log.error(msg);
        throw new ProcessingException(msg, new IllegalArgumentException(msg));
    }

    /** Converts a configuration value to an int; accepts any {@code Number} or a decimal string. */
    private int toInt(String property, Object value) throws ProcessingException {
        if (value instanceof Number) {
            return ((Number) value).intValue();
        }
        final String text = value.toString().trim();
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException nfe) {
            final String msg = "Error while configuring SolrPipelet: value '" + text + "' of property '" + property
                    + "' is not an integer!";
            _log.error(msg);
            throw new ProcessingException(msg, nfe);
        }
    }

    /**
     * Replaces the configured shards with the entries of a comma-separated
     * list. Entries are trimmed and blank entries are dropped. A
     * {@code null} or empty argument leaves the current shards unchanged.
     * 
     * @param shards
     *            comma-separated shard names, e.g. {@code "host1, host2"}
     */
    public void setShards(String shards) {
        if (shards == null || shards.isEmpty()) {
            return;
        }
        List<String> sList = new ArrayList<String>();
        for (String node : shards.split(",")) {
            // Trim so that "a, b" does not put a space into the request URL.
            String name = node.trim();
            if (!name.isEmpty()) {
                sList.add(name);
            }
        }
        _shards = sList.toArray(new String[sList.size()]);
    }

    /**
     * Returns the configured shards as one string in which every shard is
     * followed by a comma (so a non-empty result ends with a comma).
     * 
     * @return the shard list, or an empty string if no shards are set
     */
    public String getShards() {
        StringBuilder joined = new StringBuilder();
        if (_shards != null) {
            for (int i = 0; i < _shards.length; i++) {
                joined.append(_shards[i]).append(",");
            }
        }
        return joined.toString();
    }

    /**
     * Replaces every character that is not allowed in an XML 1.0 document
     * with a space. These are the control characters below 0x20 other than
     * tab, line feed and carriage return, the non-characters 0xFFFE and
     * 0xFFFF, and unpaired UTF-16 surrogates. Well-formed surrogate pairs
     * (characters beyond the Basic Multilingual Plane) are kept. The result
     * has the same length as the input.
     * 
     * @param source
     *            - the original source string, may be {@code null}
     * @return The filtered string, or {@code null} if {@code source} is
     *         {@code null}
     */
    public static String replaceNonXMLChars(String source) {
        if (source == null) {
            return null;
        }
        final int length = source.length();
        final StringBuilder sb = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            final char c = source.charAt(i);
            if (Character.isHighSurrogate(c) && i + 1 < length && Character.isLowSurrogate(source.charAt(i + 1))) {
                // A complete surrogate pair encodes a valid supplementary
                // character; copy both halves untouched.
                sb.append(c).append(source.charAt(i + 1));
                i++;
            } else if (isXmlChar(c)) {
                sb.append(c);
            } else {
                sb.append(' ');
            }
        }
        return sb.toString();
    }

    /**
     * Tells whether a single UTF-16 code unit is, on its own, a legal XML 1.0
     * character. Surrogates are rejected here because only complete pairs
     * are legal, and those are handled by the caller.
     */
    private static boolean isXmlChar(char c) {
        if (c < 0x20) {
            // Only tab, line feed and carriage return are allowed.
            return c == 0x9 || c == 0xA || c == 0xD;
        }
        if (c >= 0xD800 && c <= 0xDFFF) {
            return false;
        }
        return c < 0xFFFE;
    }
}