org.apache.solr.handler.CSVRequestHandler.java Source code

Introduction

Here is the source code for org.apache.solr.handler.CSVRequestHandler.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler;

import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.update.*;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.commons.csv.CSVStrategy;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.io.IOUtils;

import java.util.regex.Pattern;
import java.util.List;
import java.io.*;

/**
 * @version $Id: CSVRequestHandler.java 1165749 2011-09-06 16:20:07Z janhoy $
 */

public class CSVRequestHandler extends ContentStreamHandlerBase {

    @Override
    protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
        return new SingleThreadedCSVLoader(req, processor);
    }

    //////////////////////// SolrInfoMBeans methods //////////////////////
    @Override
    public String getDescription() {
        return "Add/Update multiple documents with CSV formatted rows";
    }

    @Override
    public String getVersion() {
        return "$Revision: 1165749 $";
    }

    @Override
    public String getSourceId() {
        return "$Id: CSVRequestHandler.java 1165749 2011-09-06 16:20:07Z janhoy $";
    }

    @Override
    public String getSource() {
        return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_3_5/solr/core/src/java/org/apache/solr/handler/CSVRequestHandler.java $";
    }
}

abstract class CSVLoader extends ContentStreamLoader {
    public static final String SEPARATOR = "separator";
    public static final String FIELDNAMES = "fieldnames";
    public static final String HEADER = "header";
    public static final String SKIP = "skip";
    public static final String SKIPLINES = "skipLines";
    public static final String MAP = "map";
    public static final String TRIM = "trim";
    public static final String EMPTY = "keepEmpty";
    public static final String SPLIT = "split";
    public static final String ENCAPSULATOR = "encapsulator";
    public static final String ESCAPE = "escape";
    public static final String OVERWRITE = "overwrite";

    private static Pattern colonSplit = Pattern.compile(":");
    private static Pattern commaSplit = Pattern.compile(",");

    final IndexSchema schema;
    final SolrParams params;
    final CSVStrategy strategy;
    final UpdateRequestProcessor processor;

    String[] fieldnames;
    SchemaField[] fields;
    CSVLoader.FieldAdder[] adders;

    int skipLines; // number of lines to skip at start of file

    final AddUpdateCommand templateAdd;

    /** Add a field to a document unless it's zero length.
     * The FieldAdder hierarchy handles all the complexity of
     * further transforming or splitting field values to keep the
     * main logic loop clean.  All implementations of add() must be
     * MT-safe!
     */
    private class FieldAdder {
        void add(SolrInputDocument doc, int line, int column, String val) {
            if (val.length() > 0) {
                doc.addField(fields[column].getName(), val, 1.0f);
            }
        }
    }

    /** add zero length fields */
    private class FieldAdderEmpty extends CSVLoader.FieldAdder {
        @Override
        void add(SolrInputDocument doc, int line, int column, String val) {
            doc.addField(fields[column].getName(), val, 1.0f);
        }
    }

    /** trim fields */
    private class FieldTrimmer extends CSVLoader.FieldAdder {
        private final CSVLoader.FieldAdder base;

        FieldTrimmer(CSVLoader.FieldAdder base) {
            this.base = base;
        }

        @Override
        void add(SolrInputDocument doc, int line, int column, String val) {
            base.add(doc, line, column, val.trim());
        }
    }

    /** map a single value.
     * for just a couple of mappings, this is probably faster than
     * using a HashMap.
     */
    private class FieldMapperSingle extends CSVLoader.FieldAdder {
        private final String from;
        private final String to;
        private final CSVLoader.FieldAdder base;

        FieldMapperSingle(String from, String to, CSVLoader.FieldAdder base) {
            this.from = from;
            this.to = to;
            this.base = base;
        }

        @Override
        void add(SolrInputDocument doc, int line, int column, String val) {
            if (from.equals(val))
                val = to;
            base.add(doc, line, column, val);
        }
    }

    /** Split a single value into multiple values based on
     * a CSVStrategy.
     */
    private class FieldSplitter extends CSVLoader.FieldAdder {
        private final CSVStrategy strategy;
        private final CSVLoader.FieldAdder base;

        FieldSplitter(CSVStrategy strategy, CSVLoader.FieldAdder base) {
            this.strategy = strategy;
            this.base = base;
        }

        @Override
        void add(SolrInputDocument doc, int line, int column, String val) {
            CSVParser parser = new CSVParser(new StringReader(val), strategy);
            try {
                String[] vals = parser.getLine();
                if (vals != null) {
                    for (String v : vals)
                        base.add(doc, line, column, v);
                } else {
                    base.add(doc, line, column, val);
                }
            } catch (IOException e) {
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
            }
        }
    }

    String errHeader = "CSVLoader:";

    CSVLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
        this.processor = processor;
        this.params = req.getParams();
        schema = req.getSchema();

        templateAdd = new AddUpdateCommand();
        templateAdd.allowDups = false;
        templateAdd.overwriteCommitted = true;
        templateAdd.overwritePending = true;

        if (params.getBool(OVERWRITE, true)) {
            templateAdd.allowDups = false;
            templateAdd.overwriteCommitted = true;
            templateAdd.overwritePending = true;
        } else {
            templateAdd.allowDups = true;
            templateAdd.overwriteCommitted = false;
            templateAdd.overwritePending = false;
        }

        templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);

        strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, CSVStrategy.ESCAPE_DISABLED, false,
                false, false, true);
        String sep = params.get(SEPARATOR);
        if (sep != null) {
            if (sep.length() != 1)
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Invalid separator:'" + sep + "'");
            strategy.setDelimiter(sep.charAt(0));
        }

        String encapsulator = params.get(ENCAPSULATOR);
        if (encapsulator != null) {
            if (encapsulator.length() != 1)
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                        "Invalid encapsulator:'" + encapsulator + "'");
        }

        String escape = params.get(ESCAPE);
        if (escape != null) {
            if (escape.length() != 1)
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Invalid escape:'" + escape + "'");
        }

        // if only encapsulator or escape is set, disable the other escaping mechanism
        if (encapsulator == null && escape != null) {
            strategy.setEncapsulator(CSVStrategy.ENCAPSULATOR_DISABLED);
            strategy.setEscape(escape.charAt(0));
        } else {
            if (encapsulator != null) {
                strategy.setEncapsulator(encapsulator.charAt(0));
            }
            if (escape != null) {
                char ch = escape.charAt(0);
                strategy.setEscape(ch);
                if (ch == '\\') {
                    // If the escape is the standard backslash, then also enable
                    // unicode escapes (it's harmless since 'u' would not otherwise
                    // be escaped.                    
                    strategy.setUnicodeEscapeInterpretation(true);
                }
            }
        }

        String fn = params.get(FIELDNAMES);
        fieldnames = fn != null ? commaSplit.split(fn, -1) : null;

        Boolean hasHeader = params.getBool(HEADER);

        skipLines = params.getInt(SKIPLINES, 0);

        if (fieldnames == null) {
            if (null == hasHeader) {
                // assume the file has the headers if they aren't supplied in the args
                hasHeader = true;
            } else if (!hasHeader) {
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                        "CSVLoader: must specify fieldnames=<fields>* or header=true");
            }
        } else {
            // if the fieldnames were supplied and the file has a header, we need to
            // skip over that header.
            if (hasHeader != null && hasHeader)
                skipLines++;

            prepareFields();
        }
    }

    /** create the FieldAdders that control how each field  is indexed */
    void prepareFields() {
        // Possible future optimization: for really rapid incremental indexing
        // from a POST, one could cache all of this setup info based on the params.
        // The link from FieldAdder to this would need to be severed for that to happen.

        fields = new SchemaField[fieldnames.length];
        adders = new CSVLoader.FieldAdder[fieldnames.length];
        String skipStr = params.get(SKIP);
        List<String> skipFields = skipStr == null ? null : StrUtils.splitSmart(skipStr, ',');

        CSVLoader.FieldAdder adder = new CSVLoader.FieldAdder();
        CSVLoader.FieldAdder adderKeepEmpty = new CSVLoader.FieldAdderEmpty();

        for (int i = 0; i < fields.length; i++) {
            String fname = fieldnames[i];
            // to skip a field, leave the entries in fields and addrs null
            if (fname.length() == 0 || (skipFields != null && skipFields.contains(fname)))
                continue;

            fields[i] = schema.getField(fname);
            boolean keepEmpty = params.getFieldBool(fname, EMPTY, false);
            adders[i] = keepEmpty ? adderKeepEmpty : adder;

            // Order that operations are applied: split -> trim -> map -> add
            // so create in reverse order.
            // Creation of FieldAdders could be optimized and shared among fields

            String[] fmap = params.getFieldParams(fname, MAP);
            if (fmap != null) {
                for (String mapRule : fmap) {
                    String[] mapArgs = colonSplit.split(mapRule, -1);
                    if (mapArgs.length != 2)
                        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                                "Map rules must be of the form 'from:to' ,got '" + mapRule + "'");
                    adders[i] = new CSVLoader.FieldMapperSingle(mapArgs[0], mapArgs[1], adders[i]);
                }
            }

            if (params.getFieldBool(fname, TRIM, false)) {
                adders[i] = new CSVLoader.FieldTrimmer(adders[i]);
            }

            if (params.getFieldBool(fname, SPLIT, false)) {
                String sepStr = params.getFieldParam(fname, SEPARATOR);
                char fsep = sepStr == null || sepStr.length() == 0 ? ',' : sepStr.charAt(0);
                String encStr = params.getFieldParam(fname, ENCAPSULATOR);
                char fenc = encStr == null || encStr.length() == 0 ? (char) -2 : encStr.charAt(0);
                String escStr = params.getFieldParam(fname, ESCAPE);
                char fesc = escStr == null || escStr.length() == 0 ? CSVStrategy.ESCAPE_DISABLED : escStr.charAt(0);

                CSVStrategy fstrat = new CSVStrategy(fsep, fenc, CSVStrategy.COMMENTS_DISABLED, fesc, false, false,
                        false, false);
                adders[i] = new CSVLoader.FieldSplitter(fstrat, adders[i]);
            }
        }
    }

    private void input_err(String msg, String[] line, int lineno) {
        StringBuilder sb = new StringBuilder();
        sb.append(errHeader).append(", line=").append(lineno).append(",").append(msg).append("\n\tvalues={");
        for (String val : line) {
            sb.append("'").append(val).append("',");
        }
        sb.append('}');
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, sb.toString());
    }

    private void input_err(String msg, String[] lines, int lineNo, Throwable e) {
        StringBuilder sb = new StringBuilder();
        sb.append(errHeader).append(", line=").append(lineNo).append(",").append(msg).append("\n\tvalues={");
        if (lines != null) {
            for (String val : lines) {
                sb.append("'").append(val).append("',");
            }
        } else {
            sb.append("NO LINES AVAILABLE");
        }
        sb.append('}');
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, sb.toString(), e);
    }

    /** load the CSV input */
    @Override
    public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream) throws IOException {
        errHeader = "CSVLoader: input=" + stream.getSourceInfo();
        Reader reader = null;
        try {
            reader = stream.getReader();
            if (skipLines > 0) {
                if (!(reader instanceof BufferedReader)) {
                    reader = new BufferedReader(reader);
                }
                BufferedReader r = (BufferedReader) reader;
                for (int i = 0; i < skipLines; i++) {
                    r.readLine();
                }
            }

            CSVParser parser = new CSVParser(reader, strategy);

            // parse the fieldnames from the header of the file
            if (fieldnames == null) {
                fieldnames = parser.getLine();
                if (fieldnames == null) {
                    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                            "Expected fieldnames in CSV input");
                }
                prepareFields();
            }

            // read the rest of the CSV file
            for (;;) {
                int line = parser.getLineNumber(); // for error reporting in MT mode
                String[] vals = null;
                try {
                    vals = parser.getLine();
                } catch (IOException e) {
                    //Catch the exception and rethrow it with more line information
                    input_err("can't read line: " + line, null, line, e);
                }
                if (vals == null)
                    break;

                if (vals.length != fields.length) {
                    input_err("expected " + fields.length + " values but got " + vals.length, vals, line);
                }

                addDoc(line, vals);
            }
        } finally {
            if (reader != null) {
                IOUtils.closeQuietly(reader);
            }
        }
    }

    /** called for each line of values (document) */
    abstract void addDoc(int line, String[] vals) throws IOException;

    /** this must be MT safe... may be called concurrently from multiple threads. */
    void doAdd(int line, String[] vals, SolrInputDocument doc, AddUpdateCommand template) throws IOException {
        // the line number is passed simply for error reporting in MT mode.
        // first, create the lucene document
        for (int i = 0; i < vals.length; i++) {
            if (fields[i] == null)
                continue; // ignore this field
            String val = vals[i];
            adders[i].add(doc, line, i, val);
        }

        template.solrDoc = doc;
        processor.processAdd(template);
    }

}

class SingleThreadedCSVLoader extends CSVLoader {
    SingleThreadedCSVLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
        super(req, processor);
    }

    @Override
    void addDoc(int line, String[] vals) throws IOException {
        templateAdd.indexedId = null;
        SolrInputDocument doc = new SolrInputDocument();
        doAdd(line, vals, doc, templateAdd);
    }
}