ddf.catalog.source.solr.DynamicSchemaResolver.java Source code

Introduction

Here is the source code for ddf.catalog.source.solr.DynamicSchemaResolver.java
Source

/**
 * Copyright (c) Codice Foundation
 * <p>
 * This is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License as published by the Free Software Foundation, either version 3 of the
 * License, or any later version.
 * <p>
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details. A copy of the GNU Lesser General Public License
 * is distributed along with this program and can be found at
 * <http://www.gnu.org/licenses/lgpl.html>.
 */
package ddf.catalog.source.solr;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.codehaus.stax2.XMLInputFactory2;
import org.codice.solr.factory.ConfigurationStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import ddf.catalog.data.AttributeDescriptor;
import ddf.catalog.data.AttributeType;
import ddf.catalog.data.AttributeType.AttributeFormat;
import ddf.catalog.data.Metacard;
import ddf.catalog.data.MetacardCreationException;
import ddf.catalog.data.MetacardType;
import ddf.catalog.data.Result;
import ddf.catalog.data.impl.AttributeDescriptorImpl;
import ddf.catalog.data.impl.BasicTypes;
import ddf.catalog.data.impl.MetacardTypeImpl;
import lux.Config;
import lux.xml.SaxonDocBuilder;
import lux.xml.XmlReader;
import lux.xml.tinybin.TinyBinary;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.tree.tiny.TinyDocumentImpl;
import net.sf.saxon.tree.tiny.TinyTree;

/**
 * This class tries to resolve all user given field names to their corresponding dynamic Solr index
 * field name. This class takes most of its logic directly from the configured Solr schema.xml. For
 * instance, the suffixes enumerated in this class are directly copied from the schema.xml.
 *
 * @since 0.2.0
 */
public class DynamicSchemaResolver {

    protected static final char FIRST_CHAR_OF_SUFFIX = '_';

    protected static final String COULD_NOT_READ_METACARD_TYPE_MESSAGE = "Could not read MetacardType.";

    protected static final String FIELDS_KEY = "fields";

    protected static final String COULD_NOT_SERIALIZE_OBJECT_MESSAGE = "Could not serialize object";

    protected static final XMLInputFactory XML_INPUT_FACTORY;

    private static final String SOLR_CLOUD_VERSION_FIELD = "_version_";

    public static final String LUX_XML_FIELD_NAME = "lux_xml";

    public static final String SCORE_FIELD_NAME = "score";

    private static final List<String> PRIVATE_SOLR_FIELDS = Arrays.asList(SOLR_CLOUD_VERSION_FIELD,
            SchemaFields.METACARD_TYPE_FIELD_NAME, SchemaFields.METACARD_TYPE_OBJECT_FIELD_NAME, LUX_XML_FIELD_NAME,
            SCORE_FIELD_NAME);

    private static final Logger LOGGER = LoggerFactory.getLogger(DynamicSchemaResolver.class);

    static {
        ClassLoader tccl = Thread.currentThread().getContextClassLoader();
        try {
            Thread.currentThread().setContextClassLoader(DynamicSchemaResolver.class.getClassLoader());

            XML_INPUT_FACTORY = XMLInputFactory2.newInstance();
            XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.FALSE);
            XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_COALESCING, Boolean.FALSE);
            XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
        } finally {
            Thread.currentThread().setContextClassLoader(tccl);
        }
    }

    protected Set<String> fieldsCache = new HashSet<>();

    protected Set<String> anyTextFieldsCache = new HashSet<>();

    protected SchemaFields schemaFields;

    protected Map<String, MetacardType> metacardTypesCache = new HashMap<>();

    protected Map<String, byte[]> metacardTypeNameToSerialCache = new HashMap<>();

    private Processor processor = new Processor(new Config());

    public DynamicSchemaResolver() {
        this.schemaFields = new SchemaFields();

        fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX);
        fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED);
        fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED + SchemaFields.HAS_CASE);
        fieldsCache.add(Metacard.TAGS + SchemaFields.TEXT_SUFFIX);

        anyTextFieldsCache.add(Metacard.METADATA + SchemaFields.TEXT_SUFFIX);
        Set<String> basicTextAttributes = BasicTypes.BASIC_METACARD.getAttributeDescriptors().stream()
                .filter(descriptor -> BasicTypes.STRING_TYPE.equals(descriptor.getType()))
                .map(stringDescriptor -> stringDescriptor.getName() + SchemaFields.TEXT_SUFFIX)
                .collect(Collectors.toSet());
        anyTextFieldsCache.addAll(basicTextAttributes);
        fieldsCache.add(BasicTypes.VALIDATION_ERRORS + SchemaFields.TEXT_SUFFIX);
        fieldsCache.add(BasicTypes.VALIDATION_WARNINGS + SchemaFields.TEXT_SUFFIX);
    }

    /**
     * Adds the fields that are already in Solr to the cache. This method should be called
     * once the SolrClient is up to ensure the cache is synchronized with Solr.
     *
     * @param client
     *            the SolrClient we are working with
     */
    public void addFieldsFromClient(SolrClient client) {
        if (client == null) {
            LOGGER.warn("Solr client is null, could not add fields to cache.");
            return;
        }

        SolrQuery query = new SolrQuery();

        // numterms=0 means retrieve everything (regular or dynamic fields)
        query.add("numterms", "0");

        /*
         * Adding this request handler allows us to query the schema dynamically. The name of the
         * request handler is provided by the schema.xml. If the name is changed in the schema.xml,
         * then this value must be changed as well.
         */
        query.setRequestHandler("/admin/luke");

        try {
            QueryResponse response = client.query(query);
            for (Entry<String, ?> e : ((SimpleOrderedMap<?>) (response.getResponse().get(FIELDS_KEY)))) {
                fieldsCache.add(e.getKey());
                if (e.getKey().endsWith(SchemaFields.TEXT_SUFFIX)) {
                    anyTextFieldsCache.add(e.getKey());
                }
            }
        } catch (SolrServerException | SolrException | IOException e) {
            LOGGER.warn("Could not update cache for field names.", e);
        }
    }

    /**
     * Adds the fields of the Metacard into the {@link SolrInputDocument}
     */
    public void addFields(Metacard metacard, SolrInputDocument solrInputDocument) throws MetacardCreationException {
        MetacardType schema = metacard.getMetacardType();

        // TODO: register these metacard types when a new one is seen

        for (AttributeDescriptor ad : schema.getAttributeDescriptors()) {
            if (metacard.getAttribute(ad.getName()) != null) {
                List<Serializable> attributeValues = metacard.getAttribute(ad.getName()).getValues();

                if (attributeValues != null && attributeValues.size() > 0 && attributeValues.get(0) != null) {
                    AttributeFormat format = ad.getType().getAttributeFormat();
                    String formatIndexName = ad.getName() + getFieldSuffix(format);

                    if (AttributeFormat.XML.equals(format)) {
                        List<String> parsedTexts = parseTextFrom(attributeValues);

                        // text => metadata_txt_ws
                        String whitespaceTokenizedIndexName = ad.getName() + getFieldSuffix(AttributeFormat.STRING)
                                + SchemaFields.WHITESPACE_TEXT_SUFFIX;
                        solrInputDocument.addField(whitespaceTokenizedIndexName, parsedTexts);

                        // text => metadata_txt_ws_has_case
                        String whiteSpaceTokenizedHasCaseIndexName = ad.getName()
                                + getFieldSuffix(AttributeFormat.STRING) + SchemaFields.WHITESPACE_TEXT_SUFFIX
                                + SchemaFields.HAS_CASE;
                        solrInputDocument.addField(whiteSpaceTokenizedHasCaseIndexName, parsedTexts);

                        // text => metadata_txt_tokenized
                        String specialStringIndexName = ad.getName() + getFieldSuffix(AttributeFormat.STRING)
                                + getSpecialIndexSuffix(AttributeFormat.STRING);
                        solrInputDocument.addField(specialStringIndexName, parsedTexts);

                        // text case sensitive
                        solrInputDocument.addField(specialStringIndexName + SchemaFields.HAS_CASE, parsedTexts);
                    } else if (AttributeFormat.OBJECT.equals(format)) {
                        ByteArrayOutputStream byteArrayOS = new ByteArrayOutputStream();
                        List<Serializable> byteArrays = new ArrayList<>();

                        try (ObjectOutputStream out = new ObjectOutputStream(byteArrayOS)) {
                            for (Serializable serializable : attributeValues) {
                                out.writeObject(serializable);
                                byteArrays.add(byteArrayOS.toByteArray());
                                out.reset();
                            }
                        } catch (IOException e) {
                            LOGGER.warn(COULD_NOT_SERIALIZE_OBJECT_MESSAGE, e);
                            throw new MetacardCreationException(COULD_NOT_SERIALIZE_OBJECT_MESSAGE);
                        }

                        attributeValues = byteArrays;
                    }
                    solrInputDocument.addField(formatIndexName, attributeValues);
                    if (!(formatIndexName.endsWith(SchemaFields.BINARY_SUFFIX)
                            || formatIndexName.endsWith(SchemaFields.OBJECT_SUFFIX))) {
                        solrInputDocument.addField(formatIndexName + SchemaFields.SORT_KEY_SUFFIX,
                                attributeValues.get(0));
                    }
                }
            }
        }

        if (!ConfigurationStore.getInstance().isDisableTextPath()) {
            if (StringUtils.isNotBlank(metacard.getMetadata())) {
                try {
                    byte[] luxXml = createTinyBinary(metacard.getMetadata());
                    solrInputDocument.addField(LUX_XML_FIELD_NAME, luxXml);
                } catch (XMLStreamException | SaxonApiException e) {
                    LOGGER.warn("Unable to parse metadata field.  XPath support unavailable for metacard "
                            + metacard.getId());
                }
            }
        }

        /*
         * Lastly the metacardType must be added to the solr document. These are internal fields
         */
        solrInputDocument.addField(SchemaFields.METACARD_TYPE_FIELD_NAME, schema.getName());
        byte[] metacardTypeBytes = metacardTypeNameToSerialCache.get(schema.getName());

        if (metacardTypeBytes == null) {
            MetacardType coreMetacardType = new MetacardTypeImpl(schema.getName(),
                    convertAttributeDescriptors(schema.getAttributeDescriptors()));

            metacardTypesCache.put(schema.getName(), coreMetacardType);

            metacardTypeBytes = serialize(coreMetacardType);
            metacardTypeNameToSerialCache.put(schema.getName(), metacardTypeBytes);

            addToFieldsCache(coreMetacardType.getAttributeDescriptors());
        }

        solrInputDocument.addField(SchemaFields.METACARD_TYPE_OBJECT_FIELD_NAME, metacardTypeBytes);
    }

    private byte[] createTinyBinary(String xml) throws XMLStreamException, SaxonApiException {
        SaxonDocBuilder builder = new SaxonDocBuilder(processor);

        XmlReader xmlReader = new XmlReader();
        xmlReader.addHandler(builder);
        xmlReader.setStripNamespaces(true);
        xmlReader.read(IOUtils.toInputStream(xml));

        XdmNode node = builder.getDocument();

        TinyTree tinyTree = ((TinyDocumentImpl) node.getUnderlyingNode()).getTree();
        TinyBinary tinyBinary = new TinyBinary(tinyTree, StandardCharsets.UTF_8);

        return tinyBinary.getBytes();
    }

    /**
     * Returns the best approximation as to what {@link AttributeFormat} this Solr Field is.
     *
     * @param solrFieldName name of the Solr field
     * @return the {@link AttributeFormat} associated with the Solr field
     */
    public AttributeFormat getType(String solrFieldName) {

        String suffix = "";
        int lastIndexOfUndercore = solrFieldName.lastIndexOf(FIRST_CHAR_OF_SUFFIX);

        if (lastIndexOfUndercore != -1) {
            suffix = solrFieldName.substring(lastIndexOfUndercore, solrFieldName.length());
        }

        return schemaFields.getFormat(suffix);
    }

    public List<Serializable> getDocValues(String solrFieldName, Collection<Object> docValues) {
        ArrayList<Serializable> values = new ArrayList<>();
        Iterator<Object> iterator = docValues.iterator();
        while (iterator.hasNext()) {
            values.add(getDocValue(solrFieldName, iterator.next()));
        }
        return values;
    }

    public Serializable getDocValue(String solrFieldName, Object docValue) {

        AttributeFormat format = getType(solrFieldName);

        if (AttributeFormat.SHORT.equals(format)) {
            /*
             * We have inside knowledge that user-given short objects are stored as Integers in
             * Solr. You cannot cast an int to a short, so the workaround is to parse it. This
             * should not lead to any loss of information because the value was originally a short.
             */
            return Short.parseShort(docValue.toString());
        } else if (AttributeFormat.OBJECT.equals(format)) {

            ByteArrayInputStream bais = null;
            ObjectInputStream in = null;
            try {
                bais = new ByteArrayInputStream((byte[]) docValue);
                in = new ObjectInputStream(bais);
                return (Serializable) in.readObject();
            } catch (IOException e) {
                LOGGER.warn("IO exception loading input document", e);
            } catch (ClassNotFoundException e) {
                LOGGER.warn("Could not create object to return.", e);
                // TODO which exception to throw?
            } finally {
                IOUtils.closeQuietly(bais);
                IOUtils.closeQuietly(in);
            }

            return null;
        } else {
            return ((Serializable) docValue);
        }
    }

    /**
     * PRE-CONDITION is that fieldname cannot be null.
     * <p>
     * The convention is that we add a suffix starting with an underscore, so if we find the last
     * underscore, then we can return the original field name.
     *
     * @param solrFieldName Solr index field name
     * @return the original field name
     */
    public String resolveFieldName(String solrFieldName) {
        int lastIndexOfUndercore = solrFieldName.lastIndexOf(FIRST_CHAR_OF_SUFFIX);

        if (lastIndexOfUndercore != -1) {
            return solrFieldName.substring(0, lastIndexOfUndercore);
        }
        return solrFieldName;
    }

    public boolean isPrivateField(String solrFieldName) {
        return PRIVATE_SOLR_FIELDS.contains(solrFieldName) || solrFieldName.endsWith(SchemaFields.SORT_KEY_SUFFIX);
    }

    /**
     * Attempts to resolve the name of a field without being given an {@link AttributeFormat}
     *
     * @param field user given field name
     * @return a list of possible Solr field names that match the given field. If none are found,
     * then an empty list is returned
     */
    public List<String> getAnonymousField(String field) {
        ArrayList<String> list = new ArrayList<>();

        for (AttributeFormat format : AttributeFormat.values()) {
            String fullFieldName = field + schemaFields.getFieldSuffix(format);

            if (fieldsCache.contains(fullFieldName)) {
                list.add(fullFieldName);
            }
        }

        return list;
    }

    /**
     * Attempts to find the fieldName for the given propertyName value.
     *
     * @param propertyName           property name provided by user
     * @param format                 {@link AttributeFormat} that describes the type of {@link ddf.catalog.data.Attribute} the field is
     * @param isSearchedAsExactValue specifies if any special index suffixes need to be added to the field
     * @return the proper schema field name. If a schema name cannot be found in cache, returns a
     * schema field name that matches the dynamic field type formatting.
     */
    public String getField(String propertyName, AttributeFormat format, boolean isSearchedAsExactValue) {

        if (Metacard.ANY_GEO.equals(propertyName)) {
            return Metacard.GEOGRAPHY + "_geo_index";
        }

        String fieldName = propertyName + schemaFields.getFieldSuffix(format)
                + (isSearchedAsExactValue ? "" : getSpecialIndexSuffix(format));

        if (fieldsCache.contains(fieldName)) {
            return fieldName;
        }

        switch (format) {
        case DOUBLE:
        case LONG:
        case INTEGER:
        case SHORT:
        case FLOAT:
            return findAnyMatchingNumericalField(propertyName);
        default:
            break;
        }

        LOGGER.debug("Could not find exact schema field name for [{}], attempting to search with [{}]",
                propertyName, fieldName);

        return fieldName;
    }

    public String getFieldSuffix(AttributeFormat format) {
        return schemaFields.getFieldSuffix(format);
    }

    public MetacardType getMetacardType(SolrDocument doc) throws MetacardCreationException {
        String mTypeFieldName = doc.getFirstValue(SchemaFields.METACARD_TYPE_FIELD_NAME).toString();

        MetacardType cachedMetacardType = metacardTypesCache.get(mTypeFieldName);

        if (cachedMetacardType != null) {
            return cachedMetacardType;
        }

        byte[] bytes = (byte[]) doc.getFirstValue(SchemaFields.METACARD_TYPE_OBJECT_FIELD_NAME);

        ByteArrayInputStream bais = null;
        ObjectInputStream in = null;
        try {

            bais = new ByteArrayInputStream(bytes);

            in = new ObjectInputStream(bais);

            cachedMetacardType = (MetacardType) in.readObject();

        } catch (IOException e) {

            LOGGER.warn("IO exception loading cached metacard type", e);

            throw new MetacardCreationException(COULD_NOT_READ_METACARD_TYPE_MESSAGE);
        } catch (ClassNotFoundException e) {
            LOGGER.warn("Class exception loading cached metacard type", e);

            throw new MetacardCreationException(COULD_NOT_READ_METACARD_TYPE_MESSAGE);
        } finally {
            IOUtils.closeQuietly(bais);
            IOUtils.closeQuietly(in);
        }

        metacardTypeNameToSerialCache.put(mTypeFieldName, bytes);
        metacardTypesCache.put(mTypeFieldName, cachedMetacardType);
        addToFieldsCache(cachedMetacardType.getAttributeDescriptors());
        return cachedMetacardType;
    }

    public String getCaseSensitiveField(String mappedPropertyName) {
        // TODO We can check if this field really does exist
        return mappedPropertyName + SchemaFields.HAS_CASE;
    }

    public String getWhitespaceTokenizedField(String mappedPropertyName) {
        return mappedPropertyName + SchemaFields.WHITESPACE_TEXT_SUFFIX;
    }

    protected String getSpecialIndexSuffix(AttributeFormat format) {

        switch (format) {
        case STRING:
            return SchemaFields.TOKENIZED;
        case GEOMETRY:
            return SchemaFields.INDEXED;
        case XML:
            return SchemaFields.TEXT_PATH;
        default:
            break;
        }

        return "";
    }

    private void addToFieldsCache(Set<AttributeDescriptor> descriptors) {
        for (AttributeDescriptor ad : descriptors) {

            AttributeFormat format = ad.getType().getAttributeFormat();

            fieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format));

            if (!getSpecialIndexSuffix(format).equals("")) {
                fieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format) + getSpecialIndexSuffix(format));
            }

            if (format.equals(AttributeFormat.STRING)) {
                fieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format) + getSpecialIndexSuffix(format)
                        + SchemaFields.HAS_CASE);
                anyTextFieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format));
            }

            if (format.equals(AttributeFormat.XML)) {
                fieldsCache.add(ad.getName() + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED);
                fieldsCache.add(
                        ad.getName() + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED + SchemaFields.HAS_CASE);
                fieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format) + getSpecialIndexSuffix(format));
            }
        }
    }

    private byte[] serialize(MetacardType anywhereMType) throws MetacardCreationException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        try {
            ObjectOutputStream out = new ObjectOutputStream(baos);
            out.writeObject(anywhereMType);

            byte[] bytes = baos.toByteArray();

            baos.close();
            out.close();

            return bytes;
        } catch (IOException e) {
            LOGGER.warn("IO exception reading metacard type message", e);
            throw new MetacardCreationException(COULD_NOT_READ_METACARD_TYPE_MESSAGE);
        }

    }

    private String findAnyMatchingNumericalField(String propertyName) {

        if (fieldsCache.contains(propertyName + SchemaFields.DOUBLE_SUFFIX)) {
            return propertyName + SchemaFields.DOUBLE_SUFFIX;
        }
        if (fieldsCache.contains(propertyName + SchemaFields.FLOAT_SUFFIX)) {
            return propertyName + SchemaFields.FLOAT_SUFFIX;
        }
        if (fieldsCache.contains(propertyName + SchemaFields.INTEGER_SUFFIX)) {
            return propertyName + SchemaFields.INTEGER_SUFFIX;
        }
        if (fieldsCache.contains(propertyName + SchemaFields.LONG_SUFFIX)) {
            return propertyName + SchemaFields.LONG_SUFFIX;
        }
        if (fieldsCache.contains(propertyName + SchemaFields.SHORT_SUFFIX)) {
            return propertyName + SchemaFields.SHORT_SUFFIX;
        }

        LOGGER.debug("Did not find any numerical schema fields for property [{}]. Replacing with property [{}]",
                propertyName, propertyName + SchemaFields.INTEGER_SUFFIX);
        return propertyName + SchemaFields.INTEGER_SUFFIX;
    }

    /**
     * Given xml as a string, this method will parse out element text and CDATA text. It separates
     * each by one space character.
     *
     * @param xmlDatas List of XML as {@code String}
     * @return parsed CDATA and element text
     */
    protected List<String> parseTextFrom(List<Serializable> xmlDatas) {

        StringBuilder builder = new StringBuilder();
        List<String> parsedTexts = new ArrayList<>();
        XMLStreamReader xmlStreamReader = null;
        StringReader sr = null;
        long starttime = System.currentTimeMillis();

        try {
            for (Serializable xmlData : xmlDatas) {
                // xml parser does not handle leading whitespace
                sr = new StringReader(xmlData.toString());
                xmlStreamReader = XML_INPUT_FACTORY.createXMLStreamReader(sr);

                while (xmlStreamReader.hasNext()) {
                    int event = xmlStreamReader.next();

                    if (event == XMLStreamConstants.CHARACTERS || event == XMLStreamConstants.CDATA) {

                        String text = xmlStreamReader.getText();

                        if (StringUtils.isNotBlank(text)) {
                            builder.append(" ").append(text.trim());
                        }

                    }
                    if (event == XMLStreamConstants.START_ELEMENT) {
                        for (int i = 0; i < xmlStreamReader.getAttributeCount(); i++) {

                            String text = xmlStreamReader.getAttributeValue(i);

                            if (StringUtils.isNotBlank(text)) {
                                builder.append(" ").append(text.trim());
                            }
                        }
                    }
                }
                parsedTexts.add(builder.toString());
                builder.setLength(0);
            }
        } catch (XMLStreamException e1) {
            LOGGER.warn("Failure occurred in parsing the xml data. No data has been stored or indexed.", e1);
        } finally {
            IOUtils.closeQuietly(sr);
            if (xmlStreamReader != null) {
                try {
                    xmlStreamReader.close();
                } catch (XMLStreamException e) {
                    LOGGER.debug("Exception closing XMLStreamReader", e);
                }
            }
        }
        long endTime = System.currentTimeMillis();

        LOGGER.debug("Parsing took {} ms", endTime - starttime);

        return parsedTexts;
    }

    private Set<AttributeDescriptor> convertAttributeDescriptors(Set<AttributeDescriptor> attributeDescriptors) {
        Set<AttributeDescriptor> newAttributeDescriptors = new HashSet<>(attributeDescriptors.size());

        for (AttributeDescriptor attributeDescriptor : attributeDescriptors) {
            String name = attributeDescriptor.getName();
            boolean isIndexed = attributeDescriptor.isIndexed();
            boolean isStored = attributeDescriptor.isStored();
            boolean isTokenized = attributeDescriptor.isTokenized();
            boolean isMultiValued = attributeDescriptor.isMultiValued();
            AttributeType<?> attributeType = attributeDescriptor.getType();
            newAttributeDescriptors.add(new AttributeDescriptorImpl(name, isIndexed, isStored, isTokenized,
                    isMultiValued, attributeType));
        }

        return newAttributeDescriptors;
    }

    public String getSortKey(String field) {
        if (!(field.endsWith(SchemaFields.SORT_KEY_SUFFIX) || Result.DISTANCE.equals(field)
                || Result.RELEVANCE.equals(field) || Result.TEMPORAL.equals(field))) {
            field = field + SchemaFields.SORT_KEY_SUFFIX;
        }
        return field;
    }

    public Stream<String> anyTextFields() {
        return anyTextFieldsCache.stream();
    }
}