dk.statsbiblioteket.netark.dvenabler.wrapper.SortedDocValuesWrapper.java Source code

Introduction

Here is the source code for dk.statsbiblioteket.netark.dvenabler.wrapper.SortedDocValuesWrapper.java
Source

/*
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */
package dk.statsbiblioteket.netark.dvenabler.wrapper;

import dk.statsbiblioteket.netark.dvenabler.DVConfig;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.*;

/**
 * Exposes the stored values of a single field as {@link SortedDocValues}.
 * <p>
 * Memory-intensive transformer: on construction every document in the reader is visited and all
 * unique non-null stored values of the field are kept on the heap as {@link BytesRef}s in a
 * sorted list. The position of a value in that list is its ordinal.
 */
public class SortedDocValuesWrapper extends SortedDocValues {
    private static final Log log = LogFactory.getLog(SortedDocValuesWrapper.class);

    private final AtomicReader reader;
    private final DVConfig field;
    /** Contains the name of {@link #field} and nothing else; used to limit stored field loading. */
    private final Set<String> fieldNames;
    // TODO: Store this in a BytesRefArray instead. This requires custom binary search
    /** The unique values of the field in ascending {@link BytesRef} order; the index is the ordinal. */
    private final List<BytesRef> values;
    private final ProgressTracker tracker;

    /**
     * Collects the unique stored values for the given field from all documents in the reader.
     *
     * @param reader the source of the stored field values.
     * @param field  the configuration for the field to expose; only its name is used for lookups.
     * @throws IOException if a document could not be retrieved from the reader.
     */
    public SortedDocValuesWrapper(AtomicReader reader, DVConfig field) throws IOException {
        this.reader = reader;
        this.field = field;
        fieldNames = Collections.singleton(field.getName());
        log.info("Creating map for SortedDocValues for field '" + field + "'");
        final long startTime = System.nanoTime();
        values = fill();
        tracker = new ProgressTracker(field.getName(), log, reader.maxDoc());
        // nanoseconds -> milliseconds. The previous extra division by 1000 logged seconds as "ms"
        final long elapsedMs = (System.nanoTime() - startTime) / 1000000;
        log.info("Finished creating SortedDocValues with " + values.size() + " unique values for " + reader.maxDoc()
                + " docs for field '" + field + "' in " + elapsedMs
                + "ms");
    }

    /**
     * Visits all docIDs from 0 to {@code reader.maxDoc()-1} and collects the unique values of the field.
     *
     * @return the unique values in ascending {@link BytesRef} order.
     * @throws IOException if a document could not be retrieved from the reader.
     */
    private List<BytesRef> fill() throws IOException {
        // The TreeSet both removes duplicates and provides the ordering needed for binary search
        final SortedSet<BytesRef> unique = new TreeSet<>();
        final int maxDoc = reader.maxDoc();
        for (int docID = 0; docID < maxDoc; docID++) {
            final String value = storedValue(docID);
            if (value != null) {
                unique.add(new BytesRef(value));
            }
        }
        return new ArrayList<>(unique);
    }

    /**
     * Retrieves the stored value of the field for a single document.
     *
     * @param docID the document to get the value for.
     * @return the value returned by the document for the field name, or null if there is none.
     * @throws IOException if the document could not be retrieved from the reader.
     */
    private String storedValue(int docID) throws IOException {
        return reader.document(docID, fieldNames).get(field.getName());
    }

    /**
     * Resolves the ordinal for a document by loading its stored value and binary searching the
     * sorted list of unique values.
     *
     * @param docID the document to resolve the ordinal for.
     * @return the ordinal of the value for the document or -1 if the document has no value.
     * @throws IllegalStateException if the document has a value that is not in the list of unique values.
     * @throws RuntimeException      if the document could not be retrieved from the reader.
     */
    @Override
    public int getOrd(int docID) {
        tracker.ping(docID);

        final String value;
        try {
            value = storedValue(docID);
        } catch (IOException e) {
            // The overridden method does not declare IOException, so it is wrapped
            throw new RuntimeException("Unable to lookup docID=" + docID + ", field=" + field, e);
        }
        if (value == null) {
            return -1;
        }
        final int ord = Collections.binarySearch(values, new BytesRef(value));
        if (ord < 0) {
            throw new IllegalStateException("The ord for value '" + value + "' for docID " + docID
                    + " in field '" + field + "' could not be located but should always be present");
        }
        return ord;
    }

    /**
     * Copies the bytes of the value with the given ordinal into the result.
     *
     * @param ord    an index into the list of unique values.
     * @param result the destination for the bytes of the value.
     */
    @Override
    public void lookupOrd(int ord, BytesRef result) {
        result.copyBytes(values.get(ord));
    }

    /**
     * @return the number of unique values collected for the field.
     */
    @Override
    public int getValueCount() {
        return values.size();
    }
}