org.apache.hadoop.io.MapFileConcurrentReader.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.io.MapFileConcurrentReader.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io;

import java.util.ArrayList;
import java.util.Arrays;
import java.io.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.Options;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;

/** A file-based map from keys to values, readable from several threads.
 *
 * <p>A map is a directory containing two files, the <code>data</code> file,
 * containing all keys and values in the map, and a smaller <code>index</code>
 * file, containing a fraction of the keys.  The fraction is determined by
 * {@link MapFile.Writer#getIndexInterval()}.
 *
 * <p>The index file is read entirely into memory.  Thus key implementations
 * should try to keep themselves small.
 *
 * <p>Map files are created by adding entries in-order.  To maintain a large
 * database, perform updates by copying the previous version of a database and
 * merging in a sorted change list, to create a new version of the database in
 * a new file.  Sorting large change lists can be done with {@link
 * SequenceFile.Sorter}.
 *
 * <p>Concurrency: every calling thread reads the data file through its own
 * {@link SequenceFile.Reader}, handed out by a {@link ThreadLocal}.  The
 * in-memory index is shared; it is loaded lazily, by a single thread, on the
 * first call that needs it.  {@link #close()} closes every reader that was
 * handed out.
 */
public class MapFileConcurrentReader {
    private static final Log LOG = LogFactory.getLog(MapFileConcurrentReader.class);

    /** Number of index entries to skip between each entry.  Zero by default.
     * Setting this to values larger than zero can facilitate opening large map
     * files using less memory.  Read from <code>io.map.index.skip</code>. */
    private final int indexSkip;

    private WritableComparator comparator;

    // the data, on disk: one reader per calling thread
    private ThreadLocal<SequenceFile.Reader> data;
    // every reader created by the ThreadLocal, remembered so close() can release
    // them all; all access is guarded by synchronized (allDataFiles)
    private final ArrayList<SequenceFile.Reader> allDataFiles = new ArrayList<SequenceFile.Reader>();
    private SequenceFile.Reader index;
    long firstPosition = -1;

    // whether the index Reader was closed; guarded by the instance lock
    private boolean indexClosed = false;

    // the index, in memory.  count and positions are written before the
    // volatile keys reference, so a thread that observes keys != null also
    // observes the matching count and positions.
    private int count = -1;
    private volatile WritableComparable[] keys;
    private long[] positions;

    /**
     * Opens the map stored in <code>dir</code>.
     *
     * @param dir  directory holding the data and index files
     * @param conf configuration; <code>io.map.index.skip</code> is read from it
     * @param opts reader options; a {@link MapFile.Reader.ComparatorOption},
     *             if present, supplies the key comparator
     * @throws IOException if the index file cannot be opened
     * @throws RuntimeException wrapping the {@link IOException} raised while
     *             opening the data file
     */
    public MapFileConcurrentReader(Path dir, Configuration conf, SequenceFile.Reader.Option... opts)
            throws IOException {
        MapFile.Reader.ComparatorOption comparatorOption = Options.getOption(MapFile.Reader.ComparatorOption.class,
                opts);
        WritableComparator requested = comparatorOption == null ? null : comparatorOption.getValue();
        this.indexSkip = conf.getInt("io.map.index.skip", 0);
        open(dir, requested, conf, opts);
    }

    /**
     * Opens the data file for the calling thread, records its first position,
     * selects the comparator and opens the index file.
     *
     * @param dir        directory holding the data and index files
     * @param comparator comparator to use, or <code>null</code> to derive one
     *                   from the key class of the data file
     * @param conf       configuration passed to the readers
     * @param options    options passed to the readers
     * @throws IOException if the index file cannot be opened
     */
    protected synchronized void open(Path dir, WritableComparator comparator, final Configuration conf,
            final SequenceFile.Reader.Option... options) throws IOException {
        final Path dataFile = new Path(dir, MapFile.DATA_FILE_NAME);
        final Path indexFile = new Path(dir, MapFile.INDEX_FILE_NAME);

        // open the data: each thread lazily gets its own reader
        this.data = new ThreadLocal<SequenceFile.Reader>() {
            @Override
            protected SequenceFile.Reader initialValue() {
                try {
                    SequenceFile.Reader r = createDataFileReader(dataFile, conf, options);
                    LOG.info("opened new SequenceFile.Reader for " + dataFile);
                    // Lock the list itself: "this" here would be the ThreadLocal,
                    // which is not the monitor close() holds while iterating.
                    synchronized (allDataFiles) {
                        allDataFiles.add(r);
                    }
                    return r;
                } catch (IOException ioe) {
                    // initialValue() cannot declare checked exceptions
                    throw new RuntimeException(ioe);
                }
            }
        };
        SequenceFile.Reader firstReader = data.get();
        this.firstPosition = firstReader.getPosition();

        if (comparator == null) {
            this.comparator = WritableComparator.get(firstReader.getKeyClass().asSubclass(WritableComparable.class));
        } else {
            this.comparator = comparator;
        }

        // open the index
        SequenceFile.Reader.Option[] indexOptions = Options.prependOptions(options,
                SequenceFile.Reader.file(indexFile));
        try {
            this.index = new SequenceFile.Reader(conf, indexOptions);
        } catch (IOException e) {
            // do not leak the data reader that was opened above
            IOException closeFailure = closeDataFiles();
            if (closeFailure != null) {
                LOG.warn("Failed to close data reader for " + dataFile, closeFailure);
            }
            throw e;
        }
    }

    /**
     * Override this method to specialize the type of
     * {@link SequenceFile.Reader} returned.
     */
    protected SequenceFile.Reader createDataFileReader(Path dataFile, Configuration conf,
            SequenceFile.Reader.Option... options) throws IOException {
        SequenceFile.Reader.Option[] newOptions = Options.prependOptions(options,
                SequenceFile.Reader.file(dataFile));
        return new SequenceFile.Reader(conf, newOptions);
    }

    /**
     * Reads the index entirely into memory, once.  The index reader is closed
     * afterwards, whether or not reading succeeded.
     *
     * <p>The index is built in local variables under the instance lock and only
     * then published, so concurrent callers never observe a half-built index.
     * If the index file ends unexpectedly, the entries read so far are kept.
     *
     * @throws IOException if the index cannot be read or its keys are out of
     *                     order for the comparator
     */
    private void readIndex() throws IOException {
        if (this.keys != null) { // fast path: already published
            return;
        }
        synchronized (this) {
            if (this.keys != null) { // another thread loaded it while we waited
                return;
            }

            int entries = 0;
            long[] offsets = new long[1024];
            ArrayList<WritableComparable> keyBuilder = new ArrayList<WritableComparable>(1024);

            try {
                int skip = indexSkip;
                LongWritable position = new LongWritable();
                WritableComparable lastKey = null;
                long lastIndex = -1;
                while (true) {
                    WritableComparable k = comparator.newKey();

                    if (!index.next(k, position)) {
                        break;
                    }

                    // check order to make sure comparator is compatible
                    if (lastKey != null && comparator.compare(lastKey, k) > 0) {
                        throw new IOException("key out of order: " + k + " after " + lastKey);
                    }
                    lastKey = k;
                    if (skip > 0) {
                        skip--;
                        continue; // skip this entry
                    } else {
                        skip = indexSkip; // reset skip
                    }

                    // don't read an index that is the same as the previous one. Block
                    // compressed map files used to do this (multiple entries would point
                    // at the same block)
                    if (position.get() == lastIndex) {
                        continue;
                    }

                    if (entries == offsets.length) {
                        offsets = Arrays.copyOf(offsets, offsets.length * 2);
                    }

                    keyBuilder.add(k);
                    offsets[entries] = position.get();
                    lastIndex = position.get(); // remember it so the duplicate check above can fire
                    entries++;
                }
            } catch (EOFException e) {
                // keep what was read before the truncation point
                LOG.warn("Unexpected EOF reading " + index + " at entry #" + entries + ".  Ignoring.");
            } finally {
                indexClosed = true;
                index.close();
            }

            this.positions = Arrays.copyOf(offsets, entries);
            this.count = entries;
            // volatile write last: publishes positions and count with it
            this.keys = keyBuilder.toArray(new WritableComparable[entries]);
        }
    }

    /** Re-positions the calling thread's reader before its first key. */
    public synchronized void reset() throws IOException {
        data.get().seek(firstPosition);
    }

    /** Get the key at approximately the middle of the file. Or null if the
     *  file is empty.
     */
    public synchronized WritableComparable midKey() throws IOException {
        readIndex();
        if (count == 0) {
            return null;
        }

        return keys[(count - 1) / 2];
    }

    /** Reads the final key from the file, using the calling thread's reader.
     *
     * @param key key to read into
     */
    public synchronized void finalKey(WritableComparable key) throws IOException {
        readIndex(); // make sure index is valid
        SequenceFile.Reader reader = data.get();
        if (count > 0) {
            reader.seek(positions[count - 1]); // skip to last indexed entry
        } else {
            reset(); // start at the beginning
        }
        while (reader.next(key)) {
            // scan to eof; key is overwritten by each entry read
        }
    }

    /**
     * Looks for <code>key</code> in the data file, starting from the closest
     * preceding index entry.
     *
     * @param key key to look for
     * @return the position reported by the calling thread's reader right after
     *         it read the matching key, or -1 if the key is not present
     */
    private long findPosition(WritableComparable key) throws IOException {
        readIndex(); // make sure index is read

        int seekIndex = binarySearch(key);
        if (seekIndex < 0) { // decode insertion point
            seekIndex = -seekIndex - 2;
        }

        long seekPosition;
        if (seekIndex == -1) { // belongs before first entry
            seekPosition = firstPosition; // use beginning of file
        } else {
            seekPosition = positions[seekIndex]; // else use index
        }

        SequenceFile.Reader reader = data.get();
        reader.seek(seekPosition);

        WritableComparable nextKey = comparator.newKey();

        while (reader.next(nextKey)) {
            int c = comparator.compare(key, nextKey);
            if (c < 0) { // scanned past where the key would be
                return -1;
            } else if (c == 0) {
                return reader.getPosition();
            }
        }

        return -1;
    }

    /**
     * Binary search over the in-memory index keys.
     *
     * @return the index of <code>key</code> if it is an index key, otherwise
     *         <code>-(insertion point) - 1</code>
     */
    private int binarySearch(WritableComparable key) {
        int low = 0;
        int high = count - 1;

        while (low <= high) {
            int mid = (low + high) >>> 1;
            WritableComparable midVal = keys[mid];
            int cmp = comparator.compare(midVal, key);

            if (cmp < 0) {
                low = mid + 1;
            } else if (cmp > 0) {
                high = mid - 1;
            } else {
                return mid; // key found
            }
        }
        return -(low + 1); // key not found.
    }

    /** Return the value for the named key, or null if none exists.
     *
     * @param key key to look up
     * @param val object the value is read into
     * @return <code>val</code> if the key was found, otherwise null
     */
    public Writable get(WritableComparable key, Writable val) throws IOException {
        long position = findPosition(key);
        if (position >= 0) {
            SequenceFile.Reader threadLocalData = data.get();
            // NOTE(review): this seeks to the position this same reader just
            // reported in findPosition(), so it looks redundant; confirm its
            // effect on block-compressed files before keeping or removing it.
            threadLocalData.seek(position);
            threadLocalData.getCurrentValue(val);
            return val;
        } else {
            return null;
        }
    }

    /** Close the map: the index reader, if still open, and every data reader
     * that was handed out.  All readers are attempted even if one fails; the
     * first failure is rethrown once the others have been attempted.
     */
    public synchronized void close() throws IOException {
        IOException firstFailure = null;
        if (!indexClosed && index != null) {
            indexClosed = true;
            try {
                index.close();
            } catch (IOException e) {
                firstFailure = e;
            }
        }

        IOException dataFailure = closeDataFiles();
        if (firstFailure == null) {
            firstFailure = dataFailure;
        } else if (dataFailure != null) {
            LOG.warn("Failed to close data reader", dataFailure);
        }

        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Closes and forgets every data reader created so far.
     *
     * @return the first failure encountered, or null if all readers closed
     *         cleanly; later failures are logged
     */
    private IOException closeDataFiles() {
        IOException firstFailure = null;
        synchronized (allDataFiles) {
            for (SequenceFile.Reader dataFile : allDataFiles) {
                try {
                    dataFile.close();
                } catch (IOException e) {
                    if (firstFailure == null) {
                        firstFailure = e;
                    } else {
                        LOG.warn("Failed to close data reader " + dataFile, e);
                    }
                }
            }
            allDataFiles.clear();
        }
        return firstFailure;
    }
}