/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.hadoopImpl.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;

import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.ScannerBase;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.Pair;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * This class holds a batch scan configuration for a table. It contains all the properties needed
 * to specify how rows should be returned from the table.
 */
public class InputTableConfig implements Writable {

  // store iterators by name to prevent duplicates for addIterator
  private Map<String, IteratorSetting> iterators = Collections.emptyMap();
  private List<Range> ranges = Collections.emptyList();
  private Collection<IteratorSetting.Column> columns = Collections.emptyList();
  private Optional<Authorizations> scanAuths = Optional.empty();
  private Optional<String> context = Optional.empty();

  private boolean autoAdjustRanges = true;
  private boolean useLocalIterators = false;
  private boolean useIsolatedScanners = false;
  private boolean offlineScan = false;
  private boolean batchScan = false;
  private SamplerConfiguration samplerConfig = null;
  private Map<String, String> executionHints = Collections.emptyMap();

  public InputTableConfig() {}

  /**
   * Creates a batch scan config object out of a previously serialized batch scan config object.
   *
   * @param input
   *          the data input of the serialized batch scan config
   */
  public InputTableConfig(DataInput input) throws IOException {
    readFields(input);
  }

  /**
   * Sets the input ranges to scan for all tables associated with this job. This will be added to
   * any per-table ranges that have already been set.
   *
   * @param ranges
   *          the ranges that will be mapped over
   * @since 1.6.0
   */
  public InputTableConfig setRanges(List<Range> ranges) {
    this.ranges = ranges;
    return this;
  }

  /**
   * Returns the ranges to be queried in the configuration
   */
  public List<Range> getRanges() {
    return ranges;
  }
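  // Illustrative sketch (not part of the original class): populating the range
  // list above. The row keys are hypothetical placeholders.
  //
  //   InputTableConfig config = new InputTableConfig();
  //   List<Range> ranges = new ArrayList<>();
  //   ranges.add(new Range("row_000", "row_499")); // bounded range
  //   ranges.add(new Range("row_900", null)); // from row_900 to the end of the table
  //   config.setRanges(ranges);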
  /**
   * Restricts the columns that will be mapped over for this job for the default input table.
   *
   * @param columns
   *          a pair of {@link Text} objects corresponding to column family and column qualifier.
   *          If the column qualifier is null, the entire column family is selected. An empty set
   *          is the default and is equivalent to scanning all columns.
   * @since 1.6.0
   */
  public InputTableConfig fetchColumns(Collection<IteratorSetting.Column> columns) {
    this.columns = columns;
    return this;
  }

  /**
   * Returns the columns to be fetched for this configuration
   */
  public Collection<IteratorSetting.Column> getFetchedColumns() {
    return columns != null ? columns : new HashSet<>();
  }

  public void addIterator(IteratorSetting cfg) {
    // lazily replace the immutable default map on the first add
    if (this.iterators.isEmpty())
      this.iterators = new LinkedHashMap<>();
    this.iterators.put(cfg.getName(), cfg);
  }

  /**
   * Returns the iterators to be set on this configuration
   */
  public List<IteratorSetting> getIterators() {
    return new LinkedList<>(iterators.values());
  }
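  // Illustrative sketch: restricting the scan to two columns and attaching a
  // server-side iterator. The family/qualifier names and the iterator priority
  // and name are hypothetical.
  //
  //   config.fetchColumns(Arrays.asList(
  //       new IteratorSetting.Column(new Text("attributes"), new Text("size")),
  //       new IteratorSetting.Column(new Text("metadata"), null))); // whole family
  //   config.addIterator(new IteratorSetting(20, "vers",
  //       "org.apache.accumulo.core.iterators.user.VersioningIterator"));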
  /**
   * Controls the automatic adjustment of ranges for this job. This feature merges overlapping
   * ranges, then splits them to align with tablet boundaries. Disabling this feature will cause
   * exactly one Map task to be created for each specified range.
   *
   * <p>
   * By default, this feature is <b>enabled</b>.
   *
   * @param autoAdjustRanges
   *          the feature is enabled if true, disabled otherwise
   * @see #setRanges(java.util.List)
   * @since 1.6.0
   */
  public InputTableConfig setAutoAdjustRanges(boolean autoAdjustRanges) {
    this.autoAdjustRanges = autoAdjustRanges;
    return this;
  }

  /**
   * Determines whether a configuration has auto-adjust ranges enabled.
   *
   * @return false if the feature is disabled, true otherwise
   * @since 1.6.0
   * @see #setAutoAdjustRanges(boolean)
   */
  public boolean shouldAutoAdjustRanges() {
    return autoAdjustRanges;
  }

  /**
   * Controls the use of the {@link org.apache.accumulo.core.client.ClientSideIteratorScanner} in
   * this job. Enabling this feature will cause the iterator stack to be constructed within the Map
   * task, rather than within the Accumulo TServer. To use this feature, all classes needed for
   * those iterators must be available on the classpath for the task.
   *
   * <p>
   * By default, this feature is <b>disabled</b>.
   *
   * @param useLocalIterators
   *          the feature is enabled if true, disabled otherwise
   * @since 1.6.0
   */
  public InputTableConfig setUseLocalIterators(boolean useLocalIterators) {
    this.useLocalIterators = useLocalIterators;
    return this;
  }

  /**
   * Determines whether a configuration uses local iterators.
   *
   * @return true if the feature is enabled, false otherwise
   * @since 1.6.0
   * @see #setUseLocalIterators(boolean)
   */
  public boolean shouldUseLocalIterators() {
    return useLocalIterators;
  }

  /**
   * Enable reading offline tables. By default, this feature is disabled and only online tables are
   * scanned. This will make the map reduce job directly read the table's files. If the table is
   * not offline, then the job will fail. If the table comes online during the map reduce job, it
   * is likely that the job will fail.
   *
   * <p>
   * To use this option, the map reduce user will need access to read the Accumulo directory in
   * HDFS.
   *
   * <p>
   * Reading the offline table will create the scan time iterator stack in the map process. So any
   * iterators that are configured for the table will need to be on the mapper's classpath. The
   * accumulo.properties may need to be on the mapper's classpath if HDFS or the Accumulo directory
   * in HDFS are non-standard.
   *
   * <p>
   * One way to use this feature is to clone a table, take the clone offline, and use the clone as
   * the input table for a map reduce job. If you plan to map reduce over the data many times, it
   * may be better to compact the table, clone it, take it offline, and use the clone for all map
   * reduce jobs. The reason to do this is that compaction will reduce each tablet in the table to
   * one file, and it is faster to read from one file.
   *
   * <p>
   * There are two possible advantages to reading a table's files directly out of HDFS. First, you
   * may see better read performance. Second, it will support speculative execution better. When
   * reading an online table, speculative execution can put more load on an already slow tablet
   * server.
   *
   * <p>
   * By default, this feature is <b>disabled</b>.
   *
   * @param offlineScan
   *          the feature is enabled if true, disabled otherwise
   */
  public InputTableConfig setOfflineScan(boolean offlineScan) {
    this.offlineScan = offlineScan;
    return this;
  }

  /**
   * Determines whether a configuration has the offline table scan feature enabled.
   *
   * @return true if the feature is enabled, false otherwise
   * @see #setOfflineScan(boolean)
   */
  public boolean isOfflineScan() {
    return offlineScan;
  }

  /**
   * Controls the use of the {@link org.apache.accumulo.core.client.IsolatedScanner} in this job.
   *
   * <p>
   * By default, this feature is <b>disabled</b>.
   *
   * @param useIsolatedScanners
   *          the feature is enabled if true, disabled otherwise
   */
  public InputTableConfig setUseIsolatedScanners(boolean useIsolatedScanners) {
    this.useIsolatedScanners = useIsolatedScanners;
    return this;
  }

  /**
   * Determines whether a configuration has isolation enabled.
   *
   * @return true if the feature is enabled, false otherwise
   * @see #setUseIsolatedScanners(boolean)
   */
  public boolean shouldUseIsolatedScanners() {
    return useIsolatedScanners;
  }

  public void setUseBatchScan(boolean value) {
    this.batchScan = value;
  }

  public boolean shouldBatchScan() {
    return batchScan;
  }

  /**
   * Set the sampler configuration to use when reading from the data.
   */
  public void setSamplerConfiguration(SamplerConfiguration samplerConfiguration) {
    this.samplerConfig = samplerConfiguration;
  }

  public SamplerConfiguration getSamplerConfiguration() {
    return samplerConfig;
  }
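  // Illustrative sketch of the clone-then-offline workflow described in the
  // setOfflineScan javadoc. Assumes an already-connected AccumuloClient named
  // "client" and hypothetical table names; the admin calls are from
  // org.apache.accumulo.core.client.admin.TableOperations.
  //
  //   client.tableOperations().compact("mytable", new CompactionConfig().setWait(true));
  //   client.tableOperations().clone("mytable", "mytable_clone", true,
  //       Collections.emptyMap(), Collections.emptySet());
  //   client.tableOperations().offline("mytable_clone", true);
  //   config.setOfflineScan(true); // the job now reads the clone's files from HDFS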
  /**
   * The execution hints to set on created scanners. See
   * {@link ScannerBase#setExecutionHints(Map)}
   */
  public void setExecutionHints(Map<String, String> executionHints) {
    this.executionHints = executionHints;
  }

  public Map<String, String> getExecutionHints() {
    return executionHints;
  }

  public Optional<Authorizations> getScanAuths() {
    return scanAuths;
  }

  public InputTableConfig setScanAuths(Authorizations scanAuths) {
    this.scanAuths = Optional.of(scanAuths);
    return this;
  }

  public Optional<String> getContext() {
    return context;
  }

  public void setContext(String context) {
    this.context = Optional.of(context);
  }

  @Override
  public void write(DataOutput dataOutput) throws IOException {
    // each variable-length section is written as a count followed by its elements
    if (iterators != null) {
      dataOutput.writeInt(iterators.size());
      for (IteratorSetting setting : getIterators())
        setting.write(dataOutput);
    } else {
      dataOutput.writeInt(0);
    }
    if (ranges != null) {
      dataOutput.writeInt(ranges.size());
      for (Range range : ranges)
        range.write(dataOutput);
    } else {
      dataOutput.writeInt(0);
    }
    if (columns != null) {
      dataOutput.writeInt(columns.size());
      for (Pair<Text, Text> column : columns) {
        // a column is written as its Text field count (1 or 2) followed by the fields
        if (column.getSecond() == null) {
          dataOutput.writeInt(1);
          column.getFirst().write(dataOutput);
        } else {
          dataOutput.writeInt(2);
          column.getFirst().write(dataOutput);
          column.getSecond().write(dataOutput);
        }
      }
    } else {
      dataOutput.writeInt(0);
    }
    dataOutput.writeBoolean(autoAdjustRanges);
    dataOutput.writeBoolean(useLocalIterators);
    dataOutput.writeBoolean(useIsolatedScanners);
    dataOutput.writeBoolean(offlineScan);
    if (samplerConfig == null) {
      dataOutput.writeBoolean(false);
    } else {
      dataOutput.writeBoolean(true);
      new SamplerConfigurationImpl(samplerConfig).write(dataOutput);
    }
    if (executionHints.isEmpty()) {
      dataOutput.writeInt(0);
    } else {
      dataOutput.writeInt(executionHints.size());
      for (Entry<String, String> entry : executionHints.entrySet()) {
        dataOutput.writeUTF(entry.getKey());
        dataOutput.writeUTF(entry.getValue());
      }
    }
  }

  @Override
  public void readFields(DataInput dataInput) throws IOException {
    // load iterators
    int iterSize = dataInput.readInt();
    if (iterSize > 0)
      iterators = new LinkedHashMap<>();
    for (int i = 0; i < iterSize; i++) {
      IteratorSetting newIter = new IteratorSetting(dataInput);
      iterators.put(newIter.getName(), newIter);
    }
    // load ranges
    int rangeSize = dataInput.readInt();
    if (rangeSize > 0)
      ranges = new ArrayList<>();
    for (int i = 0; i < rangeSize; i++) {
      Range range = new Range();
      range.readFields(dataInput);
      ranges.add(range);
    }
    // load columns
    int columnSize = dataInput.readInt();
    if (columnSize > 0)
      columns = new HashSet<>();
    for (int i = 0; i < columnSize; i++) {
      int numPairs = dataInput.readInt();
      Text colFam = new Text();
      colFam.readFields(dataInput);
      if (numPairs == 1) {
        columns.add(new IteratorSetting.Column(colFam, null));
      } else if (numPairs == 2) {
        Text colQual = new Text();
        colQual.readFields(dataInput);
        columns.add(new IteratorSetting.Column(colFam, colQual));
      }
    }
    autoAdjustRanges = dataInput.readBoolean();
    useLocalIterators = dataInput.readBoolean();
    useIsolatedScanners = dataInput.readBoolean();
    offlineScan = dataInput.readBoolean();
    if (dataInput.readBoolean()) {
      samplerConfig = new SamplerConfigurationImpl(dataInput).toSamplerConfiguration();
    }
    executionHints = new HashMap<>();
    int numHints = dataInput.readInt();
    for (int i = 0; i < numHints; i++) {
      String k = dataInput.readUTF();
      String v = dataInput.readUTF();
      executionHints.put(k, v);
    }
  }
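  // Illustrative round trip for the Writable methods above: serialize a config
  // to bytes, then rebuild it with the DataInput constructor. Uses only
  // java.io stream classes.
  //
  //   ByteArrayOutputStream baos = new ByteArrayOutputStream();
  //   config.write(new DataOutputStream(baos));
  //   InputTableConfig copy = new InputTableConfig(
  //       new DataInputStream(new ByteArrayInputStream(baos.toByteArray())));
  //   assert copy.equals(config);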
  @Override
  public boolean equals(Object o) {
    if (this == o)
      return true;
    if (o == null || getClass() != o.getClass())
      return false;
    InputTableConfig that = (InputTableConfig) o;
    if (autoAdjustRanges != that.autoAdjustRanges)
      return false;
    if (offlineScan != that.offlineScan)
      return false;
    if (useIsolatedScanners != that.useIsolatedScanners)
      return false;
    if (useLocalIterators != that.useLocalIterators)
      return false;
    if (!Objects.equals(columns, that.columns))
      return false;
    if (!Objects.equals(iterators, that.iterators))
      return false;
    if (!Objects.equals(ranges, that.ranges))
      return false;
    if (!Objects.equals(executionHints, that.executionHints))
      return false;
    return Objects.equals(samplerConfig, that.samplerConfig);
  }

  @Override
  public int hashCode() {
    int result = 31 * (iterators != null ? iterators.hashCode() : 0);
    result = 31 * result + (ranges != null ? ranges.hashCode() : 0);
    result = 31 * result + (columns != null ? columns.hashCode() : 0);
    result = 31 * result + (autoAdjustRanges ? 1 : 0);
    result = 31 * result + (useLocalIterators ? 1 : 0);
    result = 31 * result + (useIsolatedScanners ? 1 : 0);
    result = 31 * result + (offlineScan ? 1 : 0);
    result = 31 * result + (samplerConfig == null ? 0 : samplerConfig.hashCode());
    result = 31 * result + (executionHints == null ? 0 : executionHints.hashCode());
    return result;
  }
}
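// Illustrative end-to-end sketch (hypothetical names throughout): building a
// complete per-table configuration. The "scan_type" hint key is only meaningful
// if the servers are configured with a scan executor dispatcher that inspects it.
//
//   InputTableConfig config = new InputTableConfig()
//       .setRanges(Collections.singletonList(new Range("a", "m")))
//       .setAutoAdjustRanges(true)
//       .setScanAuths(new Authorizations("public"));
//   config.setExecutionHints(Collections.singletonMap("scan_type", "mapreduce"));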