Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package joshelser; import java.io.IOException; import java.util.Collection; import java.util.Iterator; import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; import java.util.concurrent.TimeUnit; import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.data.ByteSequence; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.PartialKey; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.IteratorEnvironment; import org.apache.accumulo.core.iterators.LongCombiner; import org.apache.accumulo.core.iterators.LongCombiner.Type; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; import org.apache.accumulo.core.iterators.TypedValueCombiner.Encoder; import org.apache.accumulo.core.iterators.WrappingIterator; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; import com.google.common.base.Preconditions; import com.google.common.base.Stopwatch; /** * */ public class LimitAndSumColumnFamilyIterator extends WrappingIterator { private static final Logger log = Logger.getLogger(LimitAndSumColumnFamilyIterator.class); public static final String COLUMNS = "limit.columns"; public static final String TYPE = "limit.type"; public static void setColumns(IteratorSetting cfg, Collection<String> columns) { Preconditions.checkNotNull(columns); final StringBuilder sb = new StringBuilder(64); for (String column : columns) { if (0 != sb.length()) { sb.append(','); } sb.append(column); } cfg.addOption(COLUMNS, sb.toString()); } private SortedSet<Text> getColumns(Map<String, String> options) { String serializedColumns = options.get(COLUMNS); if (null == serializedColumns) { throw new IllegalArgumentException("Did not find " + COLUMNS + " in the options map"); } String[] columns = StringUtils.split(serializedColumns, ','); TreeSet<Text> sortedColumns = new TreeSet<Text>(); for (String column : columns) { sortedColumns.add(new Text(column)); } return sortedColumns; } private void setEncoder(Map<String, String> options) { String type = options.get(TYPE); if (type == null) throw new IllegalArgumentException("no type specified"); switch (Type.valueOf(type)) { case VARLEN: setEncoder(LongCombiner.VAR_LEN_ENCODER); return; case FIXEDLEN: setEncoder(LongCombiner.FIXED_LEN_ENCODER); return; case STRING: setEncoder(LongCombiner.STRING_ENCODER); return; default: throw new IllegalArgumentException(); } } /** * Sets the Encoder<V> used to translate Values to V and back. * * @param encoder */ protected void setEncoder(Encoder<Long> encoder) { this.encoder = encoder; } /** * A convenience method for setting the long encoding type. * * @param is * IteratorSetting object to configure. * @param type * LongCombiner.Type specifying the encoding type. */ public static void setEncodingType(IteratorSetting is, LongCombiner.Type type) { is.addOption(TYPE, type.toString()); } private Encoder<Long> encoder = null; private SortedSet<Text> desiredColumns; private Long sum; private Text rowHolder, colfamHolder; private Key topKey; private Value topValue; // Internal state used during seek() or next() private Range currentRange; private Collection<ByteSequence> currentColumnFamilies; private boolean currentColumnFamiliesInclusive; public LimitAndSumColumnFamilyIterator() { this.rowHolder = new Text(); this.colfamHolder = new Text(); this.sum = 0l; } @Override public SortedKeyValueIterator<Key, Value> deepCopy(IteratorEnvironment env) { LimitAndSumColumnFamilyIterator skvi = new LimitAndSumColumnFamilyIterator(); skvi.desiredColumns = this.desiredColumns; skvi.encoder = this.encoder; // These are likely to not be used (reseeked immediately after deepCopy), but it's prudent to copy them anyways skvi.topKey = this.topKey; skvi.sum = this.sum; skvi.topValue = this.topValue; return skvi; } @Override public void init(SortedKeyValueIterator<Key, Value> source, Map<String, String> options, IteratorEnvironment env) throws IOException { super.init(source, options, env); desiredColumns = getColumns(options); setEncoder(options); } @Override public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException { Stopwatch seekSw = Stopwatch.createStarted(); // Make sure we invalidate our last record nextRecordNotFound(); log.debug("Seeking to " + range); getSource().seek(range, columnFamilies, inclusive); currentRange = range; currentColumnFamilies = columnFamilies; currentColumnFamiliesInclusive = inclusive; aggregate(); seekSw.stop(); log.info("Seek duration: " + seekSw.elapsed(TimeUnit.MILLISECONDS)); } @Override public void next() throws IOException { Stopwatch nextSw = Stopwatch.createStarted(); // Make sure we invalidate our last record nextRecordNotFound(); // Catch the case where we have no data and there's nothing to next() if (!getSource().hasTop()) { return; } getSource().next(); aggregate(); nextSw.stop(); log.info("Next duration: " + nextSw.elapsed(TimeUnit.MILLISECONDS)); } @Override public Key getTopKey() { return topKey; } @Override public Value getTopValue() { return topValue; } @Override public boolean hasTop() { // topKey and sum should be held in step -- they are either both null, or both non-null if ((null == topKey && null != sum) || (null != topKey && null == sum)) { throw new IllegalStateException("topKey and sum in differing states: " + topKey + " => " + sum); } // thus, we can just look at one to determine if there's a record to return. return topKey != null; } /** * Given the current position in the {@link source}, filter to only the columns specified. Sets topKey and topValue to non-null on success * * @throws IOException */ protected void aggregate() throws IOException { Stopwatch aggrSw = Stopwatch.createStarted(); // Explicitly track the "startKey" bound on the Range of data we want to observe // Would be desirable to just use a Range for this, but you incur a penalty for this // due to the repeated creation/deletion of the Range object. I believe this is due to the additional // call to Key#compareTo(Key) that a call to Range#contains(Key) would perform when we really only care about // one of those comparisons Key currentStartKey = currentRange.getStartKey(); if (!getSource().hasTop()) { nextRecordNotFound(); aggrSw.stop(); log.trace("Aggregate duration: " + aggrSw.elapsed(TimeUnit.MILLISECONDS)); return; } while (getSource().hasTop()) { final Key currentKey = getSource().getTopKey(); final Value currentValue = getSource().getTopValue(); if (currentRange.afterEndKey(currentKey)) { break; } currentKey.getColumnFamily(colfamHolder); final Iterator<Text> remainingColumns = desiredColumns.tailSet(colfamHolder).iterator(); if (desiredColumns.contains(colfamHolder)) { // found a column we wanted nextRecordFound(currentKey, currentValue); // The tailSet returns elements greater than or equal to the provided element // need to consume the equality element remainingColumns.next(); } if (remainingColumns.hasNext()) { // Get the row avoiding a new Text currentKey.getRow(rowHolder); Text nextColumn = remainingColumns.next(); currentStartKey = new Key(rowHolder, nextColumn); } else { currentStartKey = currentKey.followingKey(PartialKey.ROW); // No more data to read, outside of where we wanted to look if (currentRange.afterEndKey(currentStartKey)) { setReturnValue(aggrSw); return; } } log.trace("Moving to " + currentStartKey); if (!getSource().hasTop()) { setReturnValue(aggrSw); return; } boolean advancedToDesiredPoint = false; // Move down to the next Key for (int i = 0; i < 10 && !advancedToDesiredPoint; i++) { getSource().next(); if (getSource().hasTop()) { // Move to at least currentStartKey if (currentStartKey.compareTo(getSource().getTopKey()) <= 0) { advancedToDesiredPoint = true; } } else { setReturnValue(aggrSw); return; } } if (!advancedToDesiredPoint) { log.debug("Seeking to find next desired key: " + currentStartKey); getSource().seek( new Range(currentStartKey, true, currentRange.getEndKey(), currentRange.isEndKeyInclusive()), currentColumnFamilies, currentColumnFamiliesInclusive); } } setReturnValue(aggrSw); } private void nextRecordNotFound() { topKey = null; sum = null; } private void nextRecordFound(Key k, Value v) { // For every value that we add to the computed sum, we want to also advance // the Key that is returned so that we don't re-process any records topKey = k; if (null == sum) { sum = 0l; } sum += encoder.decode(v.get()); } private void setReturnValue(Stopwatch aggrSw) { aggrSw.stop(); log.trace("Aggregate duration: " + aggrSw.elapsed(TimeUnit.MILLISECONDS)); if (null != sum) { log.debug("Computed a sum of " + sum); topValue = new Value(encoder.encode(sum)); } else { topValue = new Value(encoder.encode(0l)); } } }