// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.io; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.FooterBuffer; import org.apache.hadoop.hive.ql.io.IOContext.Comparison; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; import 
org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; /** This class prepares an IOContext, and provides the ability to perform a binary search on the * data. The binary search can be used by setting the value of inputFormatSorted in the * MapreduceWork to true, but it should only be used if the data is going to a FilterOperator, * which filters by comparing a value in the data with a constant, using one of the comparisons * =, <, >, <=, >=. If the RecordReader's underlying format is an RCFile, this object can perform * a binary search to find the block to begin reading from, and stop reading once it can be * determined no other entries will match the filter. */ public abstract class HiveContextAwareRecordReader<K, V> implements RecordReader<K, V> { private static final Log LOG = LogFactory.getLog(HiveContextAwareRecordReader.class.getName()); private boolean initDone = false; private long rangeStart; private long rangeEnd; private long splitEnd; private long previousPosition = -1; private boolean wasUsingSortedSearch = false; private String genericUDFClassName = null; private final List<Comparison> stopComparisons = new ArrayList<Comparison>(); private Map<String, PartitionDesc> pathToPartitionInfo; protected RecordReader recordReader; protected JobConf jobConf; protected boolean isSorted = false; public HiveContextAwareRecordReader(JobConf conf) throws IOException { this(null, conf); } public HiveContextAwareRecordReader(RecordReader recordReader) { this.recordReader = recordReader; } public HiveContextAwareRecordReader(RecordReader recordReader, JobConf conf) throws IOException { this.recordReader = recordReader; this.jobConf = conf; } public void setRecordReader(RecordReader recordReader) { this.recordReader = recordReader; } /** * Close this {@link InputSplit} to future operations. 
* * @throws IOException */ public abstract void doClose() throws IOException; private IOContext ioCxtRef = null; @Override public void close() throws IOException { doClose(); initDone = false; ioCxtRef = null; } @Override public boolean next(K key, V value) throws IOException { if (!initDone) { throw new IOException("Hive IOContext is not inited."); } updateIOContext(); try { boolean retVal = doNext(key, value); if (retVal) { if (key instanceof RecordIdentifier) { //supports AcidInputFormat which uses the KEY pass ROW__ID info ioCxtRef.setRecordIdentifier((RecordIdentifier) key); } else if (recordReader instanceof AcidInputFormat.AcidRecordReader) { //supports AcidInputFormat which do not use the KEY pass ROW__ID info ioCxtRef.setRecordIdentifier( ((AcidInputFormat.AcidRecordReader) recordReader).getRecordIdentifier()); } } return retVal; } catch (IOException e) { ioCxtRef.setIOExceptions(true); throw e; } } protected void updateIOContext() throws IOException { long pointerPos = this.getPos(); if (!ioCxtRef.isBlockPointer()) { ioCxtRef.setCurrentBlockStart(pointerPos); ioCxtRef.setCurrentRow(0); return; } ioCxtRef.setCurrentRow(ioCxtRef.getCurrentRow() + 1); if (ioCxtRef.getNextBlockStart() == -1) { ioCxtRef.setNextBlockStart(pointerPos); ioCxtRef.setCurrentRow(0); } if (pointerPos != ioCxtRef.getNextBlockStart()) { // the reader pointer has moved to the end of next block, or the end of // current record. 
ioCxtRef.setCurrentRow(0); if (ioCxtRef.getCurrentBlockStart() == ioCxtRef.getNextBlockStart()) { ioCxtRef.setCurrentRow(1); } ioCxtRef.setCurrentBlockStart(ioCxtRef.getNextBlockStart()); ioCxtRef.setNextBlockStart(pointerPos); } } public IOContext getIOContext() { return IOContextMap.get(jobConf); } private void initIOContext(long startPos, boolean isBlockPointer, Path inputPath) { ioCxtRef = this.getIOContext(); ioCxtRef.setCurrentBlockStart(startPos); ioCxtRef.setBlockPointer(isBlockPointer); ioCxtRef.setInputPath(inputPath); LOG.info("Processing file " + inputPath); initDone = true; } public void initIOContext(FileSplit split, JobConf job, Class inputFormatClass) throws IOException { this.initIOContext(split, job, inputFormatClass, null); } public void initIOContext(FileSplit split, JobConf job, Class inputFormatClass, RecordReader recordReader) throws IOException { boolean blockPointer = false; long blockStart = -1; FileSplit fileSplit = split; Path path = fileSplit.getPath(); FileSystem fs = path.getFileSystem(job); if (inputFormatClass.getName().contains("SequenceFile")) { SequenceFile.Reader in = new SequenceFile.Reader(fs, path, job); blockPointer = in.isBlockCompressed(); in.sync(fileSplit.getStart()); blockStart = in.getPosition(); in.close(); } else if (recordReader instanceof RCFileRecordReader) { blockPointer = true; blockStart = ((RCFileRecordReader) recordReader).getStart(); } else if (inputFormatClass.getName().contains("RCFile")) { blockPointer = true; RCFile.Reader in = new RCFile.Reader(fs, path, job); in.sync(fileSplit.getStart()); blockStart = in.getPosition(); in.close(); } this.jobConf = job; this.initIOContext(blockStart, blockPointer, path.makeQualified(fs)); this.initIOContextSortedProps(split, recordReader, job); } public void initIOContextSortedProps(FileSplit split, RecordReader recordReader, JobConf job) { this.jobConf = job; this.getIOContext().resetSortingValues(); this.isSorted = jobConf.getBoolean("hive.input.format.sorted", 
false); this.rangeStart = split.getStart(); this.rangeEnd = split.getStart() + split.getLength(); this.splitEnd = rangeEnd; if (recordReader instanceof RCFileRecordReader && rangeEnd != 0 && this.isSorted) { // Binary search only works if we know the size of the split, and the recordReader is an // RCFileRecordReader this.getIOContext().setUseSorted(true); this.getIOContext().setBinarySearching(true); this.wasUsingSortedSearch = true; } else { // Use the defalut methods for next in the child class this.isSorted = false; } } @Override public float getProgress() throws IOException { if (this.getIOContext().isBinarySearching()) { return 0; } else { return recordReader.getProgress(); } } private FooterBuffer footerBuffer = null; private int headerCount = 0; private int footerCount = 0; public boolean doNext(K key, V value) throws IOException { if (this.isSorted) { if (this.getIOContext().shouldEndBinarySearch() || (!this.getIOContext().useSorted() && this.wasUsingSortedSearch)) { beginLinearSearch(); this.wasUsingSortedSearch = false; this.getIOContext().setEndBinarySearch(false); } if (this.getIOContext().useSorted()) { if (this.genericUDFClassName == null && this.getIOContext().getGenericUDFClassName() != null) { setGenericUDFClassName(this.getIOContext().getGenericUDFClassName()); } if (this.getIOContext().isBinarySearching()) { // Proceed with a binary search if (this.getIOContext().getComparison() != null) { switch (this.getIOContext().getComparison()) { case GREATER: case EQUAL: // Indexes have only one entry per value, could go linear from here, if we want to // use this for any sorted table, we'll need to continue the search rangeEnd = previousPosition; break; case LESS: rangeStart = previousPosition; break; default: break; } } long position = (rangeStart + rangeEnd) / 2; sync(position); long newPosition = getSyncedPosition(); // If the newPosition is the same as the previousPosition, we've reached the end of the // binary search, if the new position at least 
as big as the size of the split, any // matching rows must be in the final block, so we can end the binary search. if (newPosition == previousPosition || newPosition >= splitEnd) { this.getIOContext().setBinarySearching(false); sync(rangeStart); } previousPosition = newPosition; } else if (foundAllTargets()) { // Found all possible rows which will not be filtered return false; } } } try { /** * When start reading new file, check header, footer rows. * If file contains header, skip header lines before reading the records. * If file contains footer, used a FooterBuffer to remove footer lines * at the end of the table file. **/ if (this.ioCxtRef.getCurrentBlockStart() == 0) { // Check if the table file has header to skip. Path filePath = this.ioCxtRef.getInputPath(); PartitionDesc part = null; try { if (pathToPartitionInfo == null) { pathToPartitionInfo = Utilities.getMapWork(jobConf).getPathToPartitionInfo(); } part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, filePath, IOPrepareCache.get().getPartitionDescMap()); } catch (AssertionError ae) { LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath() + "because " + ae.getMessage()); part = null; } catch (Exception e) { LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath() + "because " + e.getMessage()); part = null; } TableDesc table = (part == null) ? null : part.getTableDesc(); if (table != null) { headerCount = Utilities.getHeaderCount(table); footerCount = Utilities.getFooterCount(table, jobConf); } // If input contains header, skip header. if (!Utilities.skipHeader(recordReader, headerCount, (WritableComparable) key, (Writable) value)) { return false; } if (footerCount > 0) { footerBuffer = new FooterBuffer(); if (!footerBuffer.initializeBuffer(jobConf, recordReader, footerCount, (WritableComparable) key, (Writable) value)) { return false; } } } if (footerBuffer == null) { // Table files don't have footer rows. 
return recordReader.next(key, value); } else { return footerBuffer.updateBuffer(jobConf, recordReader, (WritableComparable) key, (Writable) value); } } catch (Exception e) { return HiveIOExceptionHandlerUtil.handleRecordReaderNextException(e, jobConf); } } private void sync(long position) throws IOException { ((RCFileRecordReader) recordReader).sync(position); ((RCFileRecordReader) recordReader).resetBuffer(); } private long getSyncedPosition() throws IOException { return recordReader.getPos(); } /** * This uses the name of the generic UDF being used by the filter to determine whether we should * perform a binary search, and what the comparisons we should use to signal the end of the * linear scan are. * @param genericUDFClassName * @throws IOException */ private void setGenericUDFClassName(String genericUDFClassName) throws IOException { this.genericUDFClassName = genericUDFClassName; if (genericUDFClassName.equals(GenericUDFOPEqual.class.getName())) { stopComparisons.add(Comparison.GREATER); } else if (genericUDFClassName.equals(GenericUDFOPLessThan.class.getName())) { stopComparisons.add(Comparison.EQUAL); stopComparisons.add(Comparison.GREATER); if (this.getIOContext().isBinarySearching()) { beginLinearSearch(); } } else if (genericUDFClassName.equals(GenericUDFOPEqualOrLessThan.class.getName())) { stopComparisons.add(Comparison.GREATER); if (this.getIOContext().isBinarySearching()) { beginLinearSearch(); } } else if (genericUDFClassName.equals(GenericUDFOPGreaterThan.class.getName()) || genericUDFClassName.equals(GenericUDFOPEqualOrGreaterThan.class.getName())) { // Do nothing } else { // This is an unsupported operator LOG.debug(genericUDFClassName + " is not the name of a supported class. 
" + "Continuing linearly."); if (this.getIOContext().isBinarySearching()) { beginLinearSearch(); } } } /** * This should be called after the binary search is finished and before the linear scan begins * @throws IOException */ private void beginLinearSearch() throws IOException { sync(rangeStart); this.getIOContext().setBinarySearching(false); this.wasUsingSortedSearch = false; } /** * Returns true if the current comparison is in the list of stop comparisons, i.e. we've found * all records which won't be filtered * @return true if the current comparison is found */ public boolean foundAllTargets() { if (this.getIOContext().getComparison() == null || !stopComparisons.contains(this.getIOContext().getComparison())) { return false; } return true; } }