// Java tutorial
/*
 * Copyright 2010-2013 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.zjy.mongo.splitter;

import com.mongodb.BasicDBObjectBuilder;
import com.zjy.mongo.input.BSONFileSplit;
import com.zjy.mongo.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.bson.BSONObject;
import org.bson.BasicBSONCallback;
import org.bson.BasicBSONDecoder;
import org.bson.BasicBSONEncoder;
import org.bson.LazyBSONCallback;
import org.bson.LazyBSONDecoder;
import org.bson.LazyBSONObject;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Computes, stores and reloads input splits for BSON files so that every
 * split begins and ends on a BSON document boundary.
 *
 * <p>Splits are described in a companion ".splits" file as a sequence of BSON
 * documents, each holding a start offset ("s") and a length ("l").</p>
 *
 * <p>Instances hold mutable state (the current splits list and reusable
 * decoder callbacks); only {@link #getStartingPositionForSplit} is
 * synchronized.</p>
 */
public class BSONSplitter extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(BSONSplitter.class);

    private ArrayList<BSONFileSplit> splitsList;
    private Path inputPath;
    private final BasicBSONCallback callback = new BasicBSONCallback();
    private final LazyBSONCallback lazyCallback = new LazyBSONCallback();
    private final LazyBSONDecoder lazyDec = new LazyBSONDecoder();
    private final BasicBSONDecoder bsonDec = new BasicBSONDecoder();
    private final BasicBSONEncoder bsonEnc = new BasicBSONEncoder();

    /**
     * Signals that the splits file for an input file could not be found.
     */
    public static class NoSplitFileException extends Exception {
    }

    /**
     * Set the path used by {@link #readSplits} and {@link #writeSplits}.
     *
     * @param p the path to the BSON input file.
     */
    public void setInputPath(final Path p) {
        inputPath = p;
    }

    /**
     * Get the splits most recently calculated or loaded.
     *
     * @return the current splits, or a new empty list if none exist yet.
     */
    public ArrayList<BSONFileSplit> getAllSplits() {
        if (splitsList == null) {
            return new ArrayList<BSONFileSplit>(0);
        } else {
            return splitsList;
        }
    }

    /**
     * Build a split from one entry of a splits file.
     *
     * @param obj       a document holding the start offset under "s" and the
     *                  length under "l".
     * @param fs        the FileSystem used to look up block locations.
     * @param inputFile the file the split belongs to.
     * @return the split described by {@code obj}.
     * @throws IOException declared for callers; see {@link #createFileSplit}.
     */
    public BSONFileSplit createFileSplitFromBSON(final BSONObject obj, final FileSystem fs,
                                                 final FileStatus inputFile) throws IOException {
        // Accept any numeric type rather than insisting on Long, so a splits
        // file whose offsets were stored as 32-bit integers still loads.
        long start = ((Number) obj.get("s")).longValue();
        long splitLen = ((Number) obj.get("l")).longValue();
        return createFileSplit(inputFile, fs, start, splitLen);
    }

    /**
     * Create a split for the given byte range, attaching the hosts of the
     * block that contains {@code splitStart} when they can be determined.
     *
     * @param inFile     the file being split.
     * @param fs         the FileSystem used to look up block locations.
     * @param splitStart byte offset at which the split begins.
     * @param splitLen   length of the split in bytes.
     * @return the new split, with its key field taken from
     *         {@code MongoConfigUtil.getInputKey(getConf())}.
     */
    public BSONFileSplit createFileSplit(final FileStatus inFile, final FileSystem fs,
                                         final long splitStart, final long splitLen) {
        BSONFileSplit split;
        try {
            BlockLocation[] blkLocations;

            // This code is based off of org.apache.hadoop.mapreduce.lib
            // .input.FileInputFormat.getSplits()
            if (inFile instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) inFile).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
            }

            if (blkLocations == null || blkLocations.length == 0) {
                // Nothing to look the offset up in; without this guard
                // getBlockIndex would fail on an empty array with an
                // exception the catch below does not handle.
                split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, null);
            } else {
                int blockIndex = getBlockIndex(blkLocations, splitStart);
                split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen,
                                          blkLocations[blockIndex].getHosts());
            }
        } catch (IOException e) {
            LOG.warn(
                "Couldn't find block locations when constructing input split from byte offset. Using non-block-aware input split; "
                + e.getMessage());
            split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, null);
        }
        split.setKeyField(MongoConfigUtil.getInputKey(getConf()));
        return split;
    }

    /**
     * Load splits from a splits file.
     *
     * <p>The current splits list is only replaced once the whole file has
     * been read successfully.</p>
     *
     * @param inputFile the file whose splits are contained in the splits file.
     * @param splitFile the Path to the splits file.
     * @throws NoSplitFileException if the splits file is not found.
     * @throws IOException when an error occurs reading from the file.
     */
    public void loadSplitsFromSplitFile(final FileStatus inputFile, final Path splitFile)
        throws NoSplitFileException, IOException {
        ArrayList<BSONFileSplit> splits = new ArrayList<BSONFileSplit>();
        FileSystem fs = splitFile.getFileSystem(getConf()); // throws IOException
        FileStatus splitFileStatus;
        FSDataInputStream fsDataStream = null;
        try {
            try {
                splitFileStatus = fs.getFileStatus(splitFile);
                LOG.info("Found split file at : " + splitFileStatus);
            } catch (Exception e) {
                // Any failure to stat the file is reported as "no splits file".
                throw new NoSplitFileException();
            }
            fsDataStream = fs.open(splitFile); // throws IOException
            while (fsDataStream.getPos() < splitFileStatus.getLen()) {
                callback.reset();
                bsonDec.decode(fsDataStream, callback);
                BSONObject splitInfo = (BSONObject) callback.get();
                splits.add(createFileSplitFromBSON(splitInfo, fs, inputFile));
            }
        } finally {
            if (null != fsDataStream) {
                fsDataStream.close();
            }
        }
        splitsList = splits;
    }

    /**
     * Determine the target split size: the block size clamped between the
     * configured minimum and maximum split sizes.
     *
     * @param conf the Hadoop configuration.
     * @param file the file to be split; when {@code null} the block size is
     *             read from the configuration instead.
     * @return the split size in bytes.
     */
    public static long getSplitSize(final Configuration conf, final FileStatus file) {
        // Try new configuration options first, but fall back to old ones.
        long maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize",
                                    conf.getLong("mapred.max.split.size", Long.MAX_VALUE));
        long minSize = Math.max(1L, conf.getLong("mapreduce.input.fileinputformat.split.minsize",
                                                 conf.getLong("mapred.min.split.size", 1L)));

        if (file != null) {
            long fileBlockSize = file.getBlockSize();
            return Math.max(minSize, Math.min(maxSize, fileBlockSize));
        } else {
            // NOTE(review): "dfs.blockSize" is mixed-case; confirm this is the
            // intended key and not a misspelling of a standard HDFS setting.
            long blockSize = conf.getLong("dfs.blockSize", 64 * 1024 * 1024);
            return Math.max(minSize, Math.min(maxSize, blockSize));
        }
    }

    /**
     * Calculate the splits for a given input file, sensitive to options such
     * as {@link com.zjy.mongo.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
     * This method always re-calculates the splits and will try to write the
     * splits file.
     *
     * @param file the FileStatus for which to calculate splits.
     * @throws IOException when an error occurs reading from the FileSystem
     *
     * @see #readSplits
     */
    public void readSplitsForFile(final FileStatus file) throws IOException {
        long length = file.getLen();
        if (!MongoConfigUtil.getBSONReadSplits(getConf())) {
            LOG.info("Reading splits is disabled - constructing single split for " + file);
            FileSystem fs = file.getPath().getFileSystem(getConf());
            BSONFileSplit onesplit = createFileSplit(file, fs, 0, length);
            ArrayList<BSONFileSplit> splits = new ArrayList<BSONFileSplit>();
            splits.add(onesplit);
            splitsList = splits;
            return;
        }
        if (length != 0) {
            // Copy instead of casting: splitFile is overridable and only
            // promises a List.
            splitsList = new ArrayList<BSONFileSplit>(splitFile(file));
            writeSplits();
        } else {
            LOG.warn("Zero-length file, skipping split calculation.");
            // Don't let splits from a previously processed file be reported
            // as belonging to this one.
            splitsList = new ArrayList<BSONFileSplit>();
        }
    }

    /**
     * Calculate the splits for a given input file according to the settings
     * for split size only. This method does not respect options like
     * {@link com.zjy.mongo.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
     *
     * <p>An IOException raised while scanning documents is logged and the
     * splits found so far are returned; only failures opening the file or
     * closing the stream propagate.</p>
     *
     * @param file the FileStatus for which to calculate splits.
     * @return a List of the calculated splits.
     *
     * @throws IOException when an error occurs reading from the FileSystem
     */
    protected List<BSONFileSplit> splitFile(final FileStatus file) throws IOException {
        Path path = file.getPath();
        ArrayList<BSONFileSplit> splits = new ArrayList<BSONFileSplit>();
        FileSystem fs = path.getFileSystem(getConf());
        long length = file.getLen();

        int numDocsRead = 0;
        long splitSize = getSplitSize(getConf(), file);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Generating splits for " + path + " of up to " + splitSize + " bytes.");
        }
        FSDataInputStream fsDataStream = fs.open(path);
        long curSplitLen = 0;
        long curSplitStart = 0;
        try {
            while (fsDataStream.getPos() + 1 < length) {
                lazyCallback.reset();
                lazyDec.decode(fsDataStream, lazyCallback);
                LazyBSONObject bo = (LazyBSONObject) lazyCallback.get();
                int bsonDocSize = bo.getBSONSize();
                if (curSplitLen + bsonDocSize >= splitSize) {
                    // Close the current split before this document and start
                    // the next split at this document's first byte.
                    BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                    splits.add(split);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(String.format("Creating new split (%d) %s", splits.size(), split));
                    }
                    curSplitStart = fsDataStream.getPos() - bsonDocSize;
                    curSplitLen = 0;
                }
                curSplitLen += bsonDocSize;
                numDocsRead++;
                if (numDocsRead % 1000 == 0) {
                    float splitProgress = 100f * ((float) fsDataStream.getPos() / length);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(String.format("Read %d docs calculating splits for %s; %3.3f%% complete.",
                                                numDocsRead, file.getPath(), splitProgress));
                    }
                }
            }
            if (curSplitLen > 0) {
                BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Final split (%d) %s", splits.size(), split.getPath()));
                }
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Completed splits calculation for " + file.getPath());
            }
        } catch (IOException e) {
            // Best effort: keep the splits computed so far, but record the
            // full stack trace so the failure can be diagnosed.
            LOG.warn("IOException: " + e, e);
        } finally {
            fsDataStream.close();
        }
        return splits;
    }

    /**
     * Write out the splits file, if doing so has been enabled. Splits must
     * already have been calculated previously by a call to {@link
     * #readSplitsForFile readSplitsForFile} or {@link #readSplits readSplits}.
     * If no splits have been calculated, nothing is written.
     *
     * @see com.zjy.mongo.util.MongoConfigUtil#BSON_WRITE_SPLITS
     *
     * @throws IOException when an error occurs writing the file
     */
    public void writeSplits() throws IOException {
        if (getConf().getBoolean("bson.split.write_splits", true)) {
            LOG.info("Writing splits to disk.");
        } else {
            LOG.info("bson.split.write_splits is set to false - skipping writing splits to disk.");
            return;
        }

        if (splitsList == null) {
            LOG.info("No splits found, skipping write of splits file.");
            // Actually skip: continuing would create an empty splits file and
            // then fail with a NullPointerException in the loop below.
            return;
        }

        Path outputPath = getSplitsFilePath(inputPath, getConf());
        FileSystem pathFileSystem = outputPath.getFileSystem(getConf());
        FSDataOutputStream fsDataOut = null;
        try {
            fsDataOut = pathFileSystem.create(outputPath, false);
            for (FileSplit inputSplit : splitsList) {
                BSONObject splitObj = BasicDBObjectBuilder.start()
                                                          .add("s", inputSplit.getStart())
                                                          .add("l", inputSplit.getLength())
                                                          .get();
                byte[] encodedObj = bsonEnc.encode(splitObj);
                fsDataOut.write(encodedObj, 0, encodedObj.length);
            }
        } catch (IOException e) {
            LOG.error("Could not create splits file: " + e.getMessage());
            throw e;
        } finally {
            if (fsDataOut != null) {
                fsDataOut.close();
            }
        }
    }

    /**
     * Calculate splits for each file in the input path, sensitive to options such
     * as {@link com.zjy.mongo.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
     * This method always re-calculates the splits and will try to write the
     * splits file.
     *
     * @see #readSplitsForFile
     *
     * @throws IOException when an error occurs reading from the file
     * @throws IllegalStateException if the input path has not been set.
     */
    public void readSplits() throws IOException {
        splitsList = new ArrayList<BSONFileSplit>();
        if (inputPath == null) {
            throw new IllegalStateException("Input path has not been set.");
        }
        FileSystem fs = inputPath.getFileSystem(getConf());
        FileStatus file = fs.getFileStatus(inputPath);
        readSplitsForFile(file);
    }

    /**
     * Tool entry point: calculate and write the splits for the path named by
     * the "mapred.input.dir" configuration value.
     *
     * @param args command-line arguments (unused).
     * @return 0 on success.
     * @throws Exception if reading or writing the splits fails.
     */
    public int run(final String[] args) throws Exception {
        setInputPath(new Path(getConf().get("mapred.input.dir", "")));
        readSplits();
        // NOTE(review): readSplits() may already have written the splits file
        // via readSplitsForFile(); since writeSplits() creates the file with
        // overwrite disabled, this second call presumably fails in that case.
        // Confirm the intended behavior.
        writeSplits();
        return 0;
    }

    /**
     * Get the index of the block within the given BlockLocations that
     * contains the given offset. Raises IllegalArgumentException if the
     * offset is outside the file or no block locations are given.
     *
     * @param blockLocations BlockLocations to search.
     * @param offset the offset into the file.
     * @return the index of the BlockLocation containing the offset.
     */
    private static int getBlockIndex(final BlockLocation[] blockLocations, final long offset) {
        if (blockLocations.length == 0) {
            throw new IllegalArgumentException(
                String.format("Offset %d cannot be located: no block locations given.", offset));
        }
        for (int i = 0; i < blockLocations.length; i++) {
            BlockLocation bl = blockLocations[i];
            if (bl.getOffset() <= offset && offset < bl.getOffset() + bl.getLength()) {
                return i;
            }
        }
        BlockLocation lastBlock = blockLocations[blockLocations.length - 1];
        long fileLength = lastBlock.getOffset() + lastBlock.getLength() - 1;
        throw new IllegalArgumentException(
            String.format("Offset %d is outside the file [0..%d].", offset, fileLength));
    }

    /**
     * Get the position at which the BSONFileRecordReader should begin
     * iterating the given split. This may not be at the beginning of the split
     * if the splits were not calculated by BSONSplitter.
     *
     * @param split the FileSplit for which to find the starting position.
     * @return the position of the first complete document within the split.
     * @throws IOException when an error occurs while reading a file
     */
    public synchronized long getStartingPositionForSplit(final FileSplit split) throws IOException {
        FileSystem fs = split.getPath().getFileSystem(getConf());
        FileStatus file = fs.getFileStatus(split.getPath());
        List<BSONFileSplit> splits;

        // Get splits calculated on document boundaries.
        if (MongoConfigUtil.getBSONReadSplits(getConf())) {
            // Use the splits file to load splits on document boundaries.
            try {
                // Try to use the existing splits file.
                loadSplitsFromSplitFile(file, getSplitsFilePath(file.getPath(), getConf()));
            } catch (NoSplitFileException e) {
                // Create a splits file from scratch.
                readSplitsForFile(file);
            }
            splits = getAllSplits();
        } else {
            // Can't use a splits file, so create splits from scratch.
            splits = splitFile(file);
        }

        // Get the first pre-calculated split occurring before the start of
        // the given split. If no pre-calculated split starts at or after the
        // given split, scanning begins at offset 0.
        long previousStart = split.getStart();
        long startIterating = 0;
        for (BSONFileSplit bfs : splits) {
            if (bfs.getStart() >= split.getStart()) {
                startIterating = previousStart;
                break;
            }
            previousStart = bfs.getStart();
        }

        // Beginning at 'startIterating', jump to the first document that begins
        // at or beyond the given split.
        FSDataInputStream fsDataStream = null;
        long pos = startIterating;
        try {
            fsDataStream = fs.open(split.getPath());
            fsDataStream.seek(pos);
            while (pos < split.getStart()) {
                callback.reset();
                bsonDec.decode(fsDataStream, callback);
                pos = fsDataStream.getPos();
            }
        } finally {
            if (null != fsDataStream) {
                fsDataStream.close();
            }
        }

        return pos;
    }

    /**
     * Get the path to the ".splits" file for a BSON file.
     * @param filePath the path to the BSON file.
     * @param conf the Hadoop configuration.
     * @return the path to the ".splits" file.
     */
    public static Path getSplitsFilePath(final Path filePath, final Configuration conf) {
        String splitsPath = MongoConfigUtil.getBSONSplitsPath(conf);
        String splitsFileName = "." + filePath.getName() + ".splits";
        if (null == splitsPath) {
            // No dedicated splits directory configured: keep the splits file
            // next to the BSON file.
            return new Path(filePath.getParent(), splitsFileName);
        }
        return new Path(splitsPath, splitsFileName);
    }

    /**
     * Command-line entry point; runs this Tool through ToolRunner and exits
     * with its return code.
     *
     * @param args command-line arguments passed to ToolRunner.
     * @throws Exception if the tool fails.
     */
    public static void main(final String[] args) throws Exception {
        System.exit(ToolRunner.run(new BSONSplitter(), args));
    }
}