// Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
 * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */
package com.ricemap.spateDB.mapred;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineFileSplit;

/**
 * A set of methods and algorithms used to support management of file splits.
 *
 * @author tonyren, eldawy
 *
 */
public class FileSplitUtil {

  static final Log LOG = LogFactory.getLog(FileSplitUtil.class);

  /** Disallow instantiation of this class. */
  private FileSplitUtil() {
  }

  /**
   * Combines a number of file splits into one CombineFileSplit. If the number
   * of splits to be combined is one, that split is returned as is without
   * creating a CombineFileSplit.
   * <p>
   * The paths, starts and lengths of the combined split are stored in the
   * reverse of their order in {@code splits}; this ordering is kept for
   * backward compatibility.
   *
   * @param conf - job configuration handed to the created CombineFileSplit
   * @param splits - list holding the splits to combine
   * @param startIndex - index in {@code splits} of the first split to combine
   * @param count - number of consecutive splits to combine
   * @return the single split when {@code count} is one, otherwise a
   *         CombineFileSplit covering the selected splits
   * @throws IOException if the locations of a split cannot be retrieved
   */
  public static InputSplit combineFileSplits(JobConf conf,
      List<FileSplit> splits, int startIndex, int count) throws IOException {
    if (count == 1) {
      return splits.get(startIndex);
    }
    // Lay the selected splits out back to front, as this method always has
    FileSplit[] ordered = new FileSplit[count];
    for (int i = 0; i < count; i++) {
      ordered[count - 1 - i] = splits.get(startIndex + i);
    }
    return buildCombinedSplit(conf, ordered);
  }

  /**
   * Combines two file splits into a CombineFileSplit.
   *
   * @param conf - job configuration handed to the created CombineFileSplit
   * @param split1 - the split stored first in the combined split
   * @param split2 - the split stored second in the combined split
   * @return a CombineFileSplit covering both splits
   * @throws IOException if the locations of a split cannot be retrieved
   */
  public static InputSplit combineFileSplits(JobConf conf,
      FileSplit split1, FileSplit split2) throws IOException {
    return buildCombinedSplit(conf, new FileSplit[] { split1, split2 });
  }

  /**
   * Builds a CombineFileSplit whose i-th path, start and length are taken
   * from the i-th element of {@code ordered}, and whose locations are the
   * prioritized union of the locations of all the given splits.
   */
  private static InputSplit buildCombinedSplit(JobConf conf,
      FileSplit[] ordered) throws IOException {
    Path[] paths = new Path[ordered.length];
    long[] starts = new long[ordered.length];
    long[] lengths = new long[ordered.length];
    Vector<String> allLocations = new Vector<String>();
    for (int i = 0; i < ordered.length; i++) {
      FileSplit split = ordered[i];
      paths[i] = split.getPath();
      starts[i] = split.getStart();
      lengths[i] = split.getLength();
      allLocations.addAll(Arrays.asList(split.getLocations()));
    }
    String[] locations = prioritizeLocations(allLocations);
    return new CombineFileSplit(conf, paths, starts, lengths, locations);
  }

  /**
   * Takes a list of locations as a vector, and returns a unique array of
   * locations where locations on the head are more frequent in the original
   * vector than the ones on the tail. Locations that occur the same number
   * of times appear in descending natural (String) order.
   * <p>
   * Note that the given vector is sorted in place. An empty vector yields an
   * empty array.
   *
   * @param vlocations - A vector of locations with possible duplicates
   * @return - A unique array of locations.
   */
  public static String[] prioritizeLocations(Vector<String> vlocations) {
    if (vlocations.isEmpty()) {
      // No run to close below; without this guard the bucket lookup would
      // index past the end of the bucket array
      return new String[0];
    }
    // After sorting, equal locations form contiguous runs
    Collections.sort(vlocations);

    // locationsByCount[k] holds the locations that occur exactly k times
    @SuppressWarnings("unchecked")
    Vector<String>[] locationsByCount = new Vector[vlocations.size() + 1];
    int uniqueLocationCount = 0;
    int runStart = 0;
    for (int i = 1; i <= vlocations.size(); i++) {
      boolean endOfRun = i == vlocations.size()
          || !vlocations.get(runStart).equals(vlocations.get(i));
      if (endOfRun) {
        int runLength = i - runStart;
        if (locationsByCount[runLength] == null) {
          locationsByCount[runLength] = new Vector<String>();
        }
        locationsByCount[runLength].add(vlocations.get(runStart));
        uniqueLocationCount++;
        runStart = i;
      }
    }

    // Buckets are visited from the least to the most frequent while the
    // result is filled from its tail, so the most frequent end up at the head
    String[] uniqueLocations = new String[uniqueLocationCount];
    int nextSlot = uniqueLocationCount;
    for (Vector<String> locationsWithSameCount : locationsByCount) {
      if (locationsWithSameCount == null) {
        continue;
      }
      for (String location : locationsWithSameCount) {
        uniqueLocations[--nextSlot] = location;
      }
    }
    if (nextSlot != 0) {
      throw new IllegalStateException("Expected " + uniqueLocationCount
          + " unique locations but " + nextSlot + " slots were left unfilled");
    }
    return uniqueLocations;
  }

  /**
   * Combines a number of input splits into the given numSplits.
   * <p>
   * Splits are grouped by the hosts reported for their block locations. Each
   * output split is filled by repeatedly picking the host that currently
   * lists the fewest splits and taking splits from it, so that splits
   * sharing a host tend to be combined together. If no host is left while an
   * output split still needs more input splits, the remaining uncombined
   * splits are used regardless of their location.
   * <p>
   * When {@code numSplits} is larger than the number of input splits, only
   * as many output splits as there are input splits are produced, so that no
   * empty combined split is created.
   *
   * @param conf - job configuration used to access the file systems and to
   *          create the combined splits
   * @param inputSplits - the splits to combine
   * @param numSplits - the desired number of output splits
   * @return the combined splits; its length is the smaller of
   *         {@code numSplits} and the number of input splits
   * @throws IOException if file status, block locations or split locations
   *           cannot be retrieved
   */
  public static InputSplit[] autoCombineSplits(JobConf conf,
      Vector<FileSplit> inputSplits, int numSplits) throws IOException {
    LOG.info("Combining " + inputSplits.size() + " splits into " + numSplits);

    Map<String, Vector<FileSplit>> blocksPerHost =
        groupSplitsByHost(conf, inputSplits);
    // Splits not yet placed in any output split; used when hosts run out
    Vector<FileSplit> unassigned = new Vector<FileSplit>(inputSplits);

    // Never ask for more output splits than there are input splits
    int targetSplits = Math.min(numSplits, inputSplits.size());
    InputSplit[] combinedSplits = new InputSplit[targetSplits];
    int splitsAvailable = inputSplits.size();
    for (int i = 0; i < targetSplits; i++) {
      // Spread the remaining input splits evenly over the remaining outputs
      int numSplitsToCombine = splitsAvailable / (targetSplits - i);
      Vector<FileSplit> splitsToCombine = new Vector<FileSplit>();
      while (numSplitsToCombine > 0) {
        Map.Entry<String, Vector<FileSplit>> minEntry =
            hostWithFewestSplits(blocksPerHost);
        if (minEntry == null) {
          // No host left (e.g. splits for which no hosts were reported):
          // fall back to whatever has not been combined yet
          takeSplits(unassigned, splitsToCombine, numSplitsToCombine);
          break;
        }
        // Combine all or some of the blocks in this host
        numSplitsToCombine = takeSplits(minEntry.getValue(), splitsToCombine,
            numSplitsToCombine);
        if (numSplitsToCombine != 0) {
          // Remove this host so that it is not selected again
          blocksPerHost.remove(minEntry.getKey());
        }
      }

      combinedSplits[i] = combineFileSplits(conf, splitsToCombine, 0,
          splitsToCombine.size());

      // The splits just combined must not be offered again
      for (Vector<FileSplit> hostSplits : blocksPerHost.values()) {
        hostSplits.removeAll(splitsToCombine);
      }
      unassigned.removeAll(splitsToCombine);
      splitsAvailable -= splitsToCombine.size();
    }

    LOG.info("Combined splits " + combinedSplits.length);
    return combinedSplits;
  }

  /**
   * Maps each host name to the splits that have a block location on it. A
   * split is listed once for every block location that names the host.
   */
  private static Map<String, Vector<FileSplit>> groupSplitsByHost(
      JobConf conf, Vector<FileSplit> inputSplits) throws IOException {
    Map<String, Vector<FileSplit>> blocksPerHost =
        new HashMap<String, Vector<FileSplit>>();
    for (FileSplit fsplit : inputSplits) {
      // Get locations for this split
      final Path path = fsplit.getPath();
      final FileSystem fs = path.getFileSystem(conf);
      BlockLocation[] blockLocations = fs.getFileBlockLocations(
          fs.getFileStatus(path), fsplit.getStart(), fsplit.getLength());
      for (BlockLocation blockLocation : blockLocations) {
        for (String hostName : blockLocation.getHosts()) {
          Vector<FileSplit> hostSplits = blocksPerHost.get(hostName);
          if (hostSplits == null) {
            hostSplits = new Vector<FileSplit>();
            blocksPerHost.put(hostName, hostSplits);
          }
          hostSplits.add(fsplit);
        }
      }
    }
    return blocksPerHost;
  }

  /**
   * Returns the entry whose list of splits is the shortest, or null when the
   * map is empty. Among equally short lists the first one iterated wins.
   */
  private static Map.Entry<String, Vector<FileSplit>> hostWithFewestSplits(
      Map<String, Vector<FileSplit>> blocksPerHost) {
    Map.Entry<String, Vector<FileSplit>> minEntry = null;
    for (Map.Entry<String, Vector<FileSplit>> entry : blocksPerHost.entrySet()) {
      if (minEntry == null
          || entry.getValue().size() < minEntry.getValue().size()) {
        minEntry = entry;
      }
    }
    return minEntry;
  }

  /**
   * Appends to {@code target} up to {@code needed} splits from
   * {@code candidates} that {@code target} does not contain yet, and returns
   * how many are still needed afterwards.
   */
  private static int takeSplits(List<FileSplit> candidates,
      Vector<FileSplit> target, int needed) {
    for (FileSplit candidate : candidates) {
      if (needed == 0) {
        break;
      }
      if (!target.contains(candidate)) {
        target.add(candidate);
        needed--;
      }
    }
    return needed;
  }
}