Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hama.bsp; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.mapred.InvalidInputException; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.net.Node; import org.apache.hadoop.net.NodeBase; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.StringUtils; public abstract class FileInputFormat<K, V> implements InputFormat<K, V> { public static final Log LOG = LogFactory.getLog(FileInputFormat.class); private static final double SPLIT_SLOP = 1.1; // 10% slop private static final PathFilter hiddenFileFilter = new PathFilter() { @Override public boolean accept(Path p) { String name = p.getName(); return !name.startsWith("_") && !name.startsWith("."); } }; /** * Proxy PathFilter that accepts a path only if all filters given in the * constructor do. Used by the listPaths() to apply the built-in * hiddenFileFilter together with a user provided one (if any). */ private static class MultiPathFilter implements PathFilter { private List<PathFilter> filters; public MultiPathFilter(List<PathFilter> filters) { this.filters = filters; } @Override public boolean accept(Path path) { for (PathFilter filter : filters) { if (!filter.accept(path)) { return false; } } return true; } } @Override public abstract RecordReader<K, V> getRecordReader(InputSplit split, BSPJob job) throws IOException; /** * Set a PathFilter to be applied to the input paths for the BSP job. * * @param filter the PathFilter class use for filtering the input paths. */ public static void setInputPathFilter(BSPJob conf, Class<? extends PathFilter> filter) { conf.getConfiguration().setClass("bsp.input.pathFilter.class", filter, PathFilter.class); } /** * Get a PathFilter instance of the filter set for the input paths. * * @return the PathFilter instance set for the job, NULL if none has been set. */ public static PathFilter getInputPathFilter(BSPJob conf) { Class<? extends PathFilter> filterClass = conf.getConfiguration().getClass("bsp.input.pathFilter.class", null, PathFilter.class); return (filterClass != null) ? ReflectionUtils.newInstance(filterClass, conf.getConfiguration()) : null; } /** * List input directories. Subclasses may override to, e.g., select only files * matching a regular expression. * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected FileStatus[] listStatus(BSPJob job) throws IOException { Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } List<FileStatus> result = new ArrayList<FileStatus>(); List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); for (Path p : dirs) { FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus[] matches = null; try { matches = fs.globStatus(p, inputFilter); } catch (Exception e) { LOG.info(p + "\n" + e.toString()); } if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { Collections.addAll(result, fs.listStatus(globStat.getPath(), inputFilter)); } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } LOG.info("Total input paths to process : " + result.size()); return result.toArray(new FileStatus[result.size()]); } /** * Splits files returned by {@link #listStatus(BSPJob)} when they're too big. <br/> * numSplits will be ignored by the framework. */ @Override public InputSplit[] getSplits(BSPJob job, int numSplits) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); FileStatus[] files = listStatus(job); /* * TODO: This does not consider data locality. When the numSplits * (user-defined) is equal to or smaller than the number of DFS splits, we * should assign multiple splits to a task. */ // take the short circuit path if we have already partitioned // if (numSplits == files.length) { // for (FileStatus file : files) { // if (file != null) { // splits.add(new FileSplit(file.getPath(), 0, file.getLen(), // new String[0])); // } // } // return splits.toArray(new FileSplit[splits.size()]); // } for (FileStatus file : files) { Path path = file.getPath(); FileSystem fs = path.getFileSystem(job.getConfiguration()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add(new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } // Save the number of input files in the job-conf job.getConfiguration().setLong("bsp.input.files", files.length); LOG.debug("Total # of splits: " + splits.size()); return splits.toArray(new InputSplit[splits.size()]); } /** * @return true if the file is splittable (default), false if not. */ protected boolean isSplitable(BSPJob job, Path path) { return true; } /** * Get the lower bound on split size imposed by the format. * * @return the number of bytes of the minimal split for this format */ protected long getFormatMinSplitSize() { return 1; } /** * Set the minimum input split size * * @param job the job to modify * @param size the minimum size */ public static void setMinInputSplitSize(Job job, long size) { job.getConfiguration().setLong("bsp.min.split.size", size); } /** * Get the minimum split size * * @param job the job * @return the minimum number of bytes that can be in a split */ public static long getMinSplitSize(BSPJob job) { return job.getConfiguration().getLong("bsp.min.split.size", 1L); } /** * Set the maximum split size * * @param job the job to modify * @param size the maximum split size */ public static void setMaxInputSplitSize(Job job, long size) { job.getConfiguration().setLong("bsp.max.split.size", size); } /** * Get the maximum split size. * * @param context the job to look at. * @return the maximum number of bytes a split can include */ public static long getMaxSplitSize(BSPJob context) { return context.getConfiguration().getLong("bsp.max.split.size", Long.MAX_VALUE); } protected long computeTotalSize(BSPJob job, FileStatus[] files) throws IOException { long totalSize = 0L; for (int i = 0; i < files.length; i++) { // check we have valid files FileStatus file = files[i]; if (file.isDir()) { final Path path = file.getPath(); if (path.getName().equals("hama-partitions") || (job.get("bsp.partitioning.dir") != null && path.getName().equals(job.get("bsp.partitioning.dir")))) { // if we find the partitioning dir, just remove it. LOG.warn("Removing already existing partitioning directory " + path); FileSystem fileSystem = path.getFileSystem(job.getConfiguration()); if (!fileSystem.delete(path, true)) { LOG.error("Remove failed."); } // remove this file from our initial list files[i] = null; } else { throw new IOException("Not a file (dir): " + path); } } totalSize += file.getLen(); } return totalSize; } protected long computeGoalSize(int numSplits, long totalSize) { // The minus 1 is for the remainder. return totalSize / (numSplits <= 1 ? 1 : numSplits - 1); } protected long computeSplitSize(long goalSize, long minSize, long blockSize) { if (goalSize > blockSize) { return Math.max(minSize, Math.max(goalSize, blockSize)); } else { return Math.max(minSize, Math.min(goalSize, blockSize)); } } protected int getBlockIndex(BlockLocation[] blkLocations, long offset) { for (int i = 0; i < blkLocations.length; i++) { // is the offset inside this block? if ((blkLocations[i].getOffset() <= offset) && (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) { return i; } } BlockLocation last = blkLocations[blkLocations.length - 1]; long fileLength = last.getOffset() + last.getLength() - 1; throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")"); } /** * Sets the given comma separated paths as the list of inputs for the BSP job. * * @param conf Configuration of the job * @param commaSeparatedPaths Comma separated paths to be set as the list of * inputs for the BSP job. */ public static void setInputPaths(BSPJob conf, String commaSeparatedPaths) { setInputPaths(conf, StringUtils.stringToPath(getPathStrings(commaSeparatedPaths))); } /** * Add the given comma separated paths to the list of inputs for the BSP job. * * @param conf The configuration of the job * @param commaSeparatedPaths Comma separated paths to be added to the list of * inputs for the BSP job. */ public static void addInputPaths(BSPJob conf, String commaSeparatedPaths) { for (String str : getPathStrings(commaSeparatedPaths)) { addInputPath(conf, new Path(str)); } } /** * Set the array of {@link Path}s as the list of inputs for the BSP job. * * @param conf Configuration of the job. * @param inputPaths the {@link Path}s of the input directories/files for the * BSP job. */ public static void setInputPaths(BSPJob conf, Path... inputPaths) { Path path = new Path(conf.getWorkingDirectory(), inputPaths[0]); StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString())); for (int i = 1; i < inputPaths.length; i++) { str.append(StringUtils.COMMA_STR); path = new Path(conf.getWorkingDirectory(), inputPaths[i]); str.append(StringUtils.escapeString(path.toString())); } conf.set("bsp.input.dir", str.toString()); } /** * Add a {@link Path} to the list of inputs for the BSP job. * * @param conf The configuration of the job * @param p {@link Path} to be addaiaied to the list of inputs for the BSP * job. */ public static void addInputPath(BSPJob conf, Path p) { Path path = new Path(conf.getWorkingDirectory(), p); String dirStr = StringUtils.escapeString(path.toString()); String dirs = conf.get("bsp.input.dir"); conf.set("bsp.input.dir", dirs == null ? dirStr : dirs + StringUtils.COMMA_STR + dirStr); } // This method escapes commas in the glob pattern of the given paths. private static String[] getPathStrings(String commaSeparatedPaths) { int length = commaSeparatedPaths.length(); int curlyOpen = 0; int pathStart = 0; boolean globPattern = false; List<String> pathStrings = new ArrayList<String>(); for (int i = 0; i < length; i++) { char ch = commaSeparatedPaths.charAt(i); switch (ch) { case '{': { curlyOpen++; if (!globPattern) { globPattern = true; } break; } case '}': { curlyOpen--; if (curlyOpen == 0 && globPattern) { globPattern = false; } break; } case ',': { if (!globPattern) { pathStrings.add(commaSeparatedPaths.substring(pathStart, i)); pathStart = i + 1; } break; } } } pathStrings.add(commaSeparatedPaths.substring(pathStart, length)); return pathStrings.toArray(new String[pathStrings.size()]); } /** * Get the list of input {@link Path}s for the BSP job. * * @param conf The configuration of the job * @return the list of input {@link Path}s for the BSP job. */ public static Path[] getInputPaths(BSPJob conf) { String dirs = conf.getConfiguration().get("bsp.input.dir", ""); String[] list = StringUtils.split(dirs); Path[] result = new Path[list.length]; for (int i = 0; i < list.length; i++) { result[i] = new Path(StringUtils.unEscapeString(list[i])); } return result; } private static void sortInDescendingOrder(List<NodeInfo> mylist) { Collections.sort(mylist, new Comparator<NodeInfo>() { @Override public int compare(NodeInfo obj1, NodeInfo obj2) { if (obj1 == null || obj2 == null) return -1; if (obj1.getValue() == obj2.getValue()) { return 0; } else { return ((obj1.getValue() < obj2.getValue()) ? 1 : -1); } } }); } /** * This function identifies and returns the hosts that contribute most for a * given split. For calculating the contribution, rack locality is treated on * par with host locality, so hosts from racks that contribute the most are * preferred over hosts on racks that contribute less * * @param blkLocations The list of block locations * @param offset * @param pSplitSize * @return array of hosts that contribute most to this split * @throws IOException */ protected String[] getSplitHosts(BlockLocation[] blkLocations, long offset, long pSplitSize, NetworkTopology clusterMap) throws IOException { long splitSize = pSplitSize; int startIndex = getBlockIndex(blkLocations, offset); long bytesInThisBlock = blkLocations[startIndex].getOffset() + blkLocations[startIndex].getLength() - offset; // If this is the only block, just return if (bytesInThisBlock >= splitSize) { return blkLocations[startIndex].getHosts(); } long bytesInFirstBlock = bytesInThisBlock; int index = startIndex + 1; splitSize -= bytesInThisBlock; while (splitSize > 0) { bytesInThisBlock = Math.min(splitSize, blkLocations[index++].getLength()); splitSize -= bytesInThisBlock; } long bytesInLastBlock = bytesInThisBlock; int endIndex = index - 1; Map<Node, NodeInfo> hostsMap = new IdentityHashMap<Node, NodeInfo>(); Map<Node, NodeInfo> racksMap = new IdentityHashMap<Node, NodeInfo>(); String[] allTopos = new String[0]; // Build the hierarchy and aggregate the contribution of // bytes at each level. See TestGetSplitHosts.java for (index = startIndex; index <= endIndex; index++) { // Establish the bytes in this block if (index == startIndex) { bytesInThisBlock = bytesInFirstBlock; } else if (index == endIndex) { bytesInThisBlock = bytesInLastBlock; } else { bytesInThisBlock = blkLocations[index].getLength(); } allTopos = blkLocations[index].getTopologyPaths(); // If no topology information is available, just // prefix a fakeRack if (allTopos.length == 0) { allTopos = fakeRacks(blkLocations, index); } // NOTE: This code currently works only for one level of // hierarchy (rack/host). However, it is relatively easy // to extend this to support aggregation at different // levels for (String topo : allTopos) { Node node, parentNode; NodeInfo nodeInfo, parentNodeInfo; node = clusterMap.getNode(topo); if (node == null) { node = new NodeBase(topo); clusterMap.add(node); } nodeInfo = hostsMap.get(node); if (nodeInfo == null) { nodeInfo = new NodeInfo(node); hostsMap.put(node, nodeInfo); parentNode = node.getParent(); parentNodeInfo = racksMap.get(parentNode); if (parentNodeInfo == null) { parentNodeInfo = new NodeInfo(parentNode); racksMap.put(parentNode, parentNodeInfo); } parentNodeInfo.addLeaf(nodeInfo); } else { nodeInfo = hostsMap.get(node); parentNode = node.getParent(); parentNodeInfo = racksMap.get(parentNode); } nodeInfo.addValue(index, bytesInThisBlock); parentNodeInfo.addValue(index, bytesInThisBlock); } // for all topos } // for all indices return identifyHosts(allTopos.length, racksMap); } private static String[] identifyHosts(int replicationFactor, Map<Node, NodeInfo> racksMap) { String[] retVal = new String[replicationFactor]; List<NodeInfo> rackList = new LinkedList<NodeInfo>(); rackList.addAll(racksMap.values()); // Sort the racks based on their contribution to this split sortInDescendingOrder(rackList); boolean done = false; int index = 0; // Get the host list for all our aggregated items, sort // them and return the top entries for (NodeInfo ni : rackList) { Set<NodeInfo> hostSet = ni.getLeaves(); List<NodeInfo> hostList = new LinkedList<NodeInfo>(); hostList.addAll(hostSet); // Sort the hosts in this rack based on their contribution sortInDescendingOrder(hostList); for (NodeInfo host : hostList) { // Strip out the port number from the host name retVal[index++] = host.node.getName().split(":")[0]; if (index == replicationFactor) { done = true; break; } } if (done == true) { break; } } return retVal; } private static String[] fakeRacks(BlockLocation[] blkLocations, int index) throws IOException { String[] allHosts = blkLocations[index].getHosts(); String[] allTopos = new String[allHosts.length]; for (int i = 0; i < allHosts.length; i++) { allTopos[i] = NetworkTopology.DEFAULT_RACK + "/" + allHosts[i]; } return allTopos; } private static class NodeInfo { final Node node; final Set<Integer> blockIds; final Set<NodeInfo> leaves; private long value; NodeInfo(Node node) { this.node = node; blockIds = new HashSet<Integer>(); leaves = new HashSet<NodeInfo>(); } long getValue() { return value; } void addValue(int blockIndex, long value) { if (blockIds.add(blockIndex) == true) { this.value += value; } } Set<NodeInfo> getLeaves() { return leaves; } void addLeaf(NodeInfo nodeInfo) { leaves.add(nodeInfo); } } }