/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.server.util;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.data.impl.KeyExtent;
import org.apache.accumulo.core.file.FileOperations;
import org.apache.accumulo.core.file.FileSKVIterator;
import org.apache.accumulo.core.file.FileSKVWriter;
import org.apache.accumulo.core.file.rfile.RFile;
import org.apache.accumulo.core.file.rfile.RFileOperations;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.system.MultiIterator;
import org.apache.accumulo.core.util.CachedConfiguration;
import org.apache.accumulo.core.util.LocalityGroupUtil;
import org.apache.accumulo.core.volume.Volume;
import org.apache.accumulo.server.ServerConstants;
import org.apache.accumulo.server.fs.FileRef;
import org.apache.accumulo.server.fs.VolumeManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileUtil {

  public static class FileInfo {
    Key firstKey = new Key();
    Key lastKey = new Key();

    public FileInfo(Key firstKey, Key lastKey) {
      this.firstKey = firstKey;
      this.lastKey = lastKey;
    }

    public Text getFirstRow() {
      return firstKey.getRow();
    }

    public Text getLastRow() {
      return lastKey.getRow();
    }
  }

  private static final Logger log = LoggerFactory.getLogger(FileUtil.class);

  private static Path createTmpDir(AccumuloConfiguration acuConf, VolumeManager fs)
      throws IOException {
    String accumuloDir = fs.choose(Optional.<String>empty(), ServerConstants.getBaseUris());

    Path result = null;
    while (result == null) {
      result = new Path(accumuloDir + Path.SEPARATOR + "tmp/idxReduce_"
          + String.format("%09d", new Random().nextInt(Integer.MAX_VALUE)));

      try {
        fs.getFileStatus(result);
        result = null;
        continue;
      } catch (FileNotFoundException fne) {
        // found an unused temp directory
      }

      fs.mkdirs(result);

      // try to reserve the tmp dir
      // In some versions of hadoop, two clients concurrently trying to create the same directory
      // might both return true. Creating a file is not subject to this, so create a special file
      // to make sure we solely will use this directory
      if (!fs.createNewFile(new Path(result, "__reserve")))
        result = null;
    }
    return result;
  }
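
  // --- Illustrative sketch, not part of the original class ---
  // The reservation idiom used at the end of createTmpDir above, shown in
  // isolation against a plain Hadoop FileSystem. FileSystem.createNewFile is
  // atomic, so when several clients race through mkdirs (which may report
  // success for more than one of them), exactly one caller sees true here and
  // "owns" the directory. The helper name is hypothetical.
  private static boolean tryReserveDir(FileSystem fileSystem, Path dir) throws IOException {
    fileSystem.mkdirs(dir); // may appear to succeed for multiple racing clients
    return fileSystem.createNewFile(new Path(dir, "__reserve")); // atomic: only one winner
  }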
  public static Collection<String> reduceFiles(AccumuloConfiguration acuConf, Configuration conf,
      VolumeManager fs, Text prevEndRow, Text endRow, Collection<String> mapFiles, int maxFiles,
      Path tmpDir, int pass) throws IOException {
    ArrayList<String> paths = new ArrayList<>(mapFiles);

    if (paths.size() <= maxFiles)
      return paths;

    String newDir = String.format("%s/pass_%04d", tmpDir, pass);

    int start = 0;

    ArrayList<String> outFiles = new ArrayList<>();

    int count = 0;

    while (start < paths.size()) {
      int end = Math.min(maxFiles + start, paths.size());
      List<String> inFiles = paths.subList(start, end);

      start = end;

      String newMapFile = String.format("%s/%04d.%s", newDir, count++, RFile.EXTENSION);

      outFiles.add(newMapFile);
      FileSystem ns = fs.getVolumeByPath(new Path(newMapFile)).getFileSystem();
      FileSKVWriter writer = new RFileOperations().newWriterBuilder()
          .forFile(newMapFile, ns, ns.getConf()).withTableConfiguration(acuConf).build();
      writer.startDefaultLocalityGroup();
      List<SortedKeyValueIterator<Key,Value>> iters = new ArrayList<>(inFiles.size());

      FileSKVIterator reader = null;
      try {
        for (String s : inFiles) {
          ns = fs.getVolumeByPath(new Path(s)).getFileSystem();
          reader = FileOperations.getInstance().newIndexReaderBuilder().forFile(s, ns, ns.getConf())
              .withTableConfiguration(acuConf).build();
          iters.add(reader);
        }

        MultiIterator mmfi = new MultiIterator(iters, true);

        while (mmfi.hasTop()) {
          Key key = mmfi.getTopKey();

          boolean gtPrevEndRow = prevEndRow == null || key.compareRow(prevEndRow) > 0;
          boolean lteEndRow = endRow == null || key.compareRow(endRow) <= 0;

          if (gtPrevEndRow && lteEndRow)
            writer.append(key, new Value(new byte[0]));

          if (!lteEndRow)
            break;

          mmfi.next();
        }

      } finally {
        try {
          if (reader != null)
            reader.close();
        } catch (IOException e) {
          log.error("{}", e.getMessage(), e);
        }

        for (SortedKeyValueIterator<Key,Value> r : iters)
          try {
            if (r != null)
              ((FileSKVIterator) r).close();
          } catch (IOException e) {
            // continue closing
            log.error("{}", e.getMessage(), e);
          }

        try {
          writer.close();
        } catch (IOException e) {
          log.error("{}", e.getMessage(), e);
          throw e;
        }
      }
    }

    return reduceFiles(acuConf, conf, fs, prevEndRow, endRow, outFiles, maxFiles, tmpDir, pass + 1);
  }

  public static SortedMap<Double,Key> findMidPoint(VolumeManager fs, AccumuloConfiguration acuConf,
      Text prevEndRow, Text endRow, Collection<String> mapFiles, double minSplit)
      throws IOException {
    return findMidPoint(fs, acuConf, prevEndRow, endRow, mapFiles, minSplit, true);
  }
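
  // --- Illustrative sketch, not part of the original class ---
  // A minimal way to drive reduceFiles: when more files exist than can be
  // opened at once, merge their index entries in passes of at most maxToOpen
  // files until few enough remain. Null row bounds mean "unrestricted". The
  // helper name is hypothetical, and a real caller is responsible for removing
  // tmpDir afterwards (see cleanupIndexOp below).
  private static Collection<String> reduceExample(AccumuloConfiguration acuConf, Configuration conf,
      VolumeManager fs, Collection<String> files, int maxToOpen) throws IOException {
    if (files.size() <= maxToOpen)
      return files; // already small enough; nothing to reduce
    Path tmpDir = createTmpDir(acuConf, fs);
    return reduceFiles(acuConf, conf, fs, null, null, files, maxToOpen, tmpDir, 0);
  }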
  public static double estimatePercentageLTE(VolumeManager fs, AccumuloConfiguration acuconf,
      Text prevEndRow, Text endRow, Collection<String> mapFiles, Text splitRow)
      throws IOException {

    Configuration conf = CachedConfiguration.getInstance();

    Path tmpDir = null;

    int maxToOpen = acuconf.getCount(Property.TSERV_TABLET_SPLIT_FINDMIDPOINT_MAXOPEN);
    ArrayList<FileSKVIterator> readers = new ArrayList<>(mapFiles.size());

    try {
      if (mapFiles.size() > maxToOpen) {
        tmpDir = createTmpDir(acuconf, fs);

        log.debug("Too many indexes (" + mapFiles.size() + ") to open at once for " + endRow + " "
            + prevEndRow + ", reducing in tmpDir = " + tmpDir);

        long t1 = System.currentTimeMillis();
        mapFiles = reduceFiles(acuconf, conf, fs, prevEndRow, endRow, mapFiles, maxToOpen, tmpDir,
            0);
        long t2 = System.currentTimeMillis();

        log.debug("Finished reducing indexes for " + endRow + " " + prevEndRow + " in "
            + String.format("%6.2f secs", (t2 - t1) / 1000.0));
      }

      if (prevEndRow == null)
        prevEndRow = new Text();

      long numKeys = 0;

      numKeys = countIndexEntries(acuconf, prevEndRow, endRow, mapFiles, true, conf, fs, readers);

      if (numKeys == 0) {
        // not enough info in the index to answer the question, so instead of going to
        // the data just punt and return .5
        return .5;
      }

      List<SortedKeyValueIterator<Key,Value>> iters = new ArrayList<>(readers);
      MultiIterator mmfi = new MultiIterator(iters, true);

      // skip the prevEndRow
      while (mmfi.hasTop() && mmfi.getTopKey().compareRow(prevEndRow) <= 0) {
        mmfi.next();
      }

      int numLte = 0;

      while (mmfi.hasTop() && mmfi.getTopKey().compareRow(splitRow) <= 0) {
        numLte++;
        mmfi.next();
      }

      if (numLte > numKeys) {
        // something went wrong
        throw new RuntimeException("numLte > numKeys " + numLte + " " + numKeys + " " + prevEndRow
            + " " + endRow + " " + splitRow + " " + mapFiles);
      }

      // do not want to return 0% or 100%, so add 1 and 2 below
      return (numLte + 1) / (double) (numKeys + 2);

    } finally {
      cleanupIndexOp(tmpDir, fs, readers);
    }
  }
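
  // --- Illustrative sketch, not part of the original class ---
  // The smoothing used by estimatePercentageLTE above, factored out for
  // clarity. Adding 1 to the numerator and 2 to the denominator (a
  // Laplace-style correction) keeps the estimate strictly between 0 and 1:
  // for example, 0 of 10 entries yields 1/12 (about 0.083) rather than 0.
  // The helper name is hypothetical.
  private static double smoothedFraction(long numLte, long numKeys) {
    return (numLte + 1) / (double) (numKeys + 2);
  }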
  /**
   * @param mapFiles
   *          the map files in which to find the mid point key
   *
   *          ISSUES: This method uses the index files to find the mid point. If the map files have
   *          different index intervals, this method will not return an accurate mid point. Also, it
   *          would be tricky to use this method in conjunction with an in-memory map because the
   *          indexing interval is unknown.
   */
  public static SortedMap<Double,Key> findMidPoint(VolumeManager fs, AccumuloConfiguration acuConf,
      Text prevEndRow, Text endRow, Collection<String> mapFiles, double minSplit, boolean useIndex)
      throws IOException {
    Configuration conf = CachedConfiguration.getInstance();

    Collection<String> origMapFiles = mapFiles;

    Path tmpDir = null;

    int maxToOpen = acuConf.getCount(Property.TSERV_TABLET_SPLIT_FINDMIDPOINT_MAXOPEN);
    ArrayList<FileSKVIterator> readers = new ArrayList<>(mapFiles.size());

    try {
      if (mapFiles.size() > maxToOpen) {
        if (!useIndex)
          throw new IOException(
              "Cannot find mid point using data files, too many " + mapFiles.size());
        tmpDir = createTmpDir(acuConf, fs);

        log.debug("Too many indexes (" + mapFiles.size() + ") to open at once for " + endRow + " "
            + prevEndRow + ", reducing in tmpDir = " + tmpDir);

        long t1 = System.currentTimeMillis();
        mapFiles = reduceFiles(acuConf, conf, fs, prevEndRow, endRow, mapFiles, maxToOpen, tmpDir,
            0);
        long t2 = System.currentTimeMillis();

        log.debug("Finished reducing indexes for " + endRow + " " + prevEndRow + " in "
            + String.format("%6.2f secs", (t2 - t1) / 1000.0));
      }

      if (prevEndRow == null)
        prevEndRow = new Text();

      long t1 = System.currentTimeMillis();

      long numKeys = 0;

      numKeys = countIndexEntries(acuConf, prevEndRow, endRow, mapFiles,
          tmpDir == null ? useIndex : false, conf, fs, readers);

      if (numKeys == 0) {
        if (useIndex) {
          log.warn("Failed to find mid point using indexes, falling back to data files which is"
              + " slower. No entries between " + prevEndRow + " and " + endRow + " for "
              + mapFiles);
          // need to pass original map files, not possibly reduced indexes
          return findMidPoint(fs, acuConf, prevEndRow, endRow, origMapFiles, minSplit, false);
        }
        throw new IOException("Failed to find mid point, no entries between " + prevEndRow + " and "
            + endRow + " for " + mapFiles);
      }

      List<SortedKeyValueIterator<Key,Value>> iters = new ArrayList<>(readers);
      MultiIterator mmfi = new MultiIterator(iters, true);

      // skip the prevEndRow
      while (mmfi.hasTop() && mmfi.getTopKey().compareRow(prevEndRow) <= 0)
        mmfi.next();

      // read half of the keys in the index
      TreeMap<Double,Key> ret = new TreeMap<>();

      Key lastKey = null;
      long keysRead = 0;

      Key keyBeforeMidPoint = null;
      long keyBeforeMidPointPosition = 0;

      while (keysRead < numKeys / 2) {
        if (lastKey != null && !lastKey.equals(mmfi.getTopKey(), PartialKey.ROW)
            && (keysRead - 1) / (double) numKeys >= minSplit) {
          keyBeforeMidPoint = new Key(lastKey);
          keyBeforeMidPointPosition = keysRead - 1;
        }

        if (lastKey == null)
          lastKey = new Key();

        lastKey.set(mmfi.getTopKey());

        keysRead++;

        // consume minimum
        mmfi.next();
      }

      if (keyBeforeMidPoint != null)
        ret.put(keyBeforeMidPointPosition / (double) numKeys, keyBeforeMidPoint);

      long t2 = System.currentTimeMillis();

      log.debug(String.format("Found midPoint from indexes in %6.2f secs.%n", (t2 - t1) / 1000.0));

      ret.put(.5, mmfi.getTopKey());

      // sanity check
      for (Key key : ret.values()) {
        boolean inRange =
            (key.compareRow(prevEndRow) > 0 && (endRow == null || key.compareRow(endRow) < 1));
        if (!inRange) {
          throw new IOException("Found mid point is not in range " + key + " " + prevEndRow + " "
              + endRow + " " + mapFiles);
        }
      }

      return ret;
    } finally {
      cleanupIndexOp(tmpDir, fs, readers);
    }
  }

  protected static void cleanupIndexOp(Path tmpDir, VolumeManager fs,
      ArrayList<FileSKVIterator> readers) throws IOException {
    // close all of the index sequence files
    for (FileSKVIterator r : readers) {
      try {
        if (r != null)
          r.close();
      } catch (IOException e) {
        // okay, try to close the rest anyway
        log.error("{}", e.getMessage(), e);
      }
    }

    if (tmpDir != null) {
      Volume v = fs.getVolumeByPath(tmpDir);
      if (v.getFileSystem().exists(tmpDir)) {
        fs.deleteRecursively(tmpDir);
        return;
      }

      log.error("Did not delete tmp dir because it wasn't a tmp dir " + tmpDir);
    }
  }
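
  // --- Illustrative sketch, not part of the original class ---
  // Consuming the map returned by findMidPoint above: its keys are the
  // estimated fraction of index entries falling before each candidate split
  // row, and the entry at .5 is always present when the call succeeds. The
  // helper name and the 0.25 minSplit threshold are hypothetical.
  private static Text chooseSplitRow(VolumeManager fs, AccumuloConfiguration acuConf,
      Text prevEndRow, Text endRow, Collection<String> files) throws IOException {
    SortedMap<Double,Key> candidates = findMidPoint(fs, acuConf, prevEndRow, endRow, files, 0.25);
    return candidates.get(.5).getRow(); // row of the approximate median index entry
  }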
  private static long countIndexEntries(AccumuloConfiguration acuConf, Text prevEndRow, Text endRow,
      Collection<String> mapFiles, boolean useIndex, Configuration conf, VolumeManager fs,
      ArrayList<FileSKVIterator> readers) throws IOException {

    long numKeys = 0;

    // count the total number of index entries
    for (String ref : mapFiles) {
      FileSKVIterator reader = null;
      Path path = new Path(ref);
      FileSystem ns = fs.getVolumeByPath(path).getFileSystem();
      try {
        if (useIndex)
          reader = FileOperations.getInstance().newIndexReaderBuilder()
              .forFile(path.toString(), ns, ns.getConf()).withTableConfiguration(acuConf).build();
        else
          reader = FileOperations.getInstance().newScanReaderBuilder()
              .forFile(path.toString(), ns, ns.getConf()).withTableConfiguration(acuConf)
              .overRange(new Range(prevEndRow, false, null, true), LocalityGroupUtil.EMPTY_CF_SET,
                  false)
              .build();

        while (reader.hasTop()) {
          Key key = reader.getTopKey();
          if (endRow != null && key.compareRow(endRow) > 0)
            break;
          else if (prevEndRow == null || key.compareRow(prevEndRow) > 0)
            numKeys++;

          reader.next();
        }
      } finally {
        try {
          if (reader != null)
            reader.close();
        } catch (IOException e) {
          log.error("{}", e.getMessage(), e);
        }
      }

      if (useIndex)
        readers.add(FileOperations.getInstance().newIndexReaderBuilder()
            .forFile(path.toString(), ns, ns.getConf()).withTableConfiguration(acuConf).build());
      else
        readers.add(FileOperations.getInstance().newScanReaderBuilder()
            .forFile(path.toString(), ns, ns.getConf()).withTableConfiguration(acuConf)
            .overRange(new Range(prevEndRow, false, null, true), LocalityGroupUtil.EMPTY_CF_SET,
                false)
            .build());
    }

    return numKeys;
  }

  public static Map<FileRef,FileInfo> tryToGetFirstAndLastRows(VolumeManager fs,
      AccumuloConfiguration acuConf, Set<FileRef> mapfiles) {

    HashMap<FileRef,FileInfo> mapFilesInfo = new HashMap<>();

    long t1 = System.currentTimeMillis();

    for (FileRef mapfile : mapfiles) {

      FileSKVIterator reader = null;
      FileSystem ns = fs.getVolumeByPath(mapfile.path()).getFileSystem();
      try {
        reader = FileOperations.getInstance().newReaderBuilder()
            .forFile(mapfile.toString(), ns, ns.getConf()).withTableConfiguration(acuConf).build();

        Key firstKey = reader.getFirstKey();
        if (firstKey != null) {
          mapFilesInfo.put(mapfile, new FileInfo(firstKey, reader.getLastKey()));
        }

      } catch (IOException ioe) {
        log.warn("Failed to read map file to determine first and last key : " + mapfile, ioe);
      } finally {
        if (reader != null) {
          try {
            reader.close();
          } catch (IOException ioe) {
            log.warn("failed to close " + mapfile, ioe);
          }
        }
      }

    }

    long t2 = System.currentTimeMillis();

    log.debug(String.format("Found first and last keys for %d map files in %6.2f secs",
        mapfiles.size(), (t2 - t1) / 1000.0));

    return mapFilesInfo;
  }

  public static WritableComparable<Key> findLastKey(VolumeManager fs,
      AccumuloConfiguration acuConf, Collection<FileRef> mapFiles) throws IOException {
    Key lastKey = null;

    for (FileRef ref : mapFiles) {
      Path path = ref.path();
      FileSystem ns = fs.getVolumeByPath(path).getFileSystem();
      FileSKVIterator reader = FileOperations.getInstance().newReaderBuilder()
          .forFile(path.toString(), ns, ns.getConf()).withTableConfiguration(acuConf)
          .seekToBeginning().build();

      try {
        if (!reader.hasTop())
          // file is empty, so there is no last key
          continue;

        Key key = reader.getLastKey();

        if (lastKey == null || key.compareTo(lastKey) > 0)
          lastKey = key;
      } finally {
        try {
          if (reader != null)
            reader.close();
        } catch (IOException e) {
          log.error("{}", e.getMessage(), e);
        }
      }
    }

    return lastKey;
  }

  private static class MLong {
    public MLong(long i) {
      l = i;
    }

    long l;
  }
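
  // --- Illustrative sketch, not part of the original class ---
  // One way the FileInfo map from tryToGetFirstAndLastRows above can be used:
  // testing whether any file's [firstRow, lastRow] span intersects a row range
  // [start, end]. The helper name is hypothetical.
  private static boolean anyFileOverlaps(Map<FileRef,FileInfo> info, Text start, Text end) {
    for (FileInfo fi : info.values())
      if (fi.getLastRow().compareTo(start) >= 0 && fi.getFirstRow().compareTo(end) <= 0)
        return true;
    return false;
  }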
  public static Map<KeyExtent,Long> estimateSizes(AccumuloConfiguration acuConf, Path mapFile,
      long fileSize, List<KeyExtent> extents, Configuration conf, VolumeManager fs)
      throws IOException {

    long totalIndexEntries = 0;
    Map<KeyExtent,MLong> counts = new TreeMap<>();
    for (KeyExtent keyExtent : extents)
      counts.put(keyExtent, new MLong(0));

    Text row = new Text();

    FileSystem ns = fs.getVolumeByPath(mapFile).getFileSystem();
    FileSKVIterator index = FileOperations.getInstance().newIndexReaderBuilder()
        .forFile(mapFile.toString(), ns, ns.getConf()).withTableConfiguration(acuConf).build();

    try {
      while (index.hasTop()) {
        Key key = index.getTopKey();
        totalIndexEntries++;
        key.getRow(row);

        for (Entry<KeyExtent,MLong> entry : counts.entrySet())
          if (entry.getKey().contains(row))
            entry.getValue().l++;

        index.next();
      }
    } finally {
      try {
        if (index != null)
          index.close();
      } catch (IOException e) {
        // continue with next file
        log.error("{}", e.getMessage(), e);
      }
    }

    Map<KeyExtent,Long> results = new TreeMap<>();
    for (KeyExtent keyExtent : extents) {
      double numEntries = counts.get(keyExtent).l;
      if (numEntries == 0)
        numEntries = 1;
      long estSize = (long) ((numEntries / totalIndexEntries) * fileSize);
      results.put(keyExtent, estSize);
    }
    return results;
  }

  public static Collection<String> toPathStrings(Collection<FileRef> refs) {
    ArrayList<String> ret = new ArrayList<>();
    for (FileRef fileRef : refs) {
      ret.add(fileRef.path().toString());
    }

    return ret;
  }
}
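
// --- Illustrative sketch, not part of the original file ---
// A minimal end-to-end use of estimateSizes: apportioning a file's size across
// tablet extents in proportion to index-entry counts. The class name, file
// path, and file size below are hypothetical; only the FileUtil calls are real.
class FileUtilUsageExample {
  static Map<KeyExtent,Long> run(AccumuloConfiguration acuConf, VolumeManager fs,
      List<KeyExtent> extents) throws IOException {
    Path file = new Path("/accumulo/tables/t/default_tablet/F0000001.rf"); // hypothetical RFile
    long fileSize = 10L * 1024 * 1024; // hypothetical size in bytes
    // each extent's estimated share of the file, derived from index entries
    return FileUtil.estimateSizes(acuConf, file, fileSize, extents,
        CachedConfiguration.getInstance(), fs);
  }
}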