/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.metastore;

import java.io.IOException;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ReplChangeManager {
  private static final Logger LOG = LoggerFactory.getLogger(ReplChangeManager.class);
  private static ReplChangeManager instance;

  private static boolean inited = false;
  private static boolean enabled = false;
  private static Path cmroot;
  private static HiveConf hiveConf;
  private String msUser;
  private String msGroup;
  private FileSystem fs;

  private static final String ORIG_LOC_TAG = "user.original-loc";
  static final String REMAIN_IN_TRASH_TAG = "user.remain-in-trash";
  private static final String URI_FRAGMENT_SEPARATOR = "#";

  public enum RecycleType {
    MOVE,
    COPY
  }

  public static ReplChangeManager getInstance(HiveConf hiveConf) throws MetaException {
    if (instance == null) {
      instance = new ReplChangeManager(hiveConf);
    }
    return instance;
  }

  private ReplChangeManager(HiveConf hiveConf) throws MetaException {
    try {
      if (!inited) {
        if (hiveConf.getBoolVar(HiveConf.ConfVars.REPLCMENABLED)) {
          ReplChangeManager.enabled = true;
          ReplChangeManager.cmroot = new Path(hiveConf.get(HiveConf.ConfVars.REPLCMDIR.varname));
          ReplChangeManager.hiveConf = hiveConf;

          fs = cmroot.getFileSystem(hiveConf);
          // Create cmroot with permission 700 if it does not exist
          if (!fs.exists(cmroot)) {
            fs.mkdirs(cmroot);
            fs.setPermission(cmroot, new FsPermission("700"));
          }
          UserGroupInformation usergroupInfo = UserGroupInformation.getCurrentUser();
          msUser = usergroupInfo.getShortUserName();
          msGroup = usergroupInfo.getPrimaryGroupName();
        }
        inited = true;
      }
    } catch (IOException e) {
      throw new MetaException(StringUtils.stringifyException(e));
    }
  }
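  // Illustrative sketch (not part of the original class): how a caller might obtain
  // the singleton. The cmroot path below is a made-up example; CM must be enabled
  // via REPLCMENABLED and a cmroot directory configured via REPLCMDIR beforehand.
  private static ReplChangeManager exampleGetInstance() throws MetaException {
    HiveConf conf = new HiveConf();
    conf.setBoolVar(HiveConf.ConfVars.REPLCMENABLED, true);
    conf.set(HiveConf.ConfVars.REPLCMDIR.varname, "/user/hive/cmroot");
    // The first call constructs the manager and creates cmroot (permission 700)
    // if absent; subsequent calls return the cached instance
    return ReplChangeManager.getInstance(conf);
  }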
  // Filter out files that start with ".". Note Hadoop considers files starting with
  // "." or "_" hidden. However, we need to replicate files starting with "_",
  // for which we found at least 2 use cases:
  // 1. For har files, _index and _masterindex are required files
  // 2. The _SUCCESS file is required for Oozie to indicate availability of a data source
  private static final PathFilter hiddenFileFilter = new PathFilter() {
    public boolean accept(Path p) {
      return !p.getName().startsWith(".");
    }
  };

  /**
   * Move a path into cmroot. If the path is a directory (of a partition, or of a table if
   * non-partitioned), recursively move the files inside the directory to cmroot. Note the
   * table must be a managed table.
   * @param path a single file or directory
   * @param type whether the files are to be copied or moved to cmroot.
   *             Copy is costly but preserves the source files
   * @param ifPurge if true, the source files skip Trash when moved/deleted.
   *                Only consulted when type is MOVE
   * @return the number of files recycled
   * @throws MetaException
   */
  int recycle(Path path, RecycleType type, boolean ifPurge) throws MetaException {
    if (!enabled) {
      return 0;
    }

    try {
      int count = 0;

      if (fs.isDirectory(path)) {
        FileStatus[] files = fs.listStatus(path, hiddenFileFilter);
        for (FileStatus file : files) {
          count += recycle(file.getPath(), type, ifPurge);
        }
      } else {
        String fileCheckSum = checksumFor(path, fs);
        Path cmPath = getCMPath(hiveConf, fileCheckSum);

        // Set the timestamp before moving to cmroot, so we avoid a race where the
        // CM cleaner removes the file before the timestamp is set
        long now = System.currentTimeMillis();
        fs.setTimes(path, now, now);

        boolean success = false;
        if (fs.exists(cmPath) && fileCheckSum.equalsIgnoreCase(checksumFor(cmPath, fs))) {
          // If a file with the same checksum already exists in cmPath, just skip the
          // copy/move. Also mark the operation unsuccessful to signal that a file with
          // the same name already exists, which ensures the timestamp of cmPath is
          // updated to avoid clean-up by the CM cleaner.
          success = false;
        } else {
          switch (type) {
          case MOVE: {
            if (LOG.isDebugEnabled()) {
              LOG.debug("Moving {} to {}", path.toString(), cmPath.toString());
            }
            // Rename fails if a file with the same name already exists
            success = fs.rename(path, cmPath);
            break;
          }
          case COPY: {
            if (LOG.isDebugEnabled()) {
              LOG.debug("Copying {} to {}", path.toString(), cmPath.toString());
            }
            // It is possible that a file with the same checksum exists in cmPath but its
            // content is partially copied or corrupted. In this case, just overwrite the
            // existing file with the new one
            success = FileUtils.copy(fs, path, fs, cmPath, false, true, hiveConf);
            break;
          }
          default:
            // Operation fails as invalid input
            break;
          }
        }

        // Ignore if a file with the same content already exists in cmroot
        // We might want to setXAttr for the new location in the future
        if (success) {
          // Set the file owner to hive (or the id the metastore runs as)
          fs.setOwner(cmPath, msUser, msGroup);

          // Tag the original file name so we know where the file comes from.
          // Note we currently only track the last known trace, as xattrs have limited
          // capacity. We shall revisit and store all original locations if orig-loc
          // becomes important
          try {
            fs.setXAttr(cmPath, ORIG_LOC_TAG, path.toString().getBytes());
          } catch (UnsupportedOperationException e) {
            LOG.warn("Error setting xattr for {}", path.toString());
          }

          count++;
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("A file with the same content of {} already exists, ignore", path.toString());
          }
          // Need to extend the tenancy if we saw a newer file with the same content
          fs.setTimes(cmPath, now, now);
        }

        // Tag if we want the file to remain in Trash after deletion.
        // If multiple files share the same content, any file claiming remain-in-trash
        // is granted it
        if ((type == RecycleType.MOVE) && !ifPurge) {
          try {
            fs.setXAttr(cmPath, REMAIN_IN_TRASH_TAG, new byte[] { 0 });
          } catch (UnsupportedOperationException e) {
            LOG.warn("Error setting xattr for {}", cmPath.toString());
          }
        }
      }
      return count;
    } catch (IOException e) {
      throw new MetaException(StringUtils.stringifyException(e));
    }
  }
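  // Illustrative sketch (not part of the original class): recycling a dropped
  // managed-table directory. The path is a made-up example. MOVE consumes the
  // source but is cheap; COPY preserves it at extra cost. With ifPurge == false,
  // files are tagged REMAIN_IN_TRASH_TAG so the CM cleaner later moves them to
  // Trash instead of deleting them outright.
  private static void exampleRecycle(ReplChangeManager cm) throws MetaException {
    Path droppedTableDir = new Path("/warehouse/db1.db/t1");
    int count = cm.recycle(droppedTableDir, RecycleType.MOVE, false);
    LOG.debug("Recycled {} files under {}", count, droppedTableDir);
  }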
  // Get the checksum of a file
  public static String checksumFor(Path path, FileSystem fs) throws IOException {
    // TODO: fs checksum is only available on hdfs; need to find a solution
    // for other filesystems (eg, local fs, s3, etc)
    String checksumString = null;
    FileChecksum checksum = fs.getFileChecksum(path);
    if (checksum != null) {
      checksumString = StringUtils.byteToHexString(checksum.getBytes(), 0, checksum.getLength());
    }
    return checksumString;
  }

  public static void setCmRoot(Path cmRoot) {
    ReplChangeManager.cmroot = cmRoot;
  }

  /**
   * Convert the path of a file inside a partition or table (if non-partitioned)
   * to a deterministic location under cmroot, so the user can retrieve the file
   * with the original location plus the checksum.
   * @param conf
   * @param checkSum checksum of the file, can be retrieved by {@link #checksumFor(Path, FileSystem)}
   * @return Path
   */
  static Path getCMPath(Configuration conf, String checkSum) throws IOException, MetaException {
    String newFileName = checkSum;
    int maxLength = conf.getInt(DFSConfigKeys.DFS_NAMENODE_MAX_COMPONENT_LENGTH_KEY,
        DFSConfigKeys.DFS_NAMENODE_MAX_COMPONENT_LENGTH_DEFAULT);

    if (newFileName.length() > maxLength) {
      newFileName = newFileName.substring(0, maxLength - 1);
    }

    return new Path(cmroot, newFileName);
  }

  /**
   * Get the original file specified by src and checksumString. If the file exists and its
   * checksum matches, return the file; otherwise, use checksumString to retrieve it from cmroot.
   * @param src original file location
   * @param checksumString checksum of the original file
   * @param hiveConf
   * @return the corresponding FileStatus object
   */
  public static FileStatus getFileStatus(Path src, String checksumString,
      HiveConf hiveConf) throws MetaException {
    try {
      FileSystem srcFs = src.getFileSystem(hiveConf);
      if (checksumString == null) {
        return srcFs.getFileStatus(src);
      }

      if (!srcFs.exists(src)) {
        return srcFs.getFileStatus(getCMPath(hiveConf, checksumString));
      }

      String currentChecksumString = checksumFor(src, srcFs);
      if (currentChecksumString == null || checksumString.equals(currentChecksumString)) {
        return srcFs.getFileStatus(src);
      } else {
        return srcFs.getFileStatus(getCMPath(hiveConf, checksumString));
      }
    } catch (IOException e) {
      throw new MetaException(StringUtils.stringifyException(e));
    }
  }

  /**
   * Concatenate the filename and checksum with "#"
   * @param fileUriStr filename string
   * @param fileChecksum checksum string
   * @return concatenated URI string
   */
  // TODO: this needs to be enhanced once a change-management-based filesystem is implemented
  // Currently using fileuri#checksum as the format
  public static String encodeFileUri(String fileUriStr, String fileChecksum) {
    if (fileChecksum != null) {
      return fileUriStr + URI_FRAGMENT_SEPARATOR + fileChecksum;
    } else {
      return fileUriStr;
    }
  }

  /**
   * Split a URI with fragment into the file URI and the checksum
   * @param fileURIStr URI with fragment
   * @return array of file name and checksum
   */
  public static String[] getFileWithChksumFromURI(String fileURIStr) {
    String[] uriAndFragment = fileURIStr.split(URI_FRAGMENT_SEPARATOR);
    String[] result = new String[2];
    result[0] = uriAndFragment[0];
    if (uriAndFragment.length > 1) {
      result[1] = uriAndFragment[1];
    }
    return result;
  }
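  // Illustrative sketch (not part of the original class): round-tripping a file URI
  // and its checksum through the "#" fragment encoding. Values are made up.
  private static void exampleEncodeDecode() {
    String encoded = encodeFileUri("hdfs://ns1/warehouse/db1.db/t1/000000_0", "abc123");
    // encoded is "hdfs://ns1/warehouse/db1.db/t1/000000_0#abc123"
    String[] fileAndChecksum = getFileWithChksumFromURI(encoded);
    // fileAndChecksum[0] is the file URI, fileAndChecksum[1] is "abc123";
    // for a URI without a fragment, fileAndChecksum[1] stays null
    LOG.debug("Decoded {} with checksum {}", fileAndChecksum[0], fileAndChecksum[1]);
  }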
  public static boolean isCMFileUri(Path fromPath, FileSystem srcFs) {
    String[] result = getFileWithChksumFromURI(fromPath.toString());
    return result[1] != null;
  }

  /**
   * Thread to clear old files of cmroot recursively
   */
  static class CMClearer implements Runnable {
    private Path cmroot;
    private long secRetain;
    private HiveConf hiveConf;

    CMClearer(String cmrootString, long secRetain, HiveConf hiveConf) {
      this.cmroot = new Path(cmrootString);
      this.secRetain = secRetain;
      this.hiveConf = hiveConf;
    }

    @Override
    public void run() {
      try {
        LOG.info("CMClearer started");

        long now = System.currentTimeMillis();
        FileSystem fs = cmroot.getFileSystem(hiveConf);
        FileStatus[] files = fs.listStatus(cmroot);

        for (FileStatus file : files) {
          long modifiedTime = file.getModificationTime();
          if (now - modifiedTime > secRetain * 1000) {
            try {
              if (fs.getXAttrs(file.getPath()).containsKey(REMAIN_IN_TRASH_TAG)) {
                boolean succ = Trash.moveToAppropriateTrash(fs, file.getPath(), hiveConf);
                if (succ) {
                  if (LOG.isDebugEnabled()) {
                    LOG.debug("Moved " + file.toString() + " to trash");
                  }
                } else {
                  LOG.warn("Failed to move " + file.toString() + " to trash");
                }
              } else {
                boolean succ = fs.delete(file.getPath(), false);
                if (succ) {
                  if (LOG.isDebugEnabled()) {
                    LOG.debug("Removed " + file.toString());
                  }
                } else {
                  LOG.warn("Failed to remove " + file.toString());
                }
              }
            } catch (UnsupportedOperationException e) {
              LOG.warn("Error getting xattr for " + file.getPath().toString());
            }
          }
        }
      } catch (IOException e) {
        LOG.error("Exception when clearing cmroot:" + StringUtils.stringifyException(e));
      }
    }
  }

  // Schedule the CMClearer thread. Invoked by the metastore.
  static void scheduleCMClearer(HiveConf hiveConf) {
    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.REPLCMENABLED)) {
      ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor(
          new BasicThreadFactory.Builder()
              .namingPattern("cmclearer-%d")
              .daemon(true)
              .build());
      executor.scheduleAtFixedRate(new CMClearer(hiveConf.get(HiveConf.ConfVars.REPLCMDIR.varname),
          hiveConf.getTimeVar(ConfVars.REPLCMRETIAN, TimeUnit.SECONDS), hiveConf),
          0, hiveConf.getTimeVar(ConfVars.REPLCMINTERVAL, TimeUnit.SECONDS), TimeUnit.SECONDS);
    }
  }
}
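// Illustrative sketch (not part of the original file): a minimal driver showing how
// a metastore process might wire up the CM cleaner at startup. The cmroot path and
// the retention/interval values are made-up examples; getTimeVar parses suffixed
// durations such as "24h" and "1h".
class ReplChangeManagerExample {
  public static void main(String[] args) throws Exception {
    HiveConf conf = new HiveConf();
    conf.setBoolVar(HiveConf.ConfVars.REPLCMENABLED, true);
    conf.set(HiveConf.ConfVars.REPLCMDIR.varname, "/user/hive/cmroot");
    conf.set(HiveConf.ConfVars.REPLCMRETIAN.varname, "24h");  // keep recycled files a day
    conf.set(HiveConf.ConfVars.REPLCMINTERVAL.varname, "1h"); // sweep cmroot hourly
    ReplChangeManager.getInstance(conf);       // initialize cmroot and its permissions
    ReplChangeManager.scheduleCMClearer(conf); // start the daemon cleaner thread
  }
}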