Java tutorial: HDFS utilities in Apache AsterixDB (HDFSUtils.java)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.asterix.external.util;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.asterix.common.config.DatasetConfig.ExternalFilePendingOp;
import org.apache.asterix.external.indexing.ExternalFile;
import org.apache.asterix.external.indexing.IndexingScheduler;
import org.apache.asterix.external.indexing.RecordId.RecordIdType;
import org.apache.asterix.external.input.stream.HDFSInputStream;
import org.apache.asterix.om.util.AsterixAppContextInfo;
import org.apache.asterix.om.util.AsterixClusterProperties;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint;
import org.apache.hyracks.api.context.ICCContext;
import org.apache.hyracks.api.exceptions.HyracksException;
import org.apache.hyracks.hdfs.scheduler.Scheduler;

public class HDFSUtils {

    public static Scheduler initializeHDFSScheduler() {
        ICCContext ccContext = AsterixAppContextInfo.getInstance().getCCApplicationContext().getCCContext();
        Scheduler scheduler = null;
        try {
            scheduler = new Scheduler(ccContext.getClusterControllerInfo().getClientNetAddress(),
                    ccContext.getClusterControllerInfo().getClientNetPort());
        } catch (HyracksException e) {
            throw new IllegalStateException("Cannot obtain hdfs scheduler");
        }
        return scheduler;
    }

    public static IndexingScheduler initializeIndexingHDFSScheduler() {
        ICCContext ccContext = AsterixAppContextInfo.getInstance().getCCApplicationContext().getCCContext();
        IndexingScheduler scheduler = null;
        try {
            scheduler = new IndexingScheduler(ccContext.getClusterControllerInfo().getClientNetAddress(),
                    ccContext.getClusterControllerInfo().getClientNetPort());
        } catch (HyracksException e) {
            throw new IllegalStateException("Cannot obtain hdfs scheduler");
        }
        return scheduler;
    }

    /**
     * Instead of creating the splits using the input format, we do it manually.
     * This function returns fileSplits (1 per HDFS file block) irrespective of the number of partitions,
     * and the produced splits only cover the intersection between the current files in HDFS and the files
     * stored internally in AsterixDB.
     * 1. NoOp means an appended file
     * 2. AddOp means a new file
     * 3. UpdateOp means the delta of a file
     *
     * @return an array of file splits, one per covered HDFS block
     * @throws IOException
     */
    public static InputSplit[] getSplits(JobConf conf, List<ExternalFile> files) throws IOException {
        // Create file system object
        FileSystem fs = FileSystem.get(conf);
        ArrayList<FileSplit> fileSplits = new ArrayList<FileSplit>();
        ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<ExternalFile>();
        // Create file splits
        for (ExternalFile file : files) {
            Path filePath = new Path(file.getFileName());
            FileStatus fileStatus;
            try {
                fileStatus = fs.getFileStatus(filePath);
            } catch (FileNotFoundException e) {
                // file was deleted at some point, skip to next file
                continue;
            }
            if (file.getPendingOp() == ExternalFilePendingOp.PENDING_ADD_OP
                    && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
                // Get its information from the HDFS name node
                BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
                // Create a split per block
                for (BlockLocation block : fileBlocks) {
                    if (block.getOffset() < file.getSize()) {
                        fileSplits.add(new FileSplit(filePath, block.getOffset(),
                                (block.getLength() + block.getOffset()) < file.getSize() ? block.getLength()
                                        : (file.getSize() - block.getOffset()),
                                block.getHosts()));
                        orderedExternalFiles.add(file);
                    }
                }
            } else if (file.getPendingOp() == ExternalFilePendingOp.PENDING_NO_OP
                    && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
                long oldSize = 0L;
                long newSize = file.getSize();
                for (int i = 0; i < files.size(); i++) {
                    // compare file names by value, not by reference
                    if (files.get(i).getFileName().equals(file.getFileName())
                            && files.get(i).getSize() != file.getSize()) {
                        newSize = files.get(i).getSize();
                        oldSize = file.getSize();
                        break;
                    }
                }
                // Get its information from the HDFS name node
                BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
                // Create a split per block
                for (BlockLocation block : fileBlocks) {
                    if (block.getOffset() + block.getLength() > oldSize) {
                        if (block.getOffset() < newSize) {
                            // Block intersects with the delta -> create a split
                            long startCut = (block.getOffset() > oldSize) ? 0L : oldSize - block.getOffset();
                            long endCut = (block.getOffset() + block.getLength() < newSize) ? 0L
                                    : block.getOffset() + block.getLength() - newSize;
                            long splitLength = block.getLength() - startCut - endCut;
                            fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut, splitLength,
                                    block.getHosts()));
                            orderedExternalFiles.add(file);
                        }
                    }
                }
            }
        }
        fs.close();
        files.clear();
        files.addAll(orderedExternalFiles);
        return fileSplits.toArray(new FileSplit[fileSplits.size()]);
    }

    public static String getInputFormatClassName(Map<String, String> configuration) {
        String inputFormatParameter = configuration.get(ExternalDataConstants.KEY_INPUT_FORMAT).trim();
        switch (inputFormatParameter) {
            case ExternalDataConstants.INPUT_FORMAT_TEXT:
                return ExternalDataConstants.CLASS_NAME_TEXT_INPUT_FORMAT;
            case ExternalDataConstants.INPUT_FORMAT_SEQUENCE:
                return ExternalDataConstants.CLASS_NAME_SEQUENCE_INPUT_FORMAT;
            case ExternalDataConstants.INPUT_FORMAT_RC:
                return ExternalDataConstants.CLASS_NAME_RC_INPUT_FORMAT;
            default:
                return inputFormatParameter;
        }
    }

    public static Class<?> getInputFormatClass(Map<String, String> configuration) throws ClassNotFoundException {
        String inputFormatParameter = configuration.get(ExternalDataConstants.KEY_INPUT_FORMAT).trim();
        switch (inputFormatParameter) {
            case ExternalDataConstants.INPUT_FORMAT_TEXT:
                return TextInputFormat.class;
            case ExternalDataConstants.INPUT_FORMAT_SEQUENCE:
                return SequenceFileInputFormat.class;
            case ExternalDataConstants.INPUT_FORMAT_RC:
                return RCFileInputFormat.class;
            default:
                return Class.forName(inputFormatParameter);
        }
    }

    public static JobConf configureHDFSJobConf(Map<String, String> configuration) {
        JobConf conf = new JobConf();
        String localShortCircuitSocketPath = configuration.get(ExternalDataConstants.KEY_LOCAL_SOCKET_PATH);
        String formatClassName = HDFSUtils.getInputFormatClassName(configuration);
        conf.set(ExternalDataConstants.KEY_HADOOP_FILESYSTEM_URI,
                configuration.get(ExternalDataConstants.KEY_HDFS_URL).trim());
        conf.set(ExternalDataConstants.KEY_HADOOP_FILESYSTEM_CLASS, ExternalDataConstants.CLASS_NAME_HDFS_FILESYSTEM);
        conf.setClassLoader(HDFSInputStream.class.getClassLoader());
        conf.set(ExternalDataConstants.KEY_HADOOP_INPUT_DIR, configuration.get(ExternalDataConstants.KEY_PATH).trim());
        conf.set(ExternalDataConstants.KEY_HADOOP_INPUT_FORMAT, formatClassName);
        // Enable local short circuit reads if the user supplied the parameters
        if (localShortCircuitSocketPath != null) {
            conf.set(ExternalDataConstants.KEY_HADOOP_SHORT_CIRCUIT, "true");
            conf.set(ExternalDataConstants.KEY_HADOOP_SOCKET_PATH, localShortCircuitSocketPath.trim());
        }
        return conf;
    }

    public static AlgebricksAbsolutePartitionConstraint getPartitionConstraints(
            AlgebricksAbsolutePartitionConstraint clusterLocations) {
        if (clusterLocations == null) {
            ArrayList<String> locs = new ArrayList<String>();
            Map<String, String[]> stores = AsterixAppContextInfo.getInstance().getMetadataProperties().getStores();
            for (String i : stores.keySet()) {
                int numIODevices = AsterixClusterProperties.INSTANCE.getNumberOfIODevices(i);
                for (int k = 0; k < numIODevices; k++) {
                    locs.add(i);
                }
            }
            String[] cluster = new String[locs.size()];
            cluster = locs.toArray(cluster);
            clusterLocations = new AlgebricksAbsolutePartitionConstraint(cluster);
        }
        return clusterLocations;
    }

    public static RecordIdType getRecordIdType(Map<String, String> configuration) {
        String inputFormatParameter = configuration.get(ExternalDataConstants.KEY_INPUT_FORMAT).trim();
        switch (inputFormatParameter) {
            case ExternalDataConstants.INPUT_FORMAT_TEXT:
            case ExternalDataConstants.INPUT_FORMAT_SEQUENCE:
                return RecordIdType.OFFSET;
            case ExternalDataConstants.INPUT_FORMAT_RC:
                return RecordIdType.RC;
            default:
                return null;
        }
    }
}
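
To see how these utilities fit together, here is a minimal, hypothetical usage sketch. The namenode URI hdfs://namenode:8020 and the input path /data/logs are made-up values, and the ExternalFile list is left empty for illustration; in AsterixDB that list normally comes from the external dataset's metadata snapshot. The configuration keys are the ExternalDataConstants referenced by the class above.

package org.apache.asterix.external.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.asterix.external.indexing.ExternalFile;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

public class HDFSUtilsExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical adapter configuration; the HDFS URI and path are made-up values.
        Map<String, String> configuration = new HashMap<String, String>();
        configuration.put(ExternalDataConstants.KEY_HDFS_URL, "hdfs://namenode:8020");
        configuration.put(ExternalDataConstants.KEY_PATH, "/data/logs");
        configuration.put(ExternalDataConstants.KEY_INPUT_FORMAT, ExternalDataConstants.INPUT_FORMAT_TEXT);

        // Build a Hadoop JobConf from the adapter configuration.
        JobConf conf = HDFSUtils.configureHDFSJobConf(configuration);

        // In AsterixDB the ExternalFile list is supplied by the dataset's metadata;
        // an empty list simply yields zero splits here.
        List<ExternalFile> files = new ArrayList<ExternalFile>();
        InputSplit[] splits = HDFSUtils.getSplits(conf, files);
        System.out.println("number of splits: " + splits.length);

        // The record id type tells the indexer how records are addressed (byte offset vs. RC row).
        System.out.println("record id type: " + HDFSUtils.getRecordIdType(configuration));
    }
}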
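
The PENDING_NO_OP branch of getSplits only re-reads the bytes appended since the last snapshot: startCut and endCut clip each HDFS block to the range [oldSize, newSize). The following standalone sketch reproduces that arithmetic with made-up numbers (a 100-byte snapshot that grew to 250 bytes, split into 128-byte blocks); it is an illustration of the logic above, not part of the class.

public class DeltaSplitArithmetic {
    public static void main(String[] args) {
        long oldSize = 100L; // bytes already indexed from the previous snapshot (hypothetical)
        long newSize = 250L; // current file size in HDFS (hypothetical)
        long[][] blocks = { { 0L, 128L }, { 128L, 122L } }; // {offset, length} per HDFS block

        for (long[] block : blocks) {
            long offset = block[0];
            long length = block[1];
            // Same conditions as in getSplits: the block must overlap the appended range.
            if (offset + length > oldSize && offset < newSize) {
                long startCut = (offset > oldSize) ? 0L : oldSize - offset;
                long endCut = (offset + length < newSize) ? 0L : offset + length - newSize;
                long splitLength = length - startCut - endCut;
                System.out.println("split at " + (offset + startCut) + ", length " + splitLength);
            }
        }
        // Prints: split at 100, length 28
        //         split at 128, length 122
        // Together the two splits cover exactly the appended range [100, 250).
    }
}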