Java tutorial
/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.utils;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * This class contains useful methods for serializing/deserializing instances that implement
 * {@link java.io.Serializable}. Pangool uses it to distribute instances around the cluster.
 * <p>
 * You can save a Java Serializable instance and recover it afterwards. See
 * {@link InstancesDistributor#distribute(Object, String, Configuration)} and
 * {@link InstancesDistributor#loadInstance(Configuration, Class, String, boolean)} for this purpose.
 */
public class InstancesDistributor {

  public final static String HDFS_TMP_FOLDER_CONF = InstancesDistributor.class.getName()
      + ".hdfs.pangool.tmp.folder";
  public final static String DEFAULT_HDFS_TMP_FOLDER_CONF_VALUE = "./pangool-instances";

  /**
   * Utility method for serializing an object and saving it in a way that can later be recovered
   * anywhere in the cluster.
   * <p>
   * The instance is serialized to a file under a temporary folder read from the Hadoop
   * Configuration, so that you can call
   * {@link InstancesDistributor#loadInstance(Configuration, Class, String, boolean)} to
   * re-instantiate the serialized instance.
   *
   * @param obj The instance to serialize using Java serialization.
   * @param fileName The file name where the instance will be serialized.
   * @param conf The Hadoop Configuration.
   * @throws FileNotFoundException
   * @throws IOException
   * @throws URISyntaxException
   */
  public static void distribute(Object obj, String fileName, Configuration conf)
      throws FileNotFoundException, IOException, URISyntaxException {
    FileSystem fS = FileSystem.get(conf);
    // Set the temporary folder for Pangool instances to the temporary folder of the user
    // that is running the Job. This folder will be used across the cluster for locating
    // the instances. The default value can be overridden by a user-provided one.
    String tmpHdfsFolder = conf.get(HDFS_TMP_FOLDER_CONF, DEFAULT_HDFS_TMP_FOLDER_CONF_VALUE);
    Path toHdfs = new Path(tmpHdfsFolder, fileName);
    if(fS.exists(toHdfs)) { // Delete any previously serialized copy of this file
      fS.delete(toHdfs, false);
    }
    ObjectOutput out = new ObjectOutputStream(fS.create(toHdfs));
    out.writeObject(obj);
    out.close();
    DistributedCache.addCacheFile(toHdfs.toUri(), conf);
  }

  /**
   * Given a Hadoop Configuration and a Class, this method re-instantiates an Object instance
   * that was previously distributed using
   * {@link InstancesDistributor#distribute(Object, String, Configuration)}.
   *
   * @param <T> The object type.
   * @param conf The Hadoop Configuration.
   * @param objClass The object type class.
   * @param fileName The file name used to locate the instance.
   * @param callSetConf If true, setConf() will be called when the deserialized instance is
   *          {@link Configurable}.
   * @throws IOException
   */
  public static <T> T loadInstance(Configuration conf, Class<T> objClass, String fileName,
      boolean callSetConf) throws IOException {
    Path path = InstancesDistributor.locateFileInCache(conf, fileName);
    if(path == null) {
      throw new IOException("Path is null");
    }
    T obj;
    ObjectInput in = new ObjectInputStream(FileSystem.get(conf).open(path));
    try {
      obj = objClass.cast(in.readObject());
    } catch(ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    in.close();
    if(obj instanceof Configurable && callSetConf) {
      ((Configurable) obj).setConf(conf);
    }
    return obj;
  }

  /**
   * Locates a file in the temporary folder.
   *
   * @param conf The Hadoop Configuration.
   * @param filename The file name.
   * @throws IOException
   */
  private static Path locateFileInCache(Configuration conf, String filename) throws IOException {
    return new Path(conf.get(HDFS_TMP_FOLDER_CONF, DEFAULT_HDFS_TMP_FOLDER_CONF_VALUE), filename);
  }

  /**
   * Deletes a file that has been distributed using
   * {@link #distribute(Object, String, Configuration)}.
   */
  public static void removeFromCache(Configuration conf, String filename) throws IOException {
    FileSystem fS = FileSystem.get(conf);
    fS.delete(locateFileInCache(conf, filename), true);
  }
}
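Below is a minimal usage sketch showing the round trip these methods support: distributing a Serializable instance from the client, loading it back as a task would (for example in a Mapper's setup()), and cleaning it up afterwards. The WordFilterExample class, the WordFilter type and the "word-filter.dat" file name are hypothetical illustrations, not part of Pangool's API.

import java.io.Serializable;

import org.apache.hadoop.conf.Configuration;

import com.datasalt.pangool.utils.InstancesDistributor;

public class WordFilterExample {

  // Hypothetical Serializable payload, used only for illustration.
  public static class WordFilter implements Serializable {
    private static final long serialVersionUID = 1L;
    private final String forbiddenWord;

    public WordFilter(String forbiddenWord) {
      this.forbiddenWord = forbiddenWord;
    }

    public boolean accepts(String word) {
      return !word.equals(forbiddenWord);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Client side: serialize the instance under the (configurable) temporary
    // folder and register the file in the DistributedCache.
    InstancesDistributor.distribute(new WordFilter("noise"), "word-filter.dat", conf);

    // Task side: recover the instance using the same file name. The last
    // argument requests setConf() to be called if the instance is Configurable.
    WordFilter filter = InstancesDistributor.loadInstance(conf, WordFilter.class,
        "word-filter.dat", true);
    System.out.println(filter.accepts("signal")); // prints "true"

    // Remove the serialized file once it is no longer needed.
    InstancesDistributor.removeFromCache(conf, "word-filter.dat");
  }
}

Note that distribute() and loadInstance() only share the file name and the Configuration, which is what lets the serialization happen on the client while the deserialization happens on any node of the cluster.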