Java tutorial
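
The Copy class below (package io.fluo.webindex.data) takes a paths file, a range within that file, and an HDFS destination directory. It builds the list of Common Crawl files to fetch and then uses Spark to copy each file from AWS into HDFS in parallel, logging an error for any file that is already present at the destination (such files should have been filtered out beforehand).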
/*
 * Copyright 2015 Fluo authors (see AUTHORS)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package io.fluo.webindex.data;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URL;
import java.util.List;

import io.fluo.webindex.core.DataConfig;
import io.fluo.webindex.data.spark.IndexEnv;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Copy {

  private static final Logger log = LoggerFactory.getLogger(Copy.class);

  // Returns the portion of the path after the last slash, i.e. the file name.
  public static String getFilename(String fullPath) {
    int slashIndex = fullPath.lastIndexOf("/");
    if (slashIndex == -1) {
      return fullPath;
    }
    return fullPath.substring(slashIndex + 1);
  }

  public static void main(String[] args) throws Exception {

    // Expect exactly three arguments: the paths file, a range within it, and the HDFS destination.
    if (args.length != 3) {
      log.error("Usage: Copy <pathsFile> <range> <dest>");
      System.exit(1);
    }
    final String hadoopConfDir = IndexEnv.getHadoopConfDir();

    // Resolve the requested range of Common Crawl paths from the paths file.
    final List<String> copyList = IndexEnv.getPathsRange(args[0], args[1]);
    if (copyList.isEmpty()) {
      log.error("No files to copy given {} {}", args[0], args[1]);
      System.exit(1);
    }

    DataConfig dataConfig = DataConfig.load();

    SparkConf sparkConf = new SparkConf().setAppName("webindex-copy");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

      // Ensure the destination directory exists in HDFS.
      FileSystem hdfs = FileSystem.get(ctx.hadoopConfiguration());
      Path destPath = new Path(args[2]);
      if (!hdfs.exists(destPath)) {
        hdfs.mkdirs(destPath);
      }

      log.info("Copying {} files (Range {} of paths file {}) from AWS to HDFS {}", copyList.size(),
          args[1], args[0], destPath.toString());

      // Distribute the copy list across executors, one partition per configured executor instance.
      JavaRDD<String> copyRDD = ctx.parallelize(copyList, dataConfig.getNumExecutorInstances());

      final String prefix = DataConfig.CC_URL_PREFIX;
      final String destDir = destPath.toString();

      // On each executor, stream every file from its source URL directly into HDFS.
      copyRDD.foreachPartition(iter -> {
        FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
        iter.forEachRemaining(ccPath -> {
          try {
            Path dfsPath = new Path(destDir + "/" + getFilename(ccPath));
            if (fs.exists(dfsPath)) {
              log.error("File {} exists in HDFS and should have been previously filtered",
                  dfsPath.getName());
            } else {
              String urlToCopy = prefix + ccPath;
              log.info("Starting copy of {} to {}", urlToCopy, destDir);
              try (OutputStream out = fs.create(dfsPath);
                  BufferedInputStream in =
                      new BufferedInputStream(new URL(urlToCopy).openStream())) {
                IOUtils.copy(in, out);
              }
              log.info("Created {}", dfsPath.getName());
            }
          } catch (IOException e) {
            log.error("Exception while copying {}", ccPath, e);
          }
        });
      });
    }
  }
}
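
Copying inside foreachPartition means each Spark executor opens its own FileSystem handle and streams file contents from the source URL straight into HDFS, so no data passes through the driver. For reference, getFilename() keeps only the text after the last slash, which becomes the file name in HDFS. The small snippet below is just an illustration: it assumes the Copy class above is on the classpath, and the sample paths are made up, not real Common Crawl entries.

import io.fluo.webindex.data.Copy;

public class GetFilenameDemo {
  public static void main(String[] args) {
    // A path with directories: only the last segment is kept as the HDFS file name.
    System.out.println(Copy.getFilename("crawl-data/segments/warc/file-00001.warc.gz"));
    // prints: file-00001.warc.gz

    // A path with no slash is returned unchanged.
    System.out.println(Copy.getFilename("file-00001.warc.gz"));
    // prints: file-00001.warc.gz
  }
}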