Java tutorial: Cifs2HdfsClient
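The listing below is the main driver class, org.apache.hadoop.cifs.Cifs2HdfsClient, of the cifs2hdfs project. It parses command-line options with Apache Commons CLI, optionally logs in to a Kerberos-secured cluster from a keytab (via HdfsClient), resolves the CIFS password either from a plain option or from a Hadoop JCEKS credential store, and then transfers files in one direction at a time: HDFS to a CIFS/SMB share (hdfs2cifs) or CIFS/SMB to HDFS (cifs2hdfs). Each file transfer runs in its own thread, throttled by the transfer_limit option.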
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.cifs;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.security.PrivilegedExceptionAction;
import java.util.List;

import jcifs.smb.SmbFile;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.cifs.mapred.Constants;
import org.apache.hadoop.cifs.password.CifsCredentialProvider;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.GenericOptionsParser;

public class Cifs2HdfsClient {

    // Statics: these won't change; they are defined once at the start of the program.
    static Object objectWaiter = new Object();
    static Options options;
    static HdfsClient hdfsClient;
    static String hdfsInputFolder;
    static String hdfsOutputFolder;
    static String cifsInputFolder;
    static String cifsOutputFolder;
    static String cifsHost;
    static String cifsUserId;
    static String cifsPwd;
    static String cifsPwdAlias = null;
    static String cifsPwdCredPath = null;
    static String cifsDomain;
    static String cifsLogonTo;
    static String cifsInputFile;
    static Integer maxDepth = -1;
    static boolean ignoreTopFolder = false;
    static boolean noNesting = false;
    static boolean transferLimitTrue = false;
    static boolean hdfs2cifs = false;
    static boolean cifs2hdfs = false;
    static String transferLimit = null;
    private static CifsClient cifsClient;
    private static List<String> cifsFileList;
    // private static FileSystem fileSystem;
    static boolean setKrb = false;
    static String keytab = null;
    static String keytabupn = null;

    // These are used at thread-creation time during the recursion of the files.
    // They define the SMB/CIFS file handle passed into each thread, and the Hadoop
    // security configuration that allows the program to talk directly to HDFS.
    static Configuration hdpConf;
    static UserGroupInformation ugi;
    SmbFile f;
    SmbFile inSmbFile;

    // This is the main method; it starts the entire program execution.
    public static void main(String[] args) {
        // This handles parsing args. It is a fairly rough implementation; a cleaner
        // Commons CLI based one is available.
        Configuration conf = new Configuration();
        String[] otherArgs = null;
        try {
            otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        } catch (IOException e4) {
            e4.printStackTrace();
        }

        options = new Options();
        options.addOption("cifs_host", true, "CIFS/SMB Server Hostname --cifs_host winfileserver1.nt.example.com");
        options.addOption("cifs_domain", true, "CIFS/SMB Domain --cifs_domain nt.example.com");
        options.addOption("cifs_logonto", true, "CIFS/SMB LogonTo --cifs_logonto windc1nt, hadoopserver");
        options.addOption("cifs_input_folder", true, "CIFS/SMB Server Input Folder --cifs_input_folder M201209");
        options.addOption("cifs_output_folder", true, "CIFS/SMB Server Output Folder --cifs_output_folder M201209");
        options.addOption("cifs_input_file", true, "CIFS/SMB Server Single Input File filename.csv or filename*");
        options.addOption("cifs_userid", true, "CIFS/SMB Domain Userid --cifs_userid usergoeshere");
        options.addOption("cifs_pwd", true, "CIFS/SMB Domain Password --cifs_pwd passwordgoeshere");
        options.addOption("cifs_hadoop_cred_path", true,
                "CIFS Password Credential Store Path --cifs_hadoop_cred_path /user/username/credstore.jceks");
        options.addOption("cifs_pwd_alias", true, "CIFS Password Alias --cifs_pwd_alias password.alias");
        options.addOption("transfer_limit", true,
                "# of transfers to execute simultaneously; Note: 10-15 is optimal --transfer_limit 10");
        options.addOption("max_depth", true, "CIFS ONLY - Max Depth to recurse --max_depth 10");
        options.addOption("ignore_top_folder_files", false, "CIFS ONLY - Ignore Top Level Folder files");
        options.addOption("no_nested_transfer", false, "CIFS ONLY - Do not nest into folders for transfer");
        options.addOption("hdfs_output_folder", true, "HDFS Output Folder --hdfs_output_dir /scm/");
        options.addOption("hdfs_input_folder", true, "HDFS Input Folder --hdfs_input_dir /scm/");
        // options.addOption("hdfs_input_file", true, "HDFS Single Input File filename.csv or filename*");
        options.addOption("krb_keytab", true, "KeyTab File to Connect to HDFS --krb_keytab $HOME/S00000.keytab");
        options.addOption("krb_upn", true,
                "Kerberos Principal for Keytab to Connect to HDFS --krb_upn S00000@EXAMP.EXAMPLE.COM");
        options.addOption("help", false, "Display help");

        CommandLineParser parser = new CIFSParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, otherArgs);
        } catch (ParseException e2) {
            e2.printStackTrace();
        }

        if (cmd.hasOption("cifs_host") && cmd.hasOption("cifs_domain") && cmd.hasOption("cifs_userid")) {
            cifsHost = cmd.getOptionValue("cifs_host");
            cifsDomain = cmd.getOptionValue("cifs_domain");
            cifsUserId = cmd.getOptionValue("cifs_userid");

            if (cmd.hasOption("cifs_pwd")) {
                cifsPwd = cmd.getOptionValue("cifs_pwd");
            } else if (cmd.hasOption("cifs_pwd_alias") && cmd.hasOption("cifs_hadoop_cred_path")) {
                cifsPwdAlias = cmd.getOptionValue("cifs_pwd_alias");
                cifsPwdCredPath = cmd.getOptionValue("cifs_hadoop_cred_path");
            } else {
                System.out.println("Missing CIFS Password / CIFS Password Alias / CIFS Hadoop Cred Path");
                missingParams();
                System.exit(0);
            }

            if (cmd.hasOption("cifs_logonto")) {
                cifsLogonTo = cmd.getOptionValue("cifs_logonto");
            } else {
                cifsLogonTo = null;
            }

            if (cmd.hasOption("ignore_top_folder_files")) {
                ignoreTopFolder = true;
            }
            if (cmd.hasOption("no_nested_transfer")) {
                noNesting = true;
            }
            if (cmd.hasOption("transfer_limit")) {
                transferLimitTrue = true;
                transferLimit = cmd.getOptionValue("transfer_limit");
            }
            if (cmd.hasOption("max_depth")) {
                maxDepth = Integer.valueOf(cmd.getOptionValue("max_depth"));
            }

            if (cmd.hasOption("hdfs_input_folder") && cmd.hasOption("cifs_output_folder")) {
                hdfsInputFolder = cmd.getOptionValue("hdfs_input_folder");
                cifsOutputFolder = cmd.getOptionValue("cifs_output_folder");
                hdfs2cifs = true;
                // Normalize the CIFS output folder to a leading and trailing slash.
                // (The original code called substring() here and discarded the result.)
                if (!(cifsOutputFolder.startsWith("/"))) {
                    cifsOutputFolder = "/" + cifsOutputFolder;
                }
                if (!(cifsOutputFolder.endsWith("/"))) {
                    cifsOutputFolder = cifsOutputFolder + "/";
                }
                /*
                 * if (cmd.hasOption("hdfs_input_file")) { hdfsInputFile =
                 * cmd.getOptionValue("hdfs_input_file"); maxDepth = -1; noNesting = true; }
                 */
            }

            if (cmd.hasOption("hdfs_output_folder") && cmd.hasOption("cifs_input_folder")) {
                cifsInputFolder = cmd.getOptionValue("cifs_input_folder");
                // Normalize the CIFS input folder to a leading and trailing slash.
                if (!(cifsInputFolder.startsWith("/"))) {
                    cifsInputFolder = "/" + cifsInputFolder;
                }
                if (!(cifsInputFolder.endsWith("/"))) {
                    cifsInputFolder = cifsInputFolder + "/";
                }
                hdfsOutputFolder = cmd.getOptionValue("hdfs_output_folder");
                cifs2hdfs = true;
                if (cmd.hasOption("cifs_input_file")) {
                    cifsInputFile = cmd.getOptionValue("cifs_input_file");
                    maxDepth = -1;
                    noNesting = true;
                }
            }

            if (cifs2hdfs && hdfs2cifs) {
                System.out.println(
                        "Error: cannot specify both directions (hdfs_input_folder/cifs_output_folder and hdfs_output_folder/cifs_input_folder) together");
                missingParams();
                System.exit(0);
            }
        } else {
            missingParams();
            System.exit(0);
        }

        if (cmd.hasOption("krb_keytab") && cmd.hasOption("krb_upn")) {
            setKrb = true;
            keytab = cmd.getOptionValue("krb_keytab");
            keytabupn = cmd.getOptionValue("krb_upn");
            File keytabFile = new File(keytab);
            if (keytabFile.exists()) {
                if (!(keytabFile.canRead())) {
                    System.out.println("KeyTab exists but cannot be read - exiting");
                    missingParams();
                    System.exit(1);
                }
            } else {
                System.out.println("KeyTab doesn't exist - exiting");
                missingParams();
                System.exit(1);
            }
        }

        hdfsClient = new HdfsClient(setKrb, keytabupn, keytab);
        hdfsClient.checkSecurity();

        // If a credential store path and alias were supplied, resolve the CIFS password from JCEKS.
        if (cifsPwdCredPath != null && cifsPwdAlias != null) {
            cifsPwd = hdfsClient.getCredsViaJceks(cifsPwdCredPath, cifsPwdAlias);
        }

        if (hdfs2cifs) {
            cifsClient = new CifsClient(cifsLogonTo, cifsUserId, cifsPwd, cifsDomain, -1, false);
            List<String> hdfsfileList = null;
            try {
                hdfsfileList = hdfsClient.getHdfsFiles(hdfsInputFolder);
            } catch (IOException e1) {
                e1.printStackTrace();
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }

            // Spin up a thread per entry in the file list to allow some parallelism.
            // Theoretically this could be run as a MapReduce job.
            ThreadGroup cifsTg = new ThreadGroup("CifsThreadGroup");
            for (int i = 0; i < hdfsfileList.size(); i++) {
                String fileName = hdfsfileList.get(i);
                HDFS2CifsThread sc = null;
                if (transferLimitTrue) {
                    // Throttle: wait while the number of active transfer threads is at the limit.
                    while (Integer.valueOf(transferLimit) == cifsTg.activeCount()) {
                        synchronized (objectWaiter) {
                            try {
                                objectWaiter.wait(10000L);
                            } catch (InterruptedException e) {
                                e.printStackTrace();
                            }
                        }
                    }
                }
                String threadName = "cifs" + i;
                sc = new HDFS2CifsThread(cifsClient, cifsTg, threadName, fileName, cifsHost, cifsOutputFolder,
                        setKrb, keytabupn, keytab);
                sc.start();
            }
        }

        if (cifs2hdfs) {
            cifsClient = new CifsClient(cifsLogonTo, cifsUserId, cifsPwd, cifsDomain, Integer.valueOf(maxDepth),
                    noNesting);
            SmbFile smbFileConn = cifsClient.createInitialConnection(cifsHost, cifsInputFolder);
            try {
                cifsClient.traverse(smbFileConn, Integer.valueOf(maxDepth), ignoreTopFolder, cifsInputFile);
            } catch (IOException e) {
                e.printStackTrace();
            }
            cifsFileList = cifsClient.getFileList();
            int cifsCount = cifsFileList.size();

            // Spin up a thread per entry in the file list to allow some parallelism.
            // Theoretically this could be run as a MapReduce job.
            ThreadGroup cifsTg = new ThreadGroup("CifsThreadGroup");
            for (int i = 0; i < cifsCount; i++) {
                String fileName = cifsFileList.get(i);
                Cifs2HDFSThread sc = null;
                if (transferLimitTrue) {
                    // Throttle: wait while the number of active transfer threads is at the limit.
                    while (Integer.valueOf(transferLimit) == cifsTg.activeCount()) {
                        synchronized (objectWaiter) {
                            try {
                                objectWaiter.wait(10000L);
                            } catch (InterruptedException e) {
                                e.printStackTrace();
                            }
                        }
                    }
                }
                try {
                    String threadName = "cifs" + i;
                    sc = new Cifs2HDFSThread(cifsTg, threadName, new SmbFile(fileName, cifsClient.auth),
                            hdfsOutputFolder, cifsHost, cifsInputFolder, setKrb, keytabupn, keytab);
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                }
                sc.start();
            }
        }
    }

    private static void missingParams() {
        String header = "CIFS to Hadoop Client";
        String footer = "\nPlease report issues at http://github.com/gss2002/cifs2hdfs";
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("cifs2hdfs", header, options, footer, true);
        System.exit(0);
    }
}
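The HdfsClient.getCredsViaJceks call used above is not shown in this listing. As a rough sketch of how such a lookup is typically done with Hadoop's credential-provider API (the class and method names below are hypothetical stand-ins, not the project's actual implementation), it might look like this:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;

// Hypothetical helper: resolves a password alias from a JCEKS credential store,
// roughly what a getCredsViaJceks(credPath, alias) method would need to do.
public class JceksLookupSketch {
    public static String getPassword(String credPath, String alias) throws IOException {
        Configuration conf = new Configuration();
        // Point Hadoop's credential-provider framework at the store; the path is
        // usually given in provider-URI form, e.g. "jceks://hdfs/user/username/credstore.jceks".
        conf.set("hadoop.security.credential.provider.path", credPath);
        char[] pwd = conf.getPassword(alias); // returns null if the alias is not found
        return pwd == null ? null : new String(pwd);
    }
}

Keeping the password in a JCEKS store and passing only the alias on the command line avoids exposing the CIFS password in process listings and shell history, which is why the cifs_pwd_alias / cifs_hadoop_cred_path pair is preferred over cifs_pwd.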
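As a usage illustration only: the option names and sample values below come from the help strings in the listing, but the jar name (cifs2hdfs.jar) and the "hadoop jar" launch syntax are assumptions, since packaging is not shown here. A CIFS-to-HDFS run using a credential-store alias might look like:

    hadoop jar cifs2hdfs.jar org.apache.hadoop.cifs.Cifs2HdfsClient \
        --cifs_host winfileserver1.nt.example.com \
        --cifs_domain nt.example.com \
        --cifs_userid usergoeshere \
        --cifs_hadoop_cred_path /user/username/credstore.jceks \
        --cifs_pwd_alias password.alias \
        --cifs_input_folder M201209 \
        --hdfs_output_folder /scm/ \
        --krb_keytab $HOME/S00000.keytab \
        --krb_upn S00000@EXAMP.EXAMPLE.COM \
        --transfer_limit 10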