org.apache.zeppelin.submarine.hadoop.HdfsClient.java Source code

Introduction

Here is the source code for org.apache.zeppelin.submarine.hadoop.HdfsClient.java. HdfsClient is the Hadoop FileSystem wrapper used by the Zeppelin Submarine interpreter; it works against both secure (Kerberos) and non-secure clusters.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.zeppelin.submarine.hadoop;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.zeppelin.conf.ZeppelinConfiguration;
import org.apache.zeppelin.interpreter.thrift.ParagraphInfo;
import org.apache.zeppelin.submarine.commons.SubmarineConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ByteArrayOutputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Hadoop FileSystem wrapper. Supports both secure (Kerberos) and non-secure mode.
 */
public class HdfsClient {
    private static final Logger LOGGER = LoggerFactory.getLogger(HdfsClient.class);

    private ZeppelinConfiguration zConf = ZeppelinConfiguration.create();
    private Configuration hadoopConf;
    private boolean isSecurityEnabled;
    private FileSystem fs;

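    // Matches paragraph text of the form "%interpreter(key=value,...) script body",
    // capturing the leading whitespace, the interpreter name, and the optional
    // local-properties group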
    private static Pattern REPL_PATTERN = Pattern.compile("(\\s*)%([\\w\\.]+)(\\(.*?\\))?.*", Pattern.DOTALL);

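    /**
     * Creates the client. When Hadoop security is enabled, logs in from the
     * configured Kerberos keytab/principal (falling back to the Zeppelin server's
     * Kerberos settings) before obtaining the FileSystem handle.
     */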
    public HdfsClient(Properties properties) {
        String krb5conf = properties.getProperty(SubmarineConstants.SUBMARINE_HADOOP_KRB5_CONF, "");
        if (!StringUtils.isEmpty(krb5conf)) {
            System.setProperty("java.security.krb5.conf", krb5conf);
        }

        this.hadoopConf = new Configuration();
        // Map the file:// scheme to the HDFS client implementation, because files such as
        // interpreter.json may be updated through non-Hadoop filesystem APIs and would then
        // fail the local filesystem's CRC checksum validation.
        // An alternative is to disable caching for the file:// scheme instead:
        // this.hadoopConf.setBoolean("fs.file.impl.disable.cache", true);
        this.hadoopConf.set("fs.file.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        // UserGroupInformation.setConfiguration(hadoopConf);
        this.isSecurityEnabled = UserGroupInformation.isSecurityEnabled();

        if (isSecurityEnabled) {
            String keytab = properties.getProperty(SubmarineConstants.SUBMARINE_HADOOP_KEYTAB, "");
            String principal = properties.getProperty(SubmarineConstants.SUBMARINE_HADOOP_PRINCIPAL, "");

            // fall back to the Zeppelin server's Kerberos settings
            if (StringUtils.isEmpty(keytab)) {
                keytab = zConf.getString(ZeppelinConfiguration.ConfVars.ZEPPELIN_SERVER_KERBEROS_KEYTAB);
            }
            if (StringUtils.isEmpty(principal)) {
                principal = zConf.getString(ZeppelinConfiguration.ConfVars.ZEPPELIN_SERVER_KERBEROS_PRINCIPAL);
            }
            if (StringUtils.isBlank(keytab) || StringUtils.isBlank(principal)) {
                throw new RuntimeException(
                        "keytab and principal cannot be empty, keytab: " + keytab + ", principal: " + principal);
            }
            try {
                UserGroupInformation.loginUserFromKeytab(principal, keytab);
            } catch (IOException e) {
                throw new RuntimeException("Fail to login via keytab:" + keytab + ", principal:" + principal, e);
            } catch (Exception e) {
                LOGGER.error(e.getMessage(), e);
            }
        }

        try {
            this.fs = FileSystem.get(new URI("/"), this.hadoopConf);
        } catch (IOException | URISyntaxException e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    public FileSystem getFs() {
        return fs;
    }

    public Path makeQualified(Path path) {
        return fs.makeQualified(path);
    }

    public boolean exists(final Path path) throws IOException {
        return callHdfsOperation(new HdfsOperation<Boolean>() {

            @Override
            public Boolean call() throws IOException {
                return fs.exists(path);
            }
        });
    }

    public void tryMkDir(final Path dir) throws IOException {
        callHdfsOperation(new HdfsOperation<Void>() {
            @Override
            public Void call() throws IOException {
                if (fs.isFile(dir)) {
                    throw new IOException(dir.toString() + " is a file instead of a directory, please remove "
                            + "it or specify another directory");
                }
                if (!fs.exists(dir)) {
                    fs.mkdirs(dir);
                    LOGGER.info("Create dir {} in hdfs", dir);
                }
                return null;
            }
        });
    }

    public List<Path> list(final Path path) throws IOException {
        return callHdfsOperation(new HdfsOperation<List<Path>>() {
            @Override
            public List<Path> call() throws IOException {
                List<Path> paths = new ArrayList<>();
                for (FileStatus status : fs.globStatus(path)) {
                    paths.add(status.getPath());
                }
                return paths;
            }
        });
    }

    // Recursively searches the path (folders are listed on demand, instead of loading
    // all data when the Zeppelin server starts)
    public List<Path> listAll(final Path path) throws IOException {
        return callHdfsOperation(new HdfsOperation<List<Path>>() {
            @Override
            public List<Path> call() throws IOException {
                List<Path> paths = new ArrayList<>();
                collectNoteFiles(path, paths);
                return paths;
            }

            private void collectNoteFiles(Path folder, List<Path> noteFiles) throws IOException {
                FileStatus[] paths = fs.listStatus(folder);
                for (FileStatus path : paths) {
                    if (path.isDirectory()) {
                        collectNoteFiles(path.getPath(), noteFiles);
                    } else {
                        if (path.getPath().getName().endsWith(".zpln")) {
                            noteFiles.add(path.getPath());
                        } else {
                            LOGGER.warn("Unknown file: {}", path.getPath());
                        }
                    }
                }
            }
        });
    }

    public boolean delete(final Path path) throws IOException {
        return callHdfsOperation(new HdfsOperation<Boolean>() {
            @Override
            public Boolean call() throws IOException {
                return fs.delete(path, true);
            }
        });
    }

    public String readFile(final Path file) throws IOException {
        return callHdfsOperation(new HdfsOperation<String>() {
            @Override
            public String call() throws IOException {
                LOGGER.debug("Read from file: {}", file);
                ByteArrayOutputStream noteBytes = new ByteArrayOutputStream();
                IOUtils.copyBytes(fs.open(file), noteBytes, hadoopConf);
                return noteBytes.toString(zConf.getString(ZeppelinConfiguration.ConfVars.ZEPPELIN_ENCODING));
            }
        });
    }

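    // Writes to a temporary ".tmp" file first and renames it into place, so that a
    // failed write never leaves a partially written file at the target path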
    public void writeFile(final String content, final Path file) throws IOException {
        callHdfsOperation(new HdfsOperation<Void>() {
            @Override
            public Void call() throws IOException {
                InputStream in = new ByteArrayInputStream(
                        content.getBytes(zConf.getString(ZeppelinConfiguration.ConfVars.ZEPPELIN_ENCODING)));
                Path tmpFile = new Path(file.toString() + ".tmp");
                IOUtils.copyBytes(in, fs.create(tmpFile), hadoopConf);
                fs.delete(file, true);
                fs.rename(tmpFile, file);
                return null;
            }
        });
    }

    public void move(Path src, Path dest) throws IOException {
        callHdfsOperation(() -> {
            fs.rename(src, dest);
            return null;
        });
    }

    private interface HdfsOperation<T> {
        T call() throws IOException;
    }

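    // In secure mode, each HDFS operation is executed inside a doAs() so that it
    // runs with the Kerberos credentials of the logged-in user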
    private synchronized <T> T callHdfsOperation(final HdfsOperation<T> func) throws IOException {
        if (isSecurityEnabled) {
            try {
                return UserGroupInformation.getCurrentUser().doAs(new PrivilegedExceptionAction<T>() {
                    @Override
                    public T run() throws Exception {
                        return func.call();
                    }
                });
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
        } else {
            return func.call();
        }
    }

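    /**
     * Strips a leading "%interpreter(key=value,...)" directive from paragraph text
     * and returns only the script body, e.g. "%spark.pyspark(pool=p1) sc.version"
     * becomes "sc.version".
     */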
    public String parseText(String text) {
        String script = "", intpText = "";

        // parse text to get interpreter component
        if (text != null) {
            Matcher matcher = REPL_PATTERN.matcher(text);
            if (matcher.matches()) {
                String headingSpace = matcher.group(1);
                intpText = matcher.group(2);

                if (matcher.group(3) != null) {
                    String localPropertiesText = matcher.group(3);
                    String[] splits = localPropertiesText.substring(1, localPropertiesText.length() - 1).split(",");
                    for (String split : splits) {
                        String[] kv = split.split("=");
                        if (StringUtils.isBlank(split) || kv.length == 0) {
                            continue;
                        }
                        if (kv.length > 2) {
                            throw new RuntimeException("Invalid paragraph properties format: " + split);
                        }
                    }
                    script = text
                            .substring(headingSpace.length() + intpText.length() + localPropertiesText.length() + 1)
                            .trim();
                } else {
                    script = text.substring(headingSpace.length() + intpText.length() + 1).trim();
                }
            } else {
                script = text.trim();
            }
        }

        return script;
    }

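    /**
     * Merges the text of the given paragraphs by paragraph title (one script per
     * title), then writes each merged script to the local directory and/or uploads
     * it to the configured HDFS path. Returns warnings for skipped paragraphs.
     */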
    public String saveParagraphToFiles(String noteId, List<ParagraphInfo> paragraphInfos, String dirName,
            Properties properties) throws Exception {
        StringBuffer outputMsg = new StringBuffer();

        String hdfsUploadPath = properties.getProperty(SubmarineConstants.SUBMARINE_ALGORITHM_HDFS_PATH, "");

        HashMap<String, StringBuffer> mapParagraph = new HashMap<>();
        for (int i = 0; i < paragraphInfos.size(); i++) {
            ParagraphInfo paragraph = paragraphInfos.get(i);
            String paragraphTitle = paragraph.getParagraphTitle();
            if (StringUtils.isEmpty(paragraphTitle)) {
                String message = "WARN: The title of the [" + i
                        + "] paragraph is empty and was not submitted to HDFS.\n";
                LOGGER.warn(message);
                outputMsg.append(message);
                continue;
            }
            if (!mapParagraph.containsKey(paragraphTitle)) {
                StringBuffer mergeScript = new StringBuffer();
                mapParagraph.put(paragraphTitle, mergeScript);
            }
            StringBuffer mergeScript = mapParagraph.get(paragraphTitle);
            String paragraphText = paragraph.getParagraphText();
            String text = parseText(paragraphText);
            mergeScript.append(text).append("\n\n");
        }

        // Clear all files in the local noteId directory
        if (!StringUtils.isEmpty(dirName)) {
            String noteDir = dirName + "/" + noteId;
            File fileNoteDir = new File(noteDir);
            if (fileNoteDir.exists()) {
                // File.delete() cannot remove a non-empty directory, so delete recursively
                org.apache.commons.io.FileUtils.deleteQuietly(fileNoteDir);
            }
            fileNoteDir.mkdirs();
        }

        // Clear the noteId directory in HDFS and recreate it empty
        if (!StringUtils.isEmpty(hdfsUploadPath)) {
            Path hdfsPath = new Path(hdfsUploadPath + "/" + noteId);
            try {
                if (exists(hdfsPath)) {
                    delete(hdfsPath);
                }
                tryMkDir(hdfsPath);
            } catch (IOException e) {
                LOGGER.error(e.getMessage(), e);
                throw new Exception(e);
            }
        }

        for (Map.Entry<String, StringBuffer> entry : mapParagraph.entrySet()) {
            try {
                String fileName = entry.getKey();
                String fileContext = entry.getValue().toString();
                String paragraphFile = dirName + "/" + noteId + "/" + fileName;

                // save to local file
                if (!StringUtils.isEmpty(dirName)) {
                    // FileWriter creates the file if needed; try-with-resources guarantees close
                    try (FileWriter writer = new FileWriter(paragraphFile)) {
                        writer.write(fileContext);
                    }
                }

                // save to hdfs
                if (!StringUtils.isEmpty(hdfsUploadPath)) {
                    String fileDir = hdfsUploadPath + "/" + noteId + "/" + fileName;
                    // upload algorithm file
                    LOGGER.info("Commit algorithm to HDFS: {}", fileDir);
                    Path filePath = new Path(fileDir);
                    writeFile(fileContext, filePath);
                }
            } catch (IOException e) {
                LOGGER.error(e.getMessage(), e);
                throw new Exception(e);
            }
        }

        return outputMsg.toString();
    }
}
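
Example Usage

The snippet below is a minimal, hypothetical sketch of how HdfsClient could be driven; it is not part of the original source. It assumes the Hadoop configuration on the classpath points fs.defaultFS at a reachable HDFS cluster, and all paths shown here are illustrative only.

import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.zeppelin.submarine.hadoop.HdfsClient;

public class HdfsClientExample {
    public static void main(String[] args) throws Exception {
        // Empty properties are fine on a non-secure cluster. On a Kerberized
        // cluster you would also set SubmarineConstants.SUBMARINE_HADOOP_KRB5_CONF,
        // SUBMARINE_HADOOP_KEYTAB and SUBMARINE_HADOOP_PRINCIPAL here.
        Properties properties = new Properties();
        HdfsClient client = new HdfsClient(properties);

        // Create a scratch directory, write a file atomically, and read it back
        Path dir = client.makeQualified(new Path("/tmp/zeppelin-hdfs-client-example"));
        client.tryMkDir(dir);

        Path file = new Path(dir, "hello.txt");
        client.writeFile("hello from HdfsClient", file);
        System.out.println(client.readFile(file));

        client.delete(dir);
    }
}

Note that writeFile stages the content in a ".tmp" file and renames it over the target, so a concurrent reader sees either the old or the new content, never a partial write.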