cascading.tap.Hfs.java Source code


Introduction

Here is the source code for cascading.tap.Hfs.java, the base class for all Hadoop file system taps in the Cascading project.

Source

/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading.  If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Map;

import cascading.flow.hadoop.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.scheme.SequenceFile;
import cascading.tap.hadoop.TapCollector;
import cascading.tap.hadoop.TapIterator;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3native.NativeS3FileSystem;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;

/**
 * Class Hfs is the base class for all Hadoop file system access. Use {@link Dfs}, {@link Lfs}, or {@link S3fs}
 * for resources specific to the Hadoop Distributed File System, the local file system, or Amazon S3, respectively.
 * <p/>
 * Use the Hfs class if the 'kind' of resource is unknown at design time. To use, prefix a scheme to the 'stringPath',
 * where <code>hdfs://...</code> denotes Dfs, <code>file://...</code> denotes Lfs, and
 * <code>s3://aws_id:aws_secret@bucket/...</code> denotes S3fs.
 * <p/>
 * Call {@link #setTemporaryDirectory(java.util.Map, String)} to use a temporary file directory path
 * other than the current Hadoop default path.
 */
public class Hfs extends Tap {
    /** Field LOG */
    private static final Logger LOG = Logger.getLogger(Hfs.class);
    /** Field serialVersionUID */
    private static final long serialVersionUID = 1L;

    /** Field TEMPORARY_DIRECTORY */
    private static final String TEMPORARY_DIRECTORY = "cascading.tmp.dir";

    /** Field stringPath */
    String stringPath;
    /** Field uriScheme */
    transient URI uriScheme;
    /** Field path */
    transient Path path;
    /** Field statuses */
    private transient FileStatus[] statuses;

    /**
     * Method setTemporaryDirectory sets the temporary directory on the given properties object.
     *
     * @param properties of type Map<Object,Object>
     * @param tempDir    of type String
     */
    public static void setTemporaryDirectory(Map<Object, Object> properties, String tempDir) {
        properties.put(TEMPORARY_DIRECTORY, tempDir);
    }

    /**
     * Method getTemporaryDirectory returns the configured temporary directory from the given properties object.
     *
     * @param properties of type Map<Object,Object>
     * @return a String or null if not set
     */
    public static String getTemporaryDirectory(Map<Object, Object> properties) {
        return (String) properties.get(TEMPORARY_DIRECTORY);
    }

    protected Hfs() {
    }

    @ConstructorProperties({ "scheme" })
    protected Hfs(Scheme scheme) {
        super(scheme);
    }

    /**
     * Constructor Hfs creates a new Hfs instance.
     *
     * @param fields     of type Fields
     * @param stringPath of type String
     */
    @ConstructorProperties({ "fields", "stringPath" })
    public Hfs(Fields fields, String stringPath) {
        super(new SequenceFile(fields));
        setStringPath(stringPath);
    }

    /**
     * Constructor Hfs creates a new Hfs instance.
     *
     * @param fields     of type Fields
     * @param stringPath of type String
     * @param replace    of type boolean
     */
    @ConstructorProperties({ "fields", "stringPath", "replace" })
    public Hfs(Fields fields, String stringPath, boolean replace) {
        super(new SequenceFile(fields), replace ? SinkMode.REPLACE : SinkMode.KEEP);
        setStringPath(stringPath);
    }

    /**
     * Constructor Hfs creates a new Hfs instance.
     *
     * @param fields     of type Fields
     * @param stringPath of type String
     * @param sinkMode   of type SinkMode
     */
    @ConstructorProperties({ "fields", "stringPath", "sinkMode" })
    public Hfs(Fields fields, String stringPath, SinkMode sinkMode) {
        super(new SequenceFile(fields), sinkMode);
        setStringPath(stringPath);

        if (sinkMode == SinkMode.UPDATE)
            throw new IllegalArgumentException("updates are not supported");
    }

    /**
     * Constructor Hfs creates a new Hfs instance.
     *
     * @param scheme     of type Scheme
     * @param stringPath of type String
     */
    @ConstructorProperties({ "scheme", "stringPath" })
    public Hfs(Scheme scheme, String stringPath) {
        super(scheme);
        setStringPath(stringPath);
    }

    /**
     * Constructor Hfs creates a new Hfs instance.
     *
     * @param scheme     of type Scheme
     * @param stringPath of type String
     * @param replace    of type boolean
     */
    @ConstructorProperties({ "scheme", "stringPath", "replace" })
    public Hfs(Scheme scheme, String stringPath, boolean replace) {
        super(scheme, replace ? SinkMode.REPLACE : SinkMode.KEEP);
        setStringPath(stringPath);
    }

    /**
     * Constructor Hfs creates a new Hfs instance.
     *
     * @param scheme     of type Scheme
     * @param stringPath of type String
     * @param sinkMode   of type SinkMode
     */
    @ConstructorProperties({ "scheme", "stringPath", "sinkMode" })
    public Hfs(Scheme scheme, String stringPath, SinkMode sinkMode) {
        super(scheme, sinkMode);
        setStringPath(stringPath);
    }

    protected void setStringPath(String stringPath) {
        this.stringPath = Util.normalizeUrl(stringPath);
    }

    protected void setUriScheme(URI uriScheme) {
        this.uriScheme = uriScheme;
    }

    public URI getURIScheme(JobConf jobConf) throws IOException {
        if (uriScheme != null)
            return uriScheme;

        uriScheme = makeURIScheme(jobConf);

        return uriScheme;
    }

    protected URI makeURIScheme(JobConf jobConf) throws IOException {
        try {
            URI uriScheme = null;

            if (LOG.isDebugEnabled())
                LOG.debug("handling path: " + stringPath);

            URI uri = new URI(stringPath);
            String schemeString = uri.getScheme();
            String authority = uri.getAuthority();

            if (LOG.isDebugEnabled()) {
                LOG.debug("found scheme: " + schemeString);
                LOG.debug("found authority: " + authority);
            }

            if (schemeString != null && authority != null)
                uriScheme = new URI(schemeString + "://" + uri.getAuthority());
            else if (schemeString != null)
                uriScheme = new URI(schemeString + ":///");
            else
                uriScheme = getDefaultFileSystemURIScheme(jobConf);

            if (LOG.isDebugEnabled())
                LOG.debug("using uri scheme: " + uriScheme);

            return uriScheme;
        } catch (URISyntaxException exception) {
            throw new TapException("could not determine scheme from path: " + getPath(), exception);
        }
    }

    /**
     * Method getDefaultFileSystemURIScheme returns the URI scheme for the default Hadoop FileSystem.
     *
     * @param jobConf of type JobConf
     * @return URI
     * @throws IOException when the default file system cannot be obtained
     */
    public URI getDefaultFileSystemURIScheme(JobConf jobConf) throws IOException {
        return getDefaultFileSystem(jobConf).getUri();
    }

    @Override
    public boolean isWriteDirect() {
        return super.isWriteDirect() || stringPath != null && stringPath.matches("(^https?://.*$)|(^s3tp://.*$)");
    }

    protected FileSystem getDefaultFileSystem(JobConf jobConf) throws IOException {
        return FileSystem.get(jobConf);
    }

    protected FileSystem getFileSystem(JobConf jobConf) throws IOException {
        return FileSystem.get(getURIScheme(jobConf), jobConf);
    }

    /** @see Tap#getPath() */
    @Override
    public Path getPath() {
        if (path != null)
            return path;

        if (stringPath == null)
            throw new IllegalStateException("path not initialized");

        path = new Path(stringPath);

        return path;
    }

    @Override
    public Path getQualifiedPath(JobConf conf) throws IOException {
        return getPath().makeQualified(getFileSystem(conf));
    }

    @Override
    public void sourceInit(JobConf conf) throws IOException {
        Path qualifiedPath = getQualifiedPath(conf);

        for (Path existingPath : FileInputFormat.getInputPaths(conf)) {
            if (existingPath.equals(qualifiedPath))
                throw new TapException("may not add duplicate paths, found: " + existingPath);
        }

        FileInputFormat.addInputPath(conf, qualifiedPath);

        super.sourceInit(conf);

        makeLocal(conf, qualifiedPath, "forcing job to local mode, via source: ");

        TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow
    }

    @Override
    public void sinkInit(JobConf conf) throws IOException {
        // do not delete if initialized from within a task
        if (isReplace() && conf.get("mapred.task.partition") == null)
            deletePath(conf);

        Path qualifiedPath = getQualifiedPath(conf);

        FileOutputFormat.setOutputPath(conf, qualifiedPath);
        super.sinkInit(conf);

        makeLocal(conf, qualifiedPath, "forcing job to local mode, via sink: ");

        TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow
    }

    private void makeLocal(JobConf conf, Path qualifiedPath, String infoMessage) {
        if (!conf.get("mapred.job.tracker", "").equalsIgnoreCase("local")
                && qualifiedPath.toUri().getScheme().equalsIgnoreCase("file")) {
            if (LOG.isInfoEnabled())
                LOG.info(infoMessage + toString());

            conf.set("mapred.job.tracker", "local"); // force job to run locally
        }
    }

    @Override
    public boolean makeDirs(JobConf conf) throws IOException {
        if (LOG.isDebugEnabled())
            LOG.debug("making dirs: " + getQualifiedPath(conf));

        return getFileSystem(conf).mkdirs(getPath());
    }

    @Override
    public boolean deletePath(JobConf conf) throws IOException {
        if (LOG.isDebugEnabled())
            LOG.debug("deleting: " + getQualifiedPath(conf));

        // do not delete the root directory
        if (getQualifiedPath(conf).depth() == 0)
            return true;

        FileSystem fileSystem = getFileSystem(conf);

        try {
            return fileSystem.delete(getPath(), true);
        } catch (NullPointerException exception) {
            // hack to get around npe thrown when fs reaches root directory
            if (!(fileSystem instanceof NativeS3FileSystem))
                throw exception;
        }

        return true;
    }

    @Override
    public boolean pathExists(JobConf conf) throws IOException {
        return getFileSystem(conf).exists(getPath());
    }

    @Override
    public long getPathModified(JobConf conf) throws IOException {
        FileStatus fileStatus = getFileSystem(conf).getFileStatus(getPath());

        if (!fileStatus.isDir())
            return fileStatus.getModificationTime();

        makeStatuses(conf);

        // statuses is empty, return 0
        if (statuses == null || statuses.length == 0)
            return 0;

        long date = 0;

        // filter out directories as we don't recurse into sub dirs
        for (FileStatus status : statuses) {
            if (!status.isDir())
                date = Math.max(date, status.getModificationTime());
        }

        return date;
    }

    protected Path getTempPath(JobConf conf) {
        String tempDir = conf.get(TEMPORARY_DIRECTORY);

        if (tempDir == null)
            tempDir = conf.get("hadoop.tmp.dir");

        return new Path(tempDir);
    }

    protected String makeTemporaryPathDir(String name) {
        // files prefixed with _ are treated as hidden by Hadoop, so strip leading underscores
        // and any other leading non-word characters
        name = name.replaceAll("^[_\\W\\s]+", "");

        if (name.isEmpty())
            name = "temp-path";

        // collapse remaining non-word characters to underscores and append a random numeric suffix
        return name.replaceAll("[\\W\\s]+", "_") + Integer.toString((int) (10000000 * Math.random()));
    }

    /**
     * Method makeStatuses populates the statuses array by listing this tap's path on the file system, if not
     * already populated.
     *
     * @param conf of type JobConf
     * @throws IOException on failure
     */
    private void makeStatuses(JobConf conf) throws IOException {
        if (statuses != null)
            return;

        statuses = getFileSystem(conf).listStatus(getPath());
    }

    /** @see Object#toString() */
    @Override
    public String toString() {
        if (stringPath != null)
            return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[\"" + Util.sanitizeUrl(stringPath)
                    + "\"]"; // sanitize
        else
            return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[not initialized]";
    }

    /** @see Tap#equals(Object) */
    @Override
    public boolean equals(Object object) {
        if (this == object)
            return true;
        if (object == null || getClass() != object.getClass())
            return false;
        if (!super.equals(object))
            return false;

        Hfs hfs = (Hfs) object;

        if (stringPath != null ? !stringPath.equals(hfs.stringPath) : hfs.stringPath != null)
            return false;

        return true;
    }

    /** @see Tap#hashCode() */
    @Override
    public int hashCode() {
        int result = super.hashCode();
        result = 31 * result + (stringPath != null ? stringPath.hashCode() : 0);
        return result;
    }

    public TupleEntryIterator openForRead(JobConf conf) throws IOException {
        Map<Object, Object> properties = HadoopUtil.createProperties(conf);

        properties.remove("mapred.input.dir");

        conf = HadoopUtil.createJobConf(properties, null);

        return new TupleEntryIterator(getSourceFields(), new TapIterator(this, conf));
    }

    public TupleEntryCollector openForWrite(JobConf conf) throws IOException {
        return new TapCollector(this, conf);
    }
}
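
Example

The class Javadoc above explains that an Hfs instance resolves the concrete file system from the scheme prefixed
to its 'stringPath', and that the temporary directory used by Cascading can be overridden through
setTemporaryDirectory(). The sketch below is a minimal illustration of that usage against the Cascading 1.x API
shown in this listing; the class name HfsUsageExample, the host name, and all path values are illustrative
placeholders, not part of the Cascading API.

import java.util.HashMap;
import java.util.Map;

import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tuple.Fields;

public class HfsUsageExample {
    public static void main(String[] args) {
        // the scheme prefix on 'stringPath' selects the backing file system:
        // hdfs://... denotes Dfs, file://... denotes Lfs, s3://... denotes S3fs
        Hfs source = new Hfs(new Fields("line"), "hdfs://namenode:8020/logs/input");
        Hfs sink = new Hfs(new Fields("line"), "file:///tmp/cascading-output", SinkMode.REPLACE);

        // override the default temporary directory ("cascading.tmp.dir") on the
        // properties object that would later be handed to a FlowConnector
        Map<Object, Object> properties = new HashMap<Object, Object>();
        Hfs.setTemporaryDirectory(properties, "/tmp/cascading");

        System.out.println("source: " + source); // toString() prints the scheme and sanitized path
        System.out.println("sink: " + sink);
        System.out.println("temp dir: " + Hfs.getTemporaryDirectory(properties));
    }
}

In a complete application these taps would be bound to a pipe assembly and executed through a Flow; that wiring is
outside the scope of this listing.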